From da87d648b3dae56635008ef8f86a9fa41518cbbc Mon Sep 17 00:00:00 2001
From: Masaki Kozuki <mkozuki@nvidia.com>
Date: Tue, 17 Aug 2021 16:51:34 -0700
Subject: [PATCH 001/530] `F.avg_pool3` CUDA backward: gpuAtomicAddNoReturn ->
 fastAtomicAdd (#63387)

Summary:
Rel: https://github.com/pytorch/pytorch/issues/62695

In the following two tables, I set `kernel_size` to 3 and `stride` to 2.
In benchmark, input tensors have the shape of (N, C, n_features, n_features, n_features).
Tested on RTX3080 w/ CUDA11.4 Update 1.

## This PR

|   N |   C |   n_features | dtype         |        time |
|----:|----:|-------------:|:--------------|------------:|
|  32 |   3 |            8 | torch.float16 | 7.46846e-05 |
|  32 |   3 |            8 | torch.float32 | 8.18968e-05 |
|  32 |   3 |           32 | torch.float16 | 0.000156748 |
|  32 |   3 |           32 | torch.float32 | 0.000165236 |
|  32 |   3 |          128 | torch.float16 | 0.00549854  |
|  32 |   3 |          128 | torch.float32 | 0.008926    |

## master (6acd87f)

|   N |   C |   n_features | dtype         |        time |
|----:|----:|-------------:|:--------------|------------:|
|  32 |   3 |            8 | torch.float16 | 7.60436e-05 |
|  32 |   3 |            8 | torch.float32 | 7.55072e-05 |
|  32 |   3 |           32 | torch.float16 | 0.000189292 |
|  32 |   3 |           32 | torch.float32 | 0.000168645 |
|  32 |   3 |          128 | torch.float16 | 0.00699538  |
|  32 |   3 |          128 | torch.float32 | 0.00890226  |

master's time divided by PR's time is as follows:

| N | C | n_features | master / PR |
|---:|---:|---------------:|----------------:|
| 32 | 3 | 8 | 1.018 |
| 32 | 3 | 32 | 1.208 |
| 32 | 3 | 128 | 1.272|

cc: xwang233 ptrblck ngimel

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63387

Reviewed By: mruberry

Differential Revision: D30381434

Pulled By: ngimel

fbshipit-source-id: 3b97aee4b0d457a0277a0d31ac56d4151134c099
---
 aten/src/ATen/native/cuda/AveragePool3d.cu | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/cuda/AveragePool3d.cu b/aten/src/ATen/native/cuda/AveragePool3d.cu
index 671b354734db0..6c712af93cc68 100644
--- a/aten/src/ATen/native/cuda/AveragePool3d.cu
+++ b/aten/src/ATen/native/cuda/AveragePool3d.cu
@@ -5,6 +5,7 @@
 #include <ATen/cuda/detail/TensorInfo.cuh>
 #include <ATen/cuda/detail/IndexUtils.cuh>
 #include <ATen/cuda/detail/KernelUtils.h>
+#include <ATen/native/cuda/KernelUtils.cuh>
 #include <THC/THCAtomics.cuh>
 #include <THC/THCNumerics.cuh>
 #include <c10/macros/Macros.h>
@@ -210,7 +211,7 @@ __global__ void avg_pool3d_cuda_update_grad_input_atomic(
   int dT, int dH, int dW,
   int padT, int padH, int padW,
   bool count_include_pad,
-  int offsetZ, int divisor_override)
+  int offsetZ, int divisor_override, const int gradInput_numel)
 {
   int oCol   = blockIdx.x * blockDim.x + threadIdx.x;
   int oRow   = blockIdx.y * blockDim.y + threadIdx.y;
@@ -253,7 +254,8 @@ __global__ void avg_pool3d_cuda_update_grad_input_atomic(
       {
         for (int iCol = wstart; iCol < wend; ++iCol)
         {
-          gpuAtomicAddNoReturn(&gradInput[slice][iFrame][iRow][iCol], val);
+          const int index = slice * gradInput.stride(0) + iFrame * gradInput.stride(1) + iRow * gradInput.stride(2) + iCol * gradInput.stride(3);
+          fastAtomicAdd(gradInput.data(), index, gradInput_numel, val, true);
         }
       }
     }
@@ -568,7 +570,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cuda) (
                   dT, dH, dW,
                   padT, padH, padW,
                   count_include_pad,
-                  offsetZ, divisor);
+                  offsetZ, divisor, work_grad_input.numel());
             C10_CUDA_KERNEL_LAUNCH_CHECK();
           }
           else {

From 975542c3146df9ad5c4fc74fc4651a22e194954e Mon Sep 17 00:00:00 2001
From: zhouzhuojie <zhouzhuojie@gmail.com>
Date: Tue, 17 Aug 2021 16:53:08 -0700
Subject: [PATCH 002/530] Add more ciflow labels for more workflows (#63410)

Summary:
- Add more ciflow labels and enable it for more workflows.
- Only the 'ciflow/default' workflows are run by default on pull_request time
- Other labels can be manually triggered by (adding the labels + unassign pytorchbot), OR wait for pytorchbot's comment opt-in rollout
- The label design is a logical operator `OR`, i.e. adding ('ciflow/cuda' + 'ciflow/win') will trigger the union of them. (design feedback is needed here)

Typical default workflows for normal PRs.

<details>
<summary>Generated label rules</summary>

![image](https://user-images.githubusercontent.com/658840/129779905-eb5e56dd-a696-4040-9eb6-71ecb6487dc1.png)

```
{
  "label_rules": {
    "ciflow/all": [
      "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
      "linux-bionic-cuda10.2-py3.9-gcc7",
      "linux-bionic-py3.8-gcc9-coverage",
      "linux-xenial-cuda10.2-py3.6-gcc7",
      "linux-xenial-cuda11.1-py3.6-gcc7",
      "linux-xenial-py3.6-gcc5.4",
      "linux-xenial-py3.6-gcc7-bazel-test",
      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
      "periodic-linux-xenial-cuda11.3-py3.6-gcc7",
      "periodic-win-vs2019-cuda11.3-py3",
      "win-vs2019-cpu-py3",
      "win-vs2019-cuda10.1-py3",
      "win-vs2019-cuda11.1-py3"
    ],
    "ciflow/bazel": [
      "linux-xenial-py3.6-gcc7-bazel-test"
    ],
    "ciflow/coverage": [
      "linux-bionic-py3.8-gcc9-coverage"
    ],
    "ciflow/cpu": [
      "linux-bionic-py3.8-gcc9-coverage",
      "linux-xenial-py3.6-gcc5.4",
      "linux-xenial-py3.6-gcc7-bazel-test",
      "win-vs2019-cpu-py3"
    ],
    "ciflow/cuda": [
      "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
      "linux-bionic-cuda10.2-py3.9-gcc7",
      "linux-xenial-cuda10.2-py3.6-gcc7",
      "linux-xenial-cuda11.1-py3.6-gcc7",
      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
      "periodic-linux-xenial-cuda11.3-py3.6-gcc7",
      "periodic-win-vs2019-cuda11.3-py3",
      "win-vs2019-cuda10.1-py3",
      "win-vs2019-cuda11.1-py3"
    ],
    "ciflow/default": [
      "linux-bionic-py3.8-gcc9-coverage",
      "linux-xenial-cuda11.1-py3.6-gcc7",
      "linux-xenial-py3.6-gcc5.4",
      "linux-xenial-py3.6-gcc7-bazel-test",
      "win-vs2019-cpu-py3",
      "win-vs2019-cuda10.1-py3"
    ],
    "ciflow/libtorch": [
      "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7"
    ],
    "ciflow/linux": [
      "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
      "linux-bionic-cuda10.2-py3.9-gcc7",
      "linux-bionic-py3.8-gcc9-coverage",
      "linux-xenial-cuda10.2-py3.6-gcc7",
      "linux-xenial-cuda11.1-py3.6-gcc7",
      "linux-xenial-py3.6-gcc5.4",
      "linux-xenial-py3.6-gcc7-bazel-test",
      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
      "periodic-linux-xenial-cuda11.3-py3.6-gcc7"
    ],
    "ciflow/scheduled": [
      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
      "periodic-linux-xenial-cuda11.3-py3.6-gcc7",
      "periodic-win-vs2019-cuda11.3-py3"
    ],
    "ciflow/slow": [
      "linux-bionic-cuda10.2-py3.9-gcc7",
      "linux-xenial-cuda10.2-py3.6-gcc7"
    ],
    "ciflow/win": [
      "periodic-win-vs2019-cuda11.3-py3",
      "win-vs2019-cpu-py3",
      "win-vs2019-cuda10.1-py3",
      "win-vs2019-cuda11.1-py3"
    ]
  },
  "version": "v1"
}
```
</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63410

Reviewed By: ngimel

Differential Revision: D30378553

Pulled By: zhouzhuojie

fbshipit-source-id: 4e0953740793e5e72b95018f8ab2ce4a6a364c38
---
 .github/generated-ciflow-ruleset.json         |  65 +++++++++++
 .github/scripts/generate_ci_workflows.py      | 102 +++++++++++++-----
 ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml |  12 ++-
 ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml |  12 ++-
 ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml |  17 ++-
 ...rated-linux-bionic-py3.8-gcc9-coverage.yml |   2 +-
 ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml |   2 +-
 ...rated-linux-xenial-cuda11.1-py3.6-gcc7.yml |  17 ++-
 .../generated-linux-xenial-py3.6-gcc5.4.yml   |  17 ++-
 ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml |   2 +-
 ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml |   2 +-
 ...iodic-linux-xenial-cuda11.3-py3.6-gcc7.yml |   2 +-
 ...rated-periodic-win-vs2019-cuda11.3-py3.yml |   2 +-
 .../generated-win-vs2019-cpu-py3.yml          |  13 ++-
 .../generated-win-vs2019-cuda10.1-py3.yml     |  13 ++-
 .../generated-win-vs2019-cuda11.1-py3.yml     |  14 ++-
 16 files changed, 240 insertions(+), 54 deletions(-)

diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json
index 70d5fd45298e8..80b2cabfff788 100644
--- a/.github/generated-ciflow-ruleset.json
+++ b/.github/generated-ciflow-ruleset.json
@@ -1,19 +1,84 @@
 {
   "label_rules": {
+    "ciflow/all": [
+      "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
+      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "linux-bionic-cuda10.2-py3.9-gcc7",
+      "linux-bionic-py3.8-gcc9-coverage",
+      "linux-xenial-cuda10.2-py3.6-gcc7",
+      "linux-xenial-cuda11.1-py3.6-gcc7",
+      "linux-xenial-py3.6-gcc5.4",
+      "linux-xenial-py3.6-gcc7-bazel-test",
+      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
+      "periodic-linux-xenial-cuda11.3-py3.6-gcc7",
+      "periodic-win-vs2019-cuda11.3-py3",
+      "win-vs2019-cpu-py3",
+      "win-vs2019-cuda10.1-py3",
+      "win-vs2019-cuda11.1-py3"
+    ],
+    "ciflow/bazel": [
+      "linux-xenial-py3.6-gcc7-bazel-test"
+    ],
+    "ciflow/coverage": [
+      "linux-bionic-py3.8-gcc9-coverage"
+    ],
+    "ciflow/cpu": [
+      "linux-bionic-py3.8-gcc9-coverage",
+      "linux-xenial-py3.6-gcc5.4",
+      "linux-xenial-py3.6-gcc7-bazel-test",
+      "win-vs2019-cpu-py3"
+    ],
+    "ciflow/cuda": [
+      "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
+      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "linux-bionic-cuda10.2-py3.9-gcc7",
+      "linux-xenial-cuda10.2-py3.6-gcc7",
+      "linux-xenial-cuda11.1-py3.6-gcc7",
+      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
+      "periodic-linux-xenial-cuda11.3-py3.6-gcc7",
+      "periodic-win-vs2019-cuda11.3-py3",
+      "win-vs2019-cuda10.1-py3",
+      "win-vs2019-cuda11.1-py3"
+    ],
     "ciflow/default": [
       "linux-bionic-py3.8-gcc9-coverage",
+      "linux-xenial-cuda11.1-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
       "win-vs2019-cpu-py3",
       "win-vs2019-cuda10.1-py3"
     ],
+    "ciflow/libtorch": [
+      "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
+      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7"
+    ],
+    "ciflow/linux": [
+      "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
+      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "linux-bionic-cuda10.2-py3.9-gcc7",
+      "linux-bionic-py3.8-gcc9-coverage",
+      "linux-xenial-cuda10.2-py3.6-gcc7",
+      "linux-xenial-cuda11.1-py3.6-gcc7",
+      "linux-xenial-py3.6-gcc5.4",
+      "linux-xenial-py3.6-gcc7-bazel-test",
+      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
+      "periodic-linux-xenial-cuda11.3-py3.6-gcc7"
+    ],
     "ciflow/scheduled": [
       "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
       "periodic-linux-xenial-cuda11.3-py3.6-gcc7",
       "periodic-win-vs2019-cuda11.3-py3"
     ],
     "ciflow/slow": [
+      "linux-bionic-cuda10.2-py3.9-gcc7",
       "linux-xenial-cuda10.2-py3.6-gcc7"
+    ],
+    "ciflow/win": [
+      "periodic-win-vs2019-cuda11.3-py3",
+      "win-vs2019-cpu-py3",
+      "win-vs2019-cuda10.1-py3",
+      "win-vs2019-cuda11.1-py3"
     ]
   },
   "version": "v1"
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index 0d6844bf8dadc..fce50ac7811e5 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -29,24 +29,16 @@
     LINUX_CUDA_TEST_RUNNER,
 }
 
+CUDA_RUNNERS = {
+    WINDOWS_CUDA_TEST_RUNNER,
+    LINUX_CUDA_TEST_RUNNER,
+}
+CPU_RUNNERS = {
+    WINDOWS_CPU_TEST_RUNNER,
+    LINUX_CPU_TEST_RUNNER,
+}
+
 
-# TODO: ------------- Remove the comment once fully rollout -------------------
-#       Rollout Strategy:
-#       1. Manual Phase
-#          step 1. Add 'ciflow/default' label to the PR
-#          step 2. Once there's an [unassigned] event from PR, it should rerun
-#          step 3. Remove 'ciflow/default' label
-#          step 4. Trigger the [unassigned] event again, it should not rerun
-#       2. Probot Phase 1 (manual on 1 workflow)
-#          step 1. Probot automatically add labels based on the context
-#          step 2. Manually let probot trigger [unassigned] event
-#       3. Probot Phase 2 (auto on 1 workflows)
-#          step 1. Modify the workflows so that they only listen on [unassigned] events
-#          step 2. Probot automatically adds labels automatically based on the context
-#          step 3. Probot automatically triggers [unassigned] event
-#       4. Probot Phase 3 (auto on many workflows)
-#          step 1. Enable it for all workflows
-#       -----------------------------------------------------------------------
 @dataclass
 class CIFlowConfig:
     enabled: bool = False
@@ -67,11 +59,11 @@ def gen_root_job_condition(self) -> None:
         # Once fully rollout, we can have strict constraints
         # e.g. ADD      env.GITHUB_ACTOR == '{self.trigger_actor}
         #      REMOVE   github.event.action !='{self.trigger_action}'
-        label_conditions = [f"github.event.action == '{self.trigger_action}'"] + \
-            [f"contains(github.event.pull_request.labels.*.name, '{label}')" for label in self.labels]
+        label_conditions = [
+            f"contains(github.event.pull_request.labels.*.name, '{label}')" for label in sorted(self.labels)]
         self.root_job_condition = f"(github.event_name != 'pull_request') || " \
             f"(github.event.action !='{self.trigger_action}') || " \
-            f"({' && '.join(label_conditions)})"
+            f"({' || '.join(label_conditions)})"
 
     def reset_root_job(self) -> None:
         self.root_job_name = ''
@@ -156,6 +148,9 @@ def __post_init__(self) -> None:
             else:
                 self.num_test_shards_on_pull_request = self.num_test_shards
 
+        # Add ciflow/all to labels
+        self.ciflow_config.labels.add('ciflow/all')
+
         self.assert_valid()
 
     def assert_valid(self) -> None:
@@ -165,6 +160,20 @@ def assert_valid(self) -> None:
         if self.arch == 'windows':
             assert self.test_runner_type in WINDOWS_RUNNERS, err_message
 
+        if self.ciflow_config.enabled:
+            # make sure if ciflow/default is set, we then need to set trigger_action_only to False
+            assert self.ciflow_config.trigger_action_only != ('ciflow/default' in self.ciflow_config.labels)
+            assert self.on_pull_request
+            assert 'ciflow/all' in self.ciflow_config.labels
+            if self.arch == 'linux':
+                assert 'ciflow/linux' in self.ciflow_config.labels
+            if self.arch == 'windows':
+                assert 'ciflow/win' in self.ciflow_config.labels
+            if self.test_runner_type in CUDA_RUNNERS:
+                assert 'ciflow/cuda' in self.ciflow_config.labels
+            if self.test_runner_type in CPU_RUNNERS:
+                assert 'ciflow/cpu' in self.ciflow_config.labels
+
     def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         output_file_path = GITHUB_DIR / f"workflows/generated-{self.build_environment}.yml"
         with open(output_file_path, "w") as output_file:
@@ -183,6 +192,10 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         test_runner_type=WINDOWS_CPU_TEST_RUNNER,
         on_pull_request=True,
         num_test_shards=2,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            labels={'ciflow/default', 'ciflow/cpu', 'ciflow/win'}
+        ),
     ),
     CIWorkflow(
         arch="windows",
@@ -192,6 +205,10 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         on_pull_request=True,
         only_run_smoke_tests_on_pull_request=True,
         num_test_shards=2,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            labels={'ciflow/default', 'ciflow/cuda', 'ciflow/win'}
+        ),
     ),
     CIWorkflow(
         arch="windows",
@@ -199,6 +216,12 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         cuda_version="11.1",
         test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
         num_test_shards=2,
+        on_pull_request=True,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            trigger_action_only=True,
+            labels={'ciflow/cuda', 'ciflow/win'}
+        ),
     ),
     CIWorkflow(
         arch="windows",
@@ -211,7 +234,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels={'ciflow/scheduled'}
+            labels={'ciflow/scheduled', 'ciflow/win', 'ciflow/cuda'}
         ),
     ),
 ]
@@ -225,6 +248,10 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         on_pull_request=True,
         enable_doc_jobs=True,
         num_test_shards=2,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            labels={'ciflow/default', 'ciflow/linux', 'ciflow/cpu'}
+        ),
     ),
     # CIWorkflow(
     #     arch="linux",
@@ -268,6 +295,12 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
         test_runner_type=LINUX_CUDA_TEST_RUNNER,
         num_test_shards=2,
+        on_pull_request=True,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            trigger_action_only=True,
+            labels={'ciflow/slow', 'ciflow/linux', 'ciflow/cuda'}
+        ),
     ),
     CIWorkflow(
         arch="linux",
@@ -284,7 +317,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels=set(['ciflow/slow']),
+            labels=set(['ciflow/slow', 'ciflow/linux', 'ciflow/cuda']),
         ),
     ),
     CIWorkflow(
@@ -293,6 +326,12 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
         test_runner_type=LINUX_CUDA_TEST_RUNNER,
         is_libtorch=True,
+        on_pull_request=True,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            trigger_action_only=True,
+            labels=set(['ciflow/libtorch', 'ciflow/linux', 'ciflow/cuda']),
+        ),
     ),
     CIWorkflow(
         arch="linux",
@@ -300,6 +339,11 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
         test_runner_type=LINUX_CUDA_TEST_RUNNER,
         num_test_shards=2,
+        on_pull_request=True,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            labels=set(['ciflow/default', 'ciflow/linux', 'ciflow/cuda']),
+        ),
     ),
     CIWorkflow(
         arch="linux",
@@ -307,6 +351,12 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
         test_runner_type=LINUX_CUDA_TEST_RUNNER,
         is_libtorch=True,
+        on_pull_request=True,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            trigger_action_only=True,
+            labels=set(['ciflow/libtorch', 'ciflow/linux', 'ciflow/cuda']),
+        ),
     ),
     CIWorkflow(
         arch="linux",
@@ -319,7 +369,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels={'ciflow/scheduled'}
+            labels={'ciflow/scheduled', 'ciflow/linux', 'ciflow/cuda'}
         ),
     ),
     CIWorkflow(
@@ -333,7 +383,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels={'ciflow/scheduled'},
+            labels={'ciflow/scheduled', 'ciflow/linux', 'ciflow/libtorch', 'ciflow/cuda'},
         ),
     ),
     # CIWorkflow(
@@ -364,7 +414,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         num_test_shards=2,
         ciflow_config=CIFlowConfig(
             enabled=True,
-            labels=set(['ciflow/default']),
+            labels={'ciflow/default', 'ciflow/coverage', 'ciflow/linux', 'ciflow/cpu'},
         ),
     ),
     # CIWorkflow(
@@ -433,7 +483,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         on_pull_request=True,
         ciflow_config=CIFlowConfig(
             enabled=True,
-            labels=set(['ciflow/default']),
+            labels={'ciflow/default', 'ciflow/bazel', 'ciflow/cpu', 'ciflow/linux'},
         ),
     ),
 ]
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
index a783b9b1886ec..780de8e1919e9 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -4,7 +4,8 @@
 name: libtorch-linux-xenial-cuda10.2-py3.6-gcc7
 
 on:
-  # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
+  pull_request:
+    types: [unassigned]
   push:
     branches:
       - master
@@ -28,9 +29,16 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
+    needs: [ciflow_should_run]
     env:
       DOCKER_BUILDKIT: 1
     timeout-minutes: 90
@@ -104,7 +112,7 @@ jobs:
 
   build:
     runs-on: linux.2xlarge
-    needs: [calculate-docker-image, ]
+    needs: [calculate-docker-image, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: libtorch-linux-xenial-cuda10.2-py3.6-gcc7-build
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
index da2bbc1400388..3d586ae322e8e 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -4,7 +4,8 @@
 name: libtorch-linux-xenial-cuda11.1-py3.6-gcc7
 
 on:
-  # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
+  pull_request:
+    types: [unassigned]
   push:
     branches:
       - master
@@ -28,9 +29,16 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
+    needs: [ciflow_should_run]
     env:
       DOCKER_BUILDKIT: 1
     timeout-minutes: 90
@@ -104,7 +112,7 @@ jobs:
 
   build:
     runs-on: linux.2xlarge
-    needs: [calculate-docker-image, ]
+    needs: [calculate-docker-image, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: libtorch-linux-xenial-cuda11.1-py3.6-gcc7-build
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index a9011b7047832..7aa572d83321b 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -4,7 +4,8 @@
 name: linux-bionic-cuda10.2-py3.9-gcc7
 
 on:
-  # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
+  pull_request:
+    types: [unassigned]
   push:
     branches:
       - master
@@ -28,9 +29,16 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
+    needs: [ciflow_should_run]
     env:
       DOCKER_BUILDKIT: 1
     timeout-minutes: 90
@@ -104,7 +112,7 @@ jobs:
 
   build:
     runs-on: linux.2xlarge
-    needs: [calculate-docker-image, ]
+    needs: [calculate-docker-image, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-build
@@ -215,6 +223,7 @@ jobs:
   generate-test-matrix:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: ubuntu-18.04
+    needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
       ENABLE_JIT_LEGACY_TEST: ''
@@ -242,7 +251,7 @@ jobs:
         run: .github/scripts/generate_pytorch_test_matrix.py
 
   test:
-    needs: [calculate-docker-image, build, generate-test-matrix, ]
+    needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
       fail-fast: false
@@ -400,7 +409,7 @@ jobs:
   # logs (like test); we can always move it back to the other one, but it
   # doesn't create the best experience
   render_test_results:
-    needs: [generate-test-matrix, test, ]
+    needs: [generate-test-matrix, test, ciflow_should_run]
     if: ${{ needs.test.result != 'skipped' || failure() }}
     runs-on: linux.2xlarge
     strategy:
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 3663c591ab806..eda7568a809dd 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -31,7 +31,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/default')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/coverage') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 38fe8593fc3eb..c50cac76a2c3e 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -31,7 +31,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
index a5f0488644596..cf2395e9ca829 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -4,7 +4,8 @@
 name: linux-xenial-cuda11.1-py3.6-gcc7
 
 on:
-  # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
+  pull_request:
+    types: [opened, synchronize, reopened, unassigned]
   push:
     branches:
       - master
@@ -28,9 +29,16 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
+    needs: [ciflow_should_run]
     env:
       DOCKER_BUILDKIT: 1
     timeout-minutes: 90
@@ -104,7 +112,7 @@ jobs:
 
   build:
     runs-on: linux.2xlarge
-    needs: [calculate-docker-image, ]
+    needs: [calculate-docker-image, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: linux-xenial-cuda11.1-py3.6-gcc7-build
@@ -215,6 +223,7 @@ jobs:
   generate-test-matrix:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: ubuntu-18.04
+    needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
       ENABLE_JIT_LEGACY_TEST: ''
@@ -242,7 +251,7 @@ jobs:
         run: .github/scripts/generate_pytorch_test_matrix.py
 
   test:
-    needs: [calculate-docker-image, build, generate-test-matrix, ]
+    needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
       fail-fast: false
@@ -400,7 +409,7 @@ jobs:
   # logs (like test); we can always move it back to the other one, but it
   # doesn't create the best experience
   render_test_results:
-    needs: [generate-test-matrix, test, ]
+    needs: [generate-test-matrix, test, ciflow_should_run]
     if: ${{ needs.test.result != 'skipped' || failure() }}
     runs-on: linux.2xlarge
     strategy:
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 14e22d85edc26..dd3cb50cfc903 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -5,6 +5,7 @@ name: linux-xenial-py3.6-gcc5.4
 
 on:
   pull_request:
+    types: [opened, synchronize, reopened, unassigned]
   push:
     branches:
       - master
@@ -28,9 +29,16 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
+    needs: [ciflow_should_run]
     env:
       DOCKER_BUILDKIT: 1
     timeout-minutes: 90
@@ -104,7 +112,7 @@ jobs:
 
   build:
     runs-on: linux.2xlarge
-    needs: [calculate-docker-image, ]
+    needs: [calculate-docker-image, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: linux-xenial-py3.6-gcc5.4-build
@@ -215,6 +223,7 @@ jobs:
   generate-test-matrix:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: ubuntu-18.04
+    needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.2xlarge
       ENABLE_JIT_LEGACY_TEST: ''
@@ -242,7 +251,7 @@ jobs:
         run: .github/scripts/generate_pytorch_test_matrix.py
 
   test:
-    needs: [calculate-docker-image, build, generate-test-matrix, ]
+    needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
       fail-fast: false
@@ -400,7 +409,7 @@ jobs:
   # logs (like test); we can always move it back to the other one, but it
   # doesn't create the best experience
   render_test_results:
-    needs: [generate-test-matrix, test, ]
+    needs: [generate-test-matrix, test, ciflow_should_run]
     if: ${{ needs.test.result != 'skipped' || failure() }}
     runs-on: linux.2xlarge
     strategy:
@@ -460,7 +469,7 @@ jobs:
 
   pytorch_python_doc_build:
     runs-on: linux.2xlarge
-    needs: [calculate-docker-image, build, ]
+    needs: [calculate-docker-image, build, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
     steps:
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 87c6df024b6e4..dbfba5f1fa74d 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -31,7 +31,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/default')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 4aa29abb09d6d..8352b229f4fae 100644
--- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -29,7 +29,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 5ec1ddb8516eb..dcbd19d661eb1 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -29,7 +29,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
index 78c536c0bbd11..9487ea5a8fdb3 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
@@ -34,7 +34,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (github.event.action == 'unassigned' && contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index 2769f7c498eef..fb2a097c64452 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -5,6 +5,7 @@ name: win-vs2019-cpu-py3
 
 on:
   pull_request:
+    types: [opened, synchronize, reopened, unassigned]
   push:
     branches:
       - master
@@ -31,12 +32,19 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
   build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: "windows.4xlarge"
     defaults:
       run:
         working-directory: pytorch-${{ github.run_id }}
+    needs: [ciflow_should_run]
     env:
       JOB_BASE_NAME: win-vs2019-cpu-py3-build
       http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
@@ -90,6 +98,7 @@ jobs:
 
   generate-test-matrix:
     if: ${{ github.repository_owner == 'pytorch' }}
+    needs: [ciflow_should_run]
     runs-on: ubuntu-18.04
     env:
       TEST_RUNNER_TYPE: windows.4xlarge
@@ -121,7 +130,7 @@ jobs:
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       RUN_SMOKE_TESTS_ONLY_ON_PR: False
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
-    needs: [build, generate-test-matrix, ]
+    needs: [build, generate-test-matrix, ciflow_should_run]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
       fail-fast: false
@@ -198,7 +207,7 @@ jobs:
   # logs (like test); we can always move it back to the other one, but it
   # doesn't create the best experience
   render_test_results:
-    needs: [generate-test-matrix, test, ]
+    needs: [generate-test-matrix, test, ciflow_should_run]
     if: ${{ needs.test.result != 'skipped' || failure() }}
     runs-on: linux.2xlarge
     strategy:
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index d94ba7850ee32..2fbc8650f43e1 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -5,6 +5,7 @@ name: win-vs2019-cuda10.1-py3
 
 on:
   pull_request:
+    types: [opened, synchronize, reopened, unassigned]
   push:
     branches:
       - master
@@ -33,12 +34,19 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
   build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: "windows.4xlarge"
     defaults:
       run:
         working-directory: pytorch-${{ github.run_id }}
+    needs: [ciflow_should_run]
     env:
       JOB_BASE_NAME: win-vs2019-cuda10.1-py3-build
       http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
@@ -100,6 +108,7 @@ jobs:
 
   generate-test-matrix:
     if: ${{ github.repository_owner == 'pytorch' }}
+    needs: [ciflow_should_run]
     runs-on: ubuntu-18.04
     env:
       TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu
@@ -131,7 +140,7 @@ jobs:
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       RUN_SMOKE_TESTS_ONLY_ON_PR: True
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
-    needs: [build, generate-test-matrix, ]
+    needs: [build, generate-test-matrix, ciflow_should_run]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
       fail-fast: false
@@ -216,7 +225,7 @@ jobs:
   # logs (like test); we can always move it back to the other one, but it
   # doesn't create the best experience
   render_test_results:
-    needs: [generate-test-matrix, test, ]
+    needs: [generate-test-matrix, test, ciflow_should_run]
     if: ${{ needs.test.result != 'skipped' || failure() }}
     runs-on: linux.2xlarge
     strategy:
diff --git a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml
index 9c9b733aef445..8b52a07055608 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml
@@ -4,6 +4,8 @@
 name: win-vs2019-cuda11.1-py3
 
 on:
+  pull_request:
+    types: [unassigned]
   push:
     branches:
       - master
@@ -32,12 +34,19 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
   build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: "windows.4xlarge"
     defaults:
       run:
         working-directory: pytorch-${{ github.run_id }}
+    needs: [ciflow_should_run]
     env:
       JOB_BASE_NAME: win-vs2019-cuda11.1-py3-build
       http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
@@ -99,6 +108,7 @@ jobs:
 
   generate-test-matrix:
     if: ${{ github.repository_owner == 'pytorch' }}
+    needs: [ciflow_should_run]
     runs-on: ubuntu-18.04
     env:
       TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu
@@ -130,7 +140,7 @@ jobs:
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       RUN_SMOKE_TESTS_ONLY_ON_PR: False
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
-    needs: [build, generate-test-matrix, ]
+    needs: [build, generate-test-matrix, ciflow_should_run]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
       fail-fast: false
@@ -215,7 +225,7 @@ jobs:
   # logs (like test); we can always move it back to the other one, but it
   # doesn't create the best experience
   render_test_results:
-    needs: [generate-test-matrix, test, ]
+    needs: [generate-test-matrix, test, ciflow_should_run]
     if: ${{ needs.test.result != 'skipped' || failure() }}
     runs-on: linux.2xlarge
     strategy:

From cd5e9dcc1dd2d240aa034b1f02b9886b44b1efa6 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Tue, 17 Aug 2021 16:54:09 -0700
Subject: [PATCH 003/530] [quant][graphmode][fx][fix] Fix quantization for
 tuple arguments (#63376)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63376

Previously when tuple is an argument for a quantizable op it would be transformed to a list by mistake,
this PR fixes that.

Test Plan:
python test/test_quantization.py TestQuantizeFx.test_preserve_tuple

Imported from OSS

Reviewed By: raghuramank100

Differential Revision: D30357642

fbshipit-source-id: 82d10805d9c00c003cc99983dca68b6455ff7b2e
---
 test/quantization/fx/test_quantize_fx.py | 22 ++++++++++++++++++++++
 torch/quantization/fx/prepare.py         |  2 +-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 7940eb73114c6..2f5f7c4a27f6e 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -2807,6 +2807,28 @@ def forward(self, x):
         m = convert_fx(m, is_reference=True)
         m(torch.rand(2, 1, 5, 5))
 
+    def test_preserve_tuple(self):
+        """ Test tuple input type is preserved
+        """
+        from typing import List
+
+        class LSTM(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.lstm = nn.LSTM(50, 50, 1)
+
+            def forward(self, inputs: torch.Tensor, state: List[torch.Tensor]):
+                h = state[0]
+                c = state[1]
+                return self.lstm(inputs, (h, c))
+
+        m = LSTM().eval()
+        m = prepare_fx(m, {"": default_qconfig})
+        # make sure the arg[1] of lstm module is a tuple
+        for n in m.graph.nodes:
+            if n.target == "lstm":
+                self.assertEqual(type(n.args[1]), tuple)
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     """Unit tests for individual ops
diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py
index ab137487b3cc8..873d11acaa82e 100644
--- a/torch/quantization/fx/prepare.py
+++ b/torch/quantization/fx/prepare.py
@@ -324,7 +324,7 @@ def maybe_insert_input_observer_for_arg_or_kwarg(
                 graph, node_name_to_target_dtype,
                 qhandler, prepare_custom_config_dict)
             new_arg_to_return.append(new_inner_arg)
-        return new_arg_to_return
+        return type(arg)(new_arg_to_return)
 
     if not isinstance(arg, Node):
         return arg

From 5b8862abf14add4e280b92c415d840f02582eba4 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Tue, 17 Aug 2021 17:12:32 -0700
Subject: [PATCH 004/530] [DDP] Support step_param for AdamW (#63382)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63382

Per title
ghstack-source-id: 135966156

Test Plan: CI

Reviewed By: SciPioneer

Differential Revision: D30255446

fbshipit-source-id: e6ffbf339db0bc5b4702d02b74a462309df07c75
---
 test/distributed/test_c10d_nccl.py            | 18 ++++++-
 test/test_functional_optim.py                 | 13 ++++-
 torch/distributed/optim/functional_adamw.py   | 49 +++++++++++++++++++
 .../_internal/distributed/distributed_test.py | 30 +++++++++++-
 4 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 2e5045153b149..285053d6d1c7e 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -53,9 +53,11 @@
 if not IS_WINDOWS:
     from torch.distributed.optim.functional_sgd import _FunctionalSGD
     from torch.distributed.optim.functional_adam import _FunctionalAdam
+    from torch.distributed.optim.functional_adamw import _FunctionalAdamW
     _SUPPORTED_OPTIM_MAPPING = {
         _FunctionalSGD: torch.optim.SGD,
-        _FunctionalAdam: torch.optim.Adam
+        _FunctionalAdam: torch.optim.Adam,
+        _FunctionalAdamW: torch.optim.AdamW,
     }
 
 if TEST_WITH_TSAN:
@@ -1737,6 +1739,20 @@ def test_hook_then_sgd_nccl_grad_as_bucket_view(self):
             gradient_as_bucket_view=True
         )
 
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    def test_hook_then_adamw_nccl(self):
+        adamw_lr = 1e-2
+        adamw_betas = (0.9, 0.99)
+        adamw_eps = 1e-6
+        self._test_hook_then_optimizer(
+            _FunctionalAdamW,
+            adamw_lr,
+            betas=adamw_betas,
+            eps=adamw_eps,
+            gradient_as_bucket_view=True
+        )
+
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
     def test_hook_then_adam_nccl(self):
diff --git a/test/test_functional_optim.py b/test/test_functional_optim.py
index c37823427fc1d..59af691faf36c 100644
--- a/test/test_functional_optim.py
+++ b/test/test_functional_optim.py
@@ -3,15 +3,17 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.optim import SGD, Adam
+from torch.optim import SGD, Adam, AdamW
 from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS
 
 if not IS_WINDOWS:
     from torch.distributed.optim.functional_sgd import _FunctionalSGD
     from torch.distributed.optim.functional_adam import _FunctionalAdam
+    from torch.distributed.optim.functional_adamw import _FunctionalAdamW
     _SUPPORTED_OPTIM_MAPPING = {
         SGD: _FunctionalSGD,
-        Adam: _FunctionalAdam
+        Adam: _FunctionalAdam,
+        AdamW: _FunctionalAdamW,
     }
 
 
@@ -102,6 +104,13 @@ def test_functional_optim_parity_sgd(self):
     def test_functional_optim_parity_adam(self):
         self._test_functional_optim_parity(Adam, 1e-2, betas=(0.9, 0.999), eps=1e-6)
 
+    @unittest.skipIf(
+        IS_WINDOWS,
+        "Functional optimizer not support on windows, see https://github.com/pytorch/pytorch/issues/62137",
+    )
+    def test_functional_optim_parity_adam_w(self):
+        self._test_functional_optim_parity(AdamW, 1e-2, betas=(0.9, 0.999), eps=1e-6)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/optim/functional_adamw.py b/torch/distributed/optim/functional_adamw.py
index 5623a0b8d6841..0159aa35a5539 100644
--- a/torch/distributed/optim/functional_adamw.py
+++ b/torch/distributed/optim/functional_adamw.py
@@ -53,6 +53,55 @@ def __init__(
         # param group as it's not a common use case.
         self.param_group = {"params": params}
 
+    def step_param(self, param: Tensor, grad: Optional[Tensor]):
+        params_with_grad = []
+        grads = []
+        exp_avgs = []
+        exp_avg_sqs = []
+        max_exp_avg_sqs = []
+        state_steps: List[int] = []
+        if grad is not None:
+            params_with_grad.append(param)
+            grads.append(grad)
+        # Lazy state initialization
+        if param not in self.state:
+            self.state[param] = {}
+            state = self.state[param]
+            state['step'] = torch.tensor(0.0)
+            # Exponential moving average of gradient values
+            state['exp_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+            # Exponential moving average of squared gradient values
+            state['exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+            if self.amsgrad:
+                # Maintains max of all exp. moving avg. of sq. grad. values
+                state['max_exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+
+        state = self.state[param]
+
+        exp_avgs.append(state['exp_avg'])
+        exp_avg_sqs.append(state['exp_avg_sq'])
+
+        if self.amsgrad:
+            max_exp_avg_sqs.append(state['max_exp_avg_sq'])
+
+        # update the steps for each param group update
+        state['step'] += 1
+        # record the step after step update
+        state_steps.append(state['step'].item())
+        with torch.no_grad():
+            F.adamw(params_with_grad,
+                    grads,
+                    exp_avgs,
+                    exp_avg_sqs,
+                    max_exp_avg_sqs,
+                    state_steps,
+                    amsgrad=self.amsgrad,
+                    beta1=self.defaults['beta1'],
+                    beta2=self.defaults['beta2'],
+                    lr=self.defaults['lr'],
+                    weight_decay=self.defaults['weight_decay'],
+                    eps=self.defaults['eps'])
+
     def step(self, gradients: List[Optional[Tensor]]):
         params = self.param_group['params']
         params_with_grad = []
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 54a22b01bd667..6ef94c99aa739 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -70,9 +70,11 @@
     import torch.distributed.optim.post_localSGD_optimizer as post_localSGD_optimizer
     from torch.distributed.optim.functional_sgd import _FunctionalSGD
     from torch.distributed.optim.functional_adam import _FunctionalAdam
+    from torch.distributed.optim.functional_adamw import _FunctionalAdamW
     _SUPPORTED_OPTIM_MAPPING = {
         _FunctionalSGD: torch.optim.SGD,
-        _FunctionalAdam: torch.optim.Adam
+        _FunctionalAdam: torch.optim.Adam,
+        _FunctionalAdamW: torch.optim.AdamW,
     }
 
 from torch.utils.data.distributed import DistributedSampler
@@ -3999,6 +4001,32 @@ def _test_ddp_hook_with_optimizer_parity(
                     )
                     dist.barrier()
 
+        @sandcastle_skip_if(
+            BACKEND != "nccl" and BACKEND != "gloo",
+            "Only Nccl & Gloo backend support DistributedDataParallel",
+        )
+        @sandcastle_skip_if(
+            IS_WINDOWS,
+            "FunctionalAdam not yet supported with Windows, see https://github.com/pytorch/pytorch/issues/62137"
+        )
+        @skip_if_lt_x_gpu(2)
+        @skip_if_rocm
+        def test_ddp_hook_with_optimizer_parity_adamw(self):
+            for grad_as_bucket_view, static_graph in itertools.product(
+                [True, False], [True, False]
+            ):
+                adamw_lr = 1e-2
+                adamw_betas = (0.9, 0.99)
+                adamw_eps = 1e-6
+                self._test_ddp_hook_with_optimizer_parity(
+                    grad_as_bucket_view,
+                    static_graph,
+                    _FunctionalAdamW,
+                    adamw_lr,
+                    betas=adamw_betas,
+                    eps=adamw_eps,
+                )
+
         @sandcastle_skip_if(
             BACKEND != "nccl" and BACKEND != "gloo",
             "Only Nccl & Gloo backend support DistributedDataParallel",

From dcf90b797c42ab237ca68d0e816c0d76122ee931 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Tue, 17 Aug 2021 17:12:32 -0700
Subject: [PATCH 005/530] [BE] remove _SUPPORTED_OPTIM_MAP from tests (#63383)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63383

Per title
ghstack-source-id: 135966157

Test Plan: CI

Reviewed By: SciPioneer

Differential Revision: D30358921

fbshipit-source-id: 965e054e525194b1ee55980340df275bab355c9b
---
 test/distributed/test_c10d_nccl.py                 |  9 +++------
 test/test_functional_optim.py                      | 14 ++------------
 .../_internal/distributed/distributed_test.py      | 10 ++++------
 3 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 285053d6d1c7e..f7f6681b43a76 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -49,16 +49,12 @@
     sandcastle_skip_if,
 )
 from torch.utils.checkpoint import checkpoint
+from torch.distributed.optim import functional_optim_map
 
 if not IS_WINDOWS:
     from torch.distributed.optim.functional_sgd import _FunctionalSGD
     from torch.distributed.optim.functional_adam import _FunctionalAdam
     from torch.distributed.optim.functional_adamw import _FunctionalAdamW
-    _SUPPORTED_OPTIM_MAPPING = {
-        _FunctionalSGD: torch.optim.SGD,
-        _FunctionalAdam: torch.optim.Adam,
-        _FunctionalAdamW: torch.optim.AdamW,
-    }
 
 if TEST_WITH_TSAN:
     print(
@@ -1639,7 +1635,8 @@ def _test_hook_then_optimizer(
         gpu_model_allreduce = self._gpu_model_with_ddp_comm_hook(
             process_group, default.allreduce_hook, gradient_as_bucket_view, hook_state
         )
-        sgd = _SUPPORTED_OPTIM_MAPPING.get(functional_optim_cls)(
+        mapping = {v: k for k, v in functional_optim_map.items()}
+        sgd = mapping.get(functional_optim_cls)(
             gpu_model_allreduce.parameters(),
             *functional_optim_args,
             **functional_optim_kwargs,
diff --git a/test/test_functional_optim.py b/test/test_functional_optim.py
index 59af691faf36c..98a3f06805dba 100644
--- a/test/test_functional_optim.py
+++ b/test/test_functional_optim.py
@@ -5,17 +5,7 @@
 import torch.nn.functional as F
 from torch.optim import SGD, Adam, AdamW
 from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS
-
-if not IS_WINDOWS:
-    from torch.distributed.optim.functional_sgd import _FunctionalSGD
-    from torch.distributed.optim.functional_adam import _FunctionalAdam
-    from torch.distributed.optim.functional_adamw import _FunctionalAdamW
-    _SUPPORTED_OPTIM_MAPPING = {
-        SGD: _FunctionalSGD,
-        Adam: _FunctionalAdam,
-        AdamW: _FunctionalAdamW,
-    }
-
+from torch.distributed.optim import functional_optim_map
 
 class MyModule(torch.nn.Module):
     def __init__(self):
@@ -39,7 +29,7 @@ def _test_functional_optim_parity(self, optim_cls, *args, **kwargs):
         optim_params = module_optim.parameters()
         functional_params = module_functional.parameters()
         optim = optim_cls(optim_params, *args, **kwargs)
-        functional_optim_cls = _SUPPORTED_OPTIM_MAPPING.get(optim_cls, None)
+        functional_optim_cls = functional_optim_map.get(optim_cls, None)
         if not functional_optim_cls:
             raise ValueError(f"Functional optimizer not implemented for {optim_cls}")
         optim_functional = functional_optim_cls(
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 6ef94c99aa739..2a126ab894a06 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -66,16 +66,13 @@
     sandcastle_skip_if,
 )
 
+from torch.distributed.optim import functional_optim_map
+
 if not IS_WINDOWS:
     import torch.distributed.optim.post_localSGD_optimizer as post_localSGD_optimizer
     from torch.distributed.optim.functional_sgd import _FunctionalSGD
     from torch.distributed.optim.functional_adam import _FunctionalAdam
     from torch.distributed.optim.functional_adamw import _FunctionalAdamW
-    _SUPPORTED_OPTIM_MAPPING = {
-        _FunctionalSGD: torch.optim.SGD,
-        _FunctionalAdam: torch.optim.Adam,
-        _FunctionalAdamW: torch.optim.AdamW,
-    }
 
 from torch.utils.data.distributed import DistributedSampler
 
@@ -3949,7 +3946,8 @@ def _test_ddp_hook_with_optimizer_parity(
                     if static_graph:
                         ddp_model_with_no_hook._set_static_graph()
 
-                    optimizer_no_hook = _SUPPORTED_OPTIM_MAPPING.get(functional_optim_cls)(
+                    mapping = {v: k for k, v in functional_optim_map.items()}
+                    optimizer_no_hook = mapping.get(functional_optim_cls)(
                         ddp_model_with_no_hook.parameters(),
                         *functional_optim_args,
                         **functional_optim_kwargs,

From f12f667e12d8c6f3356267ec87ae15e7fdd0ed76 Mon Sep 17 00:00:00 2001
From: Aliaksandr Ivanou <aivanou@fb.com>
Date: Tue, 17 Aug 2021 19:54:30 -0700
Subject: [PATCH 006/530] [torch] Set default log level for torch elastic
 (#63214)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63214

The default log level in fb and oss is different: in oss we use WARNING and in fb we use INFO.

Test Plan: unittests, f291441502

Reviewed By: cbalioglu

Differential Revision: D30296298

fbshipit-source-id: 89067352be767255fbc66e790ec333582de64c6c
---
 torch/distributed/elastic/utils/log_level.py | 14 ++++++++++++++
 torch/distributed/elastic/utils/logging.py   |  4 +++-
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 torch/distributed/elastic/utils/log_level.py

diff --git a/torch/distributed/elastic/utils/log_level.py b/torch/distributed/elastic/utils/log_level.py
new file mode 100644
index 0000000000000..87ea0f7d64182
--- /dev/null
+++ b/torch/distributed/elastic/utils/log_level.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+def get_log_level() -> str:
+    """
+    Return default log level for pytorch.
+    """
+    return "WARNING"
diff --git a/torch/distributed/elastic/utils/logging.py b/torch/distributed/elastic/utils/logging.py
index 19c68c03cf552..e4f1345e4c339 100644
--- a/torch/distributed/elastic/utils/logging.py
+++ b/torch/distributed/elastic/utils/logging.py
@@ -12,6 +12,8 @@
 import warnings
 from typing import Optional
 
+from torch.distributed.elastic.utils.log_level import get_log_level
+
 
 def get_logger(name: Optional[str] = None):
     """
@@ -32,7 +34,7 @@ def get_logger(name: Optional[str] = None):
 
 def _setup_logger(name: Optional[str] = None):
     log = logging.getLogger(name)
-    log.setLevel(os.environ.get("LOGLEVEL", "WARNING"))
+    log.setLevel(os.environ.get("LOGLEVEL", get_log_level()))
     return log
 
 
From 3fd8e09102c32d47a5af15c86be979eb75008d49 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Tue, 17 Aug 2021 20:12:51 -0700
Subject: [PATCH 007/530] Fix RPC Python User Function Error Handling (#63406)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63406

The `RemoteException` will be thrown on the caller side when converting
the response message to IValue. Since it is a Python error, the error
message needs to be extracted explicitly and clear the `PyErr`.

Test Plan: Imported from OSS

Reviewed By: rohan-varma, ngimel

Differential Revision: D30372741

Pulled By: mrshenli

fbshipit-source-id: 1f72a7ee0c39cc2ef070f99884c142f7b3e0543d
---
 .../csrc/distributed/rpc/python_functions.cpp | 20 +++++++++++++++++++
 .../_internal/distributed/rpc/rpc_test.py     | 16 ++++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/torch/csrc/distributed/rpc/python_functions.cpp b/torch/csrc/distributed/rpc/python_functions.cpp
index 272377166fc5f..2d6533d797175 100644
--- a/torch/csrc/distributed/rpc/python_functions.cpp
+++ b/torch/csrc/distributed/rpc/python_functions.cpp
@@ -152,6 +152,26 @@ c10::intrusive_ptr<JitFuture> toPyJitFuture(
             IValue ivalue;
             try {
               ivalue = toPyIValue(message);
+            } catch (py::error_already_set& e) {
+              py::gil_scoped_acquire acquire;
+              // FIXME: this is a temporary solution to add a special-case for
+              // ValueError and TypeError, as those are already used in our tests.
+              // We should have a more comprehensive coverage for other types of
+              // exceptions as well.
+              if (e.matches(PyExc_ValueError)) {
+                child->setErrorIfNeeded(
+                    std::make_exception_ptr(pybind11::value_error(e.what())));
+              } else if (e.matches(PyExc_TypeError)) {
+                child->setErrorIfNeeded(
+                    std::make_exception_ptr(pybind11::type_error(e.what())));
+              } else {
+                // py::error_already_set requires GIL to destruct, take special care.
+                child->setErrorIfNeeded(
+                    std::make_exception_ptr(std::runtime_error(e.what())));
+              }
+              e.restore();
+              PyErr_Clear();
+              return;
             } catch (std::exception& e) {
               child->setErrorIfNeeded(std::current_exception());
               return;
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index c95b7216c4a67..ae57ea5f40f8c 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -1039,6 +1039,20 @@ def test_multi_rpc(self):
             )
             self.assertEqual(ret, torch.ones(n, n) * 2)
 
+    @dist_init
+    def test_future_wait_twice(self):
+        dst = worker_name((self.rank + 1) % self.world_size)
+        futs = []
+        for i in range(20):
+            futs.append(rpc.rpc_async(dst, raise_func))
+
+        with self.assertRaisesRegex(ValueError, "Expected error"):
+            torch.futures.wait_all(futs)
+
+        for fut in futs:
+            with self.assertRaisesRegex(ValueError, "Expected error"):
+                fut.wait()
+
     def _run_uneven_workload(self, num_repeat=30):
         # worker0 drives and waits for worker1 and worker2
         # throughout the test.
@@ -3210,7 +3224,7 @@ def test_function_not_on_callee(self):
             # Ensure that we have the attribute on this module. Otherwise, the test could fail due to a caller-side pickling error.
             self.assertTrue(hasattr(this_module, "foo_add"))
             with self.assertRaisesRegex(
-                AttributeError, "RPC pickler does not serialize"
+                RuntimeError, "RPC pickler does not serialize"
             ):
                 rpc.rpc_sync(callee_worker, foo_add, args=())
 

From 15144ade25bfd5528cad982e82e47f2b57b65af6 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Tue, 17 Aug 2021 21:35:55 -0700
Subject: [PATCH 008/530] [fx2trt] Add quantize_per_tensor support (#63447)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63447

Only available in TRT 8.0 and above

Test Plan: buck run mode/opt caffe2/torch/fb/fx2trt:test_quantize_per_tensor

Reviewed By: 842974287

Differential Revision: D30322844

fbshipit-source-id: dfd925e3432de128f2925b1aa55d6125e63359af
---
 .../fx2trt/converters/acc_ops_converters.py   | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
index 506bf2cdbec93..0bca6e28c83b6 100644
--- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
+++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
@@ -1106,3 +1106,35 @@ def acc_ops_permute(network, target, args, kwargs, name):
     layer.second_transpose = tuple(permutation)
     layer.name = name
     return layer.get_output(0)
+
+@tensorrt_converter(acc_ops.quantize_per_tensor)
+def acc_ops_quantize_per_tensor(network, target, args, kwargs, name):
+    input_val = kwargs["input"]
+
+    if not isinstance(input_val, trt.tensorrt.ITensor):
+        raise RuntimeError(f"{name} received input {input_val} that is not part "
+                           "of the TensorRT region!")
+
+    q_scale = acc_utils.get_field_from_acc_out_ty(kwargs["acc_out_ty"], "q_scale")
+    q_zero_point = acc_utils.get_field_from_acc_out_ty(kwargs["acc_out_ty"], "q_zero_point")
+    dtype = acc_utils.get_field_from_acc_out_ty(kwargs["acc_out_ty"], "dtype")
+    if dtype not in (torch.quint8, torch.qint8, torch.qint32):
+        raise RuntimeError("Only support (torch.quint8, torch.qint8, torch.qint32) "
+                           f"quantized type in quantize_per_tensor, get {dtype}.")
+
+    if q_zero_point != 0:
+        raise RuntimeError(f"Only support zero_point == 0, get {q_zero_point}")
+
+    # temporarily set q_scale to 1 to make sure the q_scale is different
+    # for quantize and dequantize to avoid the error
+    # TODO: follow up with nvidia TensorRT team to repro and fix the problem
+    q_scale = 1
+    scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([float(q_scale)], dtype=np.float32)))
+    scale_layer.name = input_val.name + ".quant.scale"
+    scale = scale_layer.get_output(0)
+    assert trt.__version__ > "8.0", "Explicit quantize op is only supported in "
+    "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
+    layer = network.add_quantize(input=input_val, scale=scale)
+    layer.axis = 0
+    layer.name = input_val.name + ".quant"
+    return layer.get_output(0)

From 2fd14735d677d1c2cf87e4c76aea2111bc30c17b Mon Sep 17 00:00:00 2001
From: Pavithran Ramachandran <pavithran@fb.com>
Date: Tue, 17 Aug 2021 22:26:22 -0700
Subject: [PATCH 009/530] [easy][PyTorchEdge] print error message when failing
 to load model file (#63404)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63404

# Context
Loading a model file using `fopen` might error out for multiple reasons. Repro'ing the error on devices takes some time and efforts. Logging the error no# will help in debugging and fixing the error quickly.

# Mitigation
Printout the error message of the `fopen` to help users debug the issue.

Test Plan:
```
(base) [pavithran@devvm1803.vll0 /data/users/pavithran/fbsource] buck run xplat/caffe2/fb/lite_predictor:lite_predictor -- --model=/home/pavithran/models/prod/GAaNhAoTIV6cIvgJAHn30m8NR1QgbmQwAAAA.ptl --use_bundled_input=0
Building: finished in 0.5 sec (100%) 354/354 jobs, 0/354 updated
  Total time: 0.6 sec
Run with 24 threads
Run with 24 threads
Loading model...
terminate called after throwing an instance of 'c10::Error'
  what():  open file failed because of errno 2 on fopen: No such file or directory, file path: /home/pavithran/models/prod/GAaNhAoTIV6cIvgJAHn30m8NR1QgbmQwAAAA.ptl
Exception raised from RAIIFile at xplat/caffe2/caffe2/serialize/file_adapter.cc:15 (most recent call first):
(no backtrace available)
```

Reviewed By: dhruvbird

Differential Revision: D30372308

fbshipit-source-id: 5346e828f53f6bc5d871b403586566a3332a389a
---
 caffe2/serialize/file_adapter.cc | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/caffe2/serialize/file_adapter.cc b/caffe2/serialize/file_adapter.cc
index 701270b566145..1fddce970a84f 100644
--- a/caffe2/serialize/file_adapter.cc
+++ b/caffe2/serialize/file_adapter.cc
@@ -1,7 +1,8 @@
 #include "caffe2/serialize/file_adapter.h"
 #include <c10/util/Exception.h>
+#include <cerrno>
 #include <cstdio>
-
+#include <string>
 #include "caffe2/core/common.h"
 
 namespace caffe2 {
@@ -10,7 +11,20 @@ namespace serialize {
 FileAdapter::RAIIFile::RAIIFile(const std::string& file_name) {
   fp_ = fopen(file_name.c_str(), "rb");
   if (fp_ == nullptr) {
-    AT_ERROR("open file failed, file path: ", file_name);
+    char buf[1024];
+    buf[0] = '\0';
+#if defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER))
+  strerror_s(buf, sizeof(buf), errno);
+#else
+  strerror_r(errno, buf, sizeof(buf));
+#endif
+    AT_ERROR(
+        "open file failed because of errno ",
+        errno,
+        " on fopen: ",
+        buf,
+        ", file path: ",
+        file_name);
   }
 }
 

From d431c77d76b8c92dfddfdfec2ce29b2e46c441c2 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@fb.com>
Date: Tue, 17 Aug 2021 23:10:48 -0700
Subject: [PATCH 010/530] [sharded_tensor] fix typing issue for placement
 (#63426)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63426

placement should either be a string or a _remote_device, this fixes the type to match the behaviors
ghstack-source-id: 136041125

Reviewed By: pritamdamania87

Differential Revision: D30379702

fbshipit-source-id: 34e226494240923b433e3a39cc08c84d42cdad6b
---
 torch/distributed/_sharding_spec/_internals.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch/distributed/_sharding_spec/_internals.py b/torch/distributed/_sharding_spec/_internals.py
index a519a9a3e2b7b..3f2ab2f1a4ea8 100644
--- a/torch/distributed/_sharding_spec/_internals.py
+++ b/torch/distributed/_sharding_spec/_internals.py
@@ -1,5 +1,6 @@
-from typing import List
+from typing import List, Union
 from dataclasses import dataclass
+from torch.distributed.remote_device import _remote_device
 
 import torch
 
@@ -24,7 +25,7 @@ class ShardMetadata(object):
 
     shard_offsets: List[int]
     shard_lengths: List[int]
-    placement: torch.distributed._remote_device
+    placement: Union[str, _remote_device]
 
     def __post_init__(self):
         if isinstance(self.placement, str):

From f8a84a80cdd15aaea12147d7cb35d199aa302dc5 Mon Sep 17 00:00:00 2001
From: Pritam Damania <pritam.damania@fb.com>
Date: Wed, 18 Aug 2021 01:58:05 -0700
Subject: [PATCH 011/530] [5/N] Run opt-asan with detect_leaks=0 (#63361)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63361

Python multiprocessing doesn't support LSAN and causes false positives
instead. As a result, disabling LSAN for these tests so that we can still run
with opt-asan
ghstack-source-id: 135962489

Test Plan: waitforbuildbot

Reviewed By: rohan-varma

Differential Revision: D30352269

fbshipit-source-id: f6ab5abce7bdef00cd5e1f5977424d2b151174af
---
 torch/testing/_internal/common_distributed.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index fdad0ad0222fa..74ed9a069604c 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -526,10 +526,6 @@ def _run(cls, rank: int, test_name: str, file_name: str, parent_pipe) -> None:
         self.file_name = file_name
         self.run_test(test_name, parent_pipe, signal_send_pipe, event_listener_thread)
 
-        # exit to avoid run teardown() for fork processes
-        # Use os._exit() as it is the recommended way for child processes.
-        os._exit(0)
-
     def run_test(
         self, test_name: str, parent_pipe, signal_pipe=None, event_listener_thread=None
     ) -> None:

From 0f2f6a79cbc74ef2776c3555eb05b70aa80212bf Mon Sep 17 00:00:00 2001
From: Michael Dagitses <mikeyd@fb.com>
Date: Wed, 18 Aug 2021 03:59:51 -0700
Subject: [PATCH 012/530] clarify the documentation of `torch.meshgrid`
 (#62977)

Summary:
Also warn about the behavior differences from `numpy.meshgrid`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62977

Reviewed By: mruberry, ngimel

Differential Revision: D30220930

Pulled By: dagitses

fbshipit-source-id: ae6587b41792721cae2135376c58121b4634e296
---
 docs/source/_static/img/meshgrid.png | Bin 0 -> 117332 bytes
 torch/functional.py                  |  65 ++++++++++++++++++++++++---
 2 files changed, 59 insertions(+), 6 deletions(-)
 create mode 100644 docs/source/_static/img/meshgrid.png

diff --git a/docs/source/_static/img/meshgrid.png b/docs/source/_static/img/meshgrid.png
new file mode 100644
index 0000000000000000000000000000000000000000..97ad0661fc218c9c8a6a9e68a02dbccc2d8f686b
GIT binary patch
literal 117332
zcmeFZXE<En_c%HYqZ2(wizJ9LgfL3fDA7xzMHvx;XrqfdqbCw1M2k*{5Yd7}8$^g6
z5;bA8=!`!49iMOc{r>m4Z|;lx?#_PBbM`rB@3mLkYp=8RI`h;}Uz3*VG8F&-puMYg
z#~1(rk*Yupl$>;$<eg}_xbimER0Wjx!<I=GwGQTYQF?lS8>BiEKnA=70ADmA6#(!u
zfb5?-0H6)z{BPYDDDe+a3DPsm7-s<F-))|g&KJKf(n0F|ud|+=58Bnk$JO~Eub+dL
zx3h=4q?n|blnAerj}O{g4i0zwZv!z8FGo13A2=8IKRD!q{sSKb$OZqW4#be!0gM@s
zT}dZOw3dZ80KnaLaRA*1eHBULuR7l|_c7Pgm9zJ76SI5d@z6mmzzu!T3!o4nN2<Cx
z_}K9VxVgG}%LUxN@=psnQvKpK{0i?sO?+H#UoqD+<W=+Va^RH~6BiS|qDaNd%d6n^
z2qkBHN8{h!NmsY8T#!Se#^2vx%wO^!N+e`uW#QsC;WuxJl3Iv*KXLc53lMep=KmLv
z|G>H9;BD{ajP@aMhW7%l-9rywpW9ciTnzNz-@owmaYp^mNbcVMCW}NM{Nf2*LQEX~
z|J02{?0@n5KY9h&q5swCAFBQ}n8F1pIRh_e2NJp$NGVDv`~&{~xc5KEDUhJcsd+ir
z`FMEU^YCz0{C_KO1^E9R!oPtlz%SVGzu5CH-v4u##C%06QqTVgQjv<2Pr?NNPzKz+
zqjE0*xYk6OoMq;@`QS0_pA3h)wXWZ5EmK4WcAk?|K=PWd1#o-gCgnlYYn)yHALZ{y
z*DT<Ci9sA*JMLQCln1XIKz5<ZytNV%$X+~9!LBD|LFJX`gDQu<o!=ir_p;8H|KtqF
zhV(DrU8%0y-8<{&e*bus`@P@#%$vc*{;D2W*MG~BAf1ozJ!fw_Ns62QE#Y7bU>7Z)
zH}AhoE?72^1w&W1Yc=1C{ZA<a%u6TA{-+FLVO><p6sCb<UVoMUQ%nI~od0VmWx7XX
zxnSYjY_k7_%qtA|O#6uLKgEZaq6##KeVrYb`CrJSk$;o_4{maaB;=NG+5_tS|0RMn
zvO75azlEy{TL$n!zFoZH_+QA6$aw4ii+V4XZeH0GAXhQu{qle4t^y@a{ulMX$`oEC
z<hab2KX?5PWD;B4{=*br@Bg1I|7Q9BM{Lnu!QH%&;d4rgqoX6&&8eo5U%wnoigYQ%
zbfMq1{w29gFjKJMN>Ap6p0Qid!4!kX=wJxs@&0->zh-<SAqU}^ua_Yu#69)o--^#J
zO437Q4I*pH=PJeEM|y4rrx-e}DoUp7k?<@|b&NmvtZ@{e1E8bg_b<uwcaZeyIkThW
z@AyRkDBGe1bpky+JwBkYz54qt4ZfHPxTR3V@F*&G?u`jb`7f@*o1{7I{X;b3-nfTF
z1HC)~b^{H}fMjeHzlbozp?rS&0`C8UIu8I@T@QnVSq#b;1G;HAb@{YXX#xybUNEyt
zy<XOu+X=duQH~c1nPsv0`6oXPs7r(fK-`;XLqsTRgcD7olzYgCh{&*Ns#YlT?TB6y
zGZ5q#lh(ds=;n7#9*VTR9p1v#zOQ*-Sg3Kxx4l%YNLcsQ1s*RDiJL&DuUW8ES#c;u
zTAyU;Vw$aJ(mC&V3^T2}?2Kue2$)mGy{7Mt%w&~AkE-ll8J52eKu`E_Iyme5#Y{sr
zEgI8<DST>q@*WJ#E73S*kn_zLkTZiKZ|kShdooM>Nz+pBG00Ys9g?ZCg5;hEkmf(q
z%9BLS+<Q51JB<i1)Y5I&p7qZ&f<fOt|6O4zZ9a@*q<phehpIA%kL=4?2R>fwT88J3
zXdTAV!S(|e@(AI2L=_LD<JD!1h4yccpP6>08oFOIXmGxUGjUK}NW_`Jnwde{TUs26
zZ<PrXZv_@F>>lC^g2a`AQpKCMG=(S_LZM;h+Y8o`%LU25Z&*0wWu0j(%|T7lSA-zO
z?1jvas8wEf9A5Lrr!A1|Mf~{tp2&pj6w+=J%$%q3@*DN<otwB$pq7+Ve%lF;xPQE6
z34b~76`duE-bIs$(=@Ptq#dtHw?ucom>ndBI_=~M3l;(LUhQmf(d0v_@r__DSwVPT
z(Db*UhOPxpC6Won5leckUfKP{jC^fqXqek+f{Igi*lFC#$4AGZTAfE!6UEpF*-_`Q
zrrzY)vV%7-?B&fKujs9QAjCSb*0$Or>w{8Cx6&(BnfuThO^o<$LSkG(>>QV3f2M5p
zV?IO*G0nLT#-4C7RzTS=tD;(2L&ZtBT{Q1&oC_yW(6LM%ZpWX;Tv(48h`<YEI$(WQ
zN1|?41Rrh_4PWWA7Mg||p$s%qWe2w|EPkg9win<DNK&G2iyOZo#^So%u>B|uO&6OZ
z2vQ)t)MN#96l-INN{HReD<rAsT3@UI*8K~?bJ5^0%*8)^n=|_8H1ilkp|=m)fjdqy
zFU4{U_qSgGw~i8DF@usZJ|;6iY`*E*{>4>(BZ(d6aw}z-%apEgbS{3rjLD8Z81)${
z58+juDM1&ap|}}zJDDTG38<k>zmUQYYsN#_3=F#rsbbkSA6;Yr{^gSO_yCN3^B?Lz
z&R8{Pz?8d?8>13R5HJ4Gax&#`_wHn&TJJ^u2cI6_hstj-rAgRF@9pk>6=CCxxq=x?
zVz~Oe*qS0UZ{CB=L;moqz_zMnxQ!i=K<>cX6#G;vki4|T&#C*#O^R>wB-w~hxgb(E
zufdelD$brK;GHzIUTJ`quK`veiP^zvm7^bRJ%r85YnTiaBv93y(ymZ42r&AJxXVb{
z7@v~w%Fo!;K{dWamuUUBdLhWC`+jzX1dTcC-EiuR^sTv4krGv9u1f<2*CNUoo@<CY
zz&hnfW1`DOkwq=n=)~tVi8}A3-dMC1kMBZnq?hnKnP&`|iDFv|rxvRiuXgMd;X(AG
z{MRQOChI)v7IVU;vY-4-Py69@&z+<!c?@U@U>gm#SW`;bq?GQ<#P`t~*<MHYa#QCr
zPDR-V6J{g%X_|8djp_(e!jL{Msp3_i3*inwP>xCFQNiLh4S9JjS;7lb0gu1leA_yo
zfIjblhcc!@o7NoQl97iSe;U+~c`(8o4y8KkD_@PHF!tUeMv3IB0IJsA<zaA&6CXV>
zlIhDmvBMWmT<i5Sg+ij$D<*b(%pg%V<h3p_l0M-?NDc(c^B_9jUB)vgw6FVtXia2!
zr0$24uP^nlpKJ1j{7ezCcvzg}XGyY`a&6k;xp5Sliu>SPhD>xDC1boQt{Lk#Sv$V+
zs}xM`TyZ6rS@(^!C^M)Pl9CKU);=)Z=62Cnh}sCG0-mS<Nd{1dz;4sV&%OPKGy|M3
zW<c?;0E%R@HbCC9!G)O+L6DJZE%z})BQ*4cMc8yK4&Lm<X6^HmWIf@@^xG<GE8Q(`
zX#_>P0o7+o`f@_OlR^7ym0a?Ek*XD*bKd~!my@yg_yFa|r)+HxzLgMO?&sM{c@{6E
zpo`BeuH(LhtCIy1MXxEdhLc3@<I+V)U?njf?^N<sx3dDx%ObUQS(D+h27(!43@|3*
zl5$(`4Y+b8DM{++o{4~!BOrrFVh_l;$aR~sxGy}8Rg~(j1{F{n7(|0MCo?2}bWK@^
z4=#nH{-HB*Q?X5B=!JB1jY#qXhvIU4T!0#)@nt?VOAz>Vu7LgdwRLCN?V!In^u@Z=
z{=JXJrB@cG+~mJGl_taw<-o<X1I<8p2t++Z`0Ti2&Zkc9Ad%ivo!@H-1k`w*#SA>n
zL5W47*EEW$J~<&>q@KaX7&>;Zs|37s<@RB$n#7O4vfG402W`*S%k1zLp^x1ks@i*M
zLZ52#VLTACkGOEfdcL5K-*mDTy?!o|%=605^-h>Q42ee}r)r!{(w@MZc83J9<VShT
z0xS$TARCDFd?tp6E;H52>#1imPiq3_djdW0cOIuiJP=MFFCKZRI+PjIEgS{t0p#5L
zdllr2uv3=%qhe?ZHBM`LojAC^ivFoG*`>o_qIl!RTljq_luc<ku#XChNIkVrC;6p_
z6c_lg%>p2t+FM>#zq;n}ym%w8)KwaS1iyV?=X}EW=}PHGx~N>1q;tcr8`O?YL(f_2
zp%VK%c*$d>#Z2$;sSgk8qJ2p(xn$((y!WAxZXwA<8~_0h!I7U<qQM%ah@Ryc6G{E$
z-1Hbf^)zl`jH)tB!L-3-QvP!Hgs!Z2M*rUn$F||DQ+=5+I3r7fL6Vohk-aR&g_k@&
z66A91dUejU<*iPtrH6-yFD=LK4_{{TB}IkBuN4MpihKS}q?jpZA{Ue}hZZsv`ZE~n
zZ(Rzdt5Y4kkrSPv_@O=7tPgkpPf~&aX~LsLi)AM+T#SOI=$j$Fvbfymc+K3m;dQ@S
zo5aGkAprrAb9ph`&k7znB!x8u2z(y$s(;we^E}0LHcwNEzi}q1yEPOb!bZ(1^6m!Y
zOY+ikaSoje$hl?!GYmDSz);c%nK$;(nx3`Jk~!9#B=4Jg-heyImb_)iy*KkN!sk0?
zT9XrQDH^7)q<FEWZy6<UnNdY@fJMS%TiJ?L(t#=d+~k&rYIc{dhoSoXr$hW<efFtJ
zAjYJYNaa$(_ZYnG@!F2xcKD&RgK-}8c`_OU&**4Z;)W4@=Ed(r$8@{6AZsO<7&$wP
zF;0GgO&6~%^sa+`n}=54hejsBq#h?G3=w?)SeT$og-w1xQ?%%W_br-Px=_aWGr4Z&
z6a)sp_(S=@vv_oylwj)In^V0|a+hq9E$A#eZ=;6Sbi}cB(r{dkbs1GrLNt2~)33`+
zZELUIcF5zJX!xid)`0Fge7uz>f+qPR%P-ZKLEO@;uUba9Ab1T%0C&J%vh(HA=}!R?
zmujH&!LxT+Kt#1kmAv^k48c60sAp1w<TWK;r_UiTI8YaA;l+tSA#k}F87D5ZuL|t|
z9izOz^?~1ekiBOh=_BV{#f4;(hlK0RtY;|CCw{A5trd=&p~tHqr<~bx#$9yuEq=Q0
z+ZmXN=~(a$E(5#uxC%At4&#CG4xm3uywt!pvHP#0jRo58J9S%%ZK7MtDKr<z;o&w4
zD(M+YVWBmW@gFbs^HZysLRUPcV(2L@mPeRXuee4g7<v^#4;BJD@fWwguK7T>d->fs
zpwso>?07T5#a`B!_?5{3LW}ocao6@xNgW=!)mrO1X}4cZyPw7L$8of2?b2$tf=~)s
zG)SrA*G3SB(z8V3-9uUTu9upSMrIH(>TRSvqgDYEITW#drC0n(Qp~i1mqRcWkGoQb
z?C**<QN>kr!~J;5#+OV6E}|TX?iKrR#D$;m9DMr0y&_pw(Hr?3A)fZVMVGbhPK2v$
z`410r8*(#p3+cAoXT-VM;6twmvuwxc;P?O}btu32r`cS}1qdws6Y39nHHXQ%q|EI{
zo1vX@4TQ|$^}$dj8&hf=dvwX~M8LWNqHQLXWOZUyb7`Z@DGZ_Y5j?Aa9&&SvF{Gfn
z9^sy&P`UC=P#8|aRKH&6Be6?99x*}DOjNlXa3f0D?Or#arMd^^6hpE;PBAXB1Mkqv
zU>?ObRj^}q*R6NbN!+mBZ-&y`0eEcv$ia3GJi(d}`3Y3Ys<<sV+}_)xY*TiqU**er
zqJJLs2Ngs&w(fu1X)))Mt9evdsY0oGkZc(Z)6`ZMU)taPEY2b~5gBv(`fD$5dBnqv
zLj(G%K%7**LX5sNIbBI@!gEa7e!+Npfkr<k!9}Cb!{|>kP9NSf;M9mE0Rh=8;mHw)
zO&k|K0*BawR6##5l02Mkw3|kN3x6wC`ihNn+RN?Nh>>FnH(TFd_pxSe+Nx^HoUOWZ
z$=N25VBT2H!{DE_%flyXN|LAvYx>*HwN6ESb^QBvGJo7!&Wg+?o^}rS@-2@-C8>Ro
zZCP!%Gp2fWjZmT-$^zA)-3i9%m6GCbQFqKqEXEu&YsIN1a?4)5jecuIQ>O}W{nckq
zxY=R**H$#}j<7SQ5JZqk!5B(7%(DKgeDTJZ%WCE|mgb4U(zPZSBJqz=Xc4ATh09Vs
zTSs~d2zJBKG7Bb`6UXc)MU_A3Px7(5;U$fIM5rioN8xZjsn&1Z@<Hg)vh*)Ke9-T!
zXFrEm*J@)%zW|Z1`n-fU>}5XI<tP!~H64C}Cjem+R{_ISHEKGNyL671o#wIbZh}91
zxmPKw!~9y>?NL?1T9i?k0q!Kax7cf7ZfUkFT@d<gKpj_h;gjg<#t*nlw8?70kFO~?
zlSP~mH0&b5Y#py-0w#=1q2cO>22V-4A}De7941dEj;<VenlR`(DzP-fPP1ndO-7e~
z@_8(VW0}Rwim}?z3HOqX{jw{i&h(-60HtCDcQB!q!OP6%i9>Is1v%o3cv$N&8X7dS
z<>t44xZ|e8y4!=AbZj{U%iH-S^kv^q>va5_yXI;4PKCQ(A)os4>g!5ww*oG)%N+)h
zimKe2>{P%Wz|mTz;@_`akl_8as+CCKGw`x+sLct3L1d{72llO0--yF)b2Zu+6bxkP
z^9d{|O5vb$Oy`L&`Sb`tf0z1fO)iN<&{A)p(u(#ALuK<6iV8nUGJQF5s`xhX!)kl3
znFQyIxDa-zdc;uAM+$?jZ4;u15bw!2S=bMCHudqeb#6tgOt5W(d(-$t(};gI+?#k!
zc?YFi`yl(bYnW4#YZ3LG=B2uRvTJR&orD^jejKe>-}@*YITzWXk+{;=p@nU^DhX)A
z=zDz1W7RMm7o>(-n!&3L^CAt_c~wapFajiLlQ&{ZZ7(s!Vsra$PV-Js4J_ckKp+q0
zZ(Pv_Xbki92|jwa?0^aLL#Z~c!6Z9w>x);EANP5<?$WBJVzXOgv5(*Bi(iF65~eQ`
zvIAF2#AWQal}MUZ+C$e}s<6EpCCqJ0;Lw$Du-{i&EvtS{xP|vW+-l+1JaYZD;uvxo
z_H2E8(*<uLF!5B<O6UnH=JsAMwcnp#sgq9yvX~=k{C*bu@4Ga?qIj+0nt+s^tRaNy
z<_o7AJGEsD@5*q<>v5_$FCZ1G_N7Ck%ZR7EQM~sPBROrWH*WS<;^Uk+Xn)UQkJ3Wb
zz#;TJp+h5vjfKqkB!Pylix9!{fxz1+?Wo*4NHMFWI{cxs)V)qzQLA+XO#F#sbW+B=
z#38>pv|Y&dMDWzeib_&X6|tZk;OVdY;TjpaaW~?}>FwkF$rayFY`>?;TE*g@>hs(;
ztwASR_&W_-MLd(M<qf~%b7VSGg}$6}hdX<+0V_V)^gmC#O@<o|lm!|7^>i9W4#@BR
zD##-KK+nz~>xX;>bvd>uV>-=Ui9>v<)KM6nr+A7UvH^u1-NHN@l9R#PN*&1|9)Qsd
zJ>`C0^fJ~g{my_E{v8=pqt`4YD>E}d?0P=DiiUqmivx3D!N;)pEhgt5FQn5-=+R@x
z>oKy~*r>iR8oBlb_9dw?J_tz{dvf5jo^Ro*Kw!91TB&9#UJ`R9Qv8rVpr%k95E{iz
zKBNbCuiU)7pE;iCzPfXB6HSJkDs%Sh5ZYPWfPbvHzZvq46{a)_C)?(ar8pk=&A*nu
z{upfi#im?1@RLn>T%cf%?);^q!4JUDimC*MMxq!`US9-1&D(gdd}apy2Gzrd=YRwH
z!6HF<Dxi*ppkd9eHaxX9%i&l0R$2)E2}R?LQE0(?YMXVooS0^<<Kzg9fI>g;06u4*
z{!x^2rkwQYS%#wY^1wYFd)OdvUv3+czcrZ-rzUzS;s$*^du~7{Y(PIucl*n9w#o;N
znVg|-4Vi5mG`^mL9FCccntw3X(FwVoo?v{@he_A<l|H|=r^9parum9}r(4xVB%}Hg
znARvl1#qv~a$J(GdtJ(%eh-`u0L8bfjq(4!cl|?Z!CHk49W7KvccCo3-Fqp@k9Gq{
zQ^s<%8-zGb_E5t<D0+F5fR8P3C9S2~P)(KDNFWt(k3K9n>H*ixn-oJQipcf{txUZD
z#on-7mf(eL8p({JR+X4;TI>7|zm>EKq}92(naOXjc<Ax>Q+6hkObffozCWuMEi74%
zOHhhH!zRO?%#LFq$A7(=!0?Do5!_rxU+B{Ll3oQE?9}-lgZaEx)$?%QmaTa>lK=b*
z?VDa{?uvAQfg0<WKfUWle-snB2~tVr>5e>23pPQclQxcVY+@B)cd|s`VJlN8!*<4@
z32K8I*#cG!g0c^q>>#LscFAf6x2>o=M&ZL#VwqC59M|{svwDmaEp%h;p8_0WtMY4F
zmCDp69F;ATjq%zXyg61JIc<|$dB6Y#JiGqbhrxnv6?(TkMy{{J{@)8?{Juq)1e-!F
zOR$wXZ&)yub0M8bR4x*OM554kiG%poAUPy`ln`ztx0|;=LmcI)yWM;8ln43B1DH;(
zj#racjaZ=8J%irUJ)?OZ(JIiVve<saW(7=}Lpc>c9Y25moyl!-qsbkZ=DOIMW!FO7
zH`&KO>va46U4Tq+2A-4|t;W{)Rlq8o_U+<t?|~O`7!-GU=WBBAw{7(7H-tC6gcPH=
z+(NbaRjt<Js;;CNz8X9EJ_QtkOgws;e<$4nh2;?>Th6B1ij*qR*?Kz@(NvmEt|Mc)
zl^l15NHYn_{jKncIs6`F%1w;HMBoXi1xcr&7SF>?TE~$A#I(GhGk~m&R^<qt&pSFC
z?~<C{BGKYqc=J_tAfCvzx**6)W7S84{*+kcQ(z^YWcAtWt>}Z=-V|aTh~cKf3w(r<
zG+<-Ok-j`|qF3g?^9mmGhFIJH68C;w=QdqEaQ(PL+3%Z3!*YDk`cbN_yU7VyMN?rD
zxli>YmR^ifiP#na_$n3JMk#Oy)=kw77@>~jc~X-n&Jvn@SDbK{f^Md?<72@!T_07+
z8W&T8AYQtK*s8}&@1>cma?~{GjGp_<Gn^q0`99)-4P4oY=T5VHyN075hVG1e&d(X~
zFxcWhF@t6eL3qB<_Y8v5KZg9|u<@iot*U5(5eH-J3ejiRG9DO#b$mz14n;xE3lbDB
zx1DFWlvk2bv))M~FV4f;VICUJ$=ODNZJ}R(>vMSG*?aZS><z(1EnY9eAn%IP7`c|^
z4P_~Do?6r6isP<ZCDrS;9-(7X=jF71Qyj(PHgL8l4?LaI^EkPmj()Y{c8o5i4!C+^
z2(8IW41g3eg94r>@#;~W2iDwWkZR8f6p!Y&B6hGX<-<rGu@RJE|0`epBQB%?nT}wM
zJ;{g1E(m;LD6D|M0i#5)7&^A-@?DMFv9)=ja<H1G12{-_HYr5YE!b$r$>~e(!`t@H
z4p4&6%jG4gI{BSQRDp`V>L=?ADMP;Av@P#Sdb7iA@!anxlVZW!*a(%UUMFjgN^x9^
zX0{=|k+i67_*@5w(Dql<Z}Hg+zoScRQi_NTMV~R&zHE1jEL-^yetd6<$!s;Qb~W+O
zY7byh%yuX-#sRi4?Qpg#b0S2o1mQfmVi+dpkz<34AS6A>9Iqhv$2(ytTF}YP9IMlh
z;rRu+jyJwOEhx(r=jHKzp^C|UXrO*r-ARb|L$DiQqS|@N!eaYh?skB?`+I}yv%0Cd
zxwm4_6TDQw0RhD=%Ao1vDYNwk!u5ui7r|+blol?EEEUL%%3@Ig{om+(fOr6M+tJX1
z6vs?kuvq<;8MN7S5KS1xOS5x*niT>caxjl0zU2;xg()(MtZK+d{QTQD*~>ldoRre5
zYCNGUbM6h|RXm$R7(oKfyAci<A;&QX<8N$yU-zXizhMhWbfvKUP$eX?K<{9jUed4|
zz}05J3MvDOU~#EFYJA$SR+7E|O^*ueBg1M!mYnB}FW-H$9l1?%i{>J0yaTRWdpf{f
zT~fhlfT6HO2O4bM%6BQ8%Z<5NsVg-^XNBn256Yt-t~D$-yurc-KOBg?h%)haPxXVu
z^s?jsJSW7+(*Q0rzgb`fL+K+}-qfe<fg@f7(C}vLx#T*tx@XV@Oc-FkhYc6$*>P)p
zM&`~ttFue~ZB1>bJ9t|B=VVj(C@J_fY_YXny4dlV-j)iwbLy1Q4<`yr%7U|HDvY`x
zZY9;OlsK-f^z_%AQXCn83v1~1r$l&i6DTwf@53Yce$V&^PT6#>8j|B>t8G!OX`Y|*
z7`V0XM$jQ=sRxiX^!~xZ_38z|zvQJD$&~~pselWrd>DZ>9tH!gr0vM*U0P<cRu<cg
zb;<TVrq%+Q4>isUoX<WUVf^taAaUk2S&yBI0OL+$x6mpYg9XpP^WI@U=x+mCeFDjW
z*{Do%8ggS)KE2Wjx|f)q&J?qUMc`G<<l}>JNYgg4FmjY4eXo2FjroDw+vjAo(57+(
zu)_V6x*3Qz&OCSgs}w?IN@Es%lD-5FvKC5PDF%KC*~xKJLi$fNw28G4T#F0v_gCv5
zEUI!)C>U`u#sG9=nDb-#hymtsfe*|eR9_9P5TjvI+nhfKj~)Na&(I01b&-T9*fTkS
zHf2pduaOYB01BuY@o>A>@Gu9s%n&$B>9dunQU2UuhJPW#mI~)IOU_1hl@;^jtGscr
zLJA&5hBtrzp&-$e{EMmX*}@fGE9{S6obKfzNiBu_G5bpLxbHdEXZb2%sN(Vi&wRf7
zV#LtX<ugiKm=9e?<yp%w|HJb%PJqFi4VQCG*8{BD8Dnlqt$)ija<LnC4?fLZ2Y~G-
z0X;nj+qjRnx#%at0(<uSX1BcmP#Fe`I}7^MyfJIMrqrmq+|JnFOWRsR{nRk~>Pba5
zFXWeL8l%e&41Mq1Ipl3U=)3r-5f?pEJrGpNZ7{?{KKp$xL5X_=MDhzu!UfFr9&um<
zO2UCn{2fe4HF;uTB9ruG{XHy5Ip`JW=H+@xx}AY|2T}}~oI+~shbKqc#4X#$I3Tw^
zSXKMgEIZ6$TOiv)HayZoR*E1fy1DFnBg$+wE4aj~HGY#?tezM3S&(f|$kJuOgSqIY
zo{u+iXP^bC6oj{E5k;p^2g-1op@Jog_Q#I6f`UzQZISaufM!}WvFo0^cdKp?k+alo
zVYB>G6!l;Ej8=oog*CBos))W>M!WtZV9RTL#Y5`MUc<Oi=oaBhxz<yh@dJ#(8m;oX
zJO+lkhaJ!GJkONBJc$n>FYK`7JyV_5qEz}7)h+Wols}sk5^C*fX4z*L4E3E)-0%IS
zmSSZ1-f0K1Rq(sWB={v78m5sUv!_)+<2=dt2c1&NJq63Y*x{eR10ony8FN&E1hZZ@
z(kLi=$|S#r(sr#;O{~z#vXJ$(lowfK`Rx_(VLCa&q9vIcUzG#Mg9IZ>p6@M{k)?_w
zUEr;A2_LH-c$}AvuW_@@y`DU`8r$3-KWCQNH1M=(BZr6_fY<!ua5e61049O4gwXv&
z>$zjV^`jsz)22uWdS`hi;G{_~#y{|Uw<1z*l%~2y`SxsnrV)!+B-f9b3JtVE**nK>
z18!sw<2Oe2R=f=O3(PmhE9z}DMP5u4W-i*byioKdxjLXVCbv+r%LFg1FAU-SNS(Q&
zhRE2%N?4ZT{Z-&6mj-gNwd#jsN;vCFX)Q(&!$gEH&=y~uOdh}=1*lj5v!Bzb<@w1(
zaf>5EV^T$4JHL#l;6+Q)mIJl2z-TC^gCo)rS2RrLTQP`<&uNYmA(wUIog3mpEi7jF
z+muzRf}dRVN!C={Wt{F3%5F8+gr-+k_2eMZufEn7-+JRYn1HlLF3QbXG$)-GCb4ET
z)lZ{(H!DShsQJD}E49vF1$2TI2QnYW&fgYuY_OFcRXqB0ynY^h%(*ttl<+a=GC+=T
z0)hAOH>h-Ww^(Z3T#g5(yp8_TOf$0+8um=;!O6Y05v~SRBaG>aZ@w^PrUE`L@{dh)
z)7hTtO7>COS`bswXUw-JeNhSdINF#&WhgQ3CQElXBn2tHGs<>~6B<UmS!5s=1eh}f
z6fyt|(Dwv***3!yub<UqG*c!<To;<t?ExA%btb<aS8^x_DPYcJSnA?ybW8_ayKBl6
ziTc@4cZ+!4>{~IxvUvzZA!j@_qOEkvZcu-GWpD8%F5M1AP0AP%MFX&cdGAyTqSkID
zQPD-jPu=bEh*bW<XwY!N$WFuTd~_2%%l>Fp7SF$gX%7_9tRFMLy+s&{8509NQ0RM?
znW@P=npx0Q!mP!|8XX>|OLo=HXDo8l3|)?x;=|sI5P)QGk<xI^pXTA55P<xFr+1#m
zr5*n8o<-g2nDdZNw`;cVK9~sdf<8eTHZ!!G>B|<?izACRo^J)pJ%TsZ-y$d-oW%M4
z4pu7K&x9)}5Z~Rdc$~$x|0Q#cla>uG$LD_SWU8R!J0$2~aajJ?BAKMXc!|J3m2K0B
zuDt#CD?z&@j&ALL*K3hyp0(&RSNTJt?8q(Ws;n<vj(g2Ab|YmR(w#ZwV$%gP_qfm}
zG%JJSzwQfC5)xy?BeP+9BCCLz356KriV>LsV9fYONTEJ^EJZ6tj8O5Hq0OK0C0V)t
zRJA%tc0f4&X-(g@V}cPBPnjale2uhm%TMy~)Plg==nf|Q%;l{+Cmh04%AC8#bX#gt
z4q=aE6GZ|1J=31xEO?0oZmDY+&GNi|09Sye%BlD{KcJz|Tln>t<YDSE<8LoS3%0$N
z(j3NjXQ?6w&>HTgVv}9v0x@yM_VIoLl*1Xaz58V=S)08V#Tr&;ihEgmD<TxdB$tf(
zWrx4Gy&32(K6J)#DXKZr;}q%uu48}w$zaP>V7h_+Z2Re>DQmyKuabVhG7Ro0?{wV5
zue{F`3m73gEfSqny?ea%hkhb8Y?nHS7{!%5cK_5P<HPPTh0X4ZAV=NO#MPMxJW>x3
zzv4-52nfEjR{FOAG|wq-y`)&}TzonZc9IuX<5ZVpN#HowwT&NJ)&6A~cG8vqQ7=t2
z%U#<4ebZikhu>6H()vkt%7k;VSe3d!7E-dvpj1Q#=^{wfPxllGV<h~<67_G3w>#{>
zsESFeZhPhlj*mD*{gfdZf!{Ht_Yx9o8=lsdWDV7h>lT|B@?y}C&)Y0H|KaDrhx`ms
ze=1e-D)OPa4Ix(G1yL2YE~F=E8JriNXT889+L#qDKuRi^0+9<TRoju1`FYvb{$ebi
zS$P-Up4(o4g(WF*7Su4(tK<S7GrZ#eGNN3BeSv*d@VK`({fSp3gG`!NUaQolQDQr9
z+(WE5SUO_j{ux<Y$#BuJ$bO>RL~Z+qRoytNt`}Lm;|lZ$+^o?uDhfD0S(?_}0HbZ}
zImQ<4E@d?1AI7ov*9=Ub*EH>w6rWOKch<*!d+`l!Ee}FuTz@}ID6?F*WjIM+f6&n5
zT0BtRU^OU~K2vlwn_t+}>>kz~wd4B!&3<YAk1bvEhvm=fA1>HljZ~5k-my#Dw8qN1
z^*?%};GYpQAndetGsY$oLn|2ip)%|nx^(gM?;nFPnU=G}ag-U2evZSab@P}_P=~DF
z){~y~wFMp8$^oyh%yn{W<KBX0?3vGSt)n$uIHf3)1VJz~W;YV6JGVOWAx>KD@;Z;Q
z(S3PsGNFYzMpDE+I_&a~E0-4vu5QJ`q5j<^1Kb7mD>mfGdp{q8HXq$GYT>of1|q(4
z&T39#d3wm_sLL5`HQv;2#luNiF-_5vB0O0PGXuz8<#tSgO1@s=t!iiGhvE+@q`<&x
zFLJc*89s`RoRT*B7VthOk5SVS+095NhWZ;!MD4Bo`qpbzt=l0SBFd8%ZW}KEvt-nz
zS0E^-dB7V9ZMpX6XCV>*kL^$UTuRfA;>s0U>5j^{fC+>Cw{f*zu)({Vo=HmTfGI^Y
zI&?4}I>na$<o93ev(Naue%~w>$7Ie<mDXCq&R8Ej8PKTKQ&ihrsdXOBM(U&vcxHw^
z`mw`>Zhd~UwW!HrYWRS-xgKNnlIq=A=Ugr@cz!v;Hsp-Z{-<26-<$1EEJToxM}jFH
zMmKHi_Nq8*9lP`QsHo|1_+$fgekFhIaK?UhJ;v62Mu={5$X4->P3RFQXw~K8r?`G4
z`F$3^lbnHvJ;Cr%i7jI&PV<y+iIMedUd>&C1e1o*9ZF$gNaRbi_(lbX;AcF}w!~j&
z6zyh3CT(wmzy8o%RuZfs=$JA|Tjn=LBuEK@NvRYzvqKIaEAiC6(2)8{98at~!Qh`H
zK?>oU)nHmEO$N;+g%V{h8jaGe-tP1*{*$1fp!Rus#U&?4(|jzg9J?VV0X^uTn06w;
z?=oU=go0HV7q?Oodo@1i0OCMBtl$r<Asqskx}ztKY+WJEVv2n-!(3i`N*#_1CRWtQ
zk%1SO<B-$LPhk+aDh${>%|h<&a$MZ9&DC&PPO&~LGv!w6nj+dV56SuEpFUlHdb7_{
z`NF;V0OpxzD3QA3^<+v--#>@iU)w^|`EBn^RMov7kQ-!~gNu(N7ni*|D6w3JH?&{Z
zqjU46m|Z>NzS-2x2AnZD>ekNLGz_+cB7T^<PS_cI2>juGl5;Trt0nZyAHtW?wFzPP
z_S^6%J2$*L*s4iRt9{1*JwvYGLsrM@x(&Z{+0~yP@rwFJsNxpnPexqRc5ynZQXMRJ
zUf_8?W%(Hu)(n{m(#NBdf!&#wdWPjs#!^A<`}Y&0o5%?-L!&S0<ir>hd}m`$S<}e5
z2)2b}^Za6GtoI2eXCw#Ff@tX4KL7v*V0xJ&2@;EXl|{)4xn}4iOad7Jy^#!OP--*-
zd}dlV3P;aPJ24xFdObK$)#i|D&FV5dn7~g@$JhM0uNoy_u-|ZI{Z1b-D|wuM>~^rX
z>)h`m4d_46!5UK4u1#>}(2>*269gLC#HV+DHy^1toNoB~tsF*Kgcf`R!cc11xYO_P
z&#pn>O>f4i?eGvxbN#)p_VgbGpxDq^@k9G)#M<BBA@B=MN>9)CxYef*Xk;^D10`^>
zA-K5tv*OO4r%lMZf!;c~Kgs9J=PPyt4_oF!wtSA4=Q!7jZO_Wad)zA%8s`@FOFxD^
zS>r%HjeHR|y90o|<(U;~y%gMCoL-Ugo<XFFp<<i~*d=iX)<&=L)E7V@LNs<?6l&H&
z=!W5bxgVFSt7;2$V~!*cg2E|G-O&$kPy?S`5<o%)p3_E~o8j%d*|szSOsuF}Mk!cB
z7#H{(e*6<XNS)Rcm6k)OQs)5GxU9m%#b=}2DX)WRIMc<AZX#_xJ6I$h*oy;NoSfJM
zr1*0s4y*Y#%c}dI$wS!LpTv4}Jmp?mw8N>U<(d=vUy(UzvrrFV9dkT39vo!8S#><D
z6lF^*I~>TClSQ5Pp3uvByv*u7TRcn!%D7Fl9Dscqy8bjw>wQ@(&yniTU)gUxsb8+`
zb377Z)ZO0JTibSt`XiE%6rh1zZL-u@bhgc*_A3_hg&ry<+B-m3v}Ri@=+Ks)=JfIP
z;mY-jI#wVi9ga0R-hn%2tVLf8nm;su%pK?%T(9mO{!5f-GaIO;E#Eykj5+H&JBjlj
zI}e<Gv#$5?^Ea6*{bgd*v7^|=)*LT+6@v41nrTLYd7$`<IpRGjP@-R|jaDT3j#m7<
z#c>TgK_Xz;_vR(B1?p4NkW};$qz90kE`JL@nj-pO_TiOea}m$NAC%x=Vt7?NQ)3NH
z--y}gvUxC_J$*nG=S3bI(&sSb-a|F^n}V}a1)@ydNbN6qa<qX^?fyMUUdc(s`N@tN
zQIetNMp};)Hlp>JN_M*Jb%SJT@}}+H#>_g>9<(sIQZS^sS$%MF(nTj#>;;xtv1pgy
zs^QPq^VyX3{r3B-A)SqzZwV4%3Die(UjiS3@!V$Q*N@4RVt)S)>mu7yQ8H=ibhKxp
zU$5hjkDK#jx;}K@+LJ(RT08NUyMCpD@r>cc%{jX$@0nRoCMzftxzbTq%wZUP89B?=
znSA~;AV;h92rpgdL`%D02iSp+Jfs-D-H`CQ08O0G+vpS(iWuJP*b!!+>+opytH-~3
zQak)QV!C-IX`hsk;TU#kJ2g`bO&RA~<TK;5_jLM2ruSNRP(6eXA5h#Mt2u7f7~pA6
z?!#x)RJ{*XJc1lz`BkW%F`Ehd+~5&^(#dmIAh->vxaTlcnOG?jUiI}hXEKux<p@v&
zw99d1V!_9*-;80B`(NC)E7;+$km3}hXBMjT+4mvn%w>d7@PH{5!_zqj5J3a!s1bJ~
zs>?sb*}3tvxh#B*sJ@Ib9rWHWGfi_jNgf*0rhp}|v@P_}Fh?=-t{vNUV`Qkn({W<I
zb%R?+^tyEE<l&NCC^edZ?F%tDb`9J3`kk-+V7sFwBIr-qfae|5se85`-{vcYA@H<%
zG~1B>fo6aas&n?-n(|%mgc>o3z2;^;t5_^$;C1nW)HcHzPG?ZY9Q2Ct=PSR5nP=;-
zB;O9R#rZzgN>Ek49HxE<psV@9z9`DdjhP|~VY=R-7`wPnyiE~$c{bcuw!PW=AZb5q
zGkDDXWOBNqu4OB=*W*rk(2SAlNbX1H5JMYoBA+^wcU?1D>zL#z&J<Vg*W88hXwp?O
z<le!Vw7%c2FrHGO)V?%mgT2K{Qgk12I($3l(YyWohKlQ@82GYA&&)h^{U=Ja2=PY=
zq8c=qcgvs?#qiaUL7O7dQiP8-tA?vM*ojC}k=C1TVZE$5&oBagJVc|R#{YXhMgyN`
zh>5V^PKmnr-5F9~ley%GyrWgo!?Q}-r^>fHWQ^77ca(PBo_j*!R{n8o(~~y0)A2<9
zfD3K%d3tD3b+99!+B(OMn{dnJAYI~(Fd5%Z{#tX#gcreL#D-v5fWUCri87w`)u>T8
z`=>l}ZJ}<%$hVv7(U<qMhotK79RMRL7?!ff4q&^Ln}8Xf@SKNcfZVj)h{P1bc)>s$
z)2Y+QhTC6!Q?s6I(@gr$G9C|X5;Z4x4JOx=TGnZ$|0K|E>`1?#3(pQV&zz8VNW}9$
z-~Wzk+jZjF>pf$XW`4!W7j=!aQ*Tk7Y>NDm8ely7PA`d~(SALb(UR~)5S#=$)mYAJ
z$zRIn9i+g=Nn5tD4OJB43iMLKsTJ@$U5o<72oZ)p#9OloO9hc)q2dqs4TSWxm8MQf
zTdD__%Z(J}MCfU`vE$GAtv?S-B_$}#s|P&XUJO2WF3v*fq=-BP0<S^jCa|I>^A5b<
zGzpT*TOR=A667M;mJyNLGTR(lUj<0-+)6vWlI>n9is)O>FVIr9VuZc0I&3wpVsdm-
zmJ*$B3;5a>lGaX7G-{=c*?(fpMg<LHBUFy`=ayiH52fs`%$)ju3wjS>)*&+-Q5uWB
z<oPw#V@M_Ss!g`ybnwBYm|Sv!WuX=UOgT_2uS6>bZ=a|+)L`h%PKZa$he5-X4W)x;
zor2u|=4?<6thhC}wo2Jj1<sZ1w;qJGZ-kx59&c{8wCXF^xx0qL-e1aXwccB1j^wxU
zd@Ov2A-Y0S@E?zF>8<8!Y^pEDUkFm8j=#(BHUN_oy%SVs_46e^o9)-Q8a<hIHn$q9
zra<3aHD+?=Yuv<eFEU+XzGnS$@!H$gws0oet%BWRU4b7so#hX&iy$cFzz(6DQ2%h!
zzOMee1#J#zJk*Nvs|MGHUe?HZ@)|FzrfDBtN>uzoW$V&x96Vkg&zOm)5L0xX^h|y_
zPVx7y*z`*W?A{ZUTfGn3iHxG+JM{~5p)vK){c*<_5JsGxM*5JE@Vn)FX=U+*J;wv%
zO@_xSPGgoFrWXI6eHS|^{@fJQImIR2H+}rYeDmP51n4JAv?1t5(zQ1GySN#-b}FE?
z5YLHsA`5UeoE-~aWVE)4PTfwUi3%_qB487e_s@SxU$VRVe%Jr%qJfH_#|-SFG>rz#
z<;V#<-{G&LQMMQKg(Gydb)TQfb+Y`_Z>rRw0v>Y4{b19Kewt{u`APkZ<&M5;&4Yw1
zF-L}3H$?VI9X>!($avi1#@zyI=p1-Y1L(I(@CB!H5bBdQeoI2<1j~A{VRZLytj&*i
zRfIQ)ttgio#=mQ&%0{y@kw7t0QX?M&9H)1Ft?{Kv;5YH+4?yk|Jh3Qpc7FK$5S{jo
z=Npt+ZfMRtJW_F~815ouaLD;XB72sUESCMaTE$|i#s_v80s3&i6R)G`Ruqv)o=Sy6
z0n0ju`IgbRf*s22Z7H49ZHa26j7$nyIrX<SdaG_>Vqcr?Fp#H|%mGya<W40wy1r#6
zj)l++t&KNyntu3$3{4quR1KXzpSdkPuz9LU^w?H;=W@O=tAkN|lQ15}FdI?1xah_0
zcOLVdr}@YOZoWm}<TizPK<{sB1Us;9D7|d37IRI2*%de%$(&@(Gin|++Km59l@kGk
z-wk+qO;3WCfjD*ZSpb?IefCDsiMEHVMJOUBudtphKVg5#lZoyKB}uX0RjcNAcAt7Z
zKIgW5%g)Tem{Qo%ZNIkP<DRb`if7X+ThGNz=)HKC3i%OgY{nWaXGx)Z#ESBSa6A(m
z5j#qk?$2Z9g^)3+Jmu8p)N|(SmX{hV8de;KW_RaGIW!xhx_c<WA}@M3W~k{&3C#}`
zcwgw-i1XoIT`?;cI3*vAQQXZ#RCRruObDLm-;aU^^EJ<?aNObCYjq|+5`WgwsdTo5
zKOb#5+v__1GTPLyNWjuZ-g?EvV7@2!!-4d1*W)Ixq_A{R8af}<Y>oJUp`k4w3A*sl
zb#*e4f?SeZUw_#xH}=U5<!fQ6MRT}*lQMDd_CgRn&l6DuH?kshY4F5HT%Z#8axt`i
zUNeRIEl=+v0%is{N3LIA&6wrfw%YbwR8)OaeR}NY{$g@>Y<bh<{NZ1^1jg8;(>bxU
z-i5AWm3V&xGZrJL*<yng4D6$#qhOG^C@|HxUf;ENSkNdT%yv#O;_=mNoB6W>_mS#*
zwy}<C04(f=GmK~N!P<j&H975}_bb}lGmJ8&2sO<@1oTfO88-cjxs&k1`|=0l>W+PN
zlc81L&Nj~vO82*v8e5Bw>*x`DKl+yeKUfvKGbTJAmv+Z7uqjNuQJjiMe5HyLVN&_p
zS>#j=y*=dI&bj`)`HHy^yoVU+e>K=ip9N!zP91(P5(!QU0lL-}D&1sVo+qOkK4>`)
zY1rwPp958xHLcwD>$^32C$L*^s%Y~fl~r(FW;)m#NB(zF@j&)?PU&p!Jwaevs5tYX
z%x|>X>-*EE8~*ENVS8qCGEZzySIj1ZrgpM{rZLR6x8@LGcQsNyZHDMXvgNEMK2O?#
zxgz3e6sA^XXF29^Bn9s~8q$Hk=pVqu>O{53Wg<r-O3_0*ym$@;^G!!y2T5*lNu(s0
zK4D1s9ND~-M<bWtWdm~GYc~>`we(EG)w%n6FP*+|P3H-cUsA}J0c~Jxzc+-QeyeYE
zRCOSSN!nAyrz@oZuNDi&4Wx|k*(S%O$5WfdaDo0zH(WQ^k`nuUE9gMMjxo0HiC8dp
zb@}KzZ>ACGDAgKt{RYr?b@x!`>v0a+Q(c^s(TEwt=zIq@o?ILrLBttwA)x2$x7Z9x
zo|J4OA2ZQ>1J9+SoZLPaJpL$S^QHMGhZyU?5!d6ju;Zs$O##@`4O)-gLNZ-k-O5ky
zpVF3nw)*riZkSsRg(mwt<FL=s7B>nf!cn2G|KK4?S6+<|C#TH4N7icchB;eV926EU
zr%`zI)-3h2OH{z>knY+?-!C;R<;SUFrUO?$MCWQeJ;zfhnV*XITl($e8`s@Ds1>n0
zN2?#}NYXvsrwAiZv?>vcB%*I(H=hb*i=)nX&ImE3j~@Oy{#mTZqN)1%QYIxN;Opk&
zHqWzHEErGAR;L-KNrYUhAy?K7E@cx^jx|GXZ-9B6<H&O)$}fUoAn%!B7SVy^GT5Sd
z)%%Ve8W<Erk=bHKVQLwije>zGYGyHgm-EZ0EUO0bzeRMHW?WCa;Y3Od+ewk6?fK$J
zi%_WVrr6@}e7lVj-#zQtdJKJK&qiq5jq-tE=5D}=CLzfB3R-x9D(~qYj;tPa+Fd#w
zwIm98-KLe4QgE3Y9ajSs2&HhqCh3KCcq*AevmSO@V&|S@ZZKeNHUyuiHo;LR?i(6|
zOdHGg{JhP@s6#vf`|vLG8adfah3#`lnfqc##vh9HMj3d``H_Ik_;`>0r%@~KN_YwS
zPF0Sk$|C!@P@Ts_5>W@Xo^nhP04>fL>^B4%CR3_WDkC0V8~i5wEQ(jSopWQhDBK%i
zmwKow%kc({FU(vkRr!7v*Fua-6A^zRxJkhPE29uJ^(q8&E8LJ50x6x$(65JV2HE1S
z)1IC%<q+@SQCC{8F4RuCQ<nmEDa|;h0#33>QZO%tTFH^oT4d$^c?jZPXKvKt02GA|
zmmosx9{+kjhMk_A<dlky*C4jGA`J(y;A_!7R<N%eZsj`*^{myQFwYX<bN#vQO`Ib}
zW6*xPSVRymC>1$_5g^OWZ295Qy*EER0Ok`r9$$@4%;pbr3;%e0@L83iIt)<%JWG5W
z{$9nx8#ujKv|&z8m^KF8@OFy1SHi@0Qh5iX$^giufNH=E5CNB3n5F{Fp~vqnTC}>S
z@#&66er?ajZ+k>V^n^60^0M;4OfwJZ&0jA-A+`?SeD7YSAlY0MAMwe91I3nkL49$=
zO``UALoi1{y=#|DOdIvSXnkAt_(F%LS??#?L!Rl1*KuqUuhgl7?HA5c5-5xZ6h73@
zzHZ=B>Hym?dj(Ihr3g>3)tS7~KcLTLA8hu7%AmT9p-bR-&9RxCuDA8fBhL)vxo$j1
zZBf!I1{j4CZrOKU!4w2*3y%@Rd<*$L!Zt7GboCGk@bhi>{z2IJ!OO@1;#sszql(2^
zdBX-3^;cn=t!FDk4>>$;Z6Sg<y81j#>-B}{+DuUd_IpiWnpBhGp<MnPy1ShnumSx2
zO|cUVpTThmT(4L>QZhpc(cQ-J<*{rlM5C!cv(1qQC`ms;Cvhxkn5zY}1=FC14lDy$
zG~`mK=>oFKLQzfQRqp4!35R`Q$7}e0HtRg0NN=dQ^o)d>pGtS7g6{Dzi<Zx?@XtVE
zVrO=}2O+@0OAD%L4>^?C@1&h;0FHwYz3b+ejVT6hsLIh^*Wvqi&HQI5(Ymqob?a+?
zO;|VJZ%{au1Fk1r_0<Tqk8C8STakk8Z2nff)TNb$Jh2K$<dEcB0c52|Ug>7yI>A0E
z?j#mJ@@aA{ENuOFKegq+&#dSq;#R*5OSKJG@xpM!P`4RT>sUzf=!P34XFsbvX?VkF
zb624JI|@$Q#-pyRNyqrmIR}z5A-SOCdFw{%(90GZ+xM?qj9T8mZnj}3KB-{Wt}hVW
z5}W7}4o>D-&4jTrdFnES;vs6w<8^B6sIw}Q%hxZNgV^OCSkDT#D~yCXk~Z*5D=+W9
z=aV!PcS2_s_nzN+RR@Cv(@H=3D-(~6yQN1;8AJAFaJ*a6>Z7}U8NiM)F24DLZ<zp_
zY}M_GR-F~J6@^`vBB(o3nQh`DOu3txD*V<b@SN8z`+jy4Ot_)?qC8`RAzxVvf;NM2
zXIqN~6PZ)vwdpSC16P0GlB#0we!pg9F?Cqq37EUu0mcbbOU!2=Y&-*irVE}~D424y
zjq?o(pQv5gHq_i;8Xa9cRK@d0dlK<ly+wt-NlkJ067~@mEN$>@7T*Qu8x_<D7Bn!a
z@$5}5W(LkcO-skKz}GCkbYk}{!8f->Boxf0190D`2+rAjiHYa8`Rfj!i);4r`a8?9
zUzgJ|S>CX#8?@d7-{NG7qs0>5#2lHP7LC{a=f40NjN~yXKZO)&IZF@1bE1q(YPH!x
z)GDc=+w4Gf+qtjQ>r7bEXFe_M#)&r|g7dR6Xc|+Eq14HsCWXQ7CYm2~E7u<1gdGXe
z>i{n`JIv4v4$X{RttTx+L35nb?8P*S`;`gf9%p|&$k+XUduk>hr-m-4zF%%R`t@-_
ze)_PVz4Vct^0;H>&r0=u(8<KdQ%gZUY+=M1>8mpKkFIk3AT|{Z0Xq$eOZr@i`gb(R
zc;w&6a<-5@2StQZxM!m%F8!c%rNu-_usbs#?*cobIZp?7)468u7?5&#fhn_3;MDH;
zf<|!y@;<J2U=hoqg*C8QFi>Gi8A|`Q%r!>yZL-$wpx|l3*tb6UIE$y#r(pr-+^dsy
zJqvQEGgal73Uy4w6^6D%3yHQQTZ)5YM%m5$-q}OF&xWi4{+6?ph(b*u6iRmqyV-UN
zA8->=kyB}4oi^PhR@y&<424@6oe2sPgyW|xfegP9jVp|8+UED|2-B70i9Y%~R9Yt*
z>09Gj6rE3_H%e?orp*9-HvKE)YTJl1%Fr9RJqBuMTR@J&{zty?u+!c?Y2R5s`DNmq
z>jcetcNVp(?#llK`9KE0s%Hs6M-fi-N!=U`4thk3XiMoLE{&(ZvF^=Gm9o)Z*-(fO
zLvbaH@}m&&Fq@`*3lH&WyCuBrz(=2J#=+;^PD8iT)oNca!-p2E9a<78$LlxgC_`^=
zul?W$Kd`_0tG|kI=(VqXt^M*Z|8gW_`HtlTDFE<@4}tymZ~u10iwEqv`2FAi{W#gf
zvL%y>-}}AatEQ>NAJH-1{N`$Wyp?*Kl!uOuz@qaqNYH_KS3@7pV$#tyNC2N*w9HB_
zzx?u;t!U|?)y_kD4cyQRYz8j|IV=nO#<IW+p;gNQd3ogYUI^%IvEI)vZPpy2qZ#;u
zA(-+B2L`a&+5PatE9~NnF0ca=H`<|RcG|)vGi>$qTVvOMi0Kf@n6O=_`D{1fVij;<
zc*K<jk8nX*F_HTWICl7Z7=k^7H^R~L5{?z15VEYdlQsUttUKrfM+j5|yvM<lNP{AR
z!dMAQcojGfp=<@U0j8R+@+;uPyXVJ+@GoAgaW2^7c0h;4eFg!8<o<BXk>!gnmOs3#
z1rNQ5t}IN17as3S@N)S52)=Q0+u;dz{=Av?(4$-I-p8J`x14mmt#@9dh<1CMk%>GN
zfb<dd3E-=qg5MQV$HEgGwQxuMB@v`e6?=pw$|^HhlvsV*7lFr;0qIGD6#)x@HWU&L
zFT@W}5T@|KggNOZ`w5oqJ9gP4tDdyWFFG&U8kT@c3rMJL?v=8l0MKuE8aeR=0gu8?
z*>!s$v?0Xt<tX=!H{KYVD+!Z=bH*8G*tfs^?HFSyGwlUOUg3y<4)~K5fMc|hY5Y7i
zSZ3om3F)*G3`g-8Fo2Dc!L4T;t6{LE6M`FIJ=ow;iw@CAUIuC$!k<Bm^>}Vv!@5Aa
zW`uK!quh}x1GF*-XtN@hf3W5cAHmV648#KypC@&|1}^d1q<Gn7m&T0kyFb3)Z1)kL
zvHHwy_fG5f`|bI{U-2Owhr`SlA<L|o**>5dO_}wR&SCqtMjl@oVqHP#^UHf85ka2D
z1B$esk|03z0XL<S-__Z=Cvf)?D?kw(e;uvEso<%EJ2mkyemY)aUAz{tC?moLP5zML
z2ueYe$-^j9SQ%LIaDneRCv+kF{Z0T`9~{nvNiGOn0VqB)1P?IvJ2kEdZ1*oA_MAG?
zzHs+y`#;|FO1reTHw4n|-GzyK_3Gk?9fLmtkcZv%v|ZX5b)>c-diA>~$msk9w`dU^
zjWDM)QVtXX<wt<gAAv(#Mxe?<z=;{skp^Cn4&{LMN_yHZ@$wtdLiJAadilxSKlkr6
zOm)8Y_>6n6*A1<%JqK?3Hu#%y(8og=cyRm%js(2m@VgwQIK~+sIq)~?|4KLB{Hy>R
zV>L=crIRxV&|qo!>J$h#-f@HvhXWpP!9_=9R?dbRZux50<X?*};PbF)2La7j0{Mn}
zGs3wH51N66;~kzM0PuA`0hSA9N6joL2)=l&-}t~}Ab~e5Cy)k<MUT&BIIgy0^?KjJ
z?q37kvBlR-5usi$Zq8=5T<il-J$W4a0SI?N=eF-cSW>@^!wV8<LpX-e_40XGzZJlL
zGZ1@PmKYG;N0QBd)nx!+$vNy$0ua6x_=+kt38%&RD6@PtDbn%c{^4pK%0QWT)b=P(
zvD2R!;v{DI#DkOBy~B-7jj_Dp!Tt~RB9tIk3Or?$1wIrO4-|kt+w5(B+4LS;arblf
zwOjA9-~CUo^BL|`p9#*0_ClM~20GTv*(65$Wu}PmQ@@d5Q2U@B)o0{ac`?i{wc1fD
zD9A4wp<mVk!f5xDleV4S#0M|fOGDqHFX$FSED5A1FYOp!0S~xx#PPe^Z?SydziILJ
zpZ%JB`eT3Q?TfZVI<;ZtXx;}leK5t>C^zYOs4tv;^9~Fi^?kxfm;NRUr~JGFqgEQ?
z#=FZ3z%f+A3|{Iibkwq=5!0}n?M&tT<=?<!Pyi=0TLuxX^JOibS%2F3%6Xgp4Xxl{
z&_;kVb7^KhelYmZ25ic}Y%u)_Ap#l!K)FcQtP@P72R>#?u6~7ft_sGy2+l*5S=u9O
z{7Bcq-Ojrm*5jMphzp!vSN`I{6%#x|s1KwL&!t@mM{|>T48aIW1g0wnY>Z>r>*}5n
zFv9##JBTtA12F<StO|e%9+0C_MqtOrzog748y@zOHUll-6ixnqn4^i}7rtig4xzx|
zc^rRycfhaprL36u<I}_7b_Hi*kKgz18vxnxcf`MS5T_slhbx(E3?v<U1>DLp!7l^?
ze=HNY1>=b0o9YU~20u%==*(I6!EgNBe&xE0{aD}$@x28UEa+giivYyBLmg`a9mnfv
z2ka|hgB?PUhnXZo5nQ!|Ulebc{)o@`)K9d9A;2!@R~ZD7ae|qD`IE=#hC-A8e$x->
z*YJX|M6mFw%ZYv(<);6*J$Q7Yj{yf=Q5X(~lHaNTu9dzPKWn9f=6d17QJ!N7Mpgih
zf!d-2(n%PcXymnoKWPZZ=NXDqeg;N_8?%0dJ`XscWvEU#5X5vweP{>ak89QBUymLH
zGy^{~f4)M<lbTTL<*OAA@5Nu<87SBKCM=dju!^u`fxzKf+o|1C{1d?2?K^8W+VshL
zZNJN5pSO7ZA%qK8o$)g}#yQNw^iP_Q;FV<oe|Lod;Yc0~2otBxcEtdbI#z?C&SUl*
zG**N@!rf`<m;c^Aurj0$oFQ}v)+q{!>InFuRJc+T@8nS%^QZ!TVT~Z_unT{Leazg!
zMW74si5%~nGObV$JYSgj$?W*!*rUIPBILhAdGHN@<P8f1Fo?%<+|wud%QHLdyFY!%
z-t&&@-8xXjgO2*;!O>n3e(d$2ZrHm*JEje@xs-IY5oW;%N|XocmHH(PI;wndc!Up!
z117L|QuxCGJEalWvJe29vay*E9#B5^59kg8ECej^kp=~d2YyWSrK#!DyKVK_r|rM|
z)*sj>|LJ3)EF5x$G{w~}?a&9#>_(phSpgV*p0|XN0f>Ps21FN>jXInZ0Mhb~-za@8
zLX?;M?4af0vJ#HqaSu1NAh0#S)`Bru7=scpz`>d=NBmgp2M_QSKNy^P@o5>bT%g3T
z(XY|PLG@q}E*|0wHa?0GGbg?ifB>oPg>d19@Y}p)o7rt^?4-;4>}fyx-V>Yk81TJb
zVg()Gn!)p~`gNg=V09kBix4>oiir2HgM01JE*GfN=7e=%qT`A+bWhK$-HCKqfb+CR
zD$0a|3qTIXf%2>0Ioz@<i@9RY0(t=Ls*)dE4o`v}lYa<n_8WjF#pN3SCrq8`#r1Vp
z_A$h=gJX5!5;p+)D>$OG!1RY>l<)w>h6|6l6o~y396|G|l&t6EY4+DQ-)C>Q_Ch=L
zB>!Tde|3>K)fZ)=o@lFTv-k)<X1&xGiY5Y(dgM4L{KjI0g@8KM{sF-P9%<xKTnZb^
zS37Da?_jWN#iw}055jmTE9K-PJ!~pO;Im5}OB~1LC@VbRc<fidd6R?RWXn%q>S0{R
zT_}?&Kfs^jt3^*a44S}^-RN^5D*&U<^QJMP86yzsPzcL<P92WHjm}7iYLvbn9P-KJ
zk8oplaOIU(hM9CYX8qt54&Y%nhhC21KraI|zg3=kbT$enKV@KGXEuYQ3|KP=YZo?f
zd9YZ(m!JRKH$u4(sde{(SC_j!TepEvp{{jn_7XRP`(Wwj>kxcMcL`muvq+6n;K6-I
zeD)lI6=7b?Vnf`xP({#^FBE}3-#E3)uNY=$eUA&)*kSLGD$0PX3;u(4gR;gz4N?BM
zAT?&_1(6<xLs)~)1$sDUzvL-kM7+PVToD@>Bdy1IHfG*z^y|Yyfl#MB{Pwhb!LVzJ
zAM<me&rv?b6=in;<g^v^RxBft*Wm^o9`C=ST+TO922%77jsz~9G1YgYZ?QXm{;0k4
z&94Yryov@Cjr!I@{n7qtdpv|A01<lHQLp_2D2<%Cr@pmbj*qo~FdVSbHrq<uf|iu7
z05}SNgcDTIz!*ar@acoJY5IyjdCajo6r~^Ed5``12R~y|E`Gi3^V2`iJ-6N_dq11*
z{7d-(Pk6)&`HhN3dhp6MlSgT@KQs<x1z>3W8<E^}Oa#AXF3og^bU+~=La<p!UK|}?
zJMb}?a@{Vt5uB-6v{|@mI^kddMKGgaFv|i58<UuI;F>9G3Rmwbe9C}T0a(D7QiFoj
zjuUKVDOg1Ax#yl(??-Xb;F-b(FSFz)RzL0NcKZ#0zC+oCWe7He8-iTUd0#9Act8YG
z7YJlBRGh!ZFBhs&RVV>omarsv*r5YHA7+N_!WMyB2w2Zcx~dTO-<~-(4F>EA9#Dv;
z(kTI=3P;+ajTF3a0gr;<xS6wifePX8a3)OjD|{#Q6!iF-zVL<e0x$}IfM8bddmC8S
zFM<neLX@4Q5_l9!jhCD}&>V#?=wO-UAQv2+IK%$+_J{3t*Is1v=FISZQ!JAO&>DZJ
z8`TwMq5gPq#3AHa3c%u(S_jxnh|)k?l0tyuKzjyWstgD%@aj2m63+u2jndYGF&yF0
z4*sHEi{%rP0l7JIX8Y@Lw({{c?(erZoW0miJZV9EkAuAxlpo>Eashl{3}LJgZ{dqv
ztum61u?{N)j^DM?35NV;(ve247R+W~DF1l%vjT9G)hq*p%;2rg;?>Ag4ScZT(Fqs?
zz=a8P&6+h~iq{Mv?IarL27_6S7Y=3$Yb$H!9KmCkGI1J!%lVr9ox*2HfieJ}HC}iL
zUTAN}0UO@HTg;@)h%otMMn*v3#CP6{^j&@eV81=({512O=dKJO3J};h?E@g~1O%!3
zjwIqdmITF935l?HC){6D7SGE%dYIr{3Fz~ybdOBj;lkGiFPraNz{b9S$m=*NaJzhf
zKH|cx0LCC$8FVYw1Afc;5LrPQCi@~S0v`(l!ke-Oo*v(rh!Vmh1;@)3^kFe@7im3x
zEFpNB5DsBYs0a!8;E-eEpwr(kI*RZBLK<+ozs-gHd2_nm0<gh;@z@%>{$&?Ll16`m
z=kTH8(z+vvwuk^EoOXjET<ZcJ?j9g50ubSs?jHaSW#a*#T#AdhcH5-j4VNz(Q+l8k
z8fp7Kz58DKZ-4nwv$HR;<x5Vn3(r3<%8n%rC4l7uDM$!^gt_<ue*{ZB5#5A?n+uKT
zPt7>hpLnU?wel)WIZk;Luixc#o$xy=07p?Rr>4nPg+Ilh1JkJKXoEQ#J&q2GAmbJ-
zgeB&3W;VRzfTs=ud|<2NM1n#7kRBWimf&E$Rcq$pWky9=o1wWDsuaF1CjvfYK-mzz
z@@v7U7be)$3l;@76QZ;*SXI~iOPR^j<KL}c`^<Cp&tLzEO+LpJ1)o_XFcEId$`P=v
zcZVY2{Ph=)=@&{x8h%%V9cknue}7!0tqK`W?{NDL`G<x`LRvSU$2G(8?pS;sJ!&3-
z9(C*KJU?M<_TsHj$N~UB;y-DFPLvB~<_=FP6-q`N=_3mcdMYr}a?y@r`NG>nq)}RE
zLpdU!w}n0z{%W70KvdI#pny%#0avVC#G1{VWdHV~6?WA{XT}#7u@2N~qk>s04tO}?
z6i;0iyeRq>vs;!7IG&1<fB@8ad)CWw%!Uzu+CP925#{!>0T0J--cwu=F-CtxpUN#7
zC;6`boxAqfXTSJ$v!&;|0pPHmar$XNKYc>{pg+JT`h-qg(D&dGRx@oVgg1-@ILa?R
zpa3jhyg0%MOX-Nf${)pn3*OVmNvrTQE#nvXiQ{3M;Yrg%EA%n`q~R%?@&29_fTN&>
zX?!$T8n%RZJ7j4<(cn@Kh8jB^LLb3llh4H$UmP6@!JN(p1PA!(AXcxVk{%p5W&m_>
z*83S4b$Jlw6}|YRSDwf?$RDLG!)Hw%+6A9<jp(d}rVRVO`|b;qKN|_LDrkmSD{oO2
zvwQA+!0g#OtoM{R+6#LMOM%i?F9Gs$hr`#q6oI(#rw~YeYz`<AGWmArWeUNZidVvc
zgI{q7Rlf^>ulEIf7qlTz0awMRv@ZBPEb0V-&e21JE6%-wCyv+S2ILiKo+$46Cw-L1
zmC=f&B6v~}ehP~;l{QkKal%nxfnUYz)+x&4@Mw2@Es!iC9d+e}eFyFA6Q|i{e(<Ed
z`_1d^!n6D<lz!wQWdkIYo$9z8p8gJA2$@IFKY4JpF$551Z&}N*4zZb#K1Dl0_@fAD
zH$4IqVIv_}w3VXW^F9_1yz0}GN&SVe$$nJxr}x}%pZoOxW54v4->}>N?Iv4x+TyCf
zO!)yEgegLvv4l8;G<_eQaKeBz>2d+e6#bXJijPbAoAN_+tz5a%xc{Huy}iBmM}PE3
z!DH1YI8aVL^q~)hC6H4;?B)2a-})`P_S$QM29_H?{_&3oP1Mn!{K=o#(xpo)ZA@xy
zihcaPX9Zx4>y`#j2cQABGwT=m>L6){R+!S@)3Isvbasxft#wVE?{Ra6osL5u;>r%V
z8X(GX<==t<9^JZE{$2p^NQX<`%dl$wR=F5R>D&woe8I2R2L#H1fQFZ?!UdOLqZ~1a
z3pN7@x2kEdtVI`bQ~a`&t=qQSSH69lC%G_0*+CCyr+&Ct20)6*<nN)*3tVyG4lgcb
zD!eK73wU~1MZLr=fW^<)K<9bGlHl+uZ#5qPnOOrGSn(9X3-y6G31clykOnkl5+5o*
z3)tintcWPm7W5a&12g%^>-?7^i_(M!0}r6BAkKY$bKNf&06BG@0ot5x4h!D7sHdEs
zGU)TLE)U}(BCG(RBpf(scii{5oqtAer1PY8UGS-HZ2gEdA7!MS$<a3u^yvqDuy(QE
zg*HlB+7wCyGiLfc?N_r>+72+~Qd|lfDPMSnli&2+?K^hbKY!_)W=pQI^_#cZyi2aI
z6BiV-ecGJTi>|UlpbeHNNR*!$HwpoJbXZP<FZk>Sy7baZ1B{d-Xb^n8lmG2+f4lwN
z-~FAv@|CZ&4}bW>_7DH?5B8^j`lpds>G;839TWvD3ruJPi@3k}o4<)V_~=JJYPZ~S
zi@obz@3QZG?|Y$yP!F}nKk|)7mlc39tW`RU8s|`izluO3r}5H=(+012ItT*}I5`(j
zr%X-$<f)er7|>V`wr~(nC&s5^GXV05U<O9;;!+&YB*lXF!5nGCs}(C&#B7VfgR|ac
z_~3z#W}RS@57-P&z=rm8um@gRh7>-utz7kl-SovT+OjLIv8}!rpg5!MsVJYr_CN8j
zn$N>ymLBmEj3nb7hrq_u?+C02V+4l__AW0cN(M<KT*(8>s`CJZF9O&7zBp%J<f$f!
zmm-hqBSde|ox-Z*07vRXW3{YAIzA6CK4%OJ=x`sH0n#y`z<Ar`lmtP-jn65As7q$w
zo{stbB<C|0hi(V);(osq{=}*FzkagH-toFCY{A@Fw%@;}P~|%q9hYF%t2<BDl1t$L
zn>K<^I07`?KaegLu-69VfWF9{6|4lzj8kQRb}M=zD#g_jkxe8P9^n8tv-$7e@l*Ti
zXZ}CC^wn3|y^s307hZLd&G&wG*vqN3Dfn_&Ss{>u1blb}f3U8xf8Zk@`AB@^hne_w
z*Ij45y}eallAeCWkzPIuhVaL|``z!h>#x7w-uvG7Mjr*Agg5+0frqD99w`rwKK<|i
z{_n<@197i^{p;;BpZSbE^2j4$0gb>8`jZuaF|2Gl3=Ov2aoQybI6QQi)M-$nGoXDV
zm^t>ut#zbFaH`{^`O5F4p_6IANaGcTDUZPolRtwRCV3QsdTZvu0Iu?<Fw=L%1B<~F
z_`Cz3a@B%Q*_1DZRqLH}z=Id;Re+`xHaHsb$g@ZMK^Htf{K?&pVuwxZnPO{q?YABu
zj0(XEW|vsu6+QwFI;$>uPeMI<;4jZq$>M&zIsv{b9~Uy!XUI;`K-fi$hgb4OLaYtI
zs5}5Dxv&d(+5vQuCTQ^3Qe0RT2&}pS_5`Nt=pKrz<cqHsI;^-_wgR&NS|~ilnm#uS
z7T-(oIhvn7a9DnfklX(bIh;k4C)i_8_>GJ!pSCx=+$}F|5h&>E$0ximDMyvgp}hcC
zu<3*J6@@Vyra!QM0EhLB*)Ycr2fMEyx`8M9$uB&-55)mTaB1J*p{;M;veiEQxv$#1
zD=xGH`}dixf5EQ0_+0Du{Vhl8C;(+TQX0XLDoEh!?d=WA9s6q#>TKM-`R1EruMKf!
z$9@Z}2{_;(*ja91yg^BqBMyrlAN+xaZ+zn$@$i{+_>Hww3IXw?1x6>jtN@H*t<(6@
zPBIiyX!O8BQ)b|iqeG(&6UU&#?&Wsw*(X16$}?f`2sZ+hFX5r7b40G!J2A9imTx3H
zfej2?`W^xPNAMY#SWCoufoV_2@Gz&Q@CA$flpmP&oYJDWNI7DF&_{?;*tKZuhwBQ2
zo3PhEztKMS<y*~`zuaEjwa=y?^e7vT4|?H(9v7UP0EqVrePue+-|2e+q#w1vy`+T9
z^ikHx$LyLVhYF9Y83QJKuh0h+&ETtvcqyQ(9tK2tTzriXFeFocVDmW6;J=6Kn*!DR
zP9tCa<AR@}9t1=w`0eBa7lnpdzrPk>i#V(pT>f)_z!?+mJ9n?NS6p^hSP05>Uz>g~
z48NnyYIopOzu{eb0r<o(?F^x${R8QJ{ptPz+9!RR{PZcxrt%0+yQKgY4miN1pa0~p
zpV=*6`MzE7%1i8spDr=mWv4G)VpBYB5B(W_x8Ovhd~l*{)4zBS>};_8o!|MLfU2^k
zDAI8aID#c#<$!*c1>hNRto^_9o$riE17ir|QR={zjvr~<cz@3dz&NU58asnTd9Y!|
zjL@V54dtGFI*B@A+UXGdbRwDLFS+EBFv-)gaY|c{)xafxtxm7bNx8T#7zGA1Aez5S
z{<S(UIC;ol&j}y&V~WKzn_lTF+=9`l9Ff*(eE89ow)4JQ?A&YLWGgrCvT2hTv}KX0
z44O_-@eKeMjy{|1cj=&-zI%$&&;C3$-n`si_(I@^BH+CBujvuSzrx{{cmuCUulzMY
zi)WFh8ehPoGy`}9ynvOI(UW$$G7y8c$6?MNK!x&Dl@hmNa8)u2gySoyPu@uH!sMVU
z2#aP<wNL+it-W{si+1+XMYh{FUry9l6e%rff#G<o<%p1x{WE|kj*7C=k7(0Uj+xD-
z_w_T|W%C?+Xh@4NOBDhH8E^$lWuqize1nw^7)74qgFu@$Z?S*;>{rarUFNfYzbSE#
zD*&gSZKs~HDDbrkzaD<Uq5tWfu?xQFcbb=3yZSugSp86(NRNXjIK~5%EMTDwyyi8p
zu@8LU17ZFr4ZMx^7pV(4weVG<$F-OA%yC`Ljwu|CUkU(&25Vk--g##X9z(eij{yeQ
z>Nx!70S7afyY9LxIwWRut@qQZn(0`;6}+^f2%`ft0N#7=y|Lbp0;6MigTW^r7`2>W
zlZNF423q*T>|cXSnRdae1(Pz6@75pu#L@NIZa<?wG4NNg`+MP>o>U>IhI{Nw-jxSm
z@1=ABG>NPJR=GVc;QDXY*M&R96?ymroC<wazLjD33({&fWyP=NOJZdesKkph3H5Rw
zVGKU|;ErMu?}A;!Kj23#=XCpgX45{q@6k1JTOhO4Dy|@{_*Jgggwj!YRXf7hJAD>~
z86Sa28)u1*i>%nB%UuKrJQP*hD`s=rET(H<AlS%9JH){+J&g)QUMT>4))#Bao&LGs
zAAIXy?dskI_WafzcGwlLv(H>;^X4Asd`&sp2=d@@;6?!gKT9My=qEgN`rX^xYoGk&
zC&N-lJo^XW6MYs%fMdEijem?CIM(<102E6wSY1#gm=NKxSbpkLp9*VY1a#P+YylX<
z3a4T7#NalJbO4i%gejf|j>(@HGo1}w8YEgpNj^HO1`Gxr)-Rd)FIloAj?y5k8OWN&
zk<ueR$Ok@~2pRC`=&TXq5VXy}uf^LGHoJ;h@7G=hW)h8H1Fx2YAIvzOTC>(Z`RQBj
z#0!_%hTZ$E+Xqx1hAUaf>+hbnY9>#G`ha(`cv_AER&s|#<DE7mt3idHg05=HNYk%m
zL?%HgU^0mN%y?jFD)cF3C<?VU2s(IA10Pd<#O&Xf8Zh%0WC?oVr_e(QaiySoVNjG(
z>7(QUd!i5EtoK(PMXV+(eu$34L9d7T#Xz^+x7yxx-6b|{s(;DS_hcxG($;DhRUD)A
z0xfA9PI*atq8|~)L;K{oA%c!RjKZz^2ef~Hc&rFm322MJ1_uxD@a0lmgb|OM;tIqj
zU;F>NfBur$8LzOt&b#A!+&cEiW_$bpb*0UA9MCmf4jSoil!-Fdb1DnBGv0XPjd7&!
zEpK^CoC5gT*S;3nZoBO^d+S@@8s%hD;a~stUx&35`0shodt#TraBvFXh8u3MKl`&k
zv#)&REAg;510I!mQ;(+c5}p-+@lums4jM2sas(Tf;xJG!V@A7f)*N4p9qJTxL<Rr^
zCAc^u#{k6ncy{!*t6->uGZ66(K4vqR`!W0Tks#IwTZIoC4Q`Z6gCIO&w!kcE$&w`l
zz7PohT26xlzThaG;_iLGXBAuTvKeQ*!M5z&R|s#Xq3XDNCv|CDBok~Oh<t6k7_$9u
z92sPg_V?JZ*FTuYEZWWcMIAYifSJUDk1);r5r^RNdgMA`)h)?h;>Z)TcQPc-;Sr*8
zLsxO~faEFvz~wj?SUtRIeV`mH?AM|dxZV!pONRsYEd)j|1#UnTL=-+(CG0$S*p|8i
z@Z|^B+wVWK!7g<Zwj%;!lv9GRTz{Tm)LlRCUW%GJQN55(Gg`uEpIQn~pP;|9TtHu>
z&!9Lf4aW{?@9JapU-3bhi{F0p^5jW&*U#^_pWOT<yYltFZudR?yq!I7hS}cdt@qT0
zu2^&zv_!orcl$o&VO&B;Lvu6kUGI9AvBXIKWhXzMe8zG47h@g{Va~|_Hd(?yc!KhP
z<2Pwh7}yhqGQhH*ocONa<K52+z&NO38Z{1+7oC|iu*~||^@^jD4Ygql2f`myBqn+A
zVe)5y5sr57C=Y`Z<~amCClwgfbZcO%@PWw#-T3eYdZ8b4Am-BxF1R2p3k*Ow(Gfpt
z{87AMk_5P%{olQ3uYLR0+dbr5+jHQsO$K+rxI(HG5EFAaBCT*0!5`J=f08sI<s4Sw
z@bz8^0<Is2i>wG+pM`rt6j_2*36|vaQv-c2WCsdurEwoPmA2Invp+tt5Kp=)&Lp3Q
zkp|j|_M=~fNA7-Lq@vuF`r>3jO>=VvUu-;d_yO0IBGB(Sbl4X3c=-?a*-f|KYm*P}
zwyD!*`1$<eYFg@^hjxHarS7Ox)oVh`h`&O2J!LaRUZ#zFQV2*#d02A8v`u;Fo7z8s
z;3JIwDTRPpFv1qNn#}{R$4{(m*}BdC_m}?7?DT7G+wQ$~{8ZoD<0miNsBUL2JFVDk
z>F}DNJ1s-KcW3}Ndz1!)Nm|N|&$y<sE$ZJ9aH@$1NBLMfNuBZvPH%6o@yID&zdOax
z3cyiN(?e-|<Yf?`VL$fRV<FtwS<Lz&g8`1Xp*V2huq2>0^I2fDY%bp+4&gjl2X2I4
zosy1=)dJ;$nG4DVWkW!>1D`z5$Ad+I0Z&&R0~=)n9yJSFbOMh&z{8*9JK2By(|hbo
zpZO0r|1Yw=E{F=Tm-4;f;Q&5RFD}4k68DPo=rj+hOTSA$i#!#mEE!bw3>Sn>XOX`K
z?#SZ6DtY3%-~q`}M1u3uwgfMrVSqsQu&@ZEicBS4@Fgr75HJ8?|8Gt$3@CR${wR~<
z0~|*HxeSPZP|M%_E}(4sB>VAGTkY+yneHDx-0zz^pYnE8SSC;|nK9FrX%o;-J>o}M
z9L`Yti-TzBgsWXVX+B`33IW2NzR5#)x?I317t+)2wOoJ?yq=z^cIQ3!*{xr?$<Dj(
zGF!iGxB11n&SV!6v*x&>e0<TA8kVf}NO%vO&GbQ)n>;vvD<6suWn;X+DGl#wI{j99
zerx>Xz0+J)0LHk+)u>w~gT{_y5YxGMIyQoz#?M0<b&yu!D?J^B850h(8w1MamtP)x
z3K0B5#X){}01j>jW7fhEza>kSL>U;c+bsidp`Q+pH36#vOBGysjB8bK5gpJBTzC%*
zOpjmx_AL(4KNagjX(tK*<#Oc!v7|DBvwswIAr?ylNNg8hPOIDi;nyD=(F5X?q3Tx>
zv)Rw-3!#ohAYFP$@D$m|<21Sz;U!U3xFm5pBf&s|B6ragtEnk3yYEo~4napXh8KQS
zLOk67q`@;6{$5|eD3&{l%z>Jq;%EG&4gyvM4%-EH9y(%YOq*msTD#ppjMQW2oq4+7
zXTLj?NZK!ZVpzJOZE!IS0+97{1SestTl}D*f{(c2bkMH$G*q}LZS;EtDq%b*(#-xj
zW=C6<Lcrb{ctIPcWV_sg^)FxfmMyyaQroz7r_G-3_ZIk)z(Y@O^fUU)?fCig9Bv_a
zhf1&VL!Sh7E4+}97aaU@{#t1orBCBJ={+j|$6#$UFwuGFybNk=!ecFrwQ>X~n!W~r
zp%jlnidi|cWOmXbM44HzA+K4pc_aLkgN}>v$85=tX7&?s9E!mTSMJ!2;MIy_Ab>{*
zY1aDSQ#*SJfDKQf8NM)sgwKyUU%IDGvmJY&cb`pmd`H~0&Js5c!76!IgjD`=d!NIL
z$nXcmCO-xfJjADBQCWRP<fd@U>d;*zL0}eUcBLwO!aThb8fgivMH6Vs<d81lqaYNs
zY+%F}5ZRkhASnJi@4Q0$ainpT_qY^@bnXh!z~VY!C(#4AEGSM_IJMD!e1;lofX9)3
zpQr8N=--yTcIPjiv@@5S6m1AIIrYYFahAteA4UkW6okNJ;~WYBPiiJ7kCbWkyDE<=
zG`H#jCuk1^vDMs-(?gr+0|#Zp5k_B-(nB15Y?t%nML#L@?OVTZ-}>6m?CdK~wY7e;
zqIK=JS-vb_Pwudb-m%<{b45uGo}_pb-i{9}#+ae@^LB9OJict4nQ&}zfDWS>xi;ak
zmaR{gQW^~~?Us!EI0i-C)y=G&YdTqG&<vwp_~cW^N4elg4pxj;yy6uxTWQziFM6P#
z0R<rqz4%xyQpG}pr1CWK#ShYuj{ys71r~)%eM8c=oqO!_|MVX=_q>I+&u2*?^gLaW
z4`r<ojQ&!fVMea^2&sg><h@8<_iw-`>Z-_Ef<wHw7nT@`kN8yRNCA?DY=CjTg%Tj`
zB~9n=l$|oAydzr?j!@{&5NRsys6UjqBoRMJ6a2w<;g!ZmTp`ek+W?EDE(9{S0-{WG
zxlqHQXL0PW$BVet_h8rwJ@(!E*Vx;xzs#minPdlD5Tf|u=s&EPtIyE(P;OBQ(y#g<
zESWu*mkVgSaw#tq*2q_SV2rfT6E4w1-AN&U*Wf=f?$bD6|MYJ^u-;1+RTcYLGp4$=
zV6SK0ZpY7^?fr94w0qh<aH#{zpuSPYC7gr#z`#l&UbG8mFg!WFHV3i-P@6BK7TQXK
zr?X+IWEP{VaM|UL=^vp<=fo<Y8A~g$RbX(?8JR8U;vHR%!&kJ3SEpB=R($1^a)I>%
zivo@~Xm5vZ(ZGNL9~jgzaWZ)GEduz`u3`Z_&<0<4I9|DV^X5=05F+3C{#PCS!*<GX
zZ?Z>U+)>>A7X|7U6hR--5>q<TR!TucB!5csk%7A3rSJWr{cmYJI1rA3G;STNe#s9S
zs&G9IN#ugQTEjvCJN`N^iu@=8g#a#;9AXs*z{-0HpCpxJq$nJU2OwQ~`*waloP@w9
zOznkqya#@7n|-lQpa>&f+)U?@Y=YeD+Q2g32k`9`FWA%TH`#fo`w0@a9LTAi@teA(
z{~(0eRUflr-y?uUiaMqZ(HDrvihx2uAEX`7*3{PoyP38NejZ@pkEToMW0NKHuXyw^
z+rHusJJWamKfLBSo8i0tso#Bm4A4$q=;fXrU)E#!faMZdO6XhgM*LK{C{K!~QTf0R
zPl1!&=yRY}Pal0MGRB4+pkvXY5NsUv!5rV)+Z)H%h(oZc!>L0Oj#J)-2r7}JMNl#b
zYO`E=j8Fp>10Q(sg|}Jwz#uQT?yX+EI&Sqth~9J0Jwp=yDldaGvp9D8!xyX;C>so3
z?G7@O6S(jNC5SR`X%GY8wr$((dq2L%i|<#u9!Rbjt?=I&RALSFQVL7b4447GBwYzq
z!=H|$Dq9MC@!bJz=}x~lj3PI<^<IHDc$-PeP$>#|Cb{XInRrD^7ts7SZ6vG^QWhvz
zpe<+sM6@jw01r9>5IJb8^2dd^UrK|b5cTch)J>G9l0JTWd~Evj8SH)jZ2%MiaHw3+
zleSxgkgQtJfuK)s<<l`ctUg!~XbbEe;9<FdK7#`{ID&=<Dau}Xt$L&pE!DtLegS_h
zz4@}+=f3(avjta#^0a6B9$RwK3BJ^F$e!4?%PlE$?An)IWlI*H5=&n20>ubkkROEr
zexV>qF%l0{P8BWXIbr-}GJ^uptP{*~9j5t}Mq$*cTse;LG+yaC#m@@BQC`ywCJZn%
zTxOoEXLH1*eC}Qin~uRd`Sd<m5lN4u1Jk*&E^u)Wo8^Fk;G?rkfC5|ZgTYojc~D>&
zKr#8#@lh@qKy+F`X<PN7pH2*oYu2m@%LTK3)_B1qUbF)j*zg4tF-j42!BxbB9dT3s
z^S*xig-_pNr(V3+Hu~NFlk1qzfrAig&V{%NAz29Via<m_MgZ~6KpQAX<A`AFJ#aW)
zuij8<kb?^1;Y5+9;%Eo`uE<+_qPGYq9)9S<KK*F}kpE;2+D}e6&N%gu((hpt-K2l0
zBKRY|jL!**O_d1$04})x1DNh{iyeCpy4;EoJ4ohCvD<&K*53Nci*3%#>8@b*E38o;
z6&%p6^3hg!aD>rjSuUWzp(xN+u@2C-SRNw{N`N*KVogXD0>Kk}d~geXyo|vgXn@Z9
z{LcSxeE!pR@vGiyYc_1ODKmSbzwh>qlQwyZz2U50n>k~8GzH2CFUZT7z@8oYMY@jy
zVV>@zpkImy_~N^G01PPr3aiHs@&ix2)bBLC;42MrX*$BwxHNAkzGnqsjBCA7W27-7
zXwbS@Z)PwW?A$#agna4<gB6c-5}w>2z^t6jgzOqlH_N5)()YpO6Hj`|!5S;d1p^7f
zpR&s=&pREpRYx8MGJIe&I5RtER)Qq}-0e7(5uWo|QI<paY8Zn)vqh8*;vQP@n1g@J
zW=~sY>-_e=>D>Jv;;3IR2L41|#b{b3Diy2+uYAY~p+-9AN;Rbi24sY(OGA(DpV<tA
znPkzc!wLsKPy*ak4vd$$Dmmg?HDEx};A<(rcwMw<<*$^<@gw|eIYR5jMSAj~SYbin
zy;2s{kzgX=BhHf?bVcBl8B^>lkGx=i=-&po?A%$=MjG`u;FRML`f|h};1T$`l~2|I
z+L~s?%!Z{9V11%}LaWx_X{%-0g<~{)a8L4fy3@y%FMjhoj`CbP;2TESU^#ngw_CQR
z`fPuTFaP)1`KK?oIew4A0k>|EPkog3U*1RY|FicdfO;0y{r?Hs*PES$5OQw_VF%fi
z7AUw>6x3RmU#(hOtySBff3>y#{a5|}+-O};(W*t@uUNrq-JpsfqDTSDsw`niNJ0W3
z`+jq8cHsa0oO#ak&imZ=eRJ;(NKEb|?|tT(IdkUBJagupIkUlgC<F{5fTgag3IXK{
ziwn3=0KkPK4Si7~7jZnvhk--%b@9!J3qKY=>YH!HU?t+{)97>J8<00J2dW!T0QSKz
zAB`0O!?Bi({M*~x-JA{c(1@d<SIH-w^fd5h%oJ&Wg8^U^&eeu#ICux<Lc<4Fo=^Cr
z69_!bo_RSCqAbYci~<%EQU=NrmV}@?@@R%L4+noJNUZXQZWJV3;EqnWuXv%0G+}i)
z_0&_{%*pp4U*flEG<cKt|C=3dTXgn6TL2Menf&KX)BXtY#)4FtIE=3iAaT6AS8<N8
zK0Fiz-(euUUbZZqBlLH02Z92JXm`ie!GpERJBv;c8J)oTYvWG2LE?fIVXDImo@O`%
ztGKjVqLz0(<)n|U3d9MnJnP@-j1*ejCNCQx+J?xpWrxjMxZF-T_HZvAsr%xp?S^*6
z4@VrMU6f6D0>OenL(ygJ0Bsx|1_p%yL6a;;NvR_}Z3!2=;xEtgA8185v+w_kfBUu_
z_eZDL%1v90S@_Ws{u?^C*jOn78y9S{Sy}^t0<&Ysc9pmG4WSp_;z>UPJ`|r&jIg{R
zsHqEXIK)wY>Vf(|F`_Q0gQ#QZ%HKVa{t!+c;JNU^3%!oX_ah(qh@bF>A4gig!NnI}
z?7MzA7>FrT;OKX7Kl|Cw?4pY<@(Y59V<7T|H@v~)4#YpfRR?0(_sKHT*ki;Hx9i1G
zyEJZGFifKYgwTg6n!yFb&^e*8Gje9m6J{Vi`2?Okk6_MV)I6^0&&$D)yKI@mEKdi3
za)BccXyFO&psR|j`W|JY1A&Rb&s={;M@Kojpbi6nHGQjh=%!wv5tBc3p;W{=0^Wf^
zaeDlT<#zGaw^-Y;Gi|+Yo*Sk(6`~G(68^2ency*fv-nQ+Q0MAVWl_qf?2NJYsN?h_
zf)!rp0)d`B{~qrE36|XmT(g_g+yez^>yW0GA}qc|1g0v;^J$RQgOU<ej@n&gsAc4B
z0c3JX@YLcw(Ep@1?&RxFYM)dm`Kw()xXuLl^@Gdof1SHSYZO$!d0A_SdhH{R+~q=$
zcEfs>e0M>Z$yxJ4yFp21R7^VBC+!rRl$UrfcacFguRP!W`l*|t+TYLF2;ce9HD*U0
zX<K*fw9z`vvQxMIO_c($di_S5Bn5ygfae}Q%jslYhIHOKm+Cv!2TBoTB8+lTMtGev
z!AqeKU_patQ4UzB%fJJRBQ7ifc_!2c_;43oaDkry_>OnH!+!X~AKLrg_dff<4}ReB
zQEw<+Opzkg>C+JSfAcqgW7l1Gov+y;?>D~jjrR7pzul*EB3+*6f#e@h0QP~e9UTCq
zAk^qIF!>{N5I%G)2<1I>2*E5on>B10lF=*7#+d<_wz*^{_QyxZ)Tmk_e{i&N#8VEK
z9_0f68E2ePj3jz07vRn7U{AQApC1eTuq=dv6pBUQ;<+bUBd@40Xk~sE;m?@|)D3ZT
zEYT4K&IOMwHhc7Dn|k`oZ0V-0He58N`Rj@xk4)kzy4(qGDfc3*3KQ=QrEo+FAGnw<
za6I5I@&JUW?3EyQz`7Dp6O8&2iXuTPmk_4!mP4z)73FpiS5R_1;H}5)^(Z*-B$x13
zbM{4lc_ApP(jd$dy**Qdal1^~lgAFT%O75CA6nn3@56VAysEkM?>*H)!dbNrut=0q
zF(Xm<3VvWjj55sB0qr+N#3%-|J=!(x4P4X@{ydNVQCUlcLOM*N^*ipq*Dn9+6?WRo
zUt|wH`nXLQFM&Nq=UIrZE!%h6w29jHf6SpaSqhCGDph`?8m`=z(+J(cBZSk}<O?g|
zC8h~D%m3C}Z}oHe8QsHctUJPr5Dq-l34;5kn{KjCed<#l^{Q9B%D(*NFMA!LKtw%J
z-{+ijP9)bMMlZ7Wyyrd6(|iwjnEGciFAgQ)Kz0KPz;pcdVUTF#FbeaC?C;M<v@i^C
zy>Ypas-__yMjUi52xdAX_QzwA%Y|pFe83sZR)r%RhvLFwMMmbB`{&G=<9X&nG}1;I
z=!ULVF6ajj0-9%x&Y?fb8#sg3cyETAcwng;6bkB}D_G;Q72=^U@Z%@!wmZ3eWRlOH
z4)J=aDn^Y3n$w|g;ZC?hgu0+B>M)mg39!{)wT%$Ckv?xOqRki3VPz;?04NOW0fNzk
z(l;!@+*2usUQ=D5xIw>AfE!`%Z<RmFpE6dkq;`}qnuK<TEz*;Ky5_Ymu7(M$t^^c8
zY4lHSI`l^*&D4#6>mJx-i<hmoV-A}jbkNd}N^;pN7wsdER22e*JOT#+2mi$lowPY%
zQ2=7j96^TS4^G-W@zIW}`9ywudXk>ee|Ps*``%R={o6F#q5b`E-EJvMETSB$K4{XY
z;r7UeE%uV5X4<r=y5y@E+3yK|#8>HqettOO=;u(1-uAY)c^JyknP;A9KmPHLy-#}6
zo8IK%`5zP}F7%;JaPTVIlTjpaQ8(eIzR2h1n{W2ZgQ#m9)|ne`ywNVX<PujVzVn^$
z*!kz5pW0&H_8fZu|1+QfJjdTSF=blB7N&A;yJCS&Z%qDG@`L$uh5&|lIv9jLoex4E
z;fZ$L%;cZrCOx?M1v6LU5scttt{+9Fx0mgJvy}o6Wr6-!jSnO5=}2GjM%tX#+)q3-
z1IzYGOq3`^F;zi0zi{kNrxWGkl%M{$=KrU5*iIe(lgqILTN6@|)}P>Fm|VKtUk26-
zLV{}~WI(w$$Ah0=vqXw>!A-n=h47ZN%@G#<gu|y93SsXNMV)wCa3!Xam+Dm&C(w)z
zd3~lL%O&k3$O0<fP%sE)&OZcv309&nJf1JHx@ZQS1QLc1vU?Y<u-Vhb=vy}`V~On<
z_2lga#0Wat8;UEAHjJQ$|1gKgbV0s|E-VC$UJ-hv4`m?Qb~PVI-rx`LL$$L1p8M~y
zOTP3OJMCqEX3JNv^R+AEv`b?B#?98X{b`#qahxrmx57?5|4dir`NqA?qd~Sk(n1Fc
zI|7<r72GL;!p$$rRK-J?ad~|3Z8aQv!HtCl;m-Xfyc2%zx#!w>=bh(u`TEzt-t%S`
z0~S#70Ox_^1{8qj_-oVP5NeqGStJuCe;5Oy*jopK!W^W*Ph;j8%n+m7d?6kkQln9M
zvpm5UICB{C1wTy9Vm$D&o`8ixy(vdCyyO9H;=v!*k$lP`%AC^^e$o-&%uzQny@FB%
zeAIs<rHFVTR2>i2{Qolh?Jusj87G}%JL~c&1R-I-yNf5u)tQE1t_ake)>?|7o{*v}
zCEg;g`tCv<SZGiia=Oz9ETt&0=N19{joMpANPG)0dtDUSQzr_k4=V(}!zq6tpl7h?
zDTTPx<w-r(|CIHXXiu0^G~cUMr;#LsDAOuVer^ox-mUn_!|mqzEA6afr`ed{ba|xt
z-8<?c>W#XD_h@JM;e9LwjEo6qK`zQV(@}94QOv9Jq}{^nq@`YX@(UOF#5-~HHSpsP
z|BHJo4%LW%kgeF*Wd~0fqXn&+VpP41)f#|d9HqSePCNeSf3Ok5IaI4}eNp7u>KT1o
z^iROjZy=}}Ir<Wmq`X|z4gCTx(uELTx^$^eDR8kE7D)0TPpp&S2!H20-{}Ju6o3QK
zHLmRgu_k`UWriVX%*+eYNZZ@neVZ9ffFKP9=_^^J;|C+q=wT4@V(&fkUX2#w^)?Ua
z<rlOEQ_&fL8zqKQ{SeT&zLW!+!JE6hbBNJ-mHxa#h_n$FIIDQ$9g9ND_tWumkrHvu
z=q8R%6uKUs|A?~NErBxBR&=Rj^6s;szfulWnH=?|QA{cHf#rvihj0dnUws`YK*81U
z7O*9Jk;a9*Mm1F&(2*+uWJA^@t8;lkL0c+N4T|#W`&C`k7iGr8?8yqUYGtpcO7A}M
zSAx2U1M-QyGk-Y^AmW+oc)GYA7>c@fJ*jDcA$IK(o9xw(uCOzXpVc_S-(hI9KiU#)
zmG%VRhojxbR6w*xOyIOtlwcGD1Q}L5ekc#{IqAR!?~{%n`1$eWfVp?xL-xhbTx=)4
z@C@tR#$sG4A3FJOr*L*^?tiQ-63IX{MN<{&dflMwnS4SMbg~$b^#S36=6DAVhqZ)(
z68!*ce%P{n<&{_3SHAKUFEeWhIIDp&GX2OxL6mx|khrLuyicJ%nF?Sr;eqM~6oBXG
z%SXee!D8}<;bCkTfX3Qi7isBGz#Y~FOo)tb(UcKzk$+$EAx$t+(24Np@Euk;W5TPR
zAJAL;;D$NFfgjwASeWmJZs?~>y`euk5}vt>aPVOI!%P_S{oo}&>M&0i{$WbDS)JBx
zT12HT)$5T1Z0f%ldDaAHqJXlgLk>jl3u&mQCka~hPRCc$4jjLT2T9G}^IO*9fJD%f
zGZ;!&;x0nGqs<kO60*+Yw+8mojE>mhU>+M(2!CNh5%6Psp~jV=n)pr`1?@$RLZ{PK
zM}}U2D@ue{!Zg8Q0V_IurQM}p+fd=^-fg$ed)&@Ap<Tewo#R_Y8_d63?k7H12x$AP
z9f-q-7ztw?fVboB0{9zcfPN$t0!oRH<ws4?zx?ngKa)@M=%VBCH5+WC6xbcw<G*%G
zw@ufX|Ll}lrf{2koVEbYaGuO*1Lrg0z)2ZV0GI~B5mwDB`Wwo}_B~egzxTcGwaYHM
z%&xlXDlaoi0T(<mbwVAW%rm-=E!5QK%U}L-kNeoiKIUsLm}0&3(o20IVGLMW@%=MV
z=6w;zfCBIwf0Z!h!JLg4ps(ab=fWi^Xu@d0Faw9!z;OJkOxBw`O1x4|pq0@JPw>+j
zaee@uPngd8nny4?w2&7g96C3w3M|HBA1@sldG?l3p#fas@W-etwpfNm0lHh&f8gPJ
zEqh{x{nss80661h>(Uk~A2OAT0BZR~zW#JN3V{_7qOTO>?vDuJ$mlL%BHFw2Ciyr_
zgM1Lg2uVi$<ORKix#LM1{m3_HiDv>TAeq8oo)q{<PZa4SXzS=Eyxv8sy7m#-XP~SN
z2v--~<3$1=h1tQ0OaOkIuyF8in~FSH>i~YWY>T~bS5ij!o^>=tX1{}Hw53-0=W)R2
z3PJ3-k1Kv-cL5_>R?Cx?=>*mfjME`L!!++dS_1U$d+)akzjU=7bJj^d+HaHa->!Br
zR;&M)ui5Bu!}Pvx<raJOEB;V8WDO|W$TMFtDEue@@MJ3&{LW=z-~H})ee}=BJ+=>G
zzQ60PyS&WQ)17zT>FB5z6bJI5PQb$gL#zeVDZ2=cKmK@!eJ<NWtFk}aaRUm#vt7e`
zz)!<-4L$4d_K>7$beOm3AP{gc7S;tu4hT40-mx@A)FOiP;AIqY+ikabC&S2`un_(&
zk~fV4Z`1=q784&o7!-}ZH>2|^au}ZajURZS5sL!!wwmv+;^G||5ij@L|9~yK{V|()
z(!sWQNA2q0l%^1I`S%nJH5Vd~kUYEw;ewR(o=(qVq+Ek&lCB2OzzTvEF8}~Q07*na
zRHu5-mOsA`2un*{$4(wqy2xK>2+kY=6|$IL@8Rmo8L<@vWpFXV_xA=F1zeB#fJJ~K
z{L{9%08a}A;|)4>l(mXGNrwV0SiN0a+_c(W?ZB@q78pE3`v?d8v+c+e=j|!WyH&n9
zUGPaL^0aYAz4>882tKR>jDVMDE_}GGR#UVJ-sk>x)<2)VvumqoH*Pdr|F+uD5t>5K
z=>N&>3SYm;PCWW>KLVJ%o@qyYLpSvRt@wN6NXKA?UpT@TcyLfxIF$NE4p@|iP#gk=
z@VtH_{6KgPC;<DwSMEzk(1=_bFhAzBVJJrai~vqJ@p-<8Cq}o8()Knim;~j5ZA%De
zw&moOh2Ao9vpf;XI0Q6vo-8DcRnmm_l}Uk*4xYMUodOmM6bj6KbZ#gXy`>vC;IFyi
zW?h~#QuLVZkV$(;^j|KUYQhAX>m9<Sh<8E5JHh}2S)5?>ISYsJK3)U^7-<6lfBx<O
zk+yh7Ji<=v76uKG3AhkK0f}HQX#;oQ4S?*)N_wX%r@>Pb3ZBA&!)edLh*CONBnq&e
zkEqZhhT;c`pbZSUVpN1b<?9-1>76%3hl|`{gYC}y7u%GvLv7@!F)lc07qmOzXb;g|
zXghsvZ#~P=I3#$fzxZ?7pqC$xaFkrigo&GQ78CB2()&Mm+-0Bn#J}4KFFMYatzK_j
z_uJXM-A0bkB0o*Djnz7Ur@Z`B_9&Zq$iZGOVVU#{p5Z^xN8c2D-y8H%R$!wZ{2S~0
z1s~@yfs;7kt8sxN;10CsfCBIwe{mWhE;@z2k`w7*dNh6x_Mwq?badFLkt2K#y)O*b
zD1Y*ZRkpD*8IwPw4g?}D%F!r&tFT~R=m$5q0y00(=sW~&e{lzM(<!n}fx79-e1A?i
zbW)F9+CzQaj{p@x>b~x^Lyn)M(Z8lb{PPvkDTI_K)QbxvJ^dT2lXigxPzdxwvN;GN
zbl&+Pz`_q6(%=es_h)!STj?A$iz6c9p#x#T4}Z|-(lx^oX&hIH8AP7Wzga1fvv~3;
z+H(p`2v}`S6zBb?keW&psmZ;}P9FS(C07@l%2ST~%Tx{uZHa`GCRBl4yvhpj(DE(z
zQk@gLQAhPIUHX`&n$q5T%;n6T(^hCVxiSFa{<c9UD*FQ-_%qt(-r}y(RUNTo0v{&}
z!IEVwmDOq+Hsl1`s1^Uih7Ixg{|%dUbn&(wHe=Fw+pX!fCwDXAKgf=nbGY+-3?6cN
zv*PsI3%=m}=v(%LtQif}@K6LA;cFGw2!4ML8&Cj#kH2sTB^ZD+>rnvM8_zr-mH>B(
z+Z#t72xLtDOe-*d#>!eo7=-N&L#56-O(|f~#IgYH_V)I@TCYHP>0}YuJcI5y^eFGt
z`cgNPb>rqvNitn%H0nHCy>5q%^wF@eAmMOc-+7%E;T4fq0PCT6rKK)P2M&MN6My+$
zK)B$>uTiR-Psy)H&m%X_m-H0z{_a4YKBa4rM?qIAk3hpIz0;cKQ4`%tRkH$6Uu^kf
z9=UG6zta#1izjuUa1Ntkl|eym2^17lFN253QY2-HGa0yo*cTd7Uj^Qhh9ZpR4vug5
zVEfIgE}J-Ix*dJw5jxAEQ<kQ5c0U#^%7ws08Q_PtE7}n4h+jB(uD>qu><x})dFMFE
z2jR+&l@I>?hwa#NUT7P1=Kol2?Hi>vHoG<AAEEZUU5c#RhNJ*cWj1@9>#Fc13K8G2
zst}QXo>wz^T7^R&eM<~b_7-2OyawVL*T8@R(70ZHr=fjq_#rGpFtbn&CS}yjD2A;<
z-eDGUflfHByzYIm$OF3Q09cF{iwT);WMw`bLSO05a|b_+4-Q7=C@^fpV@d)1xc=e?
zHys~!g2ECO1)d0h!u#XsupWPck(3k+<q_lA5?3x1LP%$3YKiL-)O%RIlJKaY3NJnu
z5QOC}e+OUrE*cO@Do?^4xU$5csuR{loNq~#AcaoPx{_y3Ylg1;CMDhA&j38X^d9^D
zQ+v&U6rZQhlDk<S9F=%f0`g{y9~qX^mUY2(z)2a%bMkg9cy0%rt0{o{AKPpXE?Q;>
zKmQmTrTOnREt2D8J}g<W_nxC-;UU^k%&+Gs^MRALz%N``FA>n<=`^V<eM!>-UDGJq
zIYR!r8-692O*XM@to=sj|6vLrJ$j_BvR-db4%KaVe(t|46B78#pFGI^_@u+FdrPOz
z>AzdNGh90mCX@<MjMk(4>Dwj}`dS{+qHw{N1Ma!0fdK{JxvBSl=Ng>{f*Ik)>4B`0
z=QJ&t60IF3Wj-%@$(_4)ySXpxD_iCd9)vSLw)ZX7LLwI9(NS`cP%J=diKIm&_@SRp
z7sY}OfQ30I72wA8wKE}2bVf`kuqB9#Mi|MlZ<x5g>>vPNIOrX!j&i$9r@y%MF72W^
zL1xokHXPG7K-G#xpqF8#s0wuHG`vs{JiY#ekgh5Nu0WJYGrU=t=<^Y_r_I9g_6UPE
z1Q&Fn2ql5)a3#$}upSOpO4I_D@G_!|^w2WAN4~XsNZ*$CYT9&*UxBE6@a?b+0RNPd
zXxW8yo}>_<WS}(oxA!R&lGSi~t_mmJP)q?z{P2?1cGk%{^FfOuckIw9j>1h_N7&O|
zSlfc4$jN-HtY`X&wuy2L3_q*`A-HHC{G$CqL96x)T*T*TNcT)R>V?IQi<drbANuES
z+2OjcVy#C1TV?K_IAwyAoup(BQMtFOtRpoQGG3<vE?>OSPCfo8KMixI6!G<1H^bT)
z4kMypiQNS#MC{a|-Y7$EA)stgzGmei9{#5u=?ezjb5jEY3czzy@BPY^P6j4~dE&TT
z_P=sz4a^krFeQSN&W{CK-P)?gbGXvh=3yI?01g;3w)im`XE7d{dI<i#>9k0TfM%64
zi~X2N;Px&qCPHZMspDu&wYvPGQ>9ZzVWHFJ9s=^AbHep!<PTo9R1O|I#C~(nop#;#
zZnqiFn{Jz>R3T3qI@O^H(lE*el5p0=HwN|eNyP{NJ;6)*BDQ>27@>n8s*Eqd@`t1H
zKY~-ea2p`6)imXqy^b}<Q1g?<1#>M3A9Xf#i0+&~C`J^pUMEG~9$v+r$AL&c1<@wJ
zMZ1oLe-fbyv(uf*?iL5HAIOrj;Bl!2L*=p2L+#fKbR(Pu;252Gze@)N;h;H`Pu^)q
z96rOrR)k|6pe^xZiVa#2f+z#5LPwC%zQ7a05IneQn})}pZAa}u%a5=9x!J~tt!>P?
zwt0Ixf_R*^{h`dVg>{0g0=vW?n>hzy{8+P<tL?}+Y5fpN*DNh2#)U#e`LT{rC-5<4
zrw(!p3FV^>$RAfNKjG9Bb&3+fuMwOFR{oH-imw9VudU!(#WljUKZgw{0KX?+mqv?V
zh7mFO!(<_t5z;VJFb16y3IS;_{lmyx#r(s^N_j{;*`w6J;zBqxGDkQwdIm4}$RFJO
z>A-^Nz`<NU3Jfdpv0i{L^6x=dF9JvlUAVYUfvrG<F?~Ti^+P^UN4-Gn@X$;BJ|IiW
zbvN9oWQ=aofi`IYV1bE1e##@6_ag_o;uPk4+slzZ;qftYKj|VIf9|T`dN014L*aM)
z;A`aq5AbXa94-=TC((Jz90hnOTMq{we$_Y$LD4}>50~R<iZ`VKl<!uaBADe9?aA{H
z9=>Zyg*WoZT~GZdcB@AJGe!@wA1q#P|L?lZc2xT`)3Udc#Jp{QA0-$^I06n~$DVU|
zkX<S)T4l!#2TGwZAPBJ_FiLJ@9RMdTXa{eEdBijHhju<vLiLfwOYB2m`cIp4`dPMm
zbGLQrNZ-*~^}kaG|E$*fhY3;yT&Wjb6EymNiVD{m4~HBw%_$)c#fwk|9&9H@DMGlj
z{)avM)WM^VKI*)}AR+1?O#75Clz}KW$SFGt0C@SvdvcLBzEQka(?xhAxK?phJO|4A
zfC8`&eCcScjYx?`iy&fdk|TZdV{7S*stgpuj<ozn>aOo~o!w%@bP8K^jgcQPq=6aZ
zbUcJJogJNE&;Wekj<mgb(jkExS3Q6S;TopI{wM;Z2_cO|fsU885GXQqP&nxCiwpQr
zhPdAVMTM<nW@|OFJlHQ#5#pMV^DG+WLzucykBX5WU1|=S`-uyOpUyGj!4rN3SEmQ)
zz&&0t9;f#*tib7=eBpPxR0hrm@NPcC?e77X+R;H+3+h~gr~O8qMKsoUsPEzG@~@9g
zd;rdr5}f=RCaHmyND{291)g^T7_7wczD-0;)9Ux-k9FJPC)V1LQUDs_E->bNi7*WO
zv}KfI1R~+I;kd{r7l!PTK_FrsAPsF5{s}xh!(SF+FFjsr%B;M6^#Ax48__Yzc4@@V
zRzF7jV<qetEnR6d51#D#FJHe|){v35wzJFZ_?g-QINie`CDsl>J3k!tf`CUUqA#M4
zK$wR@z+z;SZxkWwfG4~ZWsfpbZ`2<y!u+N3C*nf!Am3<n`8NtUlB1wOW8_I#z*gfT
z9GC;mRXO@VEB`)SUKkEW;umxAVe${bOarev!h%^bgAUcI=^+Z+s@1+@W%f+l%~C4h
z5Slz0J!6KC)$)w`alvf(bB5x5PY%2YXhsY%-+#s#XLz3A59SM+b0%(u8#o*t3p9hf
zdSW2;hJ%Lidq9gnkv}wJ>b(E{c{)t#BpW(xgk5vf?Pe1=9yN8`b!d=T1x;nROBatI
zQolkZ=Ha=&Oc);0q;h&lOISe=&ks7V1Po%4UzQ#uycT5e+CzzG7*^o)1o<DGuBQAN
zWuYB+PD>oP<EfN)o^_7N=_UYVUd|NjUN1sn(l6ukf+RnG<NKx51rEaXV<Dhl2Q1?5
zd1QsX<n&{dRieK{n<vctz(rf7J@La4hJa^%08?=Yv^a<-78Al>VJeUN=TQg<hmXM(
zbVnXp)_wA__GgrY`GSRy+J`^$ZJTrEEZZRS{&r1SQ3q`r^{?Bo*$&b*$>h0u!zMdK
zM*t7g8kg0px7eS)_+(dF*bbP}lKV*)f*a!C-6$VC2wx*)W4eGc!lz*!K;ftC@MH`O
zqCTs9N;vqrqWiO-{jAf>8jXMa$A9!Zqu%0)#qX1!{G{hk9v}M9hkPm!I$6tb;e{8L
zQ#izN?RumI=0I~jrN@C*1N&%sY4`{sv|cU?V!<CK|CsNG?3^j{V&uY<E?448+kS>w
z=hk!@ARPgYJYY<;_uFs3-5HnBGa7pc=V~4>DCuCDR*ranFgqO~rv@@&;>0fKqf^Ln
z6BcP&;Ux|@(o#Q5HQ&z&okfILE$En{(+K>)_r?JS9dt@`INTF(+G(e%&Zpa=N0->$
z*WYH-X13W*SyC7=*9o)8!X-&vOb?-!#8D(@L7C&q<4MyZ9px;`TEJFt)V~HUS_ppV
zj<EVNcwN>fcKDP_q)`x|f<y6ylsYVRFA26rZ!I#u8xUBn-1W%2f;&N#e|-wVY0x)S
zTD~n)Hd~nUpD@gBSg=a--ZDXJZA`7qW%8CD=!+kK@F?w>9}fO$Z*TW4t!!(JX)dg5
zD5|tslmr$SLKp2AK7_Y;kDnv!!yhlBaO{@w{>gPWn5|oA)3v{U#pW)Xq`7{qA-?@j
z%D~8>625}n#9}~&b6Aiq@6-)|M_AjqvA%mCt+(h&K$Jb6<O|<XexC3TiVi3CvA%}|
z#0TpnLIwe>&0%paXZhc7!wsbZK-rrKf0PNX?tRN!-eTYU<~Ln=`07``>V!c9`SZkm
z{No?D6HYk6zVVH3`1$|ue)qebCgMK*=}-F<1~^{xn%CId-u5=1&IugA9B6L90<aH!
z@o2n|LSx1uI2qY8-_N`o`}=W?O#U8`{RsttAozk#l)1k(y83hbPA%S%MSxt$g9S~D
z{MmlTkIoDx&S}a}JCg#FGhH5!pSTdD?62k*NBTlPF2^762<|;OV56gm8xG?h0!)hJ
zTV?20v_^iw&~Z_ArV{APP`K!rq3huViv+q%X4Io?<(8BWmVh3PpnRc4@jXlVN^~w3
z>)`dV_+XL7;po^RAN-lCgC%J$%p#rA*1<(Mc;wIH@Rzhjp0)g{^$IBUmhjZz{9WNm
zsmR0e7w}~;op-vyGA-H*m=ZvqBydwmEj{pMJA#fVi=JFNtXAkJ08%RsCzvhbg=0qz
zwc8hMuqRe;v?&wD3XUm|{=1Tf0tNiR5638!g@gz|+7@k;1+OT@@C9&~v{^e4(*?o5
zw7Z-qs`(Vzl@GoJ?RCMT#rEM3e9dN^_Iz#iTdR28hoA+2W3}~fOINoP*e7k$xG`E2
zu+95|Vd@W_6kWLXnKPWfX#ddBTbI)Y9h5a3{R|corV9vXs(^Aau({@%YkaiM0bN)g
z=_jg=%YNwpUiiWndemS4^<Ue|UiLD3*Sp?T9)V1sKwtLrpa0x0xZnbZJOBLi?VtYX
zpM3B{-7@;e3V<UHC4}wAtaE~%=vN%z0QobZ0PI5_8peU?Xo$=u(m^n??aK{&;2=Fu
zelTY0C{6o)wWnW7!0Z{5ZP)H6%bf%;07^z9;ashr=$ycfi#!4jMxxWBW2+wN8v;6D
z^JjD*fj5r}Ke!n!frn8)ogEen1a~9Fs5dwxKkzb-%jkspT{;}<4U_5)b)I+p=01@z
z-M>%TNkRQ8s46+C?=JAFZ~&$HbD<S}o<+S1pl}5oZz15ys4OnR>YmC6e04PCX^1P7
z24A$N_Ts>G351a}$wE_yzC@hj>&ry4Vj7|-A8p6$n{s51u=oxHd#wJHRguG!RV4Sa
z#Y;lyRRpaikV@n`Bec9tpxdRi4Wr20cH4r-R@uo%&q`$1qe;m6`8V1f$!W9jLWCiR
z_)!;Z-DMFSKP-H)m=I-vABq9IL|W<xT>QcX&UgnW0`{8geyN~`ZT4ZO+r5u0u{NFa
z&otm}@jNh{S`RRN>Lj)IF1HG7l2R~^$`M_&XC3N`hV$<c`1|tAX`}qmMHu;0{&UYg
z*UQG%;<vy3?RN9cH~T4|^mp)co<H@26(t<$QL3;Eu*M)@S&)e$fx>|!Ki;WZ`U(^b
zrfYyFuD!k8aRlB2@j0LX>?2>DMu`wY^JZj=iI9<PnByTcXR20Y!(_n}z+nQ#3NUWe
zFk8Q6n{{v9=A(WX69EdtAVBwoADn?Z?}&mH!eBH6DZ-z5K}OM}X(XT{9Z&KGKffv+
zQ5NWA<b)ZO1&VZ_(YcUEGZ*Qr@evNqVF>{z9RuG1+#_|I@`_b!>_^=Gci2qZCJPGA
z58x5aNgCBU<@D|#nFQk<Sfm#&o)y32a{&KStp`<39$E93zAT=wz~||dzYF^k7yt@V
z1mIWtNRxvBr*Qu^)N|0G;F^oP>+Bnfj&PFI{CPV6_F4TTxH{xDaL3mu4&2bGbq|&3
zCs_sPdo~BzP{noaR++UU=C+4d*sEWtEwYMhL|K+;zrB<#z9Z#A5Yo1A2s#89=5lxk
z9$|FM)Bwsq@Qj>M2H+)x7YYFm`1pDK33i0q@dB;<|L_McvyRhVYAe@m^tt~L68>Yx
zjJC&DtoGIZQ#Jah4`5;Nl(sR(IU${z8X0}Wc$+Nc_emCoHf#r;b)Ww<ssrkQDFOJH
z0nVTQ`JdO}jWQ7kALeZbUeD<xp1Pzyc*aN`7~ZMxoQ6mX&hXb|egM4;C;<Dw$EFb?
zgjh6#aHCV`t>A1#e$@g%L#LY`qt0Yo_X@l27x&olb31JIoFhDWFi0a@)v#a~aE3Av
zjD&z@1u!f9ng8qP=y2wP*_#Py@KtdIUh)pybbQPoE?l_K@BU`wK!=G7n*1gELwMk?
z!bF~Q5a3*@d08wWtWzi#KNcO$W6PefHFy8aj(ownwyIOdA1e<PKRGw|=MqN|2-*@L
z2r3gqK!YcE_yHS!z`&X3L6{t2k&g@aNSmTdH2xa-7jZmDuQT=27QmF&bI$Pwc#(g7
z4!ji(ItrKws(Ut#OTxEK1Pd2O9P^EESuS-bd5QMn5vl`EQPQF}0$lhfFLVA_0orsP
z!1asQ+W)QXafeK9vmFwGvEtovJxhP09Y(u^H*mZ&0AQ*h78AxoI;H~SAfA{mU|;~h
z!50WVSA-=9fB55{3ENg{(^bBkbm-4EE&QFNRsY@H+dQA`+5$LCZGMDy22rm=lxMfL
z2d)zt=N&WC4xTc}Ed@cRCwhkefSVs{k0=k0dVm)jITWAZb>guAQa2&!={s@M$(l85
z9F}^aZBTbu^vDlCbrr6aMKWOa_h<8A)BaxaeW-jiP6QBhVysGK;Si0tmEdefKIuXM
zps_~>$(`U!mON@p7tOcH(`VX|M<1h3quxB(3}+q=t>EN`i~L|-7S1uJhWVWJ1PIc|
zKgZF?A9(Y)z>PwHRfM@ytRl=M(jkzB-=1918gRTvXTwwiqkj}GI*J@`SWf0WG~bgB
z*Gg&jl9zdU-l-k}cM|^5$rQBZG!j&%(@QD1i2zsa_q6uttET;*i@_X^gXMWgx}XHy
z;V+)mcanIA7!i8%U5js9rq1%Mt$T<xB~<ccupUt0(wC|MdRX?Zm}*JVt$vc-`u6p)
zyynma+Dg7a5ag@=f!BX>WCx6l(KbW(0xVs&#U5X|Ui?y1HVSOd>I1&a72;@5v`dun
z;2TE$?d|QpsEvhfjFw5q?iAMTz!NCNSPX{ieE)|ZS!AF1(3kC~Gmh52f6V(-f&`;>
z1#r<1)xdGUY!T!Vpz&H0z!@Rqv_8l-?y%<_Jx6#(xR4B*o{fqCH)Vxx20OS``E_)3
z*hLpz<kml6;aQX<1_3A$@Gp+KLdY|HibHAo<u8Bf(*;;rP!L#C6>*ndda3ufyyxG;
z0S=Hqja9A#r0~ymF<}B4K8(QOIINI|Awt`2#T2b*4+Q`P0uv&g%02h|RwlQt_Q&U(
zX$w|twoRQ|Y>dp?oE6>|7mNb!{Lr+)4`VS3i7AD?YS%el=%JHjM1+};=>)WVgfxPi
z`U&_}v{Qflpc5A~gO`qjj)W0NSX=0L123@DW0$64Zo8XKLw6L&ObVtPCwS{z%1S|{
zNB*KGCd`BRLzo=274#)#zyVSmFm9r!uE{quZ>wC*odYWxm>Rx82}v!Pqo~DugySWw
zf#7uj;*_vi_C4rI@I@-ekZ2-Au7LNEi2U)boNVP$K=~kEvb(4AGLae;17#@@#^3tl
zraD9+KmhK>@*(I49$jUBaKhoglhHrKQ@&6D8u51?2LA$27;TnyI{867OtsN>tXZ>K
z%JJj&@Bev~*_@f01xa@RZ0ypuUkP^(Qd+OAfJ4PQTQ&dBlm~5zT>xuzJn-zv<IGkq
zv7>bYAc_P$7kmKC&xWI1On1cSy-^;)XPoGH`Q?|}JKpgQd*v%%X`lMkr|i4m{jMir
z(cho`>7TmNL>;j^;B~KioqgmZAF;20{p)u1)mQuU3BVU$e6jB&U}4~uS6*rV`Jeyk
z7MG~wMl|g2VFL=lbNGdW+0pb*KmBwUI57g<8?u6N5YCv&SlGtGndd+M1<r88R_g?_
z4O{G}!*mi@9~FSWO((`d8))22Td*jH1w1fQE=YS*j$lBZ;N?ljz<fW7MLyq;veDPh
z34Gx&o5tlv?d|P80&w%GI@>B<6o9qs*4uwv_DehD#CBWPwOtFy5K=M1FP;c7&jYa&
zarnd4@=pSfIE7_&1!VcmlOx>RQb3vl0<*UJQ$QK-(M4Ro8^Obi^o|QUli;i&EpR(s
zil<z-c&0bc7IE9dDP4n52TN@|h|g0Zw&g@?65$a?sH+7YkiKDHCocQ}PjxE12u`X%
z?Kt3q=(5Up13O}{-Lq(wR>SYG5wh<1sdtsK*04WwKH50ExhF@RrFu*A;Iva5VQ~=8
z)Tsvx*AV;F&A+xw|MfyU_Uu1#p^lmVDYX&GHEPrdUGB5Zjy~cr=dYzY2Y~B;v2cwY
zt_8bPr`qj|Ll2=6d3n|y--^De%5!<flrwnuum0+<bRNeke)1xtdin$$7C{CG@G9<{
zbI!5ruDj0HP+W4!CB8M7aPR<o{q@)T<zrv^(wFSmV~-`OtYd#U!2S#<0Q<lPr}5G$
zpz#O8_m?ahIye|9v#J=&LVJ6=b#!z{Fze8f-Mh3mxXl(UTVsc7HvpYLUsyiqqXPps
z^ZhUqt8@|0<ll;^o0W?+K|7i}W;_l*itU487VRzWzy-b3El=>qIs#0pm=T*<TcR8b
z7cCLV57@*p=h;JRHrpuepGK}CK7}%Mm@2co6j1Zmg%`p85Ijl*BYKxuMf&_p5Y$6}
zCmjA)3$;|y!sA3FItt<E@x<j0+7SFslOtCw&&WFg*HA<_La1UQ2L}pB1BXc&LApl=
z4zI6N157Pw;9BppEchM)-vg+?lft1}35$o9xA-ohVsLjNqIA-jq4tdhS_iOki*<+>
zc5dD6(*e=O>+{+Bx4;|iu`d`SZKR3*g1%#$=ESeP;U-1R);gb2_VDtxx~^KM{%O^J
zo6P)c)@`ss+6k~j7Oriw08CUEeAL;cIe6U!xJ3$(%|6s-h##1Gspb`VK2uNd^F6UN
zgy3$5J79Uz7sNI}Zx2eubRfTQgforGFL!~by}jM|4Y*VTRhAp5m*=PkXt*@ey>Z}x
zu@U~5*H8dBh@+ze;m@gln*UViJh^SOp9io>qk_K7f70oI8>VGsj*w<+O`QG(?R!&(
z5bn%vLOWIwgf!C${lOhf58j}ebp+sLu@!iMX+$%o;$2e6?wR+H=e<j2)Q~B$tuhHy
zkJcZ)9T0`nx9I8U_#_-XF5n9||MuDU@6i@=e1a>5If?}DmE5ZNlRUyAe{j?}@MMUJ
zETgl{L3@VP7KJvd&pd7NgS<XJM(w?K3AV@OM<t5VBb0@|ct9Vg0#24Pr<HQHV9847
z8>~*WZ{jIm{CZE*o*@HgwDmowXbpi<`<r{`*_SW;q@8f~@wQQ>ew3-nn(H4c;XX>!
ze6-gQDi;eNS4!DoDj;nM)G3ClgGV1)Wq<a<6E*!f-qQw+JfBTRpG1A{$whk#OI;{X
zgyRp(2zc@^;&K;p4793oIoyHd-*fFBNVV~MTGn7xIs>$71Srox`lCPcEkyZzJoAC0
z)G<sQKiVE!wb3SzXB95vp*=g86}*`JG5KRsBdoV`^38I98KE6_#~pY0bOmb(5ZZf#
zn|cYF@4D+Qx3KgEZ%Q}V_etMy%N;s7?j&2^tvd_UfzpZSf3g5L#f1>_&N3680989s
zy|cQ#7M{U7u4aG8BTtui%=q=Ii*JCZniYyjW@0Z>)X`kvttvMHrg-(PIB&v7kX6Nk
zBBS8er>MRmxTs%Xu_TZHDr?CoYkK3a?-Xz(wQ#SCEQLpxe>BA;iwS!LQ1B3pvXR^M
zw>@Z+NAC9ZAgsY*u^8<!XrX=P0)z0qaj7o$9tUlEv)1xl{$0)e+bkQQ(f^XiR%(_1
zL4NViFs=0O)=7U@`PxR0P`%0urY(V7{kvx4X0JaMwp!;Vo&SHZJt^zp?%g^wK-N9V
z80CiEaL+{9!39q6_hu?2@@`};XrxFr3JV;>wTch81LZlO0PG`QIy#%ak`v6y=!F9>
zFv+v8mPQVPRi_omhjYGXO`B*p-J=uw4xjGK-Ynn1Plp(D@^n0y{bAm|&iCi}<hY?7
z#p1{#kL>mS{v22C&+$@T)<AGfFlNE1lUAJof@sK)q4xL_E9|EKy2WOmINR3k(n(?J
zvsp8N2oc)C<-&)+T4(714ybS!xMiZ0mbkKWt)wfkmyaIN5m4X|&50<&73QP+fB{P-
z?-Fr%P)^c3YK<nvIl3HA0T11^N+<#oEj>r_&GN3|^~}6I`q)kF*A=E9F?;$qPT*On
zyiQjNigf^~3;~7`034?U685w`w&h71KWVBy?sm6oftxvYlmXV~;15AX`(eZhObEkx
zMh<&==4tl?+bkX&&~)psZn1BF?F)9|*{`xLZv2#zDjT4${O{D(KW1vy>B!@`Gj)Zv
zlm!$Y7WzG<fxxs0ZJH9mGC10196ZIR$KYoc#)cxq7CiVJS1o&A%k4;tKhO^k#v;JI
z!Ik43@b|3-1{8q4)z>}+9ZZB~{KzAZ*ivmJWD6nYZO-1OqpC6%qk)}T%|1!f0QBE0
zbS-N8^huf$(BisEho0kSJ|43?3;oz$gCzjwt>W)(1%aEibYdKOvSi5;?`T=vLr24}
zKez)g3JYr#*fPa52-^~SJCzXCz>%tEkLjY6xi)B5aWRqvX%Z|^QdMFp$4XBUL=HlH
zqi{v$;dKIAX&@qxuVqpQcg06oqy;|w5f^0oJ825%_~rM)Ci=YY>J$a=MS|?<dGJm;
zM*As!)SJ^?(Ba?Eivq41EU6nKPcaeaMG6|~Uic0HMVkCA`I+m-SH=|}{{4h=hu+&q
z4Y8jt-E5z-p?2KS?RKby=61Dh&g5W)eSWhY2lB-J`%th$86cP5v{&jl+9+xI<FH&X
zx?iIOsux~zrL~>-65FB!s=9Y<v(1{%pP<qHaE<aeu3m44P9Cqd4kN|$+Fq!$4kl>+
zf3vm#cIkbzwg+~uUndeq+6zuS*-km>crB1!SDs7=|G=M=nMK4=cFLJ61C*gRWskB(
zdit$U03vSS`Ma%w0R`Z9Ti*?H>kIP)0y+_x6OEl!usktuv+WN*IGcAgsVAIs$Rv9}
z^Pe3vrZ{X81}O*9qWROofj>?r1V4&EbKF5;^Jn0u6JZAd>lK(z;0yt_=k>Q!052Ug
zKP(}f`_0~dI<DxbiEHHo2MZd1d+!4R<?<hyq$M!8*{&w=kV=I_Da1^P7fDEzyV651
zsgOKf0n+G^C96-@2$-+}FP4TF$!DONxX9k*b<*oHDFSmarBR)x{A;kK=v5pt9gviN
zq$ME1r??cJ+DyQtCtt%y`$ce4uJUgLo(k>2dR=C5nKzXUAkn9PiIUp8`$X7wje@5R
zR~NXl+a6xB!sZ+{#S1f9BVb1UD5Fe4vE2@Z0L7PfLHk4?GD?OYP&O%Bv@!g+W_+FM
zA@QdJ_VthO4<0<kes;sJ?6zyZZ703>wYFkor;XQ5f2%ij>5!jc65?H+|7IyPTnCO-
zYmL?i9NRv_PXQdSwE;V~Z?i>fx7aZ+p5^oZoRNT%1JA&lD9rFCNAbe*l#g9BdD-cM
z`cig^3J=mpVV%Uff@@Y+9y##mS*w8o1>jk$+P))>j-ge`=v?S@7@;6QtCt1AlrR|#
z+sI*7Wr5qQ=@adic}sOc$Tl4#im<PB#o(voW3N1&0(<4r*kRmOxEtk{htaWs8$pj^
zK?eYCXr@ExuLCbQ=%6wAQ%@}RL#e=8(c2D&bkMg(BcWg2aj)6<*~XSJm+9))2bC-H
zL#!0fT(}cdOHfG2fTx7=6CW@hL>L|%usOU3W`C-2`Mdn0x5sEZ&kIbv<1$#^O2~We
z9IP6KfKFl29`hu1{>Y~;khDq2`*+Fn^r*5TQa0Hu^d6*kND@z0#CbOQljs3Sn&Yp1
z_hfMb4n5A(iv;$>n`S;9T)M{2J2P3GI5dZrNLtzyr$h3CpJ+=6MHB)~odX61Go}kr
z4&V>ypiS}%7qk!uOn)3SV3vP$>Ere{pS;)(KjW1e`R~+9{_QqY=6}k=PJj)YHf!PE
zU|n|9W*a-Zq|85M2ag-ADJ)HStlMNAQzv56H<R___!qyxCQTggyEot;@_<&#f;EvJ
zyqTAsDZ4$DopPq{kxB}DlpV#2yilO}<Er%<b(`a==1*K+?>P?Qz|jZ~ar>(qPyqIk
zFWZ}jM}wsS(<!l<9kV|s`PhO-d^BQ&bu)oVpNX(<lWB1BINcro_$oW~7*6if^{DDJ
zz|ZMgjH2_i@nKl<Cy$)5dsBr0&j^B!lU)Su?e+Wcp}ESGy}?ZyIx9vhC>1C~%=x#f
zgJyW6gIck2wcUCB9X9#6cH5~n6V?0CQ+jHWUm;QOhC>G7kTmWTiQi%H6Bu#ws!q#y
zH<f4afY!m*1TZnINvOqxJMd;W$}m#M5mh?i^K@lIEj%;dr%`&q6lv@55m$r<m<K7W
zz~c+}^h$jrv{;;)cz|?yT7ZN+!%u8-HMrEiYB2#(;sOnUlKy2rl7Cyc7hwEwyZ)h-
z_BWa<XKHMhXi6~6_uq309sIyvb`)`}14}gp7b9dw$zdG;7i|>5%9CH<uhF?@pVw8v
z*WbVT8neZ>+O%WdY>S>)X>Ho;?>wyqzN}N(s70&8wZeagruL}1O)48#f>YORdUD6b
zPSLKL5$yOQXKP{LNWb_h_#0}VmmiLB)&`*L<Yf=b2xSk;2m=PnPCr7rC_D7y$1#w=
z!T^u<)qw}Rxg#vE(|FIrfRAweRUFlL-uHJmpaA@yd|?;|hT@6vgz?ydM@JJTe;RZ&
zaQwl%&5Jn5j-GX}-8uhpJNd}Ngnx)L#r*m6y+h+jU(A8v#|1O-gpPP`jz7gf6Z9~p
zz<fWy>OTDF2!j4S;Uped9G!05LBJxy!wx&FtdqVf6;Y=73l|CVV>W)|TwC3(ebNYM
z&ktdgNuv<VnSiXpI(g-v!0iD!3c6_C8_6#P_Z*ow0gRlSR&}&NXFMaGCn$mj%2nx5
zbI=4_%QKQW3~3#IU1+MSQjy8ArHrGvd_z{J*VRo9E`Lf9fibE~`IXs6G*1sq0eD&X
zMq!_ZPW=#q7`u4wmI83p$ia5Ul1;W^?dBfW16S#)3PprnT;{_F86^~jfWxLpiy(wY
zV=*B-051@i%1-H^C6&itn*0Htw*K4u=Gp)G;79H7=bvkvb(7!f4V`w_v`JD(wC`Wj
z0NnhyLu&y#rq6JtfUAI~Oc*CccDJowze!U9+Hs?7QRKuuHoN^W3GZZ;gSK!vP2`;q
zDBvrs9x+{zmz~AJIIMfrPn4bhi~6GuP@YH=F3&5%8a<%{Sf(TvELh;y3HlPkB93&t
zlLyC>v$z;~IYR)NapcW4!&oA5z|!9lpX2e!{rFGqYwyQ}{CzJP4HE5|#u|(JnD0j$
z$CTEHd0^zwwwslokpc?<S8V9A4V(#}*1$GAIxz6lu?7CfgD1Z|;f_3k3*pXm1&i`9
z`RDuaf$b|^>Vz`Vsn8L`9RwU3nky_pE6+VSXytfR4otcCe)j(lSDg&jp7C@#p{PtK
zly~YuQ|DceS1@^So^S3iOyC7VrO6W&_yciWTm<INJgvebpQh<VW575XrFYm$9k~Dp
zAfw0A)x!~#gsdaW{#57+RC;oyvcKxd+;eM}$CG!Se-@XQH}@+=K+$ekCJ|V6DDP3)
zHm6l|x*j;0wM*azL`6I02`+>pJi-&<%;=aMGAul0*9qo&lmUbdiaU#G;p1q#Og9C8
zIBn2Zp&c>^!#c;&)R$fP1BL9A@SWi28K}{^1%)#{fLp(*)1(DBk8an&KRcxDaM=;-
zA;#)}AhrQ^NpX0JgQ&($&?T2MJYCdLgJOG7qq4(mjmnPY5Q_oA7a<JqW3goFfHah!
zK8^ap^`<`o1{$cxm%Z#|K1jLXf(z_h-};v0;1A^wT=bQn_{1lCaB#~lx7cf5^BPYB
z97@V*r=8}55v~!YUqT5u(A<CpU?2YCG-et+okW=A=~Oy8I$V>c!NN$59O11j0N~{?
zkg=nO+wtww?0@c=Z_`HXbftnjw8=XdwTd6y!DQ9A{2d%I-=CkmPlrdRS*5+Vc*#3P
z6X3;^h{8e#j2}GYjia35&uOdrfnlUQRCPV?fd}p1{_94atA3hw%Y^Dqc;eJOa0)5N
zGc@GKd!-xAUkjR2qAa2g4`}pF>tIs~|Ld@MKKS!@z2>yJzuHE!yc1Fqxv%~YVFmuc
z=|T0dUU&5g^+tX6#0u;|QmPO}H7t=2NQy*c8!qzZ-NQ5Z%p2hGBoAE5rv{h~a#E}a
z9Hw8Fws>0GP`i7<3j33DcDaQhr3p;UpWyKJAQ1QsVaPjekkKScG~u-OIJ=+GCrT^A
zkP$K#g4k|H{m{O7^231-|4?1%b^VRE*f;+53T^v)sV@BKw#{0nGg88ydp-QHDy3yN
z&*))8{J@__)~&H4w8)or6D!wmlmalqMrr}!U`-EfmBnD<+{wBDaIz{%c@_D2roH$R
z<p#7`cFINh38x>S>{v(W$2fx@g@EZ3rVctfI?BGFS-(R0={I<O@{^ymFMs*V_Tm@6
z*v>!yd{?^8Ip-X={E<HB;&#vPeCIpfR|JjU|Ni&weeZjp!~Dfx{Dr;st#7R>DflCO
zUBrI+GN1tLBOkWf=xE58$_Ph;#>8DU$yWtvUKE(688ZP#L#H<#bkGo+G;)yr>(zJJ
z7ytLGY=%ZKY#AcH5q{|5w`bfMKTE@JHs9Y2cPqTmO+0w%JP`hLo-FpmHBwl}hcv0Y
z4%Xuj@zgaiSQ375<4x8*Pg|%?nqr+YpCUe0Y6+4M3ka6*2f7H;Q}rWcMMUu=teGQ@
z=rT<P=y5r|EDmoEwBV`+#(U5d?>R2gDLz0vstTt_)n^YUx*OInxSW<cq$xS!QC^43
zvGxeaBK;HwH|M83bJ*l3e28!IS34ySe#(#101<UY++c-o*K@Rv1-^daTKjtm)q^LF
z^<#9aJed*E&>Or#ByEZxj_?@ma)1^V0G@=yAB;XxXfc%|I2kzuPuu0>$Xyz}50_%f
zx|H|*%U5ma(TA&DYB8YB@!u{!-=e+z+XeG9iUtaZ{;|Htb625xZr!@gCNMo9WnsIn
z2p&FUkS*`pX|Fwbj`Cx1V9zbXPJA!_v?@D;02WckMMhZZ_<jEKpSRC_?sHyGSOpoO
zQ}$L41%mqGUH}|8-}9dL`1uV?{o$a6Hh_f`hf)TAS8-o)#T9n;*=IXEeFy^|(jG`|
zKmphXzcw8QOv7jva~JN6GtTf%q0z{>Q4#1o(7qcv@N*m}9pF5j8o0A-z0J`);6cO3
zh;g(~N#WHq0dhPcRP(U#!+6BQaF{zeF1R}1--yYg10j8*c<>X)D4&rN^@25o{ry-%
zBHy4Rp5P*`nTxunUfDfBCp@-|dBXd2liJH|txi%K;&aFuhDP}4I3nD$L+Hc;C%Rz8
zmoT|sQJ#+UMIOADRse<fZ~>?YyyeaW`~dSOgFys1KH}uOldHlg-048T7ik@i1cf8s
z^NWDuDLDUTim&C0!i1}Z6ws{06NPy={#4FZ-`)kHAk$UBHATd$D<$_Vz~Oo1_2Mx_
zK9QXgf@{~42iYv`f_dn19iP2&gUy&a-c9Bx2mN)`a=?qUwfG^Nunxr4*6b``>W#&{
zz+w@I`E%A0uxRo>zJHZ1yM3NL|K(@cf+tqnXbHA$YWq_p*eAA)vqj5R=+25!wyv{V
zJA9`2Q~+B5$8aBjPW~G!D**!oiZvS3`+aNe@Yf!ulR>2{h_))t&t%!b1%8Avi<U9J
zzx1UqwU2-N<1WxyaEybOQSM{3PZ;T_6D$hQi6EyR2uEL;uP2B!JVW?1{qX+xzu(UW
zA)NIIS6y|LuVJ7+VcO=-JMVP5$b+_aAh-bqU?2O+A>^1hM96Wf-`u%#U5K?Z`A0EA
z*ug+F{;DHB7={t_uwldPB`-SLmT$kyZkxB*Ui+ex#D$F&2XMiR-VxL5Rvfqy{4fx!
z@nK{%dl)uM^OYoZuizyu4qVY8;U|t!QYaO1;1Plw7qI@4{UMz6xGD_cjI>x#iSTFN
zcYAw>U3vv`tZ4-^XT}eOt^$XU)MT3bL#QOWJtG%x2#=_vNE7b~stz{Ck1%ASIvjD4
z7thR9%apRN<fB(ab8QSvd_7DJZ5AH+*5#MI6ke}i!E!Et{d|7ZK|ReqF(`y=YeeW>
z0r`6Y(lBjG>`~}y>b_I^DU^yx<8(Ss&&=`D+%zL$PrH)$ls)+9N_)Wxvx~Zllzac=
zJLTUBTCiBbqqJ)*0<qnWGdD1o^JBC-ZroT~wqlKa_yZT)VW%B!om${GNQ7enSb<>I
zoIYu~G+i}KxW-H1BQQ~1*$KdW{|-&}wQEb@8chRW1#mu-Rba+6O%JGjd)@4PF&kpY
z%Z;CY3}pu)Okac&&kuLgO*h$(e)J<(jHn;r=~EEWjP&VCu%JXgK^kD_?{E=LnY<2^
zmN|YF2j6<@t*(3kgYxz2SHIfR@aNiVueFOVy2$?HKmOx@5&o2WKmni@_7O*;r6I>k
z`Qwf|&KV&_xH%*3iDYmEGXNV54#S0V(JnJwJB+81>M3)l+hy0^XXpLFv7!N?*)xa0
z(F!*mQ9j=fY;Ua=lrQp#+$jSc01kzSPO&da1vsM~z#Vz;MEJ9Pi_r($DKSev_V_aU
zx67`#3CH|_&#`j-ZyIstLaeBqs3hS^5=;;L5f0Je97ZpC5<Msv@YOUqIu~L};5doM
zA{}?-J@U!(^)RUb0!@Qr(&Uc!z(d&pEPgpy6F+MwsN;hQiwDZaU@Q(o;fhm*Ce=WZ
zUyhJUfrfAu7+#&mBHk-8!PlkAl7Y^vErTtwD~#O$fOAOBF7CwDser#-yxQI%0n3^B
z&-C@c(H3ck@Hy`hhOmlvtOHDK5k|ePSh>nR`hkBjNlRblze3mf?kM1gh`%Q45}$P&
zBv2*Tw@M*qvEVj6SLjfnV`l2cODP6B+4dmE&3}h$@nE+s4%RWn)3Shbw{}C2|L?-l
zC(utYqQ{{GkvDb1S{|$eRa&SAgg=TyoWp<uf#bwN>I)Zr1$75Ll>0yXvp=(6{pwe4
zMFl3u?d?|dFwjEDpsz(CJ5bz#VdwZg_?r318KH!b!#s!<9D=Ja#h~%ixN)S1>1fy}
z7YI_e#2hjA2=8Qes<W7VsOBa$-@RB@i4GGSonj*wj0%I{z>PzgGM~xHTUJ7|00#$i
zllPt+c!7iFU{L%hEGQ7S-+sH}qF!M9{wNjT4Th(_S%}5@3OXFjl9Xl9qf2!;%3U^H
z`@uJ<GpL^mL84Sk(yG&}#gRAL(7e7pxE80tdZg9&GFEV|IAm$71SyX?=eP>|_=&H=
zIF1Spbx7hxX>-LTf{SO2{;@RBe3~>Nhwe!N`B!MK;BWOxN-u#B^UbK|s7B%><C4n*
ziy{-H)BUyObBKL+(K=hJ8}ksNrPZfMx;KyDlXylO2Pb~ozPAz42hYIw@EGm??)x6H
zc{kl@N1rmoSMGOdWj~|-4Laawv`qY5^Sxb10kas8sQ?822rU?#ge5@uH*D#)nG?rZ
z=N7?hyC1t;woBoeK3xYG>DEAZJw8YZezyK1#1ZH?`YL`n%FC1oJf8mo7Ny}$Z+eqm
ze);9j%LsN9c;@eMH{N)o?_>fWCxM=K-g)+yfBBa_*Uv&krV6NMEGry8&O195KK8MX
zIbQn6_(qO!KmRje0eFr-FpZ4{N#nfx?z?^74<Sc~5GJtxkO%|Nf$*bI^JHXBg9krM
zwu1$3%A1i6*M|P#5!3CzbwSW^a}I5pCA6U_=fZU1!iBa}D|(sl$CSAz(|n#mFdTUC
z$B%hXw(BsRzycq(7EzAgDi!1bPSW6@Hy9k8m=a_60Ci30^WejeIIKR3u1<bhohtQH
zND2vwa!<bElfY6OLdD+|gujO?1b<z4BRBv;QUx$@5mw}h5H3d80aZS8S_llf3V_t!
zO6cUNz$8)HKxyj%SC1#*E@%ys9V9D5h9d-Ci6sL|VOfCi)fX*`6EL_uO9AnZI`gdL
z<t_ceQ#b``X=qVDGRJF#JaxG4)L6I8mabUu$79zOA=2&TlRCmdZzC7sv{er4nfKrV
zd-MPLpiMho(}2T9Shv;%?bx={Mo&x@f`bMh<fHvAwO^kKkXe6+cyo=m0*+(OU-@t-
z(B$#sv}2&lrwy1sSfR5mUZKT=W7H3Ly*47|SqzJ<kYtg!o2Z{gSrZ38|J~pHon3h0
zh4#09`?uZ~V78A1e+(2@7|7A$Z2e<E!s0@<3!)IPmg;q{dz~u)Y!L*Gb5CCQ!WY^b
z-}uHx+3we20}8-C@_7-Mn4sI++r3eTS**Y0F~UXgA=EH~A@EpK#A;WVsD1>k&PdF4
z`mwX@?=Qd6-gs`e?-%A^kw*KwQ7)L%k37JQNiP(O-puzmDi?G`xv?m)Ac|2sQz2Lt
z`l3_>{tz;B9@GW2G2hRKg1R6c;ZIq*)p`Dc>wiZ-&o*q`VIxFOZ4^j{t<F0}z}{;X
zojAfcB#iKRkLm;z0w(qOK~e%OlDd@2)8G$WqyRU0xdWd(ygD6<V`y8(E8NpNaQahy
z_s~F+!BxG0^l)Tc<X7U&0gHG~mxbrSO3y3fJ)ZoMD+GAF_IQG?4p9lgx8;X&=D0EK
zOKC%~J;=XJt=f9vpzc%_bUd~Mvi{}arK|0%laFxaxQH(E+FMW13LgB89O2B<Gh+Yj
zSHEs{$V3}AR#q>~;|~#ygEi{kqQ$8rwAF8rbiv_zXA9uLQVO<fM*wYhBJi>bfMe>o
zQ4YhA!DDp+(9>LOwP1^#@P?ThWv2yzjp%r`!r-~+KdO0z0#ps>J$M;&KLQ;q0)vE5
z>WTlr2R`6%z`y+EFLxy${tgG9<KTVP8W2Z+#()Ty+ELbaJ#_#5GoS!G$KN&@JlkY2
zZ$)E`@cxiTBSx6=i#hK6NZ;xR8X?b3VY3dIY;EdP9$5H<ops8QqQy@{??D3iVH&h}
zR?eb*ht;B9vFMSsX#nNTfAi+ebB!Ghot63E?N6PMhS3!CGDX4u<M#G;n=@xlX~lpj
zI$X^EKl#obHvPoOw#ls{StlLxCrTi^t)1dsc&BNYT#%=76rF!n2B>ab$UE`|c@Q5C
zC4h<jQY(mv?CJQbcrqLrXr0&7Wr_`vWtmA|#hFr<+0}v6ULuuSj|QvzDar$CN3UlQ
zlspj#`Wl*)Sr#xD<&k(f+9!^B_W5m@yCYwZ@IUp)5SpHD^bosi;c9!M6u?nhWS6$b
zQ4i0IBM<0GZJ;Mks7m}b#C~w~wf4QQ|Erz)(%0MS&aD!fTV<8vu&O~C(T`0>{%Xbl
z;GJ41r}A?Y@aFEVsue2pAWh{BalS^m=@PGE?c>D19oqh9bJ}cn$1EwuL!CDmEJVFQ
zRWtf}Qry1~`nh~1e{Zosm$x02mwo{k@Ay#$aFhjwABU2VyZ8p=A2dYSpY3wzbnFZM
zXRc-M3#Iuzrh!I`S)B&BH;piF+z32I(3~I8-mWw4#dHy0t*SH)Fx<{RX_o!$u7!SI
z_+XhAgE4@|&yTq|77}u5Aam!OdWNei79QBkKj4L7nWA8-BFyuQ)cY&^gLZ^J=O{4p
z$C`jLl-Gr_mK&rCRvy#yer=Z;?kB9(gob<xh@P=15l>H;J@FP{z`4J8kMJUO{Uh)w
zG|)SM!t->+XtqpSzGvyn$ih?78xi0vl#&SQ0YgMyz5s0StfqB(m9`r0C<#wEpg6t{
zAtZwb^5P~){Vmc6stm$|%^<QtFy;;(0)JY_C*1M}tfy^2;&w^O=~@Ty?ZsLLz^!_k
zXLqZD(k9yK^Uw3^tv|}#jK`mnAZAqln|trKzkU1vV~4*$qgXA_+oXB?@nc8pf}bHW
z@v~w<(}ptbkJh>VBP8@DYs62w4N?fE>DXVa0$dO@LIQt+E&<{!fr(0wm1`Hp(1OR|
zIsvg;3Io<V=s|e$gI}QYnJRN$H}PHQD?&i$@xAdgunGF0k@s*xbL7K&_17ql^n@R1
zu2ypgTFLjZ^3tCo3v~bhKmbWZK~(tmwgJ-6F`Z*x<3JnmabEyiUJ!V#^6;iFra$eN
zS$5^atNna&&J<UWI~b0R0UXSwVSZ!pZ0y4)@4cZvEEa6@Lt}?-!uqRJKriVr?ZvJN
ztO?VmX;pM#O@Q76mrg~z>z;Wk>v%s@h1LH)k}BIt(K$qgl){McB2M^{<4MKw%i>dH
z5njBMX?PcG{+|2m@&i`5Jv;wHfZiLia~7Y&l;!Z0jsFziJ|gQdMf!LKP)TPQkUWB`
zORrGcOzEPml{{;8SVCqV6w(4h$e9hnad-~j6&HE>v%u}E|4CQ82OJCICJt2{ZqWG~
z4=(n04lYjMgRk~hFh+ax{Gz<P6fXF8_1g9J@h@De(9wRgABqmbeybM$A=Ia7fhv}b
z4Z7-go2+z%alG$lEehmVVA{cU!44I_V!=T8Qxgc>aXJa`@eSSfCnwC2@;<}2>aq})
zg~9M1iamV=iU4H}lR5PnbsF^U%@d0Q`1|Xq%jh$59E3$bl*0rpPh2%F(&V@f1pmMk
zz<%(RgE3(!R<v^NI6{<*I$&UgXm1+0UlFWnfRiVTwBIgx+|EDq1l7)Nb<}B25~jUh
z2M2KxM|*p_pY9ckMSrRhx}giYS(S^~pTm8)$uA$F_Z2sI>A<k4a1aqzgi}vF)x)Y&
z6u@%I+WL)~?AE*P)5vI+p91H!mEr>mkQRuMcoHTT5+=e`o+Mey2=&KlZn~VeFYzQ4
zwZ9dq<ZyYOnZzQ_Ny);<y9e$pzLt@Ep4dZbl-2>%!RSL(B8&7r!gI`d5(yB!)?;ZD
zf})T?RY0Q6B+P8(laeSsf;KwnpaiT>T4t0+{><?g3W(@UibqoZoG(bAEYmH3N;~Wz
z`}G5l*{n%v3SiWz(Ndx(lt=a=M8MG>vpMwR$T$4F<43_jU|;gh@7T}3_d`4Oj6bw(
zD%<ikYyD)uQ4;dY*Kf9|n(Jo)FE=Po)aafIfrd*-*|@pWMrjIfn0}A1S!dI<@NcBd
z+*>rl=S~Y22aX#mq5J3tJMCRZ+N{H7+IlUr<(v_edbZfY+ms8_IBSOZp+v(^&`4aa
zxb>!Ap?p<Gx~gfu3N!Hj?AO460`TnD@w1u(ObIi=pfE4mJcs14PzF;v3>(bbiV$!x
zhd65FF#FR}X4#MLTx>6S{?R_F8Y0}73E8E<3V)^)IFd7_EL!1h76;vQq;z^%E7;%L
z-rnwE(aD6i-V8^h@EAq0s{&;OYY2OM>Co_3aXKBMVY&7I<3Jq0TL1D#ciN<5r}~QU
z*h=PksBT=SBbR(Ly`B-KawK^L4B?)l7EcmjB7%6wQ^3_HiooKTgheedkf)Fyoi8J5
z)oIPu!+W^XTZA(ez$rLY*T|BPrz_#B^c4|hos@uk@MLs0%qL4;jnAq!gOaxvf$9-T
z1);2rCj_N{k5LPg5}+zE?fiJG^C_Ek_*{9PvUO6b*&_~rF^WbgvR)?yqf)p~r0u0|
zO$bdOD_07ti!XxwK{{|>b<NN0f)9UHXZ@dLtGactkB%q?3zsKxME8WzBmCsPl@jn+
zGP*U=AE9*JvI3Cb7%9_i3uNaCeBUX)O>_U+E+|XDP)!M#JuJ&zI}Ow68KyM(K}1Z$
zp$xEi7(RzbnQy1etoK1-CogEG%(+59VH%a2aNvo9zftgGYaej~?zyRf0R`Z>srP3j
zR~U~GK8%I1Lx{4v6-I@D`|5<o`%R_+{_=<S*#}l_usMfKv*nsUe?X&i%>B%tbFvmp
z9GyU4^MP(UCoB^z%0n=-h>$f4%+-UpStrsgFX{k?pr^A2FH;nBUf?9pMmQsE{(?nH
z{jg0OeVDD(-Ra3yI&kt3ZC*(!yW%~-6<5$vK8f;lc(Qn2c}Lhr8pmDWk9dC;e~1s*
zGCl}VK*YNM4_b2sfS@2`Y4Apw0@Ue+4mpq4(_uVh&wrZ12Cg~`4Fl^Sn!V_g+7Jor
zgG4VRhk$YT3~f^8YVj`ki{cdzPm4g^B6GsrF~jVd1#A2~0Jg-Ak<bG-)+6|e(K4oT
z+73btenUXg_GoXwh7wyxlL(9VXm>n;9j$dWzq#)Ld&6u0(b}JXq;+bdp2LV%t=r(Y
zQBKkP{dSG|x%3C~K6JNf{(qc~{@t)?iy!U1LG$}Z%$R0dCBRVt#wpEOofODbz?==>
z_Z|p`S%=N(K-o^EXWc<4^C-wDHVAqYX9P7TdeXO`0Kv12+);uUWUw9xUdF*=@uOT(
zMvpB15C#tFF&6PPg3HtFji34s*C?O(9szTpJqHwkedvqRK=TIMiZmDvO`MTA+wkao
z*wYRJ(^1h8;ehM4Q-8n>firct_{(ODwcBI{9scxo?Tw$WQU6JrQkd+_8~B2GdP_6u
zf^J6WSSVPZz#_s>2>Mb7RXV^C^ZnptzMoUVn$br%*Z*#l0&>s12Ze5;kCvXI^YV_P
zkRnAVk~)w~vN!>Bq6$+Q{Yicg&;H~>*Wt3b9^Oiam33e$NCmp`4sKxhK}spT^JJAY
zxiS!7bu<DWWeb3KMtB8hnSa2=GjPSbp1>A@s0_`)8k8g29b~e|kI}rlx_mPvgbOBB
zPa1`y6zo>GTL<KK2)%EYXByW7ckQzIOIO)3vvnAkM&3*{FhZwIaX=RWlQxWCWRx7%
z0fZty1Ss%01Ux_Jj<y)Wh;Yv0U$$bEz4;$LVOriV!Kf4Oy0lnNBldAx*tdGUPVLiq
zJ#AyhsIBhw(LTrij?pd!O#ABvJ5%W<>foPU;@>W95uB<;ftz#|z&7C+IfBK3Yi*W}
z{aw6jtsVS_$J^vd6UEEAkyGuTJ_CA!_ux_J!ZE;rN7<!B7>jce-jo|fI}`>K0t7nB
z0ena~5%ksa0#7(~N1xRn2Yuj)`o!PNMLk#J^R(cL@>k*Z_xAw>U?2Iy%^GTzykMvp
z=`!-?`~bGK!Jse~!ar94g0ZW(s_%4)tOFP`dX)XenRD$;Ke*lgd)#Su=9y=Ru{k(s
zyDJ^ReAV==-gCO?Fq!X<t%-~>f^L|xFAls>*06%ac?5LQK`U_8I`*_0WvyPd-hO)X
zUFu{H^S##8L!u)D92A8>(UV}`en(gl&cp9r<0$~)P!GjRol`ff_XyXs4u){z3s)DH
zg?n`MU4bQh;AsF8xZ~Lnq+wvBs69PD<y8Ss^&B+Rz<cX+p`0TnAL_G#BNx6GCi2D*
z`YOaldI9nNw70XAf_Fy$8Z>i@fNj%Mn2j22_bpmw=bxn`WeWjAd!bF?z=zO7IMP-S
zjB!LQhgabkNn;&AA%I^|2xyN%3;fFZlT8wa7kuGjTYT&NHuv<SHAgW-f_{glqBMOc
zWn_rv^f~ayxA|$5f5@cqwo=0V(8&`t@4v;50Osi5;XV?7+SYb$wWAK5CPiVJO_Cz8
zW2XeY=;eaM&J|ni{MVgg6UVnnaaryPFtq0U5%fhkyon>65r1wSK#-#lU=_@jAnFF|
zNNybf5BX4T(zO!Yo?(+ekq&zJ<+L|RQWqTXQO>{@;dQ|K`OAO;@O$)uX`nP#1YWa~
z_lOTT8aT{`+f(Bv9ye4SQg_!Dvuzv9R&BGflV^yL$NF|5wguwIA9y;x@CP$FK;=)+
z!)PMt=BQv!?7~_Rv;{L)k~e)Ne>zogU`@b}6$F}4Dx$*$KfgwfyXl`;zS17N>1vyO
z@{6_qKiw?ou*iqHbcJ%_s0w_l;{*e+Cm@mj;}4hTo%u<dyXt!xr!*dy;diS-yqADA
z4|My;x|Uxu|JOj=r)U?rb)=9c3lJ{Pu97s-RDq3j<e!(P@W;2VqH6T+?ZG3+qlr^K
zPJcw#<y%jiV=00l$3cs9DF-P6is;fbz=(0f>=%!$w*Rfs^+X+{gJleQ@^&5q4W47<
zi@=NHyHN<3Lyxrs2twL>@C>{FE*A6s>m}c^%fI}6JL+_uy)k5%l!<M&PTLR<*YVse
z@JoyR#>o<}QPk>WKdts3Dqdup;(93nC@<q>9oV8ReH;XIuoecYqp<aw%9`3X#<~>F
z)>dDHx^AOA@2J_LcZBU!-tfwvTwdlFWDrJL78>)TT&x`ks|Y&^_|`E+f)WHjQctPe
zQc6T44xHf1^JzuXFMjb0`^is!;)=*y-tw06vNY01`(j%0vdb>>b5NjxDI1OiX8jUy
zSWdtB&2RcPX%-!W2e?SzD*yc{Za@L}J^7k27@Y{64qJBONMAH^1YnhUc;`pDYFxF*
z(dc1N=F8_l{E(e^%p805$?NUscg(jFj+h|^O)J@f4J9Kwvnsy)J?Q4iR7Knn2;J31
zd4V_R$?;VEz%T_uCyVf><Ass)4vI9C9oTR=?x>#!9-8mbqt!_+-`b_q)(R<6OO^?<
zJdk7vN+|{*A<O9S5YI2(<#oq99bg_8e#O-Zc!dLAUT>ShIWFP|i|`_lMShMa<(&j2
zFq!jojyw;{{C)IR(C;y|^2Y$5kKdVoVdq3sM?Bw^pMdf#>&jR}dJ%IVrSW?Uiu9@4
z5-_9W9rAVz8*Fz!zS$mIu}&xKb@&20l!M$s13w(?jvo$ThQMPKOuL6a!)%V}oGGXI
zTC97prW1bf<Dc0-f8?Wf+PUZ1Lan)(Hd$w3NO?eja|spcr;XPQc(Mqrlv2<>WrFhA
zt*wCLTq)V41B<3g;O|twz`VbxaQ@_AASfF*X0$!Je6>xN)qp7ht&*|!nKQgEVQPa#
zgMD#%nW2pzj&OuKRu860u#_NVLkZ$kPplxoQ$NH5k3$*SlMBB6pa1zEd-I#$Y&^kv
z$|<M#nf^>s<h1ZDuD<$e`^;xP<CaF~;CqDo>}Nmg3du)4@)5h_l1uFKpZ~ml^rIh5
z?W$;B0lPmx_gwS)bLsc7vcW_&;^<U@ybyMn7~@D^%-lE_sR~axVF=Xdln86&zzrNs
z$)Q1*@lH75L^1KfcJ2ww?QefF&;IiK=iAf?V|@`4IC?9nDGN#kqalPe8g`r*2;JZb
z7j*D!<su&8&nbs&c|`DYR{*1a;wgK;g+JZ{H}90WTeRKr8*Try$$p$FrexVSsK7#I
z(~SmA#na>P0~1d=LST7Bxe&k#D^cLr&(j19@Bjo*_<3fo@|wjYuai;GR~MOu6{OZh
z7cY?HWJDS2fKq+-Y-a^gb$D|^>mm}yT1I6K{%um`9!4rLLPZH^fUAZ<zMw^LISGpK
z>PE#-9;giRm0vVK`R<s>Q#%2NVzecIV}VaQwnHfbvdpy3lh+OHJ{Lyt078*Nvsh5O
zOguJL3c`h7zSO?>>3_DlfA}(6xO$^akg(pO5&tIb?{Cv;f3`Mm(CvN4Oqt}0Ka2Ld
zrA*LrxVytG5Xy^F0h#;XthxU~v_+6r5$m-0Z;UJi+-EUd(_EbR*QK;JcdB*Fn(5{A
zqmm0A%;~QySc{ixnp_DY4kd^w5bA;f3km`Kz!g_q;m7c@PUq~i&-SgqSVwT+sN&_D
z&{q)tt#5tHuDtR}Uy#hA<oCb-{eIKm>tFwRFH@xD$?@UP5Wgr2gPgB@?Q4Dq0Q9~0
zz3=tQ%ij6Uclvh<xC7xipaAT{UzvtaL#6{k*s)!XJ-f{LBS0g*T7(EEZ6k&Xc;L{!
zf#H-qMk<)lhYug25z>?P2glDgyXe<`H1JFQ;J8Ru3hwCS%80@PUK|=frZ<$9jt;Fn
zE{@-&;|fMClK3MOi##m%gBO@kR#@Q22!MImu!7`xqaJwX{=f;Ib?Z0S|M%^iZSrx)
z+B#j3G9>!>R0M%0PfcVHR77H84}YD2ARa`->k;C{ar++v3nzd8_cWeBae@Q1Tx(FD
zuAw<UQ1aBbcL#l81&FW|Q~6sGQ3=;;ebz;M9@r$U=bgh9N$bL+y!F}k2(3%QC|k;{
z!U%2#%QvGvFt=aUvB-fUPou=BZQK_S-#2BWxqcF99l#y)SJ-P`tm!23Ln&!lhp7xz
z7}_4nDRcNwY4naFJb%$+68=B2Lry%~p42qd6z$nxxN^O<Pa5a<B(N@Fi1IvCEBLoc
z3Fwp=pZ5t`+0V!wTpSJ*$Ne%D(56FvumrHMl_>(0ge|%YpiOD0gq7QO+u6r=XeuI&
zmRSdpE8&Uy{{O9(JElkor!Qc{|EgEL%5UGq&l(_36aC6pzG5Hy*vI_6l{Em1A%hh9
zk#O{ftSQ3f`BUdu8^7>{FSw-<1>=Js{Gd-)Fv#Ja0KNkb+PEE+wHqjE(N}one*R}b
z0oX@BFpUraMWbeKKaD(u9i0g-Z}5$Z0dqGJ(%{CSSTJXYR*ojjygZ%b&fT&q*bwVD
zbh4dv+En}b-HYwalaF+I{J@o>6AV0#toRem!*s=xB};tHpBn;UEQB<!8Qq}(G{Q?c
zz{wMO=?vl~Kkzo|$Re+xt02>voHPp;FVV?wx7)!-z1$vIuesGQ|JTGsN`er@q@KG}
zjYYg_uBg{SycFR@0@WFHTZhAOf`lKj{1VaxTZDUk*Wv_Jz&TWUExt*5sYF9pH6Fl9
znAef&q?rpUo5d3z^<9IBR5e;7B*8>@eP}Ix_=vAL1RzrY?keTWG1XF~(DaGqFVcbm
zJ<gSc;yX|S2;epy3w)K11^&Q>PHUe&$#(6~Ty^A`a_;qCaz}s<8#dgo_}+inMgMk#
zjX!?8!D-`V&DywGH}z@3-*&C;XYt<{?bpZbPpd&tqk!zzNrlkKngHk?p;0|g1pUHg
zt8Mm-DL&F)vthH}V}MlvH%JNw(*`@cciYKx57Pob<trtEI^cUX;x|vYmw&(=?IElK
zywk5CkYDkNSNJp@(<1C3U>h*gn^^aPCU_KH=YX)8GiN$J(uV`i%TfeHI&fTk@x`um
zfa7zY`<(sDzx<1v`k{sV=ug6d6AQ(G=9+W+fmRdyKzV~%LfFx;@4D+QpFc#{A*36P
zbQ_TlE*gH-0+2HZD|fj*6N>^untVBl%uiR-bF4-b?>O@ayYQEbY~{L5T9KR<|K)Us
z9~wvxBatsF@|mOKT!OeB7}y*>ryIJ6;~6wYJa|!7=m@yp6Z{<OOBl)!@6BAZ{0L`Z
z*L@E>>`CcT*%n5Jn;c!X95nM(TylC-n7cAeZ@J%%v<bvV-{7c=FXGCuN}fe@dMICq
z(=+lVBFHIWD*=Jh<B6keRo645CtlJNa3#G2Rnk_nug|Fp+W<JnP2vV1fipnsNyrl<
zKYx9$KN9wMG98eFJ-7ldU|0YS9;$_OtGaB_GVS?S8>^I}2exJb@E~XN?^JvH;Wa<E
zi#~mko%nps!%L`-(c(RLW~KJ>w`mPZoA&i@mn90`n=)~{t<fUHty-+NRf<8|2+rwG
zi~E@G-zclV__ndy;-_^h+9|+AKO1BT;96jm0_P(YjP(G{A&++2kw+ZvN&<WTDR3*^
zX;z;8gu`#-#gCDBtY-oj{Tg+^Gy&#$6af}b^23ky6X6^<VgM9$AnbAA1`p-NlE9#Y
z9Si*4{qA@Bv;%OA{J|Xq62j<f<6HvX4-_|G0oaGWbO=98QXJ{aVKyvoJMzdQy;0Lh
z`@*omB0R%^KbQnl8YXl`{Vbq?sbDaEI66lBFF3B<Y~g>~Jqwm;AMr$mrEucBg8^7B
z7t)ygVeouKe&9{ejsFJoh9VKP!{F2hcu`g`9b#UjW8G7I1f4mp_%V}q%KCBbuWr-y
z|0y#6Ym1|Xf=MJa>gtnFDQYGapb=PJG%8DZB{0&5q7t}r;HuxhKz*PmD>4OlOV=o#
zusrXcc~JhE5)>Kqgi8%1;&}oS<;%iV{>GBCij((>^R#(6Qkqn)qBGlR;afKYMIGwe
z3*SAp!%CP4B&%@L0dwcWI1fM(4=i43=bU<^x3wCMfNuSyjk3>wvrO?{(-FV_=L7$j
z9d*{9*aivqgJccZrBVH=^_zS&PoA5!24JuRJ(u{bCR_r4qRs_)WW^dgbmBO_%x8xz
z0Id8UDIW7Jkm3cD38n)!X&u0rNjfY@N(0LLXr=*lO5k*@;gOiLqoe?!MC{T*LzHDt
z{L{L0eUBwS)&wyH5Q_lebG{3_%}V`N4l5|rl~@FDx88cIz4Dc>bbs_=`N%%Xhay2A
zf@8hV_rCW%`{EbB=>01e2nH6MhKTD;U*&-N!Jh#IU?2L3G*(8nVLIm|yEu)juN@DK
z7<?f_d4e0ki6DhpSda%3HZq~p?J`1cms#=jKR(31bISr94SbZ1(&=N-qeBypCqf!V
z!aNVnC@pk;&F1?(VdYOSEBFF8G&5&6fBt-*r(<DHXzqPs)WAoUOk)gDC;P<m6?W6r
zciOB|4z-mUbzu;6ah)WL-tv)qAw0`Cq7~+Kpl9h<IKt4)`ze?Z`Si=A4DflFzXKPJ
z_;3~c)wB+V@Xhc?(s&Y;qxA3|0hBmWWY1a{#VxbXAo}cu9&GPTT#xD~;P{T7N&vuk
z^()TXId$$XwV!JHsOH8yFpQ>IV3%-<ZxqJ80IV^v(L?R#`77-&w6}izn30-)XOUWw
zO^XLp2!OFFe#v9Y?4Q4QseS9KS81RBdA6RF`x@P^Shrc%4m{;s^ww&cVb;_MesUjs
z_c@-oYukDs(Q?f1$f4SDr~3$2tXd<Ibiap`0PY6ZtXl#%Yun%?&GWC(7C&g`9FMiy
zGB{RGKmT9WgjK@darTimU5k64^r?Zg;256Ed80*<o{=c(8%rL~;CKA|pd<RLMwu}{
zc<Wo=YDXV^v@h(VpJWho)>&sc_)mZOQ@j59>wSSA2ZCLD?X`X^I@W{p&O6Vpy6P&2
z!{YFbZ+yf4;UE6N-t(UK_&Nc01+bmb+kj{}P~3n5un&D{8vg?iJm6brFmYp&3TA;p
zdh3FrVHkuk4nYbdpgpr_j{UGW6oFtw4=DbSAB&#GYBlm-opFS{>od35hu^Tyo;P=<
z?HZI8@da*#Ka7ZwKIWKXe6=%pd&^8wws?XUdT}g##9T*xu$q_pXhd&8c8^EkBqID#
zl@HE;M4|IFdYa`*fuEFuV4~y4QRRiV=E(vB&Vl0l=vmI~<JYf*1Exes4ozG&ozj#z
zltligwN?RaHCM&Udp$(TJEI{gqJ~7iz!Pcp46nll-nzg#3_V{WO)ZAvl*=v3;0l}r
z<@u=GJG4bkd*ufWW}zxb^7w=ZAxJpb>RpCE!=0l50^yCpzgy=49ImtYf4XF&F67x{
zlXMH8hV1nyD{rLb`$YTw?_b|xuX_KdZQJ9U?9`W@WOwUy!MQW0TBj@ktlA$jbhNhp
z$@-)_I5sP6goOKO3GP)A<~vos>0`AUK={}p&?#ZRRp(T&s(<o?He0h{la15qe_OQ%
z2BF_6E5k^oXD7ir;cAoRVCxo+57v&E&D-s*XPjbFwWHukE%2qx@Mm9msZ!K_ztT5E
z-;pDMKk2iYIp}4_4{M@szWHYF5B~k%|J_aeq`@*6^-dqhIs>L5v7B;`3D$#fSQ=PU
zfR*)K?|PS=bIv&tbwE7_6o7r;bBCa0t_$V}(|Mzjd2h)LW`faR@@BgTT6F-7l!q(r
zsYf5iq^RBe+eJDP;Lx(uAT4ukoa%>=KJB#Ae6Q`^jL@kIIsud+loj5o4;Z^Y^$|iJ
zJh;HmdFprE&Gx_9zVK?Oj{qG+e-J7OK83_P&v3b4VFmu$sFw5<m_P#xkIwR_1J_I7
z12&gX<vU6M7779yxQ1zf%My@B70yBOaNz69(JUEBd+)$o!R_Z3R0Kd|COV(i;?$>i
zrO~53tz%?GR$irn53F8)QMLj{pb8ojobLn3N{IC!A>N?`lh_v0xx*fLY_%OfXGX*7
z0v|$}#d?^Zhv~O+&072Z|Igl?K>3y3^?|?I_fl1<q|(;fdU5aVrQNn^8?do)f`duM
zIN=-!nPld84#^w>33IYCBpExIGl7!{VekRWz#)N<3}L`HgJU3$7mRJZwvBspOTB29
zYEx-n=li*@Zt1V;l3G$PmfLlufBoP8z5DL=-o4-7en0qM%1`{Qe_6`bwdL-6Hk7Z2
zh}bf;y7n3<skARrg?$10#d+uZ!}%WxH_!NQMI>z_06)5a;eb2CzYFDfLi?@`=ra~X
z3`nkf&kr0qmZ}D-mw;eE<~}Kgt*y)BdtxLw94~|rFqyLpLqtgCnmN(tN1HG8;*0P{
z`TBl6^|(hn5dl6ewYJ}V_ua+EZ(RNl{J;;iESG${ZtZ&OTi;qg?V|J7)2?>^H{<sl
z0dNC6wf52xofC2UO>}-GaDJ|4dznpc<=7JUvBw^(6fiLtB10Xm-Yb=55!sS0g1`H_
zZz_NL4<9an<-1=Km!*}tx7%|7NaM;MeDJ}#_+1Z3xB9giK_T9O_L6|K34N{h56Wy4
z+P6<Tw<uiFlTYs{zw!ss|KGg6?*5l0G7ED4%I8Nwpq{Jer~l-<V|j9QIzL@;PS^XU
zuD2gGzjH~w$L^g@Q)NwO=)ESB-#)E?sq6Zc^PYug+4|U?Ys=kA@O??Tqz$@rvEJ#i
z%MBOLc6ZCB?3w4gv$I`sZnpDXd6)jZOuqE>d_vkv=g-9!Cd5M5_lvD?RBM`g)))KJ
zQ)Yd>E$791Yr*+Lw;t(wE$Lj|0YZinW5CZoy0?7K8}F%>{D{+5b_e|3cG-)gZ^xy7
z^pl@1|Kw+XwS4~n`)axQ^|zFX6Q?6Jn_pf1fcs?p{Er5_SH>{#SkwV$QsKhXi+DJ7
z`bg$|7m{1mYhvU_6*n3c#NbNOfB=37fTJN4<U<TF5;Q-Qw=$A~N1xhTHpO1xWZVbE
z0<b$^S)9N`AqpOS^pX0C3yNc|>W4XJt<kSu&b;-}XuOcgnxxL_N!rvqpZ#v_Z0Gf*
z9rxDJ*3+3~$kSU+?>#T~<EAuwx%cy%qJNF^H3{2GeLW5L@_X4u+XJM0@!$ByH(uf|
ze>LS==q)~j5q-Y%{_UlFcwhN)i~;8-R{6q!DDY>nNIY*sUoX+3y%JRd;)ET6peZ~P
zDi`xS#-oj+I;JOnR?)9Nv9lcc!e`5t{^&`&760OpRFKHeu37nR2N{rt8BR@~&kut&
zGd=5ED<>JBb*<;-dabv-T3Jtqo^m?Kt9___Os>=E56-WZ0Hk<0^d?sARaD+{k`bDu
z9Dw8(kC0YQy_i2$XFl_!i)nXzmM!n~e79#-@6~SQhWjt)6O%qbnwO$nh+A7%U$2pC
ztL@UpDX*PrqtJWKb97!FeXDtMqQ=fYd!elBUtIp(*Y?HQJ_dTmCHaA+7X-IYWY6gC
z6Myo#@{j(-zb@bV=iXXA|M2ee!1vr;o``Gdidgn9%KX?LOaBpKe;xsOM$7WOEY|xc
zVt~3O8}-=5Z)8#spykza|6I6zhJ9lpD*BfvsE@grZ(BI;pEwrPRU+zTER>C?5Xtgv
z?pYEezvYn%jHxRd2Aw>eu^L)hzVl5FW{1qXt5*(-dmrL-YDd=h2zm3Ew5<WOzP3*r
z-RJvlPFpvwH+go}H=o{jrJYx9J8t=W_okWGd&}v~_i{hJJ~Hg(J}Td!{<X#PdSAwN
zzsV)^rO8bX_FVe@%XL>f_**Z@Li6fZY>xTtlJe0nJW=l4x~A;ivpa@<w^w67psnuf
zX?MHNf<sclknn*A9;k+d7AR$2Pr0p6?Oyd*w0#&xb^8CYx>&{_x=YDZlRDe66Zu_H
z!MU1c>UWoW>hHyS0ZNy#Ejj@Uz@?m4Zu_oxxBOjE6@aEn^(CuYTLL|;WaohWx`51N
zxXY6PiM3qG@=aU}_}9b;jKSF6@c>u~0%9v8Vm^8*z>tJM;Ag%z+~c7m$2;IpK7nP9
zgnRT3NLslt=YbJg;n?X;#H?vx)AzLgU89p)TibK4C-cYU8uB;gbve5$b=-UUTF>U0
zT{su+zgFar-;rX)7QnqJVk*^N?Z?G|tCdY<YRB!n<!-;*ch0-xSo*$UX*LFW>PXoc
zcfhTQ2cWM&_jJa0|G~rM3lIHS`Rxz=Vfl?;{CFwD%gQaU{|jY7h}kDcCd!s|{bfn!
z<8izAWh{qq?0HU-c8`a%ACB1mL_pf{%Gd(fx|cZr=8g6aNHoBJ{qo^~x>1)M0k!K)
z@(&IV)<O=A9f=xX$<)R|N2`QD8RGx1h&RC(pFUjvV(bqX5sC;%(r8`aj<<dL_DTq_
zl4r*Q{7DCh&>1i`i~2TqJja&W+-kqC_FGw)la{WxoOb`*_w|p!9073sWA}pJX(C+j
zWM~Vl3A}sv?qV|z{;qsrQNKWXCKY@&9nhY$0nA&st}p-HTep}0{SO~2uieoXE9QY(
zHfp}reKqCv=CKeoPD9)PL<<##c2^x~GGFg_w0hj%xPVTLkj-AQ@O|`CpHCL<l^xdy
z6gwo!iMqyFW~KbFQrCsgxq30*?Kt1Sy)BIGd-}RE<V|X~cE2wERsK|xh4CfdZOIiI
z7S*`$DVqV<K#aQ%CU6XneMLA((cw-+q)Z$<8EbDo=S<bk1qAT#03Ki~OI4@x%r+<Q
zg$pOjSj5x14!D=5tg15(F)$vEb~s|><pGh06Q;I5*UqF}#LhZm8`>31=9kfku-Q*+
z=nwG_!!v0hhNJreS_1(yT<PP7kCZKuBAty$oj4qax-c$wxL{?rz8bo70rz;j_Y4*8
zKY_2(X3w%4-t;LFf%%J@DBh^0UAb75B2(izb+#+ZjZ^2f(KYX_{`mGAabry9O<5;)
zJ*~yo;=dR@`m64^DH64jvNM+bpN<X5|MT%LmPbDPncTgq-2Jw=r$$YozdM$%D8mEG
zs-^$=3)$H(bExa5u}j}Wpsff|rrs0vImPQ>YAlvytouTs&0n~va{o&*hKuvuFPx9t
zU)p<ke4;FfR{$vvF93#vp2x|&1D+$dL+`q^U@2YylauAucid7|g}}1`ko(&FVpuCq
z2Z<Y%K-;r`<j{DVi--a12#=v=Ghp7EzisYc%RF_j@}zcu{W16FB_Dw~0^lVd@)y3s
zZXxS}VBq>u@&fP>={@=6lh1RKSvcp1t5{O*T(`9R_CwK8E*mK8!s#>){d#(2@vuNy
zbcp;NYcs1{#EOOIS{9YdwE_4W2a*L`59!F)cRgJ``0M|*Y`Aw*?7O-}v%BAN+;Os5
znx0)G0I8?_>GJl1%FG*T`vH$s0jN8y_tdhwYx32ocF}8BAdXlWK(yz&GA@fpqR*9x
zsBw2K$wN%M^Uy@ODZbvDVxNI~R89q)TnB1n5i>g{`AhH2N4z5dqH|6jL`5ZpYOWHI
z^vSX!oZ3_m>RIi?`7f9k50OX+DoCb|ZZxvH9iYyyWdR-lzdRSH6B^)+9#4C7jW0SW
z#X!oy8OPwh5)#JZq<16Ft=3D|j{xYJo;KvHe#DYpxaxTe;=I{a-oB<<-(|gc-^J72
zvq%}9iF@n)z@NNg>b%pbXJIZLjHF~>?b7lqAAh)f=2IUp|LIS^R>~LuI5{?#&2PM=
z?0CcNRocQ}?_|biOE`M(*)s<Ymx;Vz+rpiT#D@@D63KyB|6&X*$T%?On+OqQ+&$YR
z0rrlV3x}Ni^!FVBmvIYWcPiy6gSG#{sN(jIWrLtd2dZy>bnEq9%9tI?J3lngAELl}
zbE@2N+s&1mKbhx)05SjM_1z?R=Ahs?VvisY5Xz&YqcsX??^eHv1^_O<1kin2QtoW2
z)yKw18FT3yKLT?Ez&HLGOt;{26K6WZEC(hqF&+IoAT<#g<N;7&exFYQF7)z8j!c$E
zAO32&bz@&Ccdssg@cAdoz>=FH4qaD4>RRh)<>L6YxpybO-i1WI>lqX6X^V|-0X}R)
z{Pfe$q^|Yli(h&uIUbLQ`{wf0#PQ^v*#n3$m2mehr9B;5Or3CJJ=Zei_a)~PJXKaL
zr#r6wYPaOu!oMI2=ct>@vH;^yIQ%^c=bF=%TLzv$K%zGW-Z*C<S%Evd#H|c)90-5|
zpRM8a9zQTruX82a6bS)~@dMn<Qa<q5pL=lXYw~$I_w5L`pJD^XjLVWI`4Al!!db2g
zP^n*C>%xtaNIX7rr255E9RRPR5pipid@R<Nraj70hB{9~T#n;ilyj>h86fgzdr@`Z
zxppp~wJfTp{<P<G24Xytg4*L4`~~T|aWw#SeDi&Q9YCzf!zH)WJ;!dGx^s@(!@YQ1
z$(5;W)#ASuV2qPJv3c^9KOTVRmV%e%YuE9!<>8%s%a&|nx8%WZFAM3{lX>^#sj@M~
zdTaqspF5hsJ0WP!9joH|{)phcgA>Lf<A}&GXV+)b9sv5-o<n702se_QqXGN=5JaN;
zJ+bd_o=rIUjM3^y1NtKRe|rC+YSec)1l~>ShRS3D>MYJUa{)Y<ph4cDl_BcRWIR`e
za3i^!pRrE_uet$H9lG>Ud%S~^V;{-6CK+%JNaOys7qL$|V9pAtBQF&Jb%77*fafi*
zIo!%uZY#glp&WC*{qDV9PhR1Va$DK%yWG9K`C4(>CuyVPUiq$6PR;l-{xe4ae3L%h
zHVFW<2?a=TxPoN3W+EdDyzK2IzuK<$md(8nSAP5LuPC?PwYGfwA3s%o{!4qxD~9Hm
z?Qtu*mek6%@G&~Flb`kyKWeY*Z4tSaaxa$`QKbJZjyQs7&)$9IU;XNTDrNm^%Va>&
z8qe4(mDghG!tcnU=v=*4FU>g4Vp8vsp*K^P*O>}j-OmbUoos=DUG7bW;ChYVwkiw#
zIIHfQ1hS*?<F-_tO54}tvI9bb$1JW3d_=<m%7ft|iL7zZu1^Mlari{S8;6H-EsL@7
z%(*ie2SkSaxN;$9*#zt!o2=I2+9luoly5v7A5Y^%j{}B_a^8J7`IGU35B-olVo02=
z#Du=~r5)oZJA#v88X^I)av`8LFBpEhzK4-IWSd>YNGEEK#2{sedp{cg{&f+3PoC^J
zTw`Qon|bGRbJ}xB@-ywKl7YqDma_HwbiQ0{&!j%X8TZ$79T-fP+w$esT1>6O4>>yN
zz)<?rxBaFvw)1cq&-kugvLNb~1!cqPY}l1P9LY1;9l~n8Kt2Ip-1f?Rd#|ZhmNOVk
zK7Csh!sT#u_oO#k%a`Bt7rbXOwgBEO1k>(A;|UNHPCj-6<9Wt|A?EsWzH<B-+a-}C
za6hC**c{?d(4YovD<g&Y!czz1y>LfaA2EL~acy4A27J!7Wd5`{NumMpn{+@-5ySx7
z0n!2MP$LE~AG6sG)N#m|n1@aE*Xp{OZ}-n^b2~Tl{+apO_12&EySKmX+_%#A90Bl6
z__W(g4@mj<15%Oqgt!ExCbtQAJz(0(cE4y^KDO<|MPCRf_O93NC?Egj&y`O-av%%K
z?7RBSHl%Ih(IL_=Qd7*_+(>&($kvA0%DI|z?r)1%(^={dLgTX!Jyst2qyJKFd*BUG
zjYfZ+_09s^7W!I#?zaG^G2Qocr4~oI+fChJy|)(PzOE>d<UmZ@JO+5u@tz8>ZeFvh
z8igG=5!bGWhjDEO0^9=I3}=zWb<&l0_oSStaWw^i<FxgO9v!g&K<IxF3j^UuiJX_E
ztf7F=zHs(7m^qSrPDjR0SAQz^;LvfRIB<q+o03;3WWdB{Tvyb6JmBnjWy&2r-q{Lh
z2a)xPaI8e$Knk~hKIiIAiinZ4VYn}QH>7OG#y~lMUtbX%>;)=8lJQs;BfC^rMbl?1
zsC8<BV<*Gm;{NLZ=gX0yug@mmJOQWz(*OE?xu_<UIZVRsH^Qr3XpgpD&yQI_pqA0f
zn<At#W}{_kS+IOn8H)IQQ3wJdeh(ZySvEzYeI|AZMEk-_6XPSg`11qaQz2~DuU;9w
zeysXqAJLz2626zz1fg^^<2#t=uporN{_&B-^Up?IAqrPy?E6>umAx@GTpi+pC&1EN
zI~6tA!O7!g`?|q$V#@dvs7~ZLZpyhMAtLK=Ja(kq|2J-r{(pJe(T%14_7VVfJnLv(
z`bWMtHu}&A2=kP50N^W!8jo0?x`3ojq9W+E9cXjgyGxpIxjHXx!Np(lYl}_MKJt-|
z)V93kEpI8?wr#6rxz_SfY5d;r{a$U8HBz10DH@<Qf9g}8szJ+0EvPxT8n^A$TNxK?
zet93wAcJ4toqaj<&I@WXne6oaqVEAx;P1tk+N5pZ=jFeeB%gbfkLzblXkq|=90=Xv
z19xr>A}lQb;p<1r*ipOinRr(&UAc|o7ls(uFx%!v+U;|-?VYVw<I}{L`XW_NhAgrV
zJ@Q0(*N^^OS$XrF<-qY6WmR~973sjtKBOHrwQoNv80EX3>u#vIrUu|b7Ip2%Ef2;G
zt3O}|@Ns=CvK<>Qx_4avvMlZZ9QySKND&S6<BYlh1F<j>`dGk{cwE_P#qATN18LFZ
zd;s!^lt0{Oq3`su{Smd}=GO&yG%<Ko)dD;N#sXS`JWUc!ryU!_cF1+*<5riZKE%Oz
zuG<8P_?muyARq;}Bw~HWZHO3g=e=b>T{|R^^Qwzw1_^gdmC?q>^)$b_u_=v_p>drI
z(P8{ZJLbjh4}4#m+V6bld7KUC_GN4lDcbF6UVNDHT)f-6VIZ{OS*hOZY|jlXljmOV
z%f0bDm`o6&1LH>m`0>vV;I4`o-g7&Z@lf_6TzT@FFTRb@`lwh|C$5DEfG71Y_jqo7
zc{dOa4D<Rz{7q(qALSsdcti~Kuc|)($6~zaS+2}`Lw#~SW%Q@tOOlW6#->PXjz{k=
z>Yw*r-R5yAC(G?OZ>u&5&bO{>m$uG27Omes-z~3i^AmyHm<=GykWds*9!Dbbkr>!)
z40FQ3mxqt-c7Z>Fq1gavhtHK#Yb!e*+xVK-BK>XOzP+wdtxc^A{k9(N*s-Iw$$dZg
zgFiUk*I)njU$2alQ0vN2R!dhq9xeYj`}dp#;0AhvZIWGXLYR0sSrf16`vEEOn@QD8
z_^w0B=p_IgyZ>@8xcnwEgE{{FsAJ&ozE-wHe#cGO#`Dgh@}WO^vi#YThs*u<U3mc2
zsaz9Zxx|Bv{j>wY(%WX|{C3avB==dES~<#aZb`slG>hav{rUfpg*?{jtCp2Rr?W`r
z`o*Oui=i8)7o%PvaiRUrcYwajN3@G`?Z9m)<pQ~5xZVJmd(MRO26Pr?K#D7@K#`LO
z1ES+P_5uBEYgSkBzHMIv_ti*l%TQlA6pj%mYgpDsKP;~M@$?gCy*&3H3HJ>gNCYhW
z3(~&*(GzdXUh;?1?;1reZBP%JfQ<q1W8p}T255r7m0(yFAOWDtsj7pNvnGLK#v+17
z2rMU_4|hIt<U|$Ut6N`;+M-;$kUs0N<l2cTYjXsFAx&{*GUaRx@G1XT#;GrTMj+H@
zl(FIRR?BrRI{uE_X{B~$_x?@M8OzgE%5gp(2BYcIqKF|7SqWTO+oR7OvVPTL4UwEz
zK>O~_c7J=%Io6VV*Ug9KFD&Pm_Laf&$LhGbx+MM3r)txXet7@JBZ?>S5vA|+*%<Go
zL45TmGcL=Mzm}Kky)P?QtgaFvDufkF;)kE>)Hddga^*!-ZHV1~XQ5nkaao={?-1Ju
zFU|9_sn9?;ee+7J0XqXm$5ZwHQyuHR^4FMar%xT1=_}8D;GW*({zi1WhRX)3>M-xA
z3Ty!MiBEi@#^Y~cd2hMpmbxA&>q>1<j&<nwfB*N(JKph*@<TuLL$&%}_=R66zxkWL
zS$^Utexm+v?ZRw`O4^ce;y?FuKQ~QuP?f&(o$su1Mp_>AxBJ_<Z-wuv+3>B<)o<QD
zdO3XnJ^dLG7C%}6cn}1$0jb_La5aH0ms;5t8<euLar`KG6T-x<<z&v|AO-^p-*wlf
z>~gZO{NEpaxa<o+EDiuzsCrW?SDT3jl}mqU_dyAl+S=#0{E~BxgRLj7@{fCfzx!Lc
z%5<#zANbAxTz>oCe74*%yf9YX=c>4RYH3dm>0DjYE?wAa)l==NYaZZF7OIeTZFz_b
zsSyG6);4Y-?#AJ*TTQGRYws-F6ItMoMUrCiek$WOn&0!&2-20^0d?2u#sQd+v8%Gc
z6DPOz);HoIRvopu0NeJlICA$<3kd$g;0zbdTL2*J4-ueeo~LhRRF|{-HU=!w<t25H
z<Rc}pvm8l7byG{5)`ZgsQ0xN$wFG=f4p{WNPkBDZBLgK%O%AJcAkqt9+>#@b1U3Yn
zgrNI<IO1)D%%!Q7+1;Nmmyj)$2v9#`L%ZwYj926tusfiOXT#9-t+mH@JD#dR1AE%r
zoke!_B+ornFIv$bqQ_R$v0KRr+&|I0HdF#8-|QN0*|4^X{Ka1&$uT;J1Y&*fCRNIC
zICli)Q`s2E7;xn?rUN1LLeoU$)432*4=qWZM`9m<phG0^3K&grA(AGW2Qe6whnIm7
zdLo45cf9VlvT^;o3dXG`o>MOY(9S=f@A__e{^^%_h-ok<NdiPSB^7x4+uvS(>Zg7x
zQANMI5(0n!@BjVsV?Xv|RZ7tApY0p?OSVQAzPBZ7$)k@xT7R17Z4Gfk8zia;TLFK^
zcYH_v?T<C>umAPGUVic?f3m#qeebK13HA2gKb`qyelT-}z0AA&&DT9IrI!<s()WM%
zv!5*}XL>6W%%=ffZ|~ZD&b7(jd);x%1N<5C;LwTj*b|sQ?rUXRP;h7f>4Cepl(Kqp
z`NY>p%SZq8(aLqEu<7DwD_46h0*IB%wMRcYz@7rw=@PGepsj5jl+ltV>D!00<?lcI
z=N~S2zAd|{g)3S(r890hFD~;_eW+#h9(R3j<pMB^IYBqQx&9K@BaV*asPlVM46@cn
z^o~mg&KBi~aHx{K+JSZhLTzqtadgHa&W!~CS0x__g9O0bGm`Vby#eq9KuGxn4v4Td
z0es+2WWPVQ05)@48;&3M?H-;3BouV>z!A`s8uZ2Z%ryx}F)jqy(uukOj%VMgIO<7%
zav2zj6@aT{SKy9Cxepi$ma{xSY&;aDepLW^>IYr(A<ou@*4KH_>o1)0#n0turLI*|
z+-veuW00mO-<v};Z%i{RRdOL#LSI0)Ue^{+FW)WIV-3`r`rWQ{H9238a*ua+@v;@=
z{K6PL#t_akq>4Ekdjz}m4QH1>7W+$<Q2(4S<9S|)LEa6xdTsFUWC)>6JOPq#ECib8
zuN}hr4(EBPZ-0z55txVzTLI&259ZxK)HxPh$d<qi2gl>7GZ6KF{tN$0Dp4lRl-J+;
ziZU2?K%mE1w58tp(mNLI`t^Prcb~Rys7KxAJ*nbtx7}8amH+WS{>SBKfA(j~yWaJ#
zYA+xkdn2lXUAuNof5bDPij?HB4z{G-%APH?`f>jxnQiX?_x$h=|8V(@-}sHX36OC_
zAavu?5k|9>_i{ZqrvkVEpRAYEB=YS=_VFS!fVuzv`>Sz{3DqXs<+|XyWa6~rmRH#*
zYv+Xm6)w2#K!|$3T-l@mH{Uc|zWdfS<s)A|T>i~R9xC7dnp?}}Y-NgjQKj}Od*)7l
zt=!gz*}m0#IRas3etiJ`tuMXTl_4DJpM3tS<=sE_fpYJI8_VH6yUMYMV!5bIE$O+%
zq90(ZcrXPDIx+POpd2x)@W&NcbMJ5#)LrrMULFn%r(3zt04^@gh|{?PpRs`RSh&2~
zH?NCzc4EMX007n_Qye*tov2viHxDR{AuwRL7v*olFemVj1wZ#|lX&hs4~>=O$q(d#
zDcb^Ij&oc>YLGh5Fpx`Yj)m~xp5|EjqB?5dQ;8culE5$!VA4Hv$rGa<Awde!00V(U
zO3)Yg!h>;})NW+~GX%tu=)pTTwh&_Z4G;nHWW>MKz%4p!k_3ZHe~pOL-da!E#af=e
z8wab8)wr*Yq>(YAQ_m&Wrc=-9Y6O>ZmqzzawSXug9$#=WHWC^0O@pf<dC3lR=jWHf
z5LJ7k#^B~y$;|XYQIg_4`mVkl)DwQ^xc<zayjO|BQ%){SkS*f+r7I%F&SpF(7nY^V
z0(=V=mjh$5j*qT?N2EAF-`*46QRB>xg0KIQ++*AT&+*LVmC2J~>qPzQ*2DntWF1@K
zeouuc=!^CL=tN?0q}`q|sgcyi`^?F>{^dDy0c4z5`>^L!x#x~<hdiQq^Rf-0`O)To
zE$Bu1)7BmRw+>kY^qFKpd)ijM?1Fylw|;B7P4WY096V2<cfb4H^;$3BE4xVtW;Vao
z%NUT#Kv`{F!QixHOJFKV_EbWTf2*ZA0^kOF;_byX0h)LpXYTjq^!=CXg0g(v+najJ
zW|Sw7vejpTnwYbtWUHLcPz=Vt=e{lFBaa*^pLuG$eCQJomH+8o-;rAo6#>djbT$em
zoN{sDS1Px+E_J(aW}AESIClMV;18%7+CBWl)8&o-;1`0(2{v>5SnzbRifb#!pT5*&
zXnMy-CjmiD10-DSV7TGoY@@ns?}7BqxqwW79JuvoVcr`+M+A%nu-9j7gc^P#1j2&I
zaQWPe6L0ZXD3C=^D3^%aKJFvo>PQM$kdFnh<;7VF*X!KF`$o&n0h4M&Fx6JT+n4%R
z1Lc4XcQE%HPFu9wGhG_7fJlAw@=S)ak&I*DOr&4CnSM49-y2c#aN-Ye^IN(gpbjvn
zuH()R=iW^jgJZGy9}6+k5gg}h2x8I@<sl68OCl{$&+?_Qmd`N>#;UZpvxCSu1O$OI
z0tB(dsy*3=1M(*)>J!M<Wm785|M?k*Q)fdAC!couMhGpc;~Rj;3H2YzGjJ7PYT>E`
zR}y^aNC*edpC^SrNXWP@B(W2E*R`g*+`06vST$JYogPbBTm<K3yWJD%UzXF9H4$(g
zj8sH<q%DX$guqAue`ChjhCTZe=K<liG;V|>EJGm(s9|mz>MZNI1$s8>CNc6(P#rN4
zML_Mz^V$$2OST&)@~k>lQEXB2?jiWnjD#g#T5h{}TUywSXuhV?Y)rxQ#TV(o+{U<=
zw_FM(fDXi|9sl0n`+HZcne>1`VH4+*S_*62l6PLReyEG;@dF?DK$UDdZ_Su(ZF{b@
z#=S4sH2X^ba_#LmLBG6=K0uRyKixUQIq~ruK``4(JX_aH2%k3DXDgdEE`u2qvdMU*
zvTIpct#PQYyL)To>kG<lYnGP3`8!`KkL=o8MSMr&ex%LBecG&Clm1HO_O@LeCh(P#
z>x~nb$pPr=@^BYV?m1BY$M=392vN!{+3@F?aAoIC@sXZ-<t>nPmAuFSw%^)H%)U0<
zwjeAkLI800!m+Z}2C@xExoQP*D_6a?KSp;U0<78s!&ELg0LM~Xv^$A)_v%O{Sb|%x
z5E07ibi<1i1SnyBv)BVzZgx0Vu8PFLxM#tFi0ZY4zdexko8jP!ltuLGe&8X`a6sZ{
zxJkx>T>SKn?zwW`DYx|#I1@i3I_S>Vr5#49*&7%xu4O>huo*S~5F4Zehz~%k&xjsk
zZ&4O4$yH*I_eL51u#?bE{jY>nju}JNHm6TLPo%!TZtHHYNA-_#C8Z|XMResSD{*Hm
z^{GbMOC~BB?~5^FyPz%0jcuM;XRRL(g!Gy80P$juRM>45^j4e(>w}#LkIhdkixUBw
z{6@gJmqf_@slEcWckyWI@hpu$k^jo52abk-I!rPXf<ZOn`m-Y#$os=M6TwJ|QB@a_
z{!npH1GsO~5J_8iy}xf&H7wLWQUQ_>1i`$tZ)fZdzWdF01t2$uc<7!_s+vyjx!O57
zebe(isPigof%Sj{Kr(-ys~y@ZF#!D0AN|q7h>&zG5HnE#06+jqL_t*X`@jGDs|vw2
zwgA)$)&*;b^wE!gw7ULr|MgtdMWVosu&EY)=4XDUs#HlCz7^BdYWS_#^&7L#CPUNr
zGkT%#=dNNg@p-Pv(t72Em)B%)>?5_Z*%Hvnd-1QeY_<Sf*Y}rK4K1mhM_8WnAOG-+
zWp7jzpZn9#maSX2mV55G=bFpaK5cGoZ4=q~mb4WIA>YigaBW$*rG1A+%a6YAm&&I<
zv#UHXw5ps*-UX-P7d^e0XMw0|M8>2p7J(e!Ar3^Ot8w>}7ueZ)6?bS6!W|EUQxl9u
z02^Xvo97G#WLIY~CT6t-D{!O#296g0^)d7UxWI`Kpbs6mWY2fpak5pA>jl563IN_I
zkBeXiChh2m4f=WDNn%jB@st5X=zn<wq^b(W#)y#}(eSEV<6kZU;IZnR5P~52>=x(k
z5PN`onzm6b7^D2J^>B4FR-GL{PA0imelY~-@vF2Wga~y&-G(;pP!2t|I0e3`L8^oz
z_XB(tARdmSpWON=XSz4FRxx#~6|<z8G1ok^yc3c#NiOt>iiGz9BFgi14N+B%_NJZ<
zf<+&+!I&~6tZ>uY2nA1#zeDv2LEP6jn0q>jgJ+<eYUG)|A}$ySu1=qa0_;me5Kktg
zFulF^RB?ha*#Kxso{9HoBwW4Cmz<|M>JRt>^&=TGswU%ypcd?BbtD+(3F6WQL5Ko7
z6B=*om!mr;%D27xuBx`_-2b{UY;)=xK>!#!0NvCPe)p!fPR;z*X6Lx}aZhIX|I5Gp
z%jJ*%_>aqd_jPVajS-Nq^-;vaANi3VsZ}&0!uWAbbpQL`|Ncsd@Fu`qxp(H8(%Zh5
z<MEsd;0AoEs9Qd?*P`#Ats5qS7aCxiAYO2~F2;noet|l!#+IPBY|97&fGGP7n^%;d
z|LR0}_2$0vPyfr;O8=r$<%hrTJIm_+HMLy#waM69#+8ocZ<BRqKK*D5*UWN>{fSEV
zA08|J%TNDG`OjZ?y4-c!n)1lu$ugKgGmFoiET^;RT1cm@_&X{f)es|hV)HctcxmKl
zjF$ouHmx}uj)h9$LL#@}%%ZJ@t~D0E#oCxWweL{nmX~KSCjveZTLaeT5N`gI)Byki
zXX+5`0H%N$$4WPE_r6o%CRwoahZot6o|t7n6~W3#A{Yp=4(E@*A_h^o`;jxz!KW-z
zhI1hZ=(pvkckf&X1@B30_~`zLlIu3JsUK0YIQ8o1u}TD_ZTTJy@Bo%hP0{K3mxKty
z;ZuJwcBArukvQ9BAsqOr1FR(}mk9n$>O)BIIAGl_uRi0<N0WabW1)SnuMirT#2{@)
zB&AswljBQHOkK(t0PSj+n&ZhxcLpwG^P-I1p_FgrR|F&h;_(2e-SW5zA_5Uq2a{(o
zV`Qw^J>cFEV$}{J;G3eo7Paa$Glv~%*8S~>lqIVYd2io=I(Gd4Ib)+go1+HehUc9e
zSQ>-Ag^S98k+ItE9U&;k<H<u>V}Af+x;d&95|jNg7+fF8%dtq6glO(h+<={X_EoUg
z9+E8pdN4%8js*Jvr9fK089#C~b_l6A|BT;ClBKvsRs+EC(Q^0gw^UDo6CoI80$ZC$
zQ=|Eke3`R|pUbHOXU^%Fjk10L{092%y7L&N_rCYN?NpxIh4P;Fyr=$l+z15TQ$POW
zKR%tW9mw-s^|!n)r|*spc{%myo3t-pK#3(RcRca@`O}&#*ZX3&`x^KYozaWY%U^HV
z%G6Zf_iP=gYJd}zsR@hvM~@#V=jO*sSDf~#JlFa-EL3eQ+P!`^@Yk2tuXf#H2B7yI
z94&wQy}wv~^FMx}yzaJj<v>7KLq;S1KYKch;1m&5*-pcqMU_f`Vb=EHHP!MRcZjp+
zzrShyQ1xdAh9g;QakzB4<1v^EI|(1^&U;+?+~WkUoQU6oZ4AzaTNE_LL<6|%!VTh(
zPo{hzuqL340MN(IJ&XjR<77`q;s8V>051kSkis1T3}SQO4F~}~&JO_j&EH>e9NfP0
zwTq~pLE3@rIbZeCIY$T2-(NzYaQ(B%&$<K@qLz@F?%W>qMA6h1H40&#)ky3QaM2wB
z)wNSRd*!lIpr!!QYoik<K|$oeR{CtlaJX*$B&DbWXi#US<e8aL^mKB#AVGuZ+4UXa
zA#5%JfcvE}tEz?&9|#@AEk$VLpZu+?>fV_12%NPc1U)BXlIGVocYnHSiP_IBTvlBI
zw<LbU){X0`s&47B5N0c42ND6w<f(W8gwO!M^!$T0FjI)JJUioK*FKT~;LQf28cL=v
z`uu@N1dfIf5RDO`NM!EW()SPLnJ&vaHWGr#v#N%kAv_Vb)fh9+Fh(oo*6rKsdo^>u
z=)pIV0PwEQBuKhZz4BlE(C)TAIF@GXUppsnJC@h)cE8`Pyxu&{f2*W90^kOD((MIg
z)Pg&|R(=1>ZnT%2$R9!R=}&*UE+Q=b*%w@kUcR^Nu3VF9es&@2D{oxAusoX8?%tJ4
z%7-2~SU&LEAIq{z@?c@@S*3dmxx%r9p-pVrl+g?Ry|=nXylElb`OLoZKmYR&l>hYk
zUFD8ft}TCdaH4WijLpuSNI;b7wsdXsH?6EdByK*RYTO9`1oi{r7>|b2Ivju-4o6q@
z^Z`GcxDa=*&LRxhY}x`)xB%jC>Cfr;5etmG5CeR|>n?0<0ZZLk(<?CJxGntcz7_=#
zKp&3X0RRAhjQlL_0F-EZl4v;XYlH_8J9S0nWQnM2|2iw_D(24pb_7%gV>O^y_2ub<
z_yh6-L!ggP;7@OiJjW9mm%)^c+Z1DfsM|H-bw2l%089ab)F5I+Dg?OP0evH4)Jufz
zF#y$*;bdz}{18$iBC-ppV4QMWGAHu?SxMb_pp~HNj`A!+be;sF8CdEP1e5m&p;e_J
z`Q3qexWN?ab37t>#()d~-6z^2QU03je7ST{_q~2~Kt5xDc)R!xrBV5PVO~W3^Ao)<
z<s8c#IFOzBj;220nt?j~e_zVi1>OlbbG!4YV^b{s5t&bo9ZTIEumSkK5FRyLa3VD#
z3J`deNX#=qd>Z?unbXw^AZ_T2ebAv0hW!5zM}<K3@|9f&%lEzgzOp7Ri6l%?TVLvr
z{4q8&&$VkW=C?U!ZMz-<K)c-6+I=-;w7jiPe)pd1EwAPIR{EYJ0B*3S%{tZ#3iPh#
z1)r_?4R}p7iOx$f4sI?czDan!Wh+m)#FX>SpDu5`BSA(YTR)Ta@xHYy%0K+QhsuA;
zcBzb#0OYkMeQyLn1Ak>S5nd~=F39<@BEa<3$99$P`MdvT`QXPNDX-tLwv2`Qx{yAO
zWD)I)WMbZ#<CQ4E^8p!)H{&*9VH@5EB}<1o67XL?u)6AxhXV8r{J8e9T<4CZPGdJO
zfIu9ttxeQEF~tcpHlx?2Yfgr$Z(f394xr-Xgl)B`1AW6Y8fgN60Z5F!#OzPgYTk9R
z7zdt4QM=VC5zRB&Bfei4@K_seooa&X*r|YyvVc2cfeL_1AWg6L>BH)bStoVqKwNte
z1QmdD9VeM0x|I7;RBf{|5Hg+*-F8)3<aYy9peK$2BF42&-6HT%Ea8)X>{xaeOd*I0
z5(8}~aiKSNzIS0(xhAxCxob=ZWc3-=QE9m{e)$bBjg|fyL>BRk`$$8o%|`0#2!iuv
zB_b&O_IxGdJhTc?cfLke9F3G@GMu|8i`@M}57aS1Otn<s&m}9m{>J`eB-}pf1Ifmz
z%wNw{TaV-!>kCx~8<llYz2z|lxVya?*m?*G#b9$`)CEsxr$t*plV}Wv&|DCEgJ%-u
z&)6Cx_9aJ?e|;i79?#gd&4EY`sAS5M6XlJsy|3;PNP<Xh;N#uZ=JuI6r&_;W%x~k%
zK=}ErKQCtczSYW}BLHr&C*CIM%*F6(n`v^JT)2FeuDf>asyef6+qTt3!1e1XyDb_d
z3IcO*^XrR_^84PBkegXVt&69loW5;nMftv8`or?rlY7INCr(=?{I#Se;<r$cQndc`
zmf;%iQCmLt=`WT0e*9<4<MB_w?WWb`(ZktmJiw0Itc(1yiE@U{J+5$;ZrmD<d`Gr2
zJ{u6iwTik1q%Q;zl#L)*x|o4u3=Q*P<t>t%`y5+{?aL1QE$|406KN~(6=T5ove>7O
zt^t6uSOa9DbHQ5Z$N{iGL8AS!+FzPQU0Fod)dxRqMQCX=fR%qN27`_p1Zf|zu=C!A
z5CQbvMCphSW%4M%T{4OT{3}yGU^Auz5l0^lz)%}74z#Upims@O*s(u=asZxv0e~gf
zK<WT?1b~|g<7hxHRn&WrrEV_*{kJ+KAW!7YHiAmRW<<pP+J|T{rd;kw3J@B-sS$bI
zx$a`#w3O|gx|o^7NxkQy|HjE9JhY8Pz2Ds%sDJJ@7K|=KAf*k)khT5Ne1tFFllRCy
zBq5%wet3sTbXJ54U?gZf*#cCNdvyHERwk5hLftBJWr&4y8CM<<19^|UE2K-eCMw{P
zi1Ej>2L(xtJq!@Y+KJfjOTOWuZ1Ef75mDk@pe9+rX0S>&5Qf4EFUWg>(5pmg-jCxU
zQu<;Kf>_{^>YltY<-WUaukMD{fyM~5Ic1G$>x{AWzFyCo;NELZ#?!jdh>w@5-fL~=
zw@Usw0^kOF($8l?m|SgQ0ce|V+;r1TwTErOUr+hUZA1lk8@u`4d+$9JRJW`jD1YNs
z8_Of3Bm#WXMJPK{e&YQfDhCtZmWcXVi2yrvX)jK}0w!sLk422Q)1v+#KJ;J9TfYBa
zCVI+=cqFVU6Iql2`u<qvuMamhuxw#DlmH_*EL=0tvd8+ue4ox@5Ad9)*S0mQ4SNLd
zLAV?m%iibV>P3(fhzCap#EJS@mE)Lk`oJ8BuHi$UW1Kya?Gz?ei*Dj=iC4jrEZU85
zuvh~*ed)>$$DiK-l8*ggM7jXKf^Yzc6hZz<5TyPCiE2okJQ7vGKzz&Bg(IX&(68KH
zi6nJO+Dkk?7!fm(KhVdaB5aNXM7YKQBw#PvoWMo>%R84qpt&X>0~j4s5md0tI1%k@
zA2@c+Sh92n^gt|^>IFXT2o%Hu7_R+Fn;i2Zz^QBVrRiBGYI&EOO&!{z0q4#os7~q@
zw*f(+JVt#I-9L$mHX+UybdjGbU=F392O}v_2o7Fx-nBHX5?-EZPSxi|B`}!$Cqf`p
z!Y9Pn@dXL@oQQr~vkmjc=-&IX6W&cjYZBqk?tdYA0?21W9E<LFFa#K|_Z}mtJOdm2
zEC|8Ly8mcC)jcp3u86@J*G$qjeMCs?8=0sQ1`;;gF`Lf|L)?;J;r^c<oh+|-+wC>}
ze+^6(^%epEv1~4JDK@`Y$eUkHLeSO+WA$PtdyUvB@Osvv7qea8s%6g+056j#Z^1Ir
zO|U=tlRv4w;)idtHABMdnRFH^t}_y%Lc}hSvuvnw99J9=`1AMQT*`RFt??Awcj9b$
zaLvl{dtW<H{<mNG{eVpt_bgm=^;b&TW)ZSLX)iZLga)HUZNR}JZdm#sh&c40{_9^W
zKlFEgt=#wE=CV0J|0uC(08ZFjM94(^efJ+cR2E){i(3{!L`Z)YWSkyRI*W6}39bnU
z@uvqC8`ln2V=Kf0;s6K`x#GZyaM?agaQzG5ug-6aERGn!0SZ9NGT+1k)?)ROR#?Cp
z?j4TU7iiN_kA{Gt*X?51M}(inA0dE{U`R*T4~KBe#P(Dcz_#wn7jDr0^0{0IgF_((
zw4npmSh@#LsT-89Ou$FST=yi%Ipa-@&>0SPfrpe0<kf8l!X@eVqV!kdZir*oZkq^I
zAdSQWr>o!E*$A7}Au<3k(DVWG-T@+Npe%2iRDqa|TAt@j)0y=6(&YKonqmlu`=%%N
z?1VcuFAz7{rXpkEnl^YYLJs4ijWJ?|t{!*a?L)vzBF{Aac8}}E!F~wqVkbbgKnidw
zhObA?lQ3nQ<ME?qcl7!v6R^SWod?IF4k3Ea#yA0K5+Z#E_JaScT{BP#=8>pJoF^?I
z6%fVpa8y_tib%UHMutG1YNfjU<ym|0`2AA@P%WgX<-oql^5<W7Pgxhwf=(Ksh>mbZ
z0C2$*Oh-^1JNS7Id`KZ?stPV0<>&uPJ^b()AWN0Vdj7@EmGC@9V2%KI9)tH{+-j0s
zOEQUkQ23~N;NQFH4)DC*iRNC5Mk6Xn0mRi-mj$Dy)cwlax0JGZW!V$(UzYZd#31So
z5y$`B#~v&1{lM>4PP(cC0&JISi}OVOY#<OW+<KfNap1kM^NFu#@t7!|{M=W{U;YO_
zSN@M*{CK(j&Vh0+^)1cfXE7WJcV+XPf#sd=_r&BxH8$h+#gNZl^tIjT2Oz|e0?kwu
zoog1ya8y8>K303{;-BApLvV=fhD!sq^Rhszuet;Rh>R0mK8_H`ak&GgK%f3ss^adH
zaUpquBhj-cfbIp_q!NfFz<N5IK9z(IFM=u&X!Yef=>UT=pIX)wr~!D!dR2UlSPDS{
zxN=MS*x6A`Mr3c&<COIo5u`007oUvD2ds(5jRhl0stjrY!Hcw2{paHH)}QEtEuc|u
ztK-F`j)Y+KRNXmURSBcvei0Y$m-Lf|z$kHc+MlYXQg^*Jq@8+>@5<@QlR{3N4v+?T
z8DJtgm8M?6u1?04fEyt=9GyQv<sAa_aoY$d(jH(;rPC1u9RX!Lw3D%6-?Bu3jGo@P
zbJ5XniuitE3=wT&BvK+H!jmUs#h<d+OXR`Vt(DmmtjK#Hta5*XaIu?sGQ_KmmW2HE
z&aH|4$wbQO{P#n^<Qa+PIG#XEwvCqT34wUkmL=&+H_vn7nG@wT_uiTJcUdK<sgv3~
zwvkwiEkZKre3K6FGBDT8apw^O=A-%I_w!9{-XF{U3DytExochTo^Lzm?!R;dI@a#e
zb92AGNk*VeAd{G`|MABkuZDi>*RQW!wDY)k9QCy(Wm^=e5rhjC+(+Ob3xh?W>guz&
zSnO|FKUjYJHJi)7`fpE^wL=kAr?QD~K(E`_SAOOXA1(~!e(F8nTQ)~Tsx1~ay7jiy
zTVQbiQd<!EV<=@yQ6NeL`svSosl4|Wf4ltq-}+i9ujnuLzixAR<j|4CV4p0TBJSUj
z%~CFe1Kjz{-f}z({OW+#u~XsJvS9L0x8P4ihb~?^FvIcRx^-iDD&c)4y812gyPpV$
zj$1q&pr=o-qVj--{#K4S?Exq(y%7)+|9T?xJpu5IS-dTxUq3Kby$~4o9nHdS3FWc{
zKt^&77;(un29q%~+p!^PgZvVb7YNlI-NJnXOhILo0eCBjr7f-tTtsv(X28vB`_pb<
z*M-^j{75>+j(2+dQvqxe1O&i|Gm&DX4I7pwl3ch)KK(dj7>rwIRgUODB(Pfmh=>h5
zEkXf^j;GCi5m7e-L;dino8DyH5FbR`R1#~_j*jyV$C`UdF!V#xziI4u!fAsvOM0%e
zz<B9=J&;pDvGGXTwPmytCe#3ljgSa_-Z|<KL3W1wIw033QUUIYxP4Ner3u4`NbuYB
zbCDQ$KWy#Go<lw2FPy&M_;@5fEAt+42|Zi(96p}D#|tF(7Q8rWBtZA1+<j5o^e<%$
znunILS8t*jm76N(Zl}PgWzz1YnZx2p;Qo16Y+t{&Zu84_LN5`bE0#sfpJN1Ixw)^r
z;^u9&t;U0W1z-pM2x*-E=FOWcxVO2MTkE{DR`_^#+MM*=h<*9J+djSby8mjDd)P``
z&%IYu{#>4GAAvauz_pLri+;z|Oc0Y4D4C2zdQUy|R29t=>-7S@>kVA5rs2w!Q0dCF
z4}nO87LI0==elw&pw7|BzxkCrthbdT1w<Ch(fHxNK9YbBeD=xm{r}|G%EvzYl^RNS
zc>tj<{He?0!dko5n|plgKaj<}9Ei^IBcJ?Y`RU*IX!-YFJ5*lzZJWx5EXq&ESZi&>
zutMW9s#=ziY<7{O$KM)Vcu8=a6p<h+gOx)9);L{@;&=!F`fy>3iAmQczw`a8`zk@h
zPQXrle77ykgZ-<c7mvO_ebp}`v@YHQRrH<X>gyg3FBj{A%)KkCaWx#fY2BLWjF*+|
z3E{eEQ8>AjXYm%d!1E&o5Gl}jTFk3;e2y*r>>1Qm>+4rv$9*G;aE0=_2(0<i>!Vl!
z^!!K@27mH`L<W681!r8rIQ=#5+H$P+voka#{%_%c5kTxG@<3|;m65|bR^9G@UD#hz
zh>EwV6Bk|WC2}9`**_w55*LIY5qMAb+;MeDInPcxXK<2yc{XQa2q^Te_eVXz+B;>8
zsDT6`%G0UOAMr|;&-P(e!dfGQxCN4GAb1d5-ti1!%8Cd-;{;Ik!+QrD2O<UG3Yp;0
z<;b~EU$F_8pXWSwEb;l5L>;v%dpN{Cfbrky3jy-e=gRK9)1CtZ!O;-MR6uoG=SY}{
z`@Ls~C(ki2M)gEsG&q2GP1y&-{j)`xIAXWJ#dQvOre8fUUf%JB+sc;B8|!<m-@u>P
z9p`Tj<NTFB+ZrRkkK}!HuXW&slnz*%y!UhI#u$M)3BZl<9Hv__+r*r{ICWs*F*!{p
zdU>2a(R}0fP2M(<O}2JCvz(c~<#VqO*TDk^4%8h2?H8b~-nKdRu^3AC-nO~izG+q2
z6TRhN79&>ExTXW)I3L*1SH3!WtbFG``uF93{Y$SZfAKqCRc_h3v2x`=o~V1eyeztO
zvSRuF@#p`n{KD^ks(kKC`@>q!&%Of*!5iP}bLkWPE^vOuaDU})p4fY^oKJf;gd1o1
zo`ASzd%%~(0(KxYK*-W8m}BAchr_9}oMs#*qT1&C9tqbkik}6YSQ)I$PalZ!5efj5
z<iKKkjwyH&j*_vU>&wDD;jHP8H%BT0<c$yFLa19DW&O_LZ>-f@-ON&7<>u42;Z-rx
zOBsCnf##_MlhF>f86!t@%dWF2(0*Js2?4r8vrmMpwW*A)XaS%x)=u66j34u}_Ca45
zpsmRLK-_uxsWO16c9Q}OC(zA-7?+LXrqz|c4p*@@RYb-o?QkC(4BYCWjMq&AQJJJb
zst*Bqj%K&IQvoq?5za+=uvlNZAvBE7nYs~}nN0x~3qjybrhkj7A!lOfr+>KlH5pg7
z3v}^I7AK@^#+bbYOL{~N0#1K{G9sapEmLD)%#BSY7~?_Wbr%g0fi@n(b12>yA{lN9
zVR0sm|FW|gr*QY<c?O<bUqt>BA;k9O9a$F%Om3|~iq^)x?_9>iJh>^mOCm6y8ktNa
zz!;q7o#p<wJnFC~G7sdTTH={9mK_0Y)x4+@3u9$tp5H_W*<~Rl%l;GP+vEN>5P}uR
zix&X=asIZ$#rad`ncr8F+PYy3tSt?=pO19Fd}Mq$mu`#^m?HpgjOTDwtuP^dQ00RA
z5Y6-Nx4k7UAIQy|+$ODSZ8Bf3B9j^+p$tTY-&fipumQFO*gYE~KL5ejY$-qa!LP=3
zDZ15gI%`*U9snZ&h})uXJrke&fA*ihRQ}m-{b~8(H{DVmynR#Im}q-+qL%T=aOMXR
z%J&mr{(AX$pL#rKl#}xlWazG;?7+GpBL9i9vbsa@hYyhvq&gm*B548xJ<D47%NP!h
zm8J8}2fPxaJ}WLRF6()Xl@4?}ooc`^V9#RQ;>x{>NZvT%SPzC6;9tEy*4$s)^-Kjf
zR`yTqJyf|^#(X$rV%oLg&UYR-T*rQU+zV^`_K1kN4B{%q1faWR$j7*jj-Ic4a3$b*
zG=#yqa#OA|OkAAv8wV1-FZUq;jIX^24#v0)@F04$kwtj}ZDS|m-zcN8fbQOm<GO&h
zaXl1}S`tvk5wqkcx<?Sq&rNI%5Fo%tcv)aboPG8}z$5unP5tfLJ-)S>*q^TdY=}Ey
zgApLa<Pw5FU?2n$Zl4YuhwT|4YzBf;^0Q57QlNU{^q-Ch)yGTTS(W#sP7>4dc)Fd8
zAfsB{0TU{(a{cNtzWP%G{-htIFeEzF$3C|rQoIvF1+Ps$$@o*Rkg||=Fs4)|<Is8M
zI^x$H;0eNJLj6mk5;+{T)WU2DylH4@2;iJPdn)fiTusC6<LJloUJYl=)(yntBkejH
zsQ_C8y73fOpA$lEIPQOZ{f)C7CY9eAiC)MS(W}al5E;Bpw1EU}Y2Kf5_*{9-eRtKd
z2H3!#1n+vlKNVh*dY^?grZE=I>qjFB982=F&&9PO(y|Z$+yJHCGV6)CKQH+R%n<-D
z`H){<g(i!Mg!6yqnP<u)k33TDyz|a!eO|jhvvd>H<h<N<`CUg;&=XoP)~s1m#s2P@
zji_*5GQk~N;8|Y3`PH|S^4kxE+niVS#}II70Ag80s|@XIKBN8jCRmU~;@7_TRQa|4
z_o0%F(6ic}#*zkY!mW?QUg0%EStMhm6)`8BGYkJMSp;v2uK(-dM1ki`@idr>2=wW=
z8d`ET2i#UH$)Xd$9E?7EeDrY6<LcrXmPK?y$_GNY+QnHM^_jkXL+W5vy+4M2eE~vU
zv#Z}^xL5=c?!R)w>7#bjX|Lt`9s7t~Cq`@kartW^o?VtjUio$l9F1Gq?OQk0U_yZ0
zI5GH2lPa)~1n?*TGTOxm5GT4nfU=$^L)yfTUmL3rdI*(0Ap*b?q64QYzcE$kyhuhI
z<CI0T+jn@pZWY{#?K{<hI)i`JNbY#<-5!F5UU^XnGvRNmEkU*&<`Rc+X#ma2yps&H
zy1MYe`|>QBj@>3UxPM`jaohCKTppJ#2w{+RP39R_RYeE_eLfP&21&zgNgwKim~k=1
zWcVU|Nq_4bnD08E`CE@Nj?_TMLinxjj~Xezc~p3QR2f_oYn8e0V4lUgNCFTeR3J+u
z8Q=m~?LP9(^1H9ZO!~o#!1FV{1V4y??%Q+lmenrSw%rs0aOA-L%$ull5*_PIK1-IZ
zOu(r`lMGQh8f*U=&?iQHdlEuAP0D*76+t8tA~dqwm^jLqF_8pg`@$t~ML6c&2S=(z
zZAmzP1PdUgvN{^VVI{&NQigl}{4Hh2wyo8^=1ArR1_5&*x+n>r`T9bowgwmz^Tcl-
z^UB9uv35wUUgr@eJbmWUjWGf*+#7LY^yg*S3KPpDGI?wi#Mn=`U=y22Pcji_OT7eu
z>uut;W54P6xc3PcylvaIOLp?RT)EEsnCQ4e?p$}=ys5nP&V=DTdb-#>?r^*U&Jkxu
zPm1fsc}+wzz%1?7NC)2Vz^3xbZGGi|d)AlRv$4>F`F+oW!{yHC_KBvyHX5!s%j!@>
zr#lm7^>jF+J0tRs4Sw0Z?_dR)9qR_lZTYN@D_~ZhvSBF3fad~g(Us!p?TyZ&ALq=l
z52%3)Zdo>hVF3={fHaPs*c$j-bSsz#fGf8e5CH7%NAM`ab^WqAPgO01yMJ>3NCoGG
z5r+e5S=Bf%_m8Dq<E<^E4n*+EbDhsWV5`yna$R4D-!})Owcqmxm<SL-j>IOQqfUX9
zYvi6ra-mHO&Z=%W&j#0tI~O9DmxGo|z72);(1mm7bT48lU$ss0RN^FKVqEgSCijS}
z=lLUQSfK+~Muxy&HX)3?7w$uF)rBS(7*FbFE2}H9{q0(({_be%yc0=As=bfeFm<}S
zHZ_iZoDb0lAOW}`vNK>EleCSR0CBQ9Ag-_SG*1lUaX!ed&sAMei874|pykWBdl!sT
zSAUn6B*yqHh|fP4w{2TDSE7^w=vatg9taa5>>k^5D1=bwJtFP|mp@(sRV*G5y(uvS
z*nK<`0<4a4`f&4<`=31*1|w<mjCl)i6ZB1T#ZzO~&XMvL9(+~VylF!WcSp+SBgMn{
zTO%%~^REN<0{qcd$+NNEU{b6pm<q&!WSx+x74>&6-54V<M*!Ry&*73*v_;OzMm3v2
z40(w4h~}GKzERhg6fpI}#B{w;<-O&)=C{fE#1l`HFMs*V<(02|W!biETU{{P_1Vg8
z=iIMsbr6->q3#DBye+dli&fm9E^u9n?iX<Jd1ge#0$)0rMe|??gnf||jOH^L6x}(V
z5W4dhmR(r@agS@VI39?oo8EX+h=>JovkFB~jp4X*(W?r|!`+G=blu>pNL)|GV!B&k
z05Hy#VVZv81o=x_utx&s4DN`@M<dQBLT3PHp~u}bWW&V`$GhNUxHf@)s2L<y?EA(N
zcRj$jBMbDR^z+I6qm@%9u00ySAvQi7Q9O_b)GWr8k6@4n!rkK}8Jsb?tA=+u&QR4h
zTj&MI8BG}oC;iZF96cZx4US&l9smsa06icBlJep*C&EDjXX1aqML5(J07jROTNk}g
z8-X=fN6)cx|5FHUX{YPvMS(cc2Js1wg`-p-y9V|W>!Js@k3zMe*RG5amq-0_9_W(*
zxc_8~Lp_TQHjD-?QO~cOyp#G?bLP0E`kpH<^(pss2!IPyJMGD(zqn=rd=Ma&084$w
z*cgg}NM+%98jmG;cO`^^x=3n#BP0-i##nRJt9<GLF);)enuzh=vXy;hOMrOg$^-<8
zRsXHSgCQ&ubus0yOxcU0e*uESd1l1;z|wZW%GnjEz?zKN{FF7Eu+(SMwtb@$kyd0^
z!PI#m@9pa7`#q;_%%TFw^C)}9%6EL*{pG~*V~L^hr{%6l2dM{V+U}&eFM2XA5@G<;
zf(RHI8Y-Xo#3#z{{Lb%`SHJqz^?L7mR8P$PdC5m$jsSSchy2PFwn<SJmrNv7w8&lj
z`)y^(N8W_8fLsauTP-b5o0QH2ZIjbA9t2bhj0gqyX$wYcM=NLMx9jd#R&RR4y*q+#
zX>%4!I{NS^RquU13#`59iMjy{u{IHQe}Kat@q+3AA{^O3I4wKY(JxzgiCH%eX3>jV
z-&gk@uC7~L`|jE}lto=k{CUxt_QygyTyqxU3W6!yrae9k=9FU$R)m;f7XZ+y5^%5#
z>FDVvB8H{kw!i{d;$Y>af+~`==mHbmH+{M?aA91uAOg<Om{17-_qDlRxyxLu#()8j
z3VbO;#5ZO60BHXBz>%dtP-QDXLV**tue{K_!!Zy8r0(g$@^<2Uzw<u{hum>uT=|T%
zdTcQZ#BJGYyI#BW0lu|4Z%hy+M9%#IBt#CPLOTJZO_6Z%Ct?8S`Pmo<;H(ZIrB3bP
zi;fFrDA$SHYfCEFvU@-bAw23km2$OlX6k)E+qEuV2PV_!+QE!(dF4zs0oZv}d=BW1
zr{^fhlXq@3HV+6xDi53WxNp_eI|)$r2eH7A6am2C(|O}C7|E3Ai0Y9~pDduH3M5E^
z5CQRt5R9v$=N_b^&vlywot?KRy6^t#QF1h6d3t_8aWHW#QrMpHW2p-f5o6M{jVtAn
zcn(aQi25qePkYzKi)3%qI)iD4`bll918f`Y(s<9??}<0d>hgupf1%uc_g&NA-{yaA
zdC8Nt0pmjC|H)5&vTo1&!4H10dLlH&;U#a>T!oj6z#IW^$#C3QKY;(fefz3te*5<A
znb>#N31w0}7eKe3$Y*i&actt@h7k~-`OIf3;FBmIE{F=RWx?nr04$_9=4~6-mLL7L
zZRN53$I1W#Al%c1Tx3B%pV%E26tEMWj}aQ4l^Fa~xWd)pK#4zrhJ{o4JJv=|ee_7#
z6)S(D@GY?es3BCd3{FNGA}XK-_(CKDT&3)>PWNbx5F6bFF>y%@m(R6tEL-!EL~sQh
z#8ssp06G|*JAkKyWe<QT0pcUcM|{pgoJiL=F3!SBO0YLl0MYgk4)X9!pe6v2@)7gn
zSm~J=z~O`el5)k+SEg&~s)2HH4X6yIk1Xoz_OuxrQi>WFCM&;v4%UPKSzW<6dTQX4
zaz-Lzu5N+>FZZ^c_XMT!+(qZBL_?1CS-TKmpc+u(jJ2660TE<OB!7wUwV#~_`vL^Y
zMB0xqP!^y>{8ULoey8YqP5GkVO{9MakFH2}^rW;$Qg?6iOS`)oE?%j*F3no=b?TOm
zJ6Gb_GYG+s(81wL^3=dZIZktGAFKUJR3@G8zmU=LcEHXRa9v+4*3B93GJ+uYiFj%J
zMFjLN8ABg-1B=7qS4lxggGEatmC3g)e|ca4ZSf+;vw7C}*le)L*d0(uh=@qYcyv?j
zPgW*K(Ef1!=BhqyOtirIY?6O4*AG>Dm1Q-WpcoN56M697?HkIzy}QfJx7>0=fPV(0
zwqGPt%*$Ke@|N<kkA1AR&D^|}bw*ip>BSj=IRfCt8Q1GB)kHTrQ1_zX(fPmTHLoe#
zwsoR;1zQ-eHnoXnF;O<6Vq#*VeCku5Dgz0&bN~JK*YbeowUn!VWh=KO3#?6FzUK|M
zlkwNhb%?A_#nKsw*nX9$J~n6J#^%RmuhV1Wc({_`K8JIR`?_ghdBA$Sj7B$3jA)VE
z8&1hqsjt|$CR~!8(V|+2UNqUOCH9#tu-x1b0>EFmRo2lAz%1UY!r61N!pX9>Cthud
zFjvqUKL6AdxVeGk2c|$-+i-UqQy-9cCPrDFf*sj><BW%rpO{<1Js$|M0cg~+HmU*K
zq>!`3;tc$%_&gW=Hjc1Ie$oQ>;KG3{X~4;}XMczb#L|Mi8vqL!X`5IA9hcqR2hcmq
z^bVSiL=~Vf;{4<OB}9|>30kLJ6B!?FaW*CDjO7^7*_?>`W4Pwrr(HOG&&xIKuf#+0
zAe?wB*os%m<xbs^eg&f{r&}Lby*l$xcTewG?;?HtOu_)6Q3t#?v*W2EJV$*y8WqKb
zNP3odmO01&pO|0LC(<9cc&3Qb$$YDe)MYZ9_fS+$Gyy>QP{yBs{=Rq)7z@Xw9zeS<
z5-dqsUZ>;6D&r!!(}LKc0Br7fHk_g&Q}61yMN&tIU4hWrv8Jy+ADacuiyZ-Cm3qdG
zfQv&cYc==39qR_u#t=98LxM-u#T~Is##UhSy7KB*y{d-yZBjh#`bH)1Gw`RjeB&G6
zSnZa$Ap(DGpG!B!2+R=xH^y_AZiR)<B;ks8=+L36?-vn|vEQ`<bR#Bw`{4Zd?%iAC
z><bEV#~pW6ZrNnL*0SYq1VFpjBB-ppZ`o8{d-qWJa*P322c1uz&q9}LM5*%v5*E>;
zF^J;_z2H2*`sh1j6nHdT?5f4{V;z0CMp_GolxG;`g<IMb;I(khPqZ{K{vY0buo~Wl
z_mBA89tc^a;}vi$<&s1o1OPex`Jq@u3!KvF)lXDnfYBa6viXl)@<;@P3O-I!k@IP6
zRRiGCfiMsyGDpY|$yeSq`D_ggl<g$Q=n#nFh^YyPodL8}TbcS(MeE@d0gwBH`URfy
zGVr5k7mB!A=cjDoOhup_KprQKv({&XNd1@c^+i=g`~2Sp1X4Z#WNapk?DCACHgMDH
zkCcVPg)N11KHd>+@ht%~gi4xF0gk!^u~H2MbKQ9#Ts7j0ZGd~avLXdY{rX=eA_{cj
zno`$}-dpr{_T+q5fgaf<;bg#jLEgXqj4h+ZP9@N73<Y?p3XH2~QtcDc5B=$ns6W(0
z*_~ih%5<N;@LoWi8fSr*5N08Pt5=5F4O<sJn3@K0k>?!S7SEMS>fWdVj-AaO46zMZ
z6T*O)e{G2HWpP<V9PbykG4(S--I(A|OG3ON`WO(JYiVhX0=PaB0UH5r5bFUEw`@WA
zi{Ja_%J8~%Rif93{@KUr#XoEPBKGHn@b<UAy@J0;qL_?1@PF~g|5|F8BLJ>tv|ggT
z4?p~Ht;wc4K>J!n^D~<XpiMm61CuD=`dc_qi+uy|nfG1pcLRt<08IZ*J@ny&(N+K8
z>u;`{2|rUtR24FF-GbWMb2uE8g?CRPqY>R7&O$etMUaYMcK~cnMB$qvy51G{HwJs_
z!bL9+TedV~vLl97C!;fE@qOw<Hq{Bh(e2~@z*RNM0>){N=zy%H*&zVbbwh(y{EFkp
z@!}3QC&VfdHW4`EJ;8wJ<@tYeC+mwh0ZGK6w7C*8>59erK&-im;ECK1MOwj@02tRG
zGs$mkj1wz=U_)n%bG8u>$pvt+z%%SK7QlX8YzK&@0gACAiZ*fppYEDeg&JW~#seNw
zEg%?xzdQ^Mm0R`M0UySF0Fcx}UyK0|r?;-amwWnhjV}2_${8Z<NE@~9Xb28q$wf|(
z9rfBAX*?<bWl<%p%sF~?-Vlq@-u$n?zba!=HvvjMVU&|~mNe_hseF2UX+mK3Cgan+
zzsm8Mx+egBCi?<tBMzS-DE<1fJZsO(IJ<@@%wc^lR53$w4XnYM(jTf2#1;<%1e@{o
z9{N@d!l8zG4spoSMLVdwjID<!SQG=%RRP0!3u8YJLq9=-s1Tlxx5!uak5zk?V+oJ@
z^~Aqmiy(G_08(pWAK+O`<Rj{&&9JxyYBxem5G(C}>~nj|Km4oTQ|@?0B6UW(B^jeO
zC+mrq@DGs3`G4grUn$@9UEfvJmpgXss7#IdGna0x5tt(YZmj1~TXXZxH<$bFyRRN!
zufEJ;WFeqR06?OFarJxf!3V48zqtl=i|4fug8L8^+Sig}+%cee)2k8_U_~OpWr0oP
z-YkqU9LpjJ1Ztfj3UJfBs2PtPizk36XuD(eZzB_ifR85d&*E@F_eSKuWZt>*$TR!O
zuEPSYWI>LI7m#Ibu3(A#t_w{1s?QaLQkQ)Q7Gz;P6OL`)=y>H)L4-wAJbjV>wC7ZQ
zA5PwlF($<A0X>8UPRjz!*LqiWcB2Qk=mQakaK!INr?_AK`9Kxm(CM>#kZ{d7ZlH=I
zws!#40?rxNtbPEhUJEy=1%S!V3b2)^NI8B3cpeJ7Bh~O+*JjKBzWaPCXPYvO5ru&I
zh@DySb9Y=F@x6qI5oE>~A{2nya0o7*1D*%tMRpCjR1Fd3BP}3Z;lbb@;|PvPNO&0l
z#3ltmJg8fp)%HN2W=k-8wxnVeTvbxYROWp7MMW_YU<H`^e~3svO{g{#0cI*8U|S6!
zbL{TENJywvMnc?F+l^cY{0M-~gQX)FJcs(6@?9l5d5)q>u86Uy`a&aA<4ezg$_LTJ
zu7z}jip)2|)Zq|X8xt9c6l+~1Mq7s0<QeC=&5Jieo&)ZGYrIevMJ-@wK<^E=RrwK^
z;_B=7rl<yv$50sfHupR4MtdiGzZA**2mY9hd+)us8vc<qIzPwxzZ6g4+UvN?ti1Mu
z=I(gmM!=4Ez}u&S<#iNo(gQ+^LfiXW>;N_Z4g9a8*h_9{1c18KW1`c&1KV4-t}j25
z4T7GWu-AM<=n05&EDI|uYTQiK!v<h**f#6|qPX<+F>>1%PINF_@J0lK*zTvJ{>Y-&
z9}E5s0lei|XmLPQjGu+hVn^@lWBF&yhkK{*$C(?$Gm$WmEUb%;pZbAVb|`vTfIUni
z5D9|&k4CCc7t8?2o{_QaxE8O0EV=s*jaJYk+6U%ma~}|;lja8Jcx9vnIjo?dUgKfy
z1?<M9EyL<@9$347Abphp3vf7;g+GTi4=IKARBwW0P!GZa=<bdYq9}0oiFX|s0dhfM
zfIAKm5aQfB-E|E2a-9?bAOS-_&dCj7aW=@s>p-$qaBi&zh)u6gT7bLlT;4p9E~ytW
z1?+)keIHV8f1ai1EZ%-47?N-HG~;|Lct+++Pj~mIt3I#kyq&5AFp>_1ys5^2^yb9#
zhymkH!eR`y9muotr(R&BCx!*iAAq0Cn0Q9oFSTdZ>>o%F06cO3i4dhLmkQfUV$j7`
zprc3Jde03Q+kityPsQdUb|HiP$(tyYi<Tx{MQ4Qic!*5zB>RQ+0sK{~`pV9Ihw9FQ
zhy&x#l~tR@6I0;!H*6`lZrdE`-s3fhmk{0zhZ*^r|0IYFeA`%EI$qNQ_q-0g>%{)H
z@V0jdw_<_H=D`1X4eN`3^Be*2q7U)4S76dyZ)%g=#0UJJ{p@E08i}JHO58V~_WeSn
zndSHmY(<9y>VM(eVhngRe$`oMaYlQN+kSUyE7fqKI7$E~PW(`Sb>ER}!yAz2k9{tS
z@|q9;tot85Fj}6B0iZJMn6^8MWVZ>5fG@26#iAEr#kh59h{Yjvh`kvm+PTewZ&7z1
zcPAf%#9`w0{5+WjQ!D_BG$SvT_Kf;gMjD|lVzmPcmfQ#nQUzSQkij^AqG5rCfTr?R
z1z;`i)C<J()B^w+K#qsIV|C9LANU9X49<Ws14Zs?I8&fQdcb?ZZ`Z{V7|0mW;{$Mg
zsFv(GMg$-LYD|FS*O&aSE|%cB(?xeu05$?Oz)|{Cr3L{#_66$H$5jzkBL=8Ls#ien
zca5zAv3`9PDce};zc!EOe(H)&lF?lQRDi2eGs%BRngZ3{bKal)zf97b=T6-|4yiKl
zgwW9Xhb-XUXGDyV;7Gm^2A*qQ2vZUu&x7iMM+XT3DMoDlI;l{ONkWw8p4yrmcSJ#?
zI00nd8p~tHVt)&2ons*=ar?HC762;;QEl~w^G_tpaP1H6K3u*Y+kkm_b_XM&(toie
z7!F>DBw#Wje~saW#1q(iXe`p1Z0?h1j*!`N;Ar{YH@>>O@xeEgyY9ZH&T$0(?%lhq
z4bhjr^riCj(@$4Pfw_qwASG+_wyhP<J)&36(ZK(SC!VMV%j}m(0Or8|%Hwt;Tzz4X
z;6~`l%cu<&f$K>ow0nWSiO&cJ70$n3ta0gj%bwXDby-YA_F}B4TnkKFWN;v_e8rY>
z|80Zi&jd%wn}%DRNC?;?+7cjN8Fzp+akB!_7NyPo%X9u*f}X?&Ixc=+e)>R_7TgjJ
z{pMIoug)Iqz-dFm)zX`uCv`|Ybo)S>Sd<OI@(>>ggQMX{ZT|~^iC+8T>c=f@cl7pr
z<LSx81sDx*)3q{I<7yaWtli~~CFXw>p{H*adi@qZ-{L<M;sNl9dM9Q9J$EjnEvh@<
zk<>-+&)tx5U)=^b7Xc=)#_bc$vnU6oP4xhzyaAPaPrHuctOHh(@(?n9Y7YX6M1gdq
z0&nt=HXuZl1GKeY{XnaMwDA!77oZ6XOr6n}bIR;=*rghxb!}1>dxhl9J;%f80!ew*
zgJY$ZP#3Nmall|sIuas+F=hqJdZzt(_L+92Hog6r`hHgK=}gbMQtxn<ksp^iX=U0Y
z$d5kQZW#eZQp6^sa`3sP{Z-PEJb)j;VB8S}R2!rj-Vu@n#2muQdutrL|HhYV>(RU)
zOJl1-(v_^?`V){T?FH;&EeMZIl>+2#UUhTC{DKdePq!wNFfsqmsJ{@jAvemVwVi~j
zs&zt$rrH|dsT@92-u&SGmDtcnA-t&%1=<0?evgih)>t1vnGKNNfLUF=(t+!>8^BQD
z{GWXC$r?!PqaXcfc~!g-=D^<*x$)DSRmKhYjIIZy0h7hZhYoK4{{1y*kcfeZ0oPxs
zqz-BXW!rvOXky849t*w~WO3UVo#x+s+dbuP{PZ7{!8_KJ-2wO0;cAwLr6Q8%1~qmh
zYJmXzp|~cB%|0G5p3JJbK7;`<c|1F$?dV@o^NvO|ZqYg!0HU8qMBw~!<lu%8phY;V
zadxj8&!$At=cldo_4Bik)ALgstPCJp)T>W*hylii<53IX;txc(zKEd~KmP+GmFpgz
z@+`3E+lb|CK*i#3;jby*h>!UJj1%F4hZB*I3If=u2R9hSem0v4Arj#1kDl~sT+O;2
z4Le&3e)T{T$fV>7{`vvT5ip1qn1=vhtj48}7}~pnfY6?y5E=(!q=>uMv`Uoa9x4F@
z2CzGt`w>a2sU2cmS-JY0C-sn@pFe2>70dkjQNiTCinS@v*djPaQpZ4mY~iWA2gyrH
zq1^}?1e!jM#5%q1@^~@`*v2Vsp~657XiqJ-4&cQ<qWR@=Pm;@-OV9RXm~ohVQ@5nK
zF{De3ea|QTI*~phRDk$I2pRoF^vEvkvRM53u_^>)O}UT6#`~j>3?KRaA5R-eLTsFb
z07NJdawv$T;Y<c%B9fC8JSrB(kTe9q>8POQC6Xn>#Kn34x%qki>?~I2U9k_xk!&)w
zB6W?7jaLGM>nLs?0eCj=ye)&xJ1V2m7|Xu?yKXD@-1&-1Oqk!m*tdR(>p>rH`(JK>
z<6LS3{)hkxXmbNLSLHRYfVNLtE0lAkB(HL;FAV*-7E?1KCYsveO83uQ`^JyJf`5|q
zjc>+W3wlSu1iTjgzx*hA6B4C=B4S4o15Iq4IO9JP)h=A}JlDkTZOd%O%22LF0|Bru
z?nmqvEHoqnTn)#v*jPTqFaP<k?kX4J=7jUgPULar!%0OuOG=%#<6v=X44qi|Tja(A
zko4_a2UkXgaIQSO?{M4zv!h;gwSc?=LMi}CO$X0P8t@*8RlNmT`z*wObtqBN4o3`c
zT)Ek82ysAXJ;qQkZD2{iHJnymV2S=CIXD<Uehd6ithep@27++JC(=HET)E|dfR5q{
zWmuK=aq*(31)hthMC**s<mYP0U%rk_KvYm6V5l87(_vpgbSyc)DXw0m7Jyc&zXMjp
z`3RB<x~Y3K0N$UniT??wx&9$QaJPf$54!=+y@nDF$N+MzB<@{bL}EM|!lEV5MF66a
zJQIYN_EH(F3&)CBP`2`L)xta{dv!nb9EEW<PA8)WPkg83Yo$(o%lxb~GgFrV!QlSN
zW%t}u-{Ek>m!@RwDO(;Ky>}BrGsvr63Mmt(J}-IKg}B@kTMFs`<6y@?Dk!Z%5Hg6g
zOW^7dg(CQQ9}q}XJ7;p;x98+NRG(w6g1*I)h&G@SLLMLDLSK%gJSwcU8MozO!43!T
z59RsT^J3BbPBO7QyJUJ7ZFV&jw?r{35Z;Vk#UA+Tm-d#w^FMrVBmfW8x#*fy$0gt1
zbMv%;vAItYXucv05DQd<d-m+95<F{$GN=$p@f>>xd|GN{`>kx_YOddU>#YUn&%juH
ztu5`ox$hfy1m+|FH}12!s+O5>CT-ivPt0v%e-jr-qr^=}`Cf=*;`=lT-y)@a;zo6=
zlez)7@zm4LRHER&-?F^?>O-UDj=^wK$$R|lf)D_L<p82=!x}DUZS?&_o_n$%fM{rR
z+ZUqSKX&$H1?tt&#jlEve$T;?vNjyM1?+6Ns%ppuB=W(vA_6SNJPGs}coM;j0BBKX
zt!~j?9<UUJZCz9W$^eGK=5p1-A@4afQp;&vKA>eps0;>nwh?BP{YbV4UK`L}AL4;=
z9P#*}NHI_aBOx>x(+O6zH<AVhW0q-z$G&js{b>tZ0ZZ)G0R7Q`0F}VxlodKl`mqo&
zfLEWi1tB57cK8jDZ50gI4@O<FBas+8Q8tc0nUa|lq}`u0WjF*u-B2drrZ3(Vgp&X`
z0uD7G<Bqm+iI@a=k1|FzMgcu}`cXOk5M)bF+ZRAM<#5A!*Tnf>z66IH8-fr3X=OfI
z+a+gG;nc6^{%ZLzmT>VN1QXkgi>Z|>cgG0_h2t?2X~gnXWqzJ3f?yzo&1m{cRYBq)
z@KM$MQ~xA&p@8jJnDEHyclBrp@qm!5Mt`}lssZwi2<V<g2f7Fe@3peM&xk6-F0ucT
zh1n82<HSzl+}VVWzHqjV<C++zu1xt1WKTrev|v%<2jn^LJUCj`<()@J8s7~e7`!`E
zP+uN=?R}Yhoh`!5@#zI?^PmL>GFNf_Bz5L7;1?Lo_Tq?l_mK*i&xi&@v~$Yv@vgWJ
zL&4baO>cTr`S6E7T)&;41ONI8ebfAzBLHr|M`gnGPT<*^*eG+NY7>{$ElPabwrv%J
zU2o;Q5Cnkonut+(t$n`R;-H<bSvV{rTefV;LKhwVqE+QrzxLs9PBFmB!hAA7k^cea
zR)@CJ002M$Nkl<Z?2jHm5i@^ZxbsZ`vzX{-agAkujHHfaVR9a4wke$2_}D}R-Sb&I
zwS_<W*?0*o4ro#x5YG$xvsSdeEJhYfp;A|dQ`sLtX8}JFki02i%{`E7mxUJoGwNdx
zFcHuO3Vi{;XO3ly;DDSBc`WKn7R5j@dCrA!z&U4KE?Xi=07UE<4kr-NcozKa0ds!m
z#)jUOTO^LS^4F<Le~hOx`tpsV#;Ny3;t^d%1=}OxHUSloCxQm>fc$Wb%vkFKWLzOm
z6YQn<3jE_CEO2l*ap%FKeF0KTB?5pqb;z-(V2J32v}LOxCc|k9xPp@0dobZ>8J!VT
zGe8u*jsyd@x;Vj|w1-DVjpUeq8%O!*))%Jiqv;<>!Br=XoFy6OnTgbAs;Tu-eIIjw
zmo<N<F8V?MELs{uFF&g7MegMZQN{ZqU{^*;Lprc3ch?AqIj-({l@ROf04h<CcgemH
zYxCYx#WX|6D(T5JL_wVoslIaTsn2+_saTlzG*7+O*%2)vqB8Ddc@7h2kC(0SZeirN
zKhJkl)L*9(*vbY#D>J_zO&IJuv+o6>Z7|&br=L7ne)uoHuH3qPYu*KR0#{DjRL3XT
zdgrQh%R(sQ<ZT>;n8!$v4D8yqYZ~aAT2DTNKWQV*pR4k}{kQ+Ny!p*<uI2Ww3s-K@
z+|?Up1m*~U8|6t{LpvI{Tj*Kd`q;^jC?2IRwF&R|`4RvP{FT)>f5$EL*43NWV$ACP
zwXeLR{H+HbDgW0ecb2=?E(`D_3~j{N0}BIkahn3fYhnj*CX3|SEPgnW&+R%CH^8|4
zCF&Wm|Ct2)S(LH>B9LZ)M?bnci}As@>j5fJ1_3d>Iq|*S??lW4NQbhp-n?N=7BzZv
zk=>FnK#0qI=J05^;^mb9cq}eg8zL3ZCt$%{4wnUZ0rm;81VV5O53a5RfZYhIE;rHx
zQUV+x!emkElmN6KKoWy89O1e1u>}YK>5F@S`A9%jS`#9MQ~;L<SZsQ9FvN<T=K9hX
z0Q}U*k+M9yCZ?Me+-oqKv{$O$5xH+LoFg#VzGig*VqSUj#AMZRSI>cf4r4x&0^8WS
zm8Su5Gh;Vm{mmhw5It-xYNW*UQ~xi_vtiOjI|1VUk=PidYn8~rEh~F*MA-<PP*c-j
zTpO4=uQq8H`eCM3KzBbIjEg0C?`i|4Pa%M!8cQMU0G6%nR2`lvNfDoay9S<$d#Z8W
znW%s3a<Auta3Lwue(rtZIvmOdKik%<O24Q9*j<ps#Jx2{g7@rv@+}V`f#6!SfctH{
zHX?l>z41OEghhSiW@=2tvG9(qi)v~s;vu{Eoyhs&Jnx|p12zV-Uw{aeH?JG01nQCT
zQ`L~LJTy|?`QC4jD8G~B^v=KO?qB9$W?o#ok7RBGdy+ioY(mx~1Gs&vMB{wREw_~S
zz3+YHcYpVHtJvSY%AHxZ^K<FO9)V7pbYr)1u5~>lU;>(`Kz-&;el1*Z??iO`d<Xy>
z1F^oOoU2J~F~I$FqUMF=FMa#nVa4LAmsPt)VNYbV{l~HcUqqgh$BxF`FXH<wIFDq{
z^hJrGz9WWoVgnH2@7%M$eCf&k0l}=D$;W^CPyk+NTNeCW4#iBL2nWva&(3y$7l-Tn
zM7SyLbrmQxcEbM(?VI8%Xsk%S<>7Al`U7ylLi8?-F0}w|UZN-Oiv$4iVUGce<l%_l
zacW!!aq7528wfGnGg|f#phtEM(lYF-BIOVwRRFa>O23~RuyIXYjMISr(*bxy3_?KA
zA<3BPOKz)NC17zsUwb-fV+{j6Fa28{(Jf=gHWoN>fUkW*8Yg=lU-bcT#&Nyey7x?(
z2pk4*)mSh<2>3E+)&79pgg6Pj47d!+uiWz?u;~Bm_}12UfBbg+Sr>ZpTyhB0J&fy3
z>4R}Q6Yxbq0dnsj?mdEws{Zg!;K)~GT-aXJU`{!w{{USOCX$Xd(Zer}C?9}}(?Csg
zGVinTB(WL^ktS*))fMB^(6br0MU{|1bdc;Ak%bxSa|xM<^CtDFghJ*Q1H-=P<_FkV
zgus3LP-mCGretwk2-&i186GUV@@_ovu2+`V+;>-f-rD(mQk$#3l|{0KiSQn=71*|I
zTea%v+kekJ_tZ6qhru8G!5@^JJ9k!_BlEW1|9sjqckfF-0#~=9zx3^zt8tbQFe!2U
z0NGA{M0RZ_Ka+MQK)1yK!O*DuS&F^vDfd_qlsmKR%ic8Of~a-^1HJmrZRM}Lc2jxe
zNc7x6KstLd(Zxy^W_Khy*@ao?hQhf%5x;2B&~DCxw>riGbk_GKun^AsRDk7Z7Fh;r
z7G5rZME6AB4C*YfT=$5j1s1~n<NEo1GX$hkU}%TCvjF3Aan-;J027hpx>>R_v|GhE
zZVH4~q@9fW_NT7>313`AyUADU2#5eiKrT8Uk+G$j^guiSI0pMfN|XfGVTi{-uSSc@
zn9#=y5iDHo)1#9Wba^dM1t1bU1EGo;|5^8aNHdhHHekfS(N@d?+5md)Y=D{yL4yDw
zqN>B;)EPC&Qfh#yK9Wk<p^(cW*GTnx?{Lyo8NePfV7x>!6e*D;1rMoAf9Hn-Ce`Rt
z`_|T+NgG<u?s|8AUI4nw(9X)%**%b8T#|HrI!#@><lOYd6p`zBHm4#5paSCe?>RHh
z)DLP8{X_Uw4~*nh-?Sb-<Fz^)Pn}OZ3X&YplX#y|pfNJ$hzJB`XR8q@Pr!UU1c5r|
zMS?OA;sLQ%gEr-zWVeCX;u5%GNk=e_<=NYu>2&Vd8=`W1qz$AH3<kxysGBXNTYGnn
zmmmCd50(wXYwJ2t`**Rs&-+MuZLXX9))*he00|?b!4G}tL)Dvrn(vo>>6hwme=2V4
z&GT-=TmdiH2+R=xFWFFEeMPfPYLknxU++$Sb_tZXf%I%OxPCnZfHDyP7K7Q^Gh1D=
zodXmC0|C{)`qo#5wN6ZctjV~d^VuVPc{V~BKXJB#=D`pFE8@9u$J+if@7(F~%>IJ`
z>~LG*w1{mV*>gAxXT<CQd4S4w&mQRe;+AKzTo$gJiy%>T^Y7OtT=4dUOPvUC<NkyJ
z#-S634@H^)$j1_J#eKlR;*A>wnmBQU!Yvzy!u6djU*CVIf`ypwBm=-0s0jT_#|+^R
zHx7XccSoSkfXrBorfft8mpt4h34?$@{L<G37)S@|GfNqa?oOtiydehCH-w82!bHir
zTmce6pdT#OU1LuH*cj&Nx2=UM>Lx#tc;(h}9wDH8<H^TfU@&Ei*r!-TfVc*(lP)m$
zW3i4vq3-AikVtZJpEBiVU0(qpoNfkSRxluK=!Nc7Rg-Z^5zkJuoV@yXu`AGD#2zUS
z{k}a7Drko&)xQi|eZEBhwqMr9RS^&9Ysimkflr0NNYIBghl<4bS9^+-NA2W!BMcBj
zerTubyhkSUIddjn9+3=idA*QtbsUK5YF`3eF<9L;G*E6{A5nh@q0KoKzv7YT{;6b0
zxaj&xY2qqS-t@XxN3DRUSFb1b;v{7wj@TJ-BYgL}-(BAJwzrjcyyG3!kno-Fd}n$8
z``=$}0JNYH4lhn?=1P4rMqrKrcrixx1uWB|+Q6UTjqNoV`?2%~(kAG0Edq@Is0;K}
zw5^rvzUM0YDvGPX$Jp+5_uW$7{^||os|gRhBJ0+%ENV7M8BX9Imil)N$K@>|RvQC}
z!bW!ru=-*rFcE-WpY4Njp>{x{Q{9vRK{#B1jx#@!^R{nYA3eIoi0&LGk2hZ$DS^#W
zaJMTXRy`W<7H;@(tn<0hIR?l$wZ2FXYzzcENEx^U5{0vD2Ub2SvM5&ZZ-}H8+uwpu
z%zG$Ai{yT8jV2i&#U&2M!vV(+K<N1=(_Y+i4Th9*iJL1p=bP)EdiTdojl_ewmv$jc
zjt9Bcgu@41Bo)9IaRaPz>o`@u_?^LE=dy@UA>t;|@6-kX$K*wn2tETGwORj!_O1Hs
zlq2FHTM9b^BHS9}RD8}2eC!b+RWcCnzc;0*F3kewEuVLIwlwp7^)4FAndfJ_w{~PI
zFHk-nLqNfyg#4}6`FZBX&X^dtg`vFk-ScB0I1-|h2;VM#KpFvuNRx~ibqsJODX0>V
zIuC0Bl^`;P>^nSTgu(HI1NNRiv45-@zCM{fEyRPMsyP_@0_8lh=U{n!tg|&4iyh3m
zNQVYP6z<7JL8M<_j$O>(edmMawjEn5apxJon8|a&bl7v?J@0u>`D=geuT>&IovyX#
z#wc(ueRGb$9070xKQ{{<AS3pF^wCF)T>^3aEx3oKUkyMjqp1L{lwW<+3nn)3f1#2(
ziOFpu^Ecn||Fd@<aC((>zJJm)y=6-J3<<q=kdBLj6csLTU7CP`poofBcR^Xi&s9W<
zu8S+yyNY66ToDmb%1UPiDS-g#ZTd{6rYHCJ{h#+gbLPz?net9aa-QU!_dVx4{hags
z{{1;Tgc|LhCJtzeIN2se2HYf$vAwlb4glh26%p+nSwpXA5yz(OQfG*BhbzXCAD#Z&
zHI3rjq(c>mp&wr%z5T4xA}vycaTN!z1sB6Zg0HYZvrreywU2PXgy4n4XZJnS3I!>b
zKfn_S0~|aYARlzv(`D#aEElbnvW9OJP~%k)arwmSZ!^&^+%Q}`3%H6ImyY@Ym4np>
z@Tp#2G}Hos4`l!oq$1cK5N{sP!O#%SUD>Ulk%|HxbtGXxokHAyfEI2R)dWNbfCPBK
z6(Vsksf6w!0H?-nH$?6&gaeTP5j9&(28a>?3;7v8Vr_^M`UKaERDo-qx&{FUfVnuH
zKHx78F@{PZPhcs&Sba}lQX-I|f4!&Pr2^?90or8@5x1QNe7#oQeG*XD3t-*=><o!?
z2XQ~o0yXh$LIAi20DwL#SD|>Rg1OZXh#Uwtf-b@2n}kSZs0X4k{>_{jsi1P~4#@9G
zB53)ILF+gfa1s@8VN<)=0_;^*7&a?F5H==z$%BLqo{(})*7I+h42b}x&LBI{bKZNB
z5xYD`+k2OLpX<J#Wevau5WAvlue~<F?SJS)AF>LV57rbuN!qz*Ai0C1^#1XZIS3Qd
z7YkhU{rK>@Pj6ql&Wp+mO`Y3HTz6wXZ+BaH2XbC(pIr7fg<}udOYzHRhL=_=7>_~$
zYZVFAGK>yd6b+mdIc|@#61kjB3OO>mT3%fnUR~8_hHY&!gqm5DABtq5-k^O0;ONkE
zv#=807Y-XeD+@Z{2tefv=&NP4#i9;~5XT<N|FK$NYs6XMUtOeQ>=XcP^zs;C&6f2%
zhFipEN5_ju90CK6g#v*HZi2=c8^}fmWpKgd0qEfX0gPJdjS(FqE!eDcjUqlEIQSqQ
zhWiAB0DIyan86s+p_AAjM3w}RI$@&9v*@D=0Iay3^q;x`Zd3(`s8JCB&;S$lKm--3
zF7)|u@jM3+wh5f{8zZ<Ve68FSRRECXoSKRNQswC{QUIh4^ucxM02=BPHoY0IUF&*%
z!qi4YqQssYD(@|A>pXdTuRQ8xsxJE0vu1Bu#Dm1}1nD8z&nAtJd5AcBNn})>`-lC+
zcyas8DaL*fL!G*xG>!BKVl7X089Xa=2m*}x;u(P>Cnr)IjtSbsy*Il7%^~+;qikNd
z4;}J8*r%dcFNIWHwT+U<Xc&42h#pA9%VqLPps&1(xN82bu|0hBg5$!$`|mA6Mh2!^
z5n8DU$iSC_KkLB_H{1|TJ@wRZ>7|!i&0HHw!XpFSPpat03?u|VKPGE9vm$zf`hND=
zXYGYM_Sj>MBF7-7zm)fm2mm+s<Hhc8-F=-C>IRsL1wdE5Pf~zdapPJr35z;G#(xd#
z$C!8=eR!vQ;4zM45k{X}D8PkdVxu6U8`cuvU5j_QVyLrd!kHt!o}`6&mBi(!5D=9Y
zD3(9g(<ZXjf{gx}#Tk*Mg?`myYuoUuA4nHWf}m^D%jI*ezA`Uxz9cw6hyy?a&?NB}
zz{H4=WB{N=9`QK@0AXG+_yQylx#K;6OC4YhcaDk!wFBS^h)^xyh2#L$1^~ow{+ThK
zY#y-ir(M7vRR#Ge2Y6u*VB=94#tg9n!2vPE{s{CH;Gl2RL0OLRZEsaS7%P$phzE4w
zEDq#jZkbQ|kuRdkR0FDmaUpTQkdBCdk!=PgdP_ZFIm)q(yxTY(^(L)u<&D?OT=XDx
zZGxivv`%UO^GeY(h6ILu+y{&qA=;1#7`LvuWYa44G2otN>)xXZ;yU*c0)Z|P(a{Vh
zbw5#Aq0T{i#IqCiBYK}uTWt~Wa}SvNX?kw+W&GFK*=~{-TrsgLu=#9hH9~lA8Nf!*
zF?(i6lJjg`IRA{3ZCmPy`%jyP3?1^!BXL9x`1#L&-XxA!UU{X}4)_n%`jDaSC)M<G
z1`-0GpA$A5*#S#7*a7@2Dk{tkhsBDQb09AuWqK@%u|ZI$*E^89V^!_=rxs%tuVW6{
zGyJa;W`~#S<;RbvL1WvZAUkyufx`vo%FlYC44~esQ}nzo<IKgbTEN~YU1{eU1?AD=
zG_$-!KGFg_aYHQX6*8PccMdpf`Yf`YCd>9zhr7mG0F^+AjQ-%T09>L5B3fnKcn$#A
zlf>CK>Uxbl5he-jkQ_jWU;t<q`Ua4}ep42X-F5-gFlf6((h1rl(I0>X;5CuGboy|H
z^ba6_!v{#fD8&23@CWeW5ZRd!F+5;Q8|G`S`p69t0A~o$07%B|i<rpL!i^6;{Q}gm
zs~9hUZxMK75Lhnai=aRN4Fm#t3S^<q4uk8|21byCP=>1q81Yo#xq-NVhi_6@#MJbH
z=y}wyp;=lj7M6baadeiyQ61gK(J$s;+ngjV^h}Y=qUX8jC_QibF_IE-<q$fkaiq`2
z4uZZiUhXX=-0HssHAu=wpE-gk0)Sg&jc){kGDFA`I{<YJz~3S22jURoGP3wrXk|&a
zI%Rm6%_cu0a?1;})8yt22K@Dkx`$;ykslir(;rWm*-~{)V-E>&`xSadi6MXi>@013
z_4NEz;U7P7d^r4|eeK!b_Pq5rKYc$3zytn#-+1GVW^44ZkA2K4U=D|Tedv2vlbjiv
zfrJ3a&_rgU8V6}!z$MyUYoC4gu`NE~`Z+g{7vB~?fd%Ege)`JqOj-K(b?ks13o3?9
zSD$^96sd66;*une#eHwwq^vN1MXfke?Fc9!Em7pTvU~x~2I;M}08dwRKy>_5P=lzR
z#d5{N$PX)O#LxKdNA6Y=6je{6Zq<$<7NACmUJU)31!4pR!s6cgqj`b*2XM>fQGnzH
zfHgxwan9yK2jQUnU>&YsMUqZnaSyj$B<>$WMYwu&>}>lB2%Afo(9i-1=^CI6h{0L2
zkq?9bb_ATq7!nl&ss%WGz{$1}R=o@Y0std0MDWp=bnhgd%~oFt6-(O?8IyJ0k~#{l
zKz|t{?bT|`xRRA?Ufa}fI9mX&L-UHEAzUsJgYhXVexw47$J+zw)~HRQBw{@e(4z-8
zbq8*L+WD@x#B-)*woW>)hj6R&^tad4OWt_-u5#&B+HpiaFdCu);$6YmA)p`ta&&(Q
z`%Ac9{Z~k3xngSWC4o2HK4VAqf!e@4HZ*SI`1K9w=ZipqNXRv?mpVYt8WAeTY4abp
z^KDejifUQ>H!Ds>t?oIhfMp7KU8v`=MBWv&vr+weapCeXzph0$PsW@gR%R#&7Qe%a
zb>Xt}&QQ>+D8l7H)`4E<G<K5p0r02pn{U3^A_HE3{q>e(pleU8`sC**oq>b^7)3Y6
zUu?b!vq^YP*Z2Fifpj-60iYfSe_m`j$-&g!-~O;r!;Ky-DZn+S?it=}>XcYHB2dO>
z<%b#C_q<ua+qzzEaRMuRw<k}|3I|tB6UdKKkR4e}YvHbuZhnT0rEvE{r%Plt%9(y#
z!~kM%;;z#kU`Xt8@h}1%i7{0t*7AJd%uT#2tA4imZIms6iWo2e^nfvmIRE?v{4gQ}
z<Z<^SQXctDge;(dOT^N=P~u*qyj6&BC>2O!e8z5W=+@iRZh_7LL=Xx9Ee3Yzi%EbC
zqIFaW06Su44EHd&n<TW5^xC{3Iwh4r?NK1$M`{3fj|2gLA-rvi`rIbz2)bl-(KvX1
zkOyQ3#37&<3+*scjNEXkTq{C{#Fzm|2m+$w!8IdEK`p@I(E-sQ;OOGesk<xbtSUgw
z;Pv|@&i9ru423w^)-&1je5wxW>T0)Vy<PoMW4s5p>5aNb7+?q|?i=I51(JE<KCBVd
zh=hoFCl9j+5ydr(9eLiENA_~a(r!OlW4Fkgq{Y+*s}#S1YYo~0o2ZOyRTokdLObV6
zeZ{@u86u?bTa8jr=~*TEB~q=uD@ww?idISA*ikS?_we~;jp6ERPgOL{1B_s|o7CIH
z4dNW@0BSMj_RC-Xawse;44?bl=d4DqC7k~tCU=w<O$dNdc!PKm0cJS#o(tZ<I?D^p
zi|n7FH1^w34;c??0NmOxKl>OVjuu!JTrJ`(mMxnniqp>v6?uvjs1Uuc%J+Y{+_0J@
zmS0faXx#7wiBt*wJ4J4Q#Qq<vvbgvWMm0At*E|If$<hu8#<stm;&jDM*&+_tZ2^7+
zj>P)M&2M=_i_JA$HoLyjPk<h=K1N?8IR0#bo$WX%a3#_n_Xz@nKIcjjQBqWB^|56u
zvEf-*rxfIyn;jA0W++l2n+LI-G2A1%Ux*Mu17kWhYi*l^LT#c(fQUgtV1%2FvEXl2
zUL+apr9dDf2o1#c%p)H_1_MdT17I`(2tz!Th(JM-p;J~T1Oatpi-4U3oq&L$>;?fc
z(hk%Fs01Jah?7qqK+;^mkQ#VW9-}V!tKZBcbI57hi&8g8>vDI}Y0B6?3ZxR}NaXF#
z9=mSWA|l5!#{a1QUMeC!rKgwSU6$?(<HMI80%r<DkH*g45g76rVW9am6^q81Cl0?Y
zMJUk`)H4Or;}Z~y=ZvkMk=oEJ?kmIu#GMfudJfn;in+y5b$Pp@JIZ$D!198yZcTfb
zx1vt2s8VG?*@>WhOV2q(18Rb>pgml0#z|((ij4}yQ6>{>0?#m?uYUEb_I=Y$H(3)2
z_-AsaM`v9L0Wdo6i!Fqi>KZS%Q}GZLzA$I1{{Gg(0*Sx=!F$gR|M&dE!kce5g&ESB
z!gX#@T!5Cf?CQ5pkpgqW9<s)tCEX|v1Bm};$W4j}e-kDuLZ5V~)#8$|d}oaL0$Pmz
zuuw;Lk5Sxg8T7HR*U5$ekvT|&jvlwb62$J>xQO@x=zsw<Yzj#D$^&GG>H$vd0T3B)
z)B>F?;$^D55vC~Kz!U*K9Bz|1K8zO^%T4jERSgpLMh0GQ*EEGUme*00qzG$6U5k9m
z1^#$0jMsRkiBJHT^0Dk!pGg2FKn-xQ+;hyrPd@;6K$XzL5DpL}Y$}rtU<lCo2cWOl
z?{<NbvKVl(5mS!FkMzak+;bn`{sB44&<=JGctIfIrac_xPz_KQ0fR93Tc<MYqKJR{
z7W4bxF(|3}?xc1+F_!KWosV5kUFtqhA1(Q}4vc7s9j6MVzM}bxs0WQv_iuy5?e5lx
zN(L9b905Ew7yvu!5Y9skFfSe;ig|)Cs4B{_Sp66Z8d0G8mZN79Nm8DkMF_ckJwL0Z
z-^akuf+rzO&^%)8e}Jt2Cu%Q>wX#Pbw!{n(kCRk>i~80eb-_$M^Xwq_^1{0C#V>py
z9H+oPzUj~Q$J%q`J34akNA!RD?YEnB<V#=rk`+t9e@C;oYg|tVfL-H0ZgUhY7;tI6
z%NH*@ambBxQ>jcYDBJABwk~KbzvS%W!(j)PhgTFLxKxWCw&v?6=7kz9o)7}`6;6tc
zQKrZz9}((1^(|V+w+O^#lqO(?+lISdqlK5iEr`S!SBVG$;?l#(qtge4>jn0Re9L5L
zh2GkN1F4SnvOB=@0FDsg!|ELnhEoI#aOcB_Zfc2w4=Efm+&%jJrP@4*KstzuQ5g_#
z9(_G+5XKj)d58xr!6Cx1wyzbD0{Eh)AOKIPBovGbVgmy`Yzl~R0Kox(16EBkc;py>
z0t5jf64DG*AB3MJ4ghd!MuP$%I7U@#bt6u0meC`jf^nbB(lL5&!WOR)s6${tG!XoT
zMS=On1r7-S+&_RIyf;7u;E<GHFR@7g87U9wjZzBG5#`4d<BOm^1>m~KujriT?7r50
zX6ti)&A*L^P&NoDPZ41ewLo7?3ZjeT=+MSH8h)gD(VSxBxLMy#dIqp@Fm7LSL=R2p
zuX|0^s04C|Um*#?A{ppGbh2MSv|qvm3!TipM=gSS2m+1$0&quNF1Nm!CHdM&Npq|6
zNU@fS7+k6IZK4V=sD%5+JHhUSR9f_Q|2?LK#)f(`Uc@h-5ABlt-Q~N#i89nfeRtk@
zXJ}|>2)EsKn^iMZ;6IrD_IG?q{=UsXLICt_o`yYFPtIsy%~J;!{=Dee6TJQQ+u?uy
z=YPUG@4RCng{c*S0Ahf=!~TM*&76s2!np^QNd7G@7LH4XQ)9&4&n&Od_P`2+vbsgy
z0L@{E)BtP*w6v~Ki>T}Zq<fvHe$OZ`H6UG~#S~7iK>&}KmAL)rT`7aEAFdzi$5<`A
zMdI9XzoQJm1Q3rw^sB|+=0u>1Yae+Kl~0${p+fq6hzvj)&JPYB$pai7TsiUk;lu%E
z0GcHkgFwI-AOO(O;iEQSzW}&sILj3R5+oGd10o8dSB7{fl5rTKXe{*^4_hZg9B_{&
zD=)G8O=PS*<VVyD7m2?=#(Bg$Kn+42t`Y$3`HgJ=`f2(}v_bUw{D#9vB7nUCeX}Tv
z8VCd!VC3+Mqyz$}lP!UDHXVMp_3T<cwG&Se8dh)G`L4#&&s*W1bwtl@PlCgrWwKjt
zHNrUvsI`&+@LV8qLG3{fBtVRxaicz=Ha0#ooxjFPKP!uKWk=C&QkQbw6Qm->=_4)B
zJ!OWl<3PXPERPlJJnAIL;u)D;DE+^J{Y(~izrIRDqt3H!bcyn?GvONTcDY*TQ2mr@
z?}2$Q)`V|-`hsxu;RlB$OP1Pp-!HxNl724<9U^=gv%kA8-HVTzac?V;k1~M&H^2GK
zux#0~aMxXTS$6jb+E&?to~KRv?4N<wla$Ns42-))DYI23wPj)k03$xUFau4zbT0BQ
zEiE;dLgIQO4Op>aMX0T<HO_)ISPWQD(zy%T9!ww7`%a%oaF2vGe(kl_!pZMCJWSrS
zF8t>UOG8z$0@i4;#r<#d+Q?-Kx2*~p{IQ7;yTg?Vz)=Tuh;xGLtCIm1qF0qL&bqX?
zC^QLt;oMoE(UlYH9s|H4sTGJjfK37V_i2juM?fAx8M_4h<^f5{Ap$OuO#uOcn#F;$
zIgU`oM7^360j{`wfox;Dhz<ej1ohK6cLDV@sR$rC*ya~NnJV2s>IEVU(pN|XqQ{XK
zpN&I(0{FQX7@Q$GrY!T%v;k7)-Y3vto8m@s^8lL7sQxsl@0@41MFQ0rfuQp|L4Xqy
z0>spe58PwtL?jD<KVgzIwPC8%Ef6HDHCDV507d-WTO?sXG7=>LspfFK?vZiV>R&uX
z*&-sS#}x*#&9wEksD@q&q}78oCH-BAm$g1nJ>yhf)Fy;-9ydYb)BN&WkdW+Pq?xOG
z2_Xluhbjo8zbTR+u_4fc<__)45N#E_T6QJGsK90fVgj`Zwi)@#+pPH_ra~T$93nP&
zHs7pkGJ<J_Vp_E7xu_AL#ZHnra(g8%1w<$D0bW+%t0NAX75?SAOT)|=(?V%!nLefV
zZn8Zu?=FGCFlZ$4-a0YBF>~SXS^B%nHRgbI;M?E+c6j5BH|(2p&MZ(zCX<6dYZ31}
z#4i2y^O<ZbsjGi8kPra<o3CNZ&5N48sP(6DUM`INSO5TZ>f+mX^(rqfx1c(36TFnw
z+V7ifJ;?_+^WjCM9zLY>#iT#|A+Nvi0Nmomi^IZ&3$?}QK^EG0+LXNT?Z3_oII@k?
zBD_hU(<=Y!w$>KYkHg`vlQ?yP`USu&Q@CJoXQC{yYh<|(;7$-2j~^e2zTwc}t`L?s
ziPOjZZM?P_X5$|$_boV%fR{R2rJDyF;h3?cpDHda3!S=T-f+_p8y>-r?R?E*U*G_T
zOz;=R1rQeqyy46N9BdE_M6~FU;1JitH4A*;(%I^_L8=1)mKXr25a1pm7>L%#9BD<i
z$aJ_U!i}PXhGXS!Ai01OM=F5V0j`Xw0><hb-Ue(#45$G1<OiGpKJz+I-|D4eVVtNu
zT2h1t(gNzo&<~<O!V)u<#JCM<2m}IiuMR6Jt?rb*tv6`|fIYX$wsoQ>y`<Fh6~AK7
zS}*70dH7A!<n?!jV=J~*Q*>M>mBD@~Q@-p!7;mk(dZa9H`P>KGMj>_penk9uakMH3
z6Xo={>AuQ{Rrw*%h$P6|Hflb3-T?JNJr_mV^rv0#R+hkhxjX}y%NdGsu~IIEdg^Q^
zKz6?D>}U(~WjDh8EEeZKK^q0FSkV!FaLY%+!Taqkkk^ia>N9|ebw2McyYd0xysx~s
zH8nLR(F0(4XSqgx%JFP6H+=JP@W1!odrcbn-S2+4$NIpWW;)R>efjy%e;$rM{`l~L
z4}2g@pFZ9CMSqesax>7o^X|y?Cuun~1H3Q;O}y|d2Grp%m@fwI0l@hwQ>GYaL7ahB
z#o}*jY6?r2F13BWahrqdV9{Yg;U$Jcpl(n7>7y?`v^#I!JZtNyqmD8n1c}5xd&~&;
zU4Bxy_+K6j2T3KcRQmtPS-H}oYvJl#9bT<zW#v_f-n_6u0an<Lz1%hr>I{|TC1(9!
z)3DkA4!>;twj*P|4l^8NGoa;C9{`xs1mb`x`dYvpaK#{y7zHyFr497~0JvPifbj30
zv_XW0<`UyV3=c_^Cze0Pgh(1d2aL!Vr*W0a(;<#uhsH&+zyrYOkqO^RdB7jP|LFpJ
zw(o^|<T?gl1cI^XeMXo>UI@N(A7O|AWWe1X6SV;sjH;nobwJGIWXr;QQ=85cZb9}C
za9rvm1jtymiR}cE55m<lHnSTL=Sv)b_VqHXOmVYpSqsrb2weJRKr7A}LrlhM12+;Q
zdS0V*K&@Y6wxUFK>#v{a`X+B5leO8M7XwW6_9WV7FM_D;v^wGP)i`+|#+p0Y!*ro}
zo49vE{Xz&K(P`1Mz;nQ!3)~x0mi+tQloYA5FvrC8D<w5ylSR}yNJfa=zeys0Vk+!C
zMVmdz4rR0IL)8MjVdS^4vDG9!dq~2vSkG0qYzfB55D>}Fo_c<D>q7RJP2t6ttHVtn
zKQkOJ!$0g({2gQ7dH+a&JcIxr-dzX*-d%`+B~lUaz7`i3+b2hRXc#f6a;ypbW((gZ
zo_HeMci(+o;P2~%HITtSE>&PJlP8~i()JYaab&QWq^beVKtccvaN>ren7{n}X^WSO
z4=;3Y<^7y5C>)cwq@=_M0tf_OP?j%WZd^K21BeCoAwV)f8OrnV1>fuKt*_)`(c+tZ
z4d~y|M;~qHqR__Tjs<9EpK?UF`jtiD-pAe!2h7Y5%e7I^#If2KW&D~jNBUB@+K%=%
zg$>>q4wzk`#Vc~P!k4;53v`V>RWh_90vq^)cz*1d9D$WMbpbaV+ca^##QjINj~*NY
zK@G_E2Y_p;lQcmrzN8I=4IX2J0hmLem}y$|;k27NTg|wSZG-XO20RJP3tqIYZ8f7Z
z4EYeZqh_EUxJo!|{^Qbz;TQk|2Wr0s7#fb{CBmG6uFlaWhJxsa*>aaQh>ZY<ZQLAf
zX$)aPltRD+gErblU4R;+SsWx=4>Mjw+87N&jF=Ii#!oxN;zU=8GtHYs9DDS%(Oslo
zfUHPj>Sel)L7W~I?W4d(IqHZfeo}^Wd{lvfv0vv~=Tp_j&qn!sp0OjImA)pER(PjX
zNJFqOZuLyky;P;BNm?ac3X=qo8WRK+&lkjy5o+S%3q}0N)I>y{R7jf3))d%tA8;$h
zJ+d0(Og)cC2~1t0JsWFe$)B&eohE`|p}7A!6LZ5H*$=#>9RT+bF>vtoDWS2kG0YQD
zfc*xNfeB+aYL3^01CE#-E<gWFd(QBbfw1%B<6&+*J_zqF?<IiGdqt!}w(f->Sh;eg
zy{`}iynl1&&JDl$&2P*G=>GffxA&ehuJ-eGGWAVAsT;!M?z`^}1Qz<i4}M^h$xOA8
z)HAplNC<$z&EKdh&5OoMw=M9Gcb3H@zM%MmOWA^g0$WrnD=W?5h{Xyu0b+Y30}u;H
z28hc~e#)|t@Exz0_BaRE!M+9<F(G!{fB*drAm}gUNr=nwFaONN?+N$5Ci?^}wq*j_
zMoG|Hwq%7;acdZ8v32SsMGO?GD`ao08U0PmCsZ$CSEJ2uFwUBzSn4c%gj3C)2)Ls6
zhS(9&pn5=3`icagN}CUncUf(Nbl)4o9tyt-R|&A-2|yV>04EDFz+E6~E=2yQB8;mS
zw@-bvf#d<z0ap8fFH(oKB2Ea)%SJtnRoSe54Eg}wW&tt6X@Ub`5#G73#lWaVBIN?v
zA>d&^q(l0|_#hGhdy{CW9~RM11c?y=8q*ftkBQ>qC#(}ktTK2w;K@LF@@NPa@en%%
z_9&pA5EVo|M3O*0&arLsSoN`#5V@MSiIZim&OOt0OAI9S+c<BIm7>4)v-^+bjX3e{
zll&sLLv#0)R8?MmS3BJ~(2_)4v7ckCMtpcbtWA0r)o;(6?g4~QOIvFw$r1seapIzv
zud-O|L+o+iu{$B)6gC=o2Gq#bqX>dXb5V~GpCVOK5BFfITmx5WQ=ipsn?srI2U3Du
z<(nzvP2yP0RuoIz|Jnu6D{9*8-t4EKKX`|1Ql!iA>$im3hV|jcw_FseW=%IY!#Mc+
z`1p>$W4vE{{5<a@?<gX6hy;jvjDLUs``?GZ|NZabFMs(<c>M9lyWZ8I0)Ouh-w!<S
zfNf;P7Uqi<Eiz(+^GO={8R#Z2N4_sf4`MUGOWxPSf(Pf$LcjvopR%;!3ku)V1u%Q~
zVMK^kRaN1z!ww4v9dwWp2N(~&`s%A@nT*PSMTUjM?FOimdgu@5P%k|9+;b+fJn+B+
zZP8}oj`tIQ#fpCJtcvjH^=F0ph2o?oBFz?X=j7xCnY(H0-^S3Qg&KW(t3s?|#gC2_
z|LJBe$mn2+j8`loW1@(JY6WR&m)!umdh?^!_yD;^0p%1e(%BNFj}zCATia@ZtGX-$
zFImp(hgr(Q@#;1ZA&`Y0i2~929MsXVlem_rx)2@1H4~%=ZW3UCSSXQ*pFZGjmn|?w
zYJj*O!!ck9NrZ;4`4AzbIYS~qip~*`sKfO6*fGG>BbMjspr1$(=pV!pgalFqIQ?49
zO`FcM84+>&*~@_Z<mpsDid6?T52zJTH?S43sYEnp3m%ku{s>3|CyI`IjMM{RoYWDj
zC!Qiu)wv$0>0xw&XQ4ZB-c$U%pNpOyKZ->9eiZ+X*VBC-BffZh{Em)O`BTS$D>46Z
z>BCr$y%5&vz5w>^Qg?7~AQ~aeHs~0l4oLyT7t$Og1*=7j&JgftiH&M(j?_=4+EINa
znlC~MqsD<S<@wB!f!_p)`5~m1>i%%wT4g5?<N@&Vv#Y|_t~oQDc+BB?52QL!TYX9K
zcbELUSF8oR!*Kmb22MWt<Z#3hM;PJ$*kg}{r=Na0oPGA$;h~2fvL@p5)t5H=l7~9!
z8&X8tIrrRit(dP<9K_%MzSNoI$@~nAyG1GU)h0C#b_RHXoU-3uUF73sCjr8lc&U6*
zp-%ch{j>v^v!Jj+5#KCQE^5Ro87@Cg2q032SU~K*_uhMl%F0S3EU0%|{oEwZt8!+A
zxUUfY--Gi)RYi7a7B@|Rpeb5uixd%v2zM-;=xhlLME|hrF3eY)djWLw8o6Z&d|2S&
z?f~Bg0X&>KV9PlI0Kw5Ol#3yt3b<oM&%zIwvvCZ5?|?b_VMO^;#F;Y>xNuFAC>NtK
z00HhYM+<$Y2pH4>1dqW`jB@&E$AAdHOrP0T0QWoK4`PNG`xqRmI!P8Hw?ga+;M6fT
z<9waQHBN1p$O8f+M6C1yF+|7&5o%I>#@VY*EYnGB0)&*HOdLK0P&;Z3bmXdsaWX#o
ziE$!o3H1CJF+x<(H|`Pl7(+*hg)n}y1l-+{BWuL+MZ`+)Li-4KQ}COrH&v2Gc90+p
z>8}+`&)Zvh@-pe$Nmie(Qyi(xByr7vGQNKVLc&PTLM|&3A4Y%Nd&K>eXxDGNICLwG
z2$+&5!#@$47*Y}(sZA0j?mLkt@dUuFvZ_>DOK)5kUe!Jq5P*3y+AP<zB%wkml#$@e
zO>4ph?>#Wwc+EwMuRsLGfqVbN-(AK@91S?|AN}Y@;m1GzvF$H%;)y4QQ%*U>-eJ@S
zq;0*Ac5wiIXq!48ee}^lWI_6Y`T<*jhaY~}b{RxU;rRy8L{cOJGmxkNGBAlFRfU7I
zFVsWPkNAQDcLCS}3UN^*z>nS$RRM-ba2~weND**ZI`YUPjr(x7CHmRhi%(mC>NtHa
zdH+daz9Q)T+wWcv2hS}EOT`p-jLQvE6fB5fKkLU#P^3U{qgue*Wh;Q5oIpU!B=WBi
zn8E21r438#4O+<IYG=tnYqA#DW<~8AuWOvgTHmA(0$~y^V8jhz(XY@Vjfi%oMCS!M
zk8T~yed>nW2WGIuFM%-7KSb^Yst<8{n}Y7NtIiT!=`<e=NJ%t4@-<2wKv{@|iX3e+
zr1MRk+So{KaUG(7gpnV%1ngZfzCAMV%N3D990Q~nX6Q&>mAATCbXAUh0RVTlTV{vF
zIc%e>eqbX(J_sOpU*tEgiR-j^4IztbWypwe8`ki}0xE!im8|5)jTiJ!()PoWwx~h6
z*Oj=JTOqXC(M?Xv=|B^mr__!TRWm>7frYkPE?%+qEuK44<M8faK-j6EK-qZ(lIlc}
zD6yR=Q*8b^>G7ExBp=N!>>D8>Q*%LV2|&J7qW#T!ewNlWXf9<q*(n!I<tbO$C2|!+
zVlY$2ro00!dN*_{r2o$s=P&P+O%p``WQ8?)CYH$4rCf%A|MKyR6-K(iq%D2r{^O&H
z|K@)1K^5?aKl~wl;~U=y|Mg%0W$HKHUw=O-)7yI(uWO*ceGKn??|aR#n08n{KJkf9
zgv&0w%+!F?&Am<1$jv}P0F2z+Z@1+gy%@Yqe0b4zly|!g_2VM#cv~zaB#w~`Or1K_
zMEvv#VgRlmaUYTbqy+p%|BqCFL_QW_Ux2(`z!(uZ@%C@L;_R?sRa5xmI}PE$vb?Zt
zt;FYR1eW5sF#Ic!DZrB2Cb<i4l!bP#BD@KlwWzajcgQBd7C~{;V<i`#Rw8|^)B>xt
zD;r{Oe6;~|xMsi*F}HKDlq12hk5?WOwX2Ri0V(ShL2#xhk|6zT79dR2_|elNA~()o
z#o1qAUvb`8_vh(%q1wbv4-0-!v9?WOcX5(<3UCkUGfN?T0ASF=(-<5%=ZI5)5h1RE
zNI3{a+$dlsq%bax5I7JfaQsLxY9wN2Y<!S}K={N73)+O!ck6zP4D+(JNf87PAi;gm
z_}A&aO_sd{5|~JZ0CAEENvvO%m@l=n6%n9(5pZ)YO8yH%r)T2XiU<QaE!bReoqQ%y
ziDkAr^vCa$U+O-lZgQo!!Kv=bnHz|`jq3=>E2&a@MAS{xz2+HVLnS0eY*w{e&&PBT
z1Lg^#Iwp${UD4ETDybQI#tLOXNtDSM()E|f1rQZaaqi?WR+QL$8UGdN8N;1(uhJsT
z?Uqm{b=FiF|1DV78Xmp%lCaO*O4Iv0aQc`A@@>z(!2eHw`ct^|)?1ATfYaxl1N^z}
zbCrqqw%11ti1~OzLb%5sdzj4){oy!CBR2zy3Si{seupjdg7lUEU*lkg5&YBp5hn&%
zOz0EmSWvJFK;#GK59kxT1jzvE0PF?Gk6|E$gD(KSAbGviS0qvS(fiH{zkPOISfItW
zQbt#;;{1!WAkWO#4tN5#1{r1%qy>NMI@txlabuiEz#u{d=Lwh_rHjP_Kx?+?@ZsbT
z?c@KCE|*0f5C!1S?-H>O^}uw+WCuuy_*bGZ#Bl8TZy|H(cZ;}ixNXGgP2#TMlxY{@
zr9o|WNCb?VohHSgsL*T_L;$HD*gK#vE)(H`uAXFVim>Pu*tiSYH1&ac2+0c&;*(#e
zc9|a{2O_dY70{&q6VZ=EKqJ%<j7P94qC-TG0G)nf+fa~FE3hjedTq8Nu8{be+(<;Q
zh=-uSet^Dg94q2M(HSkJ;VSoSYcP+pa*gcWaNd0$Aa<fJIvWAHPN&NF*{y-zo)K%e
zJC7Bjuo7Q*N?2D@=d6rfh3M1|%_Z^}=4h;<PiD#11!AaRvWQF(J=`!PBLF_!JR*Kc
z>y5ag95yIOcDVmQ7`r`oh!|`a@rHV+P)4GQYU*u%2_bEUn@AB<2|EWC>p6RU!Rqjn
zTQ3b~{=*4fR{iO7VqNaXAJz;27hV6SKJ}^a$Rm%K2TvSqZ7vwUK0}@Sy`~+0pL*)4
zCLQGc_kIj@ACmgFJp%~=u<f}S@hj;B051^d0Dk~zI9}ksOalFavuEKX)G$QA0S6pl
zAP*tnKFg>LP!|xm1h0aY7P(<5K?v}n9Ch(wZ0ixJieox;znS6QKmT1Eu(k-+!nt0$
z=p3!O5CC~vxTh-*+%E=WaNZM@r@l=Q_|zvh;8`ixK8*f^35ta;E?fYp?B-er;G>tX
zmyG}-_`=-CV6Q;j{v?UK;ka95Q$T;v?JlcdZDL{21@Ssqg6@|P!&uZ4Uq43%b>?=c
z{uc=t(CIFh8Uf;<K=tE>h$MgjGV~DxJph*-<j8~f082A<1Nd;{{6;+j=;N9PhYB?U
zx6h7%5D%1rSi$%Ze}9Y<O$wlI`pQ<x%q@gOq@%{oj8bfKEENhwlD_%fECM25b!}Ae
zfK3y!tRBm#KRu%VsXk3k*FEz*I;G$@x<<!2=#jPt|L8KA(~{MZe!iQ7_1uih6U{}%
zT&ANga8X25>R0rwOeS&CT=D!BidbBy=aYFL0${uD9~EKEj|~O)WokjT0{n>BgYXnV
zhor>9{pvZwfRIg>as~36b<ZHe8b!Q8{;_!zfqt4>+rkV<240)r6mGxaobciE&r-L#
zC1G@Ppow(>@c-}s{_pUi4}B;+^UO0{QaLBI2U_<)%Q?vVH~j<r{T&}@y-B&u%s@f_
zWM&?Rq!wOiPrNX2eWa0knY}OZZ~DmM31<)Z1NLz@Kl%;P0=T1-B%T0ykPJXDKn!3f
zfZBk(EDn4aBa1US)EU|*{p3B%!@c)EAMy$&m?71I4tXl9>5yKQjeOXwM^Oy5%x6hD
zP~FsG`hVQDaD`&lEJnV#UyH=2g`0T(7Q{umSU?c4EGsE8KyH)^mzY5dsIx@A+MpQQ
z1_RVk4jw=VA9{@N=qG`R0Acd6V;;eN2&RL`m5qgvG+=2CkwN@`R&mxw1ju-i`Y{B|
z6*mtsBT814MraUspkDb=0nkri-l7xgxJ|&t_>llWs8D|qgp1lhBEXp7`b#7U!RQcw
zbzBT_L&S{`JOnl&2(V+o&z%i~AS%XaEJ#M0Hxh+T3%2sb6AL=`%WBYX16X}~iU2o~
z4Uo4xsf^0?Jjbc@79zSqbk)|Ww{z(|+$zsjC#+ujf#(KNfSjysn``<40l@uaV;?+C
zxX0Z0_1Z^(=b5=<9+B#x_F1jD1LSjM>A$F^!PG;jX$ZR1CQp<siJ$7l`A?S=>*d9(
z!{@I!HC%uB1tvwHJnzR~5`6+(0sS-1IK#Yj*#Cp`qiMaMKg@ah{U6NOld=P!frJ1U
z_}mR?Nfsd!MXD8k^Wu;OyHHpq8Tv!N0RQ^>dh_uo&<OfH2o<i;SAW?#hFD-6oJTU?
z9tEzSL`48GfMkGtfM>)lgfO>qT6ovKrQzRx_-t64C+mM1`fV61QT}F06vW}<9>^vv
z8{|`sh#UhpIBGWA!M7VxF2QmLfP;%3oN=L~1(N`58De#AP=6&sU@P2ufhDp1;dJ5F
z0a7+}Ax1u8`yz4DfIlD)7f+xZ02UvA#O>x!F94>EGP&;IQP3%XF4i^1-lF3uiQv%~
z5xEoom;6Wq&>v$H0I&n*xCQF8>W<tCYeb;niU@GiXLR+s0&R@Yr~}LM2u>m!0DugT
zfk1#8Etls6HV+-TPhCF$Y6k;20FN=j&2AYdI|;Li@A<&E)WG!nYOhD~W3@$!4r6)r
z9Q3~UZw=6l&!hbHXd?~mdsjhFU%cI(HAGc#J-Y7@n$`ntZwxWTvxCh9(iD!T>OO7~
zk+xKl9Fr!gZ$w?>K42)xeVaHIj~T^-P_0N0@Sb1~iV7J9Ho^G|{Qq9x8LmF_2u1t5
z*kb<^?QgK}H17)F|NQgMo4-Aqea)OXbJT%9cfi*KekW<<XCNT}M*bdb)dL4l7pt>4
zv1pDKN}m4FAHW$uUp5pWMgWF@-q-X+o@4(oV2A;ia6m*5T!`{c3_w&6$qf4eTb#8z
zAGpt)aLSQ0!}lI~DKv}UpDJ#*O*_(+OjcN9Ewpg|8)ccDBe6Q1E+B|OSGMZMfR9aW
z5c6h<>t+KN;l3p->es42#MH;I58ZExR0g5|B*I;1=K(#U>H)A=hGYGYrvQWk96ww+
zpv^Y95CbH*b}ae12GFouAMSz{)lYfN_CR9d7TEw0$frUZ6R|IVR&EgqA__DHz<-kn
z3pHzCuj<rCd1L4Y(uQ0SGs~LW^jk)e`kf~t!FBim8tM+p65IwNh(05E*r37V`UYV^
z3;;y`NJNkr5Ct($1Xk;2b`ung(kPY0QUv~-NS)Vp+Ozvk{^%(673Hxi>>A}PH$PH|
z<DQ99JZn8t`Y{)KJJ(gn`<+&gw;y%N?$daQ8zxBYg9in3oIOb#zYfSUS%!P<y8qZ_
zFw$rawI#IXtW?ilk-&XnU8A1AO|~HuJ6l3{DT=TpQmaFS+*e<%=?GVzc1XDOlb4xI
z$y&YF5P5@14*p2rh_1P4(V|dUSve}fe=y@r$_~v84DRV0nm+B;wK}-M1+!Bg;D@-J
zfILvofF^miS{R3E*1Ugk=R_D=xNxBv`>}6;hbQezSK`C~ix}fXa)6qks;Vj+dg!6r
zDCj_w5<mpJ_S)+bZ$2CLs3-`(yXk}CPUQ_Ca4b}0y3PsmE&zx{3}k2Lg<@GePnI}$
ztTqNJEGX36Y4<hdT_Yh1hF*<wbz7-;?2H9bJH~za;&|y3JN^+EhTnj)>(Qe*kM@Lz
z5TibbfeGT4(V3fppz50>;zM0C&JlZnT!Az$b#Sx9=*L(Pu93)dNEI-`!^jMmK6Z$M
zi)U9jqR;`xggfRR*V!o$E*&Vde(N`YLw_M+07ASYsGIiG4eJLs2<WOYc*EU|SPPRy
zsF-Jg&T*bfRm94`Q8%v9b0+Bmb)=_gF6c$itA3Os1)^ehlDtZ^la`og4RfnRF((lH
zBK@<D`iX6Q$J#-3&s@=Gl0A3!i(|lCz11~<*tT<)M}T=i=o~~IL?PY)Z0*ebussq)
zJfhaY@UKWj!sbnSwiMxSng}*_Yh)uTBr{cIg{E4VruY@F3;ZuX?cng`>pvW(XctE8
zS26|uc-&xcyI{eBP*qixfWO_}<WFB_AaMii%Pb9J9u^}ODWd1ydFP$>`?%wdGwOc!
z?AZp8bO-f-i`qFJtQe6xc9d+P6z>#(YXAU107*naRDh6(1jtT;#~gEvsR020f%bv%
zdh#(*7eo)D%^5RhnDJnf2!Wc~dTpUPHhkxU2ZfLQ?0Jb{Cxin_a>I&d#Rkx(LYQ35
zkjRw{SuoC;g4kS^-Nd>_56}MYCb}2kqvvHIhU3E(5HKad$ph{H0N`P6f|gG}p=)r5
z^ceBEZJr}hFQ9>UflCl@$7AyyKpoB-6~I(o$H<Q$Ke>`tU>V-2`-pKL26q6$N~tUW
z?{X0Us3!n+0Gfa`X4$X2NFxyY!-W#XkUnl!V@M0~rSB(r5K#)5R`w`hheP@c<k1F3
zgIpxI4H5w)5vVj&wc641H$j^}=>q_BZ|s>=*{-DiM72?CEa`A-kv%7sU-eQ?JaGiE
z!1$xz-B+2X^pswn>n4dv&lo#4BElpnQD3bN>qqx7^Aw$uokpkTKS!!3sR9W2G`o1B
z>G#`AH4>?K)@<z5?thB1C}X}d#iCe6#7PmBmGbAGF4sQnL5QwcT~imv>wCf5ZQ)bb
zo)-S;std*WYqKCpm3WQ@lbrMCJtRse0e0rhnUjG3U?z1pESsnRM%hjG1($gI=bUp+
zIQZa$ZSU`2{pwd1&%2?a!Bhco%6t&RlUNX(V~PW6riMq|s3ZXBzy0lR2KFpGaKoG%
zDEO!MkG^~VX_IlWXwx>)`^wA9v=2d1*t2p*IPb8kuyX0D@XRZ<VZT{ZLW@ALVzSDs
zT&?u|xWyHUgToWR08@asS@jV`56gc5o6x?9xrv^K=$q(mi1mR7qVK^mBi6@FkPUS(
zCgd7E{SW{U5zTP!0ssJ&z(D}J_-92d)W$;se%$TyL~tNxC;TfFOprQ)5W!6XW4Kfj
zTLfcgK!QjCys-gb9+(4DAE<7m64)Xj@`pIU`v9o}5(bC_Tmu=ONi)<Z;vImeNGZ@s
zL(nW2p}{6V5IaZ}=oG39)FflFWW1*D?i6v#c5FzQf@_d7mZD1$0UJAQKY{q3x#7J5
zUaq8FGwk1*0EkdJUUT2_K+JXJ?D`R}C)Km4Ln31I%qi{x2Afj++p~?19|KYdwQN0$
zxK8G2ry#squ;ai^ph3o)NOwvV3m{)cf4RCTZ>$>=zWmYC!e_2H-_%b?83wD$bMSxt
z_18`Hh6-+<efAj=@Hc`<Q|0eueAbfRJO2zwy=SlO&Oee}d<5L4CEDuqg%@71Sl&no
z*iM%i-+S)4r)|!|6d+1><H0zX4rY1)AP5aOem3+$-0uqx*M@6x@JAJ51DC%^0#!C(
z`O<fO6TbKJSHi(ZRLG!DZd~p1D6oFv7B?=eYLFd)028l(wE|FF=-99Y@WVwAOM3L}
zi02WxV>Q1?L=7A90NUA-8W2htZXbicHCn_0ZuIMf;f25;jy&RixbW2?4nVhZtnc-k
zeq&HpstA6o<e@-<_#j3A*RdwY<^V1p?*PCWf&{{Z_ytqMX)+$79X6<s%s0eGkz5K1
z_%liRW}+U>kvn9Ih!#TrPLl;X(F&QX6%t9alOF^?f$Bx#L9ijl&;A1RZ>8)Ah?6ij
zE6*eV*2&Z#xa<fb1*8H;yWc8fSQ;Y&Y|eFF8REqw_F8qMpO5aR)zUM+asO%Oska-(
zEq|}&EnloUwF?1YY7S1vu3HPKW5(pt1nM0(G1~~vX6?gJD(ij-NHzc>x+3-^M18DP
zP$|x{>mU05PN@b&hKC~goh;DKejA^y4gdC|tHb3No-Ml*j7Vi@sCy9gdtjgn_(S;5
zpFcmaxejVVTy8l(Si&MUYt~2l>4d$%=fm}`$NQ4}9=aJwBmhG<w;8FK7mp7wA_j82
z?5Cf8x{3V&WI}!Z`q#fU;($EZ3-AH3d4VZAm<5Nj80its5Ad(7tTckdMfJnEAkdb#
z$GG6w3&cgAa^(J@qGU|?kN@{l7+;~_IO5O{-?Ik*gu*&8xcM>yELV`7$>Q$W{f@vs
z05E#od>ICUIat88edS+N-S_qcG4ue^H8j%QFhhe%DJm+BfYM0k44u*lC?%pG(kdN8
zBhoG1-8u9#+~427@XVWeG3T?->~r>BYwfkJ>smBjCOLZJ+5tl7?4r<5YK>~qm#Q1%
z|E6(Ckqv~)f06>Bb<WqHs^<S8pll9AMU|{P?>KVt;}Zb)qF-syBaGK0UFt0PoiAuV
zP;JUCddCi)wSLzHOyJUMGIXoS4|s02mJP^LzV6ZFP<qOaqSl8Y!qW`-gw5(Em4P%z
zScr)#2nSKK2>C^)pBoX@O|-uDqhFPZ5&pu1E9uu%_GZ<CC%VGUefO`mGb{1F{1ES>
zvWf`UfVuYxV*Am$8=nk&aC$(m4HW3Z9{U7rA^7Dw4_v!h+(iY*HWM_}sFT2|C!L2H
z{Z9)OPmnRk7h4CoPw%P5aJrne(pEg}@4BaUkt7j2V`CvcU-_pbmh<t{qh4kf6O*x`
zox;DZO#v&3WlmP1AhXdBYH#2pJ?2@?ychL9dR`!BwegWfS7$Sxrc*OyOH-c#x0t$)
zkc;w}wO^MEpZ1s6bR@x%IA+{6=Jj3xqWhUco$k9S&rJw)70>fttgr-N7Jzku4X2ZT
zIl%mqj$-rf7@~&ah07Kk)<6kGM*KE>#Vbv^TwK>zU&Ht0o4p}dU^Drv<>-ZAQzSys
zkfHTPj4QlvY_-e~`0Cy7{SRweZ9-*jYO!WAhF(%pV%F;a=*X(7eO3hSRrO1_9XQ7h
z6NvDWvNZWjV<KSH5Ae+MHli?iX=4Nc$QF^5@!Zh2tQdWC79-fr9uy%15DD^PsZ!kr
z@6&b40dA2wB?jmiPeAVLNwxa8Qbho%k{*vm5G^pzYDk7M1aI-%3x}V~mIG31{>FuN
zGW0@tiN0?yUm$E;9Fj&O?HG20yqEy)5x-qh`R|DgjR-z(KS<!qoiNb0(PcWJ%?$Sa
z-BCnCzV6ACpVd3^byo=dPg+kSHFwQ!>nKNpHLy5seCQ$5#)|$IDC|#?d2;u;t9?J8
zrKZGN(}01en(!;5P2Y?b4JSr9eb+(-W>p>~RR=wOl&zM9wbq)hq&)wGpiGe|gUM}c
z#eSqvBe&lLJ59)o;chCHFRjxrMP94fYQXjO$jMp>+&`{0EDwKanicYw!&t7w)d|>2
z8#t@r-fT}D9A)256Z>z_eDXGS8&p$A4|EP5-gJKBIhR4*ZDerTuLPbJBK-KX4w<re
zz-fM_$4luO%#t>@E@a?NyVU$&g;ix)buZv@_6ZG<Mh0y}+#geCGz%Y+nu(f$y11G4
zvVE#+#ECi}0#Mz2eO5%g#v~!-R^$jl2IR{;icf10Wf05)2yV6V3QSh_lTU<_giieH
zRR;cr#qBpmL}Ia{vPK(vB=bw!U1rsqZEKj}R&oje&%+SV5CF{;#gK!Qy00(4z>gnn
z7%$X5zCNdzN*t6Y9wC)$F^5G+GlN@<W@zLA^SeOq==Jb61j!dggoR$Q|H@kyCxF}4
zws6F3R>%#@U)RZy+#H>5R&s;{F!cds-rgT(Smpqop0HL)ej#UKLiz11-~A~%$3hD;
z9{Rp~_ybwr%QU8k-(|NSZ3cCW{_Hd9OX^f-`N3<VF}_iJL@+g+cxdQfKKz|)=h1~k
zM~=vm$kVXv4jcr7##>%l$@|dwtWQB1TAy`nX{__r^*L+$eV^Y8J^!)U64k^{;5qH<
zjoE&BOWV!i@RBc=!ZG`d@;6<ZSXl7{36vm-Ocl7tcJ4K|UsA4EfYiCS|6YWjwNh7h
zgJa5^Y?NiikgO~FyXvcv$YSzDyh@U=^y<CF%c5(`IEAr~-LF5TZ~8vgy|sZQ0FIG;
z>Q4#QXfdnp@44rqQU`LNAl3C&Tb6787cbJaDB_9HgO|~|3YxS9cC+=~l?dPslyv!_
z6Z`C}=B>3-Pm*-gGlt@6m%aX0CS86uqgT={&pZrS{RJ+(Pl?fHjx}z#2%Avpw;iO;
zt?A%j@1u@%V$LYY!AmK~lxhz^ikWaRAviitkrA2IJw}kcFp&R%3g|Wd7vmg6z5GrK
zzpS#8Gp<6DP8gv2$h2UtfaK$^C8J=~3wv}DySsAcHnBlyL}XVQbV)gnplDB%c!IV-
zMHN(gSu^~A>P5*Y+c<D}_$%A~=>Z!cZNdZN5Gt7U-3dWYg_EHm7;_AE^ssgbC@>XL
zRx%Pq*wAWmYm)MJSE(f&#!DY-cH30*9e>Tv%0O*bGR+gZt9$3#NRFy+*E@zhAG~((
zKd2wv$Xa>wqM_eJG?PuUj(M~hDk1j0{ugMlO@c#6eZtG%GJ^GU+RfsrNMDFE!w>oP
z!&~IScxkA!vlixh9L_xJzgYf_7jf8Hx45H6(s`p4s($>)W8m13hMAG*DN6`w6O{FK
zwzEB&=2Ob;6I}%>)b`#FPp5*W)UKNdfsh28A)$^GC{RUVOT`q41yo%#lGzdQ6W3>K
z*ap?s)}luDO>ZxrQ*JGYy0s8+ry0&g0@rq5xXFLJWesPva4$D|ze<xO(yCTo(8x7j
zurZt;+B+#Wzk7>y^nIyWq(A%lNjYT>K~TKn6feqhMMU9kRR63w^(-Sh6FP#7>GlyY
zAiVnbGc7XVN%n5jhH5zK=c5o%n5lLp_%qv$`F{SKCVoX*yE$V-IB;z7iK^zjmY%>U
z(8m)J4KnW3B(uOV40%#Yd4S#nxoju3e_v`0#{sz9zIF-!kynBK8R+`N>aC*E#oNg@
zL;1?Q_#XG$!cZYP^rgU(LD1CrYC+b=Z;}jrszJ>6LZ4CR%i{Tx^HE<P1RF5_8LJbL
z)sL4Ae%?OTAIkglVlm}0YRM6;{gm8)@LLr_Q#|Q%?J8;}rC3A8S}#O(Gvh?NsSYJA
z@vX%22gjK1ikd@E9&WBinZp^*H9Ub?;$(G9Mtk$a^XJ8+3oBF$KdUlWN@|y<%R0By
zxmdsK7O!z^3XEeA5b4joa?uJ{u41KRX4HK^*CGGXMf7Ww@pTE-DWZIws?pi*s7kNW
zfQ#Ym$&b!hN2>wWy4j_;!{^quIZXRN`9<sN^W3oIJ@78GBWMcf7R79b3SDo;XA!q$
z*$b+6GkX<1IaW-zN7wNuXk!Z6|51X^;OWypL8j9s)%$bsQ@`rtDdfN|e8;@$J0T6f
zp5yRHa^9&MO*WUA6WP!1DCJMpNKnoEu7t@tF@$|6n+w%0#WW)XGjgoM6ie{=Pu~t;
zX}a`@VB(vI1ylA`S?MK;JmOlG-b4VF@zw_F@FkJ$OQJXb25LnRL+!t*P^5ofT#fvr
z>*u{sI0~MH6x;_OH&uel9<)K^ddu~`6v9SC9s^?NYzmNGRAq4}O1hLM4^`ZkY_W~7
z2$0AZwfIFh5=iR6vaL#BhpgVxE6JZ~f;oiWPH`kr(iEvq0(23FFUw2d8PUAEvQ*de
z^~PH(eyUn-#u1zI@wJcdKa<2J=OL@Mg=@y6uf7`wl!m{vUWzXi&86%rrWaq^I^ol0
zOJw^O0R`tQq<>vN8oR%%-mN%lSqR^DI%7zHpB$eknDhG$MFI_Q3h)MwzJ%Yl3k8t(
zVYw9hC#A>+JMIPpsqo{!U&)-WX_<0^=4a>yL9aFOQw<#ddu`3Z^F$;nR^^f02uip`
zZln^lY`$#5@95K<cmt=bR<{ZSh6~fzFr}&SdZltO)to)hjH71$`J<wLt&0Up(B+()
zka6xsh3M7gbzF$k>``*jSx1Ov;j$>VKIYSVho}zjgGT8KqZgjwmKcYNWRO71&bQle
zS{UP=Y$P(2q63$roh<kBQ%29wZZ=swdy#oQnsStgX_6|2dPIZGt8$dH17M41M4+!>
z*b}fo<bm(JxBUn*s4RLCxH|RMQaIvukfkz!QZwvAAKqi&Oy_pFHzNMg0Q{@<>3h*%
zK$IoGkpDjJwOxYw%S-DT{Z(o@1ieP_5ozkLeg&H%a@0HZNY@Xgmp-aI&gF94C(R<h
zCM7S2YIZBH@*0Z;(3NufpH_Aht(uBV2Zb&fR<>SH2jP<k6e5miPEr-!nASDi27YF(
zGrER;&Z~@sJgs^6H<@F(xYJfSErqJl{Rke0ls_X@``9x~8brwT^>k>i*o#l@VrOab
z-;xwBfCIF>>3K5x{FBJu_1T=0>aIi8jJ2;LF(=e_rhs{3F4!N>S?=^mj$Zej|5_S0
z4L>2X`=rBXgNS5~Uy^jW0N?P9W9<gl@I%Gp0Tr%*wgTuD5<#WUp$cdNOara(f3A~w
z=*0voF5oJ0hzD_*7iix487(hZ226mqZUb&c_fxXvDYASpJ9+_#SP=eulF`f%_slmL
zP47MhGBWxnxgD<ih-9_yF+lO}adis<6aq5c_U%8az-a3swrax84XUyh$0re&|NI~>
z@I)N8S#!;2{Klf20E$|S(R-kCcwHxNlY!#ZkEhSSNc&LC;%Zl_d<L<<<)2_H8a0fE
z!t~Ry{eTQw{uYw`y=McUQS!)DVGH~SBIqz6=N##uUS26|?A5<Qu6`pTANAD5T)NJ#
z4fbu(zWIe#E95&Y&ac<`);*buj<-|jejJ#BpH?@uAWi`(jNoq0nW@6aY}svwjOM^y
zN-`fVizqaX06w`L6@;sr7M<1VEE3}AJb#dxe><X7B0;}&k1pcF`aiGh_NS8%YdNP~
z`hPEMpJmbV+UQjIO$_LrJ+s^?tVY7UO3SKP?_ti@vRNFf7s8y!Uf8%lZJA}5fS8Ru
zgo}+jJ_lbcz{vFd&sS2z@A!_?ALO#Q&l~LS>SEjUXD)~@`!q~|hTYn&7ADVW^DuRT
zFbKuq+|PbRyIf7ltw36sI=l_Q=E;p$&`HB~%d8iEVc12Z=$7Pl-)ug@@t&Psm8E8B
zlwvNSlDO#)G$QWi#A%^6Tao(Z+IUzn84tmdq*J`Rr*8pGLCc^Wl{hJb5l>Q~S%dWP
z%Z*{+_&cUj9dA6t*V5z=FE;26)s`RsAv-o<8<eo(U#v{tn*LGR6i1~PtMCDq;kqzs
zK+`keoRXpESeV48`YGh`?yG+>x~G+_6f{uO2z`ROS(xfqSEMDQ=z{Elrmr^*xJ^g_
z$nV`b&)z)K3(=|xt+YY3ng4i#W{SYSa8dor27p;h&tw8t?)hzLj7!MTrMQ1rk_DQ%
zd!(+HJU*&_n%nrRe(XbL+(r0<kQ-Oiy2qQcoe8hZrahFnEyurP-v^8WucKd5C47_N
z@)IjTI*mo@`MP^oFV~8-YACurlinNvYv=w_Kg&-3vo>QOBZ{{&<JxyS?b`pl@%+f{
z{bpPfI3VM@(_~;lK3O%NiO<H@g`e5iMOy5zM_&CyPovUp<-Xfs?g}Mg$UY6@Xl8Z*
z-)m|B^>0i|m@$Cy)D>$s^dYQx!mu_ultSkVS{@Arev3xw_N}gwobR7-qtw=WrRK!H
z+`mwoTX3x6PAF3|S<xqmcT#ir_rgTdwETwq%7>hveD<5m)k_zWLeL=ve%O(^FLp;K
zS82+g*=|gQ5>fA_8eWc8a05hpEKJSHA1BOB=tcBe4)njer)X?5oay0Bs>kf7*3|Uu
zA^>y+NPn^~kSdqGC$6cAsP;G&<cV+e{#bNOz91Cu#R3X~X`FBBW<nr!x`p6dHw(P^
zOZd|jSB{2tgSi7?lo@bBn8H3jkNwdzfBJ}TpF0)s9`$y!z(Ovc+vuz7NoK_?#S(gE
zpIUamRET-RdA}#;B)EqAYsrMumTKvmWll-PGtm?wULMqQHF?*pqS;^a9Y>y-(UWIa
z`p9M8`?=8q-WxWRT-AD#bU#*B^yNy)w+5V-@ySR`>#<a_#d3`hBdvE4oXuicZ=r5|
z*`6c=?Q1tJL_*tHQYUeWatN~ay7p<WYg%^2tNqqVBfhB^@(q^9;beaE;MKzQqcp$0
z={rXzcXrL-w)euoPvjsq+jS5`w{f}j9ty*`oyX>HCiB3~1W<6q+`nJpxWN@D5eiTM
zxN=s#zU1a6c09B(TQXkLEFba8-yzr~(K8oq09S>Kgu-u6kSz*iij*ea+Oo4P<qHGf
z&$${+*+t&HS8ehFqg`~Vs(Kao*2fk){^ArBi}qPQVL}wB!UhX0RZprJb*NC5a)1J5
zSo2INMcKs70Vta=^5}pc248X=RFK1vH&$a0UoLFIfF((9I@m08simvbh?4=N;GXSr
zY{BQW>ZZb9dO0#&BLz&GJkSCwD{PBCtK8jCiz}i$cKdIg-}5T95>|ZOn1wY6q0N=X
zAIwnRyKgltpv8;QxXMLd7V<q}CtkUVg>73((Q(<8uE}cs%(^4r(y{U&HpLG=W>4Ig
zoy`n1y{{aisQRa=M*aQtg~;bj8TB3^3!>H@wNeNsnmcss_RM8rhMSQtR{j{J_ww;H
zm1eZ4pJ+Pcm3LK1{bBe1%~=!N$nT(i)B5<4eOy8BYk0k2lT$#3-(Ky15yo!A&Ujbl
z$<QTeb9^b9Wvm+t?w3D`+0h5k=wtv9H5C;@#OZ)nZHJ@-VB?-QbAeg$Z3Qq$aJM)9
zaR7;7y1%_3dBIq~u5J^?7IP4cBtc?swpbYxX-Er)JX$7?asB0T0!#h0U87wNA}){G
z9Vp}ALo1ZL(oy#yYrXCsyREkJx`{XzK!kW%sr*skg5ihbTO1<gWnyWIS7w`m_qvj!
z5QbliqxH#Pk%$YuQVdXgldL%+9`Q&o!eSGRj=xmNfi2=L-*@`4O)F_@$<TH)!J#^U
z>z#AZiKf@Pcl9yQ(vNxRF{(=t@w_;(fa{Q+htQe*6KZ#83=d{4Hhv_(bAEM6Rqew;
zEK)GW$>b6@`sR7GY~_*3K<pQ=({QOqzg>~Q3McJ7M)};E$GDL%`8TvXe?FwrDd+LS
ze6(yCrD=dOzC+J)-JgWD2oC!M>`Bg{;QgX)+ZynT!TqeW`p)BBBg=rrJ%+U~u>+I#
zq?xr5S2}q16_$&OHDw;NzEW%oabQQr9-#LrJMy5N<~=OO6$&;aeg5p(M@)QsY-p^L
z>BN<<Z}4Yq8p2t#TK#hndUB4F*#Enw?}=pwzS$!R&%QLsfEcoQXn#UXDE!r~zj#O$
z2bDCo^k5P9ZVH3Xk?U{@eDiF5ZgjE7rQy@pdAM`Jb|3zlc7a*tio>|f@jmKzn1zwN
zdZ=njt8?i~h6A$#DdQnr?U;?-f7{{4*A-v?mL@oWGT?w$PoMF}1(H-O+dQ27O{l>~
z5Ikww1gIqLqmYWeLB4qMLzzaGIH}@DWoz|-OEj)ODYCPUb(G^?12EQQWAh1_r5xU8
ztT7T?I;`t=5s;nySYSp`kosaT!6Fd^-x~twGSUYsO>J=Q7#RdfHW1rvPc*6_3j2>G
zG+cG5qM!fmI$TL%Wt3*W>eQwQ2cJzn?C>8}4h+&NenmDIr6_;I2)||nmeH0go0*|}
z@6fbAN@}}HPCSIRl@Gbo1Bve#La&`gJ*x^0(SDz!D8c-1s>#S7MO+iWNcAYCMNrX{
zk$$9$jGbHaX{rp)n!7aGX7{EOdA1xhWbkjtWta`X5i<)nCYen<^@>PyuYlR0{h7;b
zqQvXJyv`Mp-su*fgXD1L5kuC2G}HdSM{wIf=(IQzf7`qto$h2>h|iK7EfZuuV8v8_
zJYnYLm-!K&_{m{7r-6ou89mk@loK21G5#gg%oHs$4!9q)u-VpJ7}YhQI}KldmLH(g
zCkXw`%C5rHs?xc1so~n0K|sMdhCi^Akr|cqDO;A#e;alZ?)pIuN&IW!-hEw(87plI
zM8bp&sc|B-pJB9hbXOIbJf#C@8%q%{8PVaz*H-<<e!o+lYAT0P)4qa(&YT8bsjxdR
zUD-6$Mgr-~65-B3P7W><4eJm;{rT<H-1{XO7n<+9wVgvVKYFg-?XR780%(?GLD%us
zV^ekU=^v6_Cep2EVFTjztmx%&m^HdNNll@#taV37pH}}WB2pPb6DI{EGQk#DU+gBa
zAT(srxqHzp?iOa6M(;q><Sn#hIEM*~JH)wLuc%JS2go9Tlg+sMPNltMP3KEb%&k-&
zdd=7167q9AygA$!emHgVOmlA*_FMr1>_`;QVN|K-`UiHiC52EC7nG5cUEC_WtgRo~
zaprQ{3w1`EeOkWJun#y){@au-5EY0x&CO<w4C(~K*i^nse)&V;@~fysYo29egfH3V
zM3x-71hIS+(xe%~hZ(^OR{#nuZnE#kased(GIdXI#v*c;d~SCcjrP1*s&9EqFV~M~
zGc|uB;59|M&z;zoV@30Bln30-4?Q!V%o#NPaOa~j&J$eCinenPs9hB^tt1;M*U7k=
z8m(9@9hdB_S!%hh*<N;@P0tULNfEQ>*4!mxkfdw=JjY08WnmKk=HPv(!ljoHwgNWB
zv37ON(yhqh&Mk(%(_cGeeC>zF)<;6Tj^|3hK>G(^3*yEro7WIUh_|$~C~rmoASk<!
zZjVE#(b83XCLH$x>2%c1J!M5$TIx3~{Q9Rzd+XZ*Y5Mu*1!he578-D{N%3MW682V)
zqq$IiV{=83OVBSk0zV?`{p2G&7thM-GkR}Q*k<@rw&0B7xCnE|U!kXTMYkfcNjo_R
zo{x1Y0+~?{VV3YwEXO35)#^Gf891(33H_!xjXNdzhG=;jq*hGXF9h2FPHqOXmIQx7
z54B(l{p9IB#|K%nLzBsJVWf-8QZ35vKJFP{>9<_9x0>P7KP>%Yg2yqnI`2&Aq{+_q
zy#F}4Z>o}g0x`K}`H}YP%K@4qaDOMZ*MIl>4wCJC^-Z|JNpd|xY_Y;gR?ct!_(_XV
zV1W0(WMQqdL~O#dVC2p9K<Q^4fP5u?9`Yg|NhoI1CvJ+2rQElnP883$bCRtxQ+01&
z@!Ru{xs2b(0;Rohl!Icis5mL~*cZ(4&q#8YL77g+H_l_ITW4~>!4&TP1TBtX44AZ}
z<gNRg<gW@PuWnD)hZk-Gc66iaiei4cw)XhFm?iqNP<r(GD_OOQbP*M>)o*@Dm&ep&
z1&q#zOn=d9c}(yoW(9ZB4$+3@a=~^BXv=|81!~aX;1FokS9~|FW7bKI{+LU`Tg$Qo
z^alKrIi&lzsk%9B_#1!RalT)x<g5!k@c1!+So&SBUh2y_N?R}=HQ(Q-V5`GU3XPb-
z>az(>zQX!vTpvt-AIsZV=~=28v%X`uC84OK@H>X{+ow1wrYJV?W(53MSXc+IhJ2^8
zGT+@Mq<i!rl#S|<iAm$XFR$$y%g$j+IL^BAB%?=a3@G>s@*vujxV^ny9cMsroT6`T
z4{@kq^xs327}k=jEciJ<hi;SB1(>aLr>GOp>!X#s)Ol_%N^fls@SzF0d|!)@Vwodi
z2GJ*#jx_$i<3w}f!oi)tON_%%d;BJ>gV(G3%ftQQ{d4E>MLn-yOFC32ldXkSFxGs$
zua4%R1FimXYI>0yz(z8rsiwzD)grwkxmJ6l#m0E(`it)~DqA3ul{x}u$Y&8u3qr8G
zy3!djIdK@mjY%nBNdYE;t4|>G8{DqTXc7Q>NnG2qeio!`_zVbRc{T6F40v%73^Dp{
z(59ArJf$b^o?JUNgWo@9{A4r6e^5CuS8wLQ)yjnH68)c=$nTP6<vj7Wp)f6nt(9lr
zV|oQ9x7SCl`TVcWFAdjM?fT^;{Puf>m6Zar`ZRpab9`(ashh_pa;}+kd19)|C}a|y
zVh%LrE&Vch{)`-{Wnu%0)6}LbtDkn~Ei{b-qt>7sNvc2D=66<>c9lT)W3)9oF!>8+
zvGcs&Z&gekT6QKA7P5ZxuLz<HD2WUmalkV)943fb__)-?n>ohjq96h-{M~t(+a9g#
z5m2t$bI51<jENb!o3h`kvHT44F{Yo_YD0*<r5hA_)!YBf2<0;Q+1cLk<qY)YJ)s=!
z+-u(N`1V4;e^69d=%J*UovA%I0#oG}ZBMy;QAL*>fF_^+?FdP$1Hf?$@IH65>Cx1y
z!9K}qkOzCGl+zpN;@otzfISXbXFFf9CGs0q^{kUufAa=EN|J3Oz^D4v`1K;yc6?q@
zO%+3h4q!q*w0!Unbg4|PP;-&yAKbUlQH^0b-w>k8hd&R?%k`S|bbBg&fv#CtZza{!
zTyTHansE<XV|Uh$u9TiF0KjdN0)Bc1J<Nfzv4YA3SZ5U}0314Asn|{I*&_Dm=gLrE
zGxXDl%9MOn)m%UwU3*0xOQhYw+MEvQA=z2W98tg%<TpsVl83;3#cu06oZdf@e@u0j
z^PdBMYWiad8Nw>fcOog3-wintoa<z>IGO}7yjX=(LsxJc7b;Xjhd8FqLgRJ;R_<h2
z?VLc&NJjK^kstRx$e?{Ol?JOpD?s}9`DhsD-yf~){k=p;Z}aqAvM5-Zz(fCkq}$vH
zw|V8DH^3IWz~?Fqi}u8mI0Yb|e<$e-Q`O~*Z0USCk9R(9%i$Jvq%$}(w7p~wQ0s6*
z{D|eD1bdczx#fz}g47FUp837DVN5Uva$0DGXGDT9s^e*K5a;9KxnK5L@cE4|4(_;*
zy89$Lo`@@5@^!rtDg=J%)Lvlv_cSDRU`eH(XKi`fxjCGSfF{fRU7AcoivgR}lLzAC
z?F+@{PP_pDcG7y~tZx|RS6}1=l#1PC986=G5m4}UJb?)6JwXU#2$#s&fg$T0A`*y?
zG*I`PaZmlNWbZ$u7l?E+oAcK#+E~^V1xsRq>GATAFSd9QJ8?f)rXOE`Uc7q6VB)(Y
z5v{EG#=;(V_N2t~pE_s6yZf!6U0aYT<XUbRe4>`po{7?g@_9)C=&Kb$3aETKXm>um
zLNb-CN%eO0_X!shx_AHII88Xbp_(lu1$1ql@*|mVPTZwhx$Aq)(B8vQGakG$!odic
zh#xSDh+bRN9LLQ^&iY0YFZax!D93Te2^r&jOQTVd58&^^9({dJcsc2E1V@W5fp0v6
zdGn?C>?`PQziwKfN93O|D%M_f3G30l)y3snk4CAPgiY={8&>A39Z<K;e4+|&RCX+o
zw|s_KY~M>+F!^;rdK6V>qOa5#7ZtO>x$`=*dmRhn7Y~B!ogKUgGr7s51VeG;r6ybv
z5GWzdZ(s;h8(;{iypY%*2`mlX^tJ2HCYv^FIw^KaSK7+-oPFy(oV{aGJF)j<mU&KB
z{vf1*&wfMOne}34{L<Lvv03V!gx`df(@vj)uqo_Cj~2h#0ZwNJX;=tHM?CZopd4?a
z2K&_pAd+P%4OXkI3Y9f+GDTimJdkVT2F~|e#HoTx1}VMI<E7&+vxT}CI1j^NZyfB%
zh*C=u4Qub$*W}68AD<qUDSfy`Hh=X0?lxv8-1P+4j$>6Qsj-&r)fC9i6hK$yP7YaI
z)Fwoz;8mlM$F9K~1evyff1Pa-S)34bgHe__nmMfWC_d9t%Wm^Gog%)_uB5&I@ZPih
z67b<aiq<8&CbbZyWVe*^&6hhL-xDfxY(P@hr)6jU3b<E9dxaa5J~2`VEj(=HTw}ba
z17Cpj5=~#O3VX)AFkc-x)O&11TX%MtI6yo0moj+aJ%iM2R^V^D@euY{>6TpwQ#5?j
z=FGGSKFzQj?Ro_T*r)dX#0zY0h@}KGKKWpiLKbO>?7*mJ-YKqg_k5D-w|`kMm%X#g
z{CYLD>N?u=iml9j|Gm^KZNPpoi(y+&9P(;uEMo1?qi8Jwxl3%ohVBc%7aKsDiZA7)
z1lr;o;6__IA$nWd!;@!qsozo?Rx?L{K)6c8+gOwIxxRiHkuFl}hThr_o#fr=WVujc
z!3;2G5qb%iRjWAXoe7z^`!Oo&91lOsMkKr1MKJQ5DX|TXzFAXN7Hn#GQzK7yuhC<%
zZGZZ&Q;5x)06|>;^N<#gjhZ<ey_5=$`#i(=s(7rfzsG^Dcs?{bq`u3#kGmE{2q}vc
z2#$NzrTZdFO(-2ARBDvzCe8IAP<CT<Toxo)6QF5$T>8Ag-0hH$!7AkJfgR+jsE0w<
zYbrwX$UItx6?$*4w<Gza*I|&Ze!^6y$5ef?=Uu|Ti&O`R<&?8xn26d2`?EwwC8r{n
z@e??LZ35Bq0duT+Fm2&_9KQrS;1JZL=M%hh><{qfpt7KUPvj}<pn0^(HXde`o)^Rc
z_A00*XI=5e)2uEBp@r)Al`+@`+*ripal#tMaHPkE*dFNU0L!#Z=OEdmx3J+)=p<qA
zB%wM;*f`)gA?d$+4f{Q=Ll5w5&@^vNj}0;M<<cs|P#b`AmWXB8KjJbc^vL%o?`h2f
z!)Tq+6U=754L91Kva;t@8Cl+Sv4ZhnEJP<icyBy_WMA0WSW)<?eIhEH&k+7kTn*W=
z4MH!hBl~}ZrhQWRVu4ds@X7%%1a+La^4q)!kyX}OD)^nCw8@C2MKthaHmVBn>NYGO
z=>b=$vtFd04tkUWksX&(KSG|z|6=G%qSq5PpFcQk<PC_%N3XbO9zkp}t2MmN^e@BD
zA3h7i3l+zMoXuI<rit0ToN(HcVb%5JH2cj0ER<($8(hLA*P>t>m?Zwp`kmlh{#@b2
zsqgX{@2Z?AP`H}|<2!ZWgk9*as82Wh+xr=;bn~C&3-I++4&E0Ad1DUZkc>B&PI%ZJ
zmhW!;TsV1t%Qcv(yHoMBe7|n)x-ZMU&(-grRcd;zJ<laHyK4EYPXug1llZHyAK~u)
z+l8CoOityU-o6vHjOj$#Vuv7#Y$2GuAAY(0a{(LDeANO=lU0#Qg_NgKf|JiKuH^g$
z-{3=0&SolX()D06`GH9f3$})y*w`fpH8-p}p8Ct{K(+~(g_<eel?jbicT{xC#89H1
zgs-~vZbYzIY|8V;q`>cifjR&K4i4fUD;|bV=3alI7BHzWz)PukF1-zP8L(^ZemeTj
zGWnj8!-v$AP!iAuqs!{oLwQ>6GENV)Y~(-kn$|!|Xv9n_&OZr*0y0iDT<VdoCA>!;
z=tI(QnbYGMUp#o9|0E<j?$1Tx!F<$u$Wva=IHCTvc`^-BOcI{#nF9SpN8lXGWVj})
zOROjEQ_3}Awl-_j**aENgv9EjN<<R;dP8?15T9B2E$&LClwyeu+n``S&o$@Za|p#M
zkJcaJynbw=BCo&8@!dA+{B+9clgNz1+4L|~^fb*>rWLQ@vQbfY^*TISpu|{%#s8Q8
zz=G(r2w+edtWn{7IqAKxhALs&lQHSRWJs#dd3-hAI<T;sE0?HmIZSoK)uFg!GS1ZI
z?b*QZS$K(<H5(rNJnyh-pjYy}NWI$i?(|_5e`Diy`?rvM#<Lqe{sRd#fP942<!W=Z
z_u?0i=t75{p&k#6?`gsGm9WVY*8-?Z9yNO$sHj|^r}(f9K^rX85Fz4n%*4h+9P}7*
zZ{=HD!W(0{%Zw}Um|(anW8_WST26loDeOy?3NnM%tk3%cW+3&&PwP<KfWxGP=S>$W
zq2~=tv2s0%fossGfpJ!V-m~wJMEr{l|F2O0_bg}Mb2)|`Mec}f(X-a3Z!lctfVTc}
z=Vk%eH&yvJM3;iXG%?X_hW?K-RK$ti?AI(t@%7lUNHk*`=5iigTXRdckf;!&Mf91=
z@TV~(fv8Yv^Oyu?GvnTtO;_2BC;0oP#IuyYbxim3&wi;Ph+(eGOKob=)S>4`Yg3HN
zt2A9@#ZOLDKU~Os%r*iVc(>z;d8v$FPui8*kyfR}GRM+v5T5Mw1FxK~6mLq(8f;Bc
z#X5F;uAyE9MrlV00A%))#wD`{w?;!PVjYeteqZQOX3dC`1P$i$_$IGnjMw6c)uEKC
z(C445UNz6<e<E*=vRm`9i{_DJ(#Y@V9)tzBEPA@@h8SEDhf6h+$AH&7I{WUgz_VUG
zp~u%7c3g*|%!zS)jVKkIn%DC+Ygs?r85~LCo3To$+;9=d4K_2(uy6T#ozaZzLMqu^
z*cxTFG4!?4Sj@r|!IsH)EMrk5&j8hi!Z(1f0I9Qtvzi6(cjRL8LJ>V(@mB#dm|>P(
zqP6g(qq$kLbt0D4m8zy|2N=44DGu(~HgwuevLca3iq(gz>_q#Y&Db=#z&Xw{NUbil
zX5{`eyM3;uqC-zi0o444(8j}*(h!!L{qO}F_+>h7dCkVRPemmCNeOrw=A1G8z&1`i
zH_8UlqoC1KF+#1F&~q%N_YIZ=F9kFzEG+EVocsGkP)E;M{w2kdDo2d@1jUQMC0=~J
z=JJlt%vsZJCD8>|0Orue0$?l^aM(kI14slyK->4z@FBp}Fj-A-CPVkIhNAP$7p5x~
zzx`(PW$O)<y=^-djfvE$l$w*&r^B>nh|GC?-Pci>sQ3q^P>fK#5M{jXLhB{+hP}n3
zph634A3KXL5dkb_?^xK@d2PyV-qbEdAF$c<9_Q}+`5|?CjZ<zZ$I-Z!o)7pFd+`4>
zN4Yw8U+y(E`3u&r=jw6$pJQuNi~+}ASYBiRbII9W7Gq(n62O&(JRtXKnm9eRArf1?
z1Mvx%HcopO;pBPZ+{6UdI5TIJi>P<tMo3p5^dKPf)*Z3$F?A}$^VIm}h=7ZQewLHO
z0$+{DcgbsD%#+SHt5;YkrTJ3jzr)G3&TOIE*}*h1m&_Wg-Rgdp^DYe^gALcorPuGu
zrtXZ-wsIUwsDzfyfAdMUX#7$;x6Aub#wM?7qpL)b2W)R3j;;{`gUD6^FNL~+5qno2
zR19|nP#4AR<BW<u4|44&WY7ZSXi8hK^*@cG1<?8j01a0&mb_+6_VkdK*Bxf$q)>ff
zUXqAUjVPqgEiFFFrWLjc2#{_SN9rL5CluT>9!13$LyLl<g-Lx0FO<De8FAz3v(%jj
zHLm+`Y*l@y-gUSkg<;qAIO)Lh1lcPwo@UOoWaZaNaIvv;>f7!ekmRSc13ORZLD*y8
z62e~8BQ)FJ``@7NPs2Le)B<zoNLA9--+yRgjm;G=<nXfL=rEXIewGi`kJ{)wmoVx~
zBmG1W14(}f*u;v95n|PYBuL|XPy#qK=d3tkmO2f40KwfjO+V<RN)KQ*eH-kMP>^&0
z^vQ^(1k>gWN*a~p_cRN~`W<td;Jdp#GWIL0nopx8@{8lz&lf_$RmBq7E~2*PG=t%;
zw7MHd3{4-dbgZ%XNURF$zRZ04TI=HU6k9wgju!~lak8V;uq`Mp0j8%QQjGMDboI=z
znJv5Oum+?6QNBGQofCbbhP`6rP%x<%V}AcFN;c(L*B%`pS~Z*&avXXiJD&NB<4gEW
z25^@25URoafW}g}M2R?u7hwzf+`@PfzSTp5dJt50Xs04#15O95Qv9K-y*cWW>(3>B
zW?&E_yU)@!ktkX>cNaD7diK{z^M>`cME#{8V(2j9Z)r^jkm8d~GVPvvia!JJ%yM6J
z%d84q%V&XbAj+ciKAs@51yRVpKAtj)%spPZ(?&Q_agaul`6I0zGWIRsY=85w;na!%
zI2~nMcohlx^57ASP_u`Cf}8B?!xwEgJtP_MxeSKwtK+Y8{(qL{b@?NI_R0RuCC|kw
zg=7eCKQz|LdX1;vdiUqvR1?*?unf_7QTyY+PssXk-1zk@e9KRB!=H<nQE|TQwM{A_
zW}=J%MT=l0HOpZG(9(&)QcW~`d>fdC6<09TzvVN;PJoZt39=NW6yc<aF;g={2fQ#A
z^N%7tuNfzv#dpzggVf*glisuiV#1La{H3e%-&*K7$ESxj-2a`fw498%>UM}uGzH4m
z-vgD2JD@CkF@M(L_uOEeP%;a^9{?}dgTGBUPBSQ?mCiMTHpkC&H?&I4WOL0|e6!~|
z;DT{bHv-rQ^x^;+4p<gMOyb0W^7J(!KG6~g|750;<@q?bwL1zD6;U~iVxOf%F8-I8
z!Ck6NMdGJcgIorX8o3C?^jR5BEj`?UNoGV}k;+Z=6Zh4~j&HTw+1Y8k#q^?IIW<`A
zh87UO2w`|eqVu*OY<JXCAHb8tLlR+g`_3?pUw^q3no=1A=8nmg!^v!kI$A#TRll&v
zA=C%xM^piiKuS<<Jih?Ce!bYy=Iq{Z;p5l%%6ZZRn79wz_cuDt?y@C#V!m4cfW3g6
z_FR(jOKh+Dsi1DQIIk31T-sr8`x7@u9|TFP8EXfjVFDH^g(3}BAnpb<V_|K48E!*b
zmG4DrT{yswVu1PhG^x#H`V$LO#4-*js9v$^gLm@H=QzO?l$W^)q;-3(ro&Kv3@}Tq
z83_ZV(PWW8z^g$r5tLv>jdWUB*Vx~1W-92{0>k)8Vj7GXi=W>k6|MEx7LAFr9*uW<
zE^{wyM_eIR$ZR9iT4c*XDVTxFZ=;0A;(fn{ta&htLK~V=SKho0V8pd(4WeAI2=ii`
zZ?0h8@&+VEgrXMo93LbH%;EX=Ft#;km$z}!n85q-LV$D0fdI4#AeW#84JMNjna3e_
z#Zf9NUi-T>H&_)6FTc0W;+VaCC7m(Pf`q4mA74$!cElU$TAOLyy-m4~pO-IC)+k&%
zjp!61TNE7v!5?rcs3A`p-l?2Lnhf6T;o)DlJoAp~ry4ud#L4p6y#Iuzj2l=I>U<vJ
zfeX?91bEcOV|K&CQfhgRv?jVWKU-7K$N^YzA>Ven<Ek2v1%`n7zEXlmz^tf`fH;t+
z&})-L_qMIsStopCE{}2q>sNHyyXxlbHI@{}l6tqwrbAYx)Q9DlqTs1dcbSCQZH{p~
zaWLsLP7muN-e}QiZX%gzI_mlA@8*zBHj=ZjQF#VRAhBVOFB5nZtz#GwI@8IB|1JWk
zI#BQcz)^i&>c4|kX8ciF>F}P8xA0z7+F8Yo!pm<#2Qnh~Ic$R{luP{XLnKp=ci3VZ
zs*~QiET)q_;vrtWylX}ADdW{YIB=2hK9LDQfK!-{Vgr)}>$Jy|!{-OSx@8HJ!0&=h
zSBE7oPCsr{W}5PDjP0<EOw7h>v%gq~^Z(-4i?AdA-17N+@vk?Uuq^y1X9C%bp!rjm
z@<TOe17d3&*=9_-5KW3NP1?iCWUD7^c>tVCnVnz5F>+Rl#@c{_LspSFb`oku>I*Tw
z{rn>C&860AEQvEgAe}Oc8^T?}PKR!6;{=ZYyL_2UxV>e_^0kTarIGex@%6-ON9#*1
z6Y)0#WCqMMM$Sr@t2H=_4*dGqd?&77_prh4u2Q-E@yxwVV}<BntZYlr^#U1E`mXw8
z4OlzBd^vlx7d6)dgaK%X=1sj<QYuR?Be}EQ<N%aVuK_6pu<%|$hb>M%unQo_zrk9q
zxq;K}+(Ty>;d-;aJBT%jM{_3d@7XvgIVH4+rH2LJl8_=1Xdw&@g<3?mF^Y8h31isP
zD2Kk^UiIG+cd{Vsjn8U}5n(q2yOocfcCMGPfn<NgR!N;8E1*)s<<EJ*UC?ms+nosV
z&jeE2DDuBKSk6J8$*g5i24J<)rtrxHQ73l=aY+Gsoj$(VrorQ>HL{0ZXo7NYpB7=L
z1q$Lmp^q)jmaG>iR7i^e!io3a!5Nc>x4M_h=WI4re+N?`xtg1*8i%+dMcA0kGO#h7
zJFuUM0ytL&$;GJyN{Y601Zii-7sZ!K3GWuqJj%9Y)VXq*b5^=KL*AaD66_C6Ui(yf
z&Rt*^!CAu@yW%@E<kps29G%_%VXivFd|+i>YLSU)fwXO{nFiT+Ajj0hiWiZO+Wkld
z3wn+A6m(m901+tB&sv^Kr{}$PRZyTl{v|gupFQh&fv~XG)X$(r<KYHHIk{b{+O{#T
zY?<ErV76`Gc1LP8n0i<FiRBaEf3!hu<v@;>J@!3t8@;N?(Mse5ZO=+^)vRrPJ#$l*
zAGetBz@0@3>@#}osA*#+O)WDyzeJGJB@XqO3BEpUJng6f!xT*qwgA0}oDXB5FhW*9
zupxuv5GOc!NuDJ;+KUOiA$0;UJhuDQc7MNIMg%RD2$UxbIR9%JhVA1uwi-Hftp}WU
zQ8G~Ow%SbJuT8V7m#>EFa17!dg*NmAT<+M+^;~1KiFC7A0pR^RHCH<wd|xEqJKZGF
zy45>3TjFi%`BA@o{VO+h>mz~M4foDCf2G;M8qCSy&D5HhKk35hf=2@3ymHsXzfV|?
z*9I`tc=rid0v5k+ATx<|LbJHzloMy5*PZ;+vz#u}EEMM}%PBF(`EVw|nq`WmD+2Ws
z?DLz9Ks^*{7&&}w&2gbEPh1b&tzN=1MDJ?(kM0)SYajKb3y(rNgi)+76GPp+MHL?>
zzFgyy5xXkr?v_Qsq8?V92}Rl;u5%r<q*Q|c{_a6={;XxdEs)3CUkI+QXGH^!ZzjNg
z*i4-`Xda74JNbLsTU|~>Tey72)er!o9HH`RGb3`{YdHI;&+S-2>-8><r|yeyf{D|+
zj2?v3ot<vzeeljPH4BJk{kHXPg)ulnvago8ThAKL8un}Y{rf7tXP(j+cI?1*q|#h5
z>8-_<s+zRc2t0OWWlBpKU1d?|+Tt*<YwK&{An^@=e?IPprQ|k>uLQD|>6z6_WH~Pe
z{S)55X4dpJyGG5uwZW{<fA?S)rFkMB?7(+F`wqd#(T{eN0F&N0H*jY-D`T)HJSVJ2
z0Iv3BGC0$Lk3Ck@K9q})0kn-o(cb(E<Z|%O@on1{5sHlL%Bx}C*oI7P;j;4HXeKm^
zl-%)zOHlLC`{k-&q=rU#jltq8<U%OTWWzU8mt1R(z1!2t+1cL-8B=*XnXY&qeGSh&
z`)YK26bP}7D!Fa;D592JKPloO3^;+sIuBEwlenCgqyc1E7>*WlTD+&{F?XtW)=d49
zt+ap4@5g-v*m-k+aVCJr3p!dTg;QK8LC^Oeq>DMSiLM0YLG23aNS0enr}i-!#`nRV
z%|-bdL#)%}=_Q3z+!6b~QZLiQxw?-3mZS?yl0LRq6##TKlnKj`E(s@nb6w<^-^+|o
zl}*%`KnyjG?@zbMdOHNj#VBA;5DQtpyCToE^T1d<FgDp_hQ+_|wo$oow6AYSf%iLW
zvwWkQ=dfw6U1uyV=apoKEWMIMfay?Y5+k=*{P(*p`l$Ey<9;0pa38Rf2C3Rs=XzMP
z?}Va6y)NP9<!!l~JyX0K85^4%tKQW|ys=Q0UTTUp04&LH05%>2N<F35(nJPWIPOh&
z!P*gfy7a~K@QkCRU%!3-JAcidA*+}acHjQ}ARtx(-G!D}$eH8;@)7_)20g^46#+Ro
znp#ls%F2rMXC2K&Y&^_2?LKZS=`?QS`u@B>8yh`YgV8sn*iQCBEZk*jQwwRP_{c`5
z`U7>b9Lp-L$Ji<&$)D^QKRgO{^vo4-VkV5m6a&YKhZ(+^!J=k(uU4g+2r$M=;N79X
z83)G;Q4{pN{nbk%Q%ulzfKh5+04`Mjj-aOn5ys(g6PUBIvicSr4n^D}6PXV|>p{WC
zWQ)OWN4-+7E>+LjU3w!K<Pxs5tj(nUi;=aRu6E_^@d2v;uqkd|G;fA$h!H1`PKM@0
zCYE_xemM=8uP!u~*E&2h37f%84950GLC$*WDZC=EsaPtQyB0T|C~i0D&4?$qceWv#
zWmn)2c87|c>sXt4-W5hCGWTvA<vsBE8zb>F2Iv7xw^g1ug6{nK04(XMjM4!MHQ^s3
zq^2&X!ioMhsi|q6pdOukpF@eCDnt_<pZELcPbhQ9Js^V?xkFpGYQ$};?_9lSO9e-*
z)HI8)O;ccRZJn#9O(UhWO<zw2<=Ly4Z%ZZkVtt>@{(kX^2`n{j`X&vw7kQoiPO$(0
zz}wbRRWkS(x)I!EWGIP;@xPMn%M|BHhMQzOFE#m^?PzbG=lZKS>FmXGkPPJjpzB5L
z>CQ9I(@?#J#*<FtfQZ9oo>a&85r6l#SLgPXwQe8iAI01Zgde(s_Ty{f?{#FukveZ<
zT4}41w*iDh!Je#RBm=W^{$D0(8Y35N2It1co<%uV*VLpJl02>)_3#l0{01EWyoutl
z??gaC7K?wp_8HWi)O(hA@P2M4ZmrdswDH$(+krBwqpK@<ZAxfjS|(l*9i$&&Km58E
zfCt<wq`%h;ONP(lSgcX1#i`0B2k_mlAHi;QacTM~s^6pV&<<{XT!GA#5YGNt@9E{X
ze2vHiiu6pvgZj?i^{FjnKH~<w1CRx*q)9Bmw>0zdku7G45bs##M$;l{!jzij&KVr1
zrlzoP0=4qGx=)+)+Usk%eTDUf<Nn~Ca1Om-6dr;^^H~C-6w<bSyuUB3Hq8!VIa;p%
zb9Q$2lk{QCyS!T~D^i;nV4}D=L|7Z45Cc5jhkNbA=<s<?BP38k+>P)D+2VD+zP^88
zkjjQc<CBI11rL?7A0a#Os@fxvg2cnx`#9o&k8A`}gSH@k)z>2oU!GFjQ}8)1fSLI7
ze8>O}cIqXIROB~R-_i{x)x__F-irY$Q&3i<wSP(;FEz~%QFofH*CNK3Ih_0V@BY}>
z*y9n^AnxBv0a}n7F*h7y9C0_cpUpMoh=T+~I7K(8{q%=Bu;xVJ;>oiM;hX{F^aUA<
z;-k*P!*`;1`^58`ozjMehMEL1`)Ou4W{4Bz^I@*o!bkB;bHqz4OPz6Vqk@<qkDeNG
z47>*U06y|FO^wlmc%b)an(g$QW1Oer)At2|t)an>B`7p4ug;oY5JigZAFllUWt~gY
z4Ntx_cCfncz{-RkobF7$j+wu4ipF0PlvH%$0xAG{lcS4m>455~Jjm%8p4;K!Ve#Ak
z>xxA?x)0)uf^oGAfy&k+3NkY3v1Sx!OSKGZdN;4JjqX|KT!aYm@LA&`9rkx;>K3sH
z)b@!9!^u!Y1;>M$fUCZrI3E!Ra1O!aGLm90)MJ&;{<k*@oTD-_GFq%rfsdaB6o(xn
z*IH?nzqgPmP7|rk@Tor%VPs*s74OE7&UNs>Y6g0HwY$YNd0sz$@blZ!_BOv7oI|Zb
z4W!L6e-A<V)#B4j6_`3^uF?M?>pQI<mzaa4JcQxDKE~=54^YFin=0gWQ$DQ&K0f~a
zIba>-{yfy;Bed=4=tx0IZ}ZonEJXpu09cQ6@iz`00(HH&w6t`;QIV*tq@*P3>({Rq
zj*h!)Eoh*)9gm{S8zBO`uAZJpvY?41Zr#V<RFjdk-UmI{DJmn#5KlPF<z8yTPJjKZ
zU`myIur%A?i#xc_->Q_e9f{drP!W-kNU7xiut%n$Yj5=m@&@orZRpgMhd^A(dm!vh
zXO6~{HZ@yJE>8YD^_jxM9Id;h$w4wT^b+*>^<RJ<+h@)0v&(bOr-4sPFaJ?DaRLe4
zQqiW^uA5FtD4d%RKos*h^88KIA$RaxzHatVl;>m!xmf}VCoCJgTlfDCFl**;<Fo)+
zxq;@9k@|>1z&e4WDQ{pS)*h|Py@T@vu2PzjvxOFfjE|4EZ*CTTcK9SD`HCkn7W-G-
zLjHDBbY%nB{l2k5tW_*N+4N_9^}@+p{H=)qonuW+m&S)vRgf{~{>}B}B9;=^-qxo2
zKv!N=bZGSKQj|c`tPDFHyuWj$4|DW!A{_dsZ&QU#qrQCkVu7^`p*5U%H*qZR$FB}Z
z4LM+e3*2TL=!MkJ3h{_>NLx$O-@kv8-#c4cvTNhUr#t)mH_$RM30T|MymuUzSZve;
z1+$wmqMFOk{!+Z*2g(C`&c!CVA1o{`S{z`zPb3$083Nu@eIiLhXy6?GZzgZ>ffbY;
z8E~!0G(y;lfEqZ5=@x(6>IVD!vXG@OPavF4Z+0(E3gr%9q$bdWk%8y|-d%$x*vV4F
zqi|%|=HJc6I=^@U+TlQ$<y2_oWZv8UjdRGBgAWt(o@5~S?>dA4c9YtgS0v(az<}66
zgzg6UGd%l$gV>S}0186?Jh&SWJLh`rQ(orO^<~FiioHvL8g|oAbMa~Y?~d4^f$^#G
zJMaHDRt`0g1obklh~s}3cSU23rODwhss9`6CH7hW|GIG?67a{%gWv~0XAl7U(Nfb<
JEm1ZL`hUrto)Z87

literal 0
HcmV?d00001

diff --git a/torch/functional.py b/torch/functional.py
index ab8f70f6bffaf..78f833eaf5417 100644
--- a/torch/functional.py
+++ b/torch/functional.py
@@ -330,23 +330,54 @@ def meshgrid(*tensors: Union[Tensor, List[Tensor]]) -> Tuple[Tensor, ...]:
         return _meshgrid(*tensors)
 else:
     def meshgrid(*tensors):
-        r"""Take :math:`N` tensors, each of which can be either scalar or 1-dimensional
-        vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by
-        expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs.
+        r"""Creates grids of coordinates specified by the 1D inputs in `attr`:tensors.
+
+        This is helpful when you want to visualize data over some
+        range of inputs. See below for a plotting example.
+
+        Given :math:`N` 1D tensors :math:`T_0 \ldots T_{N-1}` as
+        inputs with corresponding sizes :math:`S_0 \ldots S_{N-1}`,
+        this creates :math:`N` N-dimensional tensors :math:`G_0 \ldots
+        G_{N-1}`, each with shape :math:`(S_0, ..., S_{N-1})` where
+        the output :math:`G_i` is constructed by expanding :math:`T_i`
+        to the result shape.
+
+        .. note::
+            0D inputs are treated equivalently to 1D inputs of a
+            single element.
+
+        .. warning::
+            `torch.meshgrid` has the same behavior as calling
+            `numpy.meshgrid(..., indexing='ij')`, and in the future
+            `torch.meshgrid` will also support the `indexing`
+            argument.
+
+            https://github.com/pytorch/pytorch/issues/50276 tracks
+            this issue with the goal of migrating to NumPy's behavior.
+
+        .. seealso::
+
+            :func:`torch.cartesian_prod` has the same effect but it
+            collects the data in a tensor of vectors.
 
         Args:
             tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be
                 treated as tensors of size :math:`(1,)` automatically
 
         Returns:
-            seq (sequence of Tensors): If the input has :math:`k` tensors of size
-            :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also have :math:`k` tensors,
-            where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`.
+            seq (sequence of Tensors): If the input has :math:`N`
+            tensors of size :math:`S_0 \ldots S_{N-1}``, then the
+            output will also have :math:`N` tensors, where each tensor
+            is of shape :math:`(S_0, ..., S_{N-1})`.
 
         Example::
 
             >>> x = torch.tensor([1, 2, 3])
             >>> y = torch.tensor([4, 5, 6])
+
+            Observe the element-wise pairings across the grid, (1, 4),
+            (1, 5), ..., (3, 6). This is the same thing as the
+            cartesian product.
             >>> grid_x, grid_y = torch.meshgrid(x, y)
             >>> grid_x
             tensor([[1, 1, 1],
@@ -356,6 +387,28 @@ def meshgrid(*tensors):
             tensor([[4, 5, 6],
                     [4, 5, 6],
                     [4, 5, 6]])
+
+            This correspondence can be seen when these grids are
+            stacked properly.
+            >>> torch.equal(torch.cat(tuple(torch.dstack([grid_x, grid_y]))),
+            ...             torch.cartesian_prod(x, y))
+            True
+
+            `torch.meshgrid` is commonly used to produce a grid for
+            plotting.
+            >>> import matplotlib.pyplot as plt
+            >>> xs = torch.linspace(-5, 5, steps=100)
+            >>> ys = torch.linspace(-5, 5, steps=100)
+            >>> x, y = torch.meshgrid(xs, ys)
+            >>> z = torch.sin(torch.sqrt(x * x + y * y))
+            >>> ax = plt.axes(projection='3d')
+            >>> ax.plot_surface(x.numpy(), y.numpy(), z.numpy())
+            <mpl_toolkits.mplot3d.art3d.Poly3DCollection object at 0x7f8f30d40100>
+            >>> plt.show()
+
+        .. image:: ../_static/img/meshgrid.png
+            :width: 512
+
         """
         return _meshgrid(*tensors)
 

From 2b303f3f315b566d1103859b34d55e4d6ee21cd7 Mon Sep 17 00:00:00 2001
From: Michael Dagitses <mikeyd@fb.com>
Date: Wed, 18 Aug 2021 04:04:43 -0700
Subject: [PATCH 013/530] enhance comparison tests for c10::optional (#62887)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62887

Reviewed By: VitalyFedyunin

Differential Revision: D30305044

Pulled By: dagitses

fbshipit-source-id: d0a3a9e4ea186915ef087543aaf81a606f943380
---
 c10/test/util/optional_test.cpp | 86 +++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/c10/test/util/optional_test.cpp b/c10/test/util/optional_test.cpp
index 1e34377282898..cac325f9188ab 100644
--- a/c10/test/util/optional_test.cpp
+++ b/c10/test/util/optional_test.cpp
@@ -1,5 +1,6 @@
 #include <c10/util/Optional.h>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
 #include <array>
@@ -8,6 +9,14 @@
 
 namespace {
 
+using testing::Eq;
+using testing::Ge;
+using testing::Gt;
+using testing::Le;
+using testing::Lt;
+using testing::Ne;
+using testing::Not;
+
 template <typename T>
 class OptionalTest : public ::testing::Test {
  public:
@@ -90,4 +99,81 @@ TYPED_TEST(OptionalTest, Initialized) {
   }
 }
 
+class SelfCompareTest : public testing::TestWithParam<c10::optional<int>> {};
+
+TEST_P(SelfCompareTest, SelfCompare) {
+  c10::optional<int> x = GetParam();
+  EXPECT_THAT(x, Eq(x));
+  EXPECT_THAT(x, Le(x));
+  EXPECT_THAT(x, Ge(x));
+  EXPECT_THAT(x, Not(Ne(x)));
+  EXPECT_THAT(x, Not(Lt(x)));
+  EXPECT_THAT(x, Not(Gt(x)));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    nullopt,
+    SelfCompareTest,
+    testing::Values(c10::nullopt));
+INSTANTIATE_TEST_CASE_P(
+    int,
+    SelfCompareTest,
+    testing::Values(c10::make_optional(2)));
+
+TEST(OptionalTest, Nullopt) {
+  c10::optional<int> x = 2;
+
+  EXPECT_THAT(c10::nullopt, Not(Eq(x)));
+  EXPECT_THAT(x, Not(Eq(c10::nullopt)));
+
+  EXPECT_THAT(x, Ne(c10::nullopt));
+  EXPECT_THAT(c10::nullopt, Ne(x));
+
+  EXPECT_THAT(x, Not(Lt(c10::nullopt)));
+  EXPECT_THAT(c10::nullopt, Lt(x));
+
+  EXPECT_THAT(x, Not(Le(c10::nullopt)));
+  EXPECT_THAT(c10::nullopt, Le(x));
+
+  EXPECT_THAT(x, Gt(c10::nullopt));
+  EXPECT_THAT(c10::nullopt, Not(Gt(x)));
+
+  EXPECT_THAT(x, Ge(c10::nullopt));
+  EXPECT_THAT(c10::nullopt, Not(Ge(x)));
+}
+
+// Ensure comparisons work...
+using CmpTestTypes = testing::Types<
+    // between two optionals
+    std::pair<c10::optional<int>, c10::optional<int>>,
+    // between an optional and a value
+    std::pair<c10::optional<int>, int>,
+    // between a value and an optional
+    std::pair<int, c10::optional<int>>>;
+template <typename T>
+class CmpTest : public testing::Test {};
+TYPED_TEST_CASE(CmpTest, CmpTestTypes);
+
+TYPED_TEST(CmpTest, Cmp) {
+  TypeParam pair = {2, 3};
+  auto x = pair.first;
+  auto y = pair.second;
+
+  EXPECT_THAT(x, Not(Eq(y)));
+
+  EXPECT_THAT(x, Ne(y));
+
+  EXPECT_THAT(x, Lt(y));
+  EXPECT_THAT(y, Not(Lt(x)));
+
+  EXPECT_THAT(x, Le(y));
+  EXPECT_THAT(y, Not(Le(x)));
+
+  EXPECT_THAT(x, Not(Gt(y)));
+  EXPECT_THAT(y, Gt(x));
+
+  EXPECT_THAT(x, Not(Ge(y)));
+  EXPECT_THAT(y, Ge(x));
+}
+
 } // namespace

From 4a390a56c4a345a917f5a4d8ee6cd47818fa7e84 Mon Sep 17 00:00:00 2001
From: CodemodService FBSourceClangFormatLinterBot <>
Date: Wed, 18 Aug 2021 04:18:47 -0700
Subject: [PATCH 014/530] [AutoAccept][Codemod][FBSourceClangFormatLinter]
 Daily `arc lint --take CLANGFORMAT`

Reviewed By: zertosh

Differential Revision: D30391472

fbshipit-source-id: d4eb1e7debea8905e7fee5f026c082bee65e78f3
---
 torch/csrc/distributed/rpc/python_functions.cpp | 9 +++++----
 torch/csrc/jit/passes/shape_analysis.cpp        | 1 -
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch/csrc/distributed/rpc/python_functions.cpp b/torch/csrc/distributed/rpc/python_functions.cpp
index 2d6533d797175..60d67c558dcae 100644
--- a/torch/csrc/distributed/rpc/python_functions.cpp
+++ b/torch/csrc/distributed/rpc/python_functions.cpp
@@ -155,9 +155,9 @@ c10::intrusive_ptr<JitFuture> toPyJitFuture(
             } catch (py::error_already_set& e) {
               py::gil_scoped_acquire acquire;
               // FIXME: this is a temporary solution to add a special-case for
-              // ValueError and TypeError, as those are already used in our tests.
-              // We should have a more comprehensive coverage for other types of
-              // exceptions as well.
+              // ValueError and TypeError, as those are already used in our
+              // tests. We should have a more comprehensive coverage for other
+              // types of exceptions as well.
               if (e.matches(PyExc_ValueError)) {
                 child->setErrorIfNeeded(
                     std::make_exception_ptr(pybind11::value_error(e.what())));
@@ -165,7 +165,8 @@ c10::intrusive_ptr<JitFuture> toPyJitFuture(
                 child->setErrorIfNeeded(
                     std::make_exception_ptr(pybind11::type_error(e.what())));
               } else {
-                // py::error_already_set requires GIL to destruct, take special care.
+                // py::error_already_set requires GIL to destruct, take special
+                // care.
                 child->setErrorIfNeeded(
                     std::make_exception_ptr(std::runtime_error(e.what())));
               }
diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp
index 3024811fef6bd..47cd30b3d43ac 100644
--- a/torch/csrc/jit/passes/shape_analysis.cpp
+++ b/torch/csrc/jit/passes/shape_analysis.cpp
@@ -41,7 +41,6 @@ bool mergeTypes(
   return changed;
 }
 
-
 namespace prim {
 using namespace ::c10::prim;
 }

From 30e1c74dc19ae2b622b46ebcdb7972c42775ac80 Mon Sep 17 00:00:00 2001
From: JackCaoG <jackcao@google.com>
Date: Wed, 18 Aug 2021 06:42:51 -0700
Subject: [PATCH 015/530] Update cuda amp to also check xla device (#63413)

Summary:
Fixes https://github.com/pytorch/xla/issues/3086. Pytorch/XLA:GPU also use cuda amp. I verified the pt/xla `test_autocast` with this fix and all test passed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63413

Reviewed By: ngimel

Differential Revision: D30380785

Pulled By: bdhirsh

fbshipit-source-id: fd1a1de7d224c616fc3fa90b80a688a21f6b1ecc
---
 torch/autocast_mode.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/autocast_mode.py b/torch/autocast_mode.py
index edf36d25745fc..ec9fdb0326d62 100644
--- a/torch/autocast_mode.py
+++ b/torch/autocast_mode.py
@@ -135,7 +135,7 @@ def __init__(self, device_type, enabled=True, **kwargs):
             self.fast_dtype = torch.get_autocast_cpu_dtype()
         else:
             raise RuntimeError('User specified autocast device_type must be \'cuda\' or \'cpu\'')
-        if not torch.cuda.is_available() and self.device == 'cuda':
+        if torch.cuda.amp.common.amp_definitely_not_available() and self.device == 'cuda':
             warnings.warn('User provided device_type of \'cuda\', but CUDA is not available. Disabling')
             enabled = False
         for key, value in kwargs.items():

From a00d5878497af2a0bb599fe939369de49256b1ea Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 18 Aug 2021 07:36:22 -0700
Subject: [PATCH 016/530] add `OpInfo` for `torch.linalg.tensorinv` (#62326)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/53739.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62326

Reviewed By: H-Huang

Differential Revision: D30136376

Pulled By: zou3519

fbshipit-source-id: 04ec9450e8866667649af401c7559b96ddc91491
---
 aten/src/ATen/native/LinearAlgebra.cpp        |  9 ++----
 .../_internal/common_methods_invocations.py   | 29 +++++++++++++++++++
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp
index bbb6fce844524..10576a0c63a49 100644
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@@ -2651,12 +2651,9 @@ Tensor linalg_tensorinv(const Tensor& self, int64_t ind) {
   shape_ind_end.insert(shape_ind_end.cend(), shape_start_ind.cbegin(), shape_start_ind.cend());
 
   // If the reshaped self is not invertible catch this error
-  Tensor result;
-  try {
-    result = at::inverse(self.reshape({prod_ind_end, prod_ind_end}));
-  } catch (...) {
-    TORCH_CHECK(false, "Failed to invert the input tensor, because it is singular.");
-  }
+  Tensor result, info;
+  std::tie(result, info) = at::linalg_inv_ex(self.reshape({prod_ind_end, prod_ind_end}), /*check_errors=*/false);
+  TORCH_CHECK(info.item<int64_t>() == 0, "Failed to invert the input tensor, because it is singular.");
 
   return result.reshape(shape_ind_end);
 }
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index b281c5e474c41..f06d3ce899749 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -4986,6 +4986,22 @@ def sample_inputs_softplus(op_info, device, dtype, requires_grad, **kwargs):
         SampleInput(make_input(low=1), kwargs=dict(threshold=1)),
     ]
 
+def sample_inputs_tensorinv(op_info, device, dtype, requires_grad, **kwargs):
+    def make_input():
+        input = make_fullrank_matrices_with_distinct_singular_values(12, 12, device=device, dtype=dtype)
+        return input.requires_grad_(requires_grad)
+
+    # lhs / rhs shape can have any number of dimensions as long as their product equals 12
+    shapes = [
+        ((2, 2, 3), (12, 1)),
+        ((4, 3), (6, 1, 2)),
+    ]
+
+    return [
+        SampleInput(make_input().reshape(*shape_lhs, *shape_rhs), kwargs=dict(ind=len(shape_lhs)))
+        for shape_lhs, shape_rhs in shapes
+    ]
+
 def sample_inputs_mse_loss(op_info, device, dtype, requires_grad, **kwargs):
     _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
 
@@ -8673,6 +8689,19 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
             ),
         ),
     ),
+    OpInfo(
+        "linalg.tensorinv",
+        ref=np.linalg.tensorinv,
+        dtypes=floating_and_complex_types(),
+        skips=(
+            # RuntimeError: aliasOp != torch::jit::getOperatorAliasMap().end()
+            # INTERNAL ASSERT FAILED at "../torch/csrc/jit/passes/utils/check_alias_annotation.cpp":159,
+            # please report a bug to PyTorch.
+            SkipInfo('TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,)),
+        ),
+        sample_inputs_func=sample_inputs_tensorinv,
+        supports_forward_ad=True,
+    ),
     OpInfo(
         "nn.functional.mse_loss",
         ref=reference_mse_loss,

From 061b36e2f58fea2ec9c06577c4bef70a4519af20 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Wed, 18 Aug 2021 07:36:47 -0700
Subject: [PATCH 017/530] [fx2trt] Add dequantize support (#63448)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63448

Only available after TensorRT 8.0

Test Plan: buck run mode/opt caffe2/torch/fb/fx2trt:test_dequantize

Reviewed By: 842974287

Differential Revision: D30296863

fbshipit-source-id: 44b9630ef0d210e7f20e650dc81c519f7e41f5f3
---
 .../fx2trt/converters/acc_ops_converters.py   | 32 +++++++++++++++++++
 torch/fx/experimental/fx_acc/acc_ops.py       | 32 +++++++++++++++++--
 torch/fx/experimental/graph_manipulation.py   |  2 +-
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
index 0bca6e28c83b6..88a74fe9e32c0 100644
--- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
+++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
@@ -1138,3 +1138,35 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name):
     layer.axis = 0
     layer.name = input_val.name + ".quant"
     return layer.get_output(0)
+
+@tensorrt_converter(acc_ops.dequantize)
+def acc_ops_dequantize(network, target, args, kwargs, name):
+    """
+    Currently just a no-op.
+    """
+    input_val = kwargs["input"]
+
+    if not isinstance(input_val, trt.tensorrt.ITensor):
+        raise RuntimeError(f"{name} received input {input_val} that is not part "
+                           "of the TensorRT region!")
+
+    q_scale = acc_utils.get_field_from_acc_out_ty(kwargs["input_tensor_meta"], "q_scale")
+    q_zero_point = acc_utils.get_field_from_acc_out_ty(kwargs["input_tensor_meta"], "q_zero_point")
+    dtype = acc_utils.get_field_from_acc_out_ty(kwargs["input_tensor_meta"], "dtype")
+
+    if dtype not in (torch.quint8, torch.qint8, torch.qint32):
+        raise RuntimeError("Only support (torch.quint8, torch.qint8, torch.qint32) "
+                           f"quantized type in dequantize, get {dtype}.")
+
+    if q_zero_point != 0:
+        raise RuntimeError(f"Only support zero_point == 0, get {q_zero_point}")
+
+    scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([q_scale], dtype=np.float32)))
+    scale_layer.name = input_val.name + ".dequant.scale"
+    scale = scale_layer.get_output(0)
+    assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in "
+    "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
+    layer = network.add_dequantize(input=input_val, scale=scale)
+    layer.name = input_val.name + ".dequant"
+    layer.axis = 0
+    return layer.get_output(0)
diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py
index bc4dfb3c4fe5f..9b2c7f95e0000 100644
--- a/torch/fx/experimental/fx_acc/acc_ops.py
+++ b/torch/fx/experimental/fx_acc/acc_ops.py
@@ -462,10 +462,12 @@ def quantize_per_tensor(*, input, acc_out_ty=None):
     )
 
 
-@register_acc_op_mapping(op_and_target=("call_function", torch.dequantize))
-@register_acc_op_mapping(op_and_target=("call_method", "dequantize"))
 @register_acc_op
-def dequantize(*, input):
+def dequantize(*, input, input_tensor_meta):
+    """ `input_tensor_meta` contains extra argument of quantization
+    parameters, e.g. scale/zero_point and will be using for
+    lowring dequantize op to TensorRT
+    """
     return torch.dequantize(input)
 
 
@@ -1174,3 +1176,27 @@ def packed_quantized_convrelu2d_mapper(
         )
         relu_node.meta = node.meta
         return relu_node
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.dequantize),
+    arg_replacement_tuples=[
+        ("input", "input")
+    ]
+)
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "dequantize"),
+    arg_replacement_tuples=[
+        ("input", "input")
+    ]
+)
+def custom_dequantize_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node:
+    assert "tensor_meta" in node.kwargs["input"].meta
+    new_kwargs = {"input": node.kwargs["input"], "input_tensor_meta": node.kwargs["input"].meta["tensor_meta"]}
+    # `input_tensor_meta` contains quantization parameters that can be used to lower
+    # acc_ops.dequantize to TensorRT ops
+    with node.graph.inserting_before(node):
+        new_node = node.graph.create_node(
+            "call_function", dequantize, kwargs=new_kwargs, name=node.name
+        )
+        new_node.meta = node.meta
+        return new_node
diff --git a/torch/fx/experimental/graph_manipulation.py b/torch/fx/experimental/graph_manipulation.py
index 9d0af5343ae9a..6daa000f609d1 100644
--- a/torch/fx/experimental/graph_manipulation.py
+++ b/torch/fx/experimental/graph_manipulation.py
@@ -412,7 +412,7 @@ def get_user_info(user_node: Argument) -> Any:
         def get_arg_info(arg: Argument) -> Any:
             if isinstance(arg, torch.fx.Node):
                 return {"is_node": True, "name": str(arg)}
-            elif isinstance(arg, torch.dtype):
+            elif isinstance(arg, (torch.dtype, torch.memory_format, torch.qscheme)):
                 return str(arg)
             else:
                 return arg

From c508433617687130bb45a0aad95ae454de425fc9 Mon Sep 17 00:00:00 2001
From: Edward Yang <ezyang@fb.com>
Date: Wed, 18 Aug 2021 07:45:45 -0700
Subject: [PATCH 018/530] Implement subclass priority for __torch_dispatch__
 (#63411)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63411

In order to get this behavior, you have to use append_overloaded,
which I forgot to use in the previous implementation.  I exposed
an internal helper function which is more appropriate for dispatch
to Python where we know that an argument is definitely a Tensor (and
this test no longer needs to be done).

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Reviewed By: zou3519

Differential Revision: D30374489

Pulled By: ezyang

fbshipit-source-id: 43b08c00d1958c9b26d82a025d19f0b67bb85590
---
 test/test_python_dispatch.py            | 33 +++++++++++++++++++++++++
 torch/csrc/autograd/python_variable.cpp |  7 +++---
 torch/csrc/utils/python_arg_parser.h    | 10 ++++++++
 3 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py
index adacc7efb7093..0f5b6b9cbd70e 100644
--- a/test/test_python_dispatch.py
+++ b/test/test_python_dispatch.py
@@ -246,6 +246,39 @@ def test_version(self) -> None:
         x.data.add_(2)
         self.assertEqual(cur_vc, x._version)
 
+    def test_subclass_priority(self) -> None:
+        class ErrorA(RuntimeError):
+            pass
+
+        class ErrorB(RuntimeError):
+            pass
+
+        # The big tests for code coverage are test_precedence_semantics in
+        # test_overrides.py; this is just to make sure it is wired up at all
+        # correctly for __torch_dispatch__
+        class A(torch.Tensor):
+            @staticmethod
+            def __new__(cls, elem):
+                return torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
+
+            @classmethod
+            def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+                raise ErrorA
+
+        class B(A):
+            @staticmethod
+            def __new__(cls, elem):
+                return torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
+
+            @classmethod
+            def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+                raise ErrorB
+
+        self.assertRaises(ErrorA, lambda: torch.add(A(torch.empty(1)), A(torch.empty(1))))
+        self.assertRaises(ErrorB, lambda: torch.add(A(torch.empty(1)), B(torch.empty(1))))
+        self.assertRaises(ErrorB, lambda: torch.add(B(torch.empty(1)), A(torch.empty(1))))
+        self.assertRaises(ErrorB, lambda: torch.add(B(torch.empty(1)), B(torch.empty(1))))
+
     def test_format(self) -> None:
         x = LoggingTensor(torch.ones(1))
         s1 = str(x)
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index 9496d668b3468..303584603aaa0 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -1562,7 +1562,7 @@ void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHa
     if (ivalue.isTensor()) {
       const auto& tensor = ivalue.toTensor();
       if (isPythonTensor(tensor)) {
-        overloaded_args.emplace_back(py::cast(tensor));
+        append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr());
       }
     } else if (ivalue.isList()) {
       const auto& list = ivalue.toListRef();
@@ -1571,7 +1571,7 @@ void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHa
         if (nv.isTensor()) {
           const auto& tensor = nv.toTensor();
           if (isPythonTensor(tensor)) {
-            overloaded_args.emplace_back(py::cast(tensor));
+            append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr());
           }
         }
       }
@@ -1620,7 +1620,8 @@ c10::intrusive_ptr<TensorImpl> concrete_detach_fn(const c10::impl::PyInterpreter
   // TODO: fix the constness of target
   Tensor self_t = Tensor(c10::intrusive_ptr<c10::TensorImpl, c10::UndefinedTensorImpl>::unsafe_reclaim_from_nonowning(const_cast<c10::TensorImpl*>(self)));
   auto self_p = py::reinterpret_steal<py::object>(THPVariable_Wrap(self_t));
-  overloaded_args.emplace_back(self_p);
+  TORCH_INTERNAL_ASSERT(isPythonTensor(self_t));
+  append_overloaded_arg(&overloaded_args, self_p.ptr());
   auto args = py::reinterpret_steal<py::object>(PyTuple_New(1));
   PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr());
 
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index c9a1e4a39aeef..d132185ccaefb 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -810,4 +810,14 @@ bool is_tensor_and_append_overloaded(PyObject* obj, std::vector<py::handle>* ove
  */
 bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector<py::handle>* overloaded_args, int argnum, bool throw_error);
 
+/* Given an argument that is definitely a tensor and is definitely overloaded,
+ * append it to the overloaded arguments list.  Use this instead of
+ * is_tensor_and_append_overloaded in situations where you have a PyObject
+ * and you know it definitely is a Tensor and it is definitely overloaded.
+ *
+ * 'overloaded_args': the vector to append the overloaded args
+ * 'obj': the input tensor that is overloaded
+ */
+void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyObject* obj);
+
 } // namespace torch

From 93582e3bba33fbb7aade5ce3560ebde6e889f5fc Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Wed, 18 Aug 2021 08:04:08 -0700
Subject: [PATCH 019/530] A tiny fix in MT19937RNGEngine (#63219)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63219

Reviewed By: VitalyFedyunin

Differential Revision: D30341484

Pulled By: ezyang

fbshipit-source-id: 0ff4499d0f4a3dfeb991c0f10fe3248c6ca1c992
---
 aten/src/ATen/core/MT19937RNGEngine.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/aten/src/ATen/core/MT19937RNGEngine.h b/aten/src/ATen/core/MT19937RNGEngine.h
index 033df304e4a8e..40c1ba5f584ad 100644
--- a/aten/src/ATen/core/MT19937RNGEngine.h
+++ b/aten/src/ATen/core/MT19937RNGEngine.h
@@ -157,7 +157,6 @@ class mt19937_engine {
     data_.state_[0] = seed & 0xffffffff;
     for(int j = 1; j < MERSENNE_STATE_N; j++) {
       data_.state_[j] = (1812433253 * (data_.state_[j-1] ^ (data_.state_[j-1] >> 30)) + j);
-      data_.state_[j] &= 0xffffffff;
     }
     data_.left_ = 1;
     data_.next_ = 0;

From 383a33a0eb28ae454c0c8965650aea8ce1608943 Mon Sep 17 00:00:00 2001
From: Erjia Guan <erjia@fb.com>
Date: Wed, 18 Aug 2021 08:47:27 -0700
Subject: [PATCH 020/530] Make DataChunk support list in-place ops (#63422)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63422

Fixes #63095

Make `DataChunk` delegate to list method. Then it will support in-place operations:
- `sort`
- `reverse`
- `append`
- `extend`
- `random.shuffle`

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D30379027

Pulled By: ejguan

fbshipit-source-id: d176bd0cc8b89b915c7bb184ff243ab1f605616d
---
 test/test_datapipe.py                       | 48 +++++++++++++++++++--
 torch/utils/data/datapipes/iter/grouping.py | 10 ++---
 torch/utils/data/dataset.py                 | 14 ++----
 3 files changed, 52 insertions(+), 20 deletions(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index 9a7876e334639..9c2380112705d 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -110,14 +110,54 @@ def create_temp_dir_and_files():
 
 
 class TestDataChunk(TestCase):
+    def setUp(self):
+        self.elements = list(range(10))
+        random.shuffle(self.elements)
+        self.chunk: DataChunk[int] = DataChunk(self.elements)
+
+    def test_getitem(self):
+        for i in range(10):
+            self.assertEqual(self.elements[i], self.chunk[i])
+
+    def test_iter(self):
+        for ele, dc in zip(self.elements, iter(self.chunk)):
+            self.assertEqual(ele, dc)
+
+    def test_len(self):
+        self.assertEqual(len(self.elements), len(self.chunk))
+
     def test_as_string(self):
+        self.assertEqual(str(self.chunk), str(self.elements))
+
+        batch = [self.elements] * 3
+        chunks: List[DataChunk[int]] = [DataChunk(self.elements)] * 3
+        self.assertEqual(str(batch), str(chunks))
+
+    def test_sort(self):
+        chunk: DataChunk[int] = DataChunk(self.elements)
+        chunk.sort()
+        self.assertTrue(isinstance(chunk, DataChunk))
+        for i, d in enumerate(chunk):
+            self.assertEqual(i, d)
+
+    def test_reverse(self):
+        chunk: DataChunk[int] = DataChunk(self.elements)
+        chunk.reverse()
+        self.assertTrue(isinstance(chunk, DataChunk))
+        for i in range(10):
+            self.assertEqual(chunk[i], self.elements[9 - i])
+
+    def test_random_shuffle(self):
         elements = list(range(10))
         chunk: DataChunk[int] = DataChunk(elements)
-        self.assertEqual(str(chunk), str(elements))
 
-        batch = [elements] * 3
-        chunks: List[DataChunk] = [DataChunk(elements)] * 3
-        self.assertEqual(str(chunk), str(elements))
+        rng = random.Random(0)
+        rng.shuffle(chunk)
+
+        rng = random.Random(0)
+        rng.shuffle(elements)
+
+        self.assertEqual(chunk, elements)
 
 
 class TestIterableDataPipeBasic(TestCase):
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index 1bd8c4cf4c315..e6304c2de8217 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -133,14 +133,14 @@ def _dive(self, element, unbatch_level):
             else:
                 raise IndexError(f"unbatch_level {self.unbatch_level} exceeds the depth of the DataPipe")
 
-# TODO(ejguan): https://github.com/pytorch/pytorch/issues/63095
+
 def _in_batch_shuffle_fn(data: DataChunk):
-    d = list(data)
-    random.shuffle(d)
-    return DataChunk(d)
+    random.shuffle(data)
+    return data
+
 
 class BucketBatcherIterDataPipe(IterDataPipe[DataChunk[T_co]]):
-    r""" :class:`BucketBatcherIterDataPipe`.
+    r""":class:`BucketBatcherIterDataPipe`.
 
     Iterable DataPipe to create mini-batches of data from sorted bucket. An outer
     dimension will be added as `batch_size` if `drop_last` is set to `True`,
diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py
index 5b8102c235607..7a069d61de6cc 100644
--- a/torch/utils/data/dataset.py
+++ b/torch/utils/data/dataset.py
@@ -25,25 +25,17 @@
 T = TypeVar('T')
 
 
-class DataChunk(List[T]):
+class DataChunk(list, Generic[T]):
     def __init__(self, items):
+        super().__init__(items)
         self.items = items
 
-    def __getitem__(self, key):
-        return self.items[key]
-
-    def __len__(self):
-        return len(self.items)
-
     def as_str(self, indent=''):
         res = indent + "[" + ", ".join([str(i) for i in iter(self)]) + "]"
         return res
 
-    def __repr__(self):
-        return self.as_str()
-
     def __iter__(self) -> Iterator[T]:
-        for i in self.items:
+        for i in super().__iter__():
             yield i
 
     def raw_iterator(self):

From e2ddaec5cf6608b8e06667d4873505609ff1d674 Mon Sep 17 00:00:00 2001
From: Rishi Puri <puririshi98@berkeley.edu>
Date: Wed, 18 Aug 2021 09:41:37 -0700
Subject: [PATCH 021/530] Reverting launch bounds change in topK that induced a
 regression in perf (#63431)

Summary:
[topkwsyncs.zip](https://github.com/pytorch/pytorch/files/7003077/topkwsyncs.zip)

Running this script on nvidia containers 21.08 vs 21.07 we see the following perf drops:
topk(input=(dtype=torch.float16,shape=[60, 201600]), k=2000, dim=1, sorted=True) - 0.63

topk(input=(dtype=torch.float32,shape=[120000]), k=12000, dim=0, sorted=False) - 0.55

topk(input=(dtype=torch.float16,shape=[5, 201600]), k=2000, dim=1, sorted=True) - 0.55

topk(input=(dtype=torch.float32,shape=[1, 10000]), k=1000, dim=1, sorted=False) - 0.33

The relative perf drop is reported as (21.08_time - 21.07_time) / 21.07_time

I narrowed down the source of the regression to this commit: https://github.com/pytorch/pytorch/pull/60314
which reduced launch bounds from 1024 to 512.

The perf did not seem to regress in the original  evidence provided to change 1024 to 512 due to the input shapes in the benchmark being a lot smaller than the input shapes of the tensors which I am witnessing perf regression in. I suggest reverting back to 1024 as with 512 there was no considerable improvement in perf for small inputs and a major regression in perf for large tensors.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63431

Reviewed By: mruberry

Differential Revision: D30384087

Pulled By: ngimel

fbshipit-source-id: 11eecbba82a069b1d4579d674c3f644ab8060ad2
---
 aten/src/ATen/native/cuda/TensorTopK.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu
index d6b4fe2620191..c0bc353110b6f 100644
--- a/aten/src/ATen/native/cuda/TensorTopK.cu
+++ b/aten/src/ATen/native/cuda/TensorTopK.cu
@@ -15,7 +15,7 @@ namespace at {
 namespace native {
 namespace {
 template <typename T, typename IndexType, int Dim, bool Order>
-C10_LAUNCH_BOUNDS_1(512)
+C10_LAUNCH_BOUNDS_1(1024)
 __global__ void gatherTopK(at::cuda::detail::TensorInfo<T, IndexType> input,
                            IndexType inputSliceSize,
                            IndexType outputSliceSize, // aka `k`
@@ -255,7 +255,7 @@ TORCH_IMPL_FUNC(topk_out_cuda)
     dim3 grid;                                                            \
     TORCH_INTERNAL_ASSERT(getGridFromTiles(inputSlices, grid), "Too many slices to sort"); \
                                                                           \
-    dim3 block(std::min(at::cuda::ATenCeilDiv(sliceSize, (int64_t) C10_WARP_SIZE)*(int64_t) C10_WARP_SIZE, (int64_t) 512)); \
+    dim3 block(std::min(at::cuda::ATenCeilDiv(sliceSize, (int64_t) C10_WARP_SIZE)*(int64_t) C10_WARP_SIZE, (int64_t) 1024)); \
                                                                           \
     /* This is used as a template parameter to calculate indices. */      \
     /* We only specialize it if all collapsed dim sizes are the */        \

From 565578cdab2cbf18fac0ea97c1a1f954492cc8b5 Mon Sep 17 00:00:00 2001
From: Masaki Kozuki <mkozuki@nvidia.com>
Date: Wed, 18 Aug 2021 09:42:14 -0700
Subject: [PATCH 022/530] Use `fastAtomicAdd` in EmbeddingBag (mode "max")
 backward (#63298)

Summary:
Rel: https://github.com/pytorch/pytorch/issues/62695

### This PR
|   n_tokens |   num_embeddings |   embedding_dim | mode   |    bwd_fp32 |    bwd_fp16 |
|-----------:|-----------------:|----------------:|:-------|------------:|------------:|
|       4096 |             4096 |            4096 | max    | 0.000326228 | 0.000181448 |
|       4096 |             4096 |           16384 | max    | 0.00102805  | 0.000618136 |
|       4096 |            16384 |            4096 | max    | 0.000907326 | 0.000530422 |
|       4096 |            16384 |           16384 | max    | 0.00334988  | 0.00264645  |
|      16384 |             4096 |            4096 | max    | 0.000366449 | 0.000320232 |
|      16384 |             4096 |           16384 | max    | 0.00126421  | 0.00104183  |
|      16384 |            16384 |            4096 | max    | 0.00087738  | 0.00065068  |
|      16384 |            16384 |           16384 | max    | 0.00379229  | 0.00298201  |

### Original
|   n_tokens |   num_embeddings |   embedding_dim | mode   |    bwd_fp32 |    bwd_fp16 |
|-----------:|-----------------:|----------------:|:-------|------------:|------------:|
|       4096 |             4096 |            4096 | max    | 0.00032407  | 0.000188231 |
|       4096 |             4096 |           16384 | max    | 0.00104356  | 0.000624001 |
|       4096 |            16384 |            4096 | max    | 0.000902069 | 0.000527382 |
|       4096 |            16384 |           16384 | max    | 0.00302202  | 0.00255153  |
|      16384 |             4096 |            4096 | max    | 0.000384343 | 0.000403249 |
|      16384 |             4096 |           16384 | max    | 0.00126445  | 0.00135069  |
|      16384 |            16384 |            4096 | max    | 0.000880814 | 0.000825679 |
|      16384 |            16384 |           16384 | max    | 0.00337611  | 0.00319515  |

cc xwang233 ptrblck ngimel

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63298

Reviewed By: mruberry

Differential Revision: D30383583

Pulled By: ngimel

fbshipit-source-id: 14dd9d67002c53a153721812709033c198f68c1e
---
 aten/src/ATen/native/cuda/EmbeddingBag.cu | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu
index 34a9d9dd82133..35094681a79c8 100644
--- a/aten/src/ATen/native/cuda/EmbeddingBag.cu
+++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu
@@ -17,6 +17,7 @@
 
 #include <ATen/native/cuda/SortingCommon.cuh>
 #include <ATen/native/cuda/EmbeddingBackwardKernel.cuh>
+#include <ATen/native/cuda/KernelUtils.cuh>
 
 #include <c10/macros/Macros.h>
 
@@ -235,7 +236,7 @@ template <typename scalar_t, typename index_t>
 __global__ void EmbeddingBag_accGradParametersKernel_max(
     index_t *max_indices, scalar_t *gradOutput,
     scalar_t *gradWeight, int64_t stride, int64_t numBags,
-    index_t padding_idx) {
+    index_t padding_idx, const index_t numel) {
 
   using accscalar_t = acc_type<scalar_t, true>;
 
@@ -252,8 +253,9 @@ __global__ void EmbeddingBag_accGradParametersKernel_max(
       index_t word_idx = max_indices[bag * stride + featureDim];
       if (word_idx >= 0 && word_idx != padding_idx) {
         // If bag is empty, we have max_indices[idx] set to -1 in forward.
-        gpuAtomicAddNoReturn(&(gradWeight[word_idx * stride + featureDim]),
-                gradOutput[bag * stride + featureDim]);
+        fastAtomicAdd(
+            gradWeight, static_cast<index_t>(word_idx * stride + featureDim),
+            numel, gradOutput[bag * stride + featureDim], true);
       }
     }
   }
@@ -289,7 +291,7 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad,
               scalar_t, index_t><<<grid, block, 0, stream>>>(
               max_indices.data_ptr<index_t>(), grad.data_ptr<scalar_t>(),
               grad_weight.data_ptr<scalar_t>(), stride, numBags,
-              padding_idx);
+              padding_idx, grad_weight.numel());
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       });
   });

From 7df2324120513c387524db604f8bcc7281aaf95f Mon Sep 17 00:00:00 2001
From: John Shen <johnshen@fb.com>
Date: Wed, 18 Aug 2021 10:35:55 -0700
Subject: [PATCH 023/530] [pytorch] Make qconv forward() thread safe (#63432)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63432

There's a race condition in quantized models when multiple threads call forward() due to qnnpack packing the weights the first time the operator is called. This locks the entire apply_impl function.

Test Plan:
https://github.com/pytorch/pytorch/issues/58055

Ran the script before and after, original crashes went away

Reviewed By: kimishpatel

Differential Revision: D30229520

fbshipit-source-id: d06cabe24199a80325cd57f24a7fd60624be2cf7
---
 aten/src/ATen/native/quantized/cpu/qconv.cpp         | 2 ++
 aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp | 9 ++++-----
 aten/src/ATen/native/quantized/cpu/qnnpack_utils.h   | 1 +
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp
index bf5c596a9e0d2..3c0d79acac18c 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp
@@ -563,6 +563,8 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
     const at::Tensor& act,
     double output_scale,
     int64_t output_zero_point) {
+  // QNNPack is not thread safe
+  std::lock_guard<std::mutex> lock(qnnp_mutex_);
   const std::string func_name = transpose() ? "quantized::conv_transpose"
                                             : "quantized::conv";
   TORCH_CHECK(!(kReluFused && transpose()),
diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
index dff28b141f6b1..87294c11adda0 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
@@ -276,9 +276,8 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
   // during the first invocation of operator run. Refer to qconv.cpp for more
   // details. TODO Update to actually call pre-pack here once bias is removed
   // from pre-packing step.
-  c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> ret_ptr =
-      c10::make_intrusive<PackedConvWeightsQnnp<kSpatialDim>>(
-          PackedConvWeightsQnnp<kSpatialDim>{
+  auto ret_ptr =
+      c10::intrusive_ptr<PackedConvWeightsQnnp<kSpatialDim>>::make(
               nullptr, /* PrePackConvWeights */
               weight_contig, /* int8_t weight */
               bias_fp32.contiguous(), /* fp32 bias */
@@ -289,10 +288,10 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
               groups,
               transpose,
               c10::nullopt, /* input_scale */
-              {kernel_h, kernel_w},
+              std::vector<int64_t>{kernel_h, kernel_w},
               w_scales,
               std::move(w_zero_points),
-              is_per_channel});
+              is_per_channel);
 
   return ret_ptr;
 }
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h
index 161be5a2f8fa3..91ede920b87e2 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h
@@ -292,6 +292,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
   }
 
  private:
+  std::mutex qnnp_mutex_;
   template <bool ReluFused>
   at::Tensor apply_impl(
       const at::Tensor& input,

From af3cbfed9510747c776418c260c5116f662c6452 Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@fb.com>
Date: Wed, 18 Aug 2021 10:39:53 -0700
Subject: [PATCH 024/530] Add validation check in fx2trt interpreter (#63424)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63424

Add validation check in fx2trt for missing converter operators. If any op missing, interpreter init will report missing operators.

Test Plan:
for call_function and call_method:
manual test with feeds benchmark and verify init failed with expected message.
{F642390780}

for call_module:
specify a module as leaf node and make acc_tracer trace it as a node; then in fx2trt.py, in CONVERTER initialize stage make it skip recording all modules; initialize interpreter and call validator function, verify the output includes the missing module name, return value print as screenshot below.

{F643458718}

Reviewed By: 842974287

Differential Revision: D30294832

fbshipit-source-id: 243dca3fdfc6a174ded65248938e2a234aec19c6
---
 torch/fx/experimental/fx2trt/fx2trt.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py
index 160b4a7317a69..9879fd7e0952e 100644
--- a/torch/fx/experimental/fx2trt/fx2trt.py
+++ b/torch/fx/experimental/fx2trt/fx2trt.py
@@ -229,6 +229,10 @@ def __init__(
         self.input_specs = input_specs
         self.input_specs_iter = 0
         self.validate_input_specs()
+        missing_ops = self.validate_conversion
+        if not missing_ops:
+            warnings.warn("Interpretation may fail due to missing operations \n"
+                          + "\n".join(f"{i}" for i in missing_ops))
         self._cur_node_name: Optional[str] = None
         self._input_names: List[str] = []
         self._output_names: List[str] = []
@@ -290,6 +294,19 @@ def validate_input_specs(self):
                     len(shape_ranges) == 0
                 ), "shape_ranges are provided for input that doesn't have dynamic dim."
 
+    def validate_conversion(self):
+        missing_converter = set()
+
+        for node in self.module.graph.nodes:
+            if node.op in ["call_function", "call_method"] and not CONVERTERS.get(node.target):
+                missing_converter.add(f"{node.op} {node.target}")
+            elif node.op == "call_module":
+                submod = self.fetch_attr(node.target)
+                if not CONVERTERS.get(type(submod)):
+                    missing_converter.add(f"{node.op} {type(submod)}")
+
+        return missing_converter
+
     def run(
         self,
         max_batch_size=64,

From d565a7bd6871b39b2aee978f6fd1c15fb290c3ca Mon Sep 17 00:00:00 2001
From: Pritam Damania <pritam.damania@fb.com>
Date: Wed, 18 Aug 2021 10:46:09 -0700
Subject: [PATCH 025/530] [6/N] Enable opt-asan for elastic and launcher tests.
 (#63442)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63442

Continuation of https://github.com/pytorch/pytorch/pull/62051, I've
enabled elastic and launcher tests to run in opt-asan mode which is supported
with spawn multiprocessing.

This allows us to completely get rid of fork based tests from torch.distributed
and have all tests run in spawn mode.
ghstack-source-id: 136057123

Test Plan: waitforbuildbot

Reviewed By: cbalioglu

Differential Revision: D30384267

fbshipit-source-id: ad3447cfb9d6e31e7ec8332d64c8ff1054858dcb
---
 .../server/test/local_elastic_agent_test.py   | 100 +++++++++---------
 .../elastic/multiprocessing/api_test.py       |  15 ++-
 .../elastic/timer/local_timer_example.py      |   8 +-
 test/distributed/launcher/api_test.py         |  14 +--
 test/distributed/launcher/launch_test.py      |   6 +-
 test/distributed/launcher/run_test.py         |  28 ++---
 test/distributed/test_launcher.py             |   4 +-
 7 files changed, 90 insertions(+), 85 deletions(-)

diff --git a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py
index 9becdeb663ef6..2536b1033d56c 100644
--- a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py
+++ b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py
@@ -36,7 +36,7 @@
 from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer
 from torch.distributed.rpc.backend_registry import BackendType
 from torch.testing._internal.common_utils import (
-    TEST_WITH_ASAN,
+    TEST_WITH_DEV_DBG_ASAN,
     TEST_WITH_TSAN,
     sandcastle_skip_if,
 )
@@ -406,19 +406,19 @@ def dummy_compute(self):
             self.assertEqual((100, 100), return_value.shape)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_dummy_compute_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.dummy_compute)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_dummy_compute_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.dummy_compute)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_dummy_compute_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.dummy_compute)
@@ -431,19 +431,19 @@ def run_happy_function(self):
         self.assertIsNone(res.return_values[1])
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_happy_function_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_happy_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_happy_function_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_happy_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_happy_function_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_happy_function)
@@ -465,13 +465,13 @@ def check_master_addr_port_override(self):
         self.assertIsNone(res.return_values[0])
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_check_master_addr_port_override_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.check_master_addr_port_override)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_check_master_addr_port_override_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.check_master_addr_port_override)
@@ -484,7 +484,7 @@ def run_check_env_function(self):
         self.assertFalse(res.is_failed())
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_check_env_function_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_check_env_function)
@@ -497,19 +497,19 @@ def run_function_with_return_value(self):
         self.assertEqual("foo", res.return_values[1])
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_function_with_return_value_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_function_with_return_value)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_function_with_return_value_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_function_with_return_value)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_function_with_return_value_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_function_with_return_value)
@@ -520,19 +520,19 @@ def simple_dist_sum(self):
         # _dist_sum internally checks that the sum computed is valid
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_simple_dist_sum_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.simple_dist_sum)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_simple_dist_sum_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.simple_dist_sum)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_simple_dist_sum_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.simple_dist_sum)
@@ -556,19 +556,19 @@ def run_distributed_sum_homogeneous(self):
         self.assertSetEqual(set(range(4 + 4)), ranks)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_distributed_sum_homogeneous_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_homogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_distributed_sum_homogeneous_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_homogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_distributed_sum_homogeneous_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_homogeneous)
@@ -596,19 +596,19 @@ def run_distributed_sum_heterogeneous(self):
         self.assertSetEqual(set(range(1 + 2 + 3)), ranks)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_distributed_sum_heterogeneous_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_heterogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_distributed_sum_heterogeneous_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_heterogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_distributed_sum_heterogeneous_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_heterogeneous)
@@ -636,19 +636,19 @@ def run_sad_function(self):
                 self.assertEqual(int(data["extraInfo"]["timestamp"]), failure.timestamp)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_sad_function_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_sad_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_sad_function_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_sad_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_sad_function_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_sad_function)
@@ -668,19 +668,19 @@ def run_bipolar_function(self):
         self.assertTrue(agent._total_execution_time > 0)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_bipolar_function_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_bipolar_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_bipolar_function_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_bipolar_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_run_bipolar_function_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_bipolar_function)
@@ -711,13 +711,13 @@ def correct_rank_assignment_heterogeneous(self):
         )
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_correct_rank_assignment_heterogeneous_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_heterogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_correct_rank_assignment_heterogeneous_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_heterogeneous)
@@ -744,13 +744,13 @@ def correct_rank_assignment_homogeneous(self):
         )
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_correct_rank_assignment_homogeneous_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_homogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_correct_rank_assignment_homogeneous_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_homogeneous)
@@ -852,13 +852,13 @@ def double_agent_fault_tolerance(self):
             self.assertEqual(0, p.exitcode)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_double_agent_fault_tolerance_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_fault_tolerance)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_double_agent_fault_tolerance_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_fault_tolerance)
@@ -905,19 +905,19 @@ def double_agent_elastic(self):
                 self.assertEqual(-signal.SIGKILL, p.exitcode)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_double_agent_elastic_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.double_agent_elastic)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_double_agent_elastic_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_elastic)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_double_agent_elastic_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_elastic)
@@ -955,19 +955,19 @@ def torch_rpc(self):
         self.assertEqual([f"{msg} from worker"], list(master_retvals.values()))
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_torch_rpc_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.torch_rpc)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_torch_rpc_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.torch_rpc)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_torch_rpc_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.torch_rpc)
@@ -993,13 +993,13 @@ def workers_drift_success(self):
                 self.assertEqual(rank, output)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_workers_drift_success_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.workers_drift_success)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_workers_drift_success_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.workers_drift_success)
@@ -1024,13 +1024,13 @@ def workers_drift_fail(self):
                 self.assertEqual(rank, output)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_workers_drift_fail_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.workers_drift_fail)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_workers_drift_fail_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.workers_drift_fail)
@@ -1047,19 +1047,19 @@ def barrier_failed(self, barrier_mock):
         barrier_mock.assert_called_once()
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_barrier_failed_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.barrier_failed)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_barrier_failed_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.barrier_failed)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_barrier_failed_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.barrier_failed)
@@ -1081,19 +1081,19 @@ def shutdown_called(self, start_processes_mock):
         pcontext_mock.close.assert_called_once()
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_shutdown_called_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.shutdown_called)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_shutdown_called_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.shutdown_called)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_shutdown_called_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.shutdown_called)
diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py
index cb1db294d2791..c27d932e43cb9 100644
--- a/test/distributed/elastic/multiprocessing/api_test.py
+++ b/test/distributed/elastic/multiprocessing/api_test.py
@@ -35,6 +35,7 @@
 from torch.testing._internal.common_utils import (
     NO_MULTIPROCESSING_SPAWN,
     TEST_WITH_ASAN,
+    TEST_WITH_DEV_DBG_ASAN,
     TEST_WITH_TSAN,
     IS_IN_CI,
     IS_WINDOWS,
@@ -222,7 +223,7 @@ def start_processes_zombie_test(
 
 
 # tests incompatible with tsan or asan
-if not (TEST_WITH_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS):
+if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS):
     class StartProcessesTest(unittest.TestCase):
         def setUp(self):
             self.test_dir = tempfile.mkdtemp(prefix=f"{self.__class__.__name__}_")
@@ -386,7 +387,7 @@ def test_void_function(self):
                     self.assertEqual({0: None, 1: None}, results.return_values)
 
         @sandcastle_skip_if(
-            TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+            TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
         )
         def test_function_large_ret_val(self):
             # python multiprocessing.queue module uses pipes and actually PipedQueues
@@ -548,7 +549,7 @@ def test_multiprocessing_context_poll_raises_exception(self):
 
 
 # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows
-if not (TEST_WITH_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS):
+if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS):
     class StartProcessesListTest(StartProcessesTest):
         ########################################
         # start_processes as binary tests
@@ -646,7 +647,7 @@ def test_binary_redirect_and_tee(self):
 
 
 # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows
-if not (TEST_WITH_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS or IS_IN_CI):
+if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS or IS_IN_CI):
     class StartProcessesNotCITest(StartProcessesTest):
         def test_wrap_bad(self):
             none = ""
@@ -696,7 +697,11 @@ def test_binary_signal(self):
 
             failure = results.failures[0]
             self.assertNotEqual(signal.SIGSEGV, failure.exitcode)
-            self.assertEqual("SIGSEGV", failure.signal_name())
+            if TEST_WITH_ASAN:
+                # ASAN exit code is 1.
+                self.assertEqual("<N/A>", failure.signal_name())
+            else:
+                self.assertEqual("SIGSEGV", failure.signal_name())
             self.assertEqual("<NONE>", failure.error_file_data["message"])
 
         def test_function_redirect_and_tee(self):
diff --git a/test/distributed/elastic/timer/local_timer_example.py b/test/distributed/elastic/timer/local_timer_example.py
index d73aa67ee75e7..7845c4b5001e5 100644
--- a/test/distributed/elastic/timer/local_timer_example.py
+++ b/test/distributed/elastic/timer/local_timer_example.py
@@ -14,7 +14,7 @@
 import torch.distributed.elastic.timer as timer
 import torch.multiprocessing as torch_mp
 from torch.testing._internal.common_utils import (
-    TEST_WITH_ASAN,
+    TEST_WITH_DEV_DBG_ASAN,
     TEST_WITH_TSAN,
     run_tests,
     IS_WINDOWS,
@@ -55,7 +55,7 @@ class LocalTimerExample(unittest.TestCase):
         unittest. As of now this will SIGSEGV.
         """
 
-        @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible")
+        @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible")
         def test_torch_mp_example(self):
             # in practice set the max_interval to a larger value (e.g. 60 seconds)
             mp_queue = mp.get_context("spawn").Queue()
@@ -80,11 +80,11 @@ def test_torch_mp_example(self):
 
             server.stop()
 
-        @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible")
+        @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible")
         def test_example_start_method_spawn(self):
             self._run_example_with(start_method="spawn")
 
-        # @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible")
+        # @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible")
         # def test_example_start_method_forkserver(self):
         #     self._run_example_with(start_method="forkserver")
 
diff --git a/test/distributed/launcher/api_test.py b/test/distributed/launcher/api_test.py
index 954b7e201a351..d2bfd360f9c31 100644
--- a/test/distributed/launcher/api_test.py
+++ b/test/distributed/launcher/api_test.py
@@ -30,7 +30,7 @@
     _get_entrypoint_name,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_ASAN,
+    TEST_WITH_DEV_DBG_ASAN,
     TEST_WITH_TSAN,
     sandcastle_skip_if,
 )
@@ -128,7 +128,7 @@ def check_works_ran(self, world_size: int):
         )
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_launch_script_python(self):
         nnodes = 1
@@ -145,7 +145,7 @@ def test_launch_script_python(self):
         self.check_works_ran(world_size)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_launch_script_python_local_rank_transfer(self):
         nnodes = 1
@@ -162,7 +162,7 @@ def test_launch_script_python_local_rank_transfer(self):
         self.check_works_ran(world_size)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_launch_script_bash(self):
         nnodes = 1
@@ -177,7 +177,7 @@ def test_launch_script_bash(self):
         self.check_works_ran(world_size)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_launch_function(self):
         nnodes = 1
@@ -193,7 +193,7 @@ def test_launch_function(self):
         self.assertEqual(expected_res, actual_res)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_launch_dist_sum_with_static_rdzv(self):
         nnodes = 1
@@ -224,7 +224,7 @@ def test_launch_dist_sum_with_static_rdzv(self):
         self.assertEqual(expected_res, actual_res)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_launch_elastic(self):
         nproc_per_node = 4
diff --git a/test/distributed/launcher/launch_test.py b/test/distributed/launcher/launch_test.py
index 2d27269014246..73aed1a4ea59f 100644
--- a/test/distributed/launcher/launch_test.py
+++ b/test/distributed/launcher/launch_test.py
@@ -14,7 +14,7 @@
 import torch.distributed.launch as launch
 from torch.distributed.elastic.utils import get_socket_with_port
 from torch.testing._internal.common_utils import (
-    TEST_WITH_ASAN,
+    TEST_WITH_DEV_DBG_ASAN,
     TEST_WITH_TSAN,
     sandcastle_skip_if,
 )
@@ -36,7 +36,7 @@ def tearDown(self):
         shutil.rmtree(self.test_dir)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_launch_without_env(self):
         nnodes = 1
@@ -58,7 +58,7 @@ def test_launch_without_env(self):
         launch.main(args)
 
     @sandcastle_skip_if(
-        TEST_WITH_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
     )
     def test_launch_with_env(self):
         nnodes = 1
diff --git a/test/distributed/launcher/run_test.py b/test/distributed/launcher/run_test.py
index 7318bbd630687..a63ec0382dfa4 100644
--- a/test/distributed/launcher/run_test.py
+++ b/test/distributed/launcher/run_test.py
@@ -22,7 +22,7 @@
 from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer
 from torch.distributed.elastic.utils import get_socket_with_port
 from torch.testing._internal.common_utils import (
-    TEST_WITH_ASAN,
+    TEST_WITH_DEV_DBG_ASAN,
     TEST_WITH_TSAN,
     sandcastle_skip_if,
 )
@@ -138,7 +138,7 @@ def test_launch_user_script_python_caffe2_bc(self):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_launch_user_script_bash(self):
         run_id = str(uuid.uuid4().int)
         nnodes = 1
@@ -169,7 +169,7 @@ def test_launch_user_script_bash(self):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_launch_user_script_default_nproc(self):
         run_id = str(uuid.uuid4().int)
         nnodes = 1
@@ -198,7 +198,7 @@ def test_launch_user_script_default_nproc(self):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_launch_with_env_vars(self):
         run_id = str(uuid.uuid4().int)
         nnodes = 1
@@ -256,27 +256,27 @@ def _test_nproc_launch_configuration(self, nproc_type, expected_number):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_nproc_launch_auto_configurations(self):
         self._test_nproc_launch_configuration("auto", os.cpu_count())
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_nproc_launch_number_configurations(self):
         self._test_nproc_launch_configuration("4", 4)
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_nproc_launch_unknown_configurations(self):
         with self.assertRaises(ValueError):
             self._test_nproc_launch_configuration("unknown", 4)
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     @patch("torch.cuda.is_available", return_value=True)
     @patch("torch.cuda.device_count", return_value=3)
     def test_nproc_gpu_launch_configurations(self, _mock1, _mock2):
         self._test_nproc_launch_configuration("auto", 3)
         self._test_nproc_launch_configuration("gpu", 3)
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_launch_elastic(self):
         run_id = str(uuid.uuid4().int)
         min_nodes = 1
@@ -304,7 +304,7 @@ def test_launch_elastic(self):
         )
 
     @mock.patch("torch.distributed.elastic.events.record")
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_launch_elastic_worker_raise_exception(self, record_mock):
         """
         Asserts that when the worker program fails and lancher raieses exception
@@ -332,7 +332,7 @@ def test_launch_elastic_worker_raise_exception(self, record_mock):
 
         record_mock.assert_called_once()
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     @mock.patch(
         "torch.distributed.elastic.agent.server.local_elastic_agent.LocalElasticAgent.run"
     )
@@ -364,7 +364,7 @@ def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run)
             launch.main(args)
         record_mock.assert_called_once()
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_launch_standalone(self):
         nnodes = 1
         nproc_per_node = 4
@@ -386,7 +386,7 @@ def test_launch_standalone(self):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_launch_run_path(self):
         nnodes = 1
         nproc_per_node = 4
@@ -408,7 +408,7 @@ def test_launch_run_path(self):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_ASAN or TEST_WITH_TSAN, "test incompatible with tsan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
     def test_launch_elastic_multiple_agents(self):
         run_id = str(uuid.uuid4().int)
         min_nodes = 1
diff --git a/test/distributed/test_launcher.py b/test/distributed/test_launcher.py
index 85ba293966f2d..53faefba95f95 100644
--- a/test/distributed/test_launcher.py
+++ b/test/distributed/test_launcher.py
@@ -11,7 +11,7 @@
     sys.exit(0)
 
 from torch.testing._internal.common_utils import (
-    TEST_WITH_ASAN,
+    TEST_WITH_DEV_DBG_ASAN,
     TEST_WITH_TSAN,
     TestCase,
     run_tests,
@@ -21,7 +21,7 @@
 def path(script):
     return os.path.join(os.path.dirname(__file__), script)
 
-if TEST_WITH_ASAN:
+if TEST_WITH_DEV_DBG_ASAN:
     print("Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr)
     sys.exit(0)
 

From 2f615f63135e834499b5d11a4fbced91d70913cc Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Wed, 18 Aug 2021 11:29:51 -0700
Subject: [PATCH 026/530] Improve custom function docs (#60312)

Summary:
- Adds some code examples for `ctx` methods and make requirements of arguments more clear
- Type annotations for `save_for_backward`, `mark_dirty`, `mark_non_differentiable`, and `set_materialize_grads` (BC-breaking?)
- Refactor `torch.autograd.Function` doc

Pull Request resolved: https://github.com/pytorch/pytorch/pull/60312

Reviewed By: VitalyFedyunin

Differential Revision: D30314961

Pulled By: soulitzer

fbshipit-source-id: a284314b65662e26390417bd2b6b12cd85e68dc8
---
 docs/source/autograd.rst        |   8 +-
 docs/source/notes/extending.rst | 134 +++++++++++++++++---------
 torch/autograd/function.py      | 161 ++++++++++++++++++++++++++------
 3 files changed, 226 insertions(+), 77 deletions(-)

diff --git a/docs/source/autograd.rst b/docs/source/autograd.rst
index 5958c639813f1..6423d5d6d088c 100644
--- a/docs/source/autograd.rst
+++ b/docs/source/autograd.rst
@@ -189,10 +189,10 @@ When creating a new :class:`Function`, the following methods are available to `c
     :toctree: generated
     :nosignatures:
 
-    function._ContextMethodMixin.mark_dirty
-    function._ContextMethodMixin.mark_non_differentiable
-    function._ContextMethodMixin.save_for_backward
-    function._ContextMethodMixin.set_materialize_grads
+    function.FunctionCtx.mark_dirty
+    function.FunctionCtx.mark_non_differentiable
+    function.FunctionCtx.save_for_backward
+    function.FunctionCtx.set_materialize_grads
 
 .. _grad-check:
 
diff --git a/docs/source/notes/extending.rst b/docs/source/notes/extending.rst
index 1c89bcf10eb0a..a8d3983f9f0d9 100644
--- a/docs/source/notes/extending.rst
+++ b/docs/source/notes/extending.rst
@@ -13,60 +13,110 @@ Extending :mod:`torch.autograd`
 .. currentmodule:: torch.autograd
 
 Adding operations to :mod:`~torch.autograd` requires implementing a new
-:class:`Function` subclass for each operation. Recall that :class:`Function` s
-are what :mod:`~torch.autograd` uses to compute the results and gradients, and
-encode the operation history. Every new function requires you to implement 2 methods:
-
-- :meth:`~Function.forward` - the code that performs the operation. It can take
+:class:`Function` subclass for each operation. Recall that Functions
+are what :mod:`~torch.autograd` uses to encode the operation history and compute
+gradients.
+
+When to use
+^^^^^^^^^^^
+In general, implement a custom function if you want to perform computations in your model
+that are not differentiable or rely on non-Pytorch libraries (e.g., NumPy), but
+still wish for your operation to chain with other ops and work with the autograd engine.
+
+In some situations, custom functions can also be used to improve performance and
+memory usage: If you implemented your forward and backward passes using a
+`C++ extension <https://pytorch.org/tutorials/advanced/cpp_extension.html>`_,
+you can wrap them in :class:`~Function` to interface with the autograd
+engine. If you'd like to reduce the number of buffers saved for the backward pass,
+custom functions can be used to combine ops together.
+
+When not to use
+^^^^^^^^^^^^^^^
+If you can already write your function in terms of PyTorch's built-in ops, its
+backward graph is (most likely) already able to be recorded by autograd. In this case, you do
+not need to implement the backward function yourself. Consider using a plain
+old Python function.
+
+If you need to maintain state, i.e., trainable parameters, you should (also) use a
+custom module. See the section below for more information on extending :mod:`torch.nn`.
+
+If you'd like to alter the gradients during the backward pass or perform a side
+effect, consider registering a
+`tensor <https://pytorch.org/docs/stable/generated/torch.Tensor.register_hook.html#torch.Tensor.register_hook>`_ or
+`Module <https://pytorch.org/docs/stable/notes/modules.html#module-hooks>`_ hook.
+
+How to use
+^^^^^^^^^^
+Take the following steps:
+1. Subclass :class:`~Function` and implement the :meth:`~Function.forward` and
+:meth:`~Function.backward` methods.
+2. Call the proper methods on the `ctx` argument.
+3. Declare whether your function supports double backward.
+4. Validate whether your gradients are correct using gradcheck.
+
+**Step 1:** After subclassing :class:`Function`, you'll need to define 2 methods:
+
+- :meth:`~Function.forward` is the code that performs the operation. It can take
   as many arguments as you want, with some of them being optional, if you
   specify the default values. All kinds of Python objects are accepted here.
   :class:`Tensor` arguments that track history (i.e., with
   ``requires_grad=True``) will be converted to ones that don't track history
   before the call, and their use will be registered in the graph. Note that this
   logic won't traverse lists/dicts/any other data structures and will only
-  consider :class:`Tensor` s that are direct arguments to the call. You can
+  consider tensors that are direct arguments to the call. You can
   return either a single :class:`Tensor` output, or a :class:`tuple` of
-  :class:`Tensor` s if there are multiple outputs. Also, please refer to the
+  tensors if there are multiple outputs. Also, please refer to the
   docs of :class:`Function` to find descriptions of useful methods that can be
   called only from :meth:`~Function.forward`.
-- :meth:`~Function.backward` - gradient formula. It will be given
+- :meth:`~Function.backward` defines the gradient formula. It will be given
   as many :class:`Tensor` arguments as there were outputs, with each of them
-  representing gradient w.r.t. that output. It should return as many
-  :class:`Tensor` s as there were inputs, with each of them containing the
-  gradient w.r.t. its corresponding input. If your inputs didn't require
-  gradient (:attr:`~ctx.needs_input_grad` is a tuple of booleans indicating
+  representing gradient w.r.t. that output. It is important NEVER to modify
+  these in-place. It should return as many tensors as there
+  were inputs, with each of them containing the gradient w.r.t. its
+  corresponding input. If your inputs didn't require gradient
+  (:attr:`~ctx.needs_input_grad` is a tuple of booleans indicating
   whether each input needs gradient computation), or were non-:class:`Tensor`
   objects, you can return :class:`python:None`. Also, if you have optional
   arguments to :meth:`~Function.forward` you can return more gradients than there
   were inputs, as long as they're all :any:`python:None`.
 
-.. note::
-
-  It's the user's responsibility to use the special functions in the forward's `ctx`
-  properly in order to ensure that the new :class:`Function` works properly with
-  the autograd engine.
-
-  - :meth:`~torch.autograd.function._ContextMethodMixin.save_for_backward` must be
-    used when saving input or output of the forward to be used later in the backward.
-  - :meth:`~torch.autograd.function._ContextMethodMixin.mark_dirty` must be used to
-    mark any input that is modified inplace by the forward function.
-  - :meth:`~torch.autograd.function._ContextMethodMixin.mark_non_differentiable` must
-    be used to tell the engine if an output is not differentiable.
-  - :meth:`~torch.autograd.function._ContextMethodMixin.set_materialize_grads` can be
-    used to tell the autograd engine to optimize gradient computations in the cases where
-    the output does not depend on the input by not materializing grad tensors given to backward
-    function. That is, if set to False, None object in python or "undefined tensor" (tensor x for
-    which x.defined() is False) in C++ will not be converted to a tensor filled with zeros prior
-    to calling backward. However, supporting this optimization means your custom autograd function
-    has to handle gradients that are represented in this way and is thus opt-in. Default value is True.
-
-.. note::
-
-  By default, all the output Tensors that are of differentiable type will be set to
-  require gradient and have all autograd metadata set for them. If you don't want
-  them to require gradients, you can use the `mark_non_differentiable` method mentioned
-  above. For output Tensors that are not of differentiable type (integer types for example),
-  they won't be marked as requiring gradients.
+**Step 2:** It is your responsibility to use the functions in the forward's `ctx`
+properly in order to ensure that the new :class:`Function` works properly with
+the autograd engine.
+
+- :meth:`~torch.autograd.function.FunctionCtx.save_for_backward` must be
+  used when saving input or output tensors of the forward to be used later in the backward.
+  Anything else, i.e., non-tensors and tensors that are neither input nor output
+  should be stored directly on `ctx`.
+- :meth:`~torch.autograd.function.FunctionCtx.mark_dirty` must be used to
+  mark any input that is modified inplace by the forward function.
+- :meth:`~torch.autograd.function.FunctionCtx.mark_non_differentiable` must
+  be used to tell the engine if an output is not differentiable. By
+  default all output tensors that are of differentiable type will be set
+  to require gradient. Tensors of non-differentiable type (i.e., integral types)
+  are never marked as requiring gradients.
+- :meth:`~torch.autograd.function.FunctionCtx.set_materialize_grads` can be
+  used to tell the autograd engine to optimize gradient computations in the cases where
+  the output does not depend on the input by not materializing grad tensors given to backward
+  function. That is, if set to False, None object in python or "undefined tensor" (tensor x for
+  which x.defined() is False) in C++ will not be converted to a tensor filled with zeros prior
+  to calling backward, and so your code will need to handle such objects as if they were
+  tensors filled with zeros. The default value of this setting is True.
+
+**Step 3:** If your :class:`~Function` does not support double backward
+you should explicitly declare this by decorating backward with the
+:func:`~function.once_differentiable`. With this decorator, attempts to
+perform double backward through your function will produce an error.
+See our double backward tutorial for more information on double backward.
+
+**Step 4:** It is recommended that you use :func:`torch.autograd.gradcheck`
+to check whether your backward function correctly computes gradients of the
+forward by computing the Jacobian matrix using your backward function and
+comparing the value element-wise with the Jacobian computed numerically using
+finite-differencing.
+
+Example
+^^^^^^^
 
 Below you can find code for a ``Linear`` function from :mod:`torch.nn`, with
 additional comments::
@@ -151,12 +201,12 @@ And here, we optimize the above example by calling set_materialize_grads(False):
             return grad_output * ctx.constant, None
 
 .. note::
-    Inputs to ``backward``, i.e., :attr:`grad_output`, can also be Tensors that
+    Inputs to ``backward``, i.e., :attr:`grad_output`, can also be tensors that
     track history. So if ``backward`` is implemented with differentiable
     operations, (e.g., invocation of another custom
     :class:`~torch.autograd.function`), higher order derivatives will work.
-    In this case, the Tensors saved with ``save_for_backward`` can also be used
-    in the backward and have gradients flowing back but Tensors saved in the ``ctx``
+    In this case, the tensors saved with ``save_for_backward`` can also be used
+    in the backward and have gradients flowing back but tensors saved in the ``ctx``
     won't have gradients flowing back for them.
     If you need gradients to flow back for a Tensor saved in the ``ctx``, you should
     make it an output of the custom ``Function`` and save it with ``save_for_backward``.
diff --git a/torch/autograd/function.py b/torch/autograd/function.py
index 4fc25c5951d11..4d6122924ec14 100644
--- a/torch/autograd/function.py
+++ b/torch/autograd/function.py
@@ -8,24 +8,53 @@
 from collections import OrderedDict
 from typing import Any, List, Optional
 
+# Formerly known as: _ContextMethodMixin
+class FunctionCtx(object):
 
-class _ContextMethodMixin(object):
-
-    def save_for_backward(self, *tensors):
+    def save_for_backward(self, *tensors: torch.Tensor):
         r"""Saves given tensors for a future call to :func:`~Function.backward`.
 
         **This should be called at most once, and only from inside the**
-        :func:`forward` **method.**
+        :func:`forward` **method. This should only be called with input or
+        output tensors**
 
-        Later, saved tensors can be accessed through the :attr:`saved_tensors`
+        In :func:`backward`, saved tensors can be accessed through the :attr:`saved_tensors`
         attribute. Before returning them to the user, a check is made to ensure
         they weren't used in any in-place operation that modified their content.
 
-        Arguments can also be ``None``.
+        Arguments can also be ``None``. This is a no-op.
+
+        See :ref:`extending-autograd` for more details on how to use this method.
+
+        Example::
+            >>> class Func(Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x: torch.Tensor, y: torch.Tensor, z: int):
+            >>>         w = x * y * z
+            >>>         out = x * y + y * z + w
+            >>>         ctx.save_for_backward(x, y, out)
+            >>>         ctx.z = z  # z is not a tensor
+            >>>         ctx.w = w  # w is neither input nor output
+            >>>         return out
+            >>>
+            >>>     @staticmethod
+            >>>     def backward(ctx, grad_out):
+            >>>         x, y, out = ctx.saved_tensors
+            >>>         z = ctx.z
+            >>>         gx = grad_out * (y + y * z)
+            >>>         gy = grad_out * (x + z + x * z)
+            >>>         gz = None
+            >>>         return gx, gy, gz
+            >>>
+            >>> a = torch.tensor(1., requires_grad=True, dtype=torch.double)
+            >>> b = torch.tensor(2., requires_grad=True, dtype=torch.double)
+            >>> c = 4
+            >>> d = Func.apply(a, b, c)
+
         """
         self.to_save = tensors
 
-    def mark_dirty(self, *args):
+    def mark_dirty(self, *args: torch.Tensor):
         r"""Marks given tensors as modified in an in-place operation.
 
         **This should be called at most once, only from inside the**
@@ -35,6 +64,28 @@ def mark_dirty(self, *args):
         should be given to this function, to ensure correctness of our checks.
         It doesn't matter whether the function is called before or after
         modification.
+
+        Examples::
+            >>> class Inplace(Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x):
+            >>>         x_npy = x.numpy() # x_npy shares storage with x
+            >>>         x_npy += 1
+            >>>         ctx.mark_dirty(x)
+            >>>         return x
+            >>>
+            >>>     @staticmethod
+            >>>     @once_differentiable
+            >>>     def backward(ctx, grad_output):
+            >>>         return grad_output
+            >>>
+            >>> a = torch.tensor(1., requires_grad=True, dtype=torch.double).clone()
+            >>> b = a * a
+            >>> Inplace.apply(a)  # This would lead to wrong gradients!
+            >>>                   # but the engine would not know unless we mark_dirty
+            >>> b.backward() # RuntimeError: one of the variables needed for gradient
+            >>>              # computation has been modified by an inplace operation
+
         """
         self.dirty_tensors = args
 
@@ -44,11 +95,11 @@ def mark_shared_storage(self, *pairs):
             'Tensors with shared storages are automatically tracked. Note '
             'that calls to `set_()` are not tracked')
 
-    def mark_non_differentiable(self, *args):
+    def mark_non_differentiable(self, *args: torch.Tensor):
         r"""Marks outputs as non-differentiable.
 
         **This should be called at most once, only from inside the**
-        :func:`forward` **method, and all arguments should be outputs.**
+        :func:`forward` **method, and all arguments should be tensor outputs.**
 
         This will mark outputs as not requiring gradients, increasing the
         efficiency of backward computation. You still need to accept a gradient
@@ -56,20 +107,73 @@ def mark_non_differentiable(self, *args):
         be a zero tensor with the same shape as the shape of a corresponding
         output.
 
-        This is used e.g. for indices returned from a max :class:`Function`.
+        This is used e.g. for indices returned from a sort. See example::
+            >>> class Func(Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x):
+            >>>         sorted, idx = x.sort()
+            >>>         ctx.mark_non_differentiable(idx)
+            >>>         ctx.save_for_backward(x, idx)
+            >>>         return sorted, idx
+            >>>
+            >>>     @staticmethod
+            >>>     @once_differentiable
+            >>>     def backward(ctx, g1, g2):  # still need to accept g2
+            >>>         x, idx = ctx.saved_tensors
+            >>>         grad_input = torch.zeros_like(x)
+            >>>         grad_input.index_add_(0, idx, g1)
+            >>>         return grad_input
+
         """
         self.non_differentiable = args
 
-    def set_materialize_grads(self, value):
-        r"""Sets whether to materialize output grad tensors. Default is true.
+    def set_materialize_grads(self, value: bool):
+        r"""Sets whether to materialize output grad tensors. Default is ``True``.
 
         **This should be called only from inside the** :func:`forward` **method**
 
-        If true, undefined output grad tensors will be expanded to tensors full
+        If ``True``, undefined output grad tensors will be expanded to tensors full
         of zeros prior to calling the :func:`backward` method.
+
+        Example::
+            >>> class SimpleFunc(Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x):
+            >>>         return x.clone(), x.clone()
+            >>>
+            >>>     @staticmethod
+            >>>     @once_differentiable
+            >>>     def backward(ctx, g1, g2):
+            >>>         return g1 + g2  # No check for None necessary
+            >>>
+            >>> # We modify SimpleFunc to handle non-materialized grad outputs
+            >>> class Func(Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x):
+            >>>         ctx.set_materialize_grads(False)
+            >>>         ctx.save_for_backward(x)
+            >>>         return x.clone(), x.clone()
+            >>>
+            >>>     @staticmethod
+            >>>     @once_differentiable
+            >>>     def backward(ctx, g1, g2):
+            >>>         x, = ctx.saved_tensors
+            >>>         grad_input = torch.zeros_like(x)
+            >>>         if g1 is not None:  # We must check for None now
+            >>>             grad_input += g1
+            >>>         if g2 is not None:
+            >>>             grad_input += g2
+            >>>         return grad_input
+            >>>
+            >>> a = torch.tensor(1., requires_grad=True)
+            >>> b, _ = Func.apply(a)  # induces g2 to be undefined
+
         """
         self.materialize_grads = value
 
+# DO NOT USE: This is only defined to be able to load old serialized models
+_ContextMethodMixin = FunctionCtx
+
 class _HookMixin(object):
 
     @staticmethod
@@ -81,7 +185,7 @@ def _register_hook(backward_hooks, hook):
         return backward_hooks, handle
 
 
-class BackwardCFunction(_C._FunctionBase, _ContextMethodMixin, _HookMixin):
+class BackwardCFunction(_C._FunctionBase, FunctionCtx, _HookMixin):
     def apply(self, *args):
         # _forward_cls is defined by derived class
         return self._forward_cls.backward(self, *args)  # type: ignore[attr-defined]
@@ -103,28 +207,23 @@ def __init__(cls, name, bases, attrs):
 
 
 # mypy doesn't understand `with_metaclass` from torch._six
-class Function(with_metaclass(FunctionMeta, _C._FunctionBase, _ContextMethodMixin, _HookMixin)):  # type: ignore[misc]
-    r"""Records operation history and defines formulas for differentiating ops.
+class Function(with_metaclass(FunctionMeta, _C._FunctionBase, FunctionCtx, _HookMixin)):  # type: ignore[misc]
+    r"""Base class to create custom `autograd.Function`
 
-    See the Note on extending the autograd engine for more details on how to use
-    this class: https://pytorch.org/docs/stable/notes/extending.html#extending-torch-autograd
+    To create a custom `autograd.Function`, subclass this class and implement
+    the :meth:`forward` and :meth`backward` static methods. Then, to use your custom
+    op in the forward pass, call the class method ``apply``. Do not call
+    :meth:`forward` directly.
 
-    Every operation performed on :class:`Tensor` s creates a new function
-    object, that performs the computation, and records that it happened.
-    The history is retained in the form of a DAG of functions, with edges
-    denoting data dependencies (``input <- output``). Then, when backward is
-    called, the graph is processed in the topological ordering, by calling
-    :func:`backward` methods of each :class:`Function` object, and passing
-    returned gradients on to next :class:`Function` s.
+    To ensure correctness and best performance, make sure you are calling the
+    correct methods on ``ctx`` and validating your backward function using
+    :func:`torch.autograd.gradcheck`.
 
-    Normally, the only way users interact with functions is by creating
-    subclasses and defining new operations. This is a recommended way of
-    extending torch.autograd.
+    See :ref:`extending-autograd` for more details on how to use this class.
 
     Examples::
 
         >>> class Exp(Function):
-        >>>
         >>>     @staticmethod
         >>>     def forward(ctx, i):
         >>>         result = i.exp()
@@ -136,7 +235,7 @@ class Function(with_metaclass(FunctionMeta, _C._FunctionBase, _ContextMethodMixi
         >>>         result, = ctx.saved_tensors
         >>>         return grad_output * result
         >>>
-        >>> #Use it by calling the apply method:
+        >>> # Use it by calling the apply method:
         >>> output = Exp.apply(input)
     """
     def __init__(self, *args, **kwargs):
@@ -224,7 +323,7 @@ def wrapper(ctx, *args):
             outputs = (outputs,)
 
         err_fn = _functions.DelayedError(
-            b"trying to differentiate twice a function that was marked"
+            b"trying to differentiate twice a function that was marked "
             b"with @once_differentiable", len(outputs))
 
         # Create aliases of each output that has requires_grad=True. We need

From 50a3b6a6a81395106cbbf0f2a80885d4d43a614f Mon Sep 17 00:00:00 2001
From: Heitor Schueroff <heitorschueroff@fb.com>
Date: Wed, 18 Aug 2021 11:30:44 -0700
Subject: [PATCH 027/530] Make SkipInfo with expected_failure an XFAIL (#63481)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63481

This PR changes the SkipInfo decorators to use unittest.expectedFailure so that the test reports as XFAIL as opposed to PASSED.

Note that changing the expectedFailure here https://github.com/pytorch/pytorch/blob/30e1c74dc19ae2b622b46ebcdb7972c42775ac80/torch/testing/_internal/common_device_type.py#L879 to an XFAIL is not possible because the decision of whether to decorate is delayed until the wrapper function is called.

fixes https://github.com/pytorch/pytorch/issues/63363

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30397154

Pulled By: heitorschueroff

fbshipit-source-id: c5e4911969ad8667763eec4203dbbc6a51178592
---
 torch/testing/_internal/common_methods_invocations.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index f06d3ce899749..5d55f0ec64291 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -6,6 +6,7 @@
 import operator
 import random
 import numbers
+import unittest
 
 import torch
 import numpy as np
@@ -21,7 +22,7 @@
      integral_types_and, all_types, double_types)
 from .._core import _dispatch_dtypes
 from torch.testing._internal.common_device_type import \
-    (expectedFailure, onlyOnCPUAndCUDA, skipIf, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver,
+    (onlyOnCPUAndCUDA, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver,
      skipCPUIfNoLapack, skipCPUIfNoFFT, skipCUDAIfRocm, precisionOverride, toleranceOverride, tol)
 from torch.testing._internal.common_cuda import CUDA11OrLater, SM53OrLater, SM60OrLater
 from torch.testing._internal.common_utils import \
@@ -86,7 +87,7 @@ def __init__(
             active_if: whether tests matching the above arguments should be skipped
             expected_failure: whether to assert that skipped tests fail
         """
-        decorator = expectedFailure(device_type) if expected_failure else skipIf(True, "Skipped!")
+        decorator = unittest.expectedFailure if expected_failure else unittest.skip("Skipped!")
         super().__init__(decorators=decorator, cls_name=cls_name, test_name=test_name,
                          device_type=device_type, dtypes=dtypes, active_if=active_if)
 

From 4e1d84ae8fae49995c8966ccbe0f34360978492f Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Wed, 18 Aug 2021 11:37:07 -0700
Subject: [PATCH 028/530] [doc] pre-commit fix instructions (#61717)

Summary:
fix invalid instruction

Pull Request resolved: https://github.com/pytorch/pytorch/pull/61717

Reviewed By: zhouzhuojie, driazati

Differential Revision: D30359218

Pulled By: malfet

fbshipit-source-id: 61771babeac4d34425a61ce49f38a7099b521eec
---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7d8659a8babff..2d820a360dd7c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1151,7 +1151,7 @@ formatting and semantic checking of code. We provide a pre-commit git hook for
 performing these checks, before a commit is created:
 
   ```bash
-  ln -s ../../tools/git-pre-commit .git/hooks/pre-commit
+  ln -s tools/git-pre-commit .git/hooks/pre-commit
   ```
 
 You'll need to install an appropriately configured flake8; see

From ac1ece054b89ec8e69d5e7f5df08dcc8fd145a1e Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 18 Aug 2021 11:38:11 -0700
Subject: [PATCH 029/530] [DDP][Grad compression] Fix fp16 cpp hook (#63375)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63375

I think tensor.copy_(tensor.to(torch::kFloat16)); will keep it as
float32.

Tested by add the following line:

```
LOG(INFO) << "Type is: " << compressed_tensor.scalar_type();
```

before:

```
I0816 17:03:09.823688 364141 default_comm_hooks.cpp:21] Type is: Float
```
after:

```
I0816 17:01:16.779052 353924 default_comm_hooks.cpp:21] Type is: Half
```
ghstack-source-id: 136056092

Test Plan: ci

Reviewed By: SciPioneer

Differential Revision: D30356256

fbshipit-source-id: 8208a705acd7628541cd43c8bf61d007dfdd2435
---
 .../csrc/distributed/c10d/default_comm_hooks.cpp | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.cpp b/torch/csrc/distributed/c10d/default_comm_hooks.cpp
index 9d13099c424c6..91700baa2e4a5 100644
--- a/torch/csrc/distributed/c10d/default_comm_hooks.cpp
+++ b/torch/csrc/distributed/c10d/default_comm_hooks.cpp
@@ -16,21 +16,23 @@ c10::intrusive_ptr<c10::ivalue::Future> AllReduceCommHook::runHook(
 
 c10::intrusive_ptr<c10::ivalue::Future> FP16CompressCommHook::runHook(
     GradBucket& bucket) {
-  auto& tensor = bucket.getBufferRef();
-  tensor.copy_(tensor.to(torch::kFloat16));
-  std::vector<at::Tensor> tensors = {tensor};
+
+  auto compressed_tensor = bucket.getBufferRef().to(torch::kFloat16);
   // Apply the division first to avoid overflow.
-  tensors[0] /= state_->getSize();
+  compressed_tensor /= state_->getSize();
+  std::vector<at::Tensor> tensors = {compressed_tensor};
 
   auto allreduce_fut = state_->allreduce(tensors)->getFuture();
-  auto decompress = [](c10::ivalue::Future& allreduce_fut) {
+  auto decompressed_tensor = bucket.getBufferRef();
+  auto decompress = [decompressed_tensor](c10::ivalue::Future& allreduce_fut) {
     auto result = allreduce_fut.value();
     TORCH_INTERNAL_ASSERT(
         result.isTensorList(),
         "ProcessGroup::allreduce should return TensorList");
+
     auto reduce_tensor = result.toTensorVector()[0];
-    reduce_tensor.copy_(reduce_tensor.to(torch::kFloat));
-    return c10::IValue(reduce_tensor);
+    decompressed_tensor.copy_(reduce_tensor);
+    return c10::IValue(decompressed_tensor);
   };
 
   return allreduce_fut->then(decompress, allreduce_fut->elementType());

From f0f5cffde9196dd5ef46a6d6ba17975ea995c3ca Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 18 Aug 2021 11:38:11 -0700
Subject: [PATCH 030/530] [DDP] Add a debug check in cpp fp16 compress (#63379)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63379

this codepath has been prone to bugs as seen in the below diff, this
will help ensure against changes/refactors that touch this, as a basic sanity
check. Enabled it in debug-only builds to not affect the perf.
ghstack-source-id: 136056093

Test Plan: CI

Reviewed By: SciPioneer

Differential Revision: D30358440

fbshipit-source-id: e1b3893a223722c2593ceed8696a09c7d07d47c1
---
 torch/csrc/distributed/c10d/default_comm_hooks.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.cpp b/torch/csrc/distributed/c10d/default_comm_hooks.cpp
index 91700baa2e4a5..30bc96b16f7db 100644
--- a/torch/csrc/distributed/c10d/default_comm_hooks.cpp
+++ b/torch/csrc/distributed/c10d/default_comm_hooks.cpp
@@ -1,4 +1,6 @@
 #include <c10d/default_comm_hooks.hpp>
+#include <c10/core/ScalarType.h>
+#include <c10/util/Exception.h>
 
 #include <c10d/ProcessGroup.hpp>
 #include <c10d/comm.hpp>
@@ -31,6 +33,11 @@ c10::intrusive_ptr<c10::ivalue::Future> FP16CompressCommHook::runHook(
         "ProcessGroup::allreduce should return TensorList");
 
     auto reduce_tensor = result.toTensorVector()[0];
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      reduce_tensor.scalar_type() == at::ScalarType::Half,
+      "Expected reduced tensor to be fp16 in FP16CompressHook, but got type ",
+      reduce_tensor.scalar_type()
+    );
     decompressed_tensor.copy_(reduce_tensor);
     return c10::IValue(decompressed_tensor);
   };

From d4593d9d08ac7df2e5cf6f0f24c364c120be51ec Mon Sep 17 00:00:00 2001
From: Michael Dagitses <mikeyd@fb.com>
Date: Wed, 18 Aug 2021 11:39:12 -0700
Subject: [PATCH 031/530] document why wrappers exist in `torch.functional`
 (#62847)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/62844.

These wrappers are not super obvious, but ultimately stem from the lack of support for functions with variadic args in native_functions.yaml. https://github.com/pytorch/pytorch/issues/62845 tracks that issue.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62847

Reviewed By: VitalyFedyunin

Differential Revision: D30305016

Pulled By: dagitses

fbshipit-source-id: 716fcecb0417b770bc92cfd8c54f7ead89070896
---
 torch/functional.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/torch/functional.py b/torch/functional.py
index 78f833eaf5417..81b3de234e1ca 100644
--- a/torch/functional.py
+++ b/torch/functional.py
@@ -66,6 +66,7 @@ def broadcast_tensors(*tensors):
         tensor([[0, 1, 2],
                 [0, 1, 2]])
     """
+    # This wrapper exists to support variadic args.
     if has_torch_function(tensors):
         return handle_torch_function(broadcast_tensors, tensors, *tensors)
     return _VF.broadcast_tensors(tensors)  # type: ignore[attr-defined]
@@ -96,6 +97,7 @@ def broadcast_shapes(*shapes):
     Raises:
         RuntimeError: If shapes are incompatible.
     """
+    # This wrapper exists to support variadic args.
     # TODO Movie this to C++ once the jit has better support for torch.Size.
     with torch.no_grad():
         scalar = torch.zeros((), device="cpu")
@@ -277,6 +279,7 @@ def einsum(*args):
         tensor([[-0.3430, -5.2405,  0.4494],
                 [ 0.3311,  5.5201, -3.0356]])
     """
+    # This wrapper exists to support variadic args.
     if len(args) < 2:
         raise ValueError('einsum(): must specify the equation string and at least one operand, '
                          'or at least one operand and its subscripts list')
@@ -324,6 +327,7 @@ def parse_subscript(n: int) -> str:
     return _VF.einsum(equation, operands)  # type: ignore[attr-defined]
 
 
+# This wrapper exists to support variadic args.
 if TYPE_CHECKING:
     # The JIT doesn't understand Union, so only add type annotation for mypy
     def meshgrid(*tensors: Union[Tensor, List[Tensor]]) -> Tuple[Tensor, ...]:
@@ -1042,6 +1046,7 @@ def cartesian_prod(*tensors):
                 [3, 4],
                 [3, 5]])
     """
+    # This wrapper exists to support variadic args.
     if has_torch_function(tensors):
         return handle_torch_function(cartesian_prod, tensors, *tensors)
     return _VF.cartesian_prod(tensors)  # type: ignore[attr-defined]
@@ -1076,6 +1081,7 @@ def block_diag(*tensors):
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]])
     """
+    # This wrapper exists to support variadic args.
     if has_torch_function(tensors):
         return handle_torch_function(block_diag, tensors, *tensors)
     return torch._C._VariableFunctions.block_diag(tensors)  # type: ignore[attr-defined]
@@ -1163,6 +1169,7 @@ def atleast_1d(*tensors):
         >>> torch.atleast_1d((x,y))
         (tensor([0.5000]), tensor([1.]))
     """
+    # This wrapper exists to support variadic args.
     if has_torch_function(tensors):
         return handle_torch_function(atleast_1d, tensors, *tensors)
     if len(tensors) == 1:
@@ -1199,6 +1206,7 @@ def atleast_2d(*tensors):
         >>> torch.atleast_2d((x,y))
         (tensor([[0.5000]]), tensor([[1.]]))
     """
+    # This wrapper exists to support variadic args.
     if has_torch_function(tensors):
         return handle_torch_function(atleast_2d, tensors, *tensors)
     if len(tensors) == 1:
@@ -1243,6 +1251,7 @@ def atleast_3d(*tensors):
         >>> torch.atleast_3d((x,y))
         (tensor([[[0.5000]]]), tensor([[[1.]]]))
     """
+    # This wrapper exists to support variadic args.
     if has_torch_function(tensors):
         return handle_torch_function(atleast_3d, tensors, *tensors)
     if len(tensors) == 1:
@@ -1479,6 +1488,7 @@ def chain_matmul(*matrices, out=None):
 
     .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition
     """
+    # This wrapper exists to support variadic args.
     if has_torch_function(matrices):
         return handle_torch_function(chain_matmul, matrices, *matrices)
 

From 9bb1371cc20a14907dbc47bc98e3ac5de866e34b Mon Sep 17 00:00:00 2001
From: peterjc123 <peterghost86@gmail.com>
Date: Wed, 18 Aug 2021 11:41:42 -0700
Subject: [PATCH 032/530] Disable RDYNAMIC check with MSVC (#62949)

Summary:
When testing with clang-cl, the flag is added though it is unsupported and that generates a few warnings. Tried a few alternatives like https://cmake.org/cmake/help/latest/module/CheckLinkerFlag.html, but they just don't work.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62949

Reviewed By: zhouzhuojie, driazati

Differential Revision: D30359206

Pulled By: malfet

fbshipit-source-id: 1bd27ad5772fe6757fa8c3a4bddf904f88d70b7b
---
 cmake/MiscCheck.cmake | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/cmake/MiscCheck.cmake b/cmake/MiscCheck.cmake
index efac9e0dfa8e6..1497b0044a0b0 100644
--- a/cmake/MiscCheck.cmake
+++ b/cmake/MiscCheck.cmake
@@ -178,10 +178,12 @@ endif()
 # -to add all (including unused) symbols into the dynamic symbol
 # -table. We need this to get symbols when generating backtrace at
 # -runtime.
-check_cxx_compiler_flag("-rdynamic" COMPILER_SUPPORTS_RDYNAMIC)
-if(${COMPILER_SUPPORTS_RDYNAMIC})
-  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -rdynamic")
-  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic")
+if(NOT MSVC)
+  check_cxx_compiler_flag("-rdynamic" COMPILER_SUPPORTS_RDYNAMIC)
+  if(${COMPILER_SUPPORTS_RDYNAMIC})
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -rdynamic")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic")
+  endif()
 endif()
 
 # ---[ If we are using msvc, set no warning flags

From 11fbd3958cb7ab69a9764d9ae9b000f24333ec40 Mon Sep 17 00:00:00 2001
From: Basil Hosmer <bhosmer@fb.com>
Date: Wed, 18 Aug 2021 12:06:53 -0700
Subject: [PATCH 033/530] MaybeOwned page for dev wiki (#63450)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63450

Brief guide to understanding `MaybeOwned<Tensor>`, aimed at C++ PT devs who are obliged to interact with existing uses of it, rather than encouraging new usage.

For reviewers: I haven't yet added a link to this page from anywhere. I'm thinking the right place is the [dev wiki main page C++ section](https://github.com/pytorch/pytorch/wiki#c) but happy to put it wherever makes sense, suggestions welcome.

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30402313

Pulled By: bhosmer

fbshipit-source-id: 69b15909ecafcd8d88e44f664f88c3ad4eb26d84
---
 docs/cpp/source/notes/maybe_owned.rst | 59 +++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 docs/cpp/source/notes/maybe_owned.rst

diff --git a/docs/cpp/source/notes/maybe_owned.rst b/docs/cpp/source/notes/maybe_owned.rst
new file mode 100644
index 0000000000000..8fa05f1b6aea7
--- /dev/null
+++ b/docs/cpp/source/notes/maybe_owned.rst
@@ -0,0 +1,59 @@
+MaybeOwned<Tensor>
+==================
+
+``MaybeOwned<Tensor>`` is a C++ smart pointer class that dynamically
+encodes whether a Tensor is *owned* or *borrowed*. It is used in
+certain performance-sensitive situations to avoid unnecessarily
+incrementing a Tensor’s reference count (at a small cost in
+overhead from the extra indirection).
+
+.. warning::
+    MaybeOwned must be used with **extreme** care. Claims of (non-)ownership
+    are not statically checked, and mistakes can cause reference undercounting
+    and use-after-free crashes.
+
+    Due to this lack of safety net, we discourage the use of MaybeOwned
+    outside code paths that are known to be highly performance sensitive.
+    However, if you encounter pre-existing uses of MaybeOwned in code that
+    you want to modify, it’s critical to understand how to use it correctly.
+
+The primary use case for ``MaybeOwned<Tensor>`` is a function or method that
+dynamically chooses between returning one of its arguments (typically
+from a passthrough or “no-op” code path) and returning a freshly constructed
+Tensor. Such a function would return a ``MaybeOwned<Tensor>`` in both cases,
+the former in a "borrowed" state via a call to ``MaybeOwned<Tensor>::borrowed()``,
+and the latter in an "owned" state via a call to ``MaybeOwned<Tensor>::owned()``.
+
+The canonical example is ``Tensor``'s ``expect_contiguous`` method, which shortcuts
+and returns a borrowed self-reference when already contiguous:
+
+.. code-block:: cpp
+
+  inline c10::MaybeOwned<Tensor> Tensor::expect_contiguous(MemoryFormat memory_format) const & {
+    if (is_contiguous(memory_format)) {
+      return c10::MaybeOwned<Tensor>::borrowed(*this);
+    } else {
+      return c10::MaybeOwned<Tensor>::owned(__dispatch_contiguous(memory_format));
+    }
+  }
+
+Using the vocabulary of lifetimes, the essential safety requirement for borrowing
+is that a borrowed Tensor must outlive any borrowing references to it. Here, for
+example, we can safely borrow ``*this``, but the Tensor returned by
+``__dispatch_contiguous()`` is freshly created, and borrowing a reference would
+effectively leave it ownerless.
+
+So, general rules of thumb:
+
+- When in doubt, don’t use ``MaybeOwned<Tensor>`` at all - in particular, prefer
+  avoiding using it in code that doesn’t use it already. New usage should only be
+  introduced when critical (and demonstrable) performance gains result.
+
+- When modifying or calling code that already uses ``MaybeOwned<Tensor>``, remember
+  that it's always safe to produce a ``MaybeOwned<Tensor>`` from a Tensor in hand
+  via a call to ``MaybeOwned<Tensor>::owned()``. This may result in an unnecessary
+  reference count, but never in misbehavior - so it's always the safer bet, unless
+  the lifetime of the Tensor you're looking to wrap is crystal clear.
+
+More details and implementation code can be found at <https://github.com/pytorch/pytorch/blob/master/c10/util/MaybeOwned.h> and
+<https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/templates/TensorBody.h>.

From d661e646add3256b85687faab4197d2d1aab3dec Mon Sep 17 00:00:00 2001
From: James Reed <jamesreed@fb.com>
Date: Wed, 18 Aug 2021 13:16:01 -0700
Subject: [PATCH 034/530] [FX] Fix GraphModule deepcopy to use deepcopied graph
 (#63090)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63090

Test Plan: Imported from OSS

Reviewed By: ansley

Differential Revision: D30252471

Pulled By: jamesr66a

fbshipit-source-id: cafd7d7917935a5ea6ffa2a7fe9e9b2a9578b3e3
---
 test/test_fx.py          | 19 +++++++++++++++++++
 torch/fx/graph_module.py |  2 +-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/test/test_fx.py b/test/test_fx.py
index f0a3291d07d4a..1708634653a64 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -1943,6 +1943,25 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> boo
         with self.assertRaisesRegex(RuntimeError, 'cannot contain a Node'):
             traced_graph = MyTracer().trace(CallsModWithDict())
 
+    def test_module_deepcopy_edit_nodes(self):
+        class Foo(torch.nn.Module):
+            def forward(self, x):
+                return torch.relu(x)
+
+        traced1 = symbolic_trace(Foo())
+        copied = copy.deepcopy(traced1)
+
+        for node in copied.graph.nodes:
+            if node.target == torch.relu:
+                node.target = torch.neg
+
+        copied.recompile()
+        traced1.recompile()
+
+        x = torch.randn(15, 15)
+        torch.testing.assert_allclose(traced1(x), torch.relu(x))
+        torch.testing.assert_allclose(copied(x), torch.neg(x))
+
     def test_direct_param_use(self):
         class TransposeTest(torch.nn.Module):
             def __init__(self):
diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
index 0cbbd9373027a..85479f069d53d 100644
--- a/torch/fx/graph_module.py
+++ b/torch/fx/graph_module.py
@@ -615,7 +615,7 @@ def __reduce__(self):
     def __deepcopy__(self, memo):
         fake_mod = torch.nn.Module()
         fake_mod.__dict__ = copy.deepcopy(self.__dict__)
-        return GraphModule(fake_mod, self.graph)
+        return GraphModule(fake_mod, fake_mod.__dict__['_graph'])
 
     def __copy__(self):
         return GraphModule(self, self.graph)

From 99b154b8bebb8eed79d10f5c83798263ae04153f Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Wed, 18 Aug 2021 13:25:19 -0700
Subject: [PATCH 035/530] [ONNX] Support lstm_cell symbolic (#61476) (#62757)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62757

Support lstm_cell symbolic

Test Plan: Imported from OSS

Reviewed By: SplitInfinity

Differential Revision: D30349061

Pulled By: msaroufim

fbshipit-source-id: f236177e3e5c62a30b7e4d91a623bcaef21b5eb1

Co-authored-by: jiafatom <jiafa@microsoft.com>
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 16 ++++++++++++++++
 torch/onnx/symbolic_opset9.py              | 11 +++++++++++
 2 files changed, 27 insertions(+)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index b92568c03cba2..dc376dc817e4f 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -3723,6 +3723,22 @@ def forward(self, x, h0, c0):
         c0 = torch.randn(1, BATCH_SIZE, RNN_HIDDEN_SIZE)
         self.run_test(LSTMModel(), (input, h0, c0))
 
+    @skipIfUnsupportedMinOpsetVersion(9)
+    def test_lstm_cell(self):
+        class LSTMCellModel(torch.nn.Module):
+            def __init__(self, bias):
+                super().__init__()
+                self.lstm_cell = torch.nn.LSTMCell(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, bias=bias)
+
+            def forward(self, x, h0, c0):
+                return self.lstm_cell(x, (h0, c0))
+
+        input = torch.randn(BATCH_SIZE, RNN_INPUT_SIZE)
+        h0 = torch.randn(BATCH_SIZE, RNN_HIDDEN_SIZE)
+        c0 = torch.randn(BATCH_SIZE, RNN_HIDDEN_SIZE)
+        for bias in [True, False]:
+            self.run_test(LSTMCellModel(bias), (input, h0, c0))
+
     @skipIfUnsupportedMinOpsetVersion(9)
     def test_lstm_default_init_state(self):
         class LSTMModel(torch.nn.Module):
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 36c1753ab252b..611749f83f8f7 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -2316,6 +2316,17 @@ def lstm(g, *args):
         return _lstm_full(g, *args)
 
 
+def lstm_cell(g, self, hidden, w_ih, w_hh, b_ih, b_hh):
+    input = sym_help._unsqueeze_helper(g, self, [0])
+    hidden = sym_help._unpack_list(hidden)
+    hidden = [sym_help._unsqueeze_helper(g, x, [0]) for x in hidden]
+    weight = (w_ih, w_hh, b_ih, b_hh) if sym_help._is_tensor(b_ih) else (w_ih, w_hh)
+    has_biases = True if sym_help._is_tensor(b_ih) else False
+    _, h_outs, c_outs = _generic_rnn(g, 'LSTM', input, hidden, weight, has_biases, num_layers=1,
+                                     dropout=0, train=0, bidirectional=False, batch_first=False)
+    return sym_help._squeeze_helper(g, h_outs, [0]), sym_help._squeeze_helper(g, c_outs, [0])
+
+
 def _one_hidden_rnn(kind):
     @parse_args("v", "v", "v", "i", "i", "f", "i", "i", "i")
     def _rnn_full(g, input, hidden, weight_v, has_biases, num_layers, dropout, train, bidirectional, batch_first):

From 3a7bbf5fb70f288050287dd6168c7af0f10de3ec Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Wed, 18 Aug 2021 13:25:19 -0700
Subject: [PATCH 036/530] [ONNX] Add support for opset14 in PT-ONNX exporter
 (#59486) (#62758)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62758

* Add initial changes for opset14

* Fixed flake

* Add onnx submodule changes and removed utility func tests

* Add updated batchNorm symbolic

* Add triu/tril symbolics

* Fix lint

* Fixed test failures

* Add reshape with allowzero

* Added tests/refactored opset versioning

* Bump onnxruntime version

* Fix clang/lint failures

* Add reshape shape inference for opset 14

* Changes for allowzero

* Fix lint/clang and test failures

* Updated PR

* Flake fixes

* Fix flake

* Remove new_jit_api tests

* Add opset14 models

* Update allowzero

* Fix test failures

Test Plan: Imported from OSS

Reviewed By: SplitInfinity

Differential Revision: D30349063

Pulled By: msaroufim

fbshipit-source-id: 54724246149b01a2f627c43d7396253a7e9c9eb9

Co-authored-by: Shubham Bhokare <sbhokare@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
---
 .jenkins/caffe2/test.sh                       |  2 +-
 aten/src/ATen/core/interned_strings.h         |  3 +-
 scripts/onnx/test.sh                          |  2 +-
 test/onnx/test_models_onnxruntime.py          |  2 +-
 test/onnx/test_pytorch_common.py              | 10 +++
 test/onnx/test_pytorch_onnx_onnxruntime.py    | 75 +++++++++++++++-
 test/onnx/test_utility_funs.py                | 30 +++----
 torch/csrc/jit/passes/onnx/constant_fold.cpp  | 13 +--
 torch/csrc/jit/passes/onnx/constant_fold.h    |  1 +
 .../jit/passes/onnx/shape_type_inference.cpp  | 23 +++--
 torch/onnx/__init__.py                        |  2 +-
 torch/onnx/symbolic_helper.py                 | 50 ++++++++++-
 torch/onnx/symbolic_opset11.py                | 23 ++---
 torch/onnx/symbolic_opset12.py                |  6 +-
 torch/onnx/symbolic_opset14.py                | 54 +++++++++++
 torch/onnx/symbolic_opset9.py                 | 89 +++++++------------
 16 files changed, 276 insertions(+), 109 deletions(-)
 create mode 100644 torch/onnx/symbolic_opset14.py

diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh
index e66b7ae958a1e..4c577ed437439 100755
--- a/.jenkins/caffe2/test.sh
+++ b/.jenkins/caffe2/test.sh
@@ -170,7 +170,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
   # JIT C++ extensions require ninja, so put it into PATH.
   export PATH="/var/lib/jenkins/.local/bin:$PATH"
   if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then
-    pip install -q --user onnxruntime==1.7.0
+    pip install -q --user onnxruntime==1.8.0
   fi
   "$ROOT_DIR/scripts/onnx/test.sh"
 fi
diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h
index 2f527cdde5e91..c1dbc75a26b99 100644
--- a/aten/src/ATen/core/interned_strings.h
+++ b/aten/src/ATen/core/interned_strings.h
@@ -466,7 +466,8 @@ namespace c10 {
   _(attr, keepdims)                  \
   _(attr, cache_id)                  \
   _(attr, new_axis)                  \
-  _(attr, warn_id)
+  _(attr, warn_id)                   \
+  _(attr, allowzero)
 
 // 'prim' symbols are synthetic operators that occur only in the IR
 // and don't have corresponding implementations in ATen.
diff --git a/scripts/onnx/test.sh b/scripts/onnx/test.sh
index 4ee0cdad92ad1..f39d4f0fa5abf 100755
--- a/scripts/onnx/test.sh
+++ b/scripts/onnx/test.sh
@@ -79,7 +79,7 @@ if [[ "$BUILD_ENVIRONMENT" == *ort_test1* ]]; then
 fi
 if [[ "$BUILD_ENVIRONMENT" == *ort_test2* ]]; then
   # Update the loop for new opsets
-  for i in $(seq 10 13); do
+  for i in $(seq 10 14); do
     pytest "${args[@]}" \
       "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset$i"
   done
diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py
index be7f8c62176e8..59909db5958cc 100644
--- a/test/onnx/test_models_onnxruntime.py
+++ b/test/onnx/test_models_onnxruntime.py
@@ -7,7 +7,7 @@
 
 
 def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7, opset_versions=None):
-    opset_versions = opset_versions if opset_versions else [7, 8, 9, 10, 11, 12]
+    opset_versions = opset_versions if opset_versions else [7, 8, 9, 10, 11, 12, 13, 14]
 
     for opset_version in opset_versions:
         self.opset_version = opset_version
diff --git a/test/onnx/test_pytorch_common.py b/test/onnx/test_pytorch_common.py
index 0695a989013c7..09ab7a26f4967 100644
--- a/test/onnx/test_pytorch_common.py
+++ b/test/onnx/test_pytorch_common.py
@@ -60,6 +60,16 @@ def wrapper(self):
         return wrapper
     return skip_dec
 
+# skips tests for all opset versions.
+def skipForAllOpsetVersions():
+    def skip_dec(func):
+        def wrapper(self):
+            if self.opset_version:
+                raise unittest.SkipTest("Skip verify test for unsupported opset_version")
+            return func(self)
+        return wrapper
+    return skip_dec
+
 # Enables tests for scripting, instead of only tracing the model.
 def enableScriptTest():
     def script_dec(func):
diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index dc376dc817e4f..fe877948666dd 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -17,7 +17,7 @@
                                                        RnnModelWithPackedSequenceWithoutState)
 from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion,
                                  skipIfNoLapack, disableScriptTest, skipIfONNXShapeInference,
-                                 skipIfUnsupportedMaxOpsetVersion)
+                                 skipIfUnsupportedMaxOpsetVersion, skipForAllOpsetVersions)
 from test_pytorch_common import BATCH_SIZE
 from test_pytorch_common import RNN_BATCH_SIZE, RNN_SEQUENCE_LENGTH, RNN_INPUT_SIZE, RNN_HIDDEN_SIZE
 from typing import List, Tuple, Optional, Dict
@@ -2489,6 +2489,18 @@ def forward(self, x):
         x = torch.empty(2, 3, 3, dtype=torch.double).uniform_(0, 1)
         self.run_test(Bernoulli(), x)
 
+    # Enable test when fix for allowzero is in ORT
+    @skipForAllOpsetVersions()
+    @skipIfUnsupportedMinOpsetVersion(14)
+    def test_reshape_allowzero(self):
+        class ReshapeModel(torch.nn.Module):
+            def forward(self, x):
+                x = x.reshape(3, 4, 0)
+                return x
+
+        x = torch.randn(0, 3, 4)
+        self.run_test(ReshapeModel(), x)
+
     def test_reshape_different_rank(self):
         class ReshapeModel(torch.nn.Module):
             def forward(self, x):
@@ -3283,6 +3295,7 @@ def test_batchnorm1d_noaffine(self):
         x = torch.randn(10, 10, 128)
         self.run_test(model, x)
 
+    @skipIfUnsupportedOpsetVersion([14])
     def test_batchnorm1d_norunningstats(self):
         x = torch.randn(10, 10)
         model = torch.nn.BatchNorm1d(10, track_running_stats=False)
@@ -3301,6 +3314,7 @@ def test_batchnorm2d_noaffine(self):
         model = torch.nn.BatchNorm2d(3, affine=False)
         self.run_test(model, x)
 
+    @skipIfUnsupportedOpsetVersion([14])
     def test_batchnorm2d_norunningstats(self):
         x = torch.randn(10, 3, 128, 128)
         model = torch.nn.BatchNorm2d(3, track_running_stats=False)
@@ -5709,6 +5723,52 @@ def forward(self, x):
         x = torch.randn(2, 3, 4)
         self.run_test(SiLUModel(), (x))
 
+    @skipIfUnsupportedMinOpsetVersion(14)
+    def test_tril(self):
+        class trilModel(torch.nn.Module):
+            def forward(self, x):
+                return torch.tril(x)
+
+        x = torch.randn(2, 3, 4)
+        self.run_test(trilModel(), (x))
+
+        class trilModelwithDiagonal(torch.nn.Module):
+            def forward(self, x):
+                return torch.tril(x, diagonal=1)
+
+        x = torch.randn(2, 3, 4)
+        self.run_test(trilModelwithDiagonal(), (x))
+
+        class trilModelwithNegDiagonal(torch.nn.Module):
+            def forward(self, x):
+                return torch.tril(x, diagonal=-1)
+
+        x = torch.randn(2, 3, 4)
+        self.run_test(trilModelwithNegDiagonal(), (x))
+
+    @skipIfUnsupportedMinOpsetVersion(14)
+    def test_triu(self):
+        class triuModel(torch.nn.Module):
+            def forward(self, x):
+                return torch.triu(x)
+
+        x = torch.randn(2, 3, 4)
+        self.run_test(triuModel(), (x))
+
+        class triuModelwithDiagonal(torch.nn.Module):
+            def forward(self, x):
+                return torch.triu(x, diagonal=1)
+
+        x = torch.randn(2, 3, 4)
+        self.run_test(triuModelwithDiagonal(), (x))
+
+        class trilModelwithNegDiagonal(torch.nn.Module):
+            def forward(self, x):
+                return torch.tril(x, diagonal=-1)
+
+        x = torch.randn(2, 3, 4)
+        self.run_test(trilModelwithNegDiagonal(), (x))
+
     def test_mish(self):
         class MishModel(torch.nn.Module):
             def __init__(self):
@@ -7586,6 +7646,8 @@ def forward(self, input):
         x = torch.randn(6, 4, 3, 3)
         self.run_test(FakeQuantizePerChannelModel(), (x))
 
+    # Tests skipped temporarliy as latest onnxruntime release does not include training ops
+    @skipForAllOpsetVersions()
     def test_batchnorm_training(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -7709,6 +7771,8 @@ def forward(self, x):
 
         np.testing.assert_allclose(ratio_pytorch, ratio_ort, rtol=0.01, atol=0.01)
 
+    # Tests skipped temporarliy as latest onnxruntime release does not include training ops
+    @skipForAllOpsetVersions()
     def test_conv_bn(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -7745,6 +7809,8 @@ def forward(self, x):
         [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in
          zip(ort_outs1, ort_outs2)]
 
+    # Tests skipped temporarliy as latest onnxruntime release does not include training ops
+    @skipForAllOpsetVersions()
     def test_multiple_conv_bn(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -9563,5 +9629,12 @@ def setup_rnn_tests():
                                     keep_initializers_as_inputs=False,
                                     onnx_shape_inference=True))
 
+# opset 14 tests
+TestONNXRuntime_opset14 = type(str("TestONNXRuntime_opset14"),
+                               (unittest.TestCase,),
+                               dict(TestONNXRuntime.__dict__, opset_version=14,
+                                    keep_initializers_as_inputs=False,
+                                    onnx_shape_inference=True))
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py
index 71f52b306b8c4..06faf410f865e 100644
--- a/test/onnx/test_utility_funs.py
+++ b/test/onnx/test_utility_funs.py
@@ -5,7 +5,8 @@
 from torch.onnx import utils, OperatorExportTypes, TrainingMode
 from torch.onnx.symbolic_helper import _set_opset_version, _set_operator_export_type, _set_onnx_shape_inference
 import torch.utils.cpp_extension
-from test_pytorch_common import skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion
+from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion,
+                                 skipIfUnsupportedMaxOpsetVersion)
 import caffe2.python.onnx.backend as backend
 from verify import verify
 
@@ -635,7 +636,7 @@ def test_aten_fallthrough(self):
         # Test aten export of op with no symbolic
         class Module(torch.nn.Module):
             def forward(self, x):
-                return torch.triu(x)
+                return torch.erfc(x)
 
         x = torch.randn(2, 3, 4)
         _set_opset_version(self.opset_version)
@@ -643,8 +644,7 @@ def forward(self, x):
                                             operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH,
                                             input_names=['x'], dynamic_axes={'x': [0, 1, 2]})
         iter = graph.nodes()
-        assert next(iter).kind() == "onnx::Constant"
-        assert next(iter).kind() == "aten::triu"
+        assert next(iter).kind() == "aten::erfc"
 
     def test_custom_op_fallthrough(self):
         # Test custom op
@@ -731,7 +731,7 @@ def forward(self, x):
         assert next(iter).kind() == "aten::dequantize"
 
     # prim::ListConstruct is exported as onnx::SequenceConstruct for opset >= 11
-    @skipIfUnsupportedOpsetVersion([11, 12, 13])
+    @skipIfUnsupportedMaxOpsetVersion(10)
     def test_prim_fallthrough(self):
         # Test prim op
         class PrimModule(torch.jit.ScriptModule):
@@ -836,6 +836,7 @@ def forward(self, x):
         x = torch.tensor([1, 2])
         verify(MyModel(), x, backend, do_constant_folding=False)
 
+    @skipIfUnsupportedOpsetVersion([14])
     def test_fuse_conv_bn(self):
         class Fuse(torch.nn.Module):
             def __init__(self):
@@ -857,6 +858,7 @@ def forward(self, x):
 
         assert len(list(graph.nodes())) == 1
 
+    @skipIfUnsupportedOpsetVersion([14])
     def test_fuse_resnet18(self):
         model = torchvision.models.resnet18(pretrained=True)
         x = torch.randn(2, 3, 224, 224, requires_grad=True)
@@ -917,20 +919,10 @@ def forward(self, x, y):
                                (TestCase,),
                                dict(TestUtilityFuns.__dict__, opset_version=13))
 
-# opset 11 tests
-TestUtilityFuns_opset11_new_jit_API = type(str("TestUtilityFuns_opset11_new_jit_API"),
-                                           (TestCase,),
-                                           dict(TestUtilityFuns.__dict__, opset_version=11))
-
-# opset 12 tests
-TestUtilityFuns_opset12_new_jit_API = type(str("TestUtilityFuns_opset12_new_jit_API"),
-                                           (TestCase,),
-                                           dict(TestUtilityFuns.__dict__, opset_version=12))
-
-# opset 13 tests
-TestUtilityFuns_opset13_new_jit_API = type(str("TestUtilityFuns_opset13_new_jit_API"),
-                                           (TestCase,),
-                                           dict(TestUtilityFuns.__dict__, opset_version=13))
+# opset 14 tests
+TestUtilityFuns_opset14 = type(str("TestUtilityFuns_opset14"),
+                               (TestCase,),
+                               dict(TestUtilityFuns.__dict__, opset_version=14))
 
 
 if __name__ == "__main__":
diff --git a/torch/csrc/jit/passes/onnx/constant_fold.cpp b/torch/csrc/jit/passes/onnx/constant_fold.cpp
index 901844cd62380..76c0674e11fd8 100644
--- a/torch/csrc/jit/passes/onnx/constant_fold.cpp
+++ b/torch/csrc/jit/passes/onnx/constant_fold.cpp
@@ -266,9 +266,7 @@ c10::optional<at::Tensor> runTorchBackendForOnnx(
   if (node->kind() == onnx::Slice) {
     if (opset_version == ONNX_OPSET_9) {
       return runTorchSlice_opset9(node, inputTensorValues);
-    } else if (
-        opset_version == ONNX_OPSET_10 || opset_version == ONNX_OPSET_11 ||
-        opset_version == ONNX_OPSET_12 || opset_version == ONNX_OPSET_13) {
+    } else if (opset_version >= ONNX_OPSET_10) {
       return runTorchSlice_opset10(node, inputTensorValues);
     } else {
       std::cerr << "Warning: Constant folding - unsupported opset version. "
@@ -351,7 +349,7 @@ c10::optional<at::Tensor> runTorchBackendForOnnx(
     }
   } else if (node->kind() == onnx::Squeeze) {
     assert(inputTensorValues.size() == 2 || inputTensorValues.size() == 1);
-    if (opset_version == ONNX_OPSET_13) {
+    if (opset_version >= ONNX_OPSET_13) {
       // Squeeze version 13 input axes is optional, inputTensorValues.size() ==
       // 1 means axes equal to None
       updated_val = inputTensorValues[0];
@@ -415,13 +413,18 @@ c10::optional<at::Tensor> runTorchBackendForOnnx(
     std::vector<int64_t> shape(inputTensorValues[1].sizes()[0], 0);
     auto shape_a = inputTensorValues[1].accessor<int64_t, 1>();
     assert(inputTensorValues[1].sizes()[0] >= 0);
+    // Set value of allowzero
+    int64_t allowzero = 0;
+    if (node->hasAttributeS("allowzero")) {
+      allowzero = node->i(attr::allowzero);
+    }
     for (size_t i = 0; i < (size_t)(inputTensorValues[1].sizes()[0]); ++i) {
       // All shape dim values should be >= -1
       // onnx::Reshape supports a shape dim value to be zero, in
       // which case the actual dim value remains unchanged. However,
       // at::reshape does not support shape dim value to be zero
       assert(shape_a[i] >= -1);
-      if (shape_a[i] == 0) {
+      if (shape_a[i] == 0 && !allowzero) {
         if (i >= inputTensorValues[0].sizes().size()) {
           throw std::runtime_error(
               "Dimension with value 0 exceeds the input size dimensions.");
diff --git a/torch/csrc/jit/passes/onnx/constant_fold.h b/torch/csrc/jit/passes/onnx/constant_fold.h
index 1c54412ccd7a1..8bfb0dd081c39 100644
--- a/torch/csrc/jit/passes/onnx/constant_fold.h
+++ b/torch/csrc/jit/passes/onnx/constant_fold.h
@@ -13,6 +13,7 @@ const int ONNX_OPSET_10 = 10;
 const int ONNX_OPSET_11 = 11;
 const int ONNX_OPSET_12 = 12;
 const int ONNX_OPSET_13 = 13;
+const int ONNX_OPSET_14 = 14;
 
 namespace onnx_constant_fold {
 
diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
index 634d8d2e1db99..f630cf023f7b5 100644
--- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
+++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
@@ -404,8 +404,10 @@ c10::optional<at::Tensor> ComputeConstantFolding(Node* n, int opset_version) {
 // When the Reshape node's two inputs are constant, compute the output shape.
 // The reshape value 0 and -1 are converted to the real value explicitly.
 std::vector<int64_t> ComputeShapeFromReshape(
+    Node* n,
     const std::vector<int64_t>& input_shape,
-    const std::vector<int64_t>& reshape) {
+    const std::vector<int64_t>& reshape,
+    int opset_version) {
   TORCH_INTERNAL_ASSERT(
       input_shape.size() > 0 || reshape.size() > 0,
       "Reshape node should have at least one input size > 0 when constant folding.");
@@ -427,6 +429,17 @@ std::vector<int64_t> ComputeShapeFromReshape(
   auto reshape_size = static_cast<int>(reshape.size());
   auto it_0 = std::find(reshape.begin(), reshape.end(), 0);
   auto reshape_has_zero = it_0 != reshape.end();
+
+  // Allowzero is set to 0 by default
+  // When opset version > 14, assign appropriate allowzero value
+  int allowzero = 0;
+  if (opset_version >= 14 && n->hasAttributeS("allowzero")) {
+    allowzero = n->i(attr::allowzero);
+    if (allowzero == 1 && reshape_has_zero) {
+      return reshape;
+    }
+  }
+
   auto input_shape_size = static_cast<int>(input_shape.size());
   auto it_minus_one = std::find(reshape.begin(), reshape.end(), -1);
   int minus_one_pos = it_minus_one == reshape.end()
@@ -594,7 +607,7 @@ c10::optional<std::vector<int64_t>> GetValueFromListConstructNode(
       : c10::nullopt;
 }
 
-void ProcessReshapeNode(Node* n) {
+void ProcessReshapeNode(Node* n, int opset_version) {
   if (ConstantValueMap::HasValue(n->input(1)->debugName())) {
     auto shape_temp =
         ConstantValueMap::GetValueInto1DInt64Vector(n->input(1)->debugName());
@@ -602,8 +615,8 @@ void ProcessReshapeNode(Node* n) {
         ConstantValueMap::GetShapeInto1DInt64VectorWithOneUnknown(
             n->input(0)->debugName());
     if (shape_vector_0.has_value()) {
-      auto final_shape =
-          ComputeShapeFromReshape(shape_vector_0.value(), shape_temp);
+      auto final_shape = ComputeShapeFromReshape(
+          n, shape_vector_0.value(), shape_temp, opset_version);
       UpdateShapeFromVector(n->output(), final_shape);
       return;
     }
@@ -865,7 +878,7 @@ void ComputeConstant(Node* n, int opset_version) {
       break;
     }
     case ::c10::onnx::Reshape: {
-      ProcessReshapeNode(n);
+      ProcessReshapeNode(n, opset_version);
       break;
     }
     case ::c10::onnx::Gather: {
diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py
index c859419cf38a5..c143d0ce8984d 100644
--- a/torch/onnx/__init__.py
+++ b/torch/onnx/__init__.py
@@ -13,7 +13,7 @@
 ir_version = _C._onnx.IR_VERSION
 producer_name = "pytorch"
 producer_version = _C._onnx.PRODUCER_VERSION
-constant_folding_opset_versions = [9, 10, 11, 12, 13]
+constant_folding_opset_versions = [9, 10, 11, 12, 13, 14]
 
 
 class ExportTypes:
diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py
index 5b378ecc214ce..66f276ce42f55 100644
--- a/torch/onnx/symbolic_helper.py
+++ b/torch/onnx/symbolic_helper.py
@@ -298,7 +298,7 @@ def _select_helper(g, self, dim, index, apply_reshape=True):
     elif index_dim is not None and apply_reshape:
         if index_dim == 0:
             # Index is a scalar. Reshape it to a size 1 tensor.
-            index = g.op("Reshape", index, g.op("Constant", value_t=torch.LongTensor([1])))
+            index = _reshape_helper(g, index, g.op("Constant", value_t=torch.LongTensor([1])))
 
     index_scalar_type = index.type().scalarType()
     if index_scalar_type is None or index_scalar_type not in ["Long", "Int"]:
@@ -367,7 +367,7 @@ def _topk_helper(g, input, k, dim, largest=True, sorted=False, out=None):
     if not _is_value(k):
         k = g.op("Constant", value_t=torch.tensor([k], dtype=torch.int64))
     else:
-        k = g.op("Reshape", k, g.op("Constant", value_t=torch.tensor([1])))
+        k = _reshape_helper(g, k, g.op("Constant", value_t=torch.tensor([1])))
     if _export_onnx_opset_version <= 10:
         if not largest:
             _unimplemented("TopK", "Ascending is not supported")
@@ -704,6 +704,48 @@ def _index_fill_reshape_helper(g, self, dim, index):
     expanded_index = expand(g, unsqueezed_index, expanded_index_shape, None)
     return expanded_index_shape, expanded_index
 
+# When using reshape helper (opset_version >= 14), if reshape has -1,
+# allowzero cannot be set to 1
+def _reshape_helper(g, input, shape, allowzero=0):
+    shape = _maybe_get_const(shape, "is")
+    if not _is_value(shape):
+        shape = g.op("Constant", value_t=torch.LongTensor(shape))
+    if _export_onnx_opset_version <= 13:
+        return g.op("Reshape", input, shape)
+    else:
+        warnings.warn("allowzero=0 by default. In order to honor zero value in shape use allowzero=1")
+        return g.op("Reshape", input, shape, allowzero_i=allowzero)
+
+def _batchnorm_helper(g, input, weight, bias, running_mean, running_var):
+    from torch.onnx.symbolic_opset9 import _var_mean
+    batch_size = _get_tensor_dim_size(input, 0)
+    channel_size = _get_tensor_dim_size(input, 1)
+
+    if weight is None or _is_none(weight):
+        if channel_size is None:
+            raise RuntimeError("Unsupported: ONNX export of batch_norm for unknown "
+                               "channel size.")
+        weight_value = torch.tensor([1.] * channel_size).type(
+            "torch." + input.type().scalarType() + "Tensor")
+        weight = g.op("Constant", value_t=weight_value)
+    if bias is None or _is_none(bias):
+        if channel_size is None:
+            raise RuntimeError("Unsupported: ONNX export of batch_norm for unknown "
+                               "channel size.")
+        bias_value = torch.tensor([0.] * channel_size).type(
+            "torch." + input.type().scalarType() + "Tensor")
+        bias = g.op("Constant", value_t=bias_value)
+    # If track_running_stats is set to False batch statistics are instead used during evaluation time
+    if running_mean is None or _is_none(running_mean) or running_var is None or _is_none(running_var):
+        assert batch_size is not None and channel_size is not None
+        reshape_in = _reshape_helper(g, input,
+                                     g.op("Constant", value_t=torch.tensor([batch_size, channel_size, -1],
+                                          dtype=torch.int64)))
+        trans_in = g.op("Transpose", reshape_in, perm_i=[0, 2, 1])
+        running_var, running_mean = _var_mean(g, trans_in,
+                                              g.op("Constant", value_t=torch.tensor([0, 1], dtype=torch.int64)),
+                                              False, False)
+    return weight, bias, running_mean, running_var
 
 def _avgpool_helper(tuple_fn, padding, kernel_size, stride, divisor_override, name):
     if divisor_override and divisor_override.node().kind() != "prim::Constant":
@@ -787,8 +829,8 @@ def _handle_reduce_dim_none(g, self, op_name):
 
 
 _default_onnx_opset_version = 9
-_onnx_main_opset = 13
-_onnx_stable_opsets = [7, 8, 9, 10, 11, 12]
+_onnx_main_opset = 14
+_onnx_stable_opsets = [7, 8, 9, 10, 11, 12, 13]
 _export_onnx_opset_version = _default_onnx_opset_version
 
 
diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py
index eaa49c29e1546..ed7abf263f31c 100644
--- a/torch/onnx/symbolic_opset11.py
+++ b/torch/onnx/symbolic_opset11.py
@@ -179,7 +179,7 @@ def index_put(g, self, indices_list_value, values, accumulate=False):
     rank = sym_help._get_tensor_rank(values)
     if rank is not None and rank == 0:
         values = expand(g, values, values_shape, None)
-    values = g.op("Reshape", values, values_shape)
+    values = sym_help._reshape_helper(g, values, values_shape)
 
     dtype = self.type().scalarType()
     if dtype is not None and dtype != values.type().scalarType():
@@ -266,12 +266,12 @@ def masked_select(g, self, mask):
 
 
 def masked_scatter(g, self, mask, source):
-    from torch.onnx.symbolic_opset9 import nonzero, expand_as, view, size
+    from torch.onnx.symbolic_opset9 import nonzero, expand_as, size
     index = nonzero(g, expand_as(g, mask, self))
     # NOTE: source can have more elements than needed.
     # It could also have arbitrary shape.
     # This is not supported by ONNX::ScatterND, so we need to flatten and slice source tensor.
-    source = view(g, source, torch.LongTensor([-1]))
+    source = sym_help._reshape_helper(g, source, torch.LongTensor([-1]))
     source = sym_help._slice_helper(g, source,
                                     axes=torch.LongTensor([0]),
                                     starts=torch.LongTensor([0]),
@@ -453,9 +453,9 @@ def _prepare_onnx_paddings(g, dim, pad):
     # paddings = [[..., 0, dim_n-1_begin, dim_n_begin],
     #               [..., 0, dim_n-1_end, dim_n_end]]
     # Reshape back to 1-D paddings = [..., 0, dim_n - 1_begin, dim_n_begin, ..., 0, dim_n - 1_end, dim_n_end]
-    paddings = g.op("Reshape", paddings, g.op("Constant", value_t=torch.tensor([-1, 2])))
+    paddings = sym_help._reshape_helper(g, paddings, g.op("Constant", value_t=torch.tensor([-1, 2])))
     paddings = g.op("Transpose", torch.onnx.symbolic_opset10.flip(g, paddings, [0]), perm_i=[1, 0])
-    paddings = g.op("Reshape", paddings, g.op("Constant", value_t=torch.tensor([-1])))
+    paddings = sym_help._reshape_helper(g, paddings, g.op("Constant", value_t=torch.tensor([-1])))
     padding_c = g.op("Cast", paddings, to_i=sym_help.cast_pytorch_to_onnx["Long"])
     return padding_c
 
@@ -695,7 +695,7 @@ def _get_im2col_indices_along_dim(g, input_d, kernel_size_d, dilation_d, padding
     # Broadcast and add kernel staring positions (indices) with
     # kernel_grid along dim d, to get block indices along dim d
     blocks_d_indices = sym_help._unsqueeze_helper(g, blocks_d_indices, [0])  # Reshape to [1, -1]
-    kernel_mask = g.op("Reshape", kernel_grid, g.op("Constant", value_t=torch.tensor([-1, 1])))
+    kernel_mask = sym_help._reshape_helper(g, kernel_grid, g.op("Constant", value_t=torch.tensor([-1, 1])))
     block_mask = g.op("Add", blocks_d_indices, kernel_mask)
 
     return block_mask
@@ -766,7 +766,7 @@ def im2col(g, input, kernel_size, dilation, padding, stride):
     output = g.op("Gather", padded_input, blocks_row_indices, axis_i=2)
     output = g.op("Gather", output, blocks_col_indices, axis_i=4)
     output = g.op("Transpose", output, perm_i=[0, 1, 2, 4, 3, 5])
-    return g.op("Reshape", output, output_shape)
+    return sym_help._reshape_helper(g, output, output_shape)
 
 
 def narrow(g, input, dim, start, length):
@@ -895,13 +895,12 @@ def chunk(g, self, chunks, dim):
     return split(g, self, chunk_vec, dim)
 
 def repeat_interleave(g, self, repeats, dim=None, output_size=None):
-    from torch.onnx.symbolic_opset9 import reshape
     input = self
     final_dim = dim
     # if dim is None flatten
     # By default, use the flattened input array, and return a flat output array
     if sym_help._is_none(dim):
-        input = reshape(g, self, g.op("Constant", value_t=torch.tensor([-1])))
+        input = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1])))
         dim = 0
     else:
         dim = sym_help._maybe_get_scalar(dim)
@@ -983,7 +982,8 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None):
                 loop_block.op("Constant", value_t=torch.LongTensor(input_sizes[dim + 1:]))]
     r_concat = loop_block.op("Concat", *r_concat, axis_i=0)
     i_split = expand(loop_block, i_split, r_concat, None)
-    i_split = reshape(loop_block, i_split, g.op("Constant", value_t=torch.LongTensor(output_sizes)))
+    i_split = sym_help._reshape_helper(loop_block, i_split,
+                                       g.op("Constant", value_t=torch.LongTensor(output_sizes)))
 
     # Loop outputs
     cond_out = loop_block.op("Cast", loop_condition, to_i=9)
@@ -995,7 +995,8 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None):
     # the zero'th dimension (by default). In order to avoid this and concatenate
     # along the dimension provided, some post-processing is required
     loop_out = g.op("Transpose", loop_out, perm_i=perm_i)
-    return reshape(g, loop_out, g.op("Constant", value_t=torch.LongTensor(output_sizes)))
+    return sym_help._reshape_helper(g, loop_out,
+                                    g.op("Constant", value_t=torch.LongTensor(output_sizes)))
 
 
 def normal(g, loc, scale, seed):
diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py
index 58420a2bc7749..8e989cccf9867 100644
--- a/torch/onnx/symbolic_opset12.py
+++ b/torch/onnx/symbolic_opset12.py
@@ -123,8 +123,7 @@ def celu(g, self, alpha):
 
 def argmax(g, input, dim, keepdim):
     if sym_help._is_none(dim):
-        from torch.onnx.symbolic_opset9 import reshape
-        flattened = reshape(g, input, g.op("Constant", value_t=torch.tensor([-1])))
+        flattened = sym_help._reshape_helper(g, input, g.op("Constant", value_t=torch.tensor([-1])))
         return g.op("ArgMax", flattened, axis_i=0, keepdims_i=False, select_last_index_i=False)
     else:
         dim = _parse_arg(dim, "i")
@@ -134,8 +133,7 @@ def argmax(g, input, dim, keepdim):
 
 def argmin(g, input, dim, keepdim):
     if sym_help._is_none(dim):
-        from torch.onnx.symbolic_opset9 import reshape
-        flattened = reshape(g, input, g.op("Constant", value_t=torch.tensor([-1])))
+        flattened = sym_help._reshape_helper(g, input, g.op("Constant", value_t=torch.tensor([-1])))
         return g.op("ArgMin", flattened, axis_i=0, keepdims_i=False, select_last_index_i=False)
     else:
         dim = _parse_arg(dim, "i")
diff --git a/torch/onnx/symbolic_opset14.py b/torch/onnx/symbolic_opset14.py
new file mode 100644
index 0000000000000..443aac97c9404
--- /dev/null
+++ b/torch/onnx/symbolic_opset14.py
@@ -0,0 +1,54 @@
+# EDITING THIS FILE? READ THIS FIRST!
+# see Note [Edit Symbolic Files] in symbolic_helper.py
+
+# This file exports ONNX ops for opset 14
+import torch
+
+import torch.onnx.symbolic_helper as sym_help
+from torch.onnx.symbolic_helper import parse_args
+
+# Note [ONNX operators that are added/updated in opset 14]
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# New operators:
+#   HardSwish, Trilu
+#
+# Updated operators:
+#   Reshape
+#   Add, Sub, Mul, Div
+#   GRU, LSTM, RNN
+#   BatchNorm, Cumsum, Relu
+
+@parse_args("v")
+def hardswish(g, self):
+    return g.op("HardSwish", self)
+
+@parse_args("v", "i")
+def tril(g, self, diagonal, out=None):
+    k = g.op("Constant", value_t=torch.tensor(diagonal, dtype=torch.int64))
+    return g.op("Trilu", self, k, upper_i=0)
+
+@parse_args("v", "i")
+def triu(g, self, diagonal, out=None):
+    k = g.op("Constant", value_t=torch.tensor(diagonal, dtype=torch.int64))
+    return g.op("Trilu", self, k, upper_i=1)
+
+@parse_args("v", "v")
+def reshape(g, self, shape):
+    return sym_help._reshape_helper(g, self, shape)
+
+@parse_args("v", "v", "v", "v", "v", "i", "f", "f", "i")
+def batch_norm(g, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled):
+    sym_help.assert_training_mode(training, "batch_norm")
+    weight, bias, running_mean, running_var = sym_help._batchnorm_helper(g, input, weight, bias, running_mean, running_var)
+    out = g.op("BatchNormalization", input, weight, bias, running_mean, running_var,
+               epsilon_f=eps,
+               momentum_f=1 - momentum,
+               training_mode_i=0 if not training else 1,
+               outputs=1 if not training else 3)
+    if not training:
+        return out
+    else:
+        res, new_running_mean, new_running_var = out
+        new_running_mean.setType(running_mean.type())
+        new_running_var.setType(running_var.type())
+        return res
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 611749f83f8f7..616c2fcaff5b8 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -70,11 +70,11 @@ def _shape_as_tensor(g, input):
 def _reshape_from_tensor(g, input, shape):
     if (isinstance(shape, list)):
         shape = g.op("Concat", *shape, axis_i=0)
-    return g.op("Reshape", input, shape)
+    return reshape(g, input, shape)
 
 
 def reshape(g, self, shape):
-    return view(g, self, shape)
+    return sym_help._reshape_helper(g, self, shape)
 
 
 def reshape_as(g, self, other):
@@ -461,7 +461,7 @@ def expand(g, self, size, implicit):
         # Expand with -1 dim value means dim is unchanged.
         # Since onnx::expand supports two-way broadcasting,
         # -1 dim value can be exported to onnx as 1
-        size = view(g, stack(g, size, 0), g.op("Constant", value_t=torch.tensor([-1])))
+        size = sym_help._reshape_helper(g, stack(g, size, 0), g.op("Constant", value_t=torch.tensor([-1])))
     dtype = 4  # dim type is int64
     ones = ones_like(g, size, dtype)
     neg_ones = mul(g, ones, g.op("Constant", value_t=torch.tensor(-1)))
@@ -566,17 +566,12 @@ def permute(g, self, dims):
 
 
 def view(g, self, size):
-    size = sym_help._maybe_get_const(size, "is")
-    if sym_help._is_value(size):
-        shape = size
-    else:
-        shape = g.op("Constant", value_t=torch.LongTensor(size))
-    return g.op("Reshape", self, shape)
+    return reshape(g, self, size)
 
 
 def view_as(g, self, other):
     shape = g.op("Shape", other)
-    return g.op("Reshape", self, shape)
+    return reshape(g, self, shape)
 
 
 def prim_ConstantSplit(g, self, split_size, dim):
@@ -1349,32 +1344,7 @@ def conv_transpose3d(g, input, weight, bias, stride, padding, output_padding, gr
 @parse_args("v", "v", "v", "v", "v", "i", "f", "f", "i")
 def batch_norm(g, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled):
     sym_help.assert_training_mode(training, "batch_norm")
-    batch_size = sym_help._get_tensor_dim_size(input, 0)
-    channel_size = sym_help._get_tensor_dim_size(input, 1)
-
-    if weight is None or sym_help._is_none(weight):
-        if channel_size is None:
-            raise RuntimeError("Unsupported: ONNX export of batch_norm for unknown "
-                               "channel size.")
-        weight_value = torch.tensor([1.] * channel_size).type(
-            "torch." + input.type().scalarType() + "Tensor")
-        weight = g.op("Constant", value_t=weight_value)
-    if bias is None or sym_help._is_none(bias):
-        if channel_size is None:
-            raise RuntimeError("Unsupported: ONNX export of batch_norm for unknown "
-                               "channel size.")
-        bias_value = torch.tensor([0.] * channel_size).type(
-            "torch." + input.type().scalarType() + "Tensor")
-        bias = g.op("Constant", value_t=bias_value)
-    # If track_running_stats is set to False batch statistics are instead used during evaluation time
-    if running_mean is None or sym_help._is_none(running_mean) or running_var is None or sym_help._is_none(running_var):
-        assert batch_size is not None and channel_size is not None
-        reshape_in = g.op("Reshape", input,
-                          g.op("Constant", value_t=torch.tensor([batch_size, channel_size, -1], dtype=torch.int64)))
-        trans_in = g.op("Transpose", reshape_in, perm_i=[0, 2, 1])
-        running_var, running_mean = _var_mean(g, trans_in,
-                                              g.op("Constant", value_t=torch.tensor([0, 1], dtype=torch.int64)),
-                                              False, False)
+    weight, bias, running_mean, running_var = sym_help._batchnorm_helper(g, input, weight, bias, running_mean, running_var)
     out = g.op("BatchNormalization", input, weight, bias, running_mean, running_var,
                epsilon_f=eps,
                momentum_f=1 - momentum,
@@ -1771,7 +1741,7 @@ def tensor(g, data, dtype=None, device=None, requires_grad=False):
         input_list = list()
         for t in sym_help._unpack_list(data):
             shape_reference = g.op("Constant", value_t=torch.LongTensor([1]))
-            t = g.op("Reshape", t, shape_reference)
+            t = sym_help._reshape_helper(g, t, shape_reference)
             t = g.op("Cast", t, to_i=sym_help.scalar_type_to_onnx[dtype])
             input_list.append(t)
         return g.op("Concat", *input_list, axis_i=0)
@@ -2060,7 +2030,7 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None):
     # if dim is None flatten
     # By default, use the flattened input array, and return a flat output array
     if sym_help._is_none(dim):
-        input = reshape(g, self, g.op("Constant", value_t=torch.tensor([-1])))
+        input = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1])))
         dim = 0
     else:
         dim = sym_help._maybe_get_scalar(dim)
@@ -2115,7 +2085,7 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None):
                     g.op("Constant", value_t=torch.LongTensor(input_sizes_temp[dim + 1:]))]
         r_concat = g.op("Concat", *r_concat, axis_i=0)
         i_split = expand(g, i_split, r_concat, None)
-        i_split = reshape(g, i_split, g.op("Constant", value_t=torch.LongTensor(input_sizes)))
+        i_split = sym_help._reshape_helper(g, i_split, g.op("Constant", value_t=torch.LongTensor(input_sizes)), allowzero=0)
         final_splits.append(i_split)
     return g.op("Concat", *final_splits, axis_i=dim)
 
@@ -2128,12 +2098,17 @@ def pixel_shuffle(g, self, upscale_factor):
     if any([i is None for i in dims[1:]]):
         return _unimplemented("pixel_shuffle", "only support static input shape, except for batch size")
     output_channel = dims[1] // upscale_factor // upscale_factor
-    after_view = view(g, self, g.op("Constant", value_t=torch.tensor([-1, output_channel, upscale_factor,
-                                                                      upscale_factor, dims[2], dims[3]])))
+    after_view = sym_help._reshape_helper(g, self,
+                                          g.op("Constant", value_t=torch.tensor([-1, output_channel,
+                                                                                upscale_factor, upscale_factor,
+                                                                                dims[2], dims[3]])),
+                                          allowzero=0)
     after_transpose = g.op("Transpose", after_view, perm_i=[0, 1, 4, 2, 5, 3])
-    return view(g, after_transpose,
-                g.op("Constant", value_t=torch.tensor([-1, output_channel, dims[2] * upscale_factor,
-                                                       dims[3] * upscale_factor])))
+    return sym_help._reshape_helper(g, after_transpose,
+                                    g.op("Constant", value_t=torch.tensor([-1, output_channel,
+                                                                          dims[2] * upscale_factor,
+                                                                          dims[3] * upscale_factor])),
+                                    allowzero=0)
 
 
 def _generic_rnn(g, variant, input, initial_states, all_weights, has_biases,
@@ -2277,7 +2252,8 @@ def retrieve_state(x, start, end):
             # Transpose, and then combining it with hidden_size
             # with Reshape.
             prev_output = g.op("Transpose", prev_output, perm_i=[0, 2, 1, 3])
-            prev_output = g.op("Reshape", prev_output, g.op("Constant", value_t=torch.LongTensor([0, 0, -1])))
+            prev_output = sym_help._reshape_helper(g, prev_output,
+                                                   g.op("Constant", value_t=torch.LongTensor([0, 0, -1])), allowzero=0)
         else:
             prev_output = sym_help._squeeze_helper(g, prev_output, [1])
 
@@ -2525,7 +2501,7 @@ def narrow(g, input, dim, start, length):
 
 def argmax(g, input, dim, keepdim):
     if sym_help._is_none(dim):
-        flattened = reshape(g, input, g.op("Constant", value_t=torch.tensor([-1])))
+        flattened = sym_help._reshape_helper(g, input, g.op("Constant", value_t=torch.tensor([-1])))
         return g.op("ArgMax", flattened, axis_i=0, keepdims_i=False)
     else:
         dim = _parse_arg(dim, "i")
@@ -2535,7 +2511,7 @@ def argmax(g, input, dim, keepdim):
 
 def argmin(g, input, dim, keepdim):
     if sym_help._is_none(dim):
-        flattened = reshape(g, input, g.op("Constant", value_t=torch.tensor([-1])))
+        flattened = sym_help._reshape_helper(g, input, g.op("Constant", value_t=torch.tensor([-1])))
         return g.op("ArgMin", flattened, axis_i=0, keepdims_i=False)
     else:
         dim = _parse_arg(dim, "i")
@@ -2868,7 +2844,7 @@ def try_mask_to_index(index):
                 folded_adv_idx_shape_list = [g.op("Constant", value_t=torch.LongTensor([-1]))]  \
                     + [dim_tensor_list[i] for i in range(rank) if i not in adv_idx_indices]
                 folded_adv_idx_shape = g.op("Concat", *folded_adv_idx_shape_list, axis_i=0)
-                self = g.op("Reshape", self, folded_adv_idx_shape)
+                self = sym_help._reshape_helper(g, self, folded_adv_idx_shape)
 
                 # Transpose folded advanced indexed axis to its original location.
                 adv_idx_permute = list(range(1, adv_idx_indices[0] + 1))                    \
@@ -2887,7 +2863,7 @@ def try_mask_to_index(index):
                     *[dim_tensor_list[i] for i in range(rank) if i not in adv_idx_indices],
                     axis_i=0)
 
-            return g.op("Reshape", self, final_shape)
+            return sym_help._reshape_helper(g, self, final_shape)
 
 
 @parse_args("v", "is", "i")
@@ -2919,7 +2895,8 @@ def baddbmm(g, self, batch1, batch2, beta, alpha):
 
 
 def meshgrid(g, tensor_list):
-    tensors = [view(g, t, g.op("Constant", value_t=torch.LongTensor([-1]))) for t in sym_help._unpack_list(tensor_list)]
+    tensors = [sym_help._reshape_helper(g, t, g.op("Constant", value_t=torch.LongTensor([-1])))
+               for t in sym_help._unpack_list(tensor_list)]
     tensors_shape = [g.op("Shape", t) for t in tensors]
     out_shape = g.op("Concat", *tensors_shape, axis_i=0)
     out = []
@@ -2959,7 +2936,8 @@ def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled):
         return _unimplemented("group_norm", "unknown input rank")
     # 0 in the shape list keeps dimension value unchanged.
     shape = [0, num_groups, -1]
-    input_reshaped = g.op("Reshape", input, g.op("Constant", value_t=torch.LongTensor(shape)))
+    input_reshaped = sym_help._reshape_helper(g, input,
+                                              g.op("Constant", value_t=torch.LongTensor(shape)))
 
     # C is always divisible by num_groups
     # Due to shape difference. we need to apply weight and bias after
@@ -2970,7 +2948,7 @@ def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled):
         "torch." + input.type().scalarType() + "Tensor"))
 
     norm_reshaped = g.op("InstanceNormalization", input_reshaped, weight_, bias_, epsilon_f=eps)
-    norm = g.op("Reshape", norm_reshaped, g.op("Shape", input))
+    norm = sym_help._reshape_helper(g, norm_reshaped, g.op("Shape", input))
 
     if weight is None or weight.node().mustBeNone():
         weight_value = torch.tensor([1.]).type(
@@ -3027,7 +3005,7 @@ def item(g, self):
 
 
 def take(g, self, index):
-    self_flattened = g.op("Reshape", self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)))
+    self_flattened = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)))
     out = index_select(g, self_flattened, 0, index)
     out = reshape_as(g, out, index)
     return out
@@ -3071,7 +3049,7 @@ def kl_div(g, input, target, reduction, log_target):
 def as_strided(g, self, sizes, strides, offset=None):
     sizes = sym_help._maybe_get_const(sizes, "is")
     rank = len(strides)
-    self_1d = g.op("Reshape", self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)))
+    self_1d = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)))
     ind: Optional[torch.Tensor]
     if not sym_help._is_value(sizes):
         ind = torch.tensor([0], dtype=torch.long)
@@ -3088,7 +3066,8 @@ def as_strided(g, self, sizes, strides, offset=None):
             r_size = [1] * rank
             r_size[i] = -1
             size = select(g, sizes, g.op("Constant", value_t=torch.tensor([0])), g.op("Constant", value_t=torch.tensor(i)))
-            tmp_ind = g.op("Reshape", arange(g, size, 4, None, None, None), g.op("Constant", value_t=torch.tensor(r_size)))
+            tmp_ind = sym_help._reshape_helper(g, arange(g, size, 4, None, None, None),
+                                               g.op("Constant", value_t=torch.tensor(r_size)))
             tmp_ind = g.op("Mul", tmp_ind, g.op("Constant", value_t=torch.tensor([stride])))
             if ind is None:
                 ind = tmp_ind

From e182401062323ab613ca9f1e3786272e5ffc6eb4 Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Wed, 18 Aug 2021 13:25:19 -0700
Subject: [PATCH 037/530] [ONNX] Remove aten parameter (#61652) (#62759)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62759

* remove aten argument in export()

* add export_to_pretty_string default value OperatorExportTypes.ONNX

* add DPYTORCH_ONNX_CAFFE2_BUNDLE description

Test Plan: Imported from OSS

Reviewed By: SplitInfinity

Differential Revision: D30349062

Pulled By: msaroufim

fbshipit-source-id: d9738f3aa8b80eac54548d0b9494f9f1e544f20f

Co-authored-by: Gary Miguel <garymiguel@microsoft.com>
---
 torch/onnx/__init__.py | 24 +++++++++++++-----------
 torch/onnx/utils.py    | 26 ++++++++------------------
 2 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py
index c143d0ce8984d..b726b2b55e8b6 100644
--- a/torch/onnx/__init__.py
+++ b/torch/onnx/__init__.py
@@ -30,11 +30,11 @@ def _export(*args, **kwargs):
 
 
 def export(model, args, f, export_params=True, verbose=False, training=TrainingMode.EVAL,
-           input_names=None, output_names=None, aten=False,
-           operator_export_type=None, opset_version=None, _retain_param_name=True,
-           do_constant_folding=True, example_outputs=None, strip_doc_string=True,
-           dynamic_axes=None, keep_initializers_as_inputs=None, custom_opsets=None,
-           enable_onnx_checker=True, use_external_data_format=False):
+           input_names=None, output_names=None, operator_export_type=None,
+           opset_version=None, _retain_param_name=True, do_constant_folding=True,
+           example_outputs=None, strip_doc_string=True, dynamic_axes=None,
+           keep_initializers_as_inputs=None, custom_opsets=None, enable_onnx_checker=True,
+           use_external_data_format=False):
     r"""
     Exports a model into ONNX format. If ``model`` is not a
     :class:`torch.jit.ScriptModule` nor a :class:`torch.jit.ScriptFunction`, this runs
@@ -116,9 +116,12 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM
             input nodes of the graph, in order.
         output_names (list of str, default empty list): names to assign to the
             output nodes of the graph, in order.
-        aten (bool, default False): [DEPRECATED. use operator_export_type] equivalent to
-            setting ``operator_export_type=OperatorExportTypes.ONNX_ATEN``.
-        operator_export_type (enum, default OperatorExportTypes.ONNX):
+        operator_export_type (enum, default None):
+
+            None usually means ``OperatorExportTypes.ONNX``.
+            However if PyTorch was built with ``-DPYTORCH_ONNX_CAFFE2_BUNDLE``, None means
+            ``OperatorExportTypes.ONNX_ATEN_FALLBACK``.
+
             * ``OperatorExportTypes.ONNX``: Export all ops as regular ONNX ops
               (in the default opset domain).
             * ``OperatorExportTypes.ONNX_FALLTHROUGH``: Try to convert all ops
@@ -303,9 +306,8 @@ def forward(self, x):
 
     from torch.onnx import utils
     return utils.export(model, args, f, export_params, verbose, training,
-                        input_names, output_names, aten,
-                        operator_export_type, opset_version, _retain_param_name,
-                        do_constant_folding, example_outputs,
+                        input_names, output_names, operator_export_type, opset_version,
+                        _retain_param_name, do_constant_folding, example_outputs,
                         strip_doc_string, dynamic_axes, keep_initializers_as_inputs,
                         custom_opsets, enable_onnx_checker, use_external_data_format)
 
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index f5dc2f2270165..41ba20f3ad102 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -71,15 +71,12 @@ def select_model_mode_for_export(model, mode):
 
 
 def export(model, args, f, export_params=True, verbose=False, training=None,
-           input_names=None, output_names=None, aten=False,
-           operator_export_type=None, opset_version=None, _retain_param_name=True,
-           do_constant_folding=True, example_outputs=None, strip_doc_string=True,
-           dynamic_axes=None, keep_initializers_as_inputs=None, custom_opsets=None,
+           input_names=None, output_names=None, operator_export_type=None,
+           opset_version=None, _retain_param_name=True, do_constant_folding=True,
+           example_outputs=None, strip_doc_string=True, dynamic_axes=None,
+           keep_initializers_as_inputs=None, custom_opsets=None,
            enable_onnx_checker=True, use_external_data_format=False):
-    if aten:
-        assert operator_export_type is None
-        operator_export_type = OperatorExportTypes.ONNX_ATEN
-    elif operator_export_type is None:
+    if operator_export_type is None:
         if torch.onnx.PYTORCH_ONNX_CAFFE2_BUNDLE:
             operator_export_type = OperatorExportTypes.ONNX_ATEN_FALLBACK
         else:
@@ -529,18 +526,11 @@ def _model_to_graph(model, args, verbose=False,
 
 
 def export_to_pretty_string(model, args, f, export_params=True, verbose=False, training=None,
-                            input_names=None, output_names=None, aten=False,
-                            operator_export_type=None, export_type=ExportTypes.PROTOBUF_FILE,
-                            example_outputs=None, google_printer=False,
-                            opset_version=None, _retain_param_name=True,
+                            input_names=None, output_names=None, operator_export_type=OperatorExportTypes.ONNX,
+                            export_type=ExportTypes.PROTOBUF_FILE, example_outputs=None,
+                            google_printer=False, opset_version=None, _retain_param_name=True,
                             keep_initializers_as_inputs=None, custom_opsets=None, add_node_names=True,
                             do_constant_folding=True):
-    if aten:
-        assert operator_export_type is None
-        assert aten
-        operator_export_type = OperatorExportTypes.ONNX_ATEN
-    elif operator_export_type is None:
-        operator_export_type = OperatorExportTypes.ONNX
     return _export_to_pretty_string(model, args, f, export_params, verbose, training,
                                     input_names, output_names, operator_export_type,
                                     export_type, example_outputs, google_printer,

From 2aa19f33c6272cb016c629792ddc70cf9636fc9a Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Wed, 18 Aug 2021 13:25:19 -0700
Subject: [PATCH 038/530] [ONNX] Fix for batchnorm training op mode (#52758)
 (#62760)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62760

* Rebase

# Conflicts:
#	torch/csrc/jit/passes/onnx/eval_peephole.cpp

# Conflicts:
#	test/onnx/test_utility_funs.py
#	torch/onnx/symbolic_opset9.py

* Update symbolic_opset12.py

* Update test.sh
# Conflicts:
#	.jenkins/caffe2/test.sh

* Merge

* Fix utility tests

# Conflicts:
#	test/onnx/test_pytorch_onnx_onnxruntime.py
#	test/onnx/test_utility_funs.py

* Fix for comment

* Enable BN tests

* Fix for test

* Update test_pytorch_onnx_onnxruntime.py

* Update test_pytorch_onnx_onnxruntime.py

* Update test_utility_funs.py

* Update test_pytorch_onnx_onnxruntime.py

Test Plan: Imported from OSS

Reviewed By: SplitInfinity

Differential Revision: D30349060

Pulled By: msaroufim

fbshipit-source-id: 93312c17607974731c17099ae181acb6e4c1c409
---
 .jenkins/caffe2/test.sh                       |   4 +-
 .../TestOperators.test_dropout_default.expect |  16 +-
 test/onnx/test_operators.py                   |   2 +-
 test/onnx/test_pytorch_onnx_onnxruntime.py    | 148 +++++++++---------
 test/onnx/test_utility_funs.py                |  13 +-
 torch/csrc/jit/passes/onnx/eval_peephole.cpp  |  23 +--
 torch/onnx/symbolic_helper.py                 |  18 ++-
 torch/onnx/symbolic_opset12.py                |   8 +-
 torch/onnx/symbolic_opset14.py                |   2 +-
 torch/onnx/symbolic_opset9.py                 |  12 +-
 10 files changed, 135 insertions(+), 111 deletions(-)

diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh
index 4c577ed437439..75e269d6f6909 100755
--- a/.jenkins/caffe2/test.sh
+++ b/.jenkins/caffe2/test.sh
@@ -170,7 +170,9 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
   # JIT C++ extensions require ninja, so put it into PATH.
   export PATH="/var/lib/jenkins/.local/bin:$PATH"
   if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then
-    pip install -q --user onnxruntime==1.8.0
+    pip install -q --user flatbuffers==2.0
+    wget https://ortpypackage.blob.core.windows.net/ort-nightly/ort_nightly-1.8.0.dev202107131-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+    pip install -q --user ort_nightly-1.8.0.dev202107131-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
   fi
   "$ROOT_DIR/scripts/onnx/test.sh"
 fi
diff --git a/test/onnx/expect/TestOperators.test_dropout_default.expect b/test/onnx/expect/TestOperators.test_dropout_default.expect
index dcbc25a55045f..550bc65f2700b 100644
--- a/test/onnx/expect/TestOperators.test_dropout_default.expect
+++ b/test/onnx/expect/TestOperators.test_dropout_default.expect
@@ -5,7 +5,19 @@ graph {
   node {
     input: "x"
     output: "1"
-    name: "ReduceMax_0"
+    output: "2"
+    name: "Dropout_0"
+    op_type: "Dropout"
+    attribute {
+      name: "ratio"
+      f: 0.5
+      type: FLOAT
+    }
+  }
+  node {
+    input: "1"
+    output: "3"
+    name: "ReduceMax_1"
     op_type: "ReduceMax"
     attribute {
       name: "keepdims"
@@ -31,7 +43,7 @@ graph {
     }
   }
   output {
-    name: "1"
+    name: "3"
     type {
       tensor_type {
         elem_type: 1
diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py
index 9fe38ca7b2455..b9e391b540663 100644
--- a/test/onnx/test_operators.py
+++ b/test/onnx/test_operators.py
@@ -681,7 +681,7 @@ def test_dropout_training(self):
 
     def test_dropout_opset12(self):
         x = torch.randn(3, 4, requires_grad=True)
-        self.assertONNX(lambda x: torch.max(functional.dropout(x)), x, opset_version=12)
+        self.assertONNX(lambda x: torch.max(functional.dropout(x, training=False)), x, opset_version=12)
 
     def test_dropout_training_opset12(self):
         x = torch.randn(3, 4, requires_grad=True)
diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index fe877948666dd..67903fb0bd94c 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -100,7 +100,10 @@ def run_model_test(self, model, batch_size=2, state_dict=None,
                    input_names=None, output_names=None,
                    fixed_batch_size=False, dict_check=True,
                    training=None, remained_onnx_input_idx=None):
-    model.eval()
+    if training is not None and training == torch.onnx.TrainingMode.TRAINING:
+        model.train()
+    elif training is None or training == torch.onnx.TrainingMode.EVAL:
+        model.eval()
     if input is None:
         input = torch.randn(batch_size, 3, 224, 224, requires_grad=True)
     with torch.no_grad():
@@ -281,11 +284,14 @@ def _run_test(m, remained_onnx_input_idx):
     def run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7,
                                           example_outputs=None, do_constant_folding=True,
                                           dynamic_axes=None, input_names=None, output_names=None,
-                                          ort_optim_on=True):
+                                          ort_optim_on=True, training=None):
         import os
         import tempfile
 
-        model.eval()
+        if training is not None and training == torch.onnx.TrainingMode.TRAINING:
+            model.train()
+        elif training is None or training == torch.onnx.TrainingMode.EVAL:
+            model.eval()
         with torch.no_grad():
             if isinstance(input, torch.Tensor):
                 input = (input,)
@@ -3295,7 +3301,6 @@ def test_batchnorm1d_noaffine(self):
         x = torch.randn(10, 10, 128)
         self.run_test(model, x)
 
-    @skipIfUnsupportedOpsetVersion([14])
     def test_batchnorm1d_norunningstats(self):
         x = torch.randn(10, 10)
         model = torch.nn.BatchNorm1d(10, track_running_stats=False)
@@ -3314,7 +3319,6 @@ def test_batchnorm2d_noaffine(self):
         model = torch.nn.BatchNorm2d(3, affine=False)
         self.run_test(model, x)
 
-    @skipIfUnsupportedOpsetVersion([14])
     def test_batchnorm2d_norunningstats(self):
         x = torch.randn(10, 3, 128, 128)
         model = torch.nn.BatchNorm2d(3, track_running_stats=False)
@@ -7646,50 +7650,79 @@ def forward(self, input):
         x = torch.randn(6, 4, 3, 3)
         self.run_test(FakeQuantizePerChannelModel(), (x))
 
-    # Tests skipped temporarliy as latest onnxruntime release does not include training ops
-    @skipForAllOpsetVersions()
     def test_batchnorm_training(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
                 super(MyModule, self).__init__()
-                self.bn = torch.nn.BatchNorm2d(3, affine=True)
+                self.bn1 = torch.nn.BatchNorm2d(3, affine=False)
+                self.cv1 = torch.nn.Conv2d(3, 3, 10)
+                self.bn2 = torch.nn.BatchNorm2d(3, affine=True)
+                self.cv2 = torch.nn.Conv2d(3, 3, 10)
+                self.bn3 = torch.nn.BatchNorm2d(3, affine=False)
 
             def forward(self, x):
-                bn = self.bn(x)
-                return bn
-
-        model = MyModule()
-        x = torch.randn(10, 3, 128, 128)
+                x = self.bn1(x)
+                x = self.cv1(x)
+                x = self.bn2(x)
+                x = self.cv2(x)
+                x = self.bn3(x)
+                return x
 
-        model.train()
-        out = model(x)
+        x = torch.randn(10, 3, 20, 20) * 2
+        model_export = MyModule()
+        self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5)
+        model_export.train()
+        self.run_test(model_export, (x, ), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5)
 
-        # state after 1 train epoch
-        running_mean = model.bn.running_mean
-        running_var = model.bn.running_var
-        saved_mean = x.mean((0, 2, 3))
-        saved_var = x.var((0, 2, 3), correction=1)
+    def test_batchnorm_training_mode_fix_layer(self):
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super(MyModule, self).__init__()
+                self.bn1 = torch.nn.BatchNorm2d(3, affine=True)
+                self.cv1 = torch.nn.Conv2d(3, 3, 10)
+                self.bn2 = torch.nn.BatchNorm2d(3, affine=False)
+                self.cv2 = torch.nn.Conv2d(3, 3, 10)
+                self.bn3 = torch.nn.BatchNorm2d(3, affine=True)
+                self.bn3.eval()
 
-        pytorch_out = [out.detach().numpy(),
-                       running_mean.cpu().numpy(), running_var.cpu().numpy(),
-                       saved_mean.cpu().numpy(), saved_var.cpu().numpy()]
+            def forward(self, x):
+                x = self.bn1(x)
+                x = self.cv1(x)
+                x = self.bn2(x)
+                x = self.cv2(x)
+                x = self.bn3(x)
+                return x
 
+        x = torch.randn(10, 3, 128, 128)
         model_export = MyModule()
-        f = io.BytesIO()
+        self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5)
+        model_export.train()
+        self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5)
 
-        ort_sess = convert_to_onnx(model_export, input=(x,), opset_version=self.opset_version,
-                                   training=torch.onnx.TrainingMode.TRAINING)
-        ort_outs = run_ort(ort_sess, input=(x,))
-        [np.testing.assert_allclose(p_out, ort_out, atol=10e-3, rtol=10e-3) for p_out, ort_out in zip(pytorch_out, ort_outs)]
+    def test_batchnorm_eval_mode_train_layer(self):
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super(MyModule, self).__init__()
+                self.bn1 = torch.nn.BatchNorm2d(3, affine=True)
+                self.cv1 = torch.nn.Conv2d(3, 3, 10)
+                self.bn2 = torch.nn.BatchNorm2d(3, affine=False)
+                self.cv2 = torch.nn.Conv2d(3, 3, 10)
+                self.bn3 = torch.nn.BatchNorm2d(3, affine=True)
+                self.bn3.train()
 
-        model_export = torch.jit.script(MyModule())
-        ort_sess = convert_to_onnx(model_export, input=(x,), opset_version=self.opset_version,
-                                   example_outputs=out,
-                                   training=torch.onnx.TrainingMode.TRAINING,
-                                   onnx_shape_inference=True)
-        ort_outs = run_ort(ort_sess, input=(x,))
-        [np.testing.assert_allclose(p_out, ort_out, atol=10e-3, rtol=10e-3) for p_out, ort_out in
-         zip(pytorch_out, ort_outs)]
+            def forward(self, x):
+                x = self.bn1(x)
+                x = self.cv1(x)
+                x = self.bn2(x)
+                x = self.cv2(x)
+                x = self.bn3(x)
+                return x
+
+        x = torch.randn(10, 3, 128, 128)
+        model_export = MyModule()
+        self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL, rtol=1e-3, atol=1e-5)
+        model_export.eval()
+        self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5)
 
     @skipIfUnsupportedMinOpsetVersion(12)
     def test_dropout_training(self):
@@ -7704,7 +7737,6 @@ def forward(self, x):
 
         model = MyModule()
         x = torch.randn(10)
-
         model.train()
 
         ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version,
@@ -7741,7 +7773,6 @@ def forward(self, x):
         nb_elements = torch.numel(input)
 
         model.train()
-
         ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version,
                                    training=torch.onnx.TrainingMode.TRAINING)
         ort_outs = run_ort(ort_sess, input=(x,))
@@ -7771,8 +7802,6 @@ def forward(self, x):
 
         np.testing.assert_allclose(ratio_pytorch, ratio_ort, rtol=0.01, atol=0.01)
 
-    # Tests skipped temporarliy as latest onnxruntime release does not include training ops
-    @skipForAllOpsetVersions()
     def test_conv_bn(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -7785,32 +7814,11 @@ def forward(self, x):
                 bn = self.bn(x)
                 return bn
 
-        model = MyModule()
+        model_export = MyModule()
         x = torch.randn(10, 3, 128, 128)
-        ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version,
-                                    training=torch.onnx.TrainingMode.TRAINING)
-        ort_outs1 = run_ort(ort_sess1, input=(x,))
-        ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version,
-                                    training=torch.onnx.TrainingMode.EVAL)
-        ort_outs2 = run_ort(ort_sess2, input=(x,))
-        [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in
-         zip(ort_outs1, ort_outs2)]
+        self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL)
+        self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5)
 
-        script_model = torch.jit.script(model)
-        outputs = model(x)
-        ort_sess1 = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version,
-                                    example_outputs=outputs,
-                                    training=torch.onnx.TrainingMode.TRAINING)
-        ort_outs1 = run_ort(ort_sess1, input=(x,))
-        ort_sess2 = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version,
-                                    example_outputs=outputs,
-                                    training=torch.onnx.TrainingMode.EVAL)
-        ort_outs2 = run_ort(ort_sess2, input=(x,))
-        [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in
-         zip(ort_outs1, ort_outs2)]
-
-    # Tests skipped temporarliy as latest onnxruntime release does not include training ops
-    @skipForAllOpsetVersions()
     def test_multiple_conv_bn(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -7836,16 +7844,10 @@ def forward(self, x):
                 x = self.relu(x)
                 return x
 
-        model = MyModule()
+        model_export = MyModule()
         x = torch.randn(2, 3, 224, 224)
-        ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version,
-                                    training=torch.onnx.TrainingMode.TRAINING)
-        ort_outs1 = run_ort(ort_sess1, input=(x,))
-        ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version,
-                                    training=torch.onnx.TrainingMode.EVAL)
-        ort_outs2 = run_ort(ort_sess2, input=(x,))
-        [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in
-         zip(ort_outs1, ort_outs2)]
+        self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5)
+        self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL)
 
     def test_script_custom_class_error(self):
         class BoxCoder(object):
diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py
index 06faf410f865e..02da90dd3066e 100644
--- a/test/onnx/test_utility_funs.py
+++ b/test/onnx/test_utility_funs.py
@@ -5,7 +5,7 @@
 from torch.onnx import utils, OperatorExportTypes, TrainingMode
 from torch.onnx.symbolic_helper import _set_opset_version, _set_operator_export_type, _set_onnx_shape_inference
 import torch.utils.cpp_extension
-from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion,
+from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion,
                                  skipIfUnsupportedMaxOpsetVersion)
 import caffe2.python.onnx.backend as backend
 from verify import verify
@@ -37,7 +37,10 @@ def _model_to_graph(self, model, input,
                         operator_export_type=OperatorExportTypes.ONNX,
                         input_names=None,
                         dynamic_axes=None):
-
+        if training == torch.onnx.TrainingMode.TRAINING:
+            model.train()
+        elif training == torch.onnx.TrainingMode.EVAL:
+            model.eval()
         # Need disable onnx_shape_inference for this test because it puts const node to initializers.
         _set_onnx_shape_inference(False)
         utils._validate_dynamic_axes(dynamic_axes, model, None, None)
@@ -811,11 +814,11 @@ def forward(self, x):
         model = torch.jit.script(MyModule())
         x = torch.randn(10, 3, 128, 128)
         example_outputs = model(x)
-        f = io.BytesIO()
         _set_opset_version(self.opset_version)
         _set_operator_export_type(OperatorExportTypes.ONNX)
         graph, _, __ = self._model_to_graph(model, (x,), do_constant_folding=True, example_outputs=example_outputs,
                                             operator_export_type=OperatorExportTypes.ONNX,
+                                            training=torch.onnx.TrainingMode.TRAINING,
                                             input_names=['x'], dynamic_axes={'x': [0, 1, 2, 3]})
 
         graph_input_params = [param.debugName() for param in graph.inputs()]
@@ -836,7 +839,6 @@ def forward(self, x):
         x = torch.tensor([1, 2])
         verify(MyModel(), x, backend, do_constant_folding=False)
 
-    @skipIfUnsupportedOpsetVersion([14])
     def test_fuse_conv_bn(self):
         class Fuse(torch.nn.Module):
             def __init__(self):
@@ -858,11 +860,11 @@ def forward(self, x):
 
         assert len(list(graph.nodes())) == 1
 
-    @skipIfUnsupportedOpsetVersion([14])
     def test_fuse_resnet18(self):
         model = torchvision.models.resnet18(pretrained=True)
         x = torch.randn(2, 3, 224, 224, requires_grad=True)
         graph, _, __ = self._model_to_graph(model, (x, ),
+                                            training=TrainingMode.EVAL,
                                             input_names=['x'], dynamic_axes={'x': [0, 1, 2, 3]})
 
         for node in graph.nodes():
@@ -882,7 +884,6 @@ def __init__(self):
             def forward(self, x, y):
                 return f(x, y)
 
-        model = MyModule()
         input_1 = torch.tensor(11)
         input_2 = torch.tensor(12)
         _set_opset_version(self.opset_version)
diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.cpp b/torch/csrc/jit/passes/onnx/eval_peephole.cpp
index 18dea16cb97ae..05afb69ef0f23 100644
--- a/torch/csrc/jit/passes/onnx/eval_peephole.cpp
+++ b/torch/csrc/jit/passes/onnx/eval_peephole.cpp
@@ -47,14 +47,20 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) {
       fuseConvBatchNorm(child_block, valsToParamsMap);
     }
     if (it->kind() == onnx::Conv) {
-      if (it->output()->uses().size() != 1) {
+      auto oldConv = *it;
+      if (oldConv->outputs().at(0)->uses().size() != 1) {
         continue;
       }
-      auto bnNode = it->output()->uses()[0].user;
+      auto bnNode = oldConv->outputs().at(0)->uses()[0].user;
       if (bnNode->kind() != onnx::BatchNormalization) {
         continue;
       }
-      auto oldConv = *it;
+
+      if (oldConv->outputs().size() !=
+          bnNode->outputs().size()) { // BN layer is not in eval mode
+        continue;
+      }
+
       auto epsilon = bnNode->f(attr::epsilon);
       auto convInputVals = getValues(oldConv, valsToParamsMap);
       if (convInputVals.size() < 1 ||
@@ -109,11 +115,8 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) {
         convB = bnB;
       }
 
-      Node* newConv =
-          b->owningGraph()->create(onnx::Conv, bnNode->outputs().size());
-      for (size_t i = 0; i < newConv->outputs().size(); ++i) {
-        newConv->outputs()[i]->copyMetadata(bnNode->outputs()[i]);
-      }
+      Node* newConv = b->owningGraph()->create(onnx::Conv, 1);
+      newConv->outputs().at(0)->copyMetadata(bnNode->outputs().at(0));
 
       newConv->copyAttributes(*oldConv);
       newConv->insertBefore(bnNode);
@@ -131,9 +134,7 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) {
       newConvB->inferTypeFrom(convB);
       newConv->addInput(newConvB);
 
-      bnNode->replaceAllUsesWith(newConv);
-      bnNode->removeAllInputs();
-      it->removeAllInputs();
+      bnNode->outputs().at(0)->replaceAllUsesWith(newConv->outputs().at(0));
       bnNode->destroy();
       it.destroyCurrent();
     }
diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py
index 66f276ce42f55..13bc4800a6700 100644
--- a/torch/onnx/symbolic_helper.py
+++ b/torch/onnx/symbolic_helper.py
@@ -755,19 +755,23 @@ def _avgpool_helper(tuple_fn, padding, kernel_size, stride, divisor_override, na
     padding = tuple(tuple_fn(padding))
     return padding
 
-def assert_training_mode(op_mode, op_name):
+
+def check_training_mode(op_train_mode, op_name):
     global _training_mode
-    op_mode = True if op_mode == 1 else False
-    if op_mode != _training_mode:
-        op_mode = "training " if op_mode else "inference"
+    op_train_mode = True if op_train_mode == 1 else False
+    if _training_mode is not None and op_train_mode != _training_mode:
+        op_mode = "training " if op_train_mode else "inference"
         training_mode = "training " if _training_mode else "inference"
         # setting the model mode could result in op_mode != _training_mode
         # if the model is a FuncModule. In this case we warn the user of
-        # the state and export depending on training_mode
+        # the state and export depending on op_mode
+        # This is to support use-cases of fixing certain layer weights
+        # in training.
         warnings.warn("ONNX export mode is set to " + training_mode +
                       " mode, but operator " + op_name + " is set to " +
-                      op_mode + " mode. The model will be exported in " +
-                      training_mode + ", as specified by the export mode.")
+                      op_mode + " mode. The operators will be exported in " +
+                      op_mode + ", as specified by the functional operator.")
+
 
 def _flatten_helper(g, input, start_dim, end_dim, dim):
     input_size = g.op("Shape", input)
diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py
index 8e989cccf9867..d8f954148a1ee 100644
--- a/torch/onnx/symbolic_opset12.py
+++ b/torch/onnx/symbolic_opset12.py
@@ -4,6 +4,7 @@
 from torch.onnx.utils import _add_block, _add_input_to_block, _add_output_to_block
 from sys import maxsize
 from torch.onnx.symbolic_opset9 import permute, _reshape_from_tensor
+import warnings
 
 
 # EDITING THIS FILE? READ THIS FIRST!
@@ -25,11 +26,12 @@ def outer(g, input, other):
 
 @parse_args("v", "f", "i")
 def dropout(g, input, p, train):
-    sym_help.assert_training_mode(train, "dropout")
+    sym_help.check_training_mode(train, "dropout")
     # in eval mode, dropout is non-op - if the node's train param is set to False, dropout is non-op
-    if not sym_help._training_mode:
+    if not train:
         return input
-
+    warnings.warn("Dropout is a training op and should not be exported in inference mode. "
+                  "For inference, make sure to call eval() on the model and to export it with param training=False.")
     p = g.op("Constant", value_t=torch.tensor(p))
     t = g.op("Constant", value_t=torch.tensor(True))
     r, _ = g.op("Dropout", input, p, t, outputs=2)
diff --git a/torch/onnx/symbolic_opset14.py b/torch/onnx/symbolic_opset14.py
index 443aac97c9404..d4775b553da8d 100644
--- a/torch/onnx/symbolic_opset14.py
+++ b/torch/onnx/symbolic_opset14.py
@@ -38,7 +38,7 @@ def reshape(g, self, shape):
 
 @parse_args("v", "v", "v", "v", "v", "i", "f", "f", "i")
 def batch_norm(g, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled):
-    sym_help.assert_training_mode(training, "batch_norm")
+    sym_help.check_training_mode(training, "batch_norm")
     weight, bias, running_mean, running_var = sym_help._batchnorm_helper(g, input, weight, bias, running_mean, running_var)
     out = g.op("BatchNormalization", input, weight, bias, running_mean, running_var,
                epsilon_f=eps,
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 616c2fcaff5b8..993284a292a96 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -1343,13 +1343,13 @@ def conv_transpose3d(g, input, weight, bias, stride, padding, output_padding, gr
 
 @parse_args("v", "v", "v", "v", "v", "i", "f", "f", "i")
 def batch_norm(g, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled):
-    sym_help.assert_training_mode(training, "batch_norm")
+    sym_help.check_training_mode(training, "batch_norm")
     weight, bias, running_mean, running_var = sym_help._batchnorm_helper(g, input, weight, bias, running_mean, running_var)
     out = g.op("BatchNormalization", input, weight, bias, running_mean, running_var,
                epsilon_f=eps,
                momentum_f=1 - momentum,
-               outputs=1 if not sym_help._training_mode else 5)
-    if not sym_help._training_mode:
+               outputs=1 if not training else 5)
+    if not training:
         return out
     else:
         res, new_running_mean, new_running_var, saved_mean, saved_var = out
@@ -1624,12 +1624,12 @@ def exp(g, self):
 
 @parse_args("v", "f", "i")
 def dropout(g, input, p, train):
-    sym_help.assert_training_mode(train, "dropout")
+    sym_help.check_training_mode(train, "dropout")
     # in eval mode, dropout is non-op - if the node's train param is set to False, dropout is non-op
-    if not sym_help._training_mode:
+    if not train:
         return input
     warnings.warn("Dropout is a training op and should not be exported in inference mode. "
-                  "Make sure to call eval() on the model, and to export it with param training=False.")
+                  "For inference, make sure to call eval() on the model and to export it with param training=False.")
     r, _ = g.op("Dropout", input, ratio_f=p, outputs=2)
     return r
 

From 877e6f2be3e78258247fb969577cb86be392e90c Mon Sep 17 00:00:00 2001
From: Charles David Hernandez <cdhernandez@fb.com>
Date: Wed, 18 Aug 2021 13:30:35 -0700
Subject: [PATCH 039/530] Bugfix for fuse qconfig comparison (#63384)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63384

In some cases the changes to qconfig on module would cause the
fusions to fail. This bugfix solves that problem by adding a
qconfig_function_comparison that compares the functions within the
qconfig rather than the modules the qconfigs are on. The comparison
looks at the partial object within QConfig.activation/weight.p and
compares args, keywords and func. This is necessary to do mannually
because partial doesn't have __eq__ implemented and so == reverts to is.

Test Plan:
python test/test_quantization.py
TestFuseFx.test_problematic_fuse_example

Imported from OSS

Reviewed By: supriyar, ejguan

Differential Revision: D30386264

fbshipit-source-id: 51e358c021c39d6f48dc12ad2a82b2838677b9de
---
 test/quantization/fx/test_quantize_fx.py | 32 ++++++++++++++++++++++++
 torch/quantization/fx/prepare.py         |  4 +--
 torch/quantization/qconfig.py            | 17 +++++++++++++
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 2f5f7c4a27f6e..bf15a06831bac 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -314,6 +314,38 @@ def test_qconfig_fused_module(self):
 
             self.checkGraphModuleNodes(quantized, expected_node_list=node_list)
 
+    def test_problematic_fuse_example(self):
+        class LinearRelu(nn.Sequential):
+            def __init__(self):
+                super().__init__(
+                    nn.Linear(5, 5),
+                    nn.ReLU(),
+                )
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.lin_relu = LinearRelu()
+                self.linear = nn.Linear(5, 5)
+
+            def forward(self, x):
+                x = self.lin_relu(x)
+                x = self.linear(x)
+                return x
+
+        model = M().eval()
+        # these qconfigs somehow fail equality where default_qconfig does not
+        qconfig_dict = {
+            "": None,
+            "object_type": [
+                (torch.nn.Linear, get_default_qconfig('fbgemm')),
+                (torch.nn.ReLU, get_default_qconfig('fbgemm')),
+            ],
+        }
+        m = prepare_fx(model, qconfig_dict)
+
+        self.checkGraphModuleNodes(m, expected_node=ns.call_module(torch.nn.intrinsic.modules.fused.LinearReLU))
+
     def test_fuse_custom_config_dict_validity(self):
         r"""
         Verifies that if a user passes an invalid key or makes a typo when
diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py
index 873d11acaa82e..23d1d40bb543b 100644
--- a/torch/quantization/fx/prepare.py
+++ b/torch/quantization/fx/prepare.py
@@ -15,7 +15,7 @@
 )
 from torch.fx.node import Argument
 
-from ..qconfig import QConfigAny
+from ..qconfig import QConfigAny, qconfig_function_equality
 from .qconfig_utils import (
     convert_dict_to_ordered_dict,
     generate_qconfig_map,
@@ -195,7 +195,7 @@ def update_qconfig_for_fusion(
                     # Raise an error if the modules in the fused module have
                     # different qconfigs specified in the qconfig_dict
                     for op in ops:
-                        if object_type_dict.get(op, None) != fused_qconfig:
+                        if not qconfig_function_equality(object_type_dict.get(op, None), fused_qconfig):
                             raise LookupError("During fusion, we need to specify the same " +
                                               f"qconfigs for both modules in {module_type}.")
 
diff --git a/torch/quantization/qconfig.py b/torch/quantization/qconfig.py
index 15eb174f021b9..01d67ddcbd8b1 100644
--- a/torch/quantization/qconfig.py
+++ b/torch/quantization/qconfig.py
@@ -209,3 +209,20 @@ def configure_constructor_to_put_obs_on_module_device(original_constructor):
         return QConfig(activation, weight)
     else:
         return QConfigDynamic(activation, weight)
+
+
+def qconfig_function_equality(q1: QConfigAny, q2: QConfigAny):
+    # functools.partial has no __eq__ operator defined so '==' defaults to 'is'
+    def compare_partial(p1, p2):
+        same = p1.func == p2.func
+        same = same and p1.args == p2.args
+        return same and p1.keywords == p2.keywords
+
+    if q1 is None or q2 is None:
+        return q1 == q2
+    else:
+        assert q1 is not None and q2 is not None
+        try:
+            return compare_partial(q1.activation.p, q2.activation.p) and compare_partial(q1.weight.p, q2.weight.p)
+        except AttributeError:
+            return q1 == q2

From 9253dc1e5819a5638e4c60b0721fe14258bbae55 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 18 Aug 2021 13:33:36 -0700
Subject: [PATCH 040/530] Fix segmentation fault due to access to destroyed
 CudaIPCGlobalEntities instance (#56141)

Summary:
There is an instance of the static destruction order fiasco where cuda_ipc_global_entities may be accessed after it is destroyed. See https://github.com/pytorch/pytorch/issues/51961

This change uses a flag and avoids accesses to the destroyed class when it is set to false.

Fixes https://github.com/pytorch/pytorch/issues/51961

This removes the function to clear shared_blocks introduced by https://github.com/pytorch/pytorch/issues/53080 which had multiple issues: Unprotected access to a shared structure and modification of the vector which is being cleared by the destructors of the objects contained.
I.e. what happened was:

- `CudaIPCSentDataLimbo_.clear_shared_blocks();` is called from the destructor of CudaIPCGlobalEntities as of your PR
- This deletes instances of `CudaIPCSentData` which hold `at::DataPtr` created by `GetNewRefCountedSentData`
- This means `CudaIPCSentDataDelete` is called with still active pointers
- Hence `CudaIPCSentDataLimbo_.add` is called adding a new value to `shared_blocks_`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/56141

Reviewed By: ejguan

Differential Revision: D30397279

Pulled By: VitalyFedyunin

fbshipit-source-id: ce4b8b90fa1c90d275e5eca93ba84321cbc6140a
---
 torch/csrc/CudaIPCTypes.cpp | 39 ++++++++++++++++++++++++++-----------
 torch/csrc/CudaIPCTypes.h   |  5 +----
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/torch/csrc/CudaIPCTypes.cpp b/torch/csrc/CudaIPCTypes.cpp
index 6b42ca078024a..9033d445081ea 100644
--- a/torch/csrc/CudaIPCTypes.cpp
+++ b/torch/csrc/CudaIPCTypes.cpp
@@ -19,24 +19,27 @@ void warnProducerTerminatedBeforeSharedTensorsReleased() {
 }
 
 struct CudaIPCGlobalEntities {
+  // This class is used as a singleton (see cuda_ipc_global_entities)
+  // This variable is used to track its lifetime to avoid accessing it
+  // after it was destroyed which would lead to segmentation faults
+  // Note that a trvial type is used which doesn't suffer from construction
+  // and destruction order issues
+  static bool alive;
+
   std::mutex ref_counters_mutex_;
   std::atomic<int64_t> sync_events_used_{0};
   std::map<std::string, std::shared_ptr<CudaIPCRefCountersFile>>
       ref_counters_files_;
   std::shared_ptr<CudaIPCRefCountersFile> next_available_ref_counters_file_;
   CudaIPCSentDataLimbo CudaIPCSentDataLimbo_;
-  CudaIPCGlobalEntities()  = default;
+  CudaIPCGlobalEntities() { alive = true; }
   ~CudaIPCGlobalEntities() {
     CudaIPCSentDataLimbo_.collect();
-    // Clear shared blocks to avoid releasing shared blocks after
-    // ~CudaIPCGlobalEntities is done since circular references causes the
-    // destructor of ~CudaIPCSentData to access the cuda_ipc_global_entities
-    // again.
-    CudaIPCSentDataLimbo_.clear_shared_blocks();
     safe_clean_current_file();
     if (next_available_ref_counters_file_) {
       warnProducerTerminatedBeforeSharedTensorsReleased();
     }
+    alive = false;
   }
   void safe_clean_current_file() {
     std::lock_guard<std::mutex> lock(ref_counters_mutex_);
@@ -48,19 +51,16 @@ struct CudaIPCGlobalEntities {
   }
 };
 
+bool CudaIPCGlobalEntities::alive = false;
 CudaIPCGlobalEntities cuda_ipc_global_entities;
 
 CudaIPCSentDataLimbo::~CudaIPCSentDataLimbo() {
   collect();
-  if (shared_blocks_.size() > 0) {
+  if (size() > 0) {
     warnProducerTerminatedBeforeSharedTensorsReleased();
   }
 }
 
-void CudaIPCSentDataLimbo::clear_shared_blocks() {
-  shared_blocks_.clear();
-}
-
 bool CudaIPCSentDataLimbo::collect() {
   bool freed_memory = false;
   std::vector<std::unique_ptr<CudaIPCSentData>> reset_blocks;
@@ -99,9 +99,17 @@ void CudaIPCSentDataLimbo::add(std::unique_ptr<CudaIPCSentData> shared_block) {
   shared_blocks_.push_back(std::move(shared_block));
 }
 
+uint64_t CudaIPCSentDataLimbo::size() {
+  std::lock_guard<std::mutex> lock(limbo_mutex_);
+  return shared_blocks_.size();
+}
+
 void CudaIPCSentDataDelete(void* ptr) {
   std::unique_ptr<CudaIPCSentData> sent_data(
       static_cast<CudaIPCSentData*>(ptr));
+  if(!CudaIPCGlobalEntities::alive) {
+    return;
+  }
   if (sent_data->counter_value() > 0) {
     cuda_ipc_global_entities.CudaIPCSentDataLimbo_.add(std::move(sent_data));
   }
@@ -109,6 +117,9 @@ void CudaIPCSentDataDelete(void* ptr) {
 }
 
 void ReturnRefCounter(const std::string& handle, uint64_t offset /* unused */) {
+  if(!CudaIPCGlobalEntities::alive) {
+    return;
+  }
   std::lock_guard<std::mutex> lock(
       cuda_ipc_global_entities.ref_counters_mutex_);
   auto& map = cuda_ipc_global_entities.ref_counters_files_;
@@ -180,6 +191,9 @@ CudaIPCSentData::~CudaIPCSentData() {
     if (event_sync_required_) {
       at::cuda::CUDAGuard device_guard(device_.index());
       cudaEventDestroy(event_);
+      if(!CudaIPCGlobalEntities::alive) {
+        return;
+      }
       cuda_ipc_global_entities.sync_events_used_ --;
     }
   } catch (...) { /* No throw */
@@ -226,6 +240,9 @@ at::DataPtr GetNewRefCountedSentData(void* data, at::Device device) {
 }
 
 bool CudaIPCCollect() {
+  if(!CudaIPCGlobalEntities::alive) {
+    return true;
+  }
   bool freed_memory = cuda_ipc_global_entities.CudaIPCSentDataLimbo_.collect();
   if (cuda_ipc_global_entities.CudaIPCSentDataLimbo_.size() == 0) {
     cuda_ipc_global_entities.safe_clean_current_file();
diff --git a/torch/csrc/CudaIPCTypes.h b/torch/csrc/CudaIPCTypes.h
index 63e1d1d416a5a..ab9ede006916d 100644
--- a/torch/csrc/CudaIPCTypes.h
+++ b/torch/csrc/CudaIPCTypes.h
@@ -63,11 +63,8 @@ constexpr int64_t CUDA_IPC_MAXIMUM_EVENTS_TO_USE = 1000;
 struct CudaIPCSentDataLimbo final {
   ~CudaIPCSentDataLimbo();
   bool collect();
-  void clear_shared_blocks();
   void add(std::unique_ptr<CudaIPCSentData> shared_block);
-  uint64_t size() {
-    return shared_blocks_.size();
-  }
+  uint64_t size();
 
  private:
   // TODO: Can be changed to FIFO in order to avoid full traverse on every

From feba6806c9b7a5e13ffd5839f8bc6ddc2e016a26 Mon Sep 17 00:00:00 2001
From: Michael Dagitses <mikeyd@fb.com>
Date: Wed, 18 Aug 2021 13:43:54 -0700
Subject: [PATCH 041/530] clarify that `torch.finfo.tiny` is the smallest
 normal number (#63241)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63241

This is a common source of confusion, but it matches the NumPy
behavior.

Fixes #44010
Fixes #59526

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30307646

Pulled By: dagitses

fbshipit-source-id: d848140ba267560387d83f3e7acba8c3cdc53d82
---
 docs/source/type_info.rst | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/source/type_info.rst b/docs/source/type_info.rst
index fe8eaa1769adf..0647cca544c0f 100644
--- a/docs/source/type_info.rst
+++ b/docs/source/type_info.rst
@@ -26,13 +26,18 @@ bits        int     The number of bits occupied by the type.
 eps         float   The smallest representable number such that ``1.0 + eps != 1.0``.
 max         float   The largest representable number.
 min         float   The smallest representable number (typically ``-max``).
-tiny        float   The smallest positive representable number.
+tiny        float   The smallest positive normal number. See notes.
 resolution  float   The approximate decimal resolution of this type, i.e., ``10**-precision``.
 ==========  =====   ========================================
 
 .. note::
   The constructor of :class:`torch.finfo` can be called without argument, in which case the class is created for the pytorch default dtype (as returned by :func:`torch.get_default_dtype`).
 
+.. note::
+  `tiny` returns the smallest *normal* number, but there are smaller
+  subnormal numbers. See https://en.wikipedia.org/wiki/Denormal_number
+  for more information.
+
 
 .. _iinfo-doc:
 

From 8bdd5424173cd08ddafd77cb45d38c7540ae72d6 Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Wed, 18 Aug 2021 14:46:25 -0700
Subject: [PATCH 042/530] [TensorExpr] Add debug logging to
 LoopNest::computeInline. (#63196)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63196

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30292778

Pulled By: ZolotukhinM

fbshipit-source-id: d8a111b75466a9354f6d048119cc6f814c9d5abb
---
 torch/csrc/jit/tensorexpr/loopnest.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index ea6f09349e444..e9bc76c6e8791 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -12,6 +12,7 @@
 #include <c10/util/string_utils.h>
 
 #include <ATen/core/functional.h>
+#include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/tensorexpr/analysis.h>
 #include <torch/csrc/jit/tensorexpr/bounds_inference.h>
 #include <torch/csrc/jit/tensorexpr/eval.h>
@@ -558,12 +559,20 @@ class FunctionInliner : public IRMutator {
       }
       // Add a mapping for each function parameter to it's source name.
       inline_mapping_[func_callee_arg] = func_caller_param;
+      GRAPH_DEBUG(
+          "ComputeInline: Inline mapping: ",
+          std::to_string(func_callee_arg),
+          " -> ",
+          std::to_string(func_caller_param));
       index_vars.push_back(func_callee_arg);
     }
 
     // Call the actual replacement.
     ExprPtr body = producer_->value();
+    GRAPH_DEBUG("ComputeInline: Before rewriting body: ", std::to_string(body));
     ExprPtr result = Expr::clone(body)->accept_mutator(this);
+    GRAPH_DEBUG(
+        "ComputeInline: After rewriting body: ", std::to_string(result));
 
     // Remove the mappings we created for this function parameters.
     for (auto v : index_vars) {
@@ -575,6 +584,7 @@ class FunctionInliner : public IRMutator {
           }
         }
       }
+      GRAPH_DEBUG("ComputeInline: Inline mapping: erasing", std::to_string(v));
       inline_mapping_.erase(v);
     }
     return result;
@@ -617,6 +627,8 @@ class FunctionInliner : public IRMutator {
     const std::string& name = buf_->name_hint();
     VarPtr new_var = alloc<Var>(name, v->dtype());
     random_bindings_[alloc<Let>(new_var, v)] = index_vars_;
+    GRAPH_DEBUG(
+        "ComputeInline: created random bindings for ", std::to_string(new_var));
     return new_var;
   }
 
@@ -731,6 +743,7 @@ bool LoopNest::computeInline(BufPtr b) {
 
   TORCH_INTERNAL_ASSERT(relevant_store);
 
+  GRAPH_DEBUG("ComputeInline: Def: ", std::to_string(relevant_store));
   FunctionInliner inliner(relevant_store, output_bufs_);
   root_stmt_ = root_stmt_->accept_mutator(&inliner);
 

From 7fdba4564af4c21727811aeaf3d58703d189f76d Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Wed, 18 Aug 2021 14:46:25 -0700
Subject: [PATCH 043/530] [TensorExpr] IRSimplifier: sort terms in polynomials,
 terms, minterms, maxterms. (#63197)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63197

This solves non-determinism from using hash values in sort methods.
Changes in tests are mostly mechanical.

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30292776

Pulled By: ZolotukhinM

fbshipit-source-id: 74f57b53c3afc9d4be45715fd74781271373e055
---
 test/cpp/tensorexpr/test_cuda.cpp           |  20 +-
 test/cpp/tensorexpr/test_loopnest.cpp       |  47 +--
 test/cpp/tensorexpr/test_reductions.cpp     |   8 +-
 test/cpp/tensorexpr/test_registerizer.cpp   |  76 ++---
 test/cpp/tensorexpr/test_simplify.cpp       | 322 +++++++-------------
 test/cpp/tensorexpr/test_utils.h            |   5 +
 torch/csrc/jit/tensorexpr/ir_simplifier.cpp |  99 +++++-
 torch/csrc/jit/tensorexpr/ir_simplifier.h   |  40 +--
 8 files changed, 288 insertions(+), 329 deletions(-)

diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp
index 3ca6e0d9f5c3a..e36e17ad432f9 100644
--- a/test/cpp/tensorexpr/test_cuda.cpp
+++ b/test/cpp/tensorexpr/test_cuda.cpp
@@ -1575,10 +1575,10 @@ TEST(Cuda, MaskMultiDim_CUDA) {
   const std::string& verification_pattern =
       R"IR(
 # CHECK-NOT: if (
-# CHECK: C[100 * blockIdx.x + threadIdx.x] =
+# CHECK: C[threadIdx.x + 100 * blockIdx.x] =
 # CHECK: __syncthreads();
 # CHECK: if (threadIdx.x<50
-# CHECK:   D[50 * blockIdx.x + threadIdx.x] =)IR";
+# CHECK:   D[threadIdx.x + 50 * blockIdx.x] =)IR";
 
   torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
 
@@ -1705,10 +1705,10 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) {
   const std::string& verification_pattern =
       R"IR(
 # CHECK: if (threadIdx.x<A_SIZE
-# CHECK:   C[threadIdx.x + A_SIZE * blockIdx.x] =
+# CHECK:   C[A_SIZE * blockIdx.x + threadIdx.x] =
 # CHECK: __syncthreads();
 # CHECK: if (threadIdx.x<B_SIZE
-# CHECK:   D[threadIdx.x + B_SIZE * blockIdx.x] =)IR";
+# CHECK:   D[B_SIZE * blockIdx.x + threadIdx.x] =)IR";
 
   torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
 
@@ -1852,10 +1852,10 @@ TEST(Cuda, MaskCompoundInnerLoop_CUDA) {
   const std::string& verification_pattern =
       R"IR(
 # CHECK-NOT: if (
-# CHECK: c[100 * blockIdx.x + threadIdx.x] =
+# CHECK: c[threadIdx.x + 100 * blockIdx.x] =
 # CHECK: __syncthreads();
 # CHECK: if (threadIdx.x<50
-# CHECK:   d[50 * blockIdx.x + threadIdx.x] =)IR";
+# CHECK:   d[threadIdx.x + 50 * blockIdx.x] =)IR";
 
   torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
 
@@ -1991,10 +1991,10 @@ TEST(Cuda, MaskInnerLoopOneBlock_CUDA) {
       R"IR(
 # CHECK: for (int i = 0; i < 10
 # CHECK-NOT: if (
-# CHECK: c[100 * i + threadIdx.x] =
+# CHECK: c[threadIdx.x + 100 * i] =
 # CHECK: __syncthreads();
 # CHECK: if (threadIdx.x<50
-# CHECK:   d[50 * i + threadIdx.x] =)IR";
+# CHECK:   d[threadIdx.x + 50 * i] =)IR";
 
   torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
 
@@ -2119,7 +2119,7 @@ TEST(Cuda, MaskMultiDimMultiAxis_CUDA) {
   const std::string& verification_pattern =
       R"IR(
 # CHECK: if (threadIdx.y<1
-# CHECK:   C[30 * blockIdx.x + threadIdx.x] =
+# CHECK:   C[threadIdx.x + 30 * blockIdx.x] =
 # CHECK: __syncthreads();
 # CHECK: if (threadIdx.x<1
 # CHECK:   D[threadIdx.y + 15 * blockIdx.x] =)IR";
@@ -2250,7 +2250,7 @@ TEST(Cuda, MaskMultiDimMultiLevel_CUDA) {
   const std::string& verification_pattern =
       R"IR(
 # CHECK-NOT: if (
-# CHECK: C[30 * blockIdx.x + threadIdx.x] =
+# CHECK: C[threadIdx.x + 30 * blockIdx.x] =
 # CHECK: __syncthreads();
 # CHECK: if (blockIdx.x<5
 # CHECK:   if (threadIdx.x<15
diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index 440b169d57259..f2ae208ca7fed 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -29,6 +29,17 @@ void checkIR(StmtPtr s, const std::string& pattern) {
   torch::jit::testing::FileCheck().run(pattern, oss.str());
 }
 
+void checkExprIR(ExprPtr e, const std::string& pattern) {
+  std::string prefixed_pattern = "# CHECK: " + pattern + "\n";
+  std::ostringstream oss;
+  oss << *e << "\n";
+  torch::jit::testing::FileCheck().run(prefixed_pattern, oss.str());
+}
+
+void checkExprIR(const ExprHandle& e, const std::string& pattern) {
+  checkExprIR(e.node(), pattern);
+}
+
 TEST(LoopNest, ExprSimple01) {
   KernelScope kernel_scope;
   Tensor* tensor = Compute(
@@ -1305,7 +1316,7 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) {
 # CHECK: for (int m2 = 0; m2 < 4; m2++)
 # CHECK:   for (int n2 = 0; n2 < 5; n2++)
 # CHECK:     for (int k2 = 0; k2 < 6; k2++)
-# CHECK:       y[m2, n2, k2] = ((n2 * m2) * k2 + (rand())) + (rand());)IR");
+# CHECK:       y[m2, n2, k2] = ((k2 * m2) * n2 + (rand())) + (rand());)IR");
 }
 
 // Make sure we generate the right number of random values == the dimensionality
@@ -1710,11 +1721,11 @@ TEST(LoopNest, ScheduleInlineOutputTensors) {
 # CHECK: for (int m1 = 0; m1 < 4; m1++)
 # CHECK:   for (int n1 = 0; n1 < 5; n1++)
 # CHECK:     for (int k1 = 0; k1 < 6; k1++)
-# CHECK:       x[m1, n1, k1] = (n1 * m1) * k1;
+# CHECK:       x[m1, n1, k1] = (k1 * m1) * n1;
 # CHECK: for (int m2 = 0; m2 < 4; m2++)
 # CHECK:   for (int n2 = 0; n2 < 5; n2++)
 # CHECK:     for (int k2 = 0; k2 < 6; k2++)
-# CHECK:       y[m2, n2, k2] = (n2 * m2) * k2 + m2;)IR");
+# CHECK:       y[m2, n2, k2] = (k2 * m2) * n2 + m2;)IR");
 }
 
 TEST(LoopNest, ScheduleFuserStyle) {
@@ -2130,7 +2141,7 @@ TEST(LoopNest, Reduce2dComputeAt) {
 # CHECK:     cons[(0 + cy * (1 * W)) + cx * 1] = int(0);
 # CHECK:     for (int r = 0; r < 2; r++) {
 # CHECK:       for (int s = 0; s < 2; s++) {
-# CHECK:         cons[(0 + cy * (1 * W)) + cx * 1] = (cons[(0 + cy * (1 * W)) + cx * 1]) + (temp[(0 + r * (1 * (W + 1))) + (s + cx) * 1]);
+# CHECK:         cons[(0 + cy * (1 * W)) + cx * 1] = (cons[(0 + cy * (1 * W)) + cx * 1]) + (temp[(0 + r * (1 * (W + 1))) + (cx + s) * 1]);
 # CHECK:       }
 # CHECK:     }
 # CHECK:   }
@@ -3225,7 +3236,7 @@ TEST(LoopNest, NormalizeStartVariable) {
       {Store::make(a_buf, {x}, Load::make(kInt, b_buf, {x})),
        Store::make(b_buf, {x}, x * 2)});
   auto for_stmt = For::make(x, y, 100, for_body);
-  Block::make({for_stmt});
+  auto parent_block = Block::make({for_stmt});
 
   LoopNest::normalize(for_stmt);
 
@@ -3235,8 +3246,8 @@ TEST(LoopNest, NormalizeStartVariable) {
   const std::string& expected_ir =
       R"IR(
         # CHECK: for (int x = 0; x < 100 - y; x++) {
-        # CHECK:   A[y + x] = B[y + x];
-        # CHECK:   B[y + x] = 2 * (y + x);
+        # CHECK:   A[x + y] = B[x + y];
+        # CHECK:   B[x + y] = 2 * (x + y);
       )IR";
   torch::jit::testing::FileCheck().run(expected_ir, oss.str());
 }
@@ -3304,7 +3315,7 @@ TEST(LoopNest, NormalizeOnNestedInnerLoop) {
       R"IR(
         # CHECK: for (int x = 50; x < 100; x++) {
         # CHECK:   for (int y = 0; y < 90; y++) {
-        # CHECK:     A[x] = (((B[y + 10]) + 2 * y) + (A[x])) + 20;
+        # CHECK:     A[x] = (((A[x]) + (B[y + 10])) + 2 * y) + 20;
       )IR";
   torch::jit::testing::FileCheck().run(expected_ir, oss.str());
 }
@@ -3327,7 +3338,7 @@ TEST(LoopNest, NormalizeAndSplitWithTail) {
   BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt);
   VarHandle x("x", kInt);
   auto for_stmt = For::make(x, 5, 10, Store::make(a_buf, {x}, x * 2));
-  Block::make({for_stmt});
+  auto parent_block = Block::make({for_stmt});
 
   LoopNest::normalize(for_stmt);
 
@@ -3373,7 +3384,7 @@ TEST(LoopNest, FlattenSimpleLoopNest2D) {
   auto for_body = Block::make({Store::make(a_buf, {i, j}, i * j)});
   auto inner_for = For::make(j, 0, 5, for_body);
   auto outer_for = For::make(i, 0, 10, inner_for);
-  Block::make({outer_for});
+  auto parent_block = Block::make({outer_for});
 
   std::vector<ForPtr> loops = {outer_for, inner_for};
   ForPtr flattened = nullptr;
@@ -3420,7 +3431,7 @@ TEST(LoopNest, FlattenSimpleLoopNest3D) {
   auto for1 = For::make(k, 0, 7, for_body);
   auto for2 = For::make(j, 0, 5, for1);
   auto for3 = For::make(i, 0, 10, for2);
-  Block::make({for3});
+  auto parent_block = Block::make({for3});
 
   std::vector<ForPtr> loops = {for3, for2, for1};
   ForPtr flattened = nullptr;
@@ -3463,7 +3474,7 @@ TEST(LoopNest, FlattenLoopNestAfterNormalize) {
   auto for_body = Block::make({Store::make(a_buf, {i - 2, j - 3}, i * j)});
   auto inner_for = For::make(j, 3, 15, for_body);
   auto outer_for = For::make(i, 2, 10, inner_for);
-  Block::make({outer_for});
+  auto parent_block = Block::make({outer_for});
 
   std::vector<ForPtr> loops = {outer_for, inner_for};
   ForPtr flattened = nullptr;
@@ -3712,7 +3723,7 @@ TEST(LoopNest, CacheReadsSimple) {
 #CHECK:   A_local[j_1] = A[
 #CHECK:  }
 #CHECK:  for (int j_2
-#CHECK:   B[10 * i_1 + j_2] = A_local[j_2];
+#CHECK:   B[j_2 + 10 * i_1] = A_local[j_2];
 #CHECK:  }
 #CHECK: }
 #CHECK: for (int i_2
@@ -3769,7 +3780,7 @@ TEST(LoopNest, CacheReadsOuter) {
   checkIR(result, R"IR(
 #CHECK: Allocate(A_local); // dtype=int, dims=[21, 11]
 #CHECK: A_local[j_1 + 11 * i_1] =
-#CHECK: B[10 * i_2 + j_2] = (A_local[(j_2 + 11 * i_2) + 12]) + (A_local[j_2 + 11 * i_2]);
+#CHECK: B[j_2 + 10 * i_2] = (A_local[j_2 + 11 * i_2]) + (A_local[(j_2 + 11 * i_2) + 12]);
       )IR");
 
   std::vector<int> b_data(200, 0);
@@ -3816,7 +3827,7 @@ TEST(LoopNest, CacheReadsInternal) {
   checkIR(result, R"IR(
 #CHECK: Allocate(A_local); // dtype=int, dims=[2, 11]
 #CHECK: A_local[j_1 + 11 * i_2] =
-#CHECK: B[10 * i_1 + j_2] = (A_local[j_2 + 12]) + (A_local[j_2]);
+#CHECK: B[j_2 + 10 * i_1] = (A_local[j_2 + 12]) + (A_local[j_2]);
       )IR");
 
   std::vector<int> b_data(200, 0);
@@ -3863,8 +3874,8 @@ TEST(LoopNest, CacheReadsInner) {
 
   checkIR(result, R"IR(
 #CHECK: Allocate(A_local); // dtype=int, dims=[5, 2]
-#CHECK: A_local[2 * i_2 + j_2] =
-#CHECK: B[10 * i_1 + j_1] = (A_local[1]) + (A_local[8]);
+#CHECK: A_local[j_2 + 2 * i_2] =
+#CHECK: B[j_1 + 10 * i_1] = (A_local[1]) + (A_local[8]);
       )IR");
 
   std::vector<int> b_data(200, 0);
@@ -3914,7 +3925,7 @@ TEST(LoopNest, CacheWritesSimple) {
 #CHECK: for (int j = 0; j < 64
 #CHECK:   A_local[j] = i * j;
 #CHECK: for (int j_1 = 0; j_1 < 64
-#CHECK:   A[64 * i + j_1] = A_local[
+#CHECK:   A[j_1 + 64 * i] = A_local[
 #CHECK: Free(A_local);
 #CHECK-NOT: A_local
       )IR");
diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp
index bd71a4fd8da14..0d033e0bd8a1f 100644
--- a/test/cpp/tensorexpr/test_reductions.cpp
+++ b/test/cpp/tensorexpr/test_reductions.cpp
@@ -1578,8 +1578,8 @@ TEST(Reductions, ReductionCacheBodyAccess) {
 #CHECK: Allocate(scale_local); // dtype=float, dims=[1, 32, 12]
 #CHECK: for (int j = 0; j < 32; j++) {
 #CHECK:   for (int k = 0; k < 12; k++) {
-#CHECK:     scale_local[k + 12 * j] = scale[(k + 384 * l1) + 12 * j];
-#CHECK: sum[l1] = (sum[l1]) + (scale_local[12 * n1_1 + m1_1]);
+#CHECK:     scale_local[k + 12 * j] = scale[(k + 12 * j) + 384 * l1];
+#CHECK: sum[l1] = (sum[l1]) + (scale_local[m1_1 + 12 * n1_1]);
 #CHECK: scale_1[l] = (b[l]) * (sum[l]);
 #CHECK: Free(scale_local);
       )IR";
@@ -1667,7 +1667,7 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) {
   const std::string& expected_ir =
       R"IR(
 #CHECK: Allocate(sum_local); // dtype=float, dims=[4]
-#CHECK: sum[l1_inner + 4 * l1_outer] = (sum[l1_inner + 4 * l1_outer]) + (scale[((12 * n1_1 + 384 * l1_inner) + m1_1) + 1536 * l1_outer]);
+#CHECK: sum[l1_inner + 4 * l1_outer] = (sum[l1_inner + 4 * l1_outer]) + (scale[((m1_1 + 12 * n1_1) + 1536 * l1_outer) + 384 * l1_inner]);
 #CHECK: for (int i = 0; i < 4
 #CHECK:   sum_local[i] = sum[i + 4 * l_outer];
 #CHECK:   scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]);
@@ -1716,7 +1716,7 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) {
   const std::string& expected_ir =
       R"IR(
 #CHECK: Allocate(sum_local); // dtype=float, dims=[4]
-#CHECK: sum[l1] = (sum[l1]) + (scale[(12 * n1_1 + m1_1) + 384 * l1]);
+#CHECK: sum[l1] = (sum[l1]) + (scale[(m1_1 + 12 * n1_1) + 384 * l1]);
 #CHECK: for (int i = 0; i < 4
 #CHECK:   sum_local[i] = sum[i + 4 * l_outer];
 #CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]);
diff --git a/test/cpp/tensorexpr/test_registerizer.cpp b/test/cpp/tensorexpr/test_registerizer.cpp
index a0ac095db757f..98a53058a1a65 100644
--- a/test/cpp/tensorexpr/test_registerizer.cpp
+++ b/test/cpp/tensorexpr/test_registerizer.cpp
@@ -192,8 +192,8 @@ TEST(Registerizer, RegisterizerLoopInternal) {
       R"IR(
 # CHECK: for (int x = 0; x < 10; x++)
 # CHECK: int A_1 = A[x];
-# CHECK:   A_1 = x + A_1;
-# CHECK:   A_1 = x + A_1;
+# CHECK:   A_1 = A_1 + x;
+# CHECK:   A_1 = A_1 + x;
 # CHECK:   A[x] = A_1;
 # CHECK: })IR";
 
@@ -273,12 +273,12 @@ TEST(Registerizer, RegisterizerLoopInternalRepeated) {
    * int A_1 = A[1];
    * int A_2 = A[0];
    * for (int x = 0; x < 10; x++) {
-   *   A_2 = x + A_1;
-   *   A_2 = x + A_1;
+   *   A_2 = A_1 + x;
+   *   A_2 = A_1 + x;
    * }
    * for (int x = 0; x < 10; x++) {
-   *   A_2 = x + A_1;
-   *   A_2 = x + A_1;
+   *   A_2 = A_1 + x;
+   *   A_2 = A_1 + x;
    * }
    * A[0] = A_2;
    */
@@ -291,12 +291,12 @@ TEST(Registerizer, RegisterizerLoopInternalRepeated) {
 # CHECK: int A_1 = A[1];
 # CHECK: int A_2 = A[0];
 # CHECK: for (int x = 0; x < 10; x++)
-# CHECK:   A_2 = x + A_1;
-# CHECK:   A_2 = x + A_1;
+# CHECK:   A_2 = A_1 + x;
+# CHECK:   A_2 = A_1 + x;
 # CHECK: }
 # CHECK: for (int x = 0; x < 10; x++)
-# CHECK:   A_2 = x + A_1;
-# CHECK:   A_2 = x + A_1;
+# CHECK:   A_2 = A_1 + x;
+# CHECK:   A_2 = A_1 + x;
 # CHECK: }
 # CHECK-NOT: A[1]
 # CHECK: A[0] = A_2;
@@ -357,7 +357,7 @@ TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapOther) {
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
-  StmtPtr stmt = Block::make(
+  StmtPtr stmt = IRSimplifier::simplify(Block::make(
       {For::make(
            x,
            0,
@@ -373,7 +373,7 @@ TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapOther) {
                {Store::make(a, {0}, Add::make(x, Load::make(a, {y}))),
                 Store::make(a, {0}, Add::make(x, Load::make(a, {y})))}))
 
-      });
+      }));
 
   /*
    * for (int x = 0; x < 10; x++) {
@@ -2044,7 +2044,7 @@ TEST(Registerizer, RegisterizerPartialAfter) {
   /*
    * int A_1 = 0;
    * for (int x = 0; x < 10; x++) {
-   *   A_1 = x + A_1;
+   *   A_1 = A_1 + x;
    * }
    * A[0] = A_1;
    * for (int x = 1; x < 10; x++) {
@@ -2059,7 +2059,7 @@ TEST(Registerizer, RegisterizerPartialAfter) {
       R"IR(
 # CHECK: int A_1 = 0;
 # CHECK: for (
-# CHECK:   A_1 = x + A_1;
+# CHECK:   A_1 = A_1 + x;
 # CHECK: }
 # CHECK: A[0] = A_1;
 # CHECK: for (
@@ -2104,7 +2104,7 @@ TEST(Registerizer, RegisterizerPartialBefore) {
    * }
    * int A_1 = 0;
    * for (int x = 0; x < 10; x++) {
-   *   A_1 = x + A_1;
+   *   A_1 = A_1 + x;
    * }
    * A[0] = A_1;
    */
@@ -2120,7 +2120,7 @@ TEST(Registerizer, RegisterizerPartialBefore) {
 # CHECK: }
 # CHECK: int A_1 = 0;
 # CHECK: for (
-# CHECK:   A_1 = x + A_1;
+# CHECK:   A_1 = A_1 + x;
 # CHECK: }
 # CHECK: A[0] = A_1;)IR";
 
@@ -2161,7 +2161,7 @@ TEST(Registerizer, RegisterizerPartialInside) {
   /*
    * int A_1 = 2;
    * for (int x1 = 0; x1 < 10; x1++) {
-   *   A_1 = x1 + A_1;
+   *   A_1 = A_1 + x1;
    * }
    * A[0] = A_1;
    * for (int x2 = 1; x2 < 10; x2++) {
@@ -2169,7 +2169,7 @@ TEST(Registerizer, RegisterizerPartialInside) {
    * }
    * int A_2 = A[0];
    * for (int x3 = 0; x3 < 10; x3++) {
-   *   A_2 = x3 + A_2;
+   *   A_2 = A_2 + x3;
    * }
    * A[0] = A_2;
    */
@@ -2181,7 +2181,7 @@ TEST(Registerizer, RegisterizerPartialInside) {
       R"IR(
 # CHECK: int A_1 = 2;
 # CHECK: for (
-# CHECK:   A_1 = x1 + A_1;
+# CHECK:   A_1 = A_1 + x1;
 # CHECK: }
 # CHECK: A[0] = A_1;
 # CHECK: for (
@@ -2189,7 +2189,7 @@ TEST(Registerizer, RegisterizerPartialInside) {
 # CHECK: }
 # CHECK: int A_2 = A[0];
 # CHECK: for (
-# CHECK:   A_2 = x3 + A_2;
+# CHECK:   A_2 = A_2 + x3;
 # CHECK: }
 # CHECK: A[0] = A_2;)IR";
 
@@ -2232,7 +2232,7 @@ TEST(Registerizer, RegisterizerPartialCondition) {
   /*
    * int A_1 = 2;
    * for (int x = 0; x < 10; x++) {
-   *   A_1 = x + A_1;
+   *   A_1 = A_1 + x;
    * }
    * A[0] = A_1;
    * if (x<5 ? 1 : 0) {
@@ -2240,7 +2240,7 @@ TEST(Registerizer, RegisterizerPartialCondition) {
    * }
    * int A_2 = A[0];
    * for (int x = 0; x < 10; x++) {
-   *   A_2 = x + A_2;
+   *   A_2 = A_2 + x;
    * }
    * A[0] = A_2;
    */
@@ -2252,7 +2252,7 @@ TEST(Registerizer, RegisterizerPartialCondition) {
       R"IR(
 # CHECK: int A_1 = 2;
 # CHECK: for (
-# CHECK:   A_1 = x + A_1;
+# CHECK:   A_1 = A_1 + x;
 # CHECK: }
 # CHECK: A[0] = A_1;
 # CHECK: if (
@@ -2260,7 +2260,7 @@ TEST(Registerizer, RegisterizerPartialCondition) {
 # CHECK: }
 # CHECK: int A_2 = A[0];
 # CHECK: for (
-# CHECK:   A_2 = x + A_2;
+# CHECK:   A_2 = A_2 + x;
 # CHECK: }
 # CHECK: A[0] = A_2;)IR";
 
@@ -2937,7 +2937,7 @@ TEST(Registerizer, RegisterizerNestedLoopSimple) {
    * for (int y = 0; y < 10; y++) {
    *   int A_1 = A[y];
    *   for (int x = 0; x < 10; x++) {
-   *     A_1 = x + A_1;
+   *     A_1 = A_1 + x;
    *   }
    * A[y] = A_1;
    * }
@@ -2951,7 +2951,7 @@ TEST(Registerizer, RegisterizerNestedLoopSimple) {
 # CHECK: for (int y
 # CHECK:   int A_1 = A[y];
 # CHECK:   for (int x
-# CHECK:     A_1 = x + A_1;
+# CHECK:     A_1 = A_1 + x;
 # CHECK:   }
 # CHECK:   A[y] = A_1;
 # CHECK: })IR";
@@ -3366,13 +3366,13 @@ TEST(Registerizer, RegisterizerLoopLetVar) {
   BufHandle a("A", {10}, kInt);
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
-  StmtPtr stmt = Block::make({For::make(
+  StmtPtr stmt = IRSimplifier::simplify(Block::make({For::make(
       x,
       0,
       10,
       Block::make(
           {Let::make(y, 30),
-           Store::make(a, {y}, Add::make(x, Load::make(a, {y})))}))});
+           Store::make(a, {y}, Add::make(x, Load::make(a, {y})))}))}));
 
   /*
    * for (int x = 0; x < 10; x++) {
@@ -3422,7 +3422,7 @@ TEST(Registerizer, RegisterizerLoopLetVarOuter) {
    * int y = 30;
    * int A_1 = A[y];
    * for (int x = 0; x < 10; x++) {
-   *   A_1 = x + A_1;
+   *   A_1 = A_1 + x;
    * }
    * A[y] = A_1;
    */
@@ -3435,7 +3435,7 @@ TEST(Registerizer, RegisterizerLoopLetVarOuter) {
 # CHECK: int y = 30;
 # CHECK: int A_1 = A[y];
 # CHECK: for (int x
-# CHECK:   A_1 = x + A_1;
+# CHECK:   A_1 = A_1 + x;
 # CHECK: A[y] = A_1;)IR";
 
   torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
@@ -3516,7 +3516,7 @@ TEST(Registerizer, RegisterizerMultiDimPartial) {
    * int A_1 = A[0, 1, 4];
    * int A_2 = A[0, 2, 2];
    * for (int x = 0; x < 10; x++) {
-   *   A_2 = x + A_1;
+   *   A_2 = A_1 + x;
    * }
    * A[0, 2, 2] = A_2;
    */
@@ -3530,7 +3530,7 @@ TEST(Registerizer, RegisterizerMultiDimPartial) {
 # CHECK: int A_1 = A[0, 1, 4];
 # CHECK: int A_2 = A[0, 2, 2];
 # CHECK: for (
-# CHECK:   A_2 = x + A_1;
+# CHECK:   A_2 = A_1 + x;
 # CHECK: A[0, 2, 2] = A_2;)IR";
 
   torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
@@ -3599,7 +3599,7 @@ TEST(Registerizer, RegisterizerMultiDimPartialOverlap) {
    * A[0, 1, 2] = 0;
    * int A_1 = A[y, 2, 4];
    * for (int x = 0; x < 10; x++) {
-   *   A[0, x, 2] = x + A_1;
+   *   A[0, x, 2] = A_1 + x;
    * }
    */
 
@@ -3611,7 +3611,7 @@ TEST(Registerizer, RegisterizerMultiDimPartialOverlap) {
 # CHECK: A[0, 1, 2] = 0;
 # CHECK: int A_1 = A[y, 2, 4];
 # CHECK: for (
-# CHECK:   A[0, x, 2] = x + A_1;
+# CHECK:   A[0, x, 2] = A_1 + x;
 # CHECK: })IR";
 
   torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
@@ -3736,12 +3736,12 @@ TEST(Registerizer, RegisterizerMultiDim3DReduction2) {
 
   /*
    * for (int x = 0; x < 10; x++) {
-   *   int C_1 = C[x];
    *   int A_1 = A[x];
+   *   int C_1 = C[x];
    *   for (int y = 0; y < 10; y++) {
    *     int B_1 = B[y];
    *     for (int z = 0; z < 10; z++) {
-   *       C_1 = C_1 + A_1 * B_1;
+   *       C_1 = A_1 * B_1 + C_1;
    *     }
    *   }
    *   C[x] = C_1;
@@ -3754,12 +3754,12 @@ TEST(Registerizer, RegisterizerMultiDim3DReduction2) {
   const std::string& verification_pattern =
       R"IR(
 # CHECK: for (int x
-# CHECK:   int C_1 = C[x];
 # CHECK:   int A_1 = A[x];
+# CHECK:   int C_1 = C[x];
 # CHECK:   for (int y
 # CHECK:     int B_1 = B[y];
 # CHECK:       for (int z
-# CHECK:         C_1 = C_1 + A_1 * B_1;
+# CHECK:         C_1 = A_1 * B_1 + C_1;
 # CHECK:       }
 # CHECK:     }
 # CHECK:   C[x] = C_1;
diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp
index a08d4ca974fd1..c25ae4f68a1fc 100644
--- a/test/cpp/tensorexpr/test_simplify.cpp
+++ b/test/cpp/tensorexpr/test_simplify.cpp
@@ -649,12 +649,12 @@ TEST(Simplify, SimplifyMultiVar) {
   ASSERT_NE(lhs, nullptr);
   VarPtr varX = to<Var>(lhs->rhs());
   ASSERT_NE(varX, nullptr);
-  ASSERT_EQ(varX->name_hint(), "y");
+  ASSERT_EQ(varX->name_hint(), "x");
   MulPtr rhs = to<Mul>(root->rhs());
   ASSERT_NE(rhs, nullptr);
   VarPtr varY = to<Var>(rhs->rhs());
   ASSERT_NE(varY, nullptr);
-  ASSERT_EQ(varY->name_hint(), "x");
+  ASSERT_EQ(varY->name_hint(), "y");
 }
 
 // x + 2 + y => x + y + 2
@@ -698,8 +698,8 @@ TEST(Simplify, SimplifyAdds) {
     IS_NODE_WITH_NAME(Mul, simplified.node(), root);
     IS_IMM_WITH_VAL(Int, root->lhs(), 2);
     IS_NODE_WITH_NAME(Add, root->rhs(), add);
-    IS_VAR_WITH_NAME(add->lhs(), "y");
-    IS_VAR_WITH_NAME(add->rhs(), "x");
+    IS_VAR_WITH_NAME(add->lhs(), "x");
+    IS_VAR_WITH_NAME(add->rhs(), "y");
   }
 
   {
@@ -770,11 +770,11 @@ TEST(Simplify, SimplifyMuls) {
 
     IS_NODE_WITH_NAME(Mul, simplified.node(), mul);
     IS_NODE_WITH_NAME(Add, mul->lhs(), lhs);
-    IS_VAR_WITH_NAME(lhs->lhs(), "y");
-    IS_VAR_WITH_NAME(lhs->rhs(), "x");
+    IS_VAR_WITH_NAME(lhs->lhs(), "x");
+    IS_VAR_WITH_NAME(lhs->rhs(), "y");
     IS_NODE_WITH_NAME(Add, mul->rhs(), rhs);
-    IS_VAR_WITH_NAME(rhs->lhs(), "y");
-    IS_VAR_WITH_NAME(rhs->rhs(), "x");
+    IS_VAR_WITH_NAME(rhs->lhs(), "x");
+    IS_VAR_WITH_NAME(rhs->rhs(), "y");
   }
 
   {
@@ -867,8 +867,8 @@ TEST(Simplify, SimplifyMuls) {
     ExprHandle simplified = IRSimplifier::simplify(body);
     IS_NODE_WITH_NAME(Mul, simplified.node(), mul);
     IS_NODE_WITH_NAME(Add, mul->lhs(), lhs);
-    IS_VAR_WITH_NAME(lhs->lhs(), "y");
-    IS_VAR_WITH_NAME(lhs->rhs(), "x");
+    IS_VAR_WITH_NAME(lhs->lhs(), "x");
+    IS_VAR_WITH_NAME(lhs->rhs(), "y");
     IS_NODE_WITH_NAME(Sub, mul->rhs(), rhs);
     IS_VAR_WITH_NAME(rhs->lhs(), "x");
     IS_VAR_WITH_NAME(rhs->rhs(), "y");
@@ -1654,14 +1654,14 @@ TEST(Simplify, SimplifyMultiOp) {
   }
 
   {
-    // (x + y) - (x * y) => x + y - (x * y)
-    ExprHandle body = (x + y) - (x * y);
+    // (x + y) - x * y => (x + y) - x * y
+    ExprHandle body = (x + y) - x * y;
     ExprHandle simplified = IRSimplifier::simplify(body);
     IS_NODE_WITH_NAME(Sub, simplified.node(), sub);
     IS_NODE_WITH_NAME(Add, sub->lhs(), add);
     IS_NODE_WITH_NAME(Mul, sub->rhs(), mul);
-    IS_VAR_WITH_NAME(add->lhs(), "y");
-    IS_VAR_WITH_NAME(add->rhs(), "x");
+    IS_VAR_WITH_NAME(add->lhs(), "x");
+    IS_VAR_WITH_NAME(add->rhs(), "y");
     IS_VAR_WITH_NAME(mul->lhs(), "x");
     IS_VAR_WITH_NAME(mul->rhs(), "y");
   }
@@ -1709,19 +1709,19 @@ TEST(Simplify, SimplifyManyOps) {
   VarHandle y("y", kInt);
 
   {
-    // x + y + x + x + y + y + x + y + x = 5 * x + 4 * y
+    // x + y + x + x + y + y + x + y + x = 4 * y + 5 * x
     ExprHandle body = x + y + x + x + y + y + x + y + x;
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Add, simplified.node(), add);
 
     IS_NODE_WITH_NAME(Mul, add->lhs(), lhs);
-    IS_IMM_WITH_VAL(Int, lhs->lhs(), 5);
-    IS_VAR_WITH_NAME(lhs->rhs(), "x");
+    IS_IMM_WITH_VAL(Int, lhs->lhs(), 4);
+    IS_VAR_WITH_NAME(lhs->rhs(), "y");
 
     IS_NODE_WITH_NAME(Mul, add->rhs(), rhs);
-    IS_IMM_WITH_VAL(Int, rhs->lhs(), 4);
-    IS_VAR_WITH_NAME(rhs->rhs(), "y");
+    IS_IMM_WITH_VAL(Int, rhs->lhs(), 5);
+    IS_VAR_WITH_NAME(rhs->rhs(), "x");
   }
 
   {
@@ -1765,8 +1765,8 @@ TEST(Simplify, SimplifyFactorization) {
     IS_IMM_WITH_VAL(Int, mul->lhs(), 2);
 
     IS_NODE_WITH_NAME(Add, mul->rhs(), add);
-    IS_VAR_WITH_NAME(add->lhs(), "y");
-    IS_VAR_WITH_NAME(add->rhs(), "x");
+    IS_VAR_WITH_NAME(add->lhs(), "x");
+    IS_VAR_WITH_NAME(add->rhs(), "y");
   }
 
   {
@@ -1794,12 +1794,12 @@ TEST(Simplify, SimplifyFactorization) {
     IS_NODE_WITH_NAME(Add, simplified.node(), add);
 
     IS_NODE_WITH_NAME(Mul, add->lhs(), lhs);
-    IS_IMM_WITH_VAL(Int, lhs->lhs(), 5);
-    IS_VAR_WITH_NAME(lhs->rhs(), "y");
+    IS_IMM_WITH_VAL(Int, lhs->lhs(), 2);
+    IS_VAR_WITH_NAME(lhs->rhs(), "x");
 
     IS_NODE_WITH_NAME(Mul, add->rhs(), rhs);
-    IS_IMM_WITH_VAL(Int, rhs->lhs(), 2);
-    IS_VAR_WITH_NAME(rhs->rhs(), "x");
+    IS_IMM_WITH_VAL(Int, rhs->lhs(), 5);
+    IS_VAR_WITH_NAME(rhs->rhs(), "y");
   }
 
   {
@@ -1813,8 +1813,8 @@ TEST(Simplify, SimplifyFactorization) {
     IS_IMM_WITH_VAL(Int, mul->lhs(), 10);
 
     IS_NODE_WITH_NAME(Add, mul->rhs(), add);
-    IS_VAR_WITH_NAME(add->lhs(), "y");
-    IS_VAR_WITH_NAME(add->rhs(), "x");
+    IS_VAR_WITH_NAME(add->lhs(), "x");
+    IS_VAR_WITH_NAME(add->rhs(), "y");
   }
 
   {
@@ -1863,18 +1863,12 @@ TEST(Simplify, SimplifyFactorization) {
     VarHandle g("g", kInt);
     VarHandle h("h", kInt);
 
-    ExprHandle body = ExprHandle(0) + (ExprHandle(1024) * a) +
-        (ExprHandle(-1) * b) + (ExprHandle(-1) * c) + (ExprHandle(1) * d) +
-        (ExprHandle(1) * e) + (ExprHandle(32) * f) + (ExprHandle(-1024) * g) +
-        (ExprHandle(-32) * h);
+    ExprHandle body = a * 1024 + 0 + b * (-1) + c * (-1) + d * 1 + e * 1 +
+        f * 32 + g * (-1024) + h * (-32);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    // We only check for the top level nodes here, since the main purpose
-    // here is ensure that this simplification completes.
-    IS_NODE_WITH_NAME(Sub, simplified.node(), sub);
-    IS_NODE_WITH_NAME(Mul, sub->rhs(), mul);
-    IS_IMM_WITH_VAL(Int, mul->lhs(), 1024);
-    IS_VAR_WITH_NAME(mul->rhs(), "g");
+    checkExprIR(
+        simplified,
+        "((((((d + e) + 1024 * a) + 32 * f) - b) - c) - 1024 * g) - 32 * h");
   }
 }
 
@@ -1904,7 +1898,7 @@ TEST(Simplify, SimplifyFactorizeUneven) {
   IS_VAR_WITH_NAME(zmul->rhs(), "z");
 }
 
-// (x * y) + (2 * x) * (x + y) => 3 * (x * y) + 2 * (x * x)
+// (x * y) + (2 * x) * (x + y) => 2 * (x * x) + 3 * (x * y)
 // This is kind of a placeholder test for variable factorization.
 TEST(Simplify, SimplifyDeeperTerms) {
   KernelScope kernel_scope;
@@ -1916,16 +1910,16 @@ TEST(Simplify, SimplifyDeeperTerms) {
   IS_NODE_WITH_NAME(Add, simplified.node(), add);
 
   IS_NODE_WITH_NAME(Mul, add->lhs(), lhs);
-  IS_IMM_WITH_VAL(Int, lhs->lhs(), 3);
-  IS_NODE_WITH_NAME(Mul, lhs->rhs(), xyTerm);
-  IS_VAR_WITH_NAME(xyTerm->lhs(), "x");
-  IS_VAR_WITH_NAME(xyTerm->rhs(), "y");
+  IS_IMM_WITH_VAL(Int, lhs->lhs(), 2);
+  IS_NODE_WITH_NAME(Mul, lhs->rhs(), xxTerm);
+  IS_VAR_WITH_NAME(xxTerm->lhs(), "x");
+  IS_VAR_WITH_NAME(xxTerm->rhs(), "x");
 
   IS_NODE_WITH_NAME(Mul, add->rhs(), rhs);
-  IS_IMM_WITH_VAL(Int, rhs->lhs(), 2);
-  IS_NODE_WITH_NAME(Mul, rhs->rhs(), xxTerm);
-  IS_VAR_WITH_NAME(xxTerm->rhs(), "x");
-  IS_VAR_WITH_NAME(xxTerm->rhs(), "x");
+  IS_IMM_WITH_VAL(Int, rhs->lhs(), 3);
+  IS_NODE_WITH_NAME(Mul, rhs->rhs(), xyTerm);
+  IS_VAR_WITH_NAME(xyTerm->lhs(), "x");
+  IS_VAR_WITH_NAME(xyTerm->rhs(), "y");
 }
 
 // Tests the difference between two less trivial expressions.
@@ -1987,15 +1981,15 @@ TEST(Simplify, SimplifyOpaqueTerms) {
   VarHandle y("y", kInt);
 
   {
-    // 2 * x/y * x - x/y * y => y * x/y
+    // 2 * x/y * y - x/y * y => x/y * y
     ExprHandle body = ((ExprHandle(2)) * (x / y) * y) - ((x / y) * y);
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Mul, simplified.node(), mul);
-    IS_VAR_WITH_NAME(mul->lhs(), "y");
-    IS_NODE_WITH_NAME(Div, mul->rhs(), div);
+    IS_NODE_WITH_NAME(Div, mul->lhs(), div);
     IS_VAR_WITH_NAME(div->lhs(), "x");
     IS_VAR_WITH_NAME(div->rhs(), "y");
+    IS_VAR_WITH_NAME(mul->rhs(), "y");
   }
 
   {
@@ -2055,46 +2049,46 @@ TEST(Simplify, SimplifyNestedMax) {
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
-    IS_BINOP_W_VARS(Add, simplified.node(), add, "y", "x");
+    IS_BINOP_W_VARS(Add, simplified.node(), add, "x", "y");
   }
 
   {
-    // Max(x + y, Max(x + y, z)) => Max(y + x, z)
+    // Max(x + y, Max(x + y, z)) => Max(x + y, z)
     ExprHandle body = Max::make(x + y, Max::make(x + y, z, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Max, simplified.node(), max);
-    IS_BINOP_W_VARS(Add, max->lhs(), add, "y", "x");
+    IS_BINOP_W_VARS(Add, max->lhs(), add, "x", "y");
     IS_VAR_WITH_NAME(max->rhs(), "z");
   }
 
   {
-    // Max(x + y, Max(z, x + y)) => Max(y + x, z)
+    // Max(x + y, Max(z, x + y)) => Max(x + y, z)
     ExprHandle body = Max::make(x + y, Max::make(z, x + y, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Max, simplified.node(), max);
-    IS_BINOP_W_VARS(Add, max->lhs(), add, "y", "x");
+    IS_BINOP_W_VARS(Add, max->lhs(), add, "x", "y");
     IS_VAR_WITH_NAME(max->rhs(), "z");
   }
 
   {
-    // Max(Max(x + y, z), x + y) => Max(y + x, z)
+    // Max(Max(x + y, z), x + y) => Max(x + y, z)
     ExprHandle body = Max::make(Max::make(x + y, z, true), x + y, true);
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Max, simplified.node(), max);
-    IS_BINOP_W_VARS(Add, max->lhs(), add, "y", "x");
+    IS_BINOP_W_VARS(Add, max->lhs(), add, "x", "y");
     IS_VAR_WITH_NAME(max->rhs(), "z");
   }
 
   {
-    // Max(Max(z, x + y), x + y) => Max(y + x, z)
+    // Max(Max(z, x + y), x + y) => Max(x + y, z)
     ExprHandle body = Max::make(Max::make(z, x + y, true), x + y, true);
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Max, simplified.node(), max);
-    IS_BINOP_W_VARS(Add, max->lhs(), add, "y", "x");
+    IS_BINOP_W_VARS(Add, max->lhs(), add, "x", "y");
     IS_VAR_WITH_NAME(max->rhs(), "z");
   }
 
@@ -2112,55 +2106,39 @@ TEST(Simplify, SimplifyNestedMax) {
   }
 
   {
-    // Max(Min(x, y), Min(x, z)) => Min(x, Max(y, z))
+    // Max(Min(x, y), Min(x, z)) => Min(Max(y, z), x)
     ExprHandle body =
         Max::make(Min::make(x, y, true), Min::make(x, z, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Min, simplified.node(), min);
-    IS_VAR_WITH_NAME(min->lhs(), "x");
-    IS_BINOP_W_VARS(Max, min->rhs(), max, "y", "z");
-    ASSERT_TRUE(max->propagate_nans());
+    checkExprIR(simplified, "Min(Max(y, z, 1), x, 1)");
   }
 
   {
-    // Max(Min(x, y), Min(z, x)) => Min(x, Max(y, z))
+    // Max(Min(x, y), Min(z, x)) => Min(Max(y, z), x)
     ExprHandle body =
         Max::make(Min::make(x, y, true), Min::make(z, x, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Min, simplified.node(), min);
-    IS_VAR_WITH_NAME(min->lhs(), "x");
-    IS_BINOP_W_VARS(Max, min->rhs(), max, "y", "z");
-    ASSERT_TRUE(max->propagate_nans());
+    checkExprIR(simplified, "Min(Max(y, z, 1), x, 1)");
   }
 
   {
-    // Max(Min(y, x), Min(x, z)) => Min(x, Max(y, z))
+    // Max(Min(y, x), Min(x, z)) => Min(Max(y, z), x)
     ExprHandle body =
         Max::make(Min::make(y, x, true), Min::make(x, z, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Min, simplified.node(), min);
-    IS_VAR_WITH_NAME(min->lhs(), "x");
-    IS_BINOP_W_VARS(Max, min->rhs(), max, "y", "z");
-    ASSERT_TRUE(max->propagate_nans());
+    checkExprIR(simplified, "Min(Max(y, z, 1), x, 1)");
   }
 
   {
-    // Max(Min(y, x), Min(z, x)) => Min(x, Max(y, z))
+    // Max(Min(y, x), Min(z, x)) => Min(Max(y, z), x)
     ExprHandle body =
         Max::make(Min::make(y, x, true), Min::make(z, x, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Min, simplified.node(), min);
-    IS_VAR_WITH_NAME(min->lhs(), "x");
-    IS_BINOP_W_VARS(Max, min->rhs(), max, "y", "z");
-    ASSERT_TRUE(max->propagate_nans());
+    checkExprIR(simplified, "Min(Max(y, z, 1), x, 1)");
   }
 
   {
-    // Max(Min(y, x), Min(z, x)) => Max(Min(x, z), Min(x, y))
+    // Max(Min(y, x), Min(z, x)) => Max(Min(x, y), Min(x, z))
     // When all the ops in the pattern do not have the same propagate_nans,
     // it should not be simplified.
     ExprHandle body =
@@ -2168,10 +2146,10 @@ TEST(Simplify, SimplifyNestedMax) {
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Max, simplified.node(), max);
-    IS_BINOP_W_VARS(Min, max->lhs(), min1, "x", "z");
-    ASSERT_FALSE(min1->propagate_nans());
-    IS_BINOP_W_VARS(Min, max->rhs(), min2, "x", "y");
-    ASSERT_TRUE(min2->propagate_nans());
+    IS_BINOP_W_VARS(Min, max->lhs(), min1, "x", "y");
+    ASSERT_TRUE(min1->propagate_nans());
+    IS_BINOP_W_VARS(Min, max->rhs(), min2, "x", "z");
+    ASSERT_FALSE(min2->propagate_nans());
     ASSERT_TRUE(max->propagate_nans());
   }
 
@@ -2304,18 +2282,7 @@ TEST(Simplify, SimplifyNestedMax) {
         8,
         false);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Max, simplified.node(), max1);
-    IS_NODE_WITH_NAME(Max, max1->lhs(), max2);
-    IS_VAR_WITH_NAME(max2->lhs(), "x");
-    IS_NODE_WITH_NAME(Max, max2->rhs(), max3);
-    IS_BINOP_W_CONST(Max, max3->lhs(), max4, "z", 5);
-    ASSERT_TRUE(max4->propagate_nans());
-    IS_VAR_WITH_NAME(max3->rhs(), "y");
-    ASSERT_FALSE(max3->propagate_nans());
-    ASSERT_TRUE(max2->propagate_nans());
-    IS_IMM_WITH_VAL(Int, max1->rhs(), 8);
-    ASSERT_FALSE(max1->propagate_nans());
+    checkExprIR(simplified, "Max(Max(Max(Max(z, 5, 1), y, 0), x, 1), 8, 0)");
   }
 
   {
@@ -2359,46 +2326,46 @@ TEST(Simplify, SimplifyNestedMin) {
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
-    IS_BINOP_W_VARS(Add, simplified.node(), add, "y", "x");
+    IS_BINOP_W_VARS(Add, simplified.node(), add, "x", "y");
   }
 
   {
-    // Min(x + y, Min(x + y, z)) => Min(y + x, z)
+    // Min(x + y, Min(x + y, z)) => Min(x + y, z)
     ExprHandle body = Min::make(x + y, Min::make(x + y, z, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Min, simplified.node(), min);
-    IS_BINOP_W_VARS(Add, min->lhs(), add, "y", "x");
+    IS_BINOP_W_VARS(Add, min->lhs(), add, "x", "y");
     IS_VAR_WITH_NAME(min->rhs(), "z");
   }
 
   {
-    // Min(x + y, Min(z, x + y)) => Min(y + x, z)
+    // Min(x + y, Min(z, x + y)) => Min(x + y, z)
     ExprHandle body = Min::make(x + y, Min::make(z, x + y, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Min, simplified.node(), min);
-    IS_BINOP_W_VARS(Add, min->lhs(), add, "y", "x");
+    IS_BINOP_W_VARS(Add, min->lhs(), add, "x", "y");
     IS_VAR_WITH_NAME(min->rhs(), "z");
   }
 
   {
-    // Min(Min(x + y, z), x + y) => Min(y + x, z)
+    // Min(Min(x + y, z), x + y) => Min(x + y, z)
     ExprHandle body = Min::make(Min::make(x + y, z, true), x + y, true);
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Min, simplified.node(), min);
-    IS_BINOP_W_VARS(Add, min->lhs(), add, "y", "x");
+    IS_BINOP_W_VARS(Add, min->lhs(), add, "x", "y");
     IS_VAR_WITH_NAME(min->rhs(), "z");
   }
 
   {
-    // Min(Min(z, x + y), x + y) => Min(y + x, z)
+    // Min(Min(z, x + y), x + y) => Min(x + y, z)
     ExprHandle body = Min::make(Min::make(z, x + y, true), x + y, true);
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Min, simplified.node(), min);
-    IS_BINOP_W_VARS(Add, min->lhs(), add, "y", "x");
+    IS_BINOP_W_VARS(Add, min->lhs(), add, "x", "y");
     IS_VAR_WITH_NAME(min->rhs(), "z");
   }
 
@@ -2416,55 +2383,39 @@ TEST(Simplify, SimplifyNestedMin) {
   }
 
   {
-    // Min(Max(x, y), Max(x, z)) => Max(x, Min(y, z))
+    // Min(Max(x, y), Max(x, z)) => Max(Min(y, z), x)
     ExprHandle body =
         Min::make(Max::make(x, y, true), Max::make(x, z, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Max, simplified.node(), max);
-    IS_VAR_WITH_NAME(max->lhs(), "x");
-    IS_BINOP_W_VARS(Min, max->rhs(), min, "y", "z");
-    ASSERT_TRUE(min->propagate_nans());
+    checkExprIR(simplified, "Max(Min(y, z, 1), x, 1)");
   }
 
   {
-    // Min(Max(x, y), Max(z, x)) => Max(x, Min(y, z))
+    // Min(Max(x, y), Max(z, x)) => Max(Min(y, z), x)
     ExprHandle body =
         Min::make(Max::make(x, y, true), Max::make(z, x, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Max, simplified.node(), max);
-    IS_VAR_WITH_NAME(max->lhs(), "x");
-    IS_BINOP_W_VARS(Min, max->rhs(), min, "y", "z");
-    ASSERT_TRUE(min->propagate_nans());
+    checkExprIR(simplified, "Max(Min(y, z, 1), x, 1)");
   }
 
   {
-    // Min(Max(y, x), Max(x, z)) => Max(x, Min(y, z))
+    // Min(Max(y, x), Max(x, z)) => Max(Min(y, z), x)
     ExprHandle body =
         Min::make(Max::make(y, x, true), Max::make(x, z, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Max, simplified.node(), max);
-    IS_VAR_WITH_NAME(max->lhs(), "x");
-    IS_BINOP_W_VARS(Min, max->rhs(), min, "y", "z");
-    ASSERT_TRUE(min->propagate_nans());
+    checkExprIR(simplified, "Max(Min(y, z, 1), x, 1)");
   }
 
   {
-    // Min(Max(y, x), Max(z, x)) => Max(x, Min(y, z))
+    // Min(Max(y, x), Max(z, x)) => Max(Min(y, z), x)
     ExprHandle body =
         Min::make(Max::make(y, x, true), Max::make(z, x, true), true);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Max, simplified.node(), max);
-    IS_VAR_WITH_NAME(max->lhs(), "x");
-    IS_BINOP_W_VARS(Min, max->rhs(), min, "y", "z");
-    ASSERT_TRUE(min->propagate_nans());
+    checkExprIR(simplified, "Max(Min(y, z, 1), x, 1)");
   }
 
   {
-    // Min(Max(y, x), Max(z, x)) => Min(Max(x, z), Max(x, y))
+    // Min(Max(y, x), Max(z, x)) => Min(Max(x, y), Max(x, z))
     // When all the ops in the pattern do not have the same propagate_nans,
     // it should not be simplified.
     ExprHandle body =
@@ -2472,10 +2423,10 @@ TEST(Simplify, SimplifyNestedMin) {
     ExprHandle simplified = IRSimplifier::simplify(body);
 
     IS_NODE_WITH_NAME(Min, simplified.node(), min);
-    IS_BINOP_W_VARS(Max, min->lhs(), max1, "x", "z");
-    ASSERT_FALSE(max1->propagate_nans());
-    IS_BINOP_W_VARS(Max, min->rhs(), max2, "x", "y");
-    ASSERT_TRUE(max2->propagate_nans());
+    IS_BINOP_W_VARS(Max, min->lhs(), max1, "x", "y");
+    ASSERT_TRUE(max1->propagate_nans());
+    IS_BINOP_W_VARS(Max, min->rhs(), max2, "x", "z");
+    ASSERT_FALSE(max2->propagate_nans());
     ASSERT_TRUE(min->propagate_nans());
   }
 
@@ -2600,7 +2551,7 @@ TEST(Simplify, SimplifyNestedMin) {
   }
 
   {
-    // Min(Min(Min(Min(z, 5), y), x), 8) => Min(Min(x, Min(Min(z, 5), y)), 8)
+    // Min(Min(Min(Min(z, 5), y), x), 8) => Min(Min(Min(Min(z, 5), y), x), 8)
     // Do not simplify when all the Min ops do not have the same
     // propagate_nans.
     ExprHandle body = Min::make(
@@ -2608,18 +2559,7 @@ TEST(Simplify, SimplifyNestedMin) {
         8,
         false);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Min, simplified.node(), min1);
-    IS_NODE_WITH_NAME(Min, min1->lhs(), min2);
-    IS_VAR_WITH_NAME(min2->lhs(), "x");
-    IS_NODE_WITH_NAME(Min, min2->rhs(), min3);
-    IS_BINOP_W_CONST(Min, min3->lhs(), min4, "z", 5);
-    ASSERT_TRUE(min4->propagate_nans());
-    IS_VAR_WITH_NAME(min3->rhs(), "y");
-    ASSERT_FALSE(min3->propagate_nans());
-    ASSERT_TRUE(min2->propagate_nans());
-    IS_IMM_WITH_VAL(Int, min1->rhs(), 8);
-    ASSERT_FALSE(min1->propagate_nans());
+    checkExprIR(simplified, "Min(Min(Min(Min(z, 5, 1), y, 0), x, 1), 8, 0)");
   }
 
   {
@@ -2922,16 +2862,7 @@ TEST(Simplify, SimplifyRoundModPattern) {
     VarHandle z("z", kInt);
     ExprHandle body = ((x / y) * y) + (x % z);
     ExprHandle simplified = IRSimplifier::simplify(body);
-
-    IS_NODE_WITH_NAME(Add, simplified.node(), add);
-    IS_NODE_WITH_NAME(Mul, add->lhs(), roundMul);
-    IS_VAR_WITH_NAME(roundMul->lhs(), "y");
-    IS_NODE_WITH_NAME(Div, roundMul->rhs(), roundDiv);
-    IS_VAR_WITH_NAME(roundDiv->lhs(), "x");
-    IS_VAR_WITH_NAME(roundDiv->rhs(), "y");
-    IS_NODE_WITH_NAME(Mod, add->rhs(), mod);
-    IS_VAR_WITH_NAME(mod->lhs(), "x");
-    IS_VAR_WITH_NAME(mod->rhs(), "z");
+    checkExprIR(simplified, "(x / y) * y + x % z");
   }
 
   {
@@ -2941,15 +2872,7 @@ TEST(Simplify, SimplifyRoundModPattern) {
     VarHandle z("z", kInt);
     ExprHandle body = (y * (x / z)) + (x % y);
     ExprHandle simplified = IRSimplifier::simplify(body);
-    IS_NODE_WITH_NAME(Add, simplified.node(), add);
-    IS_NODE_WITH_NAME(Mul, add->lhs(), roundMul);
-    IS_VAR_WITH_NAME(roundMul->lhs(), "y");
-    IS_NODE_WITH_NAME(Div, roundMul->rhs(), roundDiv);
-    IS_VAR_WITH_NAME(roundDiv->lhs(), "x");
-    IS_VAR_WITH_NAME(roundDiv->rhs(), "z");
-    IS_NODE_WITH_NAME(Mod, add->rhs(), mod);
-    IS_VAR_WITH_NAME(mod->lhs(), "x");
-    IS_VAR_WITH_NAME(mod->rhs(), "y");
+    checkExprIR(simplified, "x % y + (x / z) * y");
   }
 
   {
@@ -2959,15 +2882,7 @@ TEST(Simplify, SimplifyRoundModPattern) {
     VarHandle z("z", kInt);
     ExprHandle body = ((x / y) * z) + (x % y);
     ExprHandle simplified = IRSimplifier::simplify(body);
-    IS_NODE_WITH_NAME(Add, simplified.node(), add);
-    IS_NODE_WITH_NAME(Mul, add->lhs(), roundMul);
-    IS_VAR_WITH_NAME(roundMul->lhs(), "z");
-    IS_NODE_WITH_NAME(Div, roundMul->rhs(), roundDiv);
-    IS_VAR_WITH_NAME(roundDiv->lhs(), "x");
-    IS_VAR_WITH_NAME(roundDiv->rhs(), "y");
-    IS_NODE_WITH_NAME(Mod, add->rhs(), mod);
-    IS_VAR_WITH_NAME(mod->lhs(), "x");
-    IS_VAR_WITH_NAME(mod->rhs(), "y");
+    checkExprIR(simplified, "x % y + (x / y) * z");
   }
 }
 
@@ -3036,20 +2951,20 @@ TEST(Simplify, SimplifyRoundModPatternMultivar) {
 
   {
     // Multivar.
-    // (x/8) * 8 + (y/5)*5 + x%8 + y%5 => y + x.
+    // (x/8) * 8 + (y/5)*5 + x%8 + y%5 => x + y.
     VarHandle x("x", kInt);
     VarHandle y("y", kInt);
     ExprHandle body = (x / ExprHandle(8) * ExprHandle(8)) +
         (y / ExprHandle(5) * ExprHandle(5)) + (x % 8) + (y % 5);
     ExprHandle simplified = IRSimplifier::simplify(body);
     IS_NODE_WITH_NAME(Add, simplified.node(), add);
-    IS_VAR_WITH_NAME(add->lhs(), "y");
-    IS_VAR_WITH_NAME(add->rhs(), "x");
+    IS_VAR_WITH_NAME(add->lhs(), "x");
+    IS_VAR_WITH_NAME(add->rhs(), "y");
   }
 
   {
     // Find the right var.
-    // (y/8) * 8  x%8 + y%8 + z%8 => z%8 + x%8 + y
+    // (y/8) * 8  x%8 + y%8 + z%8 => x%8 + y + z%8
     VarHandle x("x", kInt);
     VarHandle y("y", kInt);
     VarHandle z("z", kInt);
@@ -3075,16 +2990,9 @@ TEST(Simplify, SimplifyRoundModPatternMultivar) {
     VarHandle y("y", kInt);
     VarHandle z("z", kInt);
 
-    ExprHandle body = x + (z + ExprHandle(512) * y) % ExprHandle(16) +
-        ExprHandle(16) * ((z + ExprHandle(512) * y) / ExprHandle(16));
+    ExprHandle body = x + (z + y * 512) % 16 + ((z + y * 512) / 16 * 16);
     ExprHandle simplified = IRSimplifier::simplify(body);
-    IS_NODE_WITH_NAME(Add, simplified.node(), add);
-    IS_VAR_WITH_NAME(add->rhs(), "x");
-    IS_NODE_WITH_NAME(Add, add->lhs(), add2);
-    IS_VAR_WITH_NAME(add2->lhs(), "z");
-    IS_NODE_WITH_NAME(Mul, add2->rhs(), mul);
-    IS_IMM_WITH_VAL(Int, mul->lhs(), 512);
-    IS_VAR_WITH_NAME(mul->rhs(), "y");
+    checkExprIR(simplified, "x + (z + 512 * y)");
   }
 }
 
@@ -3135,13 +3043,7 @@ TEST(Simplify, SimplifyModRoundModPattern) {
     VarHandle k("k", kInt);
     ExprHandle body = (k * t / x % y) * x + k * t % x;
     ExprHandle simplified = IRSimplifier::simplify(body);
-    IS_NODE_WITH_NAME(Mod, simplified.node(), mod);
-    IS_NODE_WITH_NAME(Mul, mod->lhs(), mul1);
-    IS_VAR_WITH_NAME(mul1->lhs(), "t");
-    IS_VAR_WITH_NAME(mul1->rhs(), "k");
-    IS_NODE_WITH_NAME(Mul, mod->rhs(), mul2);
-    IS_VAR_WITH_NAME(mul2->lhs(), "x");
-    IS_VAR_WITH_NAME(mul2->rhs(), "y");
+    checkExprIR(simplified, "(k * t) % (x * y)");
   }
 
   {
@@ -3259,11 +3161,7 @@ TEST(Simplify, SimplifyModRoundModPatternMultivar) {
     VarHandle t("t", kInt);
     ExprHandle body = (t / 7 % 9) * 7 + t % 7 + t;
     ExprHandle simplified = IRSimplifier::simplify(body);
-    IS_NODE_WITH_NAME(Add, simplified.node(), add);
-    IS_NODE_WITH_NAME(Mod, add->rhs(), mod);
-    IS_VAR_WITH_NAME(mod->lhs(), "t");
-    IS_IMM_WITH_VAL(Int, mod->rhs(), 63);
-    IS_VAR_WITH_NAME(add->lhs(), "t");
+    checkExprIR(simplified, "t % 63 + t");
   }
 
   {
@@ -3306,19 +3204,7 @@ TEST(Simplify, SimplifyModRoundModPatternMultivar) {
     VarHandle k("k", kInt);
     ExprHandle body = (t / x % y) * x + t % x + (t / k / x % y) * x + t / k % x;
     ExprHandle simplified = IRSimplifier::simplify(body);
-    IS_NODE_WITH_NAME(Add, simplified.node(), add);
-    IS_NODE_WITH_NAME(Mod, add->lhs(), mod);
-    IS_VAR_WITH_NAME(mod->lhs(), "t");
-    IS_NODE_WITH_NAME(Mul, mod->rhs(), mul);
-    IS_VAR_WITH_NAME(mul->lhs(), "x");
-    IS_VAR_WITH_NAME(mul->rhs(), "y");
-    IS_NODE_WITH_NAME(Mod, add->rhs(), mod2);
-    IS_NODE_WITH_NAME(Div, mod2->lhs(), div);
-    IS_VAR_WITH_NAME(div->lhs(), "t");
-    IS_VAR_WITH_NAME(div->rhs(), "k");
-    IS_NODE_WITH_NAME(Mul, mod2->rhs(), mul2);
-    IS_VAR_WITH_NAME(mul2->lhs(), "x");
-    IS_VAR_WITH_NAME(mul2->rhs(), "y");
+    checkExprIR(simplified, "(t / k) % (x * y) + t % (x * y)");
   }
 
   {
@@ -3971,7 +3857,7 @@ TEST(Simplify, SimplifyForWontLoseLoopOptions) {
     BufHandle c("C", {4}, kInt);
     VarHandle i("i", kInt);
     LoopOptions options;
-    options.set_gpu_block_index(12);
+    options.set_gpu_block_index(LoopOptions::IDX_W);
     auto body =
         For::make(i, 0, 1, Store::make(c, {i}, Load::make(a, {i})), options);
     StmtPtr simplified = IRSimplifier::simplify(body);
diff --git a/test/cpp/tensorexpr/test_utils.h b/test/cpp/tensorexpr/test_utils.h
index 01b92a7832a40..065e513c1a645 100644
--- a/test/cpp/tensorexpr/test_utils.h
+++ b/test/cpp/tensorexpr/test_utils.h
@@ -4,6 +4,7 @@
 #include <vector>
 
 #include <test/cpp/tensorexpr/test_base.h>
+#include <torch/csrc/jit/tensorexpr/fwd_decls.h>
 #include <torch/csrc/jit/testing/file_check.h>
 
 namespace torch {
@@ -69,5 +70,9 @@ using namespace torch::jit::tensorexpr;
     ASSERT_EQ(node_->op_type(), kRand); \
   }
 
+void checkIR(StmtPtr s, const std::string& pattern);
+void checkExprIR(ExprPtr e, const std::string& pattern);
+void checkExprIR(const ExprHandle& e, const std::string& pattern);
+
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
index 3d849fec6d9db..cb731d2525e71 100644
--- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
@@ -35,8 +35,15 @@ void Term::sort() {
   if (dtype().is_floating_point()) {
     throw std::logic_error("reordering FP ops");
   }
+  std::unordered_map<ExprPtr, std::string> str_repr_cache;
   std::sort(variables_.begin(), variables_.end(), [&](ExprPtr a, ExprPtr b) {
-    return hasher_.hash(a) < hasher_.hash(b);
+    if (!str_repr_cache.count(a)) {
+      str_repr_cache[a] = std::to_string(a);
+    }
+    if (!str_repr_cache.count(b)) {
+      str_repr_cache[b] = std::to_string(b);
+    }
+    return str_repr_cache.at(a) < str_repr_cache.at(b);
   });
 }
 
@@ -52,8 +59,15 @@ void Polynomial::sort() {
   if (dtype().is_floating_point()) {
     throw std::logic_error("reordering FP ops");
   }
+  std::unordered_map<ExprPtr, std::string> str_repr_cache;
   std::sort(variables_.begin(), variables_.end(), [&](ExprPtr a, ExprPtr b) {
-    return hasher_.hash(a) < hasher_.hash(b);
+    if (!str_repr_cache.count(a)) {
+      str_repr_cache[a] = std::to_string(a);
+    }
+    if (!str_repr_cache.count(b)) {
+      str_repr_cache[b] = std::to_string(b);
+    }
+    return str_repr_cache.at(a) < str_repr_cache.at(b);
   });
 }
 
@@ -66,6 +80,18 @@ void MaxTerm::uniquefy() {
         return hasher_.hash(a) == hasher_.hash(b);
       });
   variables_.resize(std::distance(variables_.begin(), it));
+
+  // Once we removed duplicates, sort terms alphabetically for stability.
+  std::unordered_map<ExprPtr, std::string> str_repr_cache;
+  std::sort(variables_.begin(), variables_.end(), [&](ExprPtr a, ExprPtr b) {
+    if (!str_repr_cache.count(a)) {
+      str_repr_cache[a] = std::to_string(a);
+    }
+    if (!str_repr_cache.count(b)) {
+      str_repr_cache[b] = std::to_string(b);
+    }
+    return str_repr_cache.at(a) < str_repr_cache.at(b);
+  });
 }
 
 void MinTerm::uniquefy() {
@@ -77,6 +103,18 @@ void MinTerm::uniquefy() {
         return hasher_.hash(a) == hasher_.hash(b);
       });
   variables_.resize(std::distance(variables_.begin(), it));
+
+  // Once we removed duplicates, sort terms alphabetically for stability.
+  std::unordered_map<ExprPtr, std::string> str_repr_cache;
+  std::sort(variables_.begin(), variables_.end(), [&](ExprPtr a, ExprPtr b) {
+    if (!str_repr_cache.count(a)) {
+      str_repr_cache[a] = std::to_string(a);
+    }
+    if (!str_repr_cache.count(b)) {
+      str_repr_cache[b] = std::to_string(b);
+    }
+    return str_repr_cache.at(a) < str_repr_cache.at(b);
+  });
 }
 
 // Handles optimization cases for Broadcast/Ramp +/- Broadcast/Ramp
@@ -2076,8 +2114,20 @@ ExprPtr TermExpander::mutate(PolynomialPtr v) {
   std::vector<TermPtr> addTerms;
   std::vector<TermPtr> subTerms;
 
+  auto vars = v->variables();
+  std::unordered_map<ExprPtr, std::string> str_repr_cache;
+  std::sort(vars.begin(), vars.end(), [&](ExprPtr a, ExprPtr b) {
+    if (!str_repr_cache.count(a)) {
+      str_repr_cache[a] = std::to_string(a);
+    }
+    if (!str_repr_cache.count(b)) {
+      str_repr_cache[b] = std::to_string(b);
+    }
+    return str_repr_cache.at(a) < str_repr_cache.at(b);
+  });
+
   // partition the terms into a list to add and list to subtract.
-  for (auto node : v->variables()) {
+  for (auto node : vars) {
     if (immediateIsNegative(node->scalar())) {
       subTerms.push_back(node);
     } else if (!immediateEquals(node->scalar(), 0)) {
@@ -2822,6 +2872,49 @@ bool exprEquals(ExprPtr A, ExprPtr B) {
   }
 }
 
+ExprPtr IRSimplifier::simplify(ExprPtr e) {
+  GRAPH_DEBUG("(Simplifier) Original: ", std::to_string(e));
+  SimplifierUnderContext ctxsimplifier;
+  e = e->accept_mutator(&ctxsimplifier);
+
+  PolynomialTransformer simplifier;
+  e = e->accept_mutator(&simplifier);
+
+  // There may be terms left in the IR, expand them.
+  TermExpander expander(&simplifier);
+  e = e->accept_mutator(&expander);
+  // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
+  if (!expander.check_safe()) {
+    throw malformed_input("eliminated null Allocation without free");
+  }
+
+  GRAPH_DEBUG("(Simplifier) Simplified: ", std::to_string(e));
+  return e;
+}
+
+StmtPtr IRSimplifier::simplify(StmtPtr s) {
+  GRAPH_DEBUG("(Simplifier) Original: ", std::to_string(s));
+  SimplifierUnderContext ctxsimplifier;
+  s = s->accept_mutator(&ctxsimplifier);
+
+  PolynomialTransformer simplifier;
+  s = s->accept_mutator(&simplifier);
+  if (s == nullptr) {
+    GRAPH_DEBUG("(Simplifier) Simplified: NULL");
+    return nullptr;
+  }
+
+  // There may be terms left in the IR, expand them.
+  TermExpander expander(&simplifier);
+  s = s->accept_mutator(&expander);
+  if (!expander.check_safe()) {
+    throw malformed_input("eliminated null Allocation without free");
+  }
+
+  GRAPH_DEBUG("(Simplifier) Simplified: ", std::to_string(s));
+  return s;
+}
+
 } // namespace tensorexpr
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.h b/torch/csrc/jit/tensorexpr/ir_simplifier.h
index 6281b77349b37..87c476242e8de 100644
--- a/torch/csrc/jit/tensorexpr/ir_simplifier.h
+++ b/torch/csrc/jit/tensorexpr/ir_simplifier.h
@@ -596,47 +596,11 @@ class TORCH_API TermExpander : public PolynomialBase {
 
 class TORCH_API IRSimplifier {
  public:
-  static ExprPtr simplify(ExprPtr e) {
-    SimplifierUnderContext ctxsimplifier;
-    e = e->accept_mutator(&ctxsimplifier);
-
-    PolynomialTransformer simplifier;
-    e = e->accept_mutator(&simplifier);
-
-    // There may be terms left in the IR, expand them.
-    TermExpander expander(&simplifier);
-    e = e->accept_mutator(&expander);
-    // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
-    if (!expander.check_safe()) {
-      throw malformed_input("eliminated null Allocation without free");
-    }
-
-    return e;
-  }
-
+  static StmtPtr simplify(StmtPtr s);
+  static ExprPtr simplify(ExprPtr e);
   static ExprHandle simplify(const ExprHandle& e) {
     return ExprHandle(simplify(e.node()));
   }
-
-  static StmtPtr simplify(StmtPtr s) {
-    SimplifierUnderContext ctxsimplifier;
-    s = s->accept_mutator(&ctxsimplifier);
-
-    PolynomialTransformer simplifier;
-    s = s->accept_mutator(&simplifier);
-    if (s == nullptr) {
-      return nullptr;
-    }
-
-    // There may be terms left in the IR, expand them.
-    TermExpander expander(&simplifier);
-    s = s->accept_mutator(&expander);
-    if (!expander.check_safe()) {
-      throw malformed_input("eliminated null Allocation without free");
-    }
-
-    return s;
-  }
 };
 
 // Flattens the buf and performs the simplifier on the flattened dims.

From 9bbf80969ece148ca5da2107ef9ad26a99891738 Mon Sep 17 00:00:00 2001
From: Dhruv Matani <dhruvbird@fb.com>
Date: Wed, 18 Aug 2021 14:47:19 -0700
Subject: [PATCH 044/530] [PyTorch] Avoid using std::regex for device string
 parsing in Device.cpp (#63464)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63464

This was previously committed as D30281388 (https://github.com/pytorch/pytorch/commit/4d6f98ecada2d85b2474b023838debad4305316d), but was reverted due to t98478641. jnkwok1 confirmed that this change was not the root cause, so trying to land it again.

Currently, `std::regex` is used for parsing device strings. This is undesirable for a few reasons.

1. Increases binary size
2. Slows down model loading
3. Potentially uses more memory at runtime
4. Takes marginally longer time to build code that uses std::regex v/s not using std::regex

This change avoids the use of `std::regex` for parsing the device string since we don't need to.
ghstack-source-id: 136006963
ghstack-source-id: 136081898

Test Plan:
### AI Bench Runs

**Before this change:**
1. Model Load time: [252ms](https://www.internalfb.com/intern/aibench/details/332471502816548)
2. Model unload time: 3.5ms

**After this change:**
1. Model Load time: [240ms](https://www.internalfb.com/intern/aibench/details/652195589031318), which is an approx 5% reduction for the current model. I suspect percentage wise, it will be larger for smaller models since this is a fixed cost reduction.
2. Model unload time: 3.3ms (probably too small to be meaningfully impactful to an end user).

### BSB Results

```
D30281388 (https://github.com/pytorch/pytorch/commit/4d6f98ecada2d85b2474b023838debad4305316d)-V1 (https://www.internalfb.com/intern/diff/D30281388 (https://github.com/pytorch/pytorch/commit/4d6f98ecada2d85b2474b023838debad4305316d)/?dest_number=135713848)

messenger-pika-optimized-device: Succeeded
Change in Download Size for arm64 + 3x assets variation: -7.1 KiB
Change in Uncompressed Size for arm64 + 3x assets variation: -17.6 KiB

Mbex Comparison: https://our.intern.facebook.com/intern/mbex/bsb:551399955987465@base/bsb:551399955987465@diff/
```

Reviewed By: raziel, pavithranrao

Differential Revision: D30388269

fbshipit-source-id: 10942e7aa56f9ea47aa479a8f50187f2ce2899bf
---
 c10/core/Device.cpp | 108 +++++++++++++++++++++++++++++---------------
 1 file changed, 72 insertions(+), 36 deletions(-)

diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp
index ee6f1b473fe08..2709c29ce8460 100644
--- a/c10/core/Device.cpp
+++ b/c10/core/Device.cpp
@@ -4,28 +4,13 @@
 
 #include <algorithm>
 #include <array>
+#include <cctype>
 #include <exception>
 #include <ostream>
-#include <regex>
 #include <string>
 #include <tuple>
 #include <vector>
 
-// Check if compiler has working std::regex implementation
-//
-// Test below is adapted from https://stackoverflow.com/a/41186162
-#if defined(_MSVC_LANG) && _MSVC_LANG >= 201103L
-// Compiler has working regex. MSVC has erroneous __cplusplus.
-#elif __cplusplus >= 201103L &&                           \
-    (!defined(__GLIBCXX__) || (__cplusplus >= 201402L) || \
-     (defined(_GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT) ||    \
-      defined(_GLIBCXX_REGEX_STATE_LIMIT) ||              \
-      (defined(_GLIBCXX_RELEASE) && _GLIBCXX_RELEASE > 4)))
-// Compiler has working regex.
-#else
-static_assert(false, "Compiler does not have proper regex support.");
-#endif
-
 namespace c10 {
 namespace {
 DeviceType parse_type(const std::string& device_string) {
@@ -65,33 +50,84 @@ DeviceType parse_type(const std::string& device_string) {
       "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ",
       device_string);
 }
+enum DeviceStringParsingState { START, INDEX_START, INDEX_REST, ERROR };
+
 } // namespace
 
 Device::Device(const std::string& device_string) : Device(Type::CPU) {
   TORCH_CHECK(!device_string.empty(), "Device string must not be empty");
 
-  // We assume gcc 5+, so we can use proper regex.
-  static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
-  std::smatch match;
-  TORCH_CHECK(
-      std::regex_match(device_string, match, regex),
-      "Invalid device string: '",
-      device_string,
-      "'");
-  type_ = parse_type(match[1].str());
-  if (match[2].matched) {
-    try {
-      index_ = c10::stoi(match[2].str());
-    } catch (const std::exception&) {
-      TORCH_CHECK(
-          false,
-          "Could not parse device index '",
-          match[2].str(),
-          "' in device string '",
-          device_string,
-          "'");
+  std::string device_name, device_index_str;
+  DeviceStringParsingState pstate = DeviceStringParsingState::START;
+
+  // The code below tries to match the string in the variable
+  // device_string against the regular expression:
+  // ([a-zA-Z_]+)(?::([1-9]\\d*|0))?
+  for (size_t i = 0;
+       pstate != DeviceStringParsingState::ERROR && i < device_string.size();
+       ++i) {
+    const char ch = device_string.at(i);
+    switch (pstate) {
+      case DeviceStringParsingState::START:
+        if (ch != ':') {
+          if (isalpha(ch) || ch == '_') {
+            device_name.push_back(ch);
+          } else {
+            pstate = DeviceStringParsingState::ERROR;
+          }
+        } else {
+          pstate = DeviceStringParsingState::INDEX_START;
+        }
+        break;
+
+      case DeviceStringParsingState::INDEX_START:
+        if (isdigit(ch)) {
+          device_index_str.push_back(ch);
+          pstate = DeviceStringParsingState::INDEX_REST;
+        } else {
+          pstate = DeviceStringParsingState::ERROR;
+        }
+        break;
+
+      case DeviceStringParsingState::INDEX_REST:
+        if (device_index_str.at(0) == '0') {
+          pstate = DeviceStringParsingState::ERROR;
+          break;
+        }
+        if (isdigit(ch)) {
+          device_index_str.push_back(ch);
+        } else {
+          pstate = DeviceStringParsingState::ERROR;
+        }
+        break;
+
+      case DeviceStringParsingState::ERROR:
+        // Execution won't reach here.
+        break;
+    }
+  }
+
+  const bool has_error = device_name.empty() ||
+      pstate == DeviceStringParsingState::ERROR ||
+      (pstate == DeviceStringParsingState::INDEX_START &&
+       device_index_str.empty());
+
+  TORCH_CHECK(!has_error, "Invalid device string: '", device_string, "'");
+
+  try {
+    if (!device_index_str.empty()) {
+      index_ = c10::stoi(device_index_str);
     }
+  } catch (const std::exception&) {
+    TORCH_CHECK(
+        false,
+        "Could not parse device index '",
+        device_index_str,
+        "' in device string '",
+        device_string,
+        "'");
   }
+  type_ = parse_type(device_name);
   validate();
 }
 

From 139413078fe2f2b7cf451943461e7c76038446a4 Mon Sep 17 00:00:00 2001
From: Mostafa Elhoushi <melhoushi@fb.com>
Date: Wed, 18 Aug 2021 14:47:40 -0700
Subject: [PATCH 045/530] [FX] make ASTReriter patch wrapped functions properly
 (#62987)

Summary:
reference the same global namespace (instead of copying it) in ASTRewriter to patch wrapped functions properly

Fixes #{62071}

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62987

Test Plan:
To test it you may write this snippet and ensure the results are as shown in the comments:

```
import torch
import torch.fx

torch.fx.wrap
def to_be_wrapped(x):
    return torch.relu(x)

class Foo(torch.nn.Module):
    def forward(self, x):
        return to_be_wrapped(x)

traced = torch.fx.symbolic_trace(Foo())
print(traced.graph)
"""
graph():
    %x : [#users=1] = placeholder[target=x]
    %to_be_wrapped : [#users=1] = call_function[target=__main__.to_be_wrapped](args = (%x,), kwargs = {})
    return to_be_wrapped
"""

from torch.fx.experimental.rewriter import RewritingTracer

rt = RewritingTracer()
graph = rt.trace(Foo())
print(graph)
"""
### AFTER FIX (CORRECT):
graph():
    %x : [#users=1] = placeholder[target=x]
    %to_be_wrapped : [#users=1] = call_function[target=__main__.to_be_wrapped](args = (%x,), kwargs = {})
    return to_be_wrapped

### BEFORE FIX (WRONG):
graph():
    %x : [#users=1] = placeholder[target=x]
    %relu : [#users=1] = call_function[target=torch.relu](args = (%x,), kwargs = {})
    return relu
"""
```

Reviewed By: ansley

Differential Revision: D30396176

Pulled By: mostafaelhoushi

fbshipit-source-id: f61eddf32e9ef42b5f5c3ce21d559945214ee833
---
 test/test_fx.py                   | 90 +++++++++++++++++++++++++++++++
 torch/fx/experimental/rewriter.py | 18 ++++++-
 2 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/test/test_fx.py b/test/test_fx.py
index 1708634653a64..e39469d0a0676 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -2335,6 +2335,96 @@ def forward(self, x: torch.Tensor):
 
         traced.graph.lint()
 
+    def test_ast_rewriter_wrap(self):
+        self.assertEqual(3 + 4 + 5, a_lifted_leaf((3, 4), 5))
+
+        def to_trace(y):
+            return (
+                a_lifted_leaf((4, y), 3)
+                + a_lifted_leaf((3, 4), 5)
+                + a_lifted_leaf((y, y), y)
+            )
+
+        ast_rewriter = RewritingTracer()
+        graph = ast_rewriter.trace(to_trace)
+        traced = GraphModule(ast_rewriter.root, graph, "gm")
+
+        self.assertIn("a_lifted_leaf", traced.code)
+        self.assertEqual(27, traced(2))
+        self.assertIs(a_lifted_leaf, real_a_lifed_leaf)
+
+    def test_ast_rewriter_wrap_fn_directly(self):
+        self.assertEqual(3 + 4 + 5, a_lifted_leaf2((3, 4), 5))
+
+        def to_trace(y):
+            return (
+                a_lifted_leaf2((4, y), 3)
+                + a_lifted_leaf2((3, 4), 5)
+                + a_lifted_leaf2((y, y), y)
+            )
+
+        ast_rewriter = RewritingTracer()
+        graph = ast_rewriter.trace(to_trace)
+        traced = GraphModule(ast_rewriter.root, graph, "gm")
+
+        self.assertIn("a_lifted_leaf2", traced.code)
+        self.assertEqual(27, traced(2))
+        self.assertIs(a_lifted_leaf2, real_a_lifed_leaf2)
+
+    def test_ast_rewriter_wrapped_via_decorator(self):
+        class F(torch.nn.Module):
+            def forward(self, x):
+                return wrapped_via_decorator(x)
+
+        ast_rewriter = RewritingTracer()
+        graph = ast_rewriter.trace(F())
+        traced = GraphModule(ast_rewriter.root, graph, "gm")
+
+        self.assertIn("wrapped_via_decorator", traced.code)
+        self.assertEqual(traced(0), 1)
+        self.assertIs(wrapped_via_decorator, real_wrapped_via_decorator)
+        self.assertFalse(hasattr(wrapped_via_decorator, "__fx_already_patched"))
+
+    def test_ast_rewriter_wrapped_via_decorator_and_transformed(self):
+        self.assertEqual(wrapped_via_decorator(0), 1)
+
+        def to_trace(y):
+            return wrapped_via_decorator(y)
+
+        ast_rewriter = RewritingTracer()
+        graph = ast_rewriter.trace(to_trace)
+        traced = GraphModule(ast_rewriter.root, graph, "gm")
+
+        self.assertIn("wrapped_via_decorator", traced.code)
+        self.assertEqual(traced(0), 1)
+        self.assertIs(wrapped_via_decorator, real_wrapped_via_decorator)
+        self.assertFalse(hasattr(wrapped_via_decorator, "__fx_already_patched"))
+
+        transformed = torch.fx.Transformer(traced).transform()
+        self.assertIn("wrapped_via_decorator", transformed.code)
+        self.assertEqual(transformed(0), 1)
+        self.assertIs(wrapped_via_decorator, real_wrapped_via_decorator)
+        self.assertFalse(hasattr(wrapped_via_decorator, "__fx_already_patched"))
+
+    def test_ast_rewriter_wrap_with_submodule(self):
+        class M(torch.nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+                self.batchnorm1d = torch.nn.BatchNorm1d(2, affine=False)
+
+            def forward(self, x: torch.Tensor):
+                return wrapped_with_submodule(x, self.batchnorm1d)
+
+        ast_rewriter = RewritingTracer()
+        graph = ast_rewriter.trace(M())
+        traced = GraphModule(ast_rewriter.root, graph, "gm")
+
+        self.assertIn("wrapped_with_submodule", traced.code)
+
+        input = torch.rand(3, 2)
+        ref_batchnorm1d = torch.nn.BatchNorm1d(2, affine=False)
+        self.assertEqual(ref_batchnorm1d(input), traced(input))
+
     def test_submodule_manipulation_API(self):
         class C(torch.nn.Module):
             def __init__(self):
diff --git a/torch/fx/experimental/rewriter.py b/torch/fx/experimental/rewriter.py
index b3f71d5de6cd2..de08ebaa69880 100644
--- a/torch/fx/experimental/rewriter.py
+++ b/torch/fx/experimental/rewriter.py
@@ -2,6 +2,7 @@
 import inspect
 import textwrap
 import copy
+import functools
 from types import FunctionType
 from typing import cast, Union, Callable, Dict, Optional, Any
 from torch.fx._symbolic_trace import Tracer
@@ -41,8 +42,23 @@ def rewrite(self, fn: FunctionType):
         assert len(new_keys) == 1
         fn_compiled = globals_dict[new_keys[0]]
 
+        # return the compiled function with the original globals
+        def change_func_globals(f, globals):
+            """Based on https://stackoverflow.com/a/13503277/2988730 (@unutbu)"""
+            # __globals__ is a private member of the function class
+            # so we have to copy the function, f, all of its member, except f.__globals__
+            g = FunctionType(
+                f.__code__,
+                globals,
+                name=f.__name__,
+                argdefs=f.__defaults__,
+                closure=f.__closure__,
+            )
+            g = functools.update_wrapper(g, f)
+            g.__kwdefaults__ = copy.copy(f.__kwdefaults__)
+            return g
         # Return the correct FunctionType object
-        return fn_compiled
+        return change_func_globals(fn_compiled, globals=fn.__globals__)
 
     def visit_Assert(self, node):
         """

From 779a3d47b0c057211374a9c4128fbdf78acee4c9 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Wed, 18 Aug 2021 14:56:51 -0700
Subject: [PATCH 046/530] [Static Runtime] Benchmark reports native nodes
 (#63346)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63346

We have seen that we can get significant perf wins essentially for free by implementing native ops for ops that we cannot write out variants for (e.g. TupleUnpack D30306955 (https://github.com/pytorch/pytorch/commit/078b8004a62a51f75e1fbd8d08eea359af6bb1d7), append D30326461 (https://github.com/pytorch/pytorch/commit/9d9e7a8d7294834ddad957ddb1f4cd5a0e741e55)). Therefore, whether or not SR is using a native implementation is valuable information. By capturing this in the benchmarking suite, we can hopefully avoid wasting time profiling/manually inspecting `native_ops.cpp`

Reviewed By: hlu1

Differential Revision: D30346752

fbshipit-source-id: 205b090513b6a5a6ce4cb92f75ab0395b15d08f9
---
 torch/csrc/jit/runtime/static/impl.cpp | 10 +++++++---
 torch/csrc/jit/runtime/static/impl.h   |  5 +++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index a0c3bac2bbc83..1ee69a642384f 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -897,10 +897,12 @@ void StaticRuntime::benchmark(
     std::cout << std::setw(15) << ms << " ms. " << std::setw(10)
               << results.percent_per_node_type[kind] << "%. " << kind << " ("
               << results.instances_per_node_type[kind] << " nodes";
-    if (results.out_nodes.count(kind) == 0) {
-      std::cout << ")" << std::endl;
-    } else {
+    if (results.out_nodes.count(kind)) {
       std::cout << ", out variant)" << std::endl;
+    } else if (results.native_nodes.count(kind)) {
+      std::cout << ", native)" << std::endl;
+    } else {
+      std::cout << ")" << std::endl;
     }
   }
   std::cout << std::setw(15) << results.total_time << " ms. in Total"
@@ -1136,6 +1138,8 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
     if (nodes_[i].has_out_variant()) {
       results.out_nodes.insert(kind);
       results.out_nodes_count++;
+    } else if (nodes_[i].has_native()) {
+      results.native_nodes.insert(kind);
     }
     results.total_time += results.time_per_node[i];
   }
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
index cc36df037b02d..b16cfefbc0b60 100644
--- a/torch/csrc/jit/runtime/static/impl.h
+++ b/torch/csrc/jit/runtime/static/impl.h
@@ -231,6 +231,7 @@ class TORCH_API StaticRuntime {
     std::unordered_map<std::string, float> percent_per_node_type;
     std::unordered_map<std::string, int> instances_per_node_type;
     std::unordered_set<std::string> out_nodes;
+    std::unordered_set<std::string> native_nodes;
   };
 
   IndividualMetrics benchmark_individual_ops(
@@ -410,6 +411,10 @@ class TORCH_API ProcessedNode {
     return static_cast<bool>(fn_);
   }
 
+  bool has_native() const {
+    return static_cast<bool>(native_fn_);
+  }
+
   bool verify_outputs_not_overlapping_with_immutable_inputs() const;
 
  private:

From 15eec8e1d1ea5b3354bc305f1afe0c01a64ea748 Mon Sep 17 00:00:00 2001
From: "Rong Rong (AI Infra)" <rongr@fb.com>
Date: Wed, 18 Aug 2021 15:02:05 -0700
Subject: [PATCH 047/530] using PR number instead of IN_PULL_REQUEST (#63360)

Summary:
PR numbers should be available on GHA after this.

This fixes some target determinator not working issue discovered when manually running: https://github.com/pytorch/pytorch/issues/63412.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63360

Reviewed By: malfet, zhouzhuojie, seemethere

Differential Revision: D30374615

Pulled By: walterddr

fbshipit-source-id: eee8d8bb7aa4308a6a50cfdcd4423a96d846777f
---
 .circleci/config.yml                                |  9 +++++++++
 .../job-specs/pytorch-job-specs.yml                 |  9 +++++++++
 .github/templates/linux_ci_workflow.yml.j2          |  2 ++
 .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml  |  2 ++
 .../generated-linux-bionic-py3.8-gcc9-coverage.yml  |  2 ++
 .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml  |  2 ++
 .../generated-linux-xenial-cuda11.1-py3.6-gcc7.yml  |  2 ++
 .../generated-linux-xenial-py3.6-gcc5.4.yml         |  2 ++
 ...ed-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml |  2 ++
 .jenkins/pytorch/test.sh                            | 13 ++++++-------
 10 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 3a64240fcf8bb..cb3e148e2e162 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -644,6 +644,15 @@ jobs:
           set -ex
           export SCRIBE_GRAPHQL_ACCESS_TOKEN="${SCRIBE_GRAPHQL_ACCESS_TOKEN}"
           export JOB_BASE_NAME="$CIRCLE_JOB"
+          # temporary fix for https://github.com/pytorch/pytorch/issues/60746
+          if [ -z "$CIRCLE_PR_NUMBER" ]; then
+            if [[ $CIRCLE_BRANCH =~ .*pull.* ]]; then
+              export PR_NUMBER="$(echo $CIRCLE_BRANCH | sed 's/[^0-9]//g')"
+              export CIRCLE_PR_NUMBER="$PR_NUMBER"
+            fi
+          else
+            export PR_NUMBER="$CIRCLE_PR_NUMBER"
+          fi
           ${PARALLEL_FLAGS}
           cd workspace
           EOL
diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
index bcc02edd5f0b9..422e44e8a606d 100644
--- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
+++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
@@ -174,6 +174,15 @@ jobs:
           set -ex
           export SCRIBE_GRAPHQL_ACCESS_TOKEN="${SCRIBE_GRAPHQL_ACCESS_TOKEN}"
           export JOB_BASE_NAME="$CIRCLE_JOB"
+          # temporary fix for https://github.com/pytorch/pytorch/issues/60746
+          if [ -z "$CIRCLE_PR_NUMBER" ]; then
+            if [[ $CIRCLE_BRANCH =~ .*pull.* ]]; then
+              export PR_NUMBER="$(echo $CIRCLE_BRANCH | sed 's/[^0-9]//g')"
+              export CIRCLE_PR_NUMBER="$PR_NUMBER"
+            fi
+          else
+            export PR_NUMBER="$CIRCLE_PR_NUMBER"
+          fi
           ${PARALLEL_FLAGS}
           cd workspace
           EOL
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index ec39ef6f5f260..bceeba51f20bc 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -353,6 +353,7 @@ jobs:
       - name: Test PyTorch
         env:
           BUILD_ENVIRONMENT: !{{ build_environment }}-${{ matrix.config }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
@@ -368,6 +369,7 @@ jobs:
           docker run \
             ${GPU_FLAG:-} \
             -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e GITHUB_ACTIONS \
             -e IN_CI \
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 7aa572d83321b..1e1aec057c7d4 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -321,6 +321,7 @@ jobs:
       - name: Test PyTorch
         env:
           BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7-${{ matrix.config }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
@@ -336,6 +337,7 @@ jobs:
           docker run \
             ${GPU_FLAG:-} \
             -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e GITHUB_ACTIONS \
             -e IN_CI \
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index eda7568a809dd..28180e3e98727 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -321,6 +321,7 @@ jobs:
       - name: Test PyTorch
         env:
           BUILD_ENVIRONMENT: linux-bionic-py3.8-gcc9-coverage-${{ matrix.config }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
@@ -336,6 +337,7 @@ jobs:
           docker run \
             ${GPU_FLAG:-} \
             -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e GITHUB_ACTIONS \
             -e IN_CI \
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index c50cac76a2c3e..ddb1522962dff 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -321,6 +321,7 @@ jobs:
       - name: Test PyTorch
         env:
           BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7-${{ matrix.config }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
@@ -336,6 +337,7 @@ jobs:
           docker run \
             ${GPU_FLAG:-} \
             -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e GITHUB_ACTIONS \
             -e IN_CI \
diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
index cf2395e9ca829..fb6d83a0f2432 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -321,6 +321,7 @@ jobs:
       - name: Test PyTorch
         env:
           BUILD_ENVIRONMENT: linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
@@ -336,6 +337,7 @@ jobs:
           docker run \
             ${GPU_FLAG:-} \
             -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e GITHUB_ACTIONS \
             -e IN_CI \
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index dd3cb50cfc903..eabc42408fa91 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -321,6 +321,7 @@ jobs:
       - name: Test PyTorch
         env:
           BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc5.4-${{ matrix.config }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
@@ -336,6 +337,7 @@ jobs:
           docker run \
             ${GPU_FLAG:-} \
             -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e GITHUB_ACTIONS \
             -e IN_CI \
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
index dcbd19d661eb1..47ac9f73d422f 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -319,6 +319,7 @@ jobs:
       - name: Test PyTorch
         env:
           BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
@@ -334,6 +335,7 @@ jobs:
           docker run \
             ${GPU_FLAG:-} \
             -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e GITHUB_ACTIONS \
             -e IN_CI \
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 93de6fbf68969..124fd7c8cdb3e 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -26,11 +26,7 @@ echo "Testing pytorch"
 
 export LANG=C.UTF-8
 
-# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
-# CIRCLE_PULL_REQUEST comes from CircleCI
-# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs
-#       see https://github.com/pytorch/pytorch/issues/60111
-IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-}
+PR_NUMBER=${PR_NUMBER:-${CIRCLE_PR_NUMBER:-}}
 
 if [[ "$BUILD_ENVIRONMENT" == *-slow-* || $TEST_CONFIG == 'slow' ]]; then
   export PYTORCH_TEST_WITH_SLOW=1
@@ -64,7 +60,7 @@ else
   export PYTORCH_TEST_SKIP_NOARCH=1
 fi
 
-if [[ -n "$IN_PULL_REQUEST" ]] && [[ -z "$CI_MASTER" || "$CI_MASTER" == "false" ]]; then
+if [[ -n "$PR_NUMBER" ]] && [[ -z "$CI_MASTER" || "$CI_MASTER" == "false" ]]; then
   # skip expensive checks when on PR and CI_MASTER flag is not set
   export PYTORCH_TEST_SKIP_CUDA_MEM_LEAK_CHECK=1
 else
@@ -146,7 +142,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX512-* || $TEST_CONFIG == 'nogpu_NO_AVX
   export ATEN_CPU_CAPABILITY=avx2
 fi
 
-if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
+# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs
+#       see https://github.com/pytorch/pytorch/issues/60111
+#       change it back to PR_NUMBER when issue is fixed.
+if [ -n "$CIRCLE_PR_NUMBER" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
   DETERMINE_FROM=$(mktemp)
   file_diff_from_base "$DETERMINE_FROM"
 fi

From 1f4e019d8e8b00d004ca02d17cede0c7aec9f92d Mon Sep 17 00:00:00 2001
From: Sangbaek Park <sangbaek@fb.com>
Date: Wed, 18 Aug 2021 15:50:33 -0700
Subject: [PATCH 048/530] [Vulkan] Fix incorrect input range for Hardshrink
 tests (#63515)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63515

Fixed inappropriate input range for Hardshrink tests:
The range -10 ~ +10 for input tensors is more proper when we use the test set of lambda {-4.2, -1.0, -0.42, 0.0, 0.42, 1.0, 4.2, 42.42}.
ghstack-source-id: 136141416

Test Plan:
```build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 //xplat/caffe2:pt_vulkan_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_api_test
adb shell "/data/local/tmp/vulkan_api_test"
```
Note that the test can fail sporadically due to the precision loss by FP16(Vulkan)/FP32(CPU). This issue will be handled separately after some design discussions.

Reviewed By: SS-JIA

Differential Revision: D30389646

fbshipit-source-id: 7224bd8ba4e4972f5fc147df8a0cb84808f8c62e
---
 aten/src/ATen/test/vulkan_api_test.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index 474aa36c40cca..2873d3c0584c8 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -942,7 +942,7 @@ TEST(VulkanAPITest, hardshrink) {
   }
 
   for (const auto lambd_value : {-4.2, -1.0, -0.42, 0.0, 0.42, 1.0, 4.2, 42.42}) {
-    const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu = (at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) - 0.5) * 20;  // between -10 and +10
     const auto in_vulkan = in_cpu.vulkan();
 
     const auto out_cpu = at::hardshrink(in_cpu, lambd_value);
@@ -964,7 +964,7 @@ TEST(VulkanAPITest, hardshrink_) {
   }
 
   for (const auto lambd_value : {-4.2, -1.0, -0.42, 0.0, 0.42, 1.0, 4.2, 42.42}) {
-    const auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto cpu = (at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) - 0.5) * 20;  // between -10 and +10
     const auto vulkan = cpu.vulkan();
 
     cpu.hardshrink(lambd_value);

From 4dcc2197ced0e12f20f296e2e12baad5fad94b0e Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Wed, 18 Aug 2021 16:08:48 -0700
Subject: [PATCH 049/530] [fix] tensor_split : non-contiguous indices tensor
 (#63390)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63281

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63390

Reviewed By: ejguan

Differential Revision: D30362649

Pulled By: mruberry

fbshipit-source-id: 3ea3ad02199e4345beb0b580d056babd56112309
---
 aten/src/ATen/native/TensorShape.cpp                  | 8 +++++++-
 torch/testing/_internal/common_methods_invocations.py | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index e915078249171..2545ec4c1e035 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -609,7 +609,13 @@ std::vector<Tensor> tensor_split(const Tensor& self, const Tensor& tensor_indice
     return self.tensor_split(sections, dim);
   } else {
     auto indices_data = tensor_indices_or_sections.data_ptr<int64_t>();
-    std::vector<int64_t> indices(indices_data, indices_data + tensor_indices_or_sections.numel());
+    auto stride = tensor_indices_or_sections.stride(0);
+    auto numel = tensor_indices_or_sections.numel();
+    std::vector<int64_t> indices(numel);
+    for (size_t offset = 0; offset < numel; offset++) {
+      // indices tensor could be non-contiguous
+      indices[offset] = *(indices_data + offset * stride);
+    }
     return self.tensor_split(indices, dim);
   }
 }
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 5d55f0ec64291..7e57d5d693ec7 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -859,6 +859,7 @@ def sample_inputs_tensor_split(op_info, device, dtype, requires_grad, **kwargs):
         (torch.tensor([1, 2, 3]),),
         (torch.tensor(1),),
         (torch.tensor([1, 2, 3]), 1),
+        (torch.tensor([1, 4, 2, 5, 3, 6])[::2], 1),
         # Cases with list of indices.
         ((2, 4),),
         ((2, 4), 1),
@@ -7590,6 +7591,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                                 active_if=(IS_MACOS or IS_WINDOWS)),
                    )),
     OpInfo('tensor_split',
+           ref=np.array_split,
            dtypes=all_types_and_complex_and(torch.bool),
            dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
            dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),

From d9547b9bb29ca5ba926b3707c5c8313ee65792b2 Mon Sep 17 00:00:00 2001
From: Amy He <ahe@fb.com>
Date: Wed, 18 Aug 2021 16:23:48 -0700
Subject: [PATCH 050/530] Nnapi Delegation: Quick improvements (#63489)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63489

A few quick improvements to the Android NNAPI Delegate, some of which were discussed here https://github.com/pytorch/pytorch/pull/62272:
1) `throw std::exception` replaced with `TORCH_CHECK` to reduce runtime
size (nnapi_backend_lib.cpp)
2) weights processing moved from compile to preprocess step, since it can
be done AOT (nnapi_backend_lib.cpp & nnapi_backend_preprocess.cpp)
3) `ser_model_` and `shape_compute_module_` member variables removed, since they are never used after
`init()`, so they are not needed (nnapi_backend_lib.cpp)

Test Plan:
Unit tests: `python test/test_jit.py TestNnapiBackend`
Run SparkAR segmentation with delegated NNAPI as done here D30259033 (can use `jf download GAekdAwsyGKXhggFALN4LnSBTzcubsIXAAAz --file "v303-nnd-mod.ptl"` to get a preprocessed model from these changes)

Imported from OSS

Reviewed By: raziel, iseeyuan

Differential Revision: D30398880

fbshipit-source-id: b6872e1e9ccd583622b80659da00c83fdd82580e
---
 .../jit/backends/nnapi/nnapi_backend_lib.cpp  | 31 ++++++-------------
 .../nnapi/nnapi_backend_preprocess.cpp        |  3 ++
 2 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp
index 0533b7d85175f..7d9dc18c12589 100644
--- a/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp
+++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp
@@ -31,19 +31,8 @@ class NnapiBackend : public PyTorchBackendInterface {
   c10::impl::GenericDict compile(
       c10::IValue processed,
       c10::impl::GenericDict method_compile_spec) override {
-    auto dict = processed.toGenericDict();
-
-    // Prepare weights
-    auto weights = dict.at("weights").toTensorList();
-    for (int i = 0; i < weights.size(); i++) {
-      weights.set(i, weights.get(i).contiguous());
-    }
-    dict.insert("weights", weights);
-
-    // Save ser_model to member variable
-    ser_model_ = dict.at("ser_model").toTensor();
-
     // Wrap procesed in dictionary: {"forward": processed}
+    auto dict = processed.toGenericDict();
     c10::Dict<c10::IValue, c10::IValue> handles(
         c10::StringType::get(), c10::AnyType::get());
     handles.insert("forward", dict);
@@ -86,8 +75,7 @@ class NnapiBackend : public PyTorchBackendInterface {
         fixed_inputs.push_back(
             tensorInp.get(i).permute({0, 2, 3, 1}).contiguous());
       } else {
-        throw std::exception();
-        std::cerr << "Invalid mem_fmt" << std::endl;
+        TORCH_CHECK(false, "Invalid mem_fmt");
       }
     }
 
@@ -103,9 +91,8 @@ class NnapiBackend : public PyTorchBackendInterface {
       // TODO: See if it's possible to use those directly.
       if (fmt == 1) {
         outputs.set(i, outputs.get(i).permute({0, 3, 1, 2}));
-      } else if (fmt != 0) {
-        throw std::exception();
-        std::cerr << "Invalid mem_fmt" << std::endl;
+      } else {
+        TORCH_CHECK(fmt == 0, "Invalid mem_fmt");
       }
     }
 
@@ -117,8 +104,6 @@ class NnapiBackend : public PyTorchBackendInterface {
   // and cannot be passed through the handles dictionary
   std::unique_ptr<torch::nnapi::bind::NnapiCompilation> comp_;
   c10::List<at::Tensor> out_templates_;
-  at::Tensor ser_model_;
-  mobile::Module shape_compute_module_;
 
   // Runs once per model initialization
   // Cannot be moved to compile(), because init() requires actual inputs
@@ -126,19 +111,21 @@ class NnapiBackend : public PyTorchBackendInterface {
     TORCH_CHECK(comp_ == nullptr);
     auto dict = handle.toGenericDict();
 
+    // Get ser_model
+    auto ser_model = dict.at("ser_model").toTensor();
     // Load shape computation module
     std::stringstream ss;
     auto shape_ptr = dict.at("shape_compute_module").toString();
     ss.str(*shape_ptr);
-    shape_compute_module_ = _load_for_mobile(ss);
+    auto shape_compute_module = _load_for_mobile(ss);
     out_templates_ =
-        shape_compute_module_.run_method("prepare", ser_model_, inputs)
+        shape_compute_module.run_method("prepare", ser_model, inputs)
             .toTensorList();
 
     // Create and initialize NnapiComilation object
     comp_ = std::make_unique<torch::nnapi::bind::NnapiCompilation>();
     auto weights = dict.at("weights").toTensorVector();
-    comp_->init(ser_model_, weights);
+    comp_->init(ser_model, weights);
   }
 };
 
diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp
index 2f68536b64107..be0dbe18d90d0 100644
--- a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp
+++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp
@@ -96,6 +96,9 @@ c10::IValue preprocess(
   // transform Python lists to C++ c10::List
   c10::List<at::Tensor> weights(
       py::cast<std::vector<at::Tensor>>(nnapi_processed[2]));
+  for (int i = 0; i < weights.size(); i++) {
+    weights.set(i, weights.get(i).contiguous());
+  }
   c10::List<int64_t> inp_mem_fmts(
       py::cast<std::vector<int64_t>>(nnapi_processed[3]));
   c10::List<int64_t> out_mem_fmts(

From 9477211e7d609ce382c0e22d7721c14c36d083de Mon Sep 17 00:00:00 2001
From: John Clow <jclow@fb.com>
Date: Wed, 18 Aug 2021 16:28:02 -0700
Subject: [PATCH 051/530] Hoisting common expressions out of If blocks (#59492)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59492

Adding code to find common expressions from the two subblocks of an if
operation and hoist them before the if block.
This also allows Dead Code Elimination to
then eliminate some if blocks.

Also eliminated some dead code in the codebase.

Test Plan:
python test_jit.py TestIfHoisting

Imported from OSS

Reviewed By: ngimel

Differential Revision: D29399533

fbshipit-source-id: 9336b9dc48c02c38862f98f98cd72fc1767a1802
---
 test/jit/test_if_hoisting.py                  | 213 ++++++++++++++++++
 test/quantization/jit/test_quantize_jit.py    |   3 +-
 test/test_jit.py                              |   1 +
 tools/build_variables.bzl                     |   1 +
 torch/_C/__init__.pyi.in                      |   1 +
 torch/csrc/jit/ir/node_hashing.cpp            |  14 ++
 .../jit/passes/common_expression_hoisting.cpp | 153 +++++++++++++
 .../jit/passes/common_expression_hoisting.h   |  10 +
 .../jit/passes/symbolic_shape_analysis.cpp    |   1 +
 torch/csrc/jit/python/init.cpp                |   6 +
 torch/csrc/jit/runtime/graph_executor.cpp     |   9 +-
 .../runtime/profiling_graph_executor_impl.cpp | 111 +--------
 12 files changed, 418 insertions(+), 105 deletions(-)
 create mode 100644 test/jit/test_if_hoisting.py
 create mode 100644 torch/csrc/jit/passes/common_expression_hoisting.cpp
 create mode 100644 torch/csrc/jit/passes/common_expression_hoisting.h

diff --git a/test/jit/test_if_hoisting.py b/test/jit/test_if_hoisting.py
new file mode 100644
index 0000000000000..c8fd4a4bab349
--- /dev/null
+++ b/test/jit/test_if_hoisting.py
@@ -0,0 +1,213 @@
+
+import torch
+from torch.testing import FileCheck
+from torch.testing._internal.jit_utils import JitTestCase
+
+if __name__ == "__main__":
+    raise RuntimeError(
+        "This test file is not meant to be run directly, use:\n\n"
+        "\tpython test/test_jit.py TESTNAME\n\n"
+        "instead."
+    )
+
+
+class TestIfHoisting(JitTestCase):
+    def test_if_hoist_basic(self):
+        def fn(x: bool, y: int):
+            if x:
+                z = y + 3
+            else:
+                z = y + 3
+            return z
+
+
+        fn_script = torch.jit.script(fn)
+        op_graph = fn_script.graph
+        self.run_pass("common_expression_hoisting", op_graph)
+        self.run_pass("dce", op_graph)
+        FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph)
+        FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph)
+        self.assertEqual(fn(True, 1), fn_script(True, 1))
+
+    def test_if_hoist_transposed_expr(self):
+        """
+        Making sure that we can properly eliminate
+        an expression even if it is not at the start
+        of a block
+        """
+        def fn(x: bool, y: int):
+            if x:
+                a = y + 3
+                b = y * 2
+            else:
+                b = y * 2
+                a = y + 3
+            return a, b
+
+        fn_script = torch.jit.script(fn)
+        op_graph = fn_script.graph
+        self.run_pass("common_expression_hoisting", op_graph)
+        self.run_pass("dce", op_graph)
+
+        FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph)
+        FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph)
+
+        self.assertEqual(fn(True, 1), fn_script(True, 1))
+        self.assertEqual(fn(False, 5), fn_script(False, 5))
+
+    def test_if_hoist_swapped_expr(self):
+        """
+        Making sure that the if statement
+        doesn't get fully eliminated here
+        """
+        def fn(x: bool, y: int):
+            if x:
+                a = y + 3
+                b = y * 2
+            else:
+                a = y * 2
+                b = y + 3
+            return a, b
+
+        fn_script = torch.jit.script(fn)
+        op_graph = fn_script.graph
+        self.run_pass("common_expression_hoisting", op_graph)
+        self.run_pass("dce", op_graph)
+
+        FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph)
+        FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph)
+
+        self.assertEqual(fn(True, 1), fn_script(True, 1))
+        self.assertEqual(fn(False, 5), fn_script(False, 5))
+
+    def test_if_hoist_reused_var(self):
+        """
+        Making sure that cases where the python variable is reused
+        is handled correctly
+        """
+        def fn(x: bool, y: int):
+            b = 6
+            if x:
+                a = y + 3
+                a = y * 2
+            else:
+                a = y * 2
+                b = y + 3
+            return a, b
+
+        fn_script = torch.jit.script(fn)
+        op_graph = fn_script.graph
+        self.run_pass("common_expression_hoisting", op_graph)
+        self.run_pass("dce", op_graph)
+
+        FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph)
+        FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph)
+        FileCheck().check_count("aten::mul", 1, exactly=True).run(op_graph)
+
+        self.assertEqual(fn(True, 1), fn_script(True, 1))
+        self.assertEqual(fn(False, 5), fn_script(False, 5))
+
+    def test_no_hoist(self):
+        """
+        Nothing should happen here, expressions are different
+        """
+        def fn(x: bool, y: int, z: int):
+            if x:
+                a = y + 3
+            else:
+                a = z + 3
+            return a
+
+        fn_script = torch.jit.script(fn)
+        op_graph = fn_script.graph
+        self.run_pass("common_expression_hoisting", op_graph)
+        self.run_pass("dce", op_graph)
+
+        FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph)
+        FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph)
+
+        self.assertEqual(fn(True, 1, 3), fn_script(True, 1, 3))
+        self.assertEqual(fn(False, 5, 10), fn_script(False, 5, 10))
+
+    def test_mutate_before(self):
+        """
+        Make sure that if there is a mutation before the common
+        op, the hoist doesn't happen
+        """
+        def fn(x: bool, y: torch.Tensor):
+            if x:
+                y.add_(8)
+                a = y + 3
+            else:
+                a = y + 3
+            return a
+
+        fn_script = torch.jit.script(fn)
+        op_graph = fn_script.graph
+        self.run_pass("common_expression_hoisting", op_graph)
+        self.run_pass("dce", op_graph)
+
+        FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph)
+        FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph)
+        FileCheck().check_count("aten::add_", 1, exactly=True).run(op_graph)
+
+        t1 = torch.Tensor([1])
+        t2 = torch.Tensor([5, 6])
+        self.assertEqual(fn(True, t1), fn_script(True, t1))
+        self.assertEqual(fn(False, t2), fn_script(False, t2))
+
+    def test_mutate_after(self):
+        """
+        Check that the hoist can happen properly, and
+        that the output is still correct.
+        """
+        def fn(x: bool, y: torch.Tensor):
+            if x:
+                b = 1
+                a = y + 3
+                y.add_(8)
+            else:
+                b = 2
+                a = y + 3
+            c = b + a
+            return a
+
+        fn_script = torch.jit.script(fn)
+        op_graph = fn_script.graph
+        self.run_pass("common_expression_hoisting", op_graph)
+        self.run_pass("dce", op_graph)
+
+        FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph)
+        FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph)
+
+        t1 = torch.Tensor([1])
+        t2 = torch.Tensor([5, 6])
+        self.assertEqual(fn(True, t1.clone()), fn_script(True, t1.clone()))
+        self.assertEqual(fn(False, t2.clone()), fn_script(False, t2.clone()))
+
+    def test_multiple_hoists(self):
+        """
+        test that hoists that depend on other hoists are done correctly
+        """
+        def fn(x: bool, y: torch.Tensor):
+            if x:
+                a = y + 3
+                b = a + y
+            else:
+                a = y + 3
+                b = a + y
+            c = b * 2
+            return c
+
+        fn_script = torch.jit.script(fn)
+        op_graph = fn_script.graph
+        self.run_pass("common_expression_hoisting", op_graph)
+        self.run_pass("dce", op_graph)
+
+        FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph)
+        FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph)
+
+        t1 = torch.Tensor([1])
+        t2 = torch.Tensor([5, 6])
+        self.assertEqual(fn(True, t1), fn_script(True, t1))
+        self.assertEqual(fn(False, t2), fn_script(False, t2))
diff --git a/test/quantization/jit/test_quantize_jit.py b/test/quantization/jit/test_quantize_jit.py
index 14bb31cf07f1a..5fde8e2cc533d 100644
--- a/test/quantization/jit/test_quantize_jit.py
+++ b/test/quantization/jit/test_quantize_jit.py
@@ -1214,6 +1214,7 @@ class Res(torch.nn.Module):
             def __init__(self):
                 super(Res, self).__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 1).float()
+                self.conv2 = torch.nn.Conv2d(3, 3, 1).float()
                 self.use_skip = True
 
             def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor:
@@ -1222,7 +1223,7 @@ def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor:
                 if self.use_skip:
                     return self.conv(x)
                 else:
-                    return self.conv(x)
+                    return self.conv2(x)
 
         class M(torch.nn.Module):
             def __init__(self):
diff --git a/test/test_jit.py b/test/test_jit.py
index 99df960da5dc4..6cf1d8e7d5c6e 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -23,6 +23,7 @@
 from jit.test_builtins import TestBuiltins, TestTensorBuiltins  # noqa: F401
 from jit.test_ignore_context_manager import TestIgnoreContextManager  # noqa: F401
 from jit.test_symbolic_shape_analysis import TestSymbolicShapeAnalysis  # noqa: F401
+from jit.test_if_hoisting import TestIfHoisting  # noqa: F401
 from jit.test_unsupported_ops import TestUnsupportedOps  # noqa: F401
 from jit.test_freezing import TestFreezing, TestFrozenOptimizations, TestMKLDNNReinplacing  # noqa: F401
 from jit.test_peephole import TestPeephole  # noqa: F401
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 89697b4428ca1..2e71bedb35db4 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -191,6 +191,7 @@ core_sources_full_mobile_no_backend_interface = [
     "torch/csrc/jit/passes/clear_profiling.cpp",
     "torch/csrc/jit/passes/clear_undefinedness.cpp",
     "torch/csrc/jit/passes/common_subexpression_elimination.cpp",
+    "torch/csrc/jit/passes/common_expression_hoisting.cpp",
     "torch/csrc/jit/passes/concat_opt.cpp",
     "torch/csrc/jit/passes/constant_pooling.cpp",
     "torch/csrc/jit/passes/constant_propagation.cpp",
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index b683a60615dc5..30885d3107176 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -204,6 +204,7 @@ def _jit_pass_inline(Graph) -> None: ...
 def _jit_pass_constant_propagation(Graph) -> None: ...
 def _jit_pass_propagate_shapes_on_graph(Graph) -> None: ...
 def _jit_erase_non_input_shape_information(Graph) -> None: ...
+def _jit_pass_common_expression_hoisting(Graph) -> None: ...
 def _jit_get_schemas_for_operator(name :str) -> List[FunctionSchema]: ...
 def _jit_check_alias_annotation(g: Graph, args: Tuple[Any, ...], unqualified_op_name: str): ...
 def _jit_can_fuse_on_cpu() -> _bool: ...
diff --git a/torch/csrc/jit/ir/node_hashing.cpp b/torch/csrc/jit/ir/node_hashing.cpp
index 3fd4974ed421b..9a876d062d2fd 100644
--- a/torch/csrc/jit/ir/node_hashing.cpp
+++ b/torch/csrc/jit/ir/node_hashing.cpp
@@ -204,6 +204,8 @@ bool attributesEqualCSE(const Node* lhs, const Node* rhs) {
 
 } // anonymous namespace
 
+// Makes a hash that hashes the input Value, the output type
+// as well as the node attributes
 size_t HashNode::operator()(const Node* k) const {
   AT_ASSERT(k != nullptr);
   size_t constant_hash = 0;
@@ -231,6 +233,8 @@ size_t HashNode::operator()(const Node* k) const {
       constant_hash);
 };
 
+// Checks that two nodes have the same inputs, output types
+// and node attributes.
 bool EqualNode::operator()(const Node* lhs, const Node* rhs) const {
   if (lhs == nullptr && rhs == nullptr)
     return true;
@@ -261,6 +265,16 @@ bool EqualNode::operator()(const Node* lhs, const Node* rhs) const {
   if (!attributesEqualCSE(lhs, rhs))
     return false;
 
+  // Check if the blocks contained in a op are the same
+  if (lhs->blocks().size() != rhs->blocks().size()) {
+    return false;
+  }
+  for (size_t i = 0; i < lhs->blocks().size(); ++i) {
+    if (lhs->blocks()[i] != rhs->blocks()[i]) {
+      return false;
+    }
+  }
+
   return true;
 };
 
diff --git a/torch/csrc/jit/passes/common_expression_hoisting.cpp b/torch/csrc/jit/passes/common_expression_hoisting.cpp
new file mode 100644
index 0000000000000..ab2b9d41afa8b
--- /dev/null
+++ b/torch/csrc/jit/passes/common_expression_hoisting.cpp
@@ -0,0 +1,153 @@
+#include <torch/csrc/jit/passes/common_expression_hoisting.h>
+
+#include <torch/csrc/jit/ir/alias_analysis.h>
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/ir/node_hashing.h>
+#include <torch/csrc/jit/jit_log.h>
+
+#include <cstddef>
+#include <unordered_set>
+#include <vector>
+
+namespace torch {
+namespace jit {
+namespace {
+
+struct CommonExpressionHoister {
+  CommonExpressionHoister(std::shared_ptr<Graph> graph)
+      : graph_(std::move(graph)) {}
+
+  bool run() {
+    HoistCommonExpression(graph_->block());
+    return changed_;
+  }
+
+  void HoistFromIfNode(Node* if_node) {
+    Block* true_block = if_node->blocks()[0];
+    Block* false_block = if_node->blocks()[1];
+    // find common statements in the two subblocks
+
+    auto true_block_nodes = std::unordered_set<Node*, HashNode, EqualNode>(
+        true_block->nodes().begin(), true_block->nodes().end());
+    for (auto it = false_block->nodes().begin();
+         it != false_block->nodes().end();) {
+      Node* false_b_node = *it;
+      // node may be moved to a different block so advance iterator now
+      ++it;
+
+      auto matching_elem = true_block_nodes.find(false_b_node);
+      if (matching_elem == true_block_nodes.end()) {
+        continue;
+      }
+      Node* true_b_node = *matching_elem;
+
+      // Check if a move to the front of the block is valid
+      // If both of the moves are valid, then we know we can move the item out
+      // of the if blocks entirely.
+      AliasDb& aliasDb = getOrCreateAliasDb();
+      bool true_moveable = aliasDb.couldMoveAfterTopologically(
+          true_b_node, true_block->nodes().front());
+      bool false_moveable = aliasDb.couldMoveAfterTopologically(
+          false_b_node, false_block->nodes().front());
+
+      if (!true_moveable || !false_moveable) {
+        continue;
+      }
+
+      // Get all the uses of the output to delete and reinsert them
+      // as the input would change, the HashNode value would also change.
+      std::unordered_set<Node*> true_b_uses;
+      for (Value* true_out : true_b_node->outputs()) {
+        for (Use true_use : true_out->uses()) {
+          if (true_use.user->owningBlock() == true_block) {
+            // Make sure we are not accidentally adding stuff from subblocks
+            true_b_uses.insert(true_use.user);
+          }
+        }
+      }
+      for (Node* uses_node : true_b_uses) {
+        true_block_nodes.erase(uses_node);
+      }
+
+      // Now hoist the statement out of the block
+      changed_ = true;
+      false_b_node->moveBefore(if_node);
+
+      true_b_node->replaceAllUsesWith(false_b_node);
+
+      true_block_nodes.erase(true_b_node);
+      true_block_nodes.insert(true_b_uses.cbegin(), true_b_uses.cend());
+      true_b_node->destroy();
+    }
+  }
+
+  void EliminateUnnecessaryIfOutputs(Node* if_node) {
+    Block* true_block = if_node->blocks()[0];
+    Block* false_block = if_node->blocks()[1];
+
+    // fix up the if block outputs
+    for (size_t i = 0; i < true_block->outputs().size();) {
+      // Need to check both sides match to eliminate common if block outputs
+      Value* true_block_output = true_block->outputs().at(i);
+      Value* false_block_output = false_block->outputs().at(i);
+      if (true_block_output != false_block_output) {
+        i++;
+        continue;
+      }
+
+      // We have a matching output, and can remove it from the block itself
+      if_node->outputs().at(i)->replaceAllUsesWith(true_block_output);
+      if_node->eraseOutput(i);
+      true_block->eraseOutput(i);
+      false_block->eraseOutput(i);
+      changed_ = true;
+    }
+
+    // No need to test here if the IF block should be eliminated.
+    // The DCE pass will determine that for us.
+  }
+
+  void HoistCommonExpression(Block* block) {
+    for (auto it = block->nodes().begin(); it != block->nodes().end();) {
+      Node* node = *it;
+      ++it;
+
+      for (auto sub_block : node->blocks()) {
+        HoistCommonExpression(sub_block);
+      }
+
+      if (node->kind() == prim::If) {
+        HoistFromIfNode(node);
+        EliminateUnnecessaryIfOutputs(node);
+      }
+    }
+  }
+
+  AliasDb& getOrCreateAliasDb() {
+    if (!alias_db_) {
+      alias_db_ = std::make_unique<AliasDb>(graph_);
+    }
+
+    return *alias_db_;
+  }
+
+ private:
+  std::unique_ptr<AliasDb> alias_db_;
+  std::shared_ptr<Graph> graph_;
+  bool changed_ = false;
+};
+} // anonymous namespace
+bool HoistCommonExpression(const std::shared_ptr<Graph>& graph) {
+  // This moves common subexpressions from the two sides of an
+  // if block out of the if block.
+
+  GRAPH_DUMP("Before CEH", graph);
+  CommonExpressionHoister ceh(graph);
+  bool changed = ceh.run();
+  if (changed) {
+    GRAPH_DUMP("After CEH Changes", graph);
+  }
+  return changed;
+}
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/passes/common_expression_hoisting.h b/torch/csrc/jit/passes/common_expression_hoisting.h
new file mode 100644
index 0000000000000..2aad158eea8f8
--- /dev/null
+++ b/torch/csrc/jit/passes/common_expression_hoisting.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <torch/csrc/jit/ir/ir.h>
+
+namespace torch {
+namespace jit {
+
+TORCH_API bool HoistCommonExpression(const std::shared_ptr<Graph>& graph);
+}
+} // namespace torch
diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp
index f74a91176cfe8..10edfb4c496ef 100644
--- a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp
+++ b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp
@@ -6,6 +6,7 @@
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/ir/ir_views.h>
 #include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/common_expression_hoisting.h>
 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
 #include <torch/csrc/jit/passes/constant_pooling.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 5fca575593551..d582035d6e95e 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -12,6 +12,7 @@
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/canonicalize.h>
 #include <torch/csrc/jit/passes/canonicalize_graph_fuser_ops.h>
+#include <torch/csrc/jit/passes/common_expression_hoisting.h>
 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
 #include <torch/csrc/jit/passes/constant_pooling.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
@@ -282,6 +283,11 @@ void initJITBindings(PyObject* module) {
           [](std::shared_ptr<Graph>& g) {
             return EliminateCommonSubexpression(g); // overload resolution
           })
+      .def(
+          "_jit_pass_common_expression_hoisting",
+          [](std::shared_ptr<Graph>& g) {
+            return HoistCommonExpression(g); // overload resolution
+          })
       .def(
           "_jit_pass_fuse_quantized_add_relu",
           [](std::shared_ptr<Graph>& g) {
diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp
index 476882650a1dd..bb5f272080601 100644
--- a/torch/csrc/jit/runtime/graph_executor.cpp
+++ b/torch/csrc/jit/runtime/graph_executor.cpp
@@ -9,6 +9,7 @@
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/batch_mm.h>
 #include <torch/csrc/jit/passes/canonicalize_graph_fuser_ops.h>
+#include <torch/csrc/jit/passes/common_expression_hoisting.h>
 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
 #include <torch/csrc/jit/passes/constant_pooling.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
@@ -918,7 +919,7 @@ void runOptimization(
       "After EliminateDeadCode, before EliminateCommonSubexpression\n", *graph);
   EliminateCommonSubexpression(graph);
   GRAPH_DEBUG(
-      "After EliminateCommonSubexpression, before PeepholeOptimize\n", *graph);
+      "After EliminateCommonSubexpression , before PeepholeOptimize\n", *graph);
 
   PeepholeOptimize(graph);
   GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph);
@@ -949,8 +950,10 @@ void runOptimization(
 
   EliminateCommonSubexpression(graph);
   GRAPH_DEBUG(
-      "After EliminateCommonSubexpression, before CheckInplace\n", *graph);
-
+      "After EliminateCommonSubexpression, before HoistCommonExpression\n",
+      *graph);
+  HoistCommonExpression(graph);
+  GRAPH_DEBUG("After HoistCommonExpression, before CheckInplace\n", *graph);
   CheckInplace(graph);
   GRAPH_DEBUG("After CheckInplace (end of runOptimization)", *graph);
 }
diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp
index b099db17931b0..40d94a4a205fe 100644
--- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp
+++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp
@@ -7,6 +7,7 @@
 #include <torch/csrc/jit/passes/canonicalize_graph_fuser_ops.h>
 #include <torch/csrc/jit/passes/clear_profiling.h>
 #include <torch/csrc/jit/passes/clear_undefinedness.h>
+#include <torch/csrc/jit/passes/common_expression_hoisting.h>
 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
 #include <torch/csrc/jit/passes/constant_pooling.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
@@ -332,112 +333,16 @@ void runPreAutodiffPassPipeline(std::shared_ptr<Graph>& graph) {
 
     EliminateCommonSubexpression(graph);
     GRAPH_DEBUG(
-        "After EliminateCommonSubexpression, before CheckInplace\n", *graph);
-
+        "After EliminateCommonSubexpression, before HoistCommonExpression\n",
+        *graph);
+    HoistCommonExpression(graph);
+    GRAPH_DEBUG("After HoistCommonExpression, before CheckInplace\n", *graph);
     CheckInplace(graph);
   }
   GRAPH_DEBUG(
       "After CheckInplace (end of runPreAutodiffPassPipeline)\n", *graph);
 }
 
-void runDiffGraphPasses(std::shared_ptr<Graph>& graph) {
-  GRAPH_DEBUG(
-      "Before EliminateDeadCode (beginning of runDiffGraphPasses)\n", *graph);
-  // runOptimization:
-  {
-    // Basic graph preprocessing to eliminate noise.
-    EliminateDeadCode(graph);
-    GRAPH_DEBUG(
-        "After EliminateDeadCode, before EliminateCommonSubexpression\n",
-        *graph);
-    EliminateCommonSubexpression(graph);
-    GRAPH_DEBUG(
-        "After EliminateCommonSubexpression, before PeepholeOptimize\n",
-        *graph);
-
-    PeepholeOptimize(graph);
-    GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph);
-    ConstantPropagation(graph);
-    GRAPH_DEBUG("After ConstantPropagation, before ConstantPooling\n", *graph);
-    ConstantPooling(graph);
-    GRAPH_DEBUG("After ConstantPooling, before UnrollLoops\n", *graph);
-
-    UnrollLoops(graph);
-    GRAPH_DEBUG("After UnrollLoops, before RemoveListMutation\n", *graph);
-    // run again with unrolled loops
-    RemoveListMutation(graph);
-    GRAPH_DEBUG("After RemoveListMutation, before PeepholeOptimize\n", *graph);
-    PeepholeOptimize(graph);
-    GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph);
-    ConstantPropagation(graph);
-    GRAPH_DEBUG(
-        "After ConstantPropagation, before EliminateCommonSubexpression\n",
-        *graph);
-
-    EliminateCommonSubexpression(graph);
-    GRAPH_DEBUG(
-        "After EliminateCommonSubexpression, before CheckInplace\n", *graph);
-
-    CheckInplace(graph);
-  }
-  GRAPH_DEBUG("After CheckInplace, before customPrePasses\n", *graph);
-
-  // runNondiffOptimization
-  {
-    // Run custom passes that different backends can register.
-    for (const auto& passPair : getCustomPrePasses()) {
-      passPair.first(graph);
-    }
-    GRAPH_DEBUG("After customPrePasses, before LowerSimpleTuples\n", *graph);
-
-    // TupleConstruct / TupleUnpack pairs can still be present at this point
-    // and must be removed for fusion.
-    LowerSimpleTuples(graph);
-    GRAPH_DEBUG("After LowerSimpleTuples\n", *graph);
-
-    if (tensorExprFuserEnabled()) {
-      // Remove prim::profile nodes and embed the profile info directly in the
-      // IR in value types. We're doing such transformation as optimizations
-      // that try to merge/fuse nodes in the graph (e.g. BatchMM and GraphFuser)
-      // work worse in the presence of intermittent prim::profile nodes.
-      // Optimizations relying on the type info are also responsible for
-      // inserting proper type checks. Once we're done with these optimizations
-      // we will wipe the tensor type information from the IR, so that it's not
-      // accidentally used by any other pass.
-      RemoveProfileNodesAndSpecializeTypes(graph);
-      GRAPH_DEBUG(
-          "After RemoveProfileNodesAndSpecializeTypes, before BatchMM\n",
-          *graph);
-      // Rewrite subgraphs with many MMs into expressions that batch them.
-      BatchMM(graph);
-      GRAPH_DEBUG("After BatchMM, before Fusion\n", *graph);
-
-      FuseTensorExprs(graph, getFusionGroupInlining() ? 2 : 1);
-      GRAPH_DEBUG(
-          "After Fusion, before RemoveTensorTypeSpecializations\n", *graph);
-
-      // Wipe tensor type info from the IR
-      RemoveTensorTypeSpecializations(graph);
-      GRAPH_DEBUG(
-          "After RemoveTensorTypeSpecializations, before customPostPasses\n",
-          *graph);
-    } else {
-      // Rewrite subgraphs with many MMs into expressions that batch them.
-      BatchMM(graph);
-      GRAPH_DEBUG("After BatchMM, before Fusion\n", *graph);
-
-      FuseGraph(graph, true);
-      GRAPH_DEBUG("After Fusion, before customPostPasses\n", *graph);
-    }
-
-    // Run custom post-fusion passes
-    for (const auto& passPair : getCustomPostPasses()) {
-      passPair.first(graph);
-    }
-  }
-  GRAPH_DEBUG("After customPostPasses (end of runDiffGraphPasses)\n", *graph);
-}
-
 void runNoGradOptimizations(std::shared_ptr<Graph>& graph) {
   GRAPH_DEBUG(
       "After customPostPasses (beginning of runNoGradOptimizations)\n", *graph);
@@ -593,7 +498,11 @@ void ProfilingGraphExecutorImpl::runProfilingInsensitiveOptimizations(
   DecomposeOps(graph);
   GRAPH_DEBUG("After DecomposeOps, before ConstantPropagation\n", *graph);
   ConstantPropagation(graph);
-  GRAPH_DEBUG("After ConstantPropagation, before EliminateDeadCode\n", *graph);
+  GRAPH_DEBUG(
+      "After ConstantPropagation, before HoistCommonExpression\n", *graph);
+  HoistCommonExpression(graph);
+  GRAPH_DEBUG(
+      "After EliminateCommonSubexpression, before ElimiateDeadCode\n", *graph);
   EliminateDeadCode(graph);
   GRAPH_DEBUG(
       "After EliminateDeadCode, before EliminateCommonSubexpression\n", *graph);

From 0d437fe6d0ef17648072eb586484a4a5a080b094 Mon Sep 17 00:00:00 2001
From: Yinbin Ma <yinbin@fb.com>
Date: Wed, 18 Aug 2021 20:52:17 -0700
Subject: [PATCH 052/530] BF16 allreduce hook (#63260)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63260

Add BF16 all-reduce communication hook. Skip if CUDA version < 11 or NCCL version < 2.9.7.

Reviewed By: SciPioneer

Differential Revision: D30238317

fbshipit-source-id: bad35bf7d43f10f1c40997a282b831b61ef592bb
---
 test/distributed/test_c10d_nccl.py            | 67 ++++++++++++++++-
 torch/csrc/distributed/c10d/NCCLUtils.hpp     |  9 +++
 .../distributed/c10d/ProcessGroupNCCL.cpp     |  2 +-
 .../algorithms/ddp_comm_hooks/__init__.py     |  3 +
 .../ddp_comm_hooks/default_hooks.py           | 72 +++++++++++++++++++
 torch/nn/parallel/distributed.py              | 13 ++++
 6 files changed, 162 insertions(+), 4 deletions(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index f7f6681b43a76..9efebc94a9288 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -44,6 +44,7 @@
     run_tests,
     retry_on_connect_failures,
     TEST_WITH_DEV_DBG_ASAN,
+    TEST_WITH_ROCM,
     TEST_WITH_TSAN,
     sandcastle_skip,
     sandcastle_skip_if,
@@ -69,6 +70,11 @@
     )
     sys.exit(0)
 
+# bfloat16 is only supported by CUDA 11+
+BFLOAT16_AVAILABLE = (
+    torch.cuda.is_available()
+    and torch.version.cuda is not None
+    and int(torch.version.cuda.split('.')[0]) >= 11)
 
 class RendezvousEnvTest(TestCase):
     @retry_on_connect_failures
@@ -1559,15 +1565,23 @@ def allreduce_hook(
 
     def _test_default_ddp_comm_hooks_nccl(self, gradient_as_bucket_view=False):
         """
-        This unit test verifies whether default Python DDP communication hooks ALLREDUCE and FP16_COMPRESS
-        can give the same result with the case of no hook registered.
+        This unit test verifies whether default Python DDP communication hooks ALLREDUCE, FP16_COMPRESS
+        and BF16_COMPRESS, can give the same result with the case of no hook registered.
         """
         store = c10d.FileStore(self.file_name, self.world_size)
         process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
 
         # For these default DDP comm hooks, the only state is process group.
         state = process_group
-        for hook in [default.allreduce_hook, default.fp16_compress_hook]:
+        hook_options = [default.allreduce_hook, default.fp16_compress_hook]
+        if (
+            not TEST_WITH_ROCM
+            and BFLOAT16_AVAILABLE
+            and c10d.is_nccl_available()
+            and torch.cuda.nccl.version() >= (2, 9, 7)
+        ):
+            hook_options.append(default.bf16_compress_hook)
+        for hook in hook_options:
             # Get GPU model with the hook registered.
             # The first arg 'process_group' is used for initializing the test environment,
             # so it cannot be replaced by 'state', although they have the same value.
@@ -1603,6 +1617,31 @@ def _test_fp16_compress_wrapper(self, gradient_as_bucket_view=False):
             # check whether the grads are equal to what DDP without hook would return.
             self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2))
 
+    def _test_bf16_compress_wrapper(self, gradient_as_bucket_view=False):
+        """
+        This unit test verifies whether wrapping the ALLREDUCE and POWER_SGD hooks with
+        the BF16_WRAPPER can give the same result as when there is no hook registered.
+        """
+        store = c10d.FileStore(self.file_name, self.world_size)
+        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+        powerSGD_state = powerSGD.PowerSGDState(process_group=process_group)
+
+        hook_args = [
+            (powerSGD.powerSGD_hook, powerSGD_state),
+            (default.allreduce_hook, process_group),
+        ]
+
+        for hook, state in hook_args:
+            gpu_model = self._gpu_model_with_ddp_comm_hook(
+                process_group,
+                default.bf16_compress_wrapper(hook),
+                gradient_as_bucket_view,
+                state,
+            )
+
+            # check whether the grads are equal to what DDP without hook would return.
+            self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2))
+
     def _test_hook_then_optimizer(
         self,
         functional_optim_cls,
@@ -1709,6 +1748,17 @@ def test_default_ddp_comm_hooks_nccl(self):
     def test_fp16_compress_wrapper_nccl(self):
         self._test_fp16_compress_wrapper()
 
+    @requires_nccl()
+    @requires_nccl_version((2, 9, 7), "Need NCCL 2.9.7+ for BF16_COMPRESS")
+    @sandcastle_skip_if(
+        not BFLOAT16_AVAILABLE,
+        "BFloat16 is only supported by CUDA 11+",
+    )
+    @skip_if_lt_x_gpu(2)
+    @skip_if_rocm
+    def test_bf16_compress_wrapper_nccl(self):
+        self._test_bf16_compress_wrapper()
+
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
     def test_hook_then_sgd_nccl(self):
@@ -1808,6 +1858,17 @@ def test_default_ddp_comm_hooks_nccl_is_view(self):
     def test_fp16_compress_wrapper_is_view(self):
         self._test_fp16_compress_wrapper(gradient_as_bucket_view=True)
 
+    @requires_nccl()
+    @requires_nccl_version((2, 9, 7), "Need NCCL 2.9.7+ for BF16_COMPRESS")
+    @sandcastle_skip_if(
+        not BFLOAT16_AVAILABLE,
+        "BFloat16 is only supported by CUDA 11+",
+    )
+    @skip_if_lt_x_gpu(2)
+    @skip_if_rocm
+    def test_bf16_compress_wrapper_is_view(self):
+        self._test_bf16_compress_wrapper(gradient_as_bucket_view=True)
+
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
     def test_builtin_ddp_comm_hooks_nccl_grad_is_view(self):
diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp
index e3ee14da0f542..bd50bba3606b9 100644
--- a/torch/csrc/distributed/c10d/NCCLUtils.hpp
+++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp
@@ -50,6 +50,15 @@ const inline char* getNcclErrorDetailStr(ncclResult_t error) {
 #define ENABLE_NCCL_P2P_SUPPORT
 #endif
 
+// NCCL BFloat16 is enabled only for CUDA 11+ and NCCL versions 2.9.7+
+#if (defined(__CUDA_BF16_TYPES_EXIST__) && \
+    defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && \
+    (defined(NCCL_MINOR) && ((NCCL_MINOR > 9) || \
+    ((NCCL_MINOR == 9) && defined(NCCL_PATCH) && (NCCL_PATCH >= 7))))) || \
+    (defined(__HIP_PLATFORM_HCC__) && (TORCH_HIP_VERSION >= 301))
+#define ENABLE_NCCL_BF16_DATATYPE
+#endif
+
 // Macro to throw on a non-successful NCCL return value.
 #define C10D_NCCL_CHECK(cmd)                                                  \
   do {                                                                        \
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 3c7041a2dd691..911963b76cd7c 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -63,7 +63,7 @@ std::map<at::ScalarType, ncclDataType_t> ncclDataType = {
     {at::kLong, ncclInt64},
     {at::kHalf, ncclHalf},
     {at::kBool, ncclUint8},
-#if defined(__HIP_PLATFORM_HCC__) && TORCH_HIP_VERSION >= 301
+#if defined(ENABLE_NCCL_BF16_DATATYPE)
     {at::kBFloat16, ncclBfloat16},
 #endif
 };
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
index 35ddf316e91c5..c3f3b066ee478 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
@@ -46,6 +46,9 @@ class DDPCommHookType(Enum):
     FP16_COMPRESS = partial(
         _ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook
     )
+    BF16_COMPRESS = partial(
+        _ddp_comm_hook_wrapper, comm_hook=default.bf16_compress_hook
+    )
     QUANTIZE_PER_TENSOR = partial(
         _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook
     )
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py b/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py
index 0642deace3565..d11e39b23f6f0 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py
@@ -69,6 +69,41 @@ def decompress(fut):
 
     return fut.then(decompress)
 
+# TODO: create an internal helper function and extract the duplicate code in FP16_compress and BF16_compress.
+def bf16_compress_hook(
+    process_group: dist.ProcessGroup, bucket: dist.GradBucket
+) -> torch.futures.Future[torch.Tensor]:
+    """
+    Warning: This API is experimental, and it requires NCCL version later than 2.9.6.
+
+    This DDP communication hook implements a simple gradient compression
+    approach that casts ``GradBucket`` tensor to half-precision
+    `Brain floating point format <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format>`_ (``torch.bfloat16``)
+    and then divides it by the process group size.
+    It allreduces those ``bfloat16`` gradient tensors. Once compressed gradient
+    tensors are allreduced, the chained callback ``decompress`` casts it back to the input data type (such as ``float32``).
+
+    Example::
+        >>> ddp_model.register_comm_hook(process_group, bf16_compress_hook)
+    """
+    group_to_use = process_group if process_group is not None else dist.group.WORLD
+    world_size = group_to_use.size()
+
+    compressed_tensor = bucket.buffer().to(torch.bfloat16).div_(world_size)
+
+    fut = dist.all_reduce(
+        compressed_tensor, group=group_to_use, async_op=True
+    ).get_future()
+
+    def decompress(fut):
+        decompressed_tensor = bucket.buffer()
+        # Decompress in place to reduce the peak memory.
+        # See: https://github.com/pytorch/pytorch/issues/45968
+        decompressed_tensor.copy_(fut.value()[0])
+        return decompressed_tensor
+
+    return fut.then(decompress)
+
 
 class _OptimizerHookState(object):
     """
@@ -160,3 +195,40 @@ def decompress(fut):
         return fut.then(decompress)
 
     return fp16_compress_wrapper_hook
+
+def bf16_compress_wrapper(
+    hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]
+) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
+    """
+    Warning: This API is experimental, and it requires NCCL version later than 2.9.6.
+
+    This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision
+    `Brain floating point format <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format> `_  (``torch.bfloat16``),
+    and casts the resulting tensor of the given hook back to the input data type, such as ``float32``.
+
+    Therefore, ``bf16_compress_hook`` is equivalent to ``bf16_compress_wrapper(allreduce_hook)``.
+
+    Example::
+        >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
+        >>> ddp_model.register_comm_hook(state, bf16_compress_wrapper(powerSGD_hook))
+    """
+
+    def bf16_compress_wrapper_hook(
+        hook_state, bucket: dist.GradBucket
+    ) -> torch.futures.Future[torch.Tensor]:
+        # Cast bucket tensor to BF16.
+        bucket.set_buffer(bucket.buffer().to(torch.bfloat16))
+
+        fut = hook(hook_state, bucket)
+
+        def decompress(fut):
+            decompressed_tensor = bucket.buffer()
+            # Decompress in place to reduce the peak memory.
+            # See: https://github.com/pytorch/pytorch/issues/45968
+            decompressed_tensor.copy_(fut.value())
+            return decompressed_tensor
+
+        # Decompress after hook has run.
+        return fut.then(decompress)
+
+    return bf16_compress_wrapper_hook
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index ed5c3656203ee..60d21431dc5bf 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -1408,6 +1408,19 @@ def _check_comm_hook(self, hook):
                 "Communication hook: return annotation should be torch.futures.Future[torch.Tensor].",
             )
 
+        if (
+            hook.__name__ in ["bf16_compress_hook", "bf16_compress_wrapper_hook"]
+            and
+            (
+                torch.version.cuda is None
+                or int(torch.version.cuda.split('.')[0]) < 11
+                or not dist.is_available()
+                or not dist.is_nccl_available()
+                or torch.cuda.nccl.version() < (2, 9, 7)
+            )
+        ):
+            self._log_and_throw(TypeError, "BF16 all reduce communication hook required CUDA 11+ and NCCL 2.9.7+.")
+
     @property
     def _distributed_rank(self):
         return dist.get_rank(self.process_group)

From 2544664e5470cd5012bd88d07fb573aa31ca8c60 Mon Sep 17 00:00:00 2001
From: Edward Yang <ezyang@fb.com>
Date: Wed, 18 Aug 2021 20:56:25 -0700
Subject: [PATCH 053/530] Beef up comment in AccumulateType (#63503)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63503

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D30403160

Pulled By: ezyang

fbshipit-source-id: 6cb24418152d9fb146f86b6f973ec50f1a397a58
---
 aten/src/ATen/AccumulateType.h | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/AccumulateType.h b/aten/src/ATen/AccumulateType.h
index 09c8cdb6c095a..4270ec021dbc7 100644
--- a/aten/src/ATen/AccumulateType.h
+++ b/aten/src/ATen/AccumulateType.h
@@ -6,7 +6,38 @@
 
 // Defines the accumulation type for a scalar type.
 // Example:
-//   using accscalar_t = acc_type<scalar_t, true>;
+//   using accscalar_t = acc_type<scalar_t, /*is_cuda*/true>;
+//
+// Accumulation types are an important concept in numeric computing
+// because you frequently want to perform intermediate computations
+// at a higher precision than the input and output precision, to avoid
+// compounding internal rounding errors.  Accumulation is the most
+// well-known intermediate computation (it is of great importance for
+// sum reduction and matrix multiply, for example), but in PyTorch
+// acc_type ends up getting used for all sorts of other intermediate
+// computations, so it perhaps would be more accurately (ahem) called an
+// "accurate" type.  acc_type is especially important for reduced
+// precision operations like float16 and bfloat16, where relatively
+// benign looking inputs can easily end up overflowing/underflowing.
+//
+// acc_type is parametrized by whether or not you are running on CUDA
+// or not, because on CUDA double precision operations are expensive
+// and so by default, we don't actually want to use double as an
+// acc_type on CUDA.  A lot of things are typed out below, but
+// basically, the table is generated by a few rules:
+//
+//  If bool:
+//      Use 'bool' as acc_type.
+//  If floating point:
+//      If CUDA, use 'float' as acc_type (unless scalar_t is double),
+//      otherwise (CPU) use 'double'
+//  If integral:
+//      Use 'int64_t' as acc_type
+//
+// You're not forced to use this template; if you happen to know
+// something specific about your use case, you can specify your own
+// desired behavior.  This template, however, will give you a reasonable
+// default that will work for all dtypes supported in PyTorch.
 
 #if defined(__CUDACC__)
 #include <cuda.h>

From 773c8b6440eee7d903f3d3a80d1fa866370b1ba2 Mon Sep 17 00:00:00 2001
From: Michael Dagitses <mikeyd@fb.com>
Date: Wed, 18 Aug 2021 21:39:18 -0700
Subject: [PATCH 054/530] support optional comparisons with different but
 comparable types (#62890)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/62565

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62890

Reviewed By: ejguan

Differential Revision: D30396008

Pulled By: dagitses

fbshipit-source-id: fca02207509f882973d54484f89c4d116505fc66
---
 c10/test/util/optional_test.cpp |  8 +++++-
 c10/util/Optional.h             | 48 ++++++++++++++++-----------------
 2 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/c10/test/util/optional_test.cpp b/c10/test/util/optional_test.cpp
index cac325f9188ab..ac976b4b16f79 100644
--- a/c10/test/util/optional_test.cpp
+++ b/c10/test/util/optional_test.cpp
@@ -146,10 +146,16 @@ TEST(OptionalTest, Nullopt) {
 using CmpTestTypes = testing::Types<
     // between two optionals
     std::pair<c10::optional<int>, c10::optional<int>>,
+
     // between an optional and a value
     std::pair<c10::optional<int>, int>,
     // between a value and an optional
-    std::pair<int, c10::optional<int>>>;
+    std::pair<int, c10::optional<int>>,
+
+    // between an optional and a differently typed value
+    std::pair<c10::optional<int>, long>,
+    // between a differently typed value and an optional
+    std::pair<long, c10::optional<int>>>;
 template <typename T>
 class CmpTest : public testing::Test {};
 TYPED_TEST_CASE(CmpTest, CmpTestTypes);
diff --git a/c10/util/Optional.h b/c10/util/Optional.h
index 5e0684bb7d2f5..7044c798d2de4 100644
--- a/c10/util/Optional.h
+++ b/c10/util/Optional.h
@@ -1049,63 +1049,63 @@ constexpr bool operator>=(nullopt_t, const optional<T>& x) noexcept {
 }
 
 // 20.5.10, Comparison with T
-template <class T>
-constexpr bool operator==(const optional<T>& x, const T& v) {
+template <class T, class U>
+constexpr bool operator==(const optional<T>& x, const U& v) {
   return bool(x) ? *x == v : false;
 }
 
-template <class T>
-constexpr bool operator==(const T& v, const optional<T>& x) {
+template <class T, class U>
+constexpr bool operator==(const U& v, const optional<T>& x) {
   return bool(x) ? v == *x : false;
 }
 
-template <class T>
-constexpr bool operator!=(const optional<T>& x, const T& v) {
+template <class T, class U>
+constexpr bool operator!=(const optional<T>& x, const U& v) {
   return bool(x) ? *x != v : true;
 }
 
-template <class T>
-constexpr bool operator!=(const T& v, const optional<T>& x) {
+template <class T, class U>
+constexpr bool operator!=(const U& v, const optional<T>& x) {
   return bool(x) ? v != *x : true;
 }
 
-template <class T>
-constexpr bool operator<(const optional<T>& x, const T& v) {
+template <class T, class U>
+constexpr bool operator<(const optional<T>& x, const U& v) {
   return bool(x) ? *x < v : true;
 }
 
-template <class T>
-constexpr bool operator>(const T& v, const optional<T>& x) {
+template <class T, class U>
+constexpr bool operator>(const U& v, const optional<T>& x) {
   return bool(x) ? v > *x : true;
 }
 
-template <class T>
-constexpr bool operator>(const optional<T>& x, const T& v) {
+template <class T, class U>
+constexpr bool operator>(const optional<T>& x, const U& v) {
   return bool(x) ? *x > v : false;
 }
 
-template <class T>
-constexpr bool operator<(const T& v, const optional<T>& x) {
+template <class T, class U>
+constexpr bool operator<(const U& v, const optional<T>& x) {
   return bool(x) ? v < *x : false;
 }
 
-template <class T>
-constexpr bool operator>=(const optional<T>& x, const T& v) {
+template <class T, class U>
+constexpr bool operator>=(const optional<T>& x, const U& v) {
   return bool(x) ? *x >= v : false;
 }
 
-template <class T>
-constexpr bool operator<=(const T& v, const optional<T>& x) {
+template <class T, class U>
+constexpr bool operator<=(const U& v, const optional<T>& x) {
   return bool(x) ? v <= *x : false;
 }
 
-template <class T>
-constexpr bool operator<=(const optional<T>& x, const T& v) {
+template <class T, class U>
+constexpr bool operator<=(const optional<T>& x, const U& v) {
   return bool(x) ? *x <= v : true;
 }
 
-template <class T>
-constexpr bool operator>=(const T& v, const optional<T>& x) {
+template <class T, class U>
+constexpr bool operator>=(const U& v, const optional<T>& x) {
   return bool(x) ? v >= *x : true;
 }
 

From 1d62fb8a63a4c69f791f623e913ee1eabb56c344 Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Wed, 18 Aug 2021 22:56:47 -0700
Subject: [PATCH 055/530] [TensorExpr] Speedup ExternalCall.ComputeInterop test
 by reducing tensor sizes. (#63526)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63526

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30411410

Pulled By: ZolotukhinM

fbshipit-source-id: d9a99afac14d2238b5100c98ae9ed4467f9f05ea
---
 test/cpp/tensorexpr/test_external_calls.cpp | 22 ++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp
index 9ae99ca5d3b2f..24ddfbf095ab3 100644
--- a/test/cpp/tensorexpr/test_external_calls.cpp
+++ b/test/cpp/tensorexpr/test_external_calls.cpp
@@ -563,12 +563,12 @@ TEST(ExternalCall, ComputeInterop) {
   // use Tensors built with Compute API.
   KernelScope kernel_scope;
 
-  BufHandle ConvResultBuf("ConvResult", {1, 16, 112, 112}, kFloat);
-  BufHandle MatmulResultBuf("MatmulResult", {1, 16, 112, 112}, kFloat);
+  BufHandle ConvResultBuf("ConvResult", {1, 16, 32, 32}, kFloat);
+  BufHandle MatmulResultBuf("MatmulResult", {1, 16, 32, 32}, kFloat);
 
   Tensor* Input = Compute(
       "Input",
-      {{1, "n"}, {16, "c"}, {112, "h"}, {112, "w"}},
+      {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}},
       [&](const VarHandle& n,
           const VarHandle& c,
           const VarHandle& h,
@@ -597,7 +597,7 @@ TEST(ExternalCall, ComputeInterop) {
           {}));
   Tensor* Result = Compute(
       "Result",
-      {{1, "n"}, {16, "c"}, {112, "h"}, {112, "w"}},
+      {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}},
       [&](const VarHandle& n,
           const VarHandle& c,
           const VarHandle& h,
@@ -619,18 +619,18 @@ TEST(ExternalCall, ComputeInterop) {
                      .layout(at::kStrided)
                      .device(at::kCPU)
                      .requires_grad(false);
-  at::Tensor input = at::ones({1, 16, 112, 112}, options) * 5.f;
+  at::Tensor input = at::ones({1, 16, 32, 32}, options) * 5.f;
   at::Tensor weight = at::ones({16, 16, 1, 1}, options) * 6.f;
   at::Tensor t = at::conv2d(input, weight);
   at::Tensor t2 = at::matmul(t, t);
   at::Tensor ref = t + t2;
 
   at::Tensor nnc_result;
-  std::vector<float> input_buf(1 * 16 * 112 * 112, 5.f);
+  std::vector<float> input_buf(1 * 16 * 32 * 32, 5.f);
   std::vector<float> weight_buf(16 * 16 * 1 * 1, 6.f);
-  std::vector<float> conv_result_buf(1 * 16 * 112 * 112, -1.f);
-  std::vector<float> matmul_result_buf(1 * 16 * 112 * 112, -1.f);
-  std::vector<float> result_buf(1 * 16 * 112 * 112, -1.f);
+  std::vector<float> conv_result_buf(1 * 16 * 32 * 32, -1.f);
+  std::vector<float> matmul_result_buf(1 * 16 * 32 * 32, -1.f);
+  std::vector<float> result_buf(1 * 16 * 32 * 32, -1.f);
 
 #ifdef TORCH_ENABLE_LLVM
   LLVMCodeGen llvm_codegen(
@@ -638,7 +638,7 @@ TEST(ExternalCall, ComputeInterop) {
 
   llvm_codegen.call(
       {input_buf, weight_buf, conv_result_buf, matmul_result_buf, result_buf});
-  nnc_result = at::from_blob(result_buf.data(), {1, 16, 112, 112}, options);
+  nnc_result = at::from_blob(result_buf.data(), {1, 16, 32, 32}, options);
   ASSERT_TRUE(at::allclose(nnc_result, ref));
 #endif
 
@@ -647,7 +647,7 @@ TEST(ExternalCall, ComputeInterop) {
 
   ir_eval.call(
       {input_buf, weight_buf, conv_result_buf, matmul_result_buf, result_buf});
-  nnc_result = at::from_blob(result_buf.data(), {1, 16, 112, 112}, options);
+  nnc_result = at::from_blob(result_buf.data(), {1, 16, 32, 32}, options);
   ASSERT_TRUE(at::allclose(nnc_result, ref));
 }
 

From 6e00b31b15ba9a09b6aa71b0da1ba200be482011 Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Wed, 18 Aug 2021 22:56:47 -0700
Subject: [PATCH 056/530] [TensorExpr] Make CacheReplacer and IndexFlattener
 mutate stmts/exprs inplace. (#63527)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63527

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30411411

Pulled By: ZolotukhinM

fbshipit-source-id: efb14ee57b36537fa4fefa89bdd6bafe7151c012
---
 test/cpp/tensorexpr/test_loopnest.cpp      |  2 +-
 torch/csrc/jit/tensorexpr/loopnest.cpp     | 66 +++++++++++++---------
 torch/csrc/jit/tensorexpr/registerizer.cpp |  6 +-
 3 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index f2ae208ca7fed..4a2a1d07db12e 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -4017,7 +4017,7 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) {
 
   // Will eliminate the write to g, but not f since it used by the producer of
   // h.
-  LoopNest loop(stmt, {h.node()});
+  LoopNest loop(Stmt::clone(stmt), {h.node()});
   loop.eliminateDeadStores();
 
   checkIR(loop.root_stmt(), R"IR(
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index e9bc76c6e8791..2256369e2e9e2 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -109,12 +109,13 @@ class IndexFlattener : public IRMutator {
     ExprPtr value = v->value();
     ExprPtr new_value = value->accept_mutator(this);
     if (v->indices().size() == 1 && value == new_value) {
-      return (StmtPtr)v;
+      return v;
     }
-    return alloc<Store>(
-        v->buf(),
-        std::vector<ExprPtr>({flatten_index(v->buf()->dims(), v->indices())}),
-        new_value);
+    std::vector<ExprPtr> indices = {
+        flatten_index(v->buf()->dims(), v->indices())};
+    v->set_indices(indices);
+    v->set_value(new_value);
+    return v;
   }
 };
 
@@ -2575,8 +2576,9 @@ class CacheReplacer : public IRMutator {
       ExprPtr sub = IRSimplifier::simplify(alloc<Sub>(index, offset));
       newIndices.push_back(sub);
     }
-
-    return alloc<Load>(cache_, newIndices);
+    v->set_buf(cache_);
+    v->set_indices(newIndices);
+    return v;
   }
 
   StmtPtr mutate(StorePtr v) override {
@@ -2596,8 +2598,10 @@ class CacheReplacer : public IRMutator {
       ExprPtr sub = IRSimplifier::simplify(alloc<Sub>(index, offset));
       newIndices.push_back(sub);
     }
-
-    return alloc<Store>(cache_, newIndices, newValue);
+    v->set_buf(cache_);
+    v->set_indices(newIndices);
+    v->set_value(newValue);
+    return v;
   }
 
   BufPtr buf_;
@@ -2669,21 +2673,13 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
 
   // Replace acceses to the producer in the consumer with the cache.
   CacheReplacer replacer(producer, tmp_buf, info.start);
-  // TODO: Can we reuse 'consumer' below without cloning?
-  StmtPtr new_consumer =
-      IRSimplifier::simplify(Stmt::clone(consumer)->accept_mutator(&replacer));
+  consumer->accept_mutator(&replacer);
 
   // replace the old consumer with the replaced consumer.
-  BlockPtr consumer_block = nullptr;
+  BlockPtr consumer_block = to<Block>(consumer);
+  BlockPtr parent_block = to<Block>(consumer->get_parent());
   // if the consumer is a block, we should mutate it in place.
-  if ((consumer_block = to<Block>(consumer))) {
-    consumer_block->clear();
-    consumer_block->append_stmt(new_consumer);
-  } else {
-    consumer_block = to<Block>(consumer->get_parent());
-    assert(consumer_block);
-    consumer_block->replace_stmt(consumer, new_consumer);
-  }
+  bool is_block = consumer_block != nullptr;
 
   // If there's a reduction and we are operating on the reduce axis, we need to
   // initialize the cache with 0s. Also, we can't just write the result straight
@@ -2715,7 +2711,11 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
           alloc<For>(new_loop_vars[i], alloc<IntImm>(0), tmp_dims[i], tmp_init);
     }
 
-    consumer_block->insert_stmt_before(tmp_init, new_consumer);
+    if (is_block) {
+      consumer_block->prepend_stmt(tmp_init);
+    } else {
+      parent_block->insert_stmt_before(tmp_init, consumer);
+    }
 
     // Reduce back to the original buffer:
     StmtPtr tmp_store = alloc<Store>(
@@ -2732,9 +2732,13 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
           new_loop_vars[i], alloc<IntImm>(0), tmp_dims[i], tmp_store);
     }
 
-    consumer_block->insert_stmt_after(tmp_store, new_consumer);
+    if (is_block) {
+      consumer_block->append_stmt(tmp_store);
+    } else {
+      parent_block->insert_stmt_after(tmp_store, consumer);
+    }
 
-    return std::make_pair(tmp_buf, new_consumer);
+    return std::make_pair(tmp_buf, consumer);
   }
 
   if (hasReads) {
@@ -2747,7 +2751,11 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
           new_loop_vars[i], alloc<IntImm>(0), tmp_dims[i], tmp_store);
     }
 
-    consumer_block->insert_stmt_before(tmp_store, new_consumer);
+    if (is_block) {
+      consumer_block->prepend_stmt(tmp_store);
+    } else {
+      parent_block->insert_stmt_before(tmp_store, consumer);
+    }
   }
 
   if (hasWrites) {
@@ -2760,10 +2768,14 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
           new_loop_vars[i], alloc<IntImm>(0), tmp_dims[i], tmp_store);
     }
 
-    consumer_block->insert_stmt_after(tmp_store, new_consumer);
+    if (is_block) {
+      consumer_block->append_stmt(tmp_store);
+    } else {
+      parent_block->insert_stmt_after(tmp_store, consumer);
+    }
   }
 
-  return std::make_pair(tmp_buf, new_consumer);
+  return std::make_pair(tmp_buf, consumer);
 }
 
 /*
diff --git a/torch/csrc/jit/tensorexpr/registerizer.cpp b/torch/csrc/jit/tensorexpr/registerizer.cpp
index 07aee209e6e53..bc26581970383 100644
--- a/torch/csrc/jit/tensorexpr/registerizer.cpp
+++ b/torch/csrc/jit/tensorexpr/registerizer.cpp
@@ -668,8 +668,10 @@ StmtPtr RegisterizerReplacer::mutate(StorePtr v) {
 
   ExprPtr new_val = v->value()->accept_mutator(this);
 
-  return alloc<Store>(
-      info->replacement().var_wrapper, std::vector<ExprPtr>({}), new_val);
+  v->set_value(new_val);
+  v->set_buf(info->replacement().var_wrapper);
+  v->set_indices({});
+  return v;
 }
 
 StmtPtr RegisterizerReplacer::mutate(BlockPtr v) {

From 531262fe2eee7e2b4464f5eceec1877dd57e2deb Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Wed, 18 Aug 2021 22:59:40 -0700
Subject: [PATCH 057/530] layernorm (#63436)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63436

use MKLDNN layernorm

use mkldnn version 2

address Elias feedback

fix build CI errors

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D30388825

Pulled By: Krovatkin

fbshipit-source-id: fb909bfbf53cb8567a43aac40f51c491daeec908
---
 aten/src/ATen/native/mkldnn/Normalization.cpp | 51 ++++++++++++++++++
 aten/src/ATen/native/mkldnn/Utils.h           |  7 +++
 test/jit/test_freezing.py                     | 26 ++++++++-
 .../csrc/jit/passes/frozen_ops_to_mkldnn.cpp  | 54 +++++++++++++++++++
 4 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mkldnn/Normalization.cpp b/aten/src/ATen/native/mkldnn/Normalization.cpp
index 9836f3560d038..3a151828f236c 100644
--- a/aten/src/ATen/native/mkldnn/Normalization.cpp
+++ b/aten/src/ATen/native/mkldnn/Normalization.cpp
@@ -25,6 +25,13 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_batch_norm_backward(
   TORCH_CHECK(false, "mkldnn_batch_norm_backward: ATen not compiled with MKLDNN support");
 }
 
+std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
+    const Tensor& input,
+    IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias,
+    double eps) {
+  TORCH_CHECK(false, "mkldnn_layer_norm_last_index_weight_bias_f32: ATen not compiled with MKLDNN support");
+}
+
 } // namespace native
 } // namespace at
 
@@ -32,10 +39,54 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_batch_norm_backward(
 
 #include <ATen/native/mkldnn/MKLDNNCommon.h>
 #include <ATen/native/mkldnn/Utils.h>
+#include <ATen/native/layer_norm.h>
+#include <ideep/abstract_types.hpp>
 
 namespace at {
 namespace native {
 
+std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
+    const Tensor& input,
+    IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias,
+    double eps) {
+
+  TORCH_INTERNAL_ASSERT(normalized_shape.size() == 1, "only accept shapes with the last dimension");
+  TORCH_INTERNAL_ASSERT(input.scalar_type() == at::kFloat);
+  auto M_N = at::native::_check_layer_norm_inputs(input, normalized_shape, weight, bias);
+  auto M = M_N.first;
+
+  auto mean = empty_mkldnn(
+        {M},
+        input.scalar_type(),
+        input.options().layout_opt(),
+        input.options().device_opt(),
+        input.options().pinned_memory_opt());
+  auto rstd = empty_mkldnn(
+        {M},
+        input.scalar_type(),
+        input.options().layout_opt(),
+        input.options().device_opt(),
+        input.options().pinned_memory_opt());
+
+  auto mean_it = at::native::itensor_from_mkldnn(mean);
+  auto rstd_it = at::native::itensor_from_mkldnn(rstd);
+
+  auto input_it = at::native::itensor_from_mkldnn(input);
+  auto weight_it = at::native::itensor_from_mkldnn(weight);
+  auto bias_it = at::native::itensor_from_mkldnn(bias);
+
+  auto out_it = ideep::tensor(input_it.get_desc());
+  ideep::layer_normalization_forward::compute(input_it, weight_it, bias_it, out_it, mean_it, rstd_it, static_cast<float>(eps));
+
+  auto dst = at::native::new_with_itensor_mkldnn(
+      std::move(out_it),
+      optTypeMetaToScalarType(input.options().dtype_opt()),
+      input.options().device_opt());
+
+  return std::make_tuple(dst, mean, rstd);
+}
+
+
 std::tuple<Tensor, Tensor, Tensor> mkldnn_batch_norm(
     const Tensor& input, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, const c10::optional<Tensor>& running_mean_opt, const c10::optional<Tensor>& running_var_opt,
     bool train,
diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h
index f2e4e8f9056df..60a7d457fe12a 100644
--- a/aten/src/ATen/native/mkldnn/Utils.h
+++ b/aten/src/ATen/native/mkldnn/Utils.h
@@ -1,11 +1,18 @@
 #pragma once
 
 #include <c10/util/ArrayRef.h>
+#include <ATen/ATen.h>
 #include <vector>
 #include <cpuinfo.h>
 
+
 namespace at { namespace native {
 
+std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
+    const Tensor& input,
+    IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias,
+    double eps);
+
 std::vector<int64_t> pool_output_sizes(
     IntArrayRef input_size,
     IntArrayRef kernel_size,
diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py
index 1e2037e59a0ba..5c70b2a1ec233 100644
--- a/test/jit/test_freezing.py
+++ b/test/jit/test_freezing.py
@@ -1927,6 +1927,30 @@ def test_pool3d_batchnorm(self):
                 FileCheck().check("aten::to_dense").check_next("return").run(mod.graph)
                 self.assertTrue(torch.allclose(sub_model(inp), mod(inp)))
 
+    @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
+    @skipIfNoTorchVision
+    def test_layernorm(self):
+        with set_default_dtype(torch.float):
+            model = torchvision.models.resnet18()
+            N, C, H, W, = 10, 3, 224, 224
+            for param in ((model.conv1, [W // 2], torch.randn(N, C, H, W)),
+                          (model.conv1, [H // 2, W // 2], torch.randn(N, C, H, W)),
+                          (torch.nn.Linear(H, W), [W], torch.randn(N, C, W)),):
+
+                for layernorm in (torch.nn.LayerNorm(param[1]),
+                                  torch.nn.LayerNorm(param[1], elementwise_affine=False)):
+                    sub_model = torch.nn.Sequential(param[0], layernorm)
+                    sub_model.eval()
+                    mod = torch.jit.freeze(torch.jit.script(sub_model))
+                    self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph)
+                    # if weight and bias are present and shape is the last dimension
+                    # we should convert `aten::layer_norm` to `prim::MKLDNNLayerNorm`
+                    if layernorm.elementwise_affine and len(param[1]) == 1:
+                        FileCheck().check("prim::MKLDNNLayerNorm").check_count("aten::to_dense", 1, exactly=True).run(mod.graph)
+                    else:
+                        FileCheck().check_count("aten::to_dense", 1, exactly=True).run(mod.graph)
+                    self.assertTrue(torch.allclose(sub_model(param[2]), mod(param[2]), 1e-04, 1e-04))
+
     @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
     @skipIfNoTorchVision
     def test_conv_hardswish(self):
@@ -1940,6 +1964,7 @@ def __init__(self, min_val, max_val, **kwargs):
                 def forward(self, x):
                     return torch.clamp(x, self.min_val, self.max_val)
 
+            N, C, H, W, = 10, 3, 224, 224
             activations = [
                 torch.nn.Hardswish(),
                 torch.nn.Hardsigmoid(),
@@ -1960,7 +1985,6 @@ def forward(self, x):
                 sub_model = torch.nn.Sequential(model.conv1, activation)
                 sub_model.eval()
                 mod = torch.jit.freeze(torch.jit.script(sub_model))
-                N, C, H, W, = 10, 3, 224, 224
                 inp = torch.randn(N, C, H, W)
                 self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph)
                 FileCheck().check_count("aten::to_dense", 1, exactly=True).run(mod.graph)
diff --git a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp
index e6faf90b6f2b6..3358b9db26808 100644
--- a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp
+++ b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp
@@ -1,9 +1,11 @@
 #include <ATen/Config.h>
 #include <ATen/Utils.h>
 #include <ATen/core/interned_strings.h>
+#include <ATen/native/layer_norm.h>
 #include <c10/core/ScalarType.h>
 #include <c10/util/Exception.h>
 #include <c10/util/irange.h>
+
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/ir/constants.h>
 #include <torch/csrc/jit/ir/ir.h>
@@ -33,6 +35,7 @@
 #if AT_MKLDNN_ENABLED()
 #include <ATen/CPUFunctions.h>
 #include <dnnl_types.h>
+#include <ATen/native/mkldnn/Utils.h>
 #include <ATen/native/mkldnn/MKLDNNCommon.h>
 #include <ideep.hpp>
 #endif
@@ -271,6 +274,33 @@ Operation createUnaryOp(
   };
 }
 
+void MKLDNNLayerNormOp(Stack* stack) {
+  c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
+
+  // enable_cudnn not used
+  pop(stack);
+  auto eps = pop(stack).toDouble();
+
+  Tensor bias{};
+  Tensor weight{};
+  auto bias_ival = pop(stack);
+  TORCH_INTERNAL_ASSERT(bias_ival.isTensor());
+  bias = bias_ival.toTensor();
+
+  auto weight_ival = pop(stack);
+  TORCH_INTERNAL_ASSERT(weight_ival.isTensor());
+  weight = weight_ival.toTensor();
+
+  auto shape = pop(stack).toIntVector();
+  auto input = pop(stack).toTensor();
+
+  at::Tensor dst, mean, rstd;
+  std::tie(dst, mean, rstd) =
+      at::native::mkldnn_layer_norm_last_index_weight_bias_f32(
+          input, shape, weight, bias, eps);
+  push(stack, dst);
+};
+
 Operation BroadOp(const Node* node) {
   return [](Stack* stack) {
     auto b = pop(stack).toTensor();
@@ -437,6 +467,13 @@ const RegisterOperators BroadOpReg({
         AliasAnalysisKind::INTERNAL_SPECIAL_CASE),
 });
 
+const RegisterOperators MKLDNNLayerNormOpReg({
+    torch::jit::Operator(
+        "prim::MKLDNNLayerNorm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor",
+        MKLDNNLayerNormOp,
+        AliasAnalysisKind::FROM_SCHEMA),
+});
+
 Operation ConstantMKLDNNTensorOp(const Node* node) {
   const auto& t = node->t(attr::value);
   return [t](Stack* stack) {
@@ -719,6 +756,13 @@ void ComputeSubgraphInMKLDNN(Node* subgraph_node) {
       continue;
     }
 
+    if (body_node->matches(
+            "aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor")) {
+      body_node->replaceWithNewSymbol(Symbol::prim("MKLDNNLayerNorm"));
+      body_node->destroy();
+      continue;
+    }
+
     if (body_node->kind() == aten::hardswish) {
       body_node->replaceWithNewSymbol(prim::MKLDNNHardSwish);
       body_node->destroy();
@@ -917,6 +961,16 @@ class MKLDNNSubgraphSlicer {
         return false;
       }
     }
+
+    if (n->matches(
+            "aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor") &&
+        n->namedInput("weight")->type() != NoneType::get() &&
+        n->namedInput("bias")->type() != NoneType::get()) {
+      auto norm_shape =
+          constant_as<std::vector<int64_t>>(n->namedInput("normalized_shape"));
+      return norm_shape.has_value() && norm_shape->size() == 1;
+    }
+
     // unary ops we dont need to prove anything else than
     // the input is mkldnn supported
     switch (n->kind()) {

From 5254e3adb8b2b14e71aee95fffe467a70fd2f93a Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Wed, 18 Aug 2021 22:59:40 -0700
Subject: [PATCH 058/530] layernom inplace (#63437)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63437

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D30388824

Pulled By: Krovatkin

fbshipit-source-id: 852d19bf238544c5de177ed5854dcd01c7ae5572
---
 aten/src/ATen/native/mkldnn/Normalization.cpp |  6 +--
 aten/src/ATen/native/mkldnn/Utils.h           |  2 +-
 test/jit/test_freezing.py                     | 37 +++++++++++++------
 .../csrc/jit/passes/frozen_ops_to_mkldnn.cpp  | 13 +++++--
 4 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/aten/src/ATen/native/mkldnn/Normalization.cpp b/aten/src/ATen/native/mkldnn/Normalization.cpp
index 3a151828f236c..f01bbb3d2b4bd 100644
--- a/aten/src/ATen/native/mkldnn/Normalization.cpp
+++ b/aten/src/ATen/native/mkldnn/Normalization.cpp
@@ -28,7 +28,7 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_batch_norm_backward(
 std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
     const Tensor& input,
     IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias,
-    double eps) {
+    double eps, bool inplace) {
   TORCH_CHECK(false, "mkldnn_layer_norm_last_index_weight_bias_f32: ATen not compiled with MKLDNN support");
 }
 
@@ -48,7 +48,7 @@ namespace native {
 std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
     const Tensor& input,
     IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias,
-    double eps) {
+    double eps, bool inplace) {
 
   TORCH_INTERNAL_ASSERT(normalized_shape.size() == 1, "only accept shapes with the last dimension");
   TORCH_INTERNAL_ASSERT(input.scalar_type() == at::kFloat);
@@ -75,7 +75,7 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
   auto weight_it = at::native::itensor_from_mkldnn(weight);
   auto bias_it = at::native::itensor_from_mkldnn(bias);
 
-  auto out_it = ideep::tensor(input_it.get_desc());
+  auto out_it = inplace ? input_it : ideep::tensor(input_it.get_desc());
   ideep::layer_normalization_forward::compute(input_it, weight_it, bias_it, out_it, mean_it, rstd_it, static_cast<float>(eps));
 
   auto dst = at::native::new_with_itensor_mkldnn(
diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h
index 60a7d457fe12a..abfafd5230e98 100644
--- a/aten/src/ATen/native/mkldnn/Utils.h
+++ b/aten/src/ATen/native/mkldnn/Utils.h
@@ -11,7 +11,7 @@ namespace at { namespace native {
 std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
     const Tensor& input,
     IntArrayRef normalized_shape, const Tensor& weight, const Tensor& bias,
-    double eps);
+    double eps, bool inplace = false);
 
 std::vector<int64_t> pool_output_sizes(
     IntArrayRef input_size,
diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py
index 5c70b2a1ec233..8e07af06b70ea 100644
--- a/test/jit/test_freezing.py
+++ b/test/jit/test_freezing.py
@@ -1931,6 +1931,17 @@ def test_pool3d_batchnorm(self):
     @skipIfNoTorchVision
     def test_layernorm(self):
         with set_default_dtype(torch.float):
+
+            class ResidualLayernorm(torch.nn.Module):
+                def __init__(self, op, layernorm, **kwargs):
+                    super(ResidualLayernorm, self).__init__()
+                    self.op = op
+                    self.layernorm = layernorm
+
+                def forward(self, x):
+                    y = self.op(x)
+                    return self.layernorm(y) + y
+
             model = torchvision.models.resnet18()
             N, C, H, W, = 10, 3, 224, 224
             for param in ((model.conv1, [W // 2], torch.randn(N, C, H, W)),
@@ -1939,17 +1950,21 @@ def test_layernorm(self):
 
                 for layernorm in (torch.nn.LayerNorm(param[1]),
                                   torch.nn.LayerNorm(param[1], elementwise_affine=False)):
-                    sub_model = torch.nn.Sequential(param[0], layernorm)
-                    sub_model.eval()
-                    mod = torch.jit.freeze(torch.jit.script(sub_model))
-                    self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph)
-                    # if weight and bias are present and shape is the last dimension
-                    # we should convert `aten::layer_norm` to `prim::MKLDNNLayerNorm`
-                    if layernorm.elementwise_affine and len(param[1]) == 1:
-                        FileCheck().check("prim::MKLDNNLayerNorm").check_count("aten::to_dense", 1, exactly=True).run(mod.graph)
-                    else:
-                        FileCheck().check_count("aten::to_dense", 1, exactly=True).run(mod.graph)
-                    self.assertTrue(torch.allclose(sub_model(param[2]), mod(param[2]), 1e-04, 1e-04))
+                    # to generate non inplace tests we extend the use of layernorm's input
+                    for inplace in (True, False):
+                        sub_model = torch.nn.Sequential(param[0], layernorm) if inplace else ResidualLayernorm(param[0], layernorm)
+                        sub_model.eval()
+                        mod = torch.jit.freeze(torch.jit.script(sub_model))
+                        self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph)
+                        # if weight and bias are present and shape is the last dimension
+                        # we should convert `aten::layer_norm` to `prim::MKLDNNLayerNorm`
+                        if layernorm.elementwise_affine and len(param[1]) == 1:
+                            inplace_suffix = "_" if inplace else ""
+                            (FileCheck().check("prim::MKLDNNLayerNorm" + inplace_suffix).
+                                check_count("aten::to_dense", 1, exactly=True).run(mod.graph))
+                        else:
+                            FileCheck().check_count("aten::to_dense", 1, exactly=True).check("aten::layer_norm").run(mod.graph)
+                        self.assertTrue(torch.allclose(sub_model(param[2]), mod(param[2]), 1e-04, 1e-04))
 
     @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
     @skipIfNoTorchVision
diff --git a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp
index 3358b9db26808..6d218af06e34c 100644
--- a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp
+++ b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp
@@ -185,7 +185,8 @@ void InplaceMKLDNNSubgraph(std::shared_ptr<Graph> graph) {
     if (k == aten::relu || k == aten::sigmoid || k == aten::dropout ||
         k == prim::MKLDNNHardSwish || k == prim::MKLDNNHardSigmoid ||
         k == prim::MKLDNNHardTanh || k == aten::tanh ||
-        k == prim::MKLDNNClamp || k == Symbol::prim("MKLDNNScalarMul")) {
+        k == prim::MKLDNNClamp || k == Symbol::prim("MKLDNNScalarMul") ||
+        k == Symbol::prim("MKLDNNLayerNorm")) {
       if (set_liveness[alias_mapping[node->inputs().at(0)]]->isAfter(node)) {
         continue;
       }
@@ -274,7 +275,7 @@ Operation createUnaryOp(
   };
 }
 
-void MKLDNNLayerNormOp(Stack* stack) {
+void MKLDNNLayerNormOp(Stack* stack, bool inplace) {
   c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
 
   // enable_cudnn not used
@@ -297,7 +298,7 @@ void MKLDNNLayerNormOp(Stack* stack) {
   at::Tensor dst, mean, rstd;
   std::tie(dst, mean, rstd) =
       at::native::mkldnn_layer_norm_last_index_weight_bias_f32(
-          input, shape, weight, bias, eps);
+          input, shape, weight, bias, eps, inplace);
   push(stack, dst);
 };
 
@@ -470,7 +471,11 @@ const RegisterOperators BroadOpReg({
 const RegisterOperators MKLDNNLayerNormOpReg({
     torch::jit::Operator(
         "prim::MKLDNNLayerNorm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor",
-        MKLDNNLayerNormOp,
+        [](Stack* stack) { MKLDNNLayerNormOp(stack, false); },
+        AliasAnalysisKind::FROM_SCHEMA),
+    torch::jit::Operator(
+        "prim::MKLDNNLayerNorm_(Tensor(a!) input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor(a!)",
+        [](Stack* stack) { MKLDNNLayerNormOp(stack, true); },
         AliasAnalysisKind::FROM_SCHEMA),
 });
 

From 6bb68ba5075a9948e9f52246453e964749226098 Mon Sep 17 00:00:00 2001
From: Chen Lai <chenlai@fb.com>
Date: Thu, 19 Aug 2021 02:12:44 -0700
Subject: [PATCH 059/530] Fix interpreter debug logging message (#63499)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63499

https://github.com/pytorch/pytorch/pull/62418 combine the instruction and debug handle. This change fix the debugging message.
ghstack-source-id: 136184053

Test Plan: Uncomment and it works

Reviewed By: kimishpatel, raziel

Differential Revision: D30390699

fbshipit-source-id: e32b7b297ad3b7d8bffebd025d15519083a244c4
---
 torch/csrc/jit/mobile/interpreter.cpp | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp
index 275b84beba97b..02e7c35792693 100644
--- a/torch/csrc/jit/mobile/interpreter.cpp
+++ b/torch/csrc/jit/mobile/interpreter.cpp
@@ -58,14 +58,15 @@ bool InterpreterState::run(Stack& stack) {
       Instruction inst = inst_with_handle.instruction;
       DebugHandle debug_handle = inst_with_handle.debug_handle;
 
-      //    std::cout << "RUNNING " << pc << " " << code_->instructions_[pc];
-      //    if (inst.op == OP) {
-      //      std::cout << ", " << code_->op_names_[inst.X].name;
-      //      if (!code_->op_names_[inst.X].overload_name.empty()) {
-      //        std::cout << "." << code_->op_names_[inst.X].overload_name;
-      //      }
-      //    }
-      //    std::cout << std::endl;
+      // std::cout << "RUNNING " << pc << " "
+      //           << code_->instructions_with_handles_[pc].instruction;
+      // if (inst.op == OP) {
+      //   std::cout << ", " << code_->op_names_[inst.X].name;
+      //   if (!code_->op_names_[inst.X].overload_name.empty()) {
+      //     std::cout << "." << code_->op_names_[inst.X].overload_name;
+      //   }
+      // }
+      // std::cout << std::endl;
 
       // TODO(iliacher): remove the workaround after RecordFunction is in
       // Dispatcher

From ce61100923b3b423a8c0dbbda4d551d0c1e8c358 Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Thu, 19 Aug 2021 06:19:20 -0700
Subject: [PATCH 060/530] Revert D29399533: Hoisting common expressions out of
 If blocks

Test Plan: revert-hammer

Differential Revision:
D29399533 (https://github.com/pytorch/pytorch/commit/9477211e7d609ce382c0e22d7721c14c36d083de)

Original commit changeset: 9336b9dc48c0

fbshipit-source-id: f081c7280203f40328bcbb0c03a7c6a007acedb7
---
 test/jit/test_if_hoisting.py                  | 213 ------------------
 test/quantization/jit/test_quantize_jit.py    |   3 +-
 test/test_jit.py                              |   1 -
 tools/build_variables.bzl                     |   1 -
 torch/_C/__init__.pyi.in                      |   1 -
 torch/csrc/jit/ir/node_hashing.cpp            |  14 --
 .../jit/passes/common_expression_hoisting.cpp | 153 -------------
 .../jit/passes/common_expression_hoisting.h   |  10 -
 .../jit/passes/symbolic_shape_analysis.cpp    |   1 -
 torch/csrc/jit/python/init.cpp                |   6 -
 torch/csrc/jit/runtime/graph_executor.cpp     |   9 +-
 .../runtime/profiling_graph_executor_impl.cpp | 111 ++++++++-
 12 files changed, 105 insertions(+), 418 deletions(-)
 delete mode 100644 test/jit/test_if_hoisting.py
 delete mode 100644 torch/csrc/jit/passes/common_expression_hoisting.cpp
 delete mode 100644 torch/csrc/jit/passes/common_expression_hoisting.h

diff --git a/test/jit/test_if_hoisting.py b/test/jit/test_if_hoisting.py
deleted file mode 100644
index c8fd4a4bab349..0000000000000
--- a/test/jit/test_if_hoisting.py
+++ /dev/null
@@ -1,213 +0,0 @@
-
-import torch
-from torch.testing import FileCheck
-from torch.testing._internal.jit_utils import JitTestCase
-
-if __name__ == "__main__":
-    raise RuntimeError(
-        "This test file is not meant to be run directly, use:\n\n"
-        "\tpython test/test_jit.py TESTNAME\n\n"
-        "instead."
-    )
-
-
-class TestIfHoisting(JitTestCase):
-    def test_if_hoist_basic(self):
-        def fn(x: bool, y: int):
-            if x:
-                z = y + 3
-            else:
-                z = y + 3
-            return z
-
-
-        fn_script = torch.jit.script(fn)
-        op_graph = fn_script.graph
-        self.run_pass("common_expression_hoisting", op_graph)
-        self.run_pass("dce", op_graph)
-        FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph)
-        FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph)
-        self.assertEqual(fn(True, 1), fn_script(True, 1))
-
-    def test_if_hoist_transposed_expr(self):
-        """
-        Making sure that we can properly eliminate
-        an expression even if it is not at the start
-        of a block
-        """
-        def fn(x: bool, y: int):
-            if x:
-                a = y + 3
-                b = y * 2
-            else:
-                b = y * 2
-                a = y + 3
-            return a, b
-
-        fn_script = torch.jit.script(fn)
-        op_graph = fn_script.graph
-        self.run_pass("common_expression_hoisting", op_graph)
-        self.run_pass("dce", op_graph)
-
-        FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph)
-        FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph)
-
-        self.assertEqual(fn(True, 1), fn_script(True, 1))
-        self.assertEqual(fn(False, 5), fn_script(False, 5))
-
-    def test_if_hoist_swapped_expr(self):
-        """
-        Making sure that the if statement
-        doesn't get fully eliminated here
-        """
-        def fn(x: bool, y: int):
-            if x:
-                a = y + 3
-                b = y * 2
-            else:
-                a = y * 2
-                b = y + 3
-            return a, b
-
-        fn_script = torch.jit.script(fn)
-        op_graph = fn_script.graph
-        self.run_pass("common_expression_hoisting", op_graph)
-        self.run_pass("dce", op_graph)
-
-        FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph)
-        FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph)
-
-        self.assertEqual(fn(True, 1), fn_script(True, 1))
-        self.assertEqual(fn(False, 5), fn_script(False, 5))
-
-    def test_if_hoist_reused_var(self):
-        """
-        Making sure that cases where the python variable is reused
-        is handled correctly
-        """
-        def fn(x: bool, y: int):
-            b = 6
-            if x:
-                a = y + 3
-                a = y * 2
-            else:
-                a = y * 2
-                b = y + 3
-            return a, b
-
-        fn_script = torch.jit.script(fn)
-        op_graph = fn_script.graph
-        self.run_pass("common_expression_hoisting", op_graph)
-        self.run_pass("dce", op_graph)
-
-        FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph)
-        FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph)
-        FileCheck().check_count("aten::mul", 1, exactly=True).run(op_graph)
-
-        self.assertEqual(fn(True, 1), fn_script(True, 1))
-        self.assertEqual(fn(False, 5), fn_script(False, 5))
-
-    def test_no_hoist(self):
-        """
-        Nothing should happen here, expressions are different
-        """
-        def fn(x: bool, y: int, z: int):
-            if x:
-                a = y + 3
-            else:
-                a = z + 3
-            return a
-
-        fn_script = torch.jit.script(fn)
-        op_graph = fn_script.graph
-        self.run_pass("common_expression_hoisting", op_graph)
-        self.run_pass("dce", op_graph)
-
-        FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph)
-        FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph)
-
-        self.assertEqual(fn(True, 1, 3), fn_script(True, 1, 3))
-        self.assertEqual(fn(False, 5, 10), fn_script(False, 5, 10))
-
-    def test_mutate_before(self):
-        """
-        Make sure that if there is a mutation before the common
-        op, the hoist doesn't happen
-        """
-        def fn(x: bool, y: torch.Tensor):
-            if x:
-                y.add_(8)
-                a = y + 3
-            else:
-                a = y + 3
-            return a
-
-        fn_script = torch.jit.script(fn)
-        op_graph = fn_script.graph
-        self.run_pass("common_expression_hoisting", op_graph)
-        self.run_pass("dce", op_graph)
-
-        FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph)
-        FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph)
-        FileCheck().check_count("aten::add_", 1, exactly=True).run(op_graph)
-
-        t1 = torch.Tensor([1])
-        t2 = torch.Tensor([5, 6])
-        self.assertEqual(fn(True, t1), fn_script(True, t1))
-        self.assertEqual(fn(False, t2), fn_script(False, t2))
-
-    def test_mutate_after(self):
-        """
-        Check that the hoist can happen properly, and
-        that the output is still correct.
-        """
-        def fn(x: bool, y: torch.Tensor):
-            if x:
-                b = 1
-                a = y + 3
-                y.add_(8)
-            else:
-                b = 2
-                a = y + 3
-            c = b + a
-            return a
-
-        fn_script = torch.jit.script(fn)
-        op_graph = fn_script.graph
-        self.run_pass("common_expression_hoisting", op_graph)
-        self.run_pass("dce", op_graph)
-
-        FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph)
-        FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph)
-
-        t1 = torch.Tensor([1])
-        t2 = torch.Tensor([5, 6])
-        self.assertEqual(fn(True, t1.clone()), fn_script(True, t1.clone()))
-        self.assertEqual(fn(False, t2.clone()), fn_script(False, t2.clone()))
-
-    def test_multiple_hoists(self):
-        """
-        test that hoists that depend on other hoists are done correctly
-        """
-        def fn(x: bool, y: torch.Tensor):
-            if x:
-                a = y + 3
-                b = a + y
-            else:
-                a = y + 3
-                b = a + y
-            c = b * 2
-            return c
-
-        fn_script = torch.jit.script(fn)
-        op_graph = fn_script.graph
-        self.run_pass("common_expression_hoisting", op_graph)
-        self.run_pass("dce", op_graph)
-
-        FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph)
-        FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph)
-
-        t1 = torch.Tensor([1])
-        t2 = torch.Tensor([5, 6])
-        self.assertEqual(fn(True, t1), fn_script(True, t1))
-        self.assertEqual(fn(False, t2), fn_script(False, t2))
diff --git a/test/quantization/jit/test_quantize_jit.py b/test/quantization/jit/test_quantize_jit.py
index 5fde8e2cc533d..14bb31cf07f1a 100644
--- a/test/quantization/jit/test_quantize_jit.py
+++ b/test/quantization/jit/test_quantize_jit.py
@@ -1214,7 +1214,6 @@ class Res(torch.nn.Module):
             def __init__(self):
                 super(Res, self).__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 1).float()
-                self.conv2 = torch.nn.Conv2d(3, 3, 1).float()
                 self.use_skip = True
 
             def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor:
@@ -1223,7 +1222,7 @@ def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor:
                 if self.use_skip:
                     return self.conv(x)
                 else:
-                    return self.conv2(x)
+                    return self.conv(x)
 
         class M(torch.nn.Module):
             def __init__(self):
diff --git a/test/test_jit.py b/test/test_jit.py
index 6cf1d8e7d5c6e..99df960da5dc4 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -23,7 +23,6 @@
 from jit.test_builtins import TestBuiltins, TestTensorBuiltins  # noqa: F401
 from jit.test_ignore_context_manager import TestIgnoreContextManager  # noqa: F401
 from jit.test_symbolic_shape_analysis import TestSymbolicShapeAnalysis  # noqa: F401
-from jit.test_if_hoisting import TestIfHoisting  # noqa: F401
 from jit.test_unsupported_ops import TestUnsupportedOps  # noqa: F401
 from jit.test_freezing import TestFreezing, TestFrozenOptimizations, TestMKLDNNReinplacing  # noqa: F401
 from jit.test_peephole import TestPeephole  # noqa: F401
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 2e71bedb35db4..89697b4428ca1 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -191,7 +191,6 @@ core_sources_full_mobile_no_backend_interface = [
     "torch/csrc/jit/passes/clear_profiling.cpp",
     "torch/csrc/jit/passes/clear_undefinedness.cpp",
     "torch/csrc/jit/passes/common_subexpression_elimination.cpp",
-    "torch/csrc/jit/passes/common_expression_hoisting.cpp",
     "torch/csrc/jit/passes/concat_opt.cpp",
     "torch/csrc/jit/passes/constant_pooling.cpp",
     "torch/csrc/jit/passes/constant_propagation.cpp",
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 30885d3107176..b683a60615dc5 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -204,7 +204,6 @@ def _jit_pass_inline(Graph) -> None: ...
 def _jit_pass_constant_propagation(Graph) -> None: ...
 def _jit_pass_propagate_shapes_on_graph(Graph) -> None: ...
 def _jit_erase_non_input_shape_information(Graph) -> None: ...
-def _jit_pass_common_expression_hoisting(Graph) -> None: ...
 def _jit_get_schemas_for_operator(name :str) -> List[FunctionSchema]: ...
 def _jit_check_alias_annotation(g: Graph, args: Tuple[Any, ...], unqualified_op_name: str): ...
 def _jit_can_fuse_on_cpu() -> _bool: ...
diff --git a/torch/csrc/jit/ir/node_hashing.cpp b/torch/csrc/jit/ir/node_hashing.cpp
index 9a876d062d2fd..3fd4974ed421b 100644
--- a/torch/csrc/jit/ir/node_hashing.cpp
+++ b/torch/csrc/jit/ir/node_hashing.cpp
@@ -204,8 +204,6 @@ bool attributesEqualCSE(const Node* lhs, const Node* rhs) {
 
 } // anonymous namespace
 
-// Makes a hash that hashes the input Value, the output type
-// as well as the node attributes
 size_t HashNode::operator()(const Node* k) const {
   AT_ASSERT(k != nullptr);
   size_t constant_hash = 0;
@@ -233,8 +231,6 @@ size_t HashNode::operator()(const Node* k) const {
       constant_hash);
 };
 
-// Checks that two nodes have the same inputs, output types
-// and node attributes.
 bool EqualNode::operator()(const Node* lhs, const Node* rhs) const {
   if (lhs == nullptr && rhs == nullptr)
     return true;
@@ -265,16 +261,6 @@ bool EqualNode::operator()(const Node* lhs, const Node* rhs) const {
   if (!attributesEqualCSE(lhs, rhs))
     return false;
 
-  // Check if the blocks contained in a op are the same
-  if (lhs->blocks().size() != rhs->blocks().size()) {
-    return false;
-  }
-  for (size_t i = 0; i < lhs->blocks().size(); ++i) {
-    if (lhs->blocks()[i] != rhs->blocks()[i]) {
-      return false;
-    }
-  }
-
   return true;
 };
 
diff --git a/torch/csrc/jit/passes/common_expression_hoisting.cpp b/torch/csrc/jit/passes/common_expression_hoisting.cpp
deleted file mode 100644
index ab2b9d41afa8b..0000000000000
--- a/torch/csrc/jit/passes/common_expression_hoisting.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-#include <torch/csrc/jit/passes/common_expression_hoisting.h>
-
-#include <torch/csrc/jit/ir/alias_analysis.h>
-#include <torch/csrc/jit/ir/ir.h>
-#include <torch/csrc/jit/ir/node_hashing.h>
-#include <torch/csrc/jit/jit_log.h>
-
-#include <cstddef>
-#include <unordered_set>
-#include <vector>
-
-namespace torch {
-namespace jit {
-namespace {
-
-struct CommonExpressionHoister {
-  CommonExpressionHoister(std::shared_ptr<Graph> graph)
-      : graph_(std::move(graph)) {}
-
-  bool run() {
-    HoistCommonExpression(graph_->block());
-    return changed_;
-  }
-
-  void HoistFromIfNode(Node* if_node) {
-    Block* true_block = if_node->blocks()[0];
-    Block* false_block = if_node->blocks()[1];
-    // find common statements in the two subblocks
-
-    auto true_block_nodes = std::unordered_set<Node*, HashNode, EqualNode>(
-        true_block->nodes().begin(), true_block->nodes().end());
-    for (auto it = false_block->nodes().begin();
-         it != false_block->nodes().end();) {
-      Node* false_b_node = *it;
-      // node may be moved to a different block so advance iterator now
-      ++it;
-
-      auto matching_elem = true_block_nodes.find(false_b_node);
-      if (matching_elem == true_block_nodes.end()) {
-        continue;
-      }
-      Node* true_b_node = *matching_elem;
-
-      // Check if a move to the front of the block is valid
-      // If both of the moves are valid, then we know we can move the item out
-      // of the if blocks entirely.
-      AliasDb& aliasDb = getOrCreateAliasDb();
-      bool true_moveable = aliasDb.couldMoveAfterTopologically(
-          true_b_node, true_block->nodes().front());
-      bool false_moveable = aliasDb.couldMoveAfterTopologically(
-          false_b_node, false_block->nodes().front());
-
-      if (!true_moveable || !false_moveable) {
-        continue;
-      }
-
-      // Get all the uses of the output to delete and reinsert them
-      // as the input would change, the HashNode value would also change.
-      std::unordered_set<Node*> true_b_uses;
-      for (Value* true_out : true_b_node->outputs()) {
-        for (Use true_use : true_out->uses()) {
-          if (true_use.user->owningBlock() == true_block) {
-            // Make sure we are not accidentally adding stuff from subblocks
-            true_b_uses.insert(true_use.user);
-          }
-        }
-      }
-      for (Node* uses_node : true_b_uses) {
-        true_block_nodes.erase(uses_node);
-      }
-
-      // Now hoist the statement out of the block
-      changed_ = true;
-      false_b_node->moveBefore(if_node);
-
-      true_b_node->replaceAllUsesWith(false_b_node);
-
-      true_block_nodes.erase(true_b_node);
-      true_block_nodes.insert(true_b_uses.cbegin(), true_b_uses.cend());
-      true_b_node->destroy();
-    }
-  }
-
-  void EliminateUnnecessaryIfOutputs(Node* if_node) {
-    Block* true_block = if_node->blocks()[0];
-    Block* false_block = if_node->blocks()[1];
-
-    // fix up the if block outputs
-    for (size_t i = 0; i < true_block->outputs().size();) {
-      // Need to check both sides match to eliminate common if block outputs
-      Value* true_block_output = true_block->outputs().at(i);
-      Value* false_block_output = false_block->outputs().at(i);
-      if (true_block_output != false_block_output) {
-        i++;
-        continue;
-      }
-
-      // We have a matching output, and can remove it from the block itself
-      if_node->outputs().at(i)->replaceAllUsesWith(true_block_output);
-      if_node->eraseOutput(i);
-      true_block->eraseOutput(i);
-      false_block->eraseOutput(i);
-      changed_ = true;
-    }
-
-    // No need to test here if the IF block should be eliminated.
-    // The DCE pass will determine that for us.
-  }
-
-  void HoistCommonExpression(Block* block) {
-    for (auto it = block->nodes().begin(); it != block->nodes().end();) {
-      Node* node = *it;
-      ++it;
-
-      for (auto sub_block : node->blocks()) {
-        HoistCommonExpression(sub_block);
-      }
-
-      if (node->kind() == prim::If) {
-        HoistFromIfNode(node);
-        EliminateUnnecessaryIfOutputs(node);
-      }
-    }
-  }
-
-  AliasDb& getOrCreateAliasDb() {
-    if (!alias_db_) {
-      alias_db_ = std::make_unique<AliasDb>(graph_);
-    }
-
-    return *alias_db_;
-  }
-
- private:
-  std::unique_ptr<AliasDb> alias_db_;
-  std::shared_ptr<Graph> graph_;
-  bool changed_ = false;
-};
-} // anonymous namespace
-bool HoistCommonExpression(const std::shared_ptr<Graph>& graph) {
-  // This moves common subexpressions from the two sides of an
-  // if block out of the if block.
-
-  GRAPH_DUMP("Before CEH", graph);
-  CommonExpressionHoister ceh(graph);
-  bool changed = ceh.run();
-  if (changed) {
-    GRAPH_DUMP("After CEH Changes", graph);
-  }
-  return changed;
-}
-} // namespace jit
-} // namespace torch
diff --git a/torch/csrc/jit/passes/common_expression_hoisting.h b/torch/csrc/jit/passes/common_expression_hoisting.h
deleted file mode 100644
index 2aad158eea8f8..0000000000000
--- a/torch/csrc/jit/passes/common_expression_hoisting.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#pragma once
-
-#include <torch/csrc/jit/ir/ir.h>
-
-namespace torch {
-namespace jit {
-
-TORCH_API bool HoistCommonExpression(const std::shared_ptr<Graph>& graph);
-}
-} // namespace torch
diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp
index 10edfb4c496ef..f74a91176cfe8 100644
--- a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp
+++ b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp
@@ -6,7 +6,6 @@
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/ir/ir_views.h>
 #include <torch/csrc/jit/jit_log.h>
-#include <torch/csrc/jit/passes/common_expression_hoisting.h>
 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
 #include <torch/csrc/jit/passes/constant_pooling.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index d582035d6e95e..5fca575593551 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -12,7 +12,6 @@
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/canonicalize.h>
 #include <torch/csrc/jit/passes/canonicalize_graph_fuser_ops.h>
-#include <torch/csrc/jit/passes/common_expression_hoisting.h>
 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
 #include <torch/csrc/jit/passes/constant_pooling.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
@@ -283,11 +282,6 @@ void initJITBindings(PyObject* module) {
           [](std::shared_ptr<Graph>& g) {
             return EliminateCommonSubexpression(g); // overload resolution
           })
-      .def(
-          "_jit_pass_common_expression_hoisting",
-          [](std::shared_ptr<Graph>& g) {
-            return HoistCommonExpression(g); // overload resolution
-          })
       .def(
           "_jit_pass_fuse_quantized_add_relu",
           [](std::shared_ptr<Graph>& g) {
diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp
index bb5f272080601..476882650a1dd 100644
--- a/torch/csrc/jit/runtime/graph_executor.cpp
+++ b/torch/csrc/jit/runtime/graph_executor.cpp
@@ -9,7 +9,6 @@
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/batch_mm.h>
 #include <torch/csrc/jit/passes/canonicalize_graph_fuser_ops.h>
-#include <torch/csrc/jit/passes/common_expression_hoisting.h>
 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
 #include <torch/csrc/jit/passes/constant_pooling.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
@@ -919,7 +918,7 @@ void runOptimization(
       "After EliminateDeadCode, before EliminateCommonSubexpression\n", *graph);
   EliminateCommonSubexpression(graph);
   GRAPH_DEBUG(
-      "After EliminateCommonSubexpression , before PeepholeOptimize\n", *graph);
+      "After EliminateCommonSubexpression, before PeepholeOptimize\n", *graph);
 
   PeepholeOptimize(graph);
   GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph);
@@ -950,10 +949,8 @@ void runOptimization(
 
   EliminateCommonSubexpression(graph);
   GRAPH_DEBUG(
-      "After EliminateCommonSubexpression, before HoistCommonExpression\n",
-      *graph);
-  HoistCommonExpression(graph);
-  GRAPH_DEBUG("After HoistCommonExpression, before CheckInplace\n", *graph);
+      "After EliminateCommonSubexpression, before CheckInplace\n", *graph);
+
   CheckInplace(graph);
   GRAPH_DEBUG("After CheckInplace (end of runOptimization)", *graph);
 }
diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp
index 40d94a4a205fe..b099db17931b0 100644
--- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp
+++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp
@@ -7,7 +7,6 @@
 #include <torch/csrc/jit/passes/canonicalize_graph_fuser_ops.h>
 #include <torch/csrc/jit/passes/clear_profiling.h>
 #include <torch/csrc/jit/passes/clear_undefinedness.h>
-#include <torch/csrc/jit/passes/common_expression_hoisting.h>
 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
 #include <torch/csrc/jit/passes/constant_pooling.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
@@ -333,16 +332,112 @@ void runPreAutodiffPassPipeline(std::shared_ptr<Graph>& graph) {
 
     EliminateCommonSubexpression(graph);
     GRAPH_DEBUG(
-        "After EliminateCommonSubexpression, before HoistCommonExpression\n",
-        *graph);
-    HoistCommonExpression(graph);
-    GRAPH_DEBUG("After HoistCommonExpression, before CheckInplace\n", *graph);
+        "After EliminateCommonSubexpression, before CheckInplace\n", *graph);
+
     CheckInplace(graph);
   }
   GRAPH_DEBUG(
       "After CheckInplace (end of runPreAutodiffPassPipeline)\n", *graph);
 }
 
+void runDiffGraphPasses(std::shared_ptr<Graph>& graph) {
+  GRAPH_DEBUG(
+      "Before EliminateDeadCode (beginning of runDiffGraphPasses)\n", *graph);
+  // runOptimization:
+  {
+    // Basic graph preprocessing to eliminate noise.
+    EliminateDeadCode(graph);
+    GRAPH_DEBUG(
+        "After EliminateDeadCode, before EliminateCommonSubexpression\n",
+        *graph);
+    EliminateCommonSubexpression(graph);
+    GRAPH_DEBUG(
+        "After EliminateCommonSubexpression, before PeepholeOptimize\n",
+        *graph);
+
+    PeepholeOptimize(graph);
+    GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph);
+    ConstantPropagation(graph);
+    GRAPH_DEBUG("After ConstantPropagation, before ConstantPooling\n", *graph);
+    ConstantPooling(graph);
+    GRAPH_DEBUG("After ConstantPooling, before UnrollLoops\n", *graph);
+
+    UnrollLoops(graph);
+    GRAPH_DEBUG("After UnrollLoops, before RemoveListMutation\n", *graph);
+    // run again with unrolled loops
+    RemoveListMutation(graph);
+    GRAPH_DEBUG("After RemoveListMutation, before PeepholeOptimize\n", *graph);
+    PeepholeOptimize(graph);
+    GRAPH_DEBUG("After PeepholeOptimize, before ConstantPropagation\n", *graph);
+    ConstantPropagation(graph);
+    GRAPH_DEBUG(
+        "After ConstantPropagation, before EliminateCommonSubexpression\n",
+        *graph);
+
+    EliminateCommonSubexpression(graph);
+    GRAPH_DEBUG(
+        "After EliminateCommonSubexpression, before CheckInplace\n", *graph);
+
+    CheckInplace(graph);
+  }
+  GRAPH_DEBUG("After CheckInplace, before customPrePasses\n", *graph);
+
+  // runNondiffOptimization
+  {
+    // Run custom passes that different backends can register.
+    for (const auto& passPair : getCustomPrePasses()) {
+      passPair.first(graph);
+    }
+    GRAPH_DEBUG("After customPrePasses, before LowerSimpleTuples\n", *graph);
+
+    // TupleConstruct / TupleUnpack pairs can still be present at this point
+    // and must be removed for fusion.
+    LowerSimpleTuples(graph);
+    GRAPH_DEBUG("After LowerSimpleTuples\n", *graph);
+
+    if (tensorExprFuserEnabled()) {
+      // Remove prim::profile nodes and embed the profile info directly in the
+      // IR in value types. We're doing such transformation as optimizations
+      // that try to merge/fuse nodes in the graph (e.g. BatchMM and GraphFuser)
+      // work worse in the presence of intermittent prim::profile nodes.
+      // Optimizations relying on the type info are also responsible for
+      // inserting proper type checks. Once we're done with these optimizations
+      // we will wipe the tensor type information from the IR, so that it's not
+      // accidentally used by any other pass.
+      RemoveProfileNodesAndSpecializeTypes(graph);
+      GRAPH_DEBUG(
+          "After RemoveProfileNodesAndSpecializeTypes, before BatchMM\n",
+          *graph);
+      // Rewrite subgraphs with many MMs into expressions that batch them.
+      BatchMM(graph);
+      GRAPH_DEBUG("After BatchMM, before Fusion\n", *graph);
+
+      FuseTensorExprs(graph, getFusionGroupInlining() ? 2 : 1);
+      GRAPH_DEBUG(
+          "After Fusion, before RemoveTensorTypeSpecializations\n", *graph);
+
+      // Wipe tensor type info from the IR
+      RemoveTensorTypeSpecializations(graph);
+      GRAPH_DEBUG(
+          "After RemoveTensorTypeSpecializations, before customPostPasses\n",
+          *graph);
+    } else {
+      // Rewrite subgraphs with many MMs into expressions that batch them.
+      BatchMM(graph);
+      GRAPH_DEBUG("After BatchMM, before Fusion\n", *graph);
+
+      FuseGraph(graph, true);
+      GRAPH_DEBUG("After Fusion, before customPostPasses\n", *graph);
+    }
+
+    // Run custom post-fusion passes
+    for (const auto& passPair : getCustomPostPasses()) {
+      passPair.first(graph);
+    }
+  }
+  GRAPH_DEBUG("After customPostPasses (end of runDiffGraphPasses)\n", *graph);
+}
+
 void runNoGradOptimizations(std::shared_ptr<Graph>& graph) {
   GRAPH_DEBUG(
       "After customPostPasses (beginning of runNoGradOptimizations)\n", *graph);
@@ -498,11 +593,7 @@ void ProfilingGraphExecutorImpl::runProfilingInsensitiveOptimizations(
   DecomposeOps(graph);
   GRAPH_DEBUG("After DecomposeOps, before ConstantPropagation\n", *graph);
   ConstantPropagation(graph);
-  GRAPH_DEBUG(
-      "After ConstantPropagation, before HoistCommonExpression\n", *graph);
-  HoistCommonExpression(graph);
-  GRAPH_DEBUG(
-      "After EliminateCommonSubexpression, before ElimiateDeadCode\n", *graph);
+  GRAPH_DEBUG("After ConstantPropagation, before EliminateDeadCode\n", *graph);
   EliminateDeadCode(graph);
   GRAPH_DEBUG(
       "After EliminateDeadCode, before EliminateCommonSubexpression\n", *graph);

From 47a9e8ff320b638fcff0e25147e7f042740bf734 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Thu, 19 Aug 2021 06:37:44 -0700
Subject: [PATCH 061/530] [Static Runtime] Support __getitem__ for lists
 (#63398)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63398

This change provides a native `__getitem__` implementation for lists to avoid overhead associated with falling back to the JIT interpreter.

Test Plan: Unit tests: `buck test //caffe2/benchmarks/static_runtime:static_runtime_cpptest`

Reviewed By: hlu1

Differential Revision: D30368464

fbshipit-source-id: e0e0971508cd5d9bcf6025606993dc24ecbf6764
---
 benchmarks/static_runtime/test_scripts.h      | 18 ++++++++++--
 .../static_runtime/test_static_runtime.cc     | 21 ++++++++++----
 torch/csrc/jit/runtime/static/native_ops.cpp  | 29 ++++++++++++-------
 3 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 6045a1c2f9772..8db8da2887799 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -632,24 +632,36 @@ const auto argmin_with_keep_dim_script = R"JIT(
       return torch.argmin(a, dim, True).clone()
 )JIT";
 
-const auto getitem_tensor_script = R"JIT(
+const auto getitem_dict_tensor_script = R"JIT(
   def forward(self, key: Tensor):
       d = {key: 1}
       return d[key]
 )JIT";
 
-const auto getitem_int_script = R"JIT(
+const auto getitem_dict_int_script = R"JIT(
   def forward(self, key: int):
       d = {key: 1}
       return d[key]
 )JIT";
 
-const auto getitem_str_script = R"JIT(
+const auto getitem_dict_str_script = R"JIT(
   def forward(self, key: str):
       d = {key: 1}
       return d[key]
 )JIT";
 
+const auto getitem_list_int_script = R"JIT(
+  def forward(self, idx: int):
+      lst = [1, 2, 3]
+      return lst[idx]
+)JIT";
+
+const auto getitem_list_tensor_script = R"JIT(
+  def forward(self, tensor: Tensor, idx: int):
+      lst = [tensor, tensor]
+      return lst[idx]
+)JIT";
+
 const auto transpose_script = R"JIT(
   def forward(self, a: Tensor, dim1: int, dim2: int):
       return torch.transpose(a, dim1, dim2).clone()
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 7af49d6c8fa63..14d613f074858 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1043,19 +1043,30 @@ TEST(StaticRuntime, IndividualOps_Argmin) {
   testStaticRuntime(argmin_with_keep_dim_script, args_a, args_b);
 }
 
-TEST(StaticRuntime, IndividualOps_GetItem) {
+TEST(StaticRuntime, IndividualOps_GetItem_Dict) {
   int int_key = 0;
   std::string str_key = "str";
 
   // No need to test these multiple times, args are not tensors
-  testStaticRuntime(getitem_int_script, {int_key});
-  testStaticRuntime(getitem_str_script, {str_key});
+  testStaticRuntime(getitem_dict_int_script, {int_key});
+  testStaticRuntime(getitem_dict_str_script, {str_key});
 
   auto a = torch::tensor({1});
   auto b = torch::tensor({1, 1});
 
-  testStaticRuntime(getitem_tensor_script, {a});
-  testStaticRuntime(getitem_tensor_script, {a}, {b});
+  testStaticRuntime(getitem_dict_tensor_script, {a});
+  testStaticRuntime(getitem_dict_tensor_script, {a}, {b});
+}
+
+TEST(StaticRuntime, IndividualOps_GetItem_List) {
+  testStaticRuntime(getitem_list_int_script, {1});
+  testStaticRuntime(getitem_list_int_script, {-1});
+
+  auto a = torch::tensor({1});
+  auto b = torch::tensor({1, 1});
+
+  testStaticRuntime(getitem_list_tensor_script, {a, 1});
+  testStaticRuntime(getitem_list_tensor_script, {a, 1}, {b, -1});
 }
 
 TEST(StaticRuntime, IndividualOps_Transpose) {
diff --git a/torch/csrc/jit/runtime/static/native_ops.cpp b/torch/csrc/jit/runtime/static/native_ops.cpp
index d84b1cd8b28d2..616ad87119ab3 100644
--- a/torch/csrc/jit/runtime/static/native_ops.cpp
+++ b/torch/csrc/jit/runtime/static/native_ops.cpp
@@ -9,6 +9,7 @@
 #include <ATen/native/TensorAdvancedIndexing.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/runtime/register_ops_utils.h>
 #include <torch/csrc/jit/runtime/vararg_functions.h>
 
 namespace torch {
@@ -100,17 +101,25 @@ REGISTER_NATIVE_OPERATOR_FUNCTOR(
       if (n->inputs().size() != 2) {
         return nullptr;
       }
-      // TODO: make __getitem__ work for other container types
-      if (n->input(0)->type()->castRaw<DictType>() == nullptr) {
-        return nullptr;
+
+      if (n->input(0)->type()->castRaw<DictType>()) {
+        return [](ProcessedNode* p_node) {
+          auto dict = p_node->Input(0).toGenericDict();
+          auto key = p_node->Input(1);
+          auto value = dict.find(key);
+          TORCH_CHECK(value != dict.end(), "Key not in dict: ", key);
+          p_node->Output(0) = value->value();
+        };
+      } else if (n->input(0)->type()->castRaw<ListType>()) {
+        return [](ProcessedNode* p_node) {
+          auto list = p_node->Input(0).toList();
+          auto idx = p_node->Input(1).toInt();
+          p_node->Output(0) = getItem(list, idx);
+        };
       }
-      return [](ProcessedNode* p_node) {
-        auto dict = p_node->Input(0).toGenericDict();
-        auto key = p_node->Input(1);
-        auto value = dict.find(key);
-        TORCH_CHECK(value != dict.end(), "Key not in dict: ", key);
-        p_node->Output(0) = value->value();
-      };
+
+      // TODO(T98581096): make __getitem__ work for other container types
+      return nullptr;
     });
 
 REGISTER_NATIVE_OPERATOR_FUNCTOR(

From 2d5b19f62b636eb9924bcef43a23690c2b8018b4 Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Thu, 19 Aug 2021 06:47:31 -0700
Subject: [PATCH 062/530] Update full backward hook doc with not-same-object
 note (#63245)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/61446

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63245

Reviewed By: ejguan

Differential Revision: D30352656

Pulled By: albanD

fbshipit-source-id: 7000ecb54a80f2da968ec7600b98574b608578ae
---
 torch/nn/modules/module.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index f4ef4533de600..2376422117306 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -145,13 +145,6 @@ def register_module_full_backward_hook(
         This adds global state to the `nn.module` module
         and it is only intended for debugging/profiling purposes.
 
-        The current implementation will not have the presented behavior
-        for complex :class:`Module` that perform many operations.
-        In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only
-        contain the gradients for a subset of the inputs and outputs.
-        For such :class:`Module`, you should use :func:`torch.Tensor.register_hook`
-        directly on a specific input or output to get the required gradients.
-
     The hook will be called every time the gradients with respect to module
     inputs are computed. The hook should have the following signature::
 
@@ -165,6 +158,10 @@ def register_module_full_backward_hook(
     in :attr:`grad_input` and :attr:`grad_output` will be ``None`` for all non-Tensor
     arguments.
 
+    For technical reasons, when this hook is applied to a Module, its forward function will
+    receive a view of each Tensor passed to the Module. Similarly the caller will receive a view
+    of each Tensor returned by the Module's forward function.
+
     Global hooks are called before hooks registered with `register_backward_hook`
 
     Returns:
@@ -907,6 +904,10 @@ def register_full_backward_hook(
         in :attr:`grad_input` and :attr:`grad_output` will be ``None`` for all non-Tensor
         arguments.
 
+        For technical reasons, when this hook is applied to a Module, its forward function will
+        receive a view of each Tensor passed to the Module. Similarly the caller will receive a view
+        of each Tensor returned by the Module's forward function.
+
         .. warning ::
             Modifying inputs or outputs inplace is not allowed when using backward hooks and
             will raise an error.

From e7c4988b526afa73b8bae4b18daaa9a86dbfbff2 Mon Sep 17 00:00:00 2001
From: Ilqar Ramazanli <iramazanli@fb.com>
Date: Thu, 19 Aug 2021 07:15:16 -0700
Subject: [PATCH 063/530] To fix the chainability at epoch zero for some
 schedulers (#63457)

Summary:
It has been discussed in the https://github.com/pytorch/pytorch/pull/60836#issuecomment-899084092 that we have observed an obstacle to chain some type of learning rate schedulers. In particular we observed

* some of the learning rate schedulers returns initial learning rates at epoch 0 as
```
       return self.base_lrs`
```

* This can be a problem when two schedulers called as chained as

```
     scheduler1.step()
     scheduler2.step()
```

in particular, we completely ignore the effect of scheduler1 at epoch 0.  This could not be an issue if at epoch 0, scheduler1 was ineffective as in many schedulers, however for schedulers as WarmUp Schedulers, where at epoch 0 schedulers multiplicative value is smaller than 1 this could lead to undesired behaviors.

The following code snippet illustrates the problem better

## Reproducing the bug

```python
import torch
from torch.nn import Parameter
from torch.optim import SGD
from torch.optim.lr_scheduler import WarmUpLR, ExponentialLR

model = [Parameter(torch.randn(2, 2, requires_grad=True))]
optimizer = SGD(model, 1.0)
scheduler1 = WarmUpLR(optimizer, warmup_factor=0.1, warmup_iters=5, warmup_method="constant")
scheduler2 = ExponentialLR(optimizer, gamma=0.9)

for epoch in range(10):
     print(epoch, scheduler2.get_last_lr()[0])
     optimizer.step()
     scheduler1.step()
     scheduler2.step()
```

### Current Result

```
0 1.0
1 0.9
2 0.81
3 0.7290000000000001
4 0.6561000000000001
5 5.904900000000001
6 5.314410000000001
7 4.782969000000001
8 4.304672100000001
9 3.874204890000001
```

### Expected Result

```
0 1.0
1 0.9
2 0.81
3 0.7290000000000001
4 0.6561000000000001
5 0.5904900000000001
6 0.5314410000000001
7 0.4782969000000001
8 0.4304672100000001
9 0.3874204890000001
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63457

Reviewed By: datumbox

Differential Revision: D30424160

Pulled By: iramazanli

fbshipit-source-id: 3e15af8d278c872cd6f53406b55f4d3ce5002867
---
 test/test_optim.py          | 12 ++++++------
 torch/optim/lr_scheduler.py |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/test/test_optim.py b/test/test_optim.py
index 20b8e5c443de5..01ec43bbea883 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -440,8 +440,8 @@ def test_adam(self):
             )
             self._test_basic_cases(
                 lambda weight, bias: optimizer([weight, bias], lr=1e-3, amsgrad=True),
-                [lambda opt: ExponentialLR(opt, gamma=0.9),
-                 lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant")]
+                [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant"),
+                 lambda opt: ExponentialLR(opt, gamma=0.9)]
             )
             self._test_basic_cases(
                 lambda weight, bias: optimizer([weight, bias], lr=1e-3, amsgrad=True),
@@ -1294,8 +1294,8 @@ def test_compound_exp_and_linear_warmup_lr(self):
         for i in range(iters):
             single_targets[i] *= factor + i / iters * (1 - factor)
         targets = [single_targets, [x * epochs for x in single_targets]]
-        schedulers[0] = ExponentialLR(self.opt, gamma=0.9)
-        schedulers[1] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        schedulers[0] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        schedulers[1] = ExponentialLR(self.opt, gamma=0.9)
         self._test(schedulers, targets, epochs)
 
     def test_compound_step_and_constant_warmup(self):
@@ -1361,8 +1361,8 @@ def test_compound_cosanneal_and_linear_warmup_lr(self):
         for i in range(iters):
             single_targets[i] *= factor + i / iters * (1 - factor)
         targets = [single_targets, [x * epochs for x in single_targets]]
-        schedulers[0] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min)
-        schedulers[1] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        schedulers[0] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        schedulers[1] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min)
         self._test(schedulers, targets, epochs)
 
     def test_compound_cosanneal_and_exp_lr(self):
diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py
index 78a8cfad0d637..657a35ad681b0 100644
--- a/torch/optim/lr_scheduler.py
+++ b/torch/optim/lr_scheduler.py
@@ -328,7 +328,7 @@ def get_lr(self):
             return [group['lr'] * lmbda(self.last_epoch)
                     for lmbda, group in zip(self.lr_lambdas, self.optimizer.param_groups)]
         else:
-            return list(self.base_lrs)
+            return [group['lr'] for group in self.optimizer.param_groups]
 
 
 class StepLR(_LRScheduler):
@@ -526,7 +526,7 @@ def get_lr(self):
                           "please use `get_last_lr()`.", UserWarning)
 
         if self.last_epoch == 0:
-            return self.base_lrs
+            return [group['lr'] for group in self.optimizer.param_groups]
         return [group['lr'] * self.gamma
                 for group in self.optimizer.param_groups]
 
@@ -586,7 +586,7 @@ def get_lr(self):
                           "please use `get_last_lr()`.", UserWarning)
 
         if self.last_epoch == 0:
-            return self.base_lrs
+            return [group['lr'] for group in self.optimizer.param_groups]
         elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
             return [group['lr'] + (base_lr - self.eta_min) *
                     (1 - math.cos(math.pi / self.T_max)) / 2

From be9be9bfdd3be7cbd5b03027bb5597bc234bc79c Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Thu, 19 Aug 2021 07:49:43 -0700
Subject: [PATCH 064/530] add distributed/_sharded_tensor/test_sharded_tensor
 to ROCM_BLOCKLIST (#63508)

Summary:
Fixes current ROCm CI test2 brokenness until tensorpipe is fully supported by ROCm.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63508

Reviewed By: ejguan

Differential Revision: D30406450

Pulled By: walterddr

fbshipit-source-id: c07509271d5d33901f3eaf7ffb916dc3626e1f9a
---
 test/run_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/run_test.py b/test/run_test.py
index e40f580bbe9e7..e043bcd0ad152 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -245,6 +245,7 @@
     'distributed/rpc/test_faulty_agent',
     'distributed/rpc/test_tensorpipe_agent',
     'distributed/rpc/cuda/test_tensorpipe_agent',
+    'distributed/_sharded_tensor/test_sharded_tensor',
     'test_determination',
     'test_multiprocessing',
     'test_jit_legacy',

From f596aa8b77d6c57dd82f33a45926fad95ab2a21e Mon Sep 17 00:00:00 2001
From: Till Hoffmann <tillahoffmann@gmail.com>
Date: Thu, 19 Aug 2021 08:28:55 -0700
Subject: [PATCH 065/530] Poisson zero rate (#61511)

Summary:
This PR fixes https://github.com/pytorch/pytorch/issues/53485 by allowing zero rates for the Poisson distribution. This implementation is consistent with `scipy.stats.poisson` which admits zero rates. In addition to addressing the aforementioned issue, this PR makes two supporting changes:

1. add a `nonnegative` constraint to enforce non-negative rates for the Poisson distribution.
2. adjust the evaluation of the gradient of `xlogy` such that it is well defined for `x == 0 and y == 0`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/61511

Reviewed By: ejguan

Differential Revision: D30352917

Pulled By: albanD

fbshipit-source-id: f3d33da58360e80d75eb83519f199b93232a2a2d
---
 test/distributions/test_distributions.py   | 26 ++++++++++++++++++----
 tools/autograd/derivatives.yaml            |  4 ++--
 torch/distributions/constraint_registry.py |  2 ++
 torch/distributions/constraints.py         |  1 +
 torch/distributions/poisson.py             |  4 ++--
 5 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py
index 85e4dbacd4b6a..319b55795addb 100644
--- a/test/distributions/test_distributions.py
+++ b/test/distributions/test_distributions.py
@@ -387,6 +387,12 @@ def is_all_nan(tensor):
         },
         {
             'rate': 0.2,
+        },
+        {
+            'rate': torch.tensor([0.0], requires_grad=True),
+        },
+        {
+            'rate': 0.0,
         }
     ]),
     Example(RelaxedBernoulli, [
@@ -667,7 +673,7 @@ def is_all_nan(tensor):
     ]),
     Example(Poisson, [
         {
-            'rate': torch.tensor([0.0], requires_grad=True),
+            'rate': torch.tensor([-0.1], requires_grad=True),
         },
         {
             'rate': -1.0,
@@ -1315,17 +1321,29 @@ def test_poisson_shape(self):
     def test_poisson_log_prob(self):
         rate = torch.randn(2, 3).abs().requires_grad_()
         rate_1d = torch.randn(1).abs().requires_grad_()
+        rate_zero = torch.zeros([], requires_grad=True)
 
-        def ref_log_prob(idx, x, log_prob):
-            l = rate.view(-1)[idx].detach()
+        def ref_log_prob(ref_rate, idx, x, log_prob):
+            l = ref_rate.view(-1)[idx].detach()
             expected = scipy.stats.poisson.logpmf(x, l)
             self.assertEqual(log_prob, expected, atol=1e-3, rtol=0)
 
         set_rng_seed(0)
-        self._check_log_prob(Poisson(rate), ref_log_prob)
+        self._check_log_prob(Poisson(rate), lambda *args: ref_log_prob(rate, *args))
+        self._check_log_prob(Poisson(rate_zero), lambda *args: ref_log_prob(rate_zero, *args))
         self._gradcheck_log_prob(Poisson, (rate,))
         self._gradcheck_log_prob(Poisson, (rate_1d,))
 
+        # We cannot check gradients automatically for zero rates because the finite difference
+        # approximation enters the forbidden parameter space. We instead compare with the
+        # theoretical results.
+        dist = Poisson(rate_zero)
+        s = dist.sample()
+        dist.log_prob(s).backward()
+        torch.testing.assert_allclose(rate_zero.grad, -1.0)
+        dist.log_prob(torch.ones_like(rate_zero)).backward()
+        torch.testing.assert_allclose(rate_zero.grad, torch.inf)
+
     @unittest.skipIf(IS_MACOS, "See https://github.com/pytorch/pytorch/issues/60347")
     @unittest.skipIf(not TEST_NUMPY, "Numpy not found")
     def test_poisson_sample(self):
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index b52b69018e177..49e574a1651ba 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -801,11 +801,11 @@
 
 - name: xlogy.Tensor(Tensor self, Tensor other) -> Tensor
   self: grad * at::xlogy((self != 0), other)
-  other: grad * self / other
+  other: grad * at::where(other.isnan() | (self != 0), self / other, zeros_like(other))
   result: self_t * at::xlogy((self_p != 0), other_p) + other_t * self_p / other_p
 
 - name: xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor
-  other: grad * self / other
+  other: grad * at::where(other.isnan() | (!self.equal(0)), self / other, zeros_like(other))
   result: auto_element_wise
 
 - name: xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor
diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py
index cbe987e72c798..c03f0ad02d2c6 100644
--- a/torch/distributions/constraint_registry.py
+++ b/torch/distributions/constraint_registry.py
@@ -173,7 +173,9 @@ def _transform_to_independent(constraint):
 
 
 @biject_to.register(constraints.positive)
+@biject_to.register(constraints.nonnegative)
 @transform_to.register(constraints.positive)
+@transform_to.register(constraints.nonnegative)
 def _transform_to_positive(constraint):
     return transforms.ExpTransform()
 
diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py
index 99808b6b80beb..5eed19afd09ec 100644
--- a/torch/distributions/constraints.py
+++ b/torch/distributions/constraints.py
@@ -545,6 +545,7 @@ def check(self, value):
 real = _Real()
 real_vector = independent(real, 1)
 positive = _GreaterThan(0.)
+nonnegative = _GreaterThanEq(0.)
 greater_than = _GreaterThan
 greater_than_eq = _GreaterThanEq
 less_than = _LessThan
diff --git a/torch/distributions/poisson.py b/torch/distributions/poisson.py
index 954ed6e0d3206..9adb641d7fcee 100644
--- a/torch/distributions/poisson.py
+++ b/torch/distributions/poisson.py
@@ -24,7 +24,7 @@ class Poisson(ExponentialFamily):
     Args:
         rate (Number, Tensor): the rate parameter
     """
-    arg_constraints = {'rate': constraints.positive}
+    arg_constraints = {'rate': constraints.nonnegative}
     support = constraints.nonnegative_integer
 
     @property
@@ -60,7 +60,7 @@ def log_prob(self, value):
         if self._validate_args:
             self._validate_sample(value)
         rate, value = broadcast_all(self.rate, value)
-        return (rate.log() * value) - rate - (value + 1).lgamma()
+        return value.xlogy(rate) - rate - (value + 1).lgamma()
 
     @property
     def _natural_params(self):

From e1334512a3aa0f8f8a3a0a59cb868355a33b6233 Mon Sep 17 00:00:00 2001
From: anjali411 <chourdiaanjali123@gmail.com>
Date: Thu, 19 Aug 2021 08:41:08 -0700
Subject: [PATCH 066/530] Add fastpath for dot and vdot when the inputs have
 conj bit set to True (#62915)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62915

As much as 45% and 20% perf improvement on CUDA and CPU respectively.
consistent improvement in perf for all cases -- see perf numbers in comments below

Test Plan: Imported from OSS

Reviewed By: heitorschueroff

Differential Revision: D30404006

Pulled By: anjali411

fbshipit-source-id: 565940da28c7761d993cf43346932c24292e8a4d
---
 aten/src/ATen/ConjugateFallback.cpp           |  4 +++
 aten/src/ATen/native/Blas.cpp                 | 27 ++++++++++++++++---
 aten/src/ATen/native/cuda/Blas.cpp            | 23 +++++++++++++++-
 .../_internal/common_methods_invocations.py   | 20 ++++++++++----
 4 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/aten/src/ATen/ConjugateFallback.cpp b/aten/src/ATen/ConjugateFallback.cpp
index 3ae9859f2d618..a64ef4950940b 100644
--- a/aten/src/ATen/ConjugateFallback.cpp
+++ b/aten/src/ATen/ConjugateFallback.cpp
@@ -56,6 +56,10 @@ TORCH_LIBRARY_IMPL(aten, Conjugate, m) {
   m.impl("view", torch::CppFunction::makeFallthrough());
   m.impl("_unsafe_view", torch::CppFunction::makeFallthrough());
   m.impl("reshape", torch::CppFunction::makeFallthrough());
+  m.impl("dot", torch::CppFunction::makeFallthrough());
+  m.impl("vdot", torch::CppFunction::makeFallthrough());
+  m.impl("dot.out", torch::CppFunction::makeFallthrough());
+  m.impl("vdot.out", torch::CppFunction::makeFallthrough());
 }
 
 } // namespace at
diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp
index ab522ac21ea92..114de632a384a 100644
--- a/aten/src/ATen/native/Blas.cpp
+++ b/aten/src/ATen/native/Blas.cpp
@@ -133,8 +133,19 @@ inline void dot_check(const Tensor& self, const Tensor& other) {
 }
 
 Tensor dot(const Tensor &self, const Tensor &other){
-  at::NoNamesGuard guard;
+  if (self.is_complex()) {
+    if (self.is_conj()) {
+      if (other.is_conj()) {
+        return (at::native::dot(self.conj(), other.conj())).conj();
+       } else {
+         return at::native::vdot(self.conj(), other);
+       }
+    } else if (other.is_conj()) {
+      return at::native::vdot(other.conj(), self);
+    }
+  }
 
+  at::NoNamesGuard guard;
   dot_check(self, other);
 
   return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, self.scalar_type(), "dot", [&] {
@@ -145,15 +156,25 @@ Tensor dot(const Tensor &self, const Tensor &other){
 }
 
 Tensor vdot(const Tensor &self, const Tensor &other){
-  at::NoNamesGuard guard;
-
   // Dispatch to `dot` for real dtypes.
   if (!self.is_complex()){
     return at::dot(self, other);
   }
 
+  if (self.is_conj()) {
+    if (other.is_conj()) {
+      return at::native::vdot(other.conj(), self.conj());
+    } else {
+      return at::native::dot(self.conj(), other);
+    }
+  } else if (other.is_conj()) {
+    return (at::native::dot(self, other.conj())).conj();
+  }
+
+  at::NoNamesGuard guard;
   // For complex dtypes.
   dot_check(self, other);
+
   return AT_DISPATCH_COMPLEX_TYPES(self.scalar_type(), "vdot", [&] {
     Tensor result = at::empty({}, self.options());
     result.fill_(vdot_impl<scalar_t>(self.numel(), self.data_ptr<scalar_t>(), self.stride(0), other.data_ptr<scalar_t>(), other.stride(0)));
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index b0fe0ac7a05b6..b4479101c59c9 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -353,8 +353,19 @@ inline void dot_check(const Tensor& self, const Tensor& other) {
 } // anonymous namespace
 
 Tensor dot_cuda(const Tensor& self, const Tensor& other) {
-  at::NoNamesGuard guard;
+  if (self.is_complex()) {
+    if (self.is_conj()) {
+      if (other.is_conj()) {
+        return (dot_cuda(self.conj(), other.conj())).conj();
+       } else {
+         return vdot_cuda(self.conj(), other);
+       }
+    } else if (other.is_conj()) {
+      return vdot_cuda(other.conj(), self);
+    }
+  }
 
+  at::NoNamesGuard guard;
   dot_check(self, other);
 
   const int n = static_cast<int>(self.numel());
@@ -391,6 +402,16 @@ Tensor vdot_cuda(const Tensor& self, const Tensor& other) {
     return dot_cuda(self, other);
   }
 
+  if (self.is_conj()) {
+    if (other.is_conj()) {
+      return vdot_cuda(other.conj(), self.conj());
+    } else {
+      return dot_cuda(self.conj(), other);
+    }
+  } else if (other.is_conj()) {
+    return (dot_cuda(self, other.conj())).conj();
+  }
+
   at::NoNamesGuard guard;
   dot_check(self, other);
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 7e57d5d693ec7..873d91c0e1293 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -1377,14 +1377,24 @@ def sample_inputs_bmm(self, device, dtype, requires_grad, **kwargs):
     )
 
 def sample_inputs_dot_vdot(self, device, dtype, requires_grad, **kwargs):
-    return (
-        SampleInput(
+    sample_inputs = []
+    sample_inputs.append(SampleInput(
+        make_tensor((S, ), device, dtype, low=None, high=None, requires_grad=requires_grad),
+        args=(
+            make_tensor((S, ), device, dtype, low=None, high=None, requires_grad=requires_grad),
+        )
+    ))
+    if dtype.is_complex:
+        # dot/vdot for (conj(input), conj(arg_tensor)) and (conj(input), arg_tensor)
+        # is tested in test_conj_view (which tests operations with only conjugated input tensor
+        # -- not conjugated arg tensors)
+        sample_inputs.append(SampleInput(
             make_tensor((S, ), device, dtype, low=None, high=None, requires_grad=requires_grad),
             args=(
-                make_tensor((S, ), device, dtype, low=None, high=None, requires_grad=requires_grad),
+                torch.conj(make_tensor((S, ), device, dtype, low=None, high=None, requires_grad=requires_grad)),
             )
-        ),
-    )
+        ))
+    return sample_inputs
 
 def sample_inputs_addmv(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)

From 79693bb86a3f601a5c0d3da52d99acec95bb48c1 Mon Sep 17 00:00:00 2001
From: David Esiobu <davides@fb.com>
Date: Thu, 19 Aug 2021 09:15:34 -0700
Subject: [PATCH 067/530] Use linecache.lazycache to cache generated code.
 (#63453)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63453

Instead of patching linecache.getlines, use linecache.lazycache and
parts of the loader protocol described in PEP-302

Test Plan:
python3 test/test_fx.py

Imported from OSS

Reviewed By: suo

Differential Revision: D30388176

fbshipit-source-id: 92933711ecf3a21a07e1d6b0d1185ab0efd8341c
---
 torch/fx/graph_module.py | 70 +++++++++++++++++++++++++++++-----------
 1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
index 85479f069d53d..b87aeaaa78a03 100644
--- a/torch/fx/graph_module.py
+++ b/torch/fx/graph_module.py
@@ -15,28 +15,60 @@
 import os
 import warnings
 
-# normal exec loses the source code, however we can patch
-# the linecache module to still recover it.
-# using exec_with_source will add it to our local cache
+# Normal exec loses the source code, however we can work with
+# the linecache module to recover it.
+# Using exec_with_source will add it to our local cache
 # and then tools like TorchScript will be able to get source info.
-_next_id = 0
+class EvalCacheLoader(object):
+    def __init__(self):
+        self.eval_cache = {}
+        self.next_id = 0
+
+    def cache(self, src: str, globals: Dict[str, Any]):
+        """Store the source in a private cache, and add a lazy entry in linecache
+        that allows the source to be retrieved by 'filename'.
+
+        Args:
+            src (str): The module source to cache
+            globals (dict): The module globals
+
+        Returns:
+            str: The cache key (and dummy filename) generated for src.
+        """
+
+        key = self._get_key()
+        self.eval_cache[key] = src
+
+        # Don't mutate globals so that this loader is only used
+        # to populate linecache, and doesn't interact with other modules
+        # that might check `__loader__`
+        globals_copy = globals.copy()
+        globals_copy['__file__'] = key
+        globals_copy['__name__'] = key
+        globals_copy['__loader__'] = self
+        linecache.lazycache(key, globals_copy)
+
+        return key
+
+    # Part of the loader protocol (PEP 302)
+    # linecache will use this method when trying to find source code
+    def get_source(self, module_name) -> Optional[str]:
+        if module_name in self.eval_cache:
+            return self.eval_cache[module_name]
+        return None
+
+    def _get_key(self):
+        key = f'<eval_with_key>.{self.next_id}'
+        self.next_id += 1
+        return key
+
+_loader = EvalCacheLoader()
+
+
 def exec_with_source(src: str, globals: Dict[str, Any]):
-    global _next_id
-    key = f'<eval_with_key_{_next_id}>'
-    _next_id += 1
-    _eval_cache[key] = [line + '\n' for line in src.splitlines()]
+    key = _loader.cache(src, globals)
     exec(compile(src, key, 'exec'), globals)
 
-# patch linecache so that any code we exec using exec_with_source
-# works with inspect
-_eval_cache : Dict[str, List[str]] = {}
-_orig_getlines = linecache.getlines
-def patched_getline(*args, **kwargs):
-    if args[0] in _eval_cache:
-        return _eval_cache[args[0]]
-    return _orig_getlines(*args, **kwargs)
-linecache.getlines = patched_getline
-
 
 def _forward_from_src(src: str, globals: Dict[str, Any]):
     # avoid mutating the passed in dict
@@ -539,7 +571,7 @@ def generate_error_message(frame_summary: traceback.FrameSummary) -> str:
             # auxiliary variables (for readability)
             err_lineno = frame_summary.lineno
             err_line_len = len(frame_summary.line)
-            all_src_lines = _eval_cache[frame_summary.filename]
+            all_src_lines = linecache.getlines(frame_summary.filename)
 
             # constituent substrings of the error message
             tb_repr = traceback.format_exc()

From e7831fe5de574e4ce542e02a6c56f57cc0493bf9 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Thu, 19 Aug 2021 09:49:12 -0700
Subject: [PATCH 068/530] [PyTorch] Test IValue move/copy/assign/swap more
 (#54717)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/54717

Hit more tags in these tests
ghstack-source-id: 136140508

Test Plan: buck test //caffe2/aten:ivalue_test

Reviewed By: anjali411

Differential Revision: D27339736

fbshipit-source-id: 610c8e92846bb70ba725ab117440326ab50af5ce
---
 aten/src/ATen/test/ivalue_test.cpp | 113 +++++++++++++++++++++++++----
 1 file changed, 100 insertions(+), 13 deletions(-)

diff --git a/aten/src/ATen/test/ivalue_test.cpp b/aten/src/ATen/test/ivalue_test.cpp
index 915e267347170..3ae18390f8f6e 100644
--- a/aten/src/ATen/test/ivalue_test.cpp
+++ b/aten/src/ATen/test/ivalue_test.cpp
@@ -90,6 +90,18 @@ TEST(IValueTest, Basic) {
   ASSERT_EQ(complex_tuple.toTuple()->elements()[1], foo1);
 }
 
+TEST(IValueTest, BasicStorage) {
+  at::Storage emptyStorage;
+  at::Storage nonemptyStorage(at::rand({3, 4}).storage());
+  IValue ivEmpty(emptyStorage);
+  IValue ivNonempty(nonemptyStorage);
+
+  ASSERT_TRUE(ivEmpty.isStorage());
+  ASSERT_TRUE(ivNonempty.isStorage());
+  ASSERT_EQ(emptyStorage.unsafeGetStorageImpl(), ivEmpty.toStorage().unsafeGetStorageImpl());
+  ASSERT_EQ(nonemptyStorage.unsafeGetStorageImpl(), ivNonempty.toStorage().unsafeGetStorageImpl());
+}
+
 TEST(IValueTest, ComplexDict) {
   typedef c10::complex<double> c_type;
   c10::Dict<c_type, c_type> m;
@@ -102,21 +114,70 @@ TEST(IValueTest, ComplexDict) {
   ASSERT_EQ(m_.at(num1), 2 * num1);
   ASSERT_EQ(m_.at(num2), 2 * num2);
 }
-static std::array<IValue, 5> makeSampleIValues() {
-  return { at::rand({3, 4}), "hello", 42, true, 1.5 };
-}
 
-static std::array<IValue, 5> makeMoreSampleIValues() {
-  return { at::rand({3, 4}), "goodbye", 23, false, 0.5 };
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+static std::array<IValue, 16> makeSampleIValues() {
+  return {
+    IValue(),
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    at::rand({3, 4}),
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    at::rand({3, 4}).storage(),
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    1.5,
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    c10::complex<double>(2.5, -0.5),
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    42,
+    true,
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    std::make_tuple(23, "hello"),
+    "hello",
+    c10::make_intrusive<caffe2::Blob>(),
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    c10::List<int64_t>({1, 2, 3}),
+    c10::Dict<std::string, std::string>(),
+    c10::make_intrusive<ivalue::Future>(FloatType::get()),
+    c10::Device(c10::DeviceType::CPU, 0),
+    c10::Stream(c10::Stream::DEFAULT, c10::Device(c10::DeviceType::CPU, 0)),
+    c10::make_intrusive<ivalue::Object>(c10::StrongTypePtr(nullptr, ClassType::create("class1", {})), 1),
+  };
 }
 
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+static std::array<IValue, 16> makeMoreSampleIValues() {
+  return {
+    IValue(),
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    at::rand({3, 4}),
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    at::rand({3, 4}).storage(),
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    2.5,
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    c10::complex<double>(2.7, -0.3),
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    43,
+    false,
+    std::make_tuple(1, "goodbye"),
+    "goodbye",
+    c10::make_intrusive<caffe2::Blob>(),
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
+    c10::List<int64_t>({4, 5, 6}),
+    c10::Dict<std::string, std::string>(),
+    c10::make_intrusive<ivalue::Future>(IntType::get()),
+    c10::Device(c10::DeviceType::CUDA, 2),
+    c10::Stream(c10::Stream::DEFAULT, c10::Device(c10::DeviceType::CUDA, 1)),
+    c10::make_intrusive<ivalue::Object>(c10::StrongTypePtr(nullptr, ClassType::create("class2", {})), 2),
+  };}
+
 // IValue::operator== doesn't seem to work on Tensors.
 #define EXPECT_IVALUE_EQ(a, b)                          \
   EXPECT_EQ((a).isTensor(), (b).isTensor());            \
   if ((a).isTensor()) {                                 \
-    EXPECT_TRUE(a.toTensor().equal(b.toTensor()));      \
+    EXPECT_TRUE((a).toTensor().equal((b).toTensor()));  \
   } else {                                              \
-    EXPECT_EQ(a, b);                                    \
+    EXPECT_EQ((a), (b));                                \
   }
 
 TEST(IValueTest, Swap) {
@@ -580,13 +641,31 @@ TEST(IValueTest, IdentityComparisonAndHashing) {
 
   ASSERT_EQ(sampleIValues.size(), moreSampleIValues.size());
   for (int ii = 0; ii < sampleIValues.size(); ++ii) {
-    // Constant strings will have the same pointer value.
-    if (sampleIValues[ii].isPtrType() && !sampleIValues[ii].isString()) {
-      EXPECT_NE(sampleIValues[ii].hash(), sampleIValues2[ii].hash());
-    } else {
-      EXPECT_EQ(sampleIValues[ii].hash(), sampleIValues2[ii].hash());
+    if (sampleIValues[ii].isComplexDouble() ||
+        sampleIValues[ii].isBlob() ||
+        sampleIValues[ii].isList() ||
+        sampleIValues[ii].isFuture() ||
+        sampleIValues[ii].isStream() ||
+        sampleIValues[ii].isObject() ||
+        sampleIValues[ii].isGenericDict()) {
+      // Not hashable.
+      continue;
+    }
+    // Tuples may or may not have the same hash across instantiations.
+    if (!sampleIValues[ii].isTuple()) {
+      // Constant strings will have the same pointer value.
+      if (sampleIValues[ii].isPtrType() && !sampleIValues[ii].isString()) {
+        EXPECT_NE(sampleIValues[ii].hash(), sampleIValues2[ii].hash())
+          << " at index " << ii;
+      } else {
+        EXPECT_EQ(sampleIValues[ii].hash(), sampleIValues2[ii].hash())
+          << " at index " << ii;
+      }
+    }
+    if (!sampleIValues[ii].isNone() && !moreSampleIValues[ii].isNone()) {
+      EXPECT_NE(sampleIValues[ii].hash(), moreSampleIValues[ii].hash())
+        << " at index " << ii;
     }
-    EXPECT_NE(sampleIValues[ii].hash(), moreSampleIValues[ii].hash());
   }
 }
 
@@ -656,5 +735,13 @@ TEST(IValueTest, ScalarBool) {
   EXPECT_TRUE(actual.toBool());
 }
 
+TEST(IValueTest, ToWeakAndBack) {
+  auto sampleInputs = makeSampleIValues();
+  for (const auto& sample: sampleInputs) {
+    WeakIValue weak(sample);
+    EXPECT_IVALUE_EQ(sample, weak.lock());
+  }
+}
+
 // TODO(gmagogsfm): Add type conversion test?
 } // namespace c10

From 885e312ce05b51294b27737347f736d632d8d8e6 Mon Sep 17 00:00:00 2001
From: Shiyan Deng <dsy842974287@fb.com>
Date: Thu, 19 Aug 2021 10:16:26 -0700
Subject: [PATCH 069/530] Add permute021 fx2trt converter (#63238)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63238

Reviewed By: yinghai

Differential Revision: D30295373

fbshipit-source-id: 2a189fe485edaa978fd03e4b8d8582edb34ec648
---
 torch/fx/experimental/fx2trt/fx2trt.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py
index 9879fd7e0952e..0e7cc24c18be5 100644
--- a/torch/fx/experimental/fx2trt/fx2trt.py
+++ b/torch/fx/experimental/fx2trt/fx2trt.py
@@ -302,8 +302,9 @@ def validate_conversion(self):
                 missing_converter.add(f"{node.op} {node.target}")
             elif node.op == "call_module":
                 submod = self.fetch_attr(node.target)
-                if not CONVERTERS.get(type(submod)):
-                    missing_converter.add(f"{node.op} {type(submod)}")
+                submod_type = getattr(submod, "_base_class_origin", type(submod))
+                if not CONVERTERS.get(submod_type):
+                    missing_converter.add(f"{node.op} {submod_type}")
 
         return missing_converter
 
@@ -373,12 +374,11 @@ def placeholder(self, target, args, kwargs):
     def call_module(self, target, args, kwargs):
         assert isinstance(target, str)
         submod = self.fetch_attr(target)
-        converter = CONVERTERS.get(type(submod))
+        submod_type = getattr(submod, "_base_class_origin", type(submod))
+        converter = CONVERTERS.get(submod_type)
 
         if not converter:
-            raise RuntimeError(
-                f"Conversion of module of type {type(submod)} not currently supported!"
-            )
+            raise RuntimeError(f'Conversion of module of type {submod_type} not currently supported!')
 
         return converter(self.network, submod, args, kwargs, self._cur_node_name)
 

From 3aa4521fe8151fd9e072a79b44d34b725e00550e Mon Sep 17 00:00:00 2001
From: Shiyan Deng <dsy842974287@fb.com>
Date: Thu, 19 Aug 2021 10:16:26 -0700
Subject: [PATCH 070/530] [hpc] use fx2trt for exploration track (#63535)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63535

Reviewed By: yinghai, jianyuh

Differential Revision: D30272810

fbshipit-source-id: 61f3edf2a2282cd8c268a92acf92feb05a6ae3e1
---
 .../fx2trt/converters/acc_ops_converters.py   | 124 +++++++++++++++++-
 torch/fx/experimental/fx_acc/acc_ops.py       |  16 +++
 2 files changed, 133 insertions(+), 7 deletions(-)

diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
index 88a74fe9e32c0..e7fcb94475b3e 100644
--- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
+++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
@@ -842,6 +842,77 @@ def acc_ops_reshape(network, target, args, kwargs, name):
     layer.name = name
     return layer.get_output(0)
 
+@tensorrt_converter(acc_ops.slice_tensor)
+def acc_ops_slice_tensor(network, target, args, kwargs, name):
+    input_val = kwargs["input"]
+
+    if not isinstance(input_val, trt.tensorrt.ITensor):
+        raise RuntimeError(f"slice_tensor received input {input_val} that is not part "
+                           "of the TensorRT region!")
+
+    dims = kwargs["dims"]
+    if network.has_implicit_batch_dimension:
+        if not len(dims):
+            raise RuntimeError("dim argument cannot be empty!")
+        if any([dim == 0 for dim in dims]):
+            raise RuntimeError(
+                f"We do not support slice_tensor at batch dim when it's implicit, got {dims}!"
+            )
+        dims = [d - 1 for d in dims]
+    else:
+        raise RuntimeError("We don't support slice_tensor with explicit batch dimension yet!")
+
+    start = [0] * len(input_val.shape)
+    stride = [1] * len(start)
+    output_shape = list(input_val.shape)
+    starts = kwargs["starts"]
+    stops = kwargs["stops"]
+    steps = kwargs["steps"]
+
+    for i, dim in enumerate(dims):
+        start[dim] = starts[i]
+        stride[dim] = steps[i]
+        output_shape[dim] = (stops[i] - start[i]) // steps[i]
+
+    layer = network.add_slice(input_val, start=start, shape=output_shape, stride=stride)
+    layer.name = name
+    return layer.get_output(0)
+
+@tensorrt_converter(acc_ops.split)
+def acc_ops_split(network, target, args, kwargs, name):
+    input_val = kwargs["input"]
+
+    if not isinstance(input_val, trt.tensorrt.ITensor):
+        raise RuntimeError(f"split received input {input_val} that is not part "
+                           "of the TensorRT region!")
+
+    dim = kwargs["dim"]
+    if network.has_implicit_batch_dimension:
+        assert dim != 0, "Can't split on batch dim when it's implicit!"
+        dim -= 1
+    else:
+        raise RuntimeError("We don't support split with explicit batch dimension yet!")
+
+    split_size = kwargs["split_size"]
+    start = [0] * len(input_val.shape)
+    stride = [1] * len(start)
+    offset = 0
+    num_splits = (input_val.shape[dim] + split_size - 1) // split_size
+    if num_splits < 1:
+        raise RuntimeError(f"Invalid split: {input_val.shape[dim]} wuth split_size={split_size}")
+
+    max_offset = input_val.shape[dim]
+    # add slice layers
+    output = []
+    for i in range(num_splits):
+        shape = list(input_val.shape)
+        shape[dim] = min(split_size, max_offset - offset)
+        start[dim] = offset
+        layer = network.add_slice(input_val, start=start, shape=shape, stride=stride)
+        offset += split_size
+        layer.name = f"{name}_{i}"
+        output.append(layer.get_output(0))
+    return output
 
 @tensorrt_converter(acc_ops.linear)
 def acc_ops_linear(network, target, args, kwargs, name):
@@ -859,13 +930,42 @@ def acc_ops_linear(network, target, args, kwargs, name):
         "dim for linear and it can't be the last dim."
     )
 
-    # add matrix multiply and add
-    weight = get_trt_tensor(network, kwargs["weight"], f"{name}_linear_weight", squeeze_vector=False)
-    output = add_matrix_multiply_layer(network, input_val, weight, f"{name}_linear_mm", transpose_other=True)
-    if kwargs["bias"] is not None:
-        return add_binary_elementwise_layer(network, output, kwargs["bias"], trt.ElementWiseOperation.SUM, f"{name}_linear_add")
+    weight = kwargs["weight"]
+
+    # For quantization, weight here would be a trt tensor because it goes through
+    # quant + dequant. In this case, we need to use matmul + add because fully_connected
+    # can't take non-constant weight.
+    # TODO: Need to benchmark the performance of lowering linear as fully_connected versus
+    # lowering as matmul + add. TensorRT documentation suggests to always lower it as
+    # matmul + add but we found in some cases this results in performance regression compared
+    # with lowering to fully_connected layer.
+    if isinstance(weight, torch.Tensor):
+        layer = network.add_shuffle(input_val)
+        layer.reshape_dims = tuple(input_val.shape) + (1, 1)
+        layer.name = f"{name}_pre_shuffle"
+
+        # add fully connected
+        layer = network.add_fully_connected(
+            input=layer.get_output(0),
+            num_outputs=kwargs["weight"].shape[0],
+            kernel=to_numpy(kwargs["weight"]),
+            bias=to_numpy(kwargs["bias"]),
+        )
+        layer.name = f"{name}_linear"
+
+        # reshape back
+        layer = network.add_shuffle(layer.get_output(0))
+        layer.reshape_dims = tuple(input_val.shape[:-1]) + (kwargs["weight"].shape[0],)
+        layer.name = f"{name}_post_shuffle"
+
+        return layer.get_output(0)
     else:
-        return output
+        # add matrix multiply and add
+        output = add_matrix_multiply_layer(network, input_val, weight, f"{name}_linear_mm", transpose_other=True)
+        if kwargs["bias"] is not None:
+            return add_binary_elementwise_layer(network, output, kwargs["bias"], trt.ElementWiseOperation.SUM, f"{name}_linear_add")
+        else:
+            return output
 
 
 def add_clamp(network, input, val, op):
@@ -910,6 +1010,16 @@ def acc_ops_clamp(network, target, args, kwargs, name):
     return input_val
 
 
+@tensorrt_converter(acc_ops.tuple_construct)
+def acc_ops_tuple_construct(network, target, args, kwargs, name):
+    return kwargs["tensors"]
+
+
+@tensorrt_converter(acc_ops.contiguous)
+def acc_ops_contiguous(network, target, args, kwargs, name):
+    return kwargs["input"]
+
+
 @tensorrt_converter(acc_ops.getitem)
 def acc_ops_getitem(network, target, args, kwargs, name):
     input_val = kwargs["input"]
@@ -951,7 +1061,7 @@ def slice_to_trt_params(py_slice, dim_size):
         batch_subscript = slices[0]
         if batch_subscript != slice(None, None, None):
             raise RuntimeError(
-                f"Can't subscript batch dimension when it's implicit. Got {slices}"
+                f"{name}: Can't subscript batch dimension when it's implicit. Got {slices}"
             )
 
         # Remove batch_dim subscript
diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py
index 9b2c7f95e0000..7c9520660ef77 100644
--- a/torch/fx/experimental/fx_acc/acc_ops.py
+++ b/torch/fx/experimental/fx_acc/acc_ops.py
@@ -162,6 +162,7 @@ def add(*, input, other):
     return input + other
 
 
+@register_acc_op_mapping(op_and_target=("call_method", "unsqueeze"))
 @register_acc_op_mapping(op_and_target=("call_function", torch.unsqueeze))
 @register_acc_op
 def unsqueeze(*, input, dim):
@@ -222,6 +223,12 @@ def transpose(*, input, dim0, dim1):
     return torch.transpose(**locals())
 
 
+@register_acc_op_mapping(op_and_target=("call_method", "contiguous"))
+@register_acc_op
+def contiguous(*, input):
+    return input.contiguous()
+
+
 @register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.softmax))
 @register_acc_op
 def softmax(*, input, dim, dtype):
@@ -873,6 +880,15 @@ def slice_tensor(*, input, dims, starts, stops, steps):
         ("length", "length"),
     ],
 )
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_method", "narrow"),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("start", "start"),
+        ("length", "length"),
+    ],
+)
 def custom_narrow_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node:
     kwargs = {
         "input": node.kwargs["input"],

From e030b813569c0fa89d527b8fe31c5e5fd31753d8 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Thu, 19 Aug 2021 10:37:31 -0700
Subject: [PATCH 071/530] [easy] Fix missing move in TupleType::createNamed
 (#61572)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61572

ghstack-source-id: 136161829

Test Plan: CI

Reviewed By: SplitInfinity

Differential Revision: D29672872

fbshipit-source-id: d8ba2d54f7914dbeb3fc52aa21dd77025951c4b5
---
 aten/src/ATen/core/type.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp
index 4214f4d3e1f6f..6bfba7b6d181a 100644
--- a/aten/src/ATen/core/type.cpp
+++ b/aten/src/ATen/core/type.cpp
@@ -802,7 +802,7 @@ TupleTypePtr TupleType::createNamed(const c10::optional<c10::QualifiedName>& qua
   auto schema = std::make_shared<FunctionSchema>(
       /*name=*/qualName.value_or(c10::QualifiedName()).name(),
       /*overload_name=*/std::string(""),
-      /*arguments=*/arguments,
+      /*arguments=*/std::move(arguments),
       /*returns=*/std::vector<Argument>{});
   return std::shared_ptr<TupleType>(new TupleType(
       field_types, qualName, schema)); // NOLINT(modernize-make-shared)

From bd8608cd5cdaa69384b11f1253b4bcc822032a51 Mon Sep 17 00:00:00 2001
From: driazati <driazati@users.noreply.github.com>
Date: Thu, 19 Aug 2021 10:38:41 -0700
Subject: [PATCH 072/530] Use CMake for breakpad (#63186)

Summary:
We currently build breakpad from [this fork](https://github.com/driazati/breakpad) to include extra logic to restore signal handlers that were previously present. With some [new additions](https://github.com/google/breakpad/compare/main...driazati:main) this fork now includes a CMake based build, so we can add breakpad as a proper dependency rather than rely on including it in Docker images as a system library which is error prone (we have a bunch of images) and hard to extend to MacOS / Windows. This also includes some changes to the crash handling code to support MacOS / Windows in a similar way to Linux.

```python
import torch

# On Windows this writes crashes to C:\Users\<user>\AppData\pytorch_crashes
# On MacOS/Linux this writes crashes to /tmp/pytorch_crashes
torch.utils._crash_handler.enable_minidumps()

# Easy way to cause a segfault and trigger the handler
torch.bincount(input=torch.tensor([9223372036854775807]))
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63186

Reviewed By: malfet, seemethere

Differential Revision: D30318404

Pulled By: driazati

fbshipit-source-id: 0d7daf3701cfaba5451cc529a0730272ab1eb1dc
---
 .gitmodules                             |  3 +
 CMakeLists.txt                          |  5 ++
 caffe2/CMakeLists.txt                   | 25 ++-----
 cmake/Dependencies.cmake                |  4 ++
 cmake/Summary.cmake                     |  1 +
 test/test_cpp_extensions_jit.py         | 65 +++++++++++-------
 test/test_utils.py                      |  5 +-
 third_party/breakpad                    |  1 +
 torch/csrc/utils/crash_handler.cpp      | 87 ++++++++++++++++++++-----
 torch/csrc/utils/crash_handler.h        | 10 ++-
 torch/testing/_internal/common_utils.py | 23 +++++--
 torch/utils/_crash_handler.py           |  5 +-
 12 files changed, 162 insertions(+), 72 deletions(-)
 create mode 160000 third_party/breakpad

diff --git a/.gitmodules b/.gitmodules
index 6836ccb49c881..a7cc437f43840 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -139,3 +139,6 @@
 [submodule "third_party/pocketfft"]
 	path = third_party/pocketfft
 	url = https://github.com/mreineck/pocketfft
+[submodule "third_party/breakpad"]
+	path = third_party/breakpad
+	url = https://github.com/driazati/breakpad.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 188f35a9981e0..4d1653ffaded3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -202,6 +202,7 @@ cmake_dependent_option(
     "USE_CUDNN" OFF)
 option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
 option(USE_KINETO "Use Kineto profiling library" ON)
+option(USE_BREAKPAD "Use breakpad crash dump library" ON)
 option(USE_CUPTI_SO "Use CUPTI as a shared library" OFF)
 option(USE_FAKELOWP "Use FakeLowp operators" OFF)
 option(USE_FFMPEG "Use ffmpeg" OFF)
@@ -264,6 +265,10 @@ if(NOT DEFINED USE_VULKAN)
       "ANDROID" OFF)
 endif()
 
+if(IOS)
+  set(USE_BREAKPAD OFF)
+endif()
+
 option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON)
 option(USE_LITE_INTERPRETER_PROFILER "Enable " ON)
 option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 619455421f282..523fea8181cf8 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1042,27 +1042,10 @@ if(USE_TBB)
   target_link_libraries(torch_cpu PUBLIC TBB::tbb)
 endif()
 
-
-if(LINUX)
-  find_library(BREAKPAD_LIB breakpad_client
-    PATHS /usr/local/lib/)
-  find_path(BREAKPAD_INCLUDE_DIR breakpad
-    PATHS /usr/local/include/)
-
-  if(BREAKPAD_LIB AND BREAKPAD_INCLUDE_DIR)
-    message(STATUS "found breakpad library")
-    target_link_libraries(torch_cpu PRIVATE ${BREAKPAD_LIB})
-    target_compile_definitions(torch_cpu PRIVATE ADD_BREAKPAD_SIGNAL_HANDLER)
-    target_include_directories(torch_cpu PRIVATE ${BREAKPAD_INCLUDE_DIR}/breakpad)
-  else()
-    if(BREAKPAD_INCLUDE_DIR)
-      message(STATUS "breakpad_client library not found")
-    elseif(BREAKPAD_LIB)
-      message(STATUS "breakpad include path not found")
-    else()
-      message(STATUS "breakpad_client library and include path not found")
-    endif()
-  endif()
+if(USE_BREAKPAD)
+  target_compile_definitions(torch_cpu PRIVATE ADD_BREAKPAD_SIGNAL_HANDLER)
+  target_include_directories(torch_cpu PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../third_party ${CMAKE_CURRENT_LIST_DIR}/../third_party/breakpad/src)
+  target_link_libraries(torch_cpu PRIVATE breakpad)
 endif()
 
   target_include_directories(torch_cpu PRIVATE ${ATen_CPU_INCLUDE})
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 2c19dae96c909..3e37c3538f6fd 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1881,6 +1881,10 @@ set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "")
 list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)
 set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE)
 
+if(USE_BREAKPAD)
+  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/breakpad)
+endif()
+
 # ---[ Kineto
 # edge profiler depends on KinetoProfiler but it only does cpu
 # profiling. Thus we dont need USE_CUDA/USE_ROCM
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index 4de2d79cb9757..afc63b18f5f07 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -178,6 +178,7 @@ function(caffe2_print_configuration_summary)
     message(STATUS "  SELECTED_OP_LIST    : ${SELECTED_OP_LIST}")
   endif()
   message(STATUS "  USE_DEPLOY           : ${USE_DEPLOY}")
+  message(STATUS "  USE_BREAKPAD         : ${USE_BREAKPAD}")
   message(STATUS "  Public Dependencies  : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}")
   message(STATUS "  Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}")
 endfunction()
diff --git a/test/test_cpp_extensions_jit.py b/test/test_cpp_extensions_jit.py
index 073835277e678..89d9af10e0d35 100644
--- a/test/test_cpp_extensions_jit.py
+++ b/test/test_cpp_extensions_jit.py
@@ -869,11 +869,29 @@ def test_custom_compound_op_autograd(self):
 
         gradcheck(torch.ops.my.add, [a, b], eps=1e-2)
 
-    @unittest.skipIf(not has_breakpad(), "Breakpad library must be present on system for crash handler")
-    @unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler")
-    def test_crash_handler(self):
-        def run_test(stderr_file, destination):
-            # Code to enable dumps and trigger a segfault
+    @staticmethod
+    def _crash_handler_test_process(stderr_file, destination):
+        # Code to enable dumps and trigger a segfault
+        if sys.platform == "win32":
+            destination = destination.replace("\\", "\\\\")
+            csrc = textwrap.dedent(f"""
+            #include <torch/torch.h>
+            #include <locale>
+            #include <iostream>
+            #include <codecvt>
+            #include <string>
+
+            int fail() {{
+                std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+                std::string narrow("{destination}");
+                std::wstring wide = converter.from_bytes(narrow);
+                torch::crash_handler::enable_minidumps(wide.c_str());
+
+                volatile int* bad = nullptr;
+                return *bad;
+            }}
+            """)
+        else:
             csrc = textwrap.dedent(f"""
             #include <torch/torch.h>
 
@@ -885,29 +903,32 @@ def run_test(stderr_file, destination):
             }}
             """)
 
-            # Some special stuff to overwrite stderr for a C++ extension
-            # Copied from: https://stackoverflow.com/questions/8804893/redirect-stdout-from-python-for-c-calls
-            sys.stdout.flush()
-            newstdout = os.dup(2)
-            devnull = os.open(stderr_file, os.O_WRONLY)
-            os.dup2(devnull, 2)
-            os.close(devnull)
-            sys.stdout = os.fdopen(newstdout, 'w')
-
-            module = torch.utils.cpp_extension.load_inline(
-                name="segfault",
-                cpp_sources=csrc,
-                functions=["fail"],
-            )
-            module.fail()
+        # Some special stuff to overwrite stderr for a C++ extension
+        # Copied from: https://stackoverflow.com/questions/8804893/redirect-stdout-from-python-for-c-calls
+        sys.stdout.flush()
+        newstdout = os.dup(2)
+        devnull = os.open(stderr_file, os.O_WRONLY)
+        os.dup2(devnull, 2)
+        os.close(devnull)
+        sys.stdout = os.fdopen(newstdout, 'w')
 
+        module = torch.utils.cpp_extension.load_inline(
+            name="segfault",
+            cpp_sources=csrc,
+            functions=["fail"],
+        )
+        module.fail()
 
-        with tempfile.TemporaryDirectory() as temp_dir, tempfile.NamedTemporaryFile() as stderr:
+    @unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler")
+    @unittest.skipIf(not has_breakpad(), "Built without breakpad")
+    def test_crash_handler(self):
+        with tempfile.TemporaryDirectory() as temp_dir, tempfile.NamedTemporaryFile(delete=not sys.platform == "win32") as stderr:
             # Use multiprocessing to spin up a separate process to make catching
             # the segfault easier
-            p = Process(target=run_test, args=(stderr.name, temp_dir))
+            p = Process(target=self._crash_handler_test_process, args=(stderr.name, temp_dir))
             p.start()
             p.join()
+
             with open(stderr.name) as f:
                 result = f.read().strip()
 
diff --git a/test/test_utils.py b/test/test_utils.py
index d0f8d10d9fbd4..6f9432e0e6392 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -19,7 +19,7 @@
 import torch.hub as hub
 from torch.autograd._functions.utils import check_onnx_broadcast
 from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings
-from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS, has_breakpad
+from torch.testing._internal.common_utils import has_breakpad, load_tests, retry, IS_SANDCASTLE, IS_WINDOWS, TEST_WITH_ASAN
 from urllib.error import URLError
 
 # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for
@@ -739,7 +739,8 @@ def forward(self, x):
 
 
 class TestCrashHandler(TestCase):
-    @unittest.skipIf(not has_breakpad(), "Crash handler lib was not linked in")
+    @unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler")
+    @unittest.skipIf(not has_breakpad(), "Built without breakpad")
     def test_python_exception_writing(self):
         with tempfile.TemporaryDirectory() as temp_dir:
             torch.utils._crash_handler.enable_minidumps(temp_dir)
diff --git a/third_party/breakpad b/third_party/breakpad
new file mode 160000
index 0000000000000..469a80ee54947
--- /dev/null
+++ b/third_party/breakpad
@@ -0,0 +1 @@
+Subproject commit 469a80ee54947ad8d000d33a615f1a199165a711
diff --git a/torch/csrc/utils/crash_handler.cpp b/torch/csrc/utils/crash_handler.cpp
index 2de22be0d2e86..8fb318b265a83 100644
--- a/torch/csrc/utils/crash_handler.cpp
+++ b/torch/csrc/utils/crash_handler.cpp
@@ -3,8 +3,16 @@
 #include <iostream>
 
 #ifdef ADD_BREAKPAD_SIGNAL_HANDLER
-#include <breakpad/client/linux/handler/exception_handler.h>
+#ifdef __linux__
+#include <breakpad/src/client/linux/handler/exception_handler.h>
 #include <csignal>
+#elif __APPLE__
+#include <breakpad/src/client/mac/handler/exception_handler.h>
+#elif _WIN32
+#include <breakpad/src/client/windows/handler/exception_handler.h>
+#else
+#error unsupported platform
+#endif
 #endif
 
 #include <c10/util/Exception.h>
@@ -16,9 +24,10 @@ namespace crash_handler {
 #ifdef ADD_BREAKPAD_SIGNAL_HANDLER
 
 static std::unique_ptr<google_breakpad::ExceptionHandler> handler; // NOLINT
-static std::string minidump_directory; // NOLINT
+static STRING_TYPE minidump_directory; // NOLINT
 static bool enabled_for_exceptions = false; // NOLINT
 
+#if __linux__
 bool dump_callback(
     const google_breakpad::MinidumpDescriptor& descriptor,
     void* context,
@@ -28,10 +37,45 @@ bool dump_callback(
   }
   return succeeded;
 }
+#elif __APPLE__
 
-void enable_minidumps(const std::string& dir) {
+bool dump_callback(
+    const char* dump_dir,
+    const char* minidump_id,
+    void* context,
+    bool succeeded) {
+  if (succeeded) {
+    std::cerr << "Wrote minidump to " << dump_dir << "/" << minidump_id
+              << ".dmp" << std::endl;
+  }
+  return succeeded;
+}
+#elif _WIN32
+bool dump_callback(
+    const wchar_t* dump_path,
+    const wchar_t* minidump_id,
+    void* context,
+    EXCEPTION_POINTERS* exinfo,
+    MDRawAssertionInfo* assertion,
+    bool succeeded) {
+  if (succeeded) {
+    // Printing with wcerr inserts spaces between all the characters for some
+    // reason. If someone figures that out then we can get rid of the std::string
+    // conversions here.
+    std::wstring dump_path_ws(dump_path);
+    std::string dump_path_string(dump_path_ws.begin(), dump_path_ws.end());
+    std::wstring minidump_id_ws(minidump_id);
+    std::string minidump_id_string(minidump_id_ws.begin(), minidump_id_ws.end());
+    std::cerr << "Wrote minidump to " << dump_path_string << "\\" << minidump_id_string << ".dmp" << std::endl;
+  }
+  return succeeded;
+}
+#endif
+
+void enable_minidumps(const STRING_TYPE& dir) {
   minidump_directory = dir;
-  // The constructor here registers the actual signal handler
+// The constructor here registers the actual signal handler
+#ifdef __linux__
   handler = std::make_unique<google_breakpad::ExceptionHandler>(
       google_breakpad::MinidumpDescriptor(minidump_directory),
       nullptr,
@@ -39,13 +83,30 @@ void enable_minidumps(const std::string& dir) {
       nullptr,
       true,
       -1);
+#elif __APPLE__
+  handler = std::make_unique<google_breakpad::ExceptionHandler>(
+      /*dump_path=*/minidump_directory.c_str(),
+      /*filter=*/nullptr,
+      /*callback=*/dump_callback,
+      /*callback_context=*/nullptr,
+      /*install_handler=*/true,
+      /*port_name=*/nullptr);
+#elif _WIN32
+  handler = std::make_unique<google_breakpad::ExceptionHandler>(
+      /*dump_path=*/minidump_directory.c_str(),
+      /*filter=*/nullptr,
+      /*callback=*/dump_callback,
+      /*callback_context=*/nullptr,
+      /*handler_types=*/
+      google_breakpad::ExceptionHandler::HandlerType::HANDLER_ALL);
+#endif
 }
 
 void disable_minidumps() {
   handler.reset();
 }
 
-const std::string& get_minidump_directory() {
+const STRING_TYPE& get_minidump_directory() {
   if (handler == nullptr) {
     AT_ERROR(
         "Minidump handler is uninintialized, make sure to call enable_minidumps first");
@@ -78,18 +139,16 @@ void enable_minidumps_on_exceptions() {
 
 #else
 // On unspported systems we can't do anything, so stub out everything.
-void enable_minidumps(const std::string& dir) {
-  AT_ERROR(
-      "Minidump collection is currently only implemented for Linux platforms");
+void enable_minidumps(const STRING_TYPE& dir) {
+  AT_ERROR("Compiled without minidump support");
 }
 
 void disable_minidumps() {
   // Purposefully do nothing
 }
 
-const std::string& get_minidump_directory() {
-  AT_ERROR(
-      "Minidump collection is currently only implemented for Linux platforms");
+const STRING_TYPE& get_minidump_directory() {
+  AT_ERROR("Compiled without minidump support");
 }
 
 bool is_enabled_on_exceptions() {
@@ -97,13 +156,11 @@ bool is_enabled_on_exceptions() {
 }
 
 void write_minidump() {
-  AT_ERROR(
-      "Minidump collection is currently only implemented for Linux platforms");
+  AT_ERROR("Compiled without minidump support");
 }
 
 void enable_minidumps_on_exceptions() {
-  AT_ERROR(
-      "Minidump collection is currently only implemented for Linux platforms");
+  AT_ERROR("Compiled without minidump support");
 }
 
 #endif
diff --git a/torch/csrc/utils/crash_handler.h b/torch/csrc/utils/crash_handler.h
index 5fe0503b2ed00..dc11945195372 100644
--- a/torch/csrc/utils/crash_handler.h
+++ b/torch/csrc/utils/crash_handler.h
@@ -5,10 +5,16 @@
 namespace torch {
 namespace crash_handler {
 
+#ifdef _WIN32
+typedef std::wstring STRING_TYPE;
+#else
+typedef std::string STRING_TYPE;
+#endif
+
 // Set up a handler that writes minidumps to 'dir' on signals. This is not
 // necessary to call unless you want to change 'dir' to something other than
 // the default '/tmp/pytorch_crashes'.
-TORCH_API void enable_minidumps(const std::string& dir);
+TORCH_API void enable_minidumps(const STRING_TYPE& dir);
 
 // Enable minidumps when passing exceptions up to Python. By default these don't
 // do anything special, but it can be useful to write out a minidump on
@@ -19,7 +25,7 @@ TORCH_API void enable_minidumps_on_exceptions();
 TORCH_API void disable_minidumps();
 
 // Get the directory that minidumps will be written to
-TORCH_API const std::string& get_minidump_directory();
+TORCH_API const STRING_TYPE& get_minidump_directory();
 
 // These are TORCH_API'ed since they are used from libtorch_python.so
 TORCH_API bool is_enabled_on_exceptions();
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index fed9a005a55c5..a16056cd55cf7 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -2533,13 +2533,6 @@ def disable_gc():
     else:
         yield
 
-def has_breakpad() -> bool:
-    # If not on a special build, check that the library was actually linked in
-    try:
-        torch._C._get_minidump_directory()  # type: ignore[attr-defined]
-        return True
-    except RuntimeError as e:
-        return False
 
 def find_library_location(lib_name: str) -> Path:
     # return the shared library file in the installed folder if exist,
@@ -2590,6 +2583,22 @@ def get_tensors_from(args, kwargs):
     return set([arg for arg in args if isinstance(arg, Tensor)] +
                [v for v in kwargs.values() if isinstance(v, Tensor)])
 
+
+def has_breakpad():
+    # We always build with breakpad in CI
+    if IS_IN_CI:
+        return True
+
+    # If not on a special build, check that the library was actually linked in
+    try:
+        torch._C._get_minidump_directory()  # type: ignore[attr-defined]
+        return True
+    except RuntimeError as e:
+        if "Minidump handler is uninintialized" in str(e):
+            return True
+        return False
+
+
 def sandcastle_skip_if(condition, reason):
     """
     Similar to unittest.skipIf, however in the sandcastle environment it just
diff --git a/torch/utils/_crash_handler.py b/torch/utils/_crash_handler.py
index 3d736c3f85ce0..84b345229bde9 100644
--- a/torch/utils/_crash_handler.py
+++ b/torch/utils/_crash_handler.py
@@ -5,11 +5,10 @@
 import torch
 
 DEFAULT_MINIDUMP_DIR = "/tmp/pytorch_crashes"
+if sys.platform == "win32":
+    DEFAULT_MINIDUMP_DIR = str(pathlib.Path.home() / "AppData" / "pytorch_crashes")
 
 def enable_minidumps(directory=DEFAULT_MINIDUMP_DIR):
-    if sys.platform != "linux":
-        raise RuntimeError("Minidump collection is currently only implemented for Linux platforms")
-
     if directory == DEFAULT_MINIDUMP_DIR:
         pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
     elif not os.path.exists(directory):

From 535d44141b6a3d99eddfa241c4dfb6fc4aed7cab Mon Sep 17 00:00:00 2001
From: Pritam Damania <pritam.damania@fb.com>
Date: Thu, 19 Aug 2021 11:21:26 -0700
Subject: [PATCH 073/530] [7/N] Remove fork tests for RPC. (#63443)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63443

After https://github.com/pytorch/pytorch/pull/63442, all distributed
tests can run with opt-asan. As a result, we can now remove all of our fork
based tests.

This is the first PR in a stack, which first removes fork based tests from RPC.
ghstack-source-id: 136177744

Test Plan: waitforbuildbot

Reviewed By: lw

Differential Revision: D30384905

fbshipit-source-id: 86d438aebaa6cb02ae2a966fea244849849a1889
---
 .../rpc/cuda/test_tensorpipe_agent.py         |  2 -
 test/distributed/rpc/test_faulty_agent.py     |  2 -
 test/distributed/rpc/test_tensorpipe_agent.py |  2 -
 .../_internal/distributed/rpc_utils.py        | 74 +++++--------------
 4 files changed, 18 insertions(+), 62 deletions(-)

diff --git a/test/distributed/rpc/cuda/test_tensorpipe_agent.py b/test/distributed/rpc/cuda/test_tensorpipe_agent.py
index 5647434f6f53e..7cb35f9f73aa1 100644
--- a/test/distributed/rpc/cuda/test_tensorpipe_agent.py
+++ b/test/distributed/rpc/cuda/test_tensorpipe_agent.py
@@ -15,7 +15,6 @@
 from torch.testing._internal.distributed.rpc_utils import (
     GENERIC_CUDA_TESTS,
     TENSORPIPE_CUDA_TESTS,
-    MultiProcess,
     generate_tests,
 )
 
@@ -25,7 +24,6 @@
         "TensorPipe",
         TensorPipeRpcAgentTestFixture,
         GENERIC_CUDA_TESTS + TENSORPIPE_CUDA_TESTS,
-        MultiProcess.SPAWN,
         __name__,
     )
 )
diff --git a/test/distributed/rpc/test_faulty_agent.py b/test/distributed/rpc/test_faulty_agent.py
index 7c26643ab6b60..cb889115be8a1 100644
--- a/test/distributed/rpc/test_faulty_agent.py
+++ b/test/distributed/rpc/test_faulty_agent.py
@@ -15,7 +15,6 @@
 )
 from torch.testing._internal.distributed.rpc_utils import (
     FAULTY_AGENT_TESTS,
-    MultiProcess,
     generate_tests,
 )
 
@@ -28,7 +27,6 @@
             "Faulty",
             FaultyRpcAgentTestFixture,
             FAULTY_AGENT_TESTS,
-            MultiProcess.SPAWN,
             __name__,
         )
     )
diff --git a/test/distributed/rpc/test_tensorpipe_agent.py b/test/distributed/rpc/test_tensorpipe_agent.py
index 32b0e1c69357a..b741bc443c460 100644
--- a/test/distributed/rpc/test_tensorpipe_agent.py
+++ b/test/distributed/rpc/test_tensorpipe_agent.py
@@ -16,7 +16,6 @@
 from torch.testing._internal.distributed.rpc_utils import (
     GENERIC_TESTS,
     TENSORPIPE_TESTS,
-    MultiProcess,
     generate_tests,
 )
 
@@ -29,7 +28,6 @@
             "TensorPipe",
             TensorPipeRpcAgentTestFixture,
             GENERIC_TESTS + TENSORPIPE_TESTS,
-            MultiProcess.SPAWN,
             __name__,
         )
     )
diff --git a/torch/testing/_internal/distributed/rpc_utils.py b/torch/testing/_internal/distributed/rpc_utils.py
index b5cf9f73548c1..dd11c8dc450e0 100644
--- a/torch/testing/_internal/distributed/rpc_utils.py
+++ b/torch/testing/_internal/distributed/rpc_utils.py
@@ -2,13 +2,11 @@
 import os
 import sys
 import unittest
-from enum import Flag, auto
 from typing import Dict, List, Type
 
 from torch.testing._internal.common_distributed import MultiProcessTestCase
 from torch.testing._internal.common_utils import (
     TEST_WITH_DEV_DBG_ASAN,
-    TEST_WITH_TSAN,
     find_free_port,
     IS_SANDCASTLE,
 )
@@ -75,25 +73,12 @@ def _check_and_unset_tcp_init():
 # The tests for the RPC module need to cover multiple possible combinations:
 # - different aspects of the API, each one having its own suite of tests;
 # - different agents (ProcessGroup, TensorPipe, ...);
-# - and subprocesses launched with either fork or spawn.
 # To avoid a combinatorial explosion in code size, and to prevent forgetting to
 # add a combination, these are generated automatically by the code in this file.
-# Here, we collect all the test suites that we need to cover and the two multi-
-# processing methods. We then have one separate file for each agent, from which
+# Here, we collect all the test suites that we need to cover.
+# We then have one separate file for each agent, from which
 # we call the generate_tests function of this file, passing to it a fixture for
-# the agent, which then gets mixed-in with each test suite and each mp method.
-
-
-@unittest.skipIf(TEST_WITH_TSAN, "TSAN and fork() is broken")
-class ForkHelper(MultiProcessTestCase):
-    def setUp(self):
-        super().setUp()
-        _check_and_set_tcp_init()
-        self._fork_processes()
-
-    def tearDown(self):
-        _check_and_unset_tcp_init()
-        super().tearDown()
+# the agent, which then gets mixed-in with each test suite.
 
 @unittest.skipIf(
     TEST_WITH_DEV_DBG_ASAN, "Skip ASAN as torch + multiprocessing spawn have known issues"
@@ -109,17 +94,6 @@ def tearDown(self):
         super().tearDown()
 
 
-class MultiProcess(Flag):
-    FORK = auto()
-    SPAWN = auto()
-
-
-MP_HELPERS_AND_SUFFIXES = {
-    MultiProcess.FORK: (ForkHelper, "WithFork"),
-    MultiProcess.SPAWN: (SpawnHelper, "WithSpawn"),
-}
-
-
 # This list contains test suites that are agent-agnostic and that only verify
 # compliance with the generic RPC interface specification. These tests should
 # *not* make use of implementation details of a specific agent (options,
@@ -175,7 +149,6 @@ def generate_tests(
     prefix: str,
     mixin: Type[RpcAgentTestFixture],
     tests: List[Type[RpcAgentTestFixture]],
-    mp_type_filter: MultiProcess,
     module_name: str,
 ) -> Dict[str, Type[RpcAgentTestFixture]]:
     """Mix in the classes needed to autogenerate the tests based on the params.
@@ -183,36 +156,25 @@ def generate_tests(
     Takes a series of test suites, each written against a "generic" agent (i.e.,
     derived from the abstract RpcAgentTestFixture class), as the `tests` args.
     Takes a concrete subclass of RpcAgentTestFixture, which specializes it for a
-    certain agent, as the `mixin` arg. Produces all combinations of them, and of
-    the multiprocessing start methods (fork or spawn), possibly filtered using
-    the `mp_type_filter`. Returns a dictionary of class names to class type
+    certain agent, as the `mixin` arg. Produces all combinations of them.
+    Returns a dictionary of class names to class type
     objects which can be inserted into the global namespace of the calling
-    module. The name of each test will be a concatenation of the `prefix` arg,
-    the original name of the test suite, and a suffix of either `WithFork` or
-    `WithSpawn`. The `module_name` should be the name of the calling module so
+    module. The name of each test will be a concatenation of the `prefix` arg
+    and the original name of the test suite.
+    The `module_name` should be the name of the calling module so
     that the classes can be fixed to make it look like they belong to it, which
     is necessary for pickling to work on them.
     """
     ret: Dict[str, Type[RpcAgentTestFixture]] = {}
     for test_class in tests:
-        for mp_type in MultiProcess:
-            if mp_type & mp_type_filter:
-                mp_helper, suffix = MP_HELPERS_AND_SUFFIXES[mp_type]
-                if IS_SANDCASTLE:
-                    if mp_helper == SpawnHelper and TEST_WITH_DEV_DBG_ASAN:
-                        print(
-                            f'Skipping test {test_class} on sandcastle for the following reason: '
-                            'Skip dev-asan as torch + multiprocessing spawn have known issues', file=sys.stderr)
-                        continue
-                    elif mp_helper == ForkHelper and TEST_WITH_TSAN:
-                        print(
-                            f'Skipping test {test_class} on sandcastle for the following reason: '
-                            'TSAN and fork() is broken'
-                        )
-                        continue
-
-                name = f"{prefix}{test_class.__name__}{suffix}"
-                class_ = type(name, (test_class, mixin, mp_helper), dict())
-                class_.__module__ = module_name
-                ret[name] = class_
+        if IS_SANDCASTLE and TEST_WITH_DEV_DBG_ASAN:
+            print(
+                f'Skipping test {test_class} on sandcastle for the following reason: '
+                'Skip dev-asan as torch + multiprocessing spawn have known issues', file=sys.stderr)
+            continue
+
+        name = f"{prefix}{test_class.__name__}"
+        class_ = type(name, (test_class, mixin, SpawnHelper), dict())
+        class_.__module__ = module_name
+        ret[name] = class_
     return ret

From 0c3904d18061ea31c9fe1bded5893ffb07f0a4b5 Mon Sep 17 00:00:00 2001
From: Yusuo Hu <yusuo@fb.com>
Date: Thu, 19 Aug 2021 12:37:58 -0700
Subject: [PATCH 074/530] [BF16] Add a missing thread local specifier to
 autocast_gpu_dtype (#63416)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63416

Fix a missing thread local specifier introduced by recent PR

https://github.com/pytorch/pytorch/pull/61002

Test Plan: Unit Tests

Reviewed By: ngimel

Differential Revision: D30376154

fbshipit-source-id: c70d37ec85c3eba88eb87f766f1c4e7aeff8eaf9
---
 aten/src/ATen/autocast_mode.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp
index 97ec9ec69dbeb..1ac5ad1c88ba6 100644
--- a/aten/src/ATen/autocast_mode.cpp
+++ b/aten/src/ATen/autocast_mode.cpp
@@ -59,7 +59,7 @@ thread_local int nesting = 0;
 thread_local at::ScalarType autocast_cpu_dtype = at::kBFloat16;
 
 // autocast_gpu_dtype is the lower_precision_fp used by AutocastGPU.
-at::ScalarType autocast_gpu_dtype = at::kHalf;
+thread_local at::ScalarType autocast_gpu_dtype = at::kHalf;
 }
 
 void clear_cache() {

From d986d4bf6354bc02d5e68eaaea60b02234a4449f Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Thu, 19 Aug 2021 12:40:37 -0700
Subject: [PATCH 075/530] [special] use __all__ to hide internal imports
 (#63135)

Summary:
Reference: https://github.com/pytorch/pytorch/issues/50345

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63135

Reviewed By: ngimel

Differential Revision: D30364287

Pulled By: mruberry

fbshipit-source-id: 20078668943fafa45ce09610634b1d2c424b1922
---
 torch/special/__init__.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/torch/special/__init__.py b/torch/special/__init__.py
index 1f3b3fc5dc899..2fea9c6cb1b04 100644
--- a/torch/special/__init__.py
+++ b/torch/special/__init__.py
@@ -1,9 +1,12 @@
-import sys
-
 import torch
 from torch._C import _add_docstr, _special  # type: ignore[attr-defined]
 from torch._torch_docs import common_args, multi_dim_common
 
+__all__ = ['entr', 'psi', 'digamma', 'gammaln', 'polygamma', 'erf', 'erfc', 'erfinv',
+           'erfcx', 'logit', 'logsumexp', 'expit', 'exp2', 'expm1', 'xlog1py', 'xlogy',
+           'i0', 'i0e', 'i1', 'i1e', 'ndtr', 'ndtri', 'log1p', 'sinc', 'round', 'log_softmax',
+           'zeta', 'multigammaln']
+
 Tensor = torch.Tensor
 
 entr = _add_docstr(_special.special_entr,

From efd70b7ce6b2d3c000494ce5e527198637db5bc9 Mon Sep 17 00:00:00 2001
From: Mike Ruberry <mruberry@fb.com>
Date: Thu, 19 Aug 2021 12:41:42 -0700
Subject: [PATCH 076/530] Modernizes add and mul documentation (#63309)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/39329.

The documentation for torch.add and torch.mul was sorely out of date and even included deprecated references. This PR modernizes their descriptions consistent with torch.sub.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63309

Reviewed By: ngimel

Differential Revision: D30338004

Pulled By: mruberry

fbshipit-source-id: ee1c2a8106af8341253cafb0003b06e8f652624d
---
 torch/_torch_docs.py | 104 ++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 75 deletions(-)

diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index deee91dc5fa7b..a4f3bdaef7df7 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -214,25 +214,26 @@ def merge_dicts(*dicts):
 """.format(**common_args))
 
 add_docstr(torch.add, r"""
-add(input, other, *, out=None) -> Tensor
+add(input, other, *, alpha=1, out=None) -> Tensor
 
-Adds the scalar :attr:`other` to each element of the input :attr:`input`
-and returns a new resulting tensor.
+Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`.
 
 .. math::
-    \text{{out}} = \text{{input}} + \text{{other}}
+    \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i
+""" + r"""
 
-If :attr:`input` is of type FloatTensor or DoubleTensor, :attr:`other` must be
-a real number, otherwise it should be an integer.
+Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+:ref:`type promotion <type-promotion-doc>`, and integer, float, and complex inputs.
 
 Args:
     {input}
-    other (Number): the number to be added to each element of :attr:`input`
+    other (Tensor or Number): the tensor or number to add to input.
 
 Keyword arguments:
+    alpha (Number): the multiplier for :attr:`other`.
     {out}
 
-Example::
+Examples::
 
     >>> a = torch.randn(4)
     >>> a
@@ -240,42 +241,16 @@ def merge_dicts(*dicts):
     >>> torch.add(a, 20)
     tensor([ 20.0202,  21.0985,  21.3506,  19.3944])
 
-.. function:: add(input, other, *, alpha=1, out=None) -> Tensor
-   :noindex:
-
-Each element of the tensor :attr:`other` is multiplied by the scalar
-:attr:`alpha` and added to each element of the tensor :attr:`input`.
-The resulting tensor is returned.
-
-The shapes of :attr:`input` and :attr:`other` must be
-:ref:`broadcastable <broadcasting-semantics>`.
-
-.. math::
-    \text{{out}} = \text{{input}} + \text{{alpha}} \times \text{{other}}
-
-If :attr:`other` is of type FloatTensor or DoubleTensor, :attr:`alpha` must be
-a real number, otherwise it should be an integer.
-
-Args:
-    input (Tensor): the first input tensor
-    other (Tensor): the second input tensor
-
-Keyword args:
-    alpha (Number): the scalar multiplier for :attr:`other`
-    {out}
-
-Example::
-
-    >>> a = torch.randn(4)
-    >>> a
-    tensor([-0.9732, -0.3497,  0.6245,  0.4022])
-    >>> b = torch.randn(4, 1)
+    >>> b = torch.randn(4)
     >>> b
+    tensor([-0.9732, -0.3497,  0.6245,  0.4022])
+    >>> c = torch.randn(4, 1)
+    >>> c
     tensor([[ 0.3743],
             [-1.7724],
             [-0.5811],
             [-0.8017]])
-    >>> torch.add(a, b, alpha=10)
+    >>> torch.add(b, c, alpha=10)
     tensor([[  2.7695,   3.3930,   4.3672,   4.1450],
             [-18.6971, -18.0736, -17.0994, -17.3216],
             [ -6.7845,  -6.1610,  -5.1868,  -5.4090],
@@ -6640,23 +6615,24 @@ def merge_dicts(*dicts):
 add_docstr(torch.mul, r"""
 mul(input, other, *, out=None) -> Tensor
 
-Multiplies each element of the input :attr:`input` with the scalar
-:attr:`other` and returns a new resulting tensor.
+Multiplies :attr:`input` by :attr:`other`.
+
 
 .. math::
-    \text{out}_i = \text{other} \times \text{input}_i
+    \text{out}_i = \text{input}_i \times \text{other}_i
 """ + r"""
-If :attr:`input` is of type `FloatTensor` or `DoubleTensor`, :attr:`other`
-should be a real number, otherwise it should be an integer
+
+Supports :ref:`broadcasting to a common shape <broadcasting-semantics>`,
+:ref:`type promotion <type-promotion-doc>`, and integer, float, and complex inputs.
 
 Args:
     {input}
-    other (Number): the number to be multiplied to each element of :attr:`input`
+    other (Tensor or Number) - the tensor or number to multiply input by.
 
 Keyword args:
     {out}
 
-Example::
+Examples::
 
     >>> a = torch.randn(3)
     >>> a
@@ -6664,38 +6640,16 @@ def merge_dicts(*dicts):
     >>> torch.mul(a, 100)
     tensor([  20.1494,  -42.5491,  260.8663])
 
-.. function:: mul(input, other, *, out=None) -> Tensor
-   :noindex:
-
-Each element of the tensor :attr:`input` is multiplied by the corresponding
-element of the Tensor :attr:`other`. The resulting tensor is returned.
-
-The shapes of :attr:`input` and :attr:`other` must be
-:ref:`broadcastable <broadcasting-semantics>`.
-
-.. math::
-    \text{{out}}_i = \text{{input}}_i \times \text{{other}}_i
-""".format(**common_args) + r"""
-
-Args:
-    input (Tensor): the first multiplicand tensor
-    other (Tensor): the second multiplicand tensor
-
-Keyword args:
-    {out}
-
-Example::
-
-    >>> a = torch.randn(4, 1)
-    >>> a
+    >>> b = torch.randn(4, 1)
+    >>> b
     tensor([[ 1.1207],
             [-0.3137],
             [ 0.0700],
             [ 0.8378]])
-    >>> b = torch.randn(1, 4)
-    >>> b
+    >>> c = torch.randn(1, 4)
+    >>> c
     tensor([[ 0.5146,  0.1216, -0.5244,  2.2382]])
-    >>> torch.mul(a, b)
+    >>> torch.mul(b, c)
     tensor([[ 0.5767,  0.1363, -0.5877,  2.5083],
             [-0.1614, -0.0382,  0.1645, -0.7021],
             [ 0.0360,  0.0085, -0.0367,  0.1567],
@@ -8977,10 +8931,10 @@ def merge_dicts(*dicts):
 
 Args:
     {input}
-    other (Tensor or Scalar): the tensor or scalar to subtract from :attr:`input`
+    other (Tensor or Number): the tensor or number to subtract from :attr:`input`.
 
 Keyword args:
-    alpha (Scalar): the scalar multiplier for :attr:`other`
+    alpha (Number): the multiplier for :attr:`other`.
     {out}
 
 Example::

From 99203580a9e2bb468a9d814f36b6c6a4c13fbed4 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Thu, 19 Aug 2021 12:45:32 -0700
Subject: [PATCH 077/530] Updates internal `assert_allclose` callsites in favor
 of `assert_close` (#61841)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61841

Redo of #60863.

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D30408145

Pulled By: mruberry

fbshipit-source-id: 0b34ebc7f23ba38ecd89640b61d8aca59b7eab58
---
 benchmarks/cpp/tensorexpr/bench_ops.py        |  4 +-
 docs/source/jit.rst                           |  2 +-
 test/mobile/test_bytecode.py                  |  6 +-
 test/mobile/test_lite_script_module.py        | 18 ++---
 test/quantization/core/test_quantized_op.py   | 23 +++----
 .../jit/test_deprecated_jit_quant.py          | 22 +++---
 test/test_fx.py                               |  6 +-
 test/test_fx_experimental.py                  | 10 +--
 test/test_jit.py                              | 26 +++----
 test/test_jit_fuser_te.py                     | 16 ++---
 test/test_mobile_optimizer.py                 | 18 ++---
 test/test_nn.py                               | 69 +++++++++----------
 test/test_pruning_op.py                       |  2 +-
 test/test_reductions.py                       | 40 ++++++-----
 test/test_static_runtime.py                   | 26 +++----
 test/test_tensorexpr.py                       |  8 +--
 test/test_tensorexpr_pybind.py                |  8 +--
 test/test_throughput_benchmark.py             |  3 +-
 test/test_torch.py                            | 18 ++---
 test/test_xnnpack_integration.py              | 30 ++++----
 .../fx2trt/example/fx2trt_example.py          |  2 +-
 torch/jit/_trace.py                           |  9 +--
 torch/testing/_core.py                        | 41 -----------
 torch/testing/_deprecated.py                  | 61 +++++++++++++++-
 .../testing/_internal/common_quantization.py  |  6 +-
 .../_internal/distributed/distributed_test.py | 17 ++---
 26 files changed, 246 insertions(+), 245 deletions(-)

diff --git a/benchmarks/cpp/tensorexpr/bench_ops.py b/benchmarks/cpp/tensorexpr/bench_ops.py
index ca40e5d3c7459..12d766ae74862 100644
--- a/benchmarks/cpp/tensorexpr/bench_ops.py
+++ b/benchmarks/cpp/tensorexpr/bench_ops.py
@@ -59,7 +59,7 @@ def hardswish(x):
         traced(x)
 
     # Validate result.
-    torch.testing.assert_allclose(op(x), traced(x))
+    torch.testing.assert_close(op(x), traced(x))
 
     # Benchmark.
     bench_iters = 100
@@ -94,7 +94,7 @@ def test_batch_norm():
             traced(x, y, z)
 
         # Validate result.
-        torch.testing.assert_allclose(op(x, y, z), traced(x, y, z))
+        torch.testing.assert_close(op(x, y, z), traced(x, y, z))
 
         # Benchmark.
         bench_iters = 100
diff --git a/docs/source/jit.rst b/docs/source/jit.rst
index eeb0d2a2c4ac3..f791c1c687153 100644
--- a/docs/source/jit.rst
+++ b/docs/source/jit.rst
@@ -475,7 +475,7 @@ In this case, data-dependent control flow like this can be captured using
     #print(str(scripted_fn.graph).strip())
 
     for input_tuple in [inputs] + check_inputs:
-        torch.testing.assert_allclose(fn(*input_tuple), scripted_fn(*input_tuple))
+        torch.testing.assert_close(fn(*input_tuple), scripted_fn(*input_tuple))
 
 .. testoutput::
     :hide:
diff --git a/test/mobile/test_bytecode.py b/test/mobile/test_bytecode.py
index 5511e6a63b085..95baa86d5763e 100644
--- a/test/mobile/test_bytecode.py
+++ b/test/mobile/test_bytecode.py
@@ -228,7 +228,7 @@ def test_bytecode_values_for_all_backport_functions(self):
     #             # Load model and run forward method
     #             mobile_module = _load_for_lite_interpreter(str(tmp_input_model_path))
     #             mobile_module_result = mobile_module(module_input)
-    #             torch.testing.assert_allclose(mobile_module_result, expected_mobile_module_result)
+    #             torch.testing.assert_close(mobile_module_result, expected_mobile_module_result)
     #             current_to_version -= 1
 
     #         # Check backport failure case
@@ -270,7 +270,7 @@ def test_backport_bytecode_from_file_to_file(self):
                 module_input = 1
                 mobile_module_result = mobile_module(module_input)
                 expected_mobile_module_result = 3 * torch.ones([2, 4], dtype=torch.float64)
-                torch.testing.assert_allclose(mobile_module_result, expected_mobile_module_result)
+                torch.testing.assert_close(mobile_module_result, expected_mobile_module_result)
                 shutil.rmtree(tmpdirname)
 
     # Check just the _backport_for_mobile_to_buffer mechanism but not the function implementations
@@ -296,7 +296,7 @@ def test_backport_bytecode_from_file_to_buffer(self):
             module_input = 1
             mobile_module_result = mobile_module(module_input)
             expected_mobile_module_result = 3 * torch.ones([2, 4], dtype=torch.float64)
-            torch.testing.assert_allclose(mobile_module_result, expected_mobile_module_result)
+            torch.testing.assert_close(mobile_module_result, expected_mobile_module_result)
 
 
     def test_get_model_ops_and_info(self):
diff --git a/test/mobile/test_lite_script_module.py b/test/mobile/test_lite_script_module.py
index 369371fd3279c..a86669ec574b7 100644
--- a/test/mobile/test_lite_script_module.py
+++ b/test/mobile/test_lite_script_module.py
@@ -48,13 +48,13 @@ def forward(self, x):
         mobile_module = _load_for_lite_interpreter(buffer)
 
         mobile_module_result = mobile_module(input)
-        torch.testing.assert_allclose(script_module_result, mobile_module_result)
+        torch.testing.assert_close(script_module_result, mobile_module_result)
 
         mobile_module_forward_result = mobile_module.forward(input)
-        torch.testing.assert_allclose(script_module_result, mobile_module_forward_result)
+        torch.testing.assert_close(script_module_result, mobile_module_forward_result)
 
         mobile_module_run_method_result = mobile_module.run_method("forward", input)
-        torch.testing.assert_allclose(script_module_result, mobile_module_run_method_result)
+        torch.testing.assert_close(script_module_result, mobile_module_run_method_result)
 
     def test_save_mobile_module_with_debug_info_with_trace(self):
         class A(torch.nn.Module):
@@ -117,13 +117,13 @@ def forward(self, x):
         mobile_module = _load_for_lite_interpreter(buffer)
 
         mobile_module_result = mobile_module(input)
-        torch.testing.assert_allclose(script_module_result, mobile_module_result)
+        torch.testing.assert_close(script_module_result, mobile_module_result)
 
         mobile_module_forward_result = mobile_module.forward(input)
-        torch.testing.assert_allclose(script_module_result, mobile_module_forward_result)
+        torch.testing.assert_close(script_module_result, mobile_module_forward_result)
 
         mobile_module_run_method_result = mobile_module.run_method("forward", input)
-        torch.testing.assert_allclose(script_module_result, mobile_module_run_method_result)
+        torch.testing.assert_close(script_module_result, mobile_module_run_method_result)
 
     def test_find_and_run_method(self):
         class MyTestModule(torch.nn.Module):
@@ -154,7 +154,7 @@ def forward(self, arg):
 
         bundled_inputs = mobile_module.run_method("get_all_bundled_inputs")
         mobile_module_result = mobile_module.forward(*bundled_inputs[0])
-        torch.testing.assert_allclose(script_module_result, mobile_module_result)
+        torch.testing.assert_close(script_module_result, mobile_module_result)
 
     def test_method_calls_with_optional_arg(self):
         class A(torch.nn.Module):
@@ -183,7 +183,7 @@ def forward(self, x, one: int = 1):
         input = torch.tensor([5])
         script_module_forward_result = script_module.forward(input)
         mobile_module_forward_result = mobile_module.forward(input)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_forward_result,
             mobile_module_forward_result
         )
@@ -198,7 +198,7 @@ def forward(self, x, one: int = 1):
 
         # now both match again
         mobile_module_forward_result = mobile_module.forward(input, 2)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_forward_result,
             mobile_module_forward_result
         )
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index d0a2dea45e8e3..6c94586d3101e 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -1617,8 +1617,8 @@ def test_qtopk(self, X, k, dim, largest, sorted):
         quantized_out = torch.topk(qX, k, dim=dim, largest=largest, sorted=sorted)
 
         assert(len(unquantized_out) == len(quantized_out))
-        torch.testing.assert_allclose(quantized_out[0].dequantize(), unquantized_out[0])
-        torch.testing.assert_allclose(quantized_out[1], unquantized_out[1])
+        torch.testing.assert_close(quantized_out[0].dequantize(), unquantized_out[0])
+        torch.testing.assert_close(quantized_out[1], unquantized_out[1])
 
     @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=4, max_dims=4,
                                               min_side=1, max_side=10),
@@ -1643,8 +1643,8 @@ def test_qtopk_nhwc(self, X, k, dim, largest, sorted):
         quantized_out = torch.topk(qX, k, dim=dim, largest=largest, sorted=sorted)
 
         assert(len(unquantized_out) == len(quantized_out))
-        torch.testing.assert_allclose(quantized_out[0].dequantize(), unquantized_out[0])
-        torch.testing.assert_allclose(quantized_out[1], unquantized_out[1])
+        torch.testing.assert_close(quantized_out[0].dequantize(), unquantized_out[0])
+        torch.testing.assert_close(quantized_out[1], unquantized_out[1])
 
 
     """Tests quantize concatenation (both fused and not)."""
@@ -1846,7 +1846,7 @@ def test_cat_nhwc(self, X, relu):
         else:
             out = torch.ops.quantized.cat([qX, qY], dim=1, scale=scale, zero_point=zero_point)
 
-        torch.testing.assert_allclose(out.dequantize(), ref.dequantize())
+        torch.testing.assert_close(out.dequantize(), ref.dequantize())
         self.assertNotEqual(out.stride(), sorted(out.stride()))
 
     @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=1, max_dims=5,
@@ -3400,8 +3400,7 @@ def get_reference_result(
             num_embeddings, embedding_dim, include_last_offset, weights,
             per_sample_weights, indices, offsets)
 
-        torch.testing.assert_allclose(reference_result, result, atol=atol,
-                                      rtol=rtol)
+        torch.testing.assert_close(reference_result, result, atol=atol, rtol=rtol)
 
 
         if bit_rate == 8 or bit_rate == 4:
@@ -3424,7 +3423,7 @@ def get_reference_result(
                         per_sample_weights=per_sample_weights,
                         compressed_indices_mapping=torch.tensor(mapping_table),
                         include_last_offset=include_last_offset)
-            torch.testing.assert_allclose(reference_result, result, atol=atol, rtol=rtol)
+            torch.testing.assert_close(reference_result, result, atol=atol, rtol=rtol)
 
 
@@ -3510,7 +3509,7 @@ def test_embedding_byte(self, num_embeddings, embedding_dim):
         qresult = quant_op(packed_weight, indices, pruned_weights=False)
 
         ref = torch.embedding(weights, indices, padding_idx=-1, scale_grad_by_freq=False, sparse=False)
-        torch.testing.assert_allclose(ref, qresult, atol=0.005, rtol=1e-3)
+        torch.testing.assert_close(ref, qresult, atol=0.005, rtol=1e-3)
 
 
     def test_embedding_2d_indices(self):
@@ -3533,7 +3532,7 @@ def test_embedding_2d_indices(self):
         qweight = torch.quantize_per_channel(weights, qparams[0], qparams[1], axis=0, dtype=torch.quint8)
         packed_weight = prepack_op(qweight)
         qresult = quant_op(packed_weight, indices, pruned_weights=False)
-        torch.testing.assert_allclose(ref, qresult, atol=0.05, rtol=1e-3)
+        torch.testing.assert_close(ref, qresult, atol=0.05, rtol=1e-3)
 
     def test_embedding_bag_2d_indices(self):
         """
@@ -3555,7 +3554,7 @@ def test_embedding_bag_2d_indices(self):
         pt_prepack_op = torch.ops.quantized.embedding_bag_byte_prepack
         q_weights = pt_prepack_op(weights)
         qresult = pt_op(q_weights, indices, mode=0, pruned_weights=False)
-        torch.testing.assert_allclose(result, qresult, atol=0.05, rtol=1e-3)
+        torch.testing.assert_close(result, qresult, atol=0.05, rtol=1e-3)
 
         # Test TorchBind based embedding_bag operator
         obs = PerChannelMinMaxObserver(dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0)
@@ -3569,7 +3568,7 @@ def test_embedding_bag_2d_indices(self):
         packed_weight = torch.ops.quantized.embedding_bag_prepack(qweight)
         qresult = torch.ops.quantized.embedding_bag_byte(packed_weight, indices, mode=0)
 
-        torch.testing.assert_allclose(result, qresult, atol=0.05, rtol=1e-3)
+        torch.testing.assert_close(result, qresult, atol=0.05, rtol=1e-3)
 
 
 class TestQuantizedConv(TestCase):
diff --git a/test/quantization/jit/test_deprecated_jit_quant.py b/test/quantization/jit/test_deprecated_jit_quant.py
index 662ead35bcf01..68ddb5c346a49 100644
--- a/test/quantization/jit/test_deprecated_jit_quant.py
+++ b/test/quantization/jit/test_deprecated_jit_quant.py
@@ -99,7 +99,7 @@ def forward(self, x: torch.Tensor, hiddens: torch.Tensor) -> torch.Tensor:
 
             self.assertEqual(len(outs), len(ref_outs))
             for out, ref_out in zip(outs, ref_outs):
-                torch.testing.assert_allclose(out, ref_out)
+                torch.testing.assert_close(out, ref_out)
 
     @skipIfNoFBGEMM
     def test_rnn_quantized(self):
@@ -165,32 +165,32 @@ def test_rnn_quantized(self):
             # Compare int8 quantized to unquantized
             output_int8, final_hiddens_int8 = cell_int8(x, hiddens)
 
-            torch.testing.assert_allclose(output_int8, ref_out)
+            torch.testing.assert_close(output_int8, ref_out)
             for out, ref in zip(final_hiddens_int8, ref_hid):
-                torch.testing.assert_allclose(out, ref)
+                torch.testing.assert_close(out, ref)
 
             # Compare fp16 quantized to unquantized
             output_fp16, final_hiddens_fp16 = cell_fp16(x, hiddens)
 
-            torch.testing.assert_allclose(output_fp16, ref_out)
+            torch.testing.assert_close(output_fp16, ref_out)
             for out, ref in zip(final_hiddens_fp16, ref_hid):
-                torch.testing.assert_allclose(out, ref)
+                torch.testing.assert_close(out, ref)
 
             def compare_quantized_unquantized(ScriptWrapper, cell):
                 wrapper = ScriptWrapper(cell)
 
                 # Compare quantize scripted module to unquantized
                 script_out, script_hid = wrapper(x, hiddens)
-                torch.testing.assert_allclose(script_out, ref_out)
+                torch.testing.assert_close(script_out, ref_out)
                 for out, ref in zip(script_hid, ref_hid):
-                    torch.testing.assert_allclose(out, ref)
+                    torch.testing.assert_close(out, ref)
 
                 # Compare export/import to unquantized
                 export_import_wrapper = self.getExportImportCopyWithPacking(wrapper)
                 ei_out, ei_hid = export_import_wrapper(x, hiddens)
-                torch.testing.assert_allclose(ei_out, ref_out)
+                torch.testing.assert_close(ei_out, ref_out)
                 for out, ref in zip(ei_hid, ref_hid):
-                    torch.testing.assert_allclose(out, ref)
+                    torch.testing.assert_close(out, ref)
 
             if isinstance(cell, torch.jit.quantized.QuantizedGRU):
                 class ScriptWrapper(torch.jit.ScriptModule):
@@ -252,8 +252,8 @@ def forward(self, x):
             fb_fp16 = self.getExportImportCopyWithPacking(traced_fp16)
             y_fp16 = fb_fp16(value)
 
-            torch.testing.assert_allclose(y_int8, y_ref, rtol=0.0001, atol=1e-3)
-            torch.testing.assert_allclose(y_fp16, y_ref, rtol=0.0001, atol=1e-3)
+            torch.testing.assert_close(y_int8, y_ref, rtol=0.0001, atol=1e-3)
+            torch.testing.assert_close(y_fp16, y_ref, rtol=0.0001, atol=1e-3)
 
     @skipIfNoFBGEMM
     def test_erase_class_tensor_shapes(self):
diff --git a/test/test_fx.py b/test/test_fx.py
index e39469d0a0676..c55e97dc7da84 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -593,17 +593,17 @@ def __init__(self, interpreter):
         x = torch.rand(3, 4)
         ref_out = msm(x)
         test_out = lowered(x)
-        torch.testing.assert_allclose(test_out, ref_out)
+        torch.testing.assert_close(test_out, ref_out)
 
         # Test TorchScript compilation
         scripted_lowered = torch.jit.script(lowered)
         script_out = scripted_lowered(x)
-        torch.testing.assert_allclose(script_out, ref_out)
+        torch.testing.assert_close(script_out, ref_out)
 
         # Test TorchScript ser/de
         import_copy = self.getExportImportCopy(scripted_lowered)
         imported_out = import_copy(x)
-        torch.testing.assert_allclose(imported_out, ref_out)
+        torch.testing.assert_close(imported_out, ref_out)
 
     def test_reserved_getattr(self):
         """Ensure that we do not name any nodes with a reserved builtin like `getattr`"""
diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py
index 00f3201452964..f000b0af59598 100644
--- a/test/test_fx_experimental.py
+++ b/test/test_fx_experimental.py
@@ -876,7 +876,7 @@ def forward(self, x, y):
             traced = symbolic_trace(WrapperMod())
             normalized = NormalizeOperators(traced).transform()
             x, y = torch.randn(3, 4), torch.randn(3, 4)
-            torch.testing.assert_allclose(traced(x, y), normalized(x, y))
+            torch.testing.assert_close(traced(x, y), normalized(x, y))
             self.assertFalse(
                 any(n.target in ops_to_test for n in normalized.graph.nodes)
             )
@@ -891,7 +891,7 @@ def forward(self, x):
             traced = symbolic_trace(WrapperMod())
             normalized = NormalizeOperators(traced).transform()
             x = torch.randn(3, 4)
-            torch.testing.assert_allclose(traced(x), normalized(x))
+            torch.testing.assert_close(traced(x), normalized(x))
             self.assertFalse(
                 any(n.target in ops_to_test for n in normalized.graph.nodes)
             )
@@ -1413,12 +1413,12 @@ def forward(self, x):
         with torch.no_grad():
             model = Foo().eval()
             optimized_model = optimization.optimize_for_inference(model)
-            torch.testing.assert_allclose(model(inp), optimized_model(inp))
+            torch.testing.assert_close(model(inp), optimized_model(inp))
 
             optimized_model2 = optimization.optimize_for_inference(
                 model, pass_config={"remove_dropout": False}
             )
-            torch.testing.assert_allclose(model(inp), optimized_model2(inp))
+            torch.testing.assert_close(model(inp), optimized_model2(inp))
 
     @skipIfNoTorchVision
     @skipIfNoMkldnn
@@ -1450,7 +1450,7 @@ def test_optimize_for_inference_cpu_torchvision(self):
 
                 orig_out = model(inp)
                 new_out = optimized_model(inp)
-                torch.testing.assert_allclose(orig_out, new_out)
+                torch.testing.assert_close(orig_out, new_out)
 
 
 class TestNormalizeOperators(JitTestCase):
diff --git a/test/test_jit.py b/test/test_jit.py
index 99df960da5dc4..2dd0d4764c46c 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -497,7 +497,7 @@ def forward(self, a, b, c):
         FileCheck().check_not("aten::relu(") \
             .check("aten::_add_relu(") \
             .run(m.graph)
-        torch.testing.assert_allclose(orig_res, new_res)
+        torch.testing.assert_close(orig_res, new_res)
 
         # add, relu_
         a = torch.rand((7, 11))
@@ -516,7 +516,7 @@ def forward(self, a, b, c):
         FileCheck().check_not("aten::relu_(") \
             .check("aten::_add_relu(") \
             .run(m.graph)
-        torch.testing.assert_allclose(orig_res, new_res)
+        torch.testing.assert_close(orig_res, new_res)
 
         class Madd_(torch.nn.Module):
             def __init__(self, relu_op):
@@ -547,10 +547,10 @@ def forward(self, a, b):
             .check_not("aten::relu_(") \
             .check("aten::_add_relu_(") \
             .run(m.graph)
-        torch.testing.assert_allclose(orig_res, new_res)
+        torch.testing.assert_close(orig_res, new_res)
         # Since _add_relu_ does inplace mutation ensure
         # a_copy is modified
-        torch.testing.assert_allclose(orig_res, a_copy)
+        torch.testing.assert_close(orig_res, a_copy)
 
         class Madd_out(torch.nn.Module):
             def __init__(self, relu_op):
@@ -585,10 +585,10 @@ def forward(self, a, b):
             .check_not("aten::relu_(") \
             .check("aten::_add_relu(") \
             .run(m.graph)
-        torch.testing.assert_allclose(orig_res, new_res)
+        torch.testing.assert_close(orig_res, new_res)
         # Since _add_relu_ with out=a does inplace mutation ensure
         # a_copy is modified
-        torch.testing.assert_allclose(orig_res, a_copy)
+        torch.testing.assert_close(orig_res, a_copy)
 
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "Simple executor doesn't have shape information")
     def test_peephole_optimize_shape_ops(self):
@@ -8888,7 +8888,7 @@ def forward(self, x):
     def test_pack_unpack_state(self):
         sm = TestScript.DerivedStateModule()
         x = torch.rand(3, 4, dtype=torch.float)
-        torch.testing.assert_allclose(sm(x), x + torch.neg(torch.ones(3, 4, dtype=torch.float)))
+        torch.testing.assert_close(sm(x), x + torch.neg(torch.ones(3, 4, dtype=torch.float)))
 
         # Test save path
         self.assertFalse(sm.pack_called.item())
@@ -8899,11 +8899,11 @@ def test_pack_unpack_state(self):
         # ensure unpack was called after serialization so as to leave the module in an initialized state
         self.assertTrue(sm.unpack_called.item())
 
-        torch.testing.assert_allclose(sm.derived, torch.neg(sm.param))
+        torch.testing.assert_close(sm.derived, torch.neg(sm.param))
 
         # Test load paths
         self.assertTrue(imported.unpack_called.item())
-        torch.testing.assert_allclose(imported(x), x + torch.neg(torch.ones(3, 4, dtype=torch.float)))
+        torch.testing.assert_close(imported(x), x + torch.neg(torch.ones(3, 4, dtype=torch.float)))
 
     @unittest.skipIf(not TEST_MKL, "PyTorch is built without MKL support")
     def test_torch_functional(self):
@@ -9101,11 +9101,11 @@ def forward(self, x):
                 return self.submod(x + self.buf)
 
         m = Mod()
-        torch.testing.assert_allclose(m(torch.zeros(3, 4)), torch.ones(3, 4) * 6)
+        torch.testing.assert_close(m(torch.zeros(3, 4)), torch.ones(3, 4) * 6)
         m.apply(lambda s: s._pack())
-        torch.testing.assert_allclose(m(torch.zeros(3, 4)), torch.zeros(3, 4))
+        torch.testing.assert_close(m(torch.zeros(3, 4)), torch.zeros(3, 4))
         m.apply(lambda s: s._unpack())
-        torch.testing.assert_allclose(m(torch.zeros(3, 4)), torch.ones(3, 4) * 6)
+        torch.testing.assert_close(m(torch.zeros(3, 4)), torch.ones(3, 4) * 6)
 
     def test_torch_any(self):
         def fn(x):
@@ -10958,7 +10958,7 @@ def forward(self, x):
         torch._C._jit_pass_remove_dropout(m._c)
         res = m(data)
         FileCheck().check_not("aten::dropout").run(str(m.graph))
-        torch.testing.assert_allclose(ref_res, res, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_res, res, rtol=1e-2, atol=1e-3)
 
     def test_unfold_zero_dim(self):
         def fn(x):
diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index ba47547256b75..64c26b7936b54 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -1186,7 +1186,7 @@ def fn(input_v, mask):
             ref = fn(input_v, mask)
             try:
                 t = torch.jit.trace(fn, (input_v, mask))
-                torch.testing.assert_allclose(ref, t(input_v, mask))
+                torch.testing.assert_close(ref, t(input_v, mask))
                 print(torch.jit.last_executed_optimized_graph())
                 self.assertLastGraphAllFused()
             except Exception as e:
@@ -1287,7 +1287,7 @@ def apply(fn):
                 continue
             try:
                 t = torch.jit.trace(fn, (x,))
-                torch.testing.assert_allclose(ref, t(x))
+                torch.testing.assert_close(ref, t(x))
                 self.assertAllFused(t.graph_for(x))
             except Exception as e:
                 raise RuntimeError(
@@ -1683,7 +1683,7 @@ def eager(t0, t1, t2, t3, t4):
             for _ in range(4):
                 for pair in zip(script(*inputs), eager(*inputs)):
                     test, ref = pair
-                    torch.testing.assert_allclose(test, ref)
+                    torch.testing.assert_close(test, ref)
                     self.assertAllFused(script.graph_for(*inputs))
 
     def test_sub_gt_and(self):
@@ -1776,10 +1776,10 @@ def eager(x, y):
                 one = torch.tensor([[1]]).to(dtype2)
                 script = torch.jit.trace(eager, (x, zero))
                 for _ in range(3):
-                    torch.testing.assert_allclose(
+                    torch.testing.assert_close(
                         script(x, zero),
                         eager(x, zero))
-                    torch.testing.assert_allclose(
+                    torch.testing.assert_close(
                         script(x, one),
                         eager(x, one))
                 self.assertAllFused(script.graph_for(x, one))
@@ -1824,7 +1824,7 @@ def _test_fwd_bwd(self, fn):
                 xs -= 0.1 * xs.grad
                 x.grad = None
                 xs.grad = None
-        torch.testing.assert_allclose(y, ys)
+        torch.testing.assert_close(y, ys)
 
     def test_relu_fwd_bwd(self):
         def eager(x):
@@ -1907,12 +1907,12 @@ def eager(x):
             for _ in range(3):
                 script(x)
 
-            torch.testing.assert_allclose(eager(x), script(x))
+            torch.testing.assert_close(eager(x), script(x))
 
             # Now when an input hits the unrolled path, it will produce an
             # incorrectly-sized tensor, since size=1 has been burned in.
             x = torch.ones((8, 1))
-            torch.testing.assert_allclose(eager(x), script(x))
+            torch.testing.assert_close(eager(x), script(x))
 
 works_list = [
     '__radd__',
diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py
index 78ebb550d0227..19f07e2454488 100644
--- a/test/test_mobile_optimizer.py
+++ b/test/test_mobile_optimizer.py
@@ -119,7 +119,7 @@ def forward(self, x):
                    .check_not("aten::relu(") \
                    .check_count("aten::_add_relu(", 1, exactly=True) \
                    .run(optimized_scripted_model.graph)
-        torch.testing.assert_allclose(initial_result, optimized_result, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(initial_result, optimized_result, rtol=1e-2, atol=1e-3)
 
         FileCheck().check_not("Tensor = aten::conv2d") \
                    .check_not("Tensor = prim::CallFunction") \
@@ -131,7 +131,7 @@ def forward(self, x):
                    .check_not("aten::relu(") \
                    .check_count("aten::_add_relu(", 1, exactly=True) \
                    .run(optimized_scripted_model.foo.graph)
-        torch.testing.assert_allclose(initial_foo_result, optimized_foo_result, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(initial_foo_result, optimized_foo_result, rtol=1e-2, atol=1e-3)
 
 
         optimization_blocklist_no_prepack = {MobileOptimizerType.INSERT_FOLD_PREPACK_OPS}
@@ -142,7 +142,7 @@ def forward(self, x):
                    .check_not("prepacked::linear_clamp_run") \
                    .check_not("prepacked::conv2d_clamp_run") \
                    .run(optimized_scripted_model_no_prepack.graph)
-        torch.testing.assert_allclose(initial_result, optimized_result_no_prepack, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(initial_result, optimized_result_no_prepack, rtol=1e-2, atol=1e-3)
 
 
         bn_test_module = BNTestModule()
@@ -157,14 +157,14 @@ def forward(self, x):
         bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blocklist_no_prepack)
         self.assertEqual(len(torch.jit.export_opnames(bn_fold_scripted_module)), 1)
         bn_input = torch.rand(1, 1, 6, 6)
-        torch.testing.assert_allclose(bn_scripted_module(bn_input), bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(bn_scripted_module(bn_input), bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3)
 
         optimization_blocklist_no_fold_bn = {MobileOptimizerType.CONV_BN_FUSION}
         no_bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blocklist_no_fold_bn)
         FileCheck().check_count("aten::batch_norm", 1, exactly=True) \
                    .run(str(get_forward_graph(no_bn_fold_scripted_module._c)))
         bn_input = torch.rand(1, 1, 6, 6)
-        torch.testing.assert_allclose(bn_scripted_module(bn_input), no_bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(bn_scripted_module(bn_input), no_bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3)
 
         class MyMobileOptimizedTagTest(torch.nn.Module):
             def __init__(self):
@@ -231,7 +231,7 @@ def foo(self, x):
         FileCheck().check_not("dropout.__") \
             .check_count("aten::_add_relu(", 1, exactly=True) \
             .run(optimized_scripted_model.foo.graph)
-        torch.testing.assert_allclose(initial_result, optimized_result, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(initial_result, optimized_result, rtol=1e-2, atol=1e-3)
 
         class BNTestNoForwardModule(torch.nn.Module):
             def __init__(self):
@@ -257,7 +257,7 @@ def foo(self, x):
         bn_fold_no_forward_scripted_module = optimize_for_mobile(bn_no_forward_scripted_module, preserved_methods=['foo'])
         self.assertEqual(len(torch.jit.export_opnames(bn_fold_no_forward_scripted_module)), 1)
         bn_input = torch.rand(1, 1, 6, 6)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             bn_no_forward_scripted_module.foo(bn_input),
             bn_fold_no_forward_scripted_module.foo(bn_input),
             rtol=1e-2,
@@ -493,7 +493,7 @@ def _quant_script_and_optimize(model):
             data = torch.randn(4, 1, 4, 4)
             m_res = m(data)
             m_optim_res = m_optim(data)
-            torch.testing.assert_allclose(m_res, m_optim_res, rtol=1e-2, atol=1e-3)
+            torch.testing.assert_close(m_res, m_optim_res, rtol=1e-2, atol=1e-3)
 
             # generic case
 
@@ -507,7 +507,7 @@ def _quant_script_and_optimize(model):
             data = torch.randn(4, 1, 4, 4)
             m_res = m(data)
             m_optim_res = m_optim(data)
-            torch.testing.assert_allclose(m_res, m_optim_res, rtol=1e-2, atol=1e-3)
+            torch.testing.assert_close(m_res, m_optim_res, rtol=1e-2, atol=1e-3)
 
     @unittest.skipUnless(HAS_TORCHVISION, "Needs torchvision")
     def test_mobilenet_optimize_for_mobile(self):
diff --git a/test/test_nn.py b/test/test_nn.py
index ccf6f6e933c10..d21e0477715db 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -4717,7 +4717,7 @@ def fc_op(X, W, b):
         packed_w_tensor = torch.fbgemm_pack_gemm_matrix_fp16(w_tensor)
         actual_output = torch.fbgemm_linear_fp16_weight(x_tensor, packed_w_tensor, b_tensor)
         expected_output = fc_op(X, W, b)
-        torch.testing.assert_allclose(expected_output, actual_output.cpu(), atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(torch.from_numpy(expected_output), actual_output.cpu(), atol=1e-3, rtol=1e-3)
 
     def test_embeddingbag_from_pretrained(self):
         a = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
@@ -6797,8 +6797,7 @@ def perm_fn(x):
             encoder_input = torch.tensor([[[20., 30., 40., 50.]]])
             result = model(encoder_input)
             ref_output = torch.tensor([[[2.249815, 0.131006, -0.702199, 0.177868]]])
-            self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output)
+            torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0)
 
             # deterministic input
             encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]],
@@ -6806,8 +6805,7 @@ def perm_fn(x):
             result = model(encoder_input)
             ref_output = perm_fn(torch.tensor([[[2.264103, 0.121417, -0.696012, 0.159724]],
                                                [[2.264103, 0.121417, -0.696012, 0.159724]]]))
-            self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output)
+            torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0)
 
             # deterministic input
             encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891],
@@ -6831,8 +6829,7 @@ def perm_fn(x):
                                                 [2.4237977, 0.03290575, -0.60561789, -0.05940082]],
                                                [[2.41383916, 0.02686345, -0.61256377, -0.06380707],
                                                 [2.42000277, 0.03800944, -0.60824798, -0.04754947]]]))
-            self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output)
+            torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0)
 
     def test_transformerdecoderlayer(self):
         # this is a deterministic test for TransformerDecoderLayer
@@ -7013,8 +7010,7 @@ def perm_fn(x):
             memory_input = torch.tensor([[[60., 70., 80., 90.]]])
             result = model(decoder_input, memory_input)
             ref_output = torch.tensor([[[2.306435, 0.095946, -0.675796, 0.10687]]])
-            self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output)
+            torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0)
 
             # deterministic input
             decoder_input = perm_fn(torch.tensor([[[9., 10., 11., 12.]],
@@ -7023,8 +7019,7 @@ def perm_fn(x):
             result = model(decoder_input, memory_input)
             ref_output = perm_fn(torch.tensor([[[2.415448, 0.054389, -0.610932, -0.0156613]],
                                                [[2.415448, 0.054389, -0.610932, -0.0156613]]]))
-            self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output)
+            torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0)
 
             # deterministic input
             decoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]],
@@ -7034,8 +7029,7 @@ def perm_fn(x):
             result = model(decoder_input, memory_input)
             ref_output = perm_fn(torch.tensor([[[2.338531, 0.087709, -0.65776, 0.080646]],
                                                [[2.338531, 0.087709, -0.65776, 0.080646]]]))
-            self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output)
+            torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0)
 
             # deterministic input
             decoder_input = perm_fn(torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034],
@@ -7061,8 +7055,7 @@ def perm_fn(x):
                                                 [2.42216881, 0.03586554, -0.6067524, -0.05289126]],
                                                [[2.42205716, 0.03488046, -0.60683681, -0.05460596],
                                                 [2.42240309, 0.0354595, -0.60659063, -0.05378816]]]))
-            self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output)
+            torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0)
 
     def test_transformerencoder(self):
         def get_a_test_layer(use_cuda, activation, batch_first=False):
@@ -7130,13 +7123,13 @@ def perm_fn(x):
                                                 [2.422901, 0.024187, -0.606178, -0.074929]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             # all 0
             mask = torch.zeros([2, 5]).to(device) == 1
             result = model(encoder_input, src_key_padding_mask=mask)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
             mask[0, 1] = 1
             mask[1, 3] = 1
             mask[1, 4] = 1
@@ -7153,7 +7146,7 @@ def perm_fn(x):
                                                 [2.4242, 0.024653, -0.605266, -0.074959]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             # test case 2, multiple layers no norm
             model = nn.TransformerEncoder(encoder_layer, 2).to(device)
@@ -7170,7 +7163,7 @@ def perm_fn(x):
                                                 [2.419075, 0.017449, -0.608722, -0.085014]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             model = nn.TransformerEncoder(encoder_layer, 6).to(device)
             result = model(encoder_input, src_key_padding_mask=mask)
@@ -7186,7 +7179,7 @@ def perm_fn(x):
                                                 [2.419101, 0.017453, -0.608704, -0.085025]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             # test case 3, multiple layers with norm
             # d_model = 4
@@ -7205,7 +7198,7 @@ def perm_fn(x):
                                                 [1.695952, -0.357637, -0.893065, -0.445251]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             model = nn.TransformerEncoder(encoder_layer, 6, norm=norm).to(device)
             result = model(encoder_input, src_key_padding_mask=mask)
@@ -7221,7 +7214,7 @@ def perm_fn(x):
                                                 [1.695955, -0.357639, -0.893051, -0.445265]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
 
     def test_transformerdecoder(self):
@@ -7271,7 +7264,7 @@ def perm_fn(x):
             ref_output = torch.tensor(
                 [[[2.314351, 0.094805, -0.671322, 0.101977]]]).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-3)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-3)
 
             # deterministic input
             decoder_input = perm_fn(torch.tensor([[[9., 10., 11., 12.]],
@@ -7282,7 +7275,7 @@ def perm_fn(x):
                                                [[2.422245, 0.051716, -0.606338, -0.024756]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-4)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-4)
 
             # deterministic input
             decoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]],
@@ -7294,7 +7287,7 @@ def perm_fn(x):
                                                [[2.343536, 0.085561, -0.654954, 0.074991]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-4)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-4)
 
             # deterministic input
             decoder_input = perm_fn(torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034],
@@ -7324,7 +7317,7 @@ def perm_fn(x):
                                                 [2.432306, 0.028858, -0.599542, -0.072846]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             # key_padding_mask
             key_padding_mask = torch.zeros(2, 3).to(device) == 1
@@ -7338,7 +7331,7 @@ def perm_fn(x):
                                                 [2.432306, 0.028858, -0.599542, -0.072846]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             # key_padding_mask
             key_padding_mask[0, 2] = 1
@@ -7354,7 +7347,7 @@ def perm_fn(x):
                                                 [2.432659, 0.029244, -0.599294, -0.072382]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             # memory_key_padding_mask
             key_padding_mask = torch.zeros(2, 5).to(device) == 1
@@ -7368,7 +7361,7 @@ def perm_fn(x):
                                                 [2.432306, 0.028858, -0.599542, -0.072846]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             # memory_key_padding_mask
             key_padding_mask[0, 4] = 1
@@ -7385,7 +7378,7 @@ def perm_fn(x):
                                                 [2.433075, 0.028543, -0.598987, -0.073985]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             # multiple layers no norm
             model = nn.TransformerDecoder(decoder_layer, 2).to(device)
@@ -7397,7 +7390,7 @@ def perm_fn(x):
             ref_output = torch.tensor(
                 [[[2.31316, 0.0950293, -0.671995, 0.102802]]]).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-3)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-3)
 
             # multiple layers no norm
             model = nn.TransformerDecoder(decoder_layer, 6).to(device)
@@ -7430,7 +7423,7 @@ def perm_fn(x):
                                                 [2.43113, 0.0279516, -0.600376, -0.0736896]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             # multiple layers with norm
             # d_model = 4
@@ -7444,7 +7437,7 @@ def perm_fn(x):
             ref_output = torch.tensor(
                 [[[1.66166, -0.326986, -1.01466, -0.320017]]]).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-3)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-3)
 
             # multiple layers with norm
             model = nn.TransformerDecoder(decoder_layer, 6, norm=norm).to(device)
@@ -7477,7 +7470,7 @@ def perm_fn(x):
                                                 [1.69571, -0.357363, -0.894154, -0.444196]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
             # gelu activation test cases
             activation = "gelu"
@@ -7495,7 +7488,7 @@ def perm_fn(x):
             result = model(decoder_input, memory_input)
             ref_output = torch.tensor([[[2.306435, 0.095946, -0.675796, 0.10687]]]).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-3)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-3)
 
             # deterministic input
             decoder_input = perm_fn(torch.tensor([[[9., 10., 11., 12.]],
@@ -7505,7 +7498,7 @@ def perm_fn(x):
             ref_output = perm_fn(torch.tensor([[[2.415448, 0.054389, -0.610932, -0.0156613]],
                                                [[2.415448, 0.054389, -0.610932, -0.0156613]]])).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-4)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-4)
 
             # deterministic input
             decoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]],
@@ -7516,7 +7509,7 @@ def perm_fn(x):
             ref_output = perm_fn(torch.tensor([[[2.338531, 0.087709, -0.65776, 0.080646]],
                                                [[2.338531, 0.087709, -0.65776, 0.080646]]])).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-4)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-4)
 
             # deterministic input
             decoder_input = perm_fn(torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034],
@@ -7546,7 +7539,7 @@ def perm_fn(x):
                                                 [2.42240309, 0.0354595, -0.60659063, -0.05378816]]]
                                               )).to(device)
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
-            torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5)
+            torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
     @unittest.skipIf(not (TEST_CUDNN and TEST_MULTIGPU), 'CUDNN or multi-gpu not available')
     def test_cudnn_rnn_dropout_states_device(self):
diff --git a/test/test_pruning_op.py b/test/test_pruning_op.py
index 28f31aeabd705..97a499b03ac15 100644
--- a/test/test_pruning_op.py
+++ b/test/test_pruning_op.py
@@ -50,7 +50,7 @@ def get_reference_result(embedding_weights, mask, indices_type):
         ref_pruned_weights, ref_compressed_indices_map = get_reference_result(
             embedding_weights, mask, indices_type)
 
-        torch.testing.assert_allclose(pt_pruned_weights, ref_pruned_weights)
+        torch.testing.assert_close(pt_pruned_weights, ref_pruned_weights)
         self.assertEqual(pt_compressed_indices_map, ref_compressed_indices_map)
         self.assertEqual(pt_compressed_indices_map.dtype, indices_type)
 
diff --git a/test/test_reductions.py b/test/test_reductions.py
index 42edfb3817ce1..f3f0d4c936451 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -2664,36 +2664,38 @@ def test_tensor_reduce_ops_empty(self, device):
             self.assertEqual(np_function(np_input, axis=-1), fn(master_input, dim=-1).cpu().numpy(), msg=error_msg,
                              exact_dtype=False)
 
-            self.assertEqual(torch.empty((2, 0, 1), device=device), fn(master_input, dim=2, keepdim=True), msg=error_msg)
+            self.assertEqual(torch.empty((2, 0, 1), device=device), fn(master_input, dim=2, keepdim=True),
+                             msg=error_msg)
             self.assertEqual(np_function(np_input, axis=2, keepdims=True), fn(master_input, dim=2, keepdim=True),
                              msg=error_msg, exact_dtype=False)
 
-            self.assertEqual(torch.empty((2, 0, 1), device=device), fn(master_input, dim=-1, keepdim=True), msg=error_msg)
+            self.assertEqual(torch.empty((2, 0, 1), device=device), fn(master_input, dim=-1, keepdim=True),
+                             msg=error_msg)
             self.assertEqual(np_function(np_input, axis=-1, keepdims=True), fn(master_input, dim=-1, keepdim=True),
                              msg=error_msg, exact_dtype=False)
 
-            # Check if returned data is correct.
-            check_func = (torch.testing.assert_allclose if math.isnan(return_value) or math.isinf(return_value) else
-                          self.assertEqual)
-
-            check_func(torch.full((2, 4), return_value, device=device), fn(master_input, dim=1), msg=error_msg)
-            check_func(torch.full((2, 4), return_value, device=device), fn(master_input, dim=-2), msg=error_msg)
-            check_func(torch.full((2, 1, 4), return_value, device=device), fn(master_input, dim=1, keepdim=True), msg=error_msg)
-            check_func(torch.full((2, 1, 4), return_value, device=device), fn(master_input, dim=-2, keepdim=True), msg=error_msg)
+            self.assertEqual(torch.full((2, 4), return_value, device=device), fn(master_input, dim=1), msg=error_msg)
+            self.assertEqual(torch.full((2, 4), return_value, device=device), fn(master_input, dim=-2), msg=error_msg)
+            self.assertEqual(torch.full((2, 1, 4), return_value, device=device), fn(master_input, dim=1, keepdim=True),
+                             msg=error_msg)
+            self.assertEqual(torch.full((2, 1, 4), return_value, device=device), fn(master_input, dim=-2, keepdim=True),
+                             msg=error_msg)
 
             if name != 'logsumexp':
                 # The scipy function does not work for reduction the zero dimension
-                check_func(np.float32(np_function(np_input, axis=1)), fn(master_input, dim=1).cpu().numpy(), msg=error_msg)
-                check_func(np.float32(np_function(np_input, axis=-2)), fn(master_input, dim=-2).cpu().numpy(), msg=error_msg)
-                check_func(np.float32(np_function(np_input, axis=1, keepdims=True)),
-                           fn(master_input, dim=1, keepdim=True).cpu().numpy(),
-                           msg=error_msg)
-                check_func(np.float32(np_function(np_input, axis=-2, keepdims=True)),
-                           fn(master_input, dim=-2, keepdim=True).cpu().numpy(),
-                           msg=error_msg)
+                self.assertEqual(np.float32(np_function(np_input, axis=1)), fn(master_input, dim=1).cpu().numpy(),
+                                 msg=error_msg)
+                self.assertEqual(np.float32(np_function(np_input, axis=-2)), fn(master_input, dim=-2).cpu().numpy(),
+                                 msg=error_msg)
+                self.assertEqual(np.float32(np_function(np_input, axis=1, keepdims=True)),
+                                 fn(master_input, dim=1, keepdim=True).cpu().numpy(),
+                                 msg=error_msg)
+                self.assertEqual(np.float32(np_function(np_input, axis=-2, keepdims=True)),
+                                 fn(master_input, dim=-2, keepdim=True).cpu().numpy(),
+                                 msg=error_msg)
 
                 # logsumexp throws a type error when not specifying dim so test separately.
-                check_func(torch.full((), return_value, device=device), fn(master_input), msg=error_msg)
+                self.assertEqual(torch.full((), return_value, device=device), fn(master_input), msg=error_msg)
             else:
                 self.assertRaises(TypeError, lambda: fn(master_input))
 
diff --git a/test/test_static_runtime.py b/test/test_static_runtime.py
index 9b38a5a7e36a8..94043e2745626 100644
--- a/test/test_static_runtime.py
+++ b/test/test_static_runtime.py
@@ -186,10 +186,10 @@ def test_multihead_attention_layer(self):
         o_test_kw = attention_a(src, src, value=src, mask=src_mask)
 
         for a, b in zip(o_ref, o_test):
-            torch.testing.assert_allclose(a, b)
+            torch.testing.assert_close(a, b)
 
         for a, b in zip(o_ref, o_test_kw):
-            torch.testing.assert_allclose(a, b)
+            torch.testing.assert_close(a, b)
 
     def test_multihead_attention_layer_benchmark(self):
         HID_DIM = 256
@@ -228,20 +228,20 @@ def test_mlp(self):
             top_inp = torch.randn(2048, 100)  # torch.Size([2048, 100])
         ref_bot = bot_l(bot_inp)
         acc_bot = bot_l_acc(bot_inp)[0]
-        torch.testing.assert_allclose(acc_bot, ref_bot)
+        torch.testing.assert_close(acc_bot, ref_bot)
         ref_top = top_l(top_inp)
         acc_top = top_l_acc(top_inp)[0]
-        torch.testing.assert_allclose(acc_top, ref_top)
+        torch.testing.assert_close(acc_top, ref_top)
         for _ in range(5):
             with torch.no_grad():
                 bot_inp = torch.randn(2048, 512)  # torch.Size([2048, 512])
                 top_inp = torch.randn(2048, 100)  # torch.Size([2048, 100])
             ref_bot = bot_l(bot_inp)
             acc_bot = bot_l_acc(bot_inp)[0]
-            torch.testing.assert_allclose(acc_bot, ref_bot)
+            torch.testing.assert_close(acc_bot, ref_bot)
             ref_top = top_l(top_inp)
             acc_top = top_l_acc(top_inp)[0]
-            torch.testing.assert_allclose(acc_top, ref_top)
+            torch.testing.assert_close(acc_top, ref_top)
 
     def test_trivial_graph(self):
         s = torch.full((2, 2), 2)
@@ -249,7 +249,7 @@ def test_trivial_graph(self):
         o_ref = tg(s, s, s)
         tg_a = StaticModule(tg)
         o_test = tg_a(s, s, s)[0]
-        torch.testing.assert_allclose(o_ref, o_test)
+        torch.testing.assert_close(o_ref, o_test)
 
     def test_leaky_relu(self):
         s = torch.randn(5, 5)
@@ -257,7 +257,7 @@ def test_leaky_relu(self):
         o_ref = tg(s)
         tg_a = StaticModule(tg)
         o_test = tg_a(s)[0]
-        torch.testing.assert_allclose(o_ref, o_test)
+        torch.testing.assert_close(o_ref, o_test)
 
     def test_attr(self):
         """
@@ -293,7 +293,7 @@ def test_attr(self):
         ms = torch.jit.script(m)
         sm = StaticModule(ms)
         output_sm = sm(input)[0]
-        torch.testing.assert_allclose(output_s, output_sm)
+        torch.testing.assert_close(output_s, output_sm)
         sm.benchmark([input], {}, 2, 2)
         sm.benchmark_individual_ops([input], {}, 2, 2)
         sm.benchmark([], {"x": input}, 2, 2)
@@ -307,7 +307,7 @@ def test_fusion_trivial_graph(self):
         torch._C._fuse_to_static_module(tg.graph)
         assert "StaticSubgraph" in str(tg.graph)
         o_test = tg(s, s, s)
-        torch.testing.assert_allclose(o_ref, o_test)
+        torch.testing.assert_close(o_ref, o_test)
 
     @unittest.skip("Temporarily disabled")
     def test_fusion_multihead_attention_layer(self):
@@ -332,7 +332,7 @@ def test_fusion_multihead_attention_layer(self):
         o_test = attention(src, src, src, src_mask)
 
         for a, b in zip(o_ref, o_test):
-            torch.testing.assert_allclose(a, b)
+            torch.testing.assert_close(a, b)
 
     @unittest.skip("Temporarily disabled")
     def test_fusion_loop(self):
@@ -344,7 +344,7 @@ def test_fusion_loop(self):
         torch._C._fuse_to_static_module(lg.graph)
         assert "StaticSubgraph" in str(lg.graph)
         o_test = lg(a, b, c)
-        torch.testing.assert_allclose(o_ref, o_test)
+        torch.testing.assert_close(o_ref, o_test)
 
     @unittest.skip("Temporarily disabled")
     def test_fusion_outputs(self):
@@ -357,7 +357,7 @@ def test_fusion_outputs(self):
         assert "StaticSubgraph" in str(og.graph)
         o_test = og(a, b, b, c)
         for i in o_ref.keys():
-            torch.testing.assert_allclose(o_ref[i], o_test[i])
+            torch.testing.assert_close(o_ref[i], o_test[i])
 
 
 if __name__ == "__main__":
diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py
index 50145100abf8f..6353113a1ec4c 100644
--- a/test/test_tensorexpr.py
+++ b/test/test_tensorexpr.py
@@ -1468,7 +1468,7 @@ def getModule(script):
         am_s = getModule(True)
         ref = am(x, x, x)
         test = am_s(x, x, x)
-        torch.testing.assert_allclose(ref, test)
+        torch.testing.assert_close(ref, test)
 
         # Now do the aliasing
         am.a = am.b
@@ -1477,7 +1477,7 @@ def getModule(script):
         am_s.a = am_s.b
         test = am_s(x, x, x)
 
-        torch.testing.assert_allclose(ref, test)
+        torch.testing.assert_close(ref, test)
 
     def test_alias_analysis_inputs(self):
         class AliasModule(nn.Module):
@@ -1510,7 +1510,7 @@ def getModule(script):
         x = torch.randn(128, 128)
         test = am_s(x, x, x)
 
-        torch.testing.assert_allclose(ref, test)
+        torch.testing.assert_close(ref, test)
 
     def test_alias_analysis_input_and_module(self):
         class AliasModule(nn.Module):
@@ -1545,7 +1545,7 @@ def getModule(script):
         am_s.b = x
         test = am_s(x, x, x)
 
-        torch.testing.assert_allclose(ref, test)
+        torch.testing.assert_close(ref, test)
 
     def test_multiple_outputs(self):
         for device in self.devices:
diff --git a/test/test_tensorexpr_pybind.py b/test/test_tensorexpr_pybind.py
index 4138b2f81dfda..d838892975c0c 100644
--- a/test/test_tensorexpr_pybind.py
+++ b/test/test_tensorexpr_pybind.py
@@ -44,7 +44,7 @@ def test_simple_sum(self):
             tB = torch.randn(n)
             tC = torch.empty(n)
             cg.call([tA, tB, tC])
-            torch.testing.assert_allclose(tA + tB, tC)
+            torch.testing.assert_close(tA + tB, tC)
 
     def test_call_raw(self):
         with kernel_arena_scope():
@@ -55,7 +55,7 @@ def test_call_raw(self):
             tB = torch.randn(n, dtype=torch.float64)
             tC = torch.empty(n, dtype=torch.float64)
             cg.call_raw([tA.data_ptr(), tB.data_ptr(), tC.data_ptr()])
-            torch.testing.assert_allclose(tA + tB, tC)
+            torch.testing.assert_close(tA + tB, tC)
 
     def test_external_calls(self):
         with kernel_arena_scope():
@@ -77,7 +77,7 @@ def test_external_calls(self):
             tB = torch.ones(4, 1)
             tC = torch.empty(1, 1)
             codegen.call([tA, tB, tC])
-            torch.testing.assert_allclose(torch.matmul(tA, tB), tC)
+            torch.testing.assert_close(torch.matmul(tA, tB), tC)
 
     def test_dynamic_shape(self):
         with kernel_arena_scope():
@@ -103,7 +103,7 @@ def test_with_shape(n):
                 tB = torch.randn(n, dtype=torch.double)
                 tC = torch.empty(n, dtype=torch.double)
                 cg.call([tA, tB, tC, n])
-                torch.testing.assert_allclose(tA - tB, tC)
+                torch.testing.assert_close(tA - tB, tC)
 
             test_with_shape(8)
             test_with_shape(31)
diff --git a/test/test_throughput_benchmark.py b/test/test_throughput_benchmark.py
index 9d60344b5912b..139ca0c4cc559 100644
--- a/test/test_throughput_benchmark.py
+++ b/test/test_throughput_benchmark.py
@@ -1,7 +1,6 @@
 
 import torch
 from torch.utils import ThroughputBenchmark
-from torch.testing import assert_allclose
 
 from torch.testing._internal.common_utils import run_tests, TestCase, TemporaryFileName
 
@@ -56,7 +55,7 @@ def linear_test(self, Module, profiler_output_path=""):
             # or just unpack the list of inputs
             module_result = module(*inputs[i])
             bench_result = bench.run_once(*inputs[i])
-            assert_allclose(bench_result, module_result)
+            torch.testing.assert_close(bench_result, module_result)
 
         stats = bench.benchmark(
             num_calling_threads=4,
diff --git a/test/test_torch.py b/test/test_torch.py
index 6766d50e6425d..515052ae5ad67 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -1566,7 +1566,7 @@ def test_sobolengine_continuing(self, scramble: bool = False):
             n_half = len(ref_sample) // 2
             _ = engine.draw(n=n_half)
             sample = engine.draw(n=n_half)
-            torch.testing.assert_allclose(sample, ref_sample[n_half:])
+            torch.testing.assert_close(sample, ref_sample[n_half:])
 
         def test_sobolengine_continuing_scrambled(self):
             self.test_sobolengine_continuing(scramble=True)
@@ -1578,7 +1578,7 @@ def test_sobolengine_reset(self, scramble: bool = False):
             engine.reset()
             self.assertEqual(engine.num_generated, 0)
             sample = engine.draw(n=len(ref_sample))
-            torch.testing.assert_allclose(sample, ref_sample)
+            torch.testing.assert_close(sample, ref_sample)
 
         def test_sobolengine_reset_scrambled(self):
             self.test_sobolengine_reset(scramble=True)
@@ -1588,7 +1588,7 @@ def test_sobolengine_fast_forward(self, scramble: bool = False):
             engine = torch.quasirandom.SobolEngine(2, scramble=scramble, seed=123456)
             engine.fast_forward(4)
             sample = engine.draw(n=4)
-            torch.testing.assert_allclose(sample, ref_sample[4:])
+            torch.testing.assert_close(sample, ref_sample[4:])
             # alternate fast forwarding with sampling
             engine.reset()
             even_draws = []
@@ -1597,9 +1597,9 @@ def test_sobolengine_fast_forward(self, scramble: bool = False):
                     even_draws.append(engine.draw())
                 else:
                     engine.fast_forward(1)
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(
                 ref_sample[[i for i in range(8) if i % 2 == 0]],
-                np.concatenate(even_draws),
+                torch.from_numpy(np.concatenate(even_draws)),
             )
 
         def test_sobolengine_fast_forward_scrambled(self):
@@ -1609,13 +1609,13 @@ def test_sobolengine_distribution(self, scramble=False):
             d = 50
             engine = torch.quasirandom.SobolEngine(d, scramble=scramble, seed=123456)
             sample = engine.draw(1024)
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(
                 torch.mean(sample, dim=0), torch.full((d,), 0.5), atol=2, rtol=2
             )
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(
                 np.percentile(sample, 25, axis=0), np.repeat(0.25, d), atol=2, rtol=2
             )
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(
                 np.percentile(sample, 75, axis=0), np.repeat(0.75, d), atol=2, rtol=2
             )
 
@@ -2440,7 +2440,7 @@ def test_c10_layer_norm(self):
             actual_norm, actual_mean, actual_stdev = \
                 torch.ops._caffe2.LayerNorm(torch.tensor(X), torch.tensor(
                     weight), torch.tensor(bias), 1, epsilon, True)
-            torch.testing.assert_allclose(expected_norm, actual_norm)
+            torch.testing.assert_close(expected_norm, actual_norm)
 
         def test_memory_format(self):
             def test_helper(x, memory_format):
diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py
index 4fa64e75eceb4..a0f8328ec660b 100644
--- a/test/test_xnnpack_integration.py
+++ b/test/test_xnnpack_integration.py
@@ -34,7 +34,7 @@ def test_linear(self, batch_size, data_shape, weight_output_dim, use_bias):
         ref_result = F.linear(input_data, weight, bias)
         packed_weight_bias = torch.ops.prepacked.linear_clamp_prepack(weight, bias)
         output_linearprepacked = torch.ops.prepacked.linear_clamp_run(input_data, packed_weight_bias)
-        torch.testing.assert_allclose(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3)
 
     @given(input_size=st.integers(2, 32),
            weight_output_dim=st.integers(2, 64),
@@ -49,7 +49,7 @@ def test_linear_1d_input(self, input_size, weight_output_dim, use_bias):
         ref_result = F.linear(input_data, weight, bias)
         packed_weight_bias = torch.ops.prepacked.linear_clamp_prepack(weight, bias)
         output_linearprepacked = torch.ops.prepacked.linear_clamp_run(input_data, packed_weight_bias)
-        torch.testing.assert_allclose(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3)
 
 
     @given(batch_size=st.integers(0, 3),
@@ -107,7 +107,7 @@ def test_conv2d(self,
         packed_weight_bias = torch.ops.prepacked.conv2d_clamp_prepack(weight, bias,
                                                                       strides, paddings, dilations, groups)
         xnnpack_result = torch.ops.prepacked.conv2d_clamp_run(input_data, packed_weight_bias)
-        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
 
     @given(batch_size=st.integers(1, 3),
            input_channels_per_group=st.integers(1, 32),
@@ -174,7 +174,7 @@ def test_conv2d_transpose(self,
                                                                                 output_paddings, dilations,
                                                                                 groups)
         xnnpack_result = torch.ops.prepacked.conv2d_transpose_clamp_run(input_data, packed_weight_bias)
-        torch.testing.assert_allclose(ref_result.contiguous(), xnnpack_result.contiguous(), rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result.contiguous(), xnnpack_result.contiguous(), rtol=1e-2, atol=1e-3)
 
 @unittest.skipUnless(torch.backends.xnnpack.enabled,
                      " XNNPACK must be enabled for these tests."
@@ -214,7 +214,7 @@ def forward(self, x):
         input_data = torch.rand(data_shape)
         ref_result = scripted_linear(input_data)
         output_linearprepacked = scripted_linear_clamp_prepacked(input_data)
-        torch.testing.assert_allclose(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3)
 
         # Serialize the modules and then deserialize
         input_data = torch.rand(data_shape)
@@ -228,7 +228,7 @@ def forward(self, x):
         deserialized_linear_clamp_prepacked = torch.jit.load(buffer)
         ref_result = deserialized_linear(input_data)
         output_linearprepacked = deserialized_linear_clamp_prepacked(input_data)
-        torch.testing.assert_allclose(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3)
 
     @given(batch_size=st.integers(0, 3),
            input_channels_per_group=st.integers(1, 32),
@@ -309,7 +309,7 @@ def forward(self, x):
             weight, bias, strides, paddings, dilations, groups))
         ref_result = scripted_conv2d(input_data)
         xnnpack_result = scripted_conv2d_clamp_prepacked(input_data)
-        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
 
         # Serialize the modules and then deserialize
         input_data = torch.rand((batch_size, input_channels, height, width))
@@ -325,7 +325,7 @@ def forward(self, x):
         deserialized_conv2d_clamp_prepacked = torch.jit.load(buffer)
         ref_result = deserialized_conv2d(input_data)
         xnnpack_result = deserialized_conv2d_clamp_prepacked(input_data)
-        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
 
     @given(batch_size=st.integers(0, 3),
            input_channels_per_group=st.integers(1, 32),
@@ -417,7 +417,7 @@ def forward(self, x):
             weight, bias, strides, paddings, output_paddings, dilations, groups))
         ref_result = scripted_conv2d(input_data)
         xnnpack_result = scripted_conv2d_clamp_prepacked(input_data)
-        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
 
         # Serialize the modules and then deserialize
         input_data = torch.rand((batch_size, input_channels, height, width))
@@ -433,7 +433,7 @@ def forward(self, x):
         deserialized_conv2d_clamp_prepacked = torch.jit.load(buffer)
         ref_result = deserialized_conv2d(input_data)
         xnnpack_result = deserialized_conv2d_clamp_prepacked(input_data)
-        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
 
     @given(batch_size=st.integers(0, 3),
            input_channels_per_group=st.integers(1, 32),
@@ -549,7 +549,7 @@ def forward(self, x):
                 groups))
         ref_result = scripted_m(input_data)
         xnnpack_result = scripted_m_prepacked(input_data)
-        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
 
         # Serialize the modules and then deserialize
         input_data = torch.rand((batch_size, input_channels, height, width))
@@ -564,7 +564,7 @@ def forward(self, x):
         deserialized_m_prepacked = torch.jit.load(buffer)
         ref_result = deserialized_m(input_data)
         xnnpack_result = deserialized_m_prepacked(input_data)
-        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
+        torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
 
 
 @unittest.skipUnless(torch.backends.xnnpack.enabled,
@@ -610,7 +610,7 @@ def validate_transformed_module(
                 else:
                     FileCheck().check_count(pattern, v, exactly=True).run(deserialized_scripted_model.graph)
             xnnpack_result = deserialized_scripted_model(input_data)
-            torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
+            torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
 
     def test_linear(self):
         data_shape = [2, 3, 32]
@@ -965,7 +965,7 @@ def validate_transform_conv1d_to_conv2d(
                 else:
                     FileCheck().check_count(pattern, v, exactly=True).run(deserialized_scripted_model.graph)
             transformed_result = deserialized_scripted_model(input_data)
-            torch.testing.assert_allclose(ref_result, transformed_result, rtol=1e-2, atol=1e-3)
+            torch.testing.assert_close(ref_result, transformed_result, rtol=1e-2, atol=1e-3)
 
             optimized_buffer = io.BytesIO()
             torch.jit.save(optimized_scripted_model, optimized_buffer)
@@ -980,7 +980,7 @@ def validate_transform_conv1d_to_conv2d(
                 else:
                     FileCheck().check_count(pattern, v, exactly=True).run(deserialized_optimized_scripted_model.graph)
             xnnpack_result = deserialized_optimized_scripted_model(input_data)
-            torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
+            torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
 
 
     def test_conv1d_basic(self):
diff --git a/torch/fx/experimental/fx2trt/example/fx2trt_example.py b/torch/fx/experimental/fx2trt/example/fx2trt_example.py
index fff539d3bbe99..76bf69a181ad6 100644
--- a/torch/fx/experimental/fx2trt/example/fx2trt_example.py
+++ b/torch/fx/experimental/fx2trt/example/fx2trt_example.py
@@ -236,7 +236,7 @@ def _find_culprit(self, mod, inputs):
 
     # Assert results are equal with the original model.
     rn18 = rn18.cuda()
-    torch.testing.assert_allclose(split_mod(x), rn18(x))
+    torch.testing.assert_close(split_mod(x), rn18(x))
 
     import time
     NITER = 100
diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py
index 069b73e847d5a..5a2f6e5e0c487 100644
--- a/torch/jit/_trace.py
+++ b/torch/jit/_trace.py
@@ -24,7 +24,7 @@
 from torch.autograd import function
 from torch.nn import Module
 
-from torch.testing._core import _get_default_tolerance
+from torch.testing._asserts import _get_default_rtol_and_atol
 
 _flatten = torch._C._jit_flatten
 _unflatten = torch._C._jit_unflatten
@@ -417,7 +417,7 @@ def graph_diagnostic_info():
                     check_tensor_val = n_check.t("value")
 
                     try:
-                        torch.testing.assert_allclose(mod_tensor_val, check_tensor_val)
+                        torch.testing.assert_close(mod_tensor_val, check_tensor_val, equal_nan=True)
                     except (RuntimeError, AssertionError) as e:
                         if tensor_compare_errors is None:
                             tensor_compare_errors = ""
@@ -489,11 +489,12 @@ def compare_outputs(original, reference, match_what):
                         orig = orig.to_dense()
                     if ref.is_mkldnn:
                         ref = ref.to_dense()
-                    torch.testing.assert_allclose(
+                    torch.testing.assert_close(
                         orig.double(),
                         ref.double(),
                         rtol=check_tolerance,
-                        atol=_get_default_tolerance(orig, ref)[1],
+                        atol=_get_default_rtol_and_atol(orig, ref)[1],
+                        equal_nan=True,
                     )
                 except AssertionError as e:
                     maybe_warn_nondeterministic()
diff --git a/torch/testing/_core.py b/torch/testing/_core.py
index 9a5fb0c643097..d9806150047c5 100644
--- a/torch/testing/_core.py
+++ b/torch/testing/_core.py
@@ -18,7 +18,6 @@
     "all_types_and_complex",
     "all_types_and_complex_and",
     "all_types_and_half",
-    "assert_allclose",
     "complex_types",
     "empty_types",
     "floating_and_complex_types",
@@ -246,30 +245,6 @@ def _helper(a, b, s) -> _compare_return_type:
 
     return _helper(a, b, " ")
 
-def assert_allclose(actual, expected, rtol=None, atol=None, equal_nan=True, msg='') -> None:
-    if not isinstance(actual, torch.Tensor):
-        actual = torch.tensor(actual)
-    if not isinstance(expected, torch.Tensor):
-        expected = torch.tensor(expected, dtype=actual.dtype)
-    if expected.shape != actual.shape:
-        raise AssertionError("expected tensor shape {0} doesn't match with actual tensor "
-                             "shape {1}!".format(expected.shape, actual.shape))
-    if rtol is None or atol is None:
-        if rtol is not None or atol is not None:
-            raise ValueError("rtol and atol must both be specified or both be unspecified")
-        rtol, atol = _get_default_tolerance(actual, expected)
-
-    result, debug_msg = _compare_tensors_internal(actual, expected,
-                                                  rtol=rtol, atol=atol,
-                                                  equal_nan=equal_nan)
-
-    if result:
-        return
-
-    if msg is None or msg == '':
-        msg = debug_msg
-
-    raise AssertionError(msg)
 
 def make_non_contiguous(tensor: torch.Tensor) -> torch.Tensor:
     if tensor.numel() <= 1:  # can't make non-contiguous
@@ -406,19 +381,3 @@ def get_all_fp_dtypes(include_half=True, include_bfloat16=True) -> List[torch.dt
 
 def get_all_device_types() -> List[str]:
     return ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
-
-# 'dtype': (rtol, atol)
-_default_tolerances = {
-    'float64': (1e-5, 1e-8),  # NumPy default
-    'float32': (1e-4, 1e-5),  # This may need to be changed
-    'float16': (1e-3, 1e-3),  # This may need to be changed
-}
-
-
-def _get_default_tolerance(a, b=None) -> Tuple[float, float]:
-    if b is None:
-        dtype = str(a.dtype).split('.')[-1]  # e.g. "float32"
-        return _default_tolerances.get(dtype, (0, 0))
-    a_tol = _get_default_tolerance(a)
-    b_tol = _get_default_tolerance(b)
-    return (max(a_tol[0], b_tol[0]), max(a_tol[1], b_tol[1]))
diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py
index 7355aeea1a292..3cf7338bff889 100644
--- a/torch/testing/_deprecated.py
+++ b/torch/testing/_deprecated.py
@@ -5,17 +5,24 @@
 
 import functools
 import warnings
-from typing import Any, Callable
+from typing import Any, Callable, Optional, Tuple
 
 import torch
 
 
-__all__ = ["rand", "randn"]
+__all__ = [
+    "rand",
+    "randn",
+    "assert_allclose",
+]
 
 
 def warn_deprecated(instructions: str) -> Callable:
     def outer_wrapper(fn: Callable) -> Callable:
-        msg = f"torch.testing.{fn.__name__} is deprecated and will be removed in the future. {instructions.strip()}"
+        msg = (
+            f"torch.testing.{fn.__name__} is deprecated and will be removed in a future release. "
+            f"{instructions.strip()}"
+        )
 
         @functools.wraps(fn)
         def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
@@ -29,3 +36,51 @@ def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
 
 rand = warn_deprecated("Use torch.rand instead.")(torch.rand)
 randn = warn_deprecated("Use torch.randn instead.")(torch.randn)
+
+
+_DTYPE_PRECISIONS = {
+    torch.float16: (1e-3, 1e-3),
+    torch.float32: (1e-4, 1e-5),
+    torch.float64: (1e-5, 1e-8),
+}
+
+
+def _get_default_rtol_and_atol(actual: torch.Tensor, expected: torch.Tensor) -> Tuple[float, float]:
+    actual_rtol, actual_atol = _DTYPE_PRECISIONS.get(actual.dtype, (0.0, 0.0))
+    expected_rtol, expected_atol = _DTYPE_PRECISIONS.get(expected.dtype, (0.0, 0.0))
+    return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol)
+
+
+# TODO: include the deprecation as soon as torch.testing.assert_close is stable
+# @warn_deprecated(
+#     "Use torch.testing.assert_close instead. "
+#     "For detailed upgrade instructions see https://github.com/pytorch/pytorch/issues/61844."
+# )
+def assert_allclose(
+    actual: Any,
+    expected: Any,
+    rtol: Optional[float] = None,
+    atol: Optional[float] = None,
+    equal_nan: bool = True,
+    msg: str = "",
+) -> None:
+    if not isinstance(actual, torch.Tensor):
+        actual = torch.tensor(actual)
+    if not isinstance(expected, torch.Tensor):
+        expected = torch.tensor(expected, dtype=actual.dtype)
+
+    if rtol is None and atol is None:
+        rtol, atol = _get_default_rtol_and_atol(actual, expected)
+
+    torch.testing.assert_close(
+        actual,
+        expected,
+        rtol=rtol,
+        atol=atol,
+        equal_nan=equal_nan,
+        check_device=True,
+        check_dtype=False,
+        check_stride=False,
+        check_is_coalesced=False,
+        msg=msg or None,
+    )
diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py
index 2470b5392de11..6b2d1dd13a33d 100644
--- a/torch/testing/_internal/common_quantization.py
+++ b/torch/testing/_internal/common_quantization.py
@@ -975,12 +975,12 @@ def _compare_script_and_mobile(self,
 
                     mobile_module_result = mobile_module(input)
 
-                    torch.testing.assert_allclose(script_module_result, mobile_module_result)
+                    torch.testing.assert_close(script_module_result, mobile_module_result)
                     mobile_module_forward_result = mobile_module.forward(input)
-                    torch.testing.assert_allclose(script_module_result, mobile_module_forward_result)
+                    torch.testing.assert_close(script_module_result, mobile_module_forward_result)
 
                     mobile_module_run_method_result = mobile_module.run_method("forward", input)
-                    torch.testing.assert_allclose(script_module_result, mobile_module_run_method_result)
+                    torch.testing.assert_close(script_module_result, mobile_module_run_method_result)
                 except AssertionError as e:
                     if retry == max_retry:
                         raise e
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 2a126ab894a06..096b7182851c3 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -4119,20 +4119,13 @@ def _test_ddp_hook_parity(self, state, hook):
                 grad_hook = net_with_hook.module.weight.grad
                 avg_hook = grad_hook.clone()
                 # Verify hook grad with expected.
-                # Cannot use exact match here due to a very small accuracy loss,
-                # e.g. 1e-05, for powerSGD hook case.
-                assert_func = (
-                    self.assertEqual
-                    if hook == default.allreduce_hook
-                    else torch.testing.assert_allclose
-                )
-                assert_func(
-                    avg_hook[0, 0],
+                self.assertEqual(
+                    avg_hook[0, 0].item(),
                     expected_grad,
                     msg=f"Expected hook grad of {expected_grad} but got {avg_hook[0, 0]}",
                 )
                 # Verify hook grad with vanilla allreduce
-                assert_func(
+                self.assertEqual(
                     avg_hook[0, 0],
                     avg[0, 0],
                     msg=f"Expected hook grad to be close to allreduce {avg[0, 0]}, but got {avg_hook[0, 0]}",
@@ -4937,8 +4930,8 @@ def test_DistributedDataParallel_SyncBatchNorm_Diff_Input_Sizes_Running_Value(
                 model.module.running_mean,
                 model.module.running_var,
             )
-            torch.testing.assert_allclose(running_mean, all_input_var.mean(1))
-            torch.testing.assert_allclose(running_var, all_input_var.var(1))
+            torch.testing.assert_close(running_mean, all_input_var.mean(1))
+            torch.testing.assert_close(running_var, all_input_var.var(1))
 
         @sandcastle_skip_if(
             BACKEND != "nccl" and BACKEND != "gloo",

From ce6fe50158c631f7f54b6df3ac91632cd41d48ea Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Thu, 19 Aug 2021 13:00:08 -0700
Subject: [PATCH 078/530] Revert embedding thrust->cub migration (#63451)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63427

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63451

Reviewed By: mruberry

Differential Revision: D30398482

Pulled By: ngimel

fbshipit-source-id: e153786d204215555a6571688eabae712facad7e
---
 aten/src/ATen/cuda/cub.cuh                    | 19 +----
 aten/src/ATen/native/cuda/Embedding.cu        | 85 ++++++++++++++-----
 .../native/cuda/EmbeddingBackwardKernel.cuh   |  4 +
 aten/src/ATen/native/cuda/Indexing.cu         |  3 +
 .../ATen/native/cuda/LegacyThrustHelpers.cu   | 43 ----------
 aten/src/ATen/native/cuda/Randperm.cu         |  2 +
 aten/src/ATen/native/cuda/UniqueCub.cu        | 13 ++-
 7 files changed, 86 insertions(+), 83 deletions(-)

diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
index 38e5852260f3a..62da28d34e8e5 100644
--- a/aten/src/ATen/cuda/cub.cuh
+++ b/aten/src/ATen/cuda/cub.cuh
@@ -3,7 +3,6 @@
 #include <cstddef>
 #include <type_traits>
 #include <iterator>
-#include <limits>
 
 // include cub in a safe manner, see:
 // https://github.com/pytorch/pytorch/pull/55292
@@ -103,8 +102,6 @@ static inline void sort_keys(
     const key_t *keys_in, key_t *keys_out,
     int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8
 ) {
-  TORCH_CHECK(n <= std::numeric_limits<int>::max(),
-    "cub sort does not support sorting more than INT_MAX elements");
   using key_t_ = typename detail::cuda_type<key_t>::type;
 
   const key_t_ *keys_in_ = reinterpret_cast<const key_t_*>(keys_in);
@@ -127,8 +124,6 @@ static inline void sort_pairs(
     const value_t *values_in, value_t *values_out,
     int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8
 ) {
-  TORCH_CHECK(n <= std::numeric_limits<int>::max(),
-    "cub sort does not support sorting more than INT_MAX elements");
   using key_t_ = typename detail::cuda_type<key_t>::type;
 
   auto allocator = c10::cuda::CUDACachingAllocator::get();
@@ -161,10 +156,6 @@ static inline void segmented_sort_pairs(
     OffsetIteratorT begin_offsets, OffsetIteratorT end_offsets,
     bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8
 ) {
-  TORCH_CHECK(num_elements <= std::numeric_limits<int>::max(),
-    "cub sort does not support sorting more than INT_MAX elements");
-  TORCH_CHECK(num_segments <= std::numeric_limits<int>::max(),
-    "cub sort does not support sorting more than INT_MAX elements");
   using key_t_ = typename detail::cuda_type<key_t>::type;
 
   auto allocator = c10::cuda::CUDACachingAllocator::get();
@@ -314,12 +305,4 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
   }
 }
 
-template<typename InputIteratorT , typename OutputIteratorT , typename NumSelectedIteratorT >
-inline void unique(InputIteratorT input, OutputIteratorT output, NumSelectedIteratorT num_selected_out, int64_t num_items) {
-  TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
-    "cub unique does not support more than INT_MAX elements");
-  CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceSelect::Unique,
-    input, output, num_selected_out, num_items, at::cuda::getCurrentCUDAStream());
-}
-
-}}}  // namespace at::cuda::cub
+}}}
diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu
index 100ffbd99388c..10a42b8914e62 100644
--- a/aten/src/ATen/native/cuda/Embedding.cu
+++ b/aten/src/ATen/native/cuda/Embedding.cu
@@ -7,9 +7,12 @@
 
 #include <THC/THCDeviceUtils.cuh>
 #include <THC/THCTensorMathReduce.cuh>
+#include <THC/THCThrustAllocator.cuh>
 #include <THC/THCReduceApplyUtils.cuh>
 
-#include <ATen/cuda/cub.cuh>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/unique.h>
 
 #include <ATen/native/cuda/EmbeddingBackwardKernel.cuh>
 #include <ATen/native/cuda/SortingCommon.cuh>
@@ -221,9 +224,6 @@ __global__ void renorm_kernel(
 
 } // anonymous namespace
 
-template<typename index_t>
-void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
-
 Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices,
                                int64_t num_weights, int64_t padding_idx,
                                bool scale_grad_by_freq) {
@@ -272,16 +272,59 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
   auto orig_indices = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   Tensor count;
   AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
-    auto range = at::arange(num_indices, indices.options());
-    int64_t nbits = cuda::cub::get_num_bits(num_weights);
-    cuda::cub::sort_pairs(
-      indices.data_ptr<index_t>(), sorted_indices.data_ptr<index_t>(),
-      range.data_ptr<index_t>(), orig_indices.data_ptr<index_t>(),
-      num_indices, false, 0, nbits);
+    using device_ptr = thrust::device_ptr<index_t>;
+
+    // Sort the inputs into sorted with the corresponding indices; we
+    // don't need a stable or multidimensional sort, so just use Thrust
+    // directly
+    {
+        sorted_indices.copy_(indices);
+
+        auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());
+        auto policy = thrust::cuda::par(allocator).on(stream);
+
+        // Fill sortedOrigIndices with sequential indices
+        auto count_iter = thrust::counting_iterator<index_t>(0);
+        auto orig_data = device_ptr(orig_indices.data_ptr<index_t>());
+        thrust::copy(policy, count_iter, count_iter + num_indices, orig_data);
+
+        // Sort; a stable sort is not required
+        auto sorted_data = device_ptr(sorted_indices.data_ptr<index_t>());
+        thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data,
+                            LTOp<index_t>());
+    }
 
     if (scale_grad_by_freq) {
       count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-      embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
+
+      auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());
+      auto policy = thrust::cuda::par(allocator).on(stream);
+
+      // Compute an increasing sequence per unique item in sortedIndices:
+      // sorted: 2 5 5 5 7 7 8 9 9
+      //  count: 1 1 2 3 1 2 1 1 2
+      auto sorted_data = device_ptr(sorted_indices.data_ptr<index_t>());
+      auto count_data = device_ptr(count.data_ptr<index_t>());
+      thrust::inclusive_scan_by_key(
+        policy,
+        sorted_data,
+        sorted_data + num_indices,
+        thrust::make_constant_iterator(1),
+        count_data
+      );
+
+      // Take the maximum of each count per unique key in reverse:
+      // sorted: 2 5 5 5 7 7 8 9 9
+      //  count: 1 3 3 3 2 2 1 2 2
+      thrust::inclusive_scan_by_key(
+        policy,
+        thrust::make_reverse_iterator(sorted_data + num_indices),
+        thrust::make_reverse_iterator(sorted_data),
+        thrust::make_reverse_iterator(count_data + num_indices),
+        thrust::make_reverse_iterator(count_data + num_indices),
+        thrust::equal_to<index_t>(),
+        thrust::maximum<index_t>()
+      );
     }
   });
 
@@ -297,23 +340,23 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices,
   checkSameGPU("embedding_renorm", self_arg, indices_arg);
 
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());
+  auto policy = thrust::cuda::par(allocator).on(stream);
 
   AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_renorm_cuda_", [&] () {
+    using device_ptr = thrust::device_ptr<index_t>;
 
     auto num_indices = indices.numel();
     auto indices_contig = std::get<0>(indices.sort()).contiguous();
-    auto unique_indices = at::empty(indices.numel(), indices.options());
-    auto num_unique_indices = at::empty({}, indices.options().dtype(kLong));
+    auto indices_data = device_ptr(indices_contig.data_ptr<index_t>());
 
-    cuda::cub::unique(
-      indices_contig.data_ptr<index_t>(),
-      unique_indices.data_ptr<index_t>(),
-      num_unique_indices.data_ptr<int64_t>(),
-      num_indices
-    );
+    auto unique_indices = at::empty(indices.numel(), indices.options());
+    auto unique_data = device_ptr(unique_indices.data_ptr<index_t>());
+    auto end = thrust::unique_copy(policy, indices_data, indices_data + num_indices, unique_data);
+    auto num_unique_indices = static_cast<int>(end - unique_data);
 
-    dim3 grid = num_unique_indices.item<int64_t>();
-    dim3 block = 128;
+    dim3 grid(num_unique_indices);
+    dim3 block(128);
     int dim = self.stride(0);
 
     AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "embedding_backward", [&] {
diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh
index c79bf83cc8a6a..f06b850668591 100644
--- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh
+++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh
@@ -10,6 +10,10 @@
 #include <THC/THCThrustAllocator.cuh>
 #include <THC/THCAtomics.cuh>
 
+#include <thrust/execution_policy.h>
+#include <thrust/unique.h>
+#include <thrust/device_vector.h>
+
 #pragma once
 
 namespace at {
diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu
index 57654f2fb9b74..95ab33e512f02 100644
--- a/aten/src/ATen/native/cuda/Indexing.cu
+++ b/aten/src/ATen/native/cuda/Indexing.cu
@@ -218,6 +218,9 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<c10::optional<Ten
   std::tie(linearIndex, src, nElemBefore, strideBefore, sliceSize, inversePerm) = makeLinearIndex(self, indices, !unsafe);
   int64_t num_indices = linearIndex.numel();
 
+  TORCH_CHECK(num_indices <= std::numeric_limits<int>::max(),
+    "index_put of tensors larger than INT_MAX is not supported yet in pytorch");
+
   if (num_indices > 0 && sliceSize > 0) {
       const bool permuted = !src.is_contiguous();
       auto src_ = permuted ? src.contiguous() : src;
diff --git a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu
index 446aa085a31d3..582dc9ebe0498 100644
--- a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu
+++ b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu
@@ -5,8 +5,6 @@
 #include <thrust/device_ptr.h>
 #include <thrust/execution_policy.h>
 #include <thrust/sort.h>
-#include <thrust/unique.h>
-#include <thrust/device_ptr.h>
 
 namespace at { namespace native {
 
@@ -32,45 +30,4 @@ void index_put_with_sort_kernel_thrust_helper(Tensor &linearIndex, Tensor &orig_
   thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data, LTOp<int64_t>());
 }
 
-template<typename index_t>
-void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) {
-  using device_ptr = thrust::device_ptr<index_t>;
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());
-  auto policy = thrust::cuda::par(allocator).on(stream);
-
-  auto num_indices = count.numel();
-
-  // Compute an increasing sequence per unique item in sortedIndices:
-  // sorted: 2 5 5 5 7 7 8 9 9
-  //  count: 1 1 2 3 1 2 1 1 2
-  auto sorted_data = device_ptr(sorted_indices.data_ptr<index_t>());
-  auto count_data = device_ptr(count.data_ptr<index_t>());
-  thrust::inclusive_scan_by_key(
-    policy,
-    sorted_data,
-    sorted_data + num_indices,
-    thrust::make_constant_iterator(1),
-    count_data
-  );
-
-  // Take the maximum of each count per unique key in reverse:
-  // sorted: 2 5 5 5 7 7 8 9 9
-  //  count: 1 3 3 3 2 2 1 2 2
-  thrust::inclusive_scan_by_key(
-    policy,
-    thrust::make_reverse_iterator(sorted_data + num_indices),
-    thrust::make_reverse_iterator(sorted_data),
-    thrust::make_reverse_iterator(count_data + num_indices),
-    thrust::make_reverse_iterator(count_data + num_indices),
-    thrust::equal_to<index_t>(),
-    thrust::maximum<index_t>()
-  );
-}
-
-template
-void embedding_dense_backward_cuda_scan<int>(Tensor &sorted_indices, Tensor &count);
-template
-void embedding_dense_backward_cuda_scan<int64_t>(Tensor &sorted_indices, Tensor &count);
-
 }}
diff --git a/aten/src/ATen/native/cuda/Randperm.cu b/aten/src/ATen/native/cuda/Randperm.cu
index 56b8eb20faae6..4c5e16a1ceed0 100644
--- a/aten/src/ATen/native/cuda/Randperm.cu
+++ b/aten/src/ATen/native/cuda/Randperm.cu
@@ -47,6 +47,8 @@ template <int N> struct alignas(N) OpaqueType { char data[N]; };
 
 Tensor& randperm_out_cuda(int64_t n, c10::optional<Generator> generator, Tensor& result) {
   TORCH_CHECK(n >= 0, "n must be non-negative, got", n);
+  TORCH_CHECK(n <= std::numeric_limits<int>::max(),
+    "randperm of tensors larger than INT_MAX is not supported yet in pytorch");
 
   check_supported_max_int_with_precision(n, result);
 
diff --git a/aten/src/ATen/native/cuda/UniqueCub.cu b/aten/src/ATen/native/cuda/UniqueCub.cu
index eb31fd2f76bb8..1b9619b29812b 100644
--- a/aten/src/ATen/native/cuda/UniqueCub.cu
+++ b/aten/src/ATen/native/cuda/UniqueCub.cu
@@ -94,7 +94,13 @@ std::tuple<Tensor, Tensor, Tensor, int64_t> compute_unique(
   Tensor length = at::empty({1}, options);
   int64_t num_out;
   if (!return_counts) {
-    cuda::cub::unique(data, data_out.data_ptr<scalar_t>(), length.data_ptr<int64_t>(), num_inp);
+    CUB_WRAPPER(
+        cub::DeviceSelect::Unique,
+        data,
+        data_out.data_ptr<scalar_t>(),
+        length.data_ptr<int64_t>(),
+        num_inp,
+        stream);
     num_out = length.item<int64_t>();
   } else {
     counts.resize_(num_inp);
@@ -129,6 +135,11 @@ std::tuple<Tensor, Tensor, Tensor> unique_cuda_template(
 
   auto options = self.options().dtype(kLong);
   int64_t num_inp = self.numel();
+  TORCH_CHECK(
+      num_inp <= INT_MAX,
+      "num_inp ",
+      num_inp,
+      " is too big to be handled by cub");
   Tensor sorted;
   Tensor self_c = self.contiguous();
   if (consecutive) {

From 6c3ebccc00dfca217094357a63e2be901ad7beb4 Mon Sep 17 00:00:00 2001
From: Charles David Hernandez <cdhernandez@fb.com>
Date: Thu, 19 Aug 2021 13:04:48 -0700
Subject: [PATCH 079/530] Updating the names of these functions (#63513)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63513

updating these names per Jerry's nits in the previous pr

Test Plan: Imported from OSS

Reviewed By: jerryzh168

Differential Revision: D30406710

fbshipit-source-id: a9f1577a2b8c4a93f5005e0f6278b7d7348d8b66
---
 torch/quantization/fx/prepare.py | 4 ++--
 torch/quantization/qconfig.py    | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py
index 23d1d40bb543b..29600b8797c52 100644
--- a/torch/quantization/fx/prepare.py
+++ b/torch/quantization/fx/prepare.py
@@ -15,7 +15,7 @@
 )
 from torch.fx.node import Argument
 
-from ..qconfig import QConfigAny, qconfig_function_equality
+from ..qconfig import QConfigAny, qconfig_equals
 from .qconfig_utils import (
     convert_dict_to_ordered_dict,
     generate_qconfig_map,
@@ -195,7 +195,7 @@ def update_qconfig_for_fusion(
                     # Raise an error if the modules in the fused module have
                     # different qconfigs specified in the qconfig_dict
                     for op in ops:
-                        if not qconfig_function_equality(object_type_dict.get(op, None), fused_qconfig):
+                        if not qconfig_equals(object_type_dict.get(op, None), fused_qconfig):
                             raise LookupError("During fusion, we need to specify the same " +
                                               f"qconfigs for both modules in {module_type}.")
 
diff --git a/torch/quantization/qconfig.py b/torch/quantization/qconfig.py
index 01d67ddcbd8b1..ae89b4a50b70a 100644
--- a/torch/quantization/qconfig.py
+++ b/torch/quantization/qconfig.py
@@ -211,9 +211,9 @@ def configure_constructor_to_put_obs_on_module_device(original_constructor):
         return QConfigDynamic(activation, weight)
 
 
-def qconfig_function_equality(q1: QConfigAny, q2: QConfigAny):
+def qconfig_equals(q1: QConfigAny, q2: QConfigAny):
     # functools.partial has no __eq__ operator defined so '==' defaults to 'is'
-    def compare_partial(p1, p2):
+    def partial_equals(p1, p2):
         same = p1.func == p2.func
         same = same and p1.args == p2.args
         return same and p1.keywords == p2.keywords
@@ -223,6 +223,6 @@ def compare_partial(p1, p2):
     else:
         assert q1 is not None and q2 is not None
         try:
-            return compare_partial(q1.activation.p, q2.activation.p) and compare_partial(q1.weight.p, q2.weight.p)
+            return partial_equals(q1.activation.p, q2.activation.p) and partial_equals(q1.weight.p, q2.weight.p)
         except AttributeError:
             return q1 == q2

From 11a40ad915d4d3d8551588e303204810887fcf8d Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Thu, 19 Aug 2021 13:32:26 -0700
Subject: [PATCH 080/530] [Pytorch] Fix callstack pointer serialization bug
 (#63576)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63576

We serialize function name associated with InlinedCallStackPtr. This is derived
via querying Function* stored in InlinedCallStack. However this is a raw
pointer that is not gauranteed to be valid when we serialization happens. On
the other hand we also store function name separately when constructing
InlinedCallStack anyways. So this change just uniformly relies on function_name
instead of Function*

Test Plan: Internal build's asan failure + CI

Reviewed By: larryliu0820

Differential Revision: D30427029

fbshipit-source-id: de9617482404785920ed2e67b72f38461590fba3
---
 torch/csrc/jit/mobile/debug_info.cpp                 |  6 +-----
 .../callstack_debug_info_serialization.cpp           | 12 ++++--------
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/torch/csrc/jit/mobile/debug_info.cpp b/torch/csrc/jit/mobile/debug_info.cpp
index 9c734f40a25a2..41ce3c6d46d52 100644
--- a/torch/csrc/jit/mobile/debug_info.cpp
+++ b/torch/csrc/jit/mobile/debug_info.cpp
@@ -49,11 +49,7 @@ std::pair<std::vector<StackEntry>, std::string> getStackTraceWithModuleHierarchy
       // Now add source range info to stack
       entries.emplace_back(
           StackEntry{prev_function_name, callstack_ptr->source_range()});
-      if (callstack_ptr->function()) {
-        prev_function_name = callstack_ptr->function()->name();
-      } else {
-        prev_function_name = callstack_ptr->function_name();
-      }
+      prev_function_name = callstack_ptr->function_name();
       // Function name appended here
       // It is renamed to prev_function_name because for StackEntry
       // it will be appended in the next iteration. This is the format
diff --git a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp
index c26c7e575c547..93da38ad768c5 100644
--- a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp
+++ b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp
@@ -47,15 +47,11 @@ c10::IValue InlinedCallStackSerializer::serialize(
   } else {
     elements.emplace_back(c10::IValue());
   }
-  if (cs_ptr->function()) {
-    elements.emplace_back(cs_ptr->function()->name());
+  auto fn_name = cs_ptr->function_name();
+  if (!fn_name.empty()) {
+    elements.emplace_back(fn_name);
   } else {
-    auto fn_name = cs_ptr->function_name();
-    if (!fn_name.empty()) {
-      elements.emplace_back(fn_name);
-    } else {
-      elements.emplace_back("FunctionName_UNKNOWN");
-    }
+    elements.emplace_back("FunctionName_UNKNOWN");
   }
   c10::IValue serialized_cs = c10::ivalue::Tuple::create(elements);
   serialized_inlined_callstack_[cs_ptr] = serialized_cs;

From e5ab0d1013072c26586b369536bccac648843958 Mon Sep 17 00:00:00 2001
From: "Adam J. Stewart" <ajstewart426@gmail.com>
Date: Thu, 19 Aug 2021 14:54:26 -0700
Subject: [PATCH 081/530] DataLoader: allow non-integer Samplers (#63500)

Summary:
Not entirely sure how to use TypeVar but if someone could give me a hint it would be appreciated. Also let me know if you want me to add tests so we can make sure non-integer samplers actually work. It seems like `test/test_dataloader.py` is the correct location but that's a big file.

Fixes https://github.com/pytorch/pytorch/issues/63483

ejguan

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63500

Reviewed By: mruberry

Differential Revision: D30403689

Pulled By: ejguan

fbshipit-source-id: 464e09e5aad3215b94a29cc5e21cb4b10ec136e3
---
 torch/utils/data/dataloader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py
index c85296f8f807f..0f46ad283ea5a 100644
--- a/torch/utils/data/dataloader.py
+++ b/torch/utils/data/dataloader.py
@@ -160,8 +160,8 @@ class DataLoader(Generic[T_co]):
     __initialized = False
 
     def __init__(self, dataset: Dataset[T_co], batch_size: Optional[int] = 1,
-                 shuffle: bool = False, sampler: Optional[Sampler[int]] = None,
-                 batch_sampler: Optional[Sampler[Sequence[int]]] = None,
+                 shuffle: bool = False, sampler: Optional[Sampler] = None,
+                 batch_sampler: Optional[Sampler[Sequence]] = None,
                  num_workers: int = 0, collate_fn: Optional[_collate_fn_t] = None,
                  pin_memory: bool = False, drop_last: bool = False,
                  timeout: float = 0, worker_init_fn: Optional[_worker_init_fn_t] = None,

From ccca66597ab7079c39b744c2906171aa63e7db61 Mon Sep 17 00:00:00 2001
From: Sergei Vorobev <sergei.vorobev@getcruise.com>
Date: Thu, 19 Aug 2021 14:57:00 -0700
Subject: [PATCH 082/530] Replace hardcoded values in IndexKernel.cu (#63372)

Summary:
This is a small change that helps to maintain Cruise pytorch fork, since we use a different hardcoded value.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63372

Reviewed By: mruberry

Differential Revision: D30396171

Pulled By: ejguan

fbshipit-source-id: cc0023f58b5922d3d98c7283495e6dc8d35049b6
---
 aten/src/ATen/native/cuda/IndexKernel.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu
index 5a28e79136e81..1aabb0b9a5041 100644
--- a/aten/src/ATen/native/cuda/IndexKernel.cu
+++ b/aten/src/ATen/native/cuda/IndexKernel.cu
@@ -69,9 +69,9 @@ void gpu_index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef
     return;
   }
 
-  auto sizes = at::detail::Array<int64_t, 25>(0);
-  auto strides = at::detail::Array<int64_t, 25>(0);
-  auto index_ptrs = at::detail::Array<char*, 25>(nullptr);
+  auto sizes = at::detail::Array<int64_t, MAX_DIMS>(0);
+  auto strides = at::detail::Array<int64_t, MAX_DIMS>(0);
+  auto index_ptrs = at::detail::Array<char*, MAX_DIMS>(nullptr);
   for (int i = 0; i < num_indices; i++) {
     sizes[i] = index_size[i];
     strides[i] = index_stride[i];

From 71ab48ed3b99fa43b715cb2efa3e4b8da40e92cf Mon Sep 17 00:00:00 2001
From: Zeina Migeed <migeedz@fb.com>
Date: Thu, 19 Aug 2021 15:22:52 -0700
Subject: [PATCH 083/530] acc type inference (#63119)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63119

Test Plan:
buck run mode/opt-clang caffe2/torch/fb/model_transform/experimental:fx_ir_lower_inline_cvr -- \
    --action=lower_and_run \
    --filename=inline_cvr_7x_dec_2020.model \
    --print_glow_glog=True

Reviewed By: jamesr66a, jfix71, ansley

Differential Revision: D30235895

fbshipit-source-id: dab7f96e1799b99eeae0ee519cf0ddd636fddf2e
---
 .../experimental/graph_gradual_typechecker.py | 38 +++++++++++++++++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py
index e3c1ce82d7a46..5ce53a7ff6896 100644
--- a/torch/fx/experimental/graph_gradual_typechecker.py
+++ b/torch/fx/experimental/graph_gradual_typechecker.py
@@ -63,8 +63,6 @@ def broadcast_types(t1, t2):
 
         (t1, t2) = TensorType(tuple(new_t1)), TensorType(tuple(new_t2))
 
-        if not is_consistent(t1, t2):
-            raise TypeError
 
         return (t1, t2)
     else:
@@ -521,7 +519,7 @@ def type_check_node(self, n: Node):
             return n.type
 
         elif n.op == 'get_attr':
-            t = self.traced.get_parameter(n.target)
+            t = get_parameter(self.traced, n.target)  # type: ignore[arg-type]
             if isinstance(t.data, torch.Tensor):
                 n.type = TensorType(t.data.shape)
             return n.type
@@ -705,3 +703,37 @@ def refine_node(self, n: Node):
 
         else:
             pass
+
+
+def get_parameter(traced, target: str):
+    """
+    Returns the parameter given by ``target`` if it exists,
+    otherwise throws an error.
+
+    See the docstring for ``get_submodule`` for a more detailed
+    explanation of this method's functionality as well as how to
+    correctly specify ``target``.
+
+    Args:
+        target: The fully-qualified string name of the Parameter
+            to look for. (See ``get_submodule`` for how to specify a
+            fully-qualified string.)
+
+    Returns:
+        torch.nn.Parameter: The Parameter referenced by ``target``
+
+    Raises:
+        AttributeError: If the target string references an invalid
+            path or resolves to something that is not an
+            ``nn.Parameter``
+    """
+    module_path, _, param_name = target.rpartition(".")
+
+    mod: torch.nn.Module = traced.get_submodule(module_path)
+
+    if not hasattr(mod, param_name):
+        raise AttributeError(mod._get_name() + " has no attribute `" + param_name + "`")
+
+    param: torch.nn.Parameter = getattr(mod, param_name)
+
+    return param

From d0d27f697101e45cf3c8c8b3f762af4bb1396626 Mon Sep 17 00:00:00 2001
From: zhouzhuojie <zhouzhuojie@gmail.com>
Date: Thu, 19 Aug 2021 15:37:10 -0700
Subject: [PATCH 084/530] Add concurrency group for more workflows (#63606)

Summary:
Fixes unnecessary duplicated workflows runs

![image](https://user-images.githubusercontent.com/658840/130146332-ecf54e49-3538-49c1-88de-b099f1c1e41f.png)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63606

Reviewed By: malfet, mruberry

Differential Revision: D30436889

Pulled By: zhouzhuojie

fbshipit-source-id: aafbad1edc45e3ab9bceb00e8f3b4204f18e43d0
---
 .github/workflows/add_annotations.yml | 6 ++++++
 .github/workflows/auto_label.yml      | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/.github/workflows/add_annotations.yml b/.github/workflows/add_annotations.yml
index 40c2677aaf80d..9bb3c1b46e7b4 100644
--- a/.github/workflows/add_annotations.yml
+++ b/.github/workflows/add_annotations.yml
@@ -7,6 +7,12 @@ on:
     workflows:
       - Lint
 
+
+concurrency:
+  group: add-annotations-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
+
+
 jobs:
   annotate:
     if: ${{ github.repository_owner == 'pytorch' }}
diff --git a/.github/workflows/auto_label.yml b/.github/workflows/auto_label.yml
index 24fc02eff1439..1616ea9c90b8a 100644
--- a/.github/workflows/auto_label.yml
+++ b/.github/workflows/auto_label.yml
@@ -6,6 +6,12 @@ on:
   pull_request_target:
     types: [edited, opened, synchronize, reopened]
 
+
+concurrency:
+  group: auto-label-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
+
+
 jobs:
   auto-label-rocm:
     if: ${{ github.repository_owner == 'pytorch' }}

From f2bf0f229fdd0713064bc0fbd6dbc2063c71e2d4 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Thu, 19 Aug 2021 16:46:31 -0700
Subject: [PATCH 085/530] Revert D30359218: [pytorch][PR] [doc] pre-commit fix
 instructions

Test Plan: revert-hammer

Differential Revision:
D30359218 (https://github.com/pytorch/pytorch/commit/4e1d84ae8fae49995c8966ccbe0f34360978492f)

Original commit changeset: 61771babeac4

fbshipit-source-id: c2ac0a4a7463fafa03ad0b20bfb0701a8c1476c4
---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2d820a360dd7c..7d8659a8babff 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1151,7 +1151,7 @@ formatting and semantic checking of code. We provide a pre-commit git hook for
 performing these checks, before a commit is created:
 
   ```bash
-  ln -s tools/git-pre-commit .git/hooks/pre-commit
+  ln -s ../../tools/git-pre-commit .git/hooks/pre-commit
   ```
 
 You'll need to install an appropriately configured flake8; see

From 0b6cc8daf22f574d57d01156a879a33fa244306f Mon Sep 17 00:00:00 2001
From: Pavithran Ramachandran <pavithran@fb.com>
Date: Thu, 19 Aug 2021 18:39:50 -0700
Subject: [PATCH 086/530] [PyTorch][Edge] Support backtrace symbolication for
 Android builds (#63339)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63339

# Context
https://fb.workplace.com/groups/pytorch.dev/permalink/900474523864362/?comment_id=901125403799274&reply_comment_id=905023386742809

##### WHAT IS A STACK TRACE?
A stack trace (also called stack backtrace or stack traceback) is a report of the active stack frames at a certain point in time during the execution of a program.

Typically when an exception is thrown, one would expect to see the code (file:line) that threw the exception, and every intermediate frame up to and including the main function.

We are enabling android stack trace to help debugging on android devices.

Test Plan:
## Steps to test
```
buck build fbsource//xplat/caffe2/mode/aibench_pytorch_android -c pt.enable_qpl=0 -c pt.has_backtraces=1 fbsource//xplat/caffe2/fb/lite_predictor:lite_predictorAndroid#android-x86_64

one_world android emulator android-28

adb push ~/fbsource/buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictorAndroid#android-x86_64 /data/local/tmp

cd /data/local/tmp
./lite_predictorAndroid#android-x86_64

./lite_predictorAndroid#android-x86_64 --model ./detect.bc --input_dims "1,3,192,192" --input_type float --warmup 20 --iter 5 --report_pep true
```

## See how model file is not found stack traces is:

### before
```
./lite_predictorAndroid#android-x86_64 --model ./detect.bc --input_dims "1,3,192,192" --input_type float --warmup 20 --iter 5 --report_pep true

Run with 2 threads
Run with 2 threads
Loading model...
terminating with uncaught exception of type c10::Error: open file failed, file path: ./detect.bc
Exception raised from RAIIFile at xplat/caffe2/caffe2/serialize/file_adapter.cc:13 (most recent call first):
(no backtrace available)
Aborted
```

### after
```
134|generic_x86_64:/data/local/tmp $ ./lite_predictorAndroid#android-x86_64 --model ./detect.bc --input_dims "1,3,192,192" --input_type float --warmup 20 --iter 5 --report_pep true
Run with 2 threads
Run with 2 threads
Loading model...
terminating with uncaught exception of type c10::Error: open file failed, file path: ./detect.bc
Exception raised from RAIIFile at xplat/caffe2/caffe2/serialize/file_adapter.cc:13 (most recent call first):
 frame #0       c10::get_backtrace(unsigned long, unsigned long, bool)[0x59494274f10e]
 frame #1       [0x5949427b1eee]
 frame #2       [0x5949427b1eb2]
 frame #3       [0x5949427b1cdc]
 frame #4       std::__ndk1::function<std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > ()>::operator()() const[0x5949427afc34]
 frame #5       c10::Error::Error(c10::SourceLocation, std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> >)[0x5949427b05b1]
 frame #6       c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const&)[0x5949427aca5f]
 frame #7       caffe2::serialize::FileAdapter::RAIIFile::RAIIFile(std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const&)[0x5949426b37b2]
 frame #8       caffe2::serialize::FileAdapter::FileAdapter(std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const&)[0x5949426b3903]
 frame #9       torch::jit::_load_for_mobile(std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const&, c10::optional<c10::Device>, std::__ndk1::unordered_map<std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> >, std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> >, std::__ndk1::hash<std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > >, std::__ndk1::equal_to<std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > >, std::__ndk1::allocator<std::__ndk1::pair<std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const, std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > > > >&)[0x5949422737bd]
 frame #10      torch::jit::_load_for_mobile(std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const&, c10::optional<c10::Device>)[0x594942273769]
 frame #11      benchmark(std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const&, int, std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const&, std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const&, std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const&, bool, int, int, int, bool, int, bool, int, double, bool, bool, bool, std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char> > const&)[0x59494189b21d]
 frame #12      main[0x594941882aff]
 frame #13      __libc_init[0x7b699d08578d]
```

### what we get for os:linux
```
(base) [pavithran@devvm1803.vll0 /data/users/pavithran/fbsource] ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor --model ./detect.bc --input_dims "1,3,192,192" --input_type float --warmup 20 --iter 5 --report_pep true
Run with 24 threads
Run with 24 threads
Loading model...
terminate called after throwing an instance of 'c10::Error'
  what():  open file failed, file path: ./detect.bc
Exception raised from RAIIFile at xplat/caffe2/caffe2/serialize/file_adapter.cc:13 (most recent call first):
frame #0: ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor() [0x20cb7fe]
frame #1: ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor() [0x20cb6c6]
frame #2: std::function<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > ()>::operator()() const + 0x54 (0x20ca4e4 in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor)
frame #3: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x57 (0x20ca9a7 in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor)
frame #4: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x7a (0x20c823a in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor)
frame #5: caffe2::serialize::FileAdapter::RAIIFile::RAIIFile(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x96 (0x206f3d6 in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor)
frame #6: caffe2::serialize::FileAdapter::FileAdapter(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x42 (0x206f502 in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor)
frame #7: torch::jit::_load_for_mobile(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, c10::optional<c10::Device>, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >&) + 0x30 (0x1be826c in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor)
frame #8: torch::jit::_load_for_mobile(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, c10::optional<c10::Device>) + 0x35 (0x1be8214 in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor)
frame #9: benchmark(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool, int, int, int, bool, int, bool, int, double, bool, bool, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x16d (0x12093ad in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor)
frame #10: main + 0x25c (0x11f933c in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor)
frame #11: __libc_start_main + 0x105 (0x7fc7b9f2ed95 in /usr/local/fbcode/platform009/lib/libc.so.6)
frame #12: _start + 0x2a (0x11f902a in ./buck-out/gen/xplat/caffe2/fb/lite_predictor/lite_predictor)

Aborted (core dumped)
````

Reviewed By: dhruvbird

Differential Revision: D30135947

fbshipit-source-id: f50c634ef4545843305cad4b4a14a8776b1aec76
---
 c10/util/Backtrace.cpp | 69 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 68 insertions(+), 1 deletion(-)

diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp
index d978f32cd00e0..2c5e2e4cdca16 100644
--- a/c10/util/Backtrace.cpp
+++ b/c10/util/Backtrace.cpp
@@ -16,8 +16,13 @@
 
 #if SUPPORTS_BACKTRACE
 #include <cxxabi.h>
+#ifdef C10_ANDROID
+#include <dlfcn.h>
+#include <unwind.h>
+#else
 #include <execinfo.h>
 #endif
+#endif
 
 #ifdef FBCODE_CAFFE2
 #include <common/process/StackTrace.h>
@@ -25,6 +30,59 @@
 
 namespace c10 {
 
+#if SUPPORTS_BACKTRACE && defined(C10_ANDROID)
+
+struct AndroidBacktraceState {
+  std::vector<void*> buffer;
+};
+
+_Unwind_Reason_Code android_unwind_callback(
+    struct _Unwind_Context* context,
+    void* arg) {
+  AndroidBacktraceState* state = (AndroidBacktraceState*)arg;
+  uintptr_t pc = _Unwind_GetIP(context);
+  if (pc) {
+    state->buffer.emplace_back(reinterpret_cast<void*>(pc));
+  }
+  return _URC_NO_REASON;
+}
+
+void dump_stack(
+    std::ostream& os,
+    size_t frames_to_skip,
+    size_t maximum_number_of_frames) {
+  AndroidBacktraceState state;
+
+  _Unwind_Backtrace(android_unwind_callback, &state);
+
+  int idx = 0;
+  char* demangled = nullptr;
+  size_t length = 0;
+
+  for (const void* addr : state.buffer) {
+    const char* symbol = "";
+
+    Dl_info info;
+    if (dladdr(addr, &info) && info.dli_sname) {
+      symbol = info.dli_sname;
+    }
+
+    int status = 0;
+    demangled = __cxxabiv1::__cxa_demangle(
+        /*mangled_name*/ symbol,
+        /*output_buffer*/ demangled,
+        /*length*/ &length,
+        /*status*/ &status);
+
+    os << " frame #" << idx++ << "\t"
+       << ((demangled != NULL && status == 0) ? demangled : symbol) << "["
+       << addr << "]\t" << std::endl;
+  }
+  free(demangled);
+}
+
+#endif /* SUPPORTS_BACKTRACE && defined(C10_ANDROID) */
+
 #if SUPPORTS_BACKTRACE
 namespace {
 
@@ -42,6 +100,7 @@ struct FrameInformation {
   std::string object_file;
 };
 
+#ifndef C10_ANDROID
 bool is_python_frame(const FrameInformation& frame) {
   return frame.object_file == "python" || frame.object_file == "python3" ||
       (frame.object_file.find("libpython") != std::string::npos);
@@ -113,6 +172,7 @@ c10::optional<FrameInformation> parse_frame_information(
   frame.function_name = demangle(mangled_function_name.c_str());
   return frame;
 }
+#endif /* !defined(C10_ANDROID) */
 } // anonymous namespace
 #elif defined(_MSC_VER)
 namespace {
@@ -178,7 +238,7 @@ std::string get_backtrace(
   facebook::process::StackTrace st;
   return st.toString();
 
-#elif SUPPORTS_BACKTRACE
+#elif SUPPORTS_BACKTRACE && !defined(C10_ANDROID)
 
   // We always skip this frame (backtrace).
   frames_to_skip += 1;
@@ -249,6 +309,13 @@ std::string get_backtrace(
   }
 
   return stream.str();
+
+#elif SUPPORTS_BACKTRACE && defined(C10_ANDROID)
+
+  std::ostringstream oss;
+  dump_stack(oss, frames_to_skip, maximum_number_of_frames);
+  return oss.str().c_str();
+
 #elif defined(_MSC_VER) // !SUPPORTS_BACKTRACE
   // This backtrace retrieval is implemented on Windows via the Windows
   // API using `CaptureStackBackTrace`, `SymFromAddr` and

From b99a299c6002354acef5c43eba5cd25b41e773c7 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Thu, 19 Aug 2021 18:52:33 -0700
Subject: [PATCH 087/530] [PyTorch] Remove unused dump() methods in vec headers
 (#63533)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63533

These methods don't seem to be used, and they use std::cout, which incurs a small code size overhead on platforms using libstdc++ due to std::__ioinit (see #61500). Seems like we can just delete them?
ghstack-source-id: 136163409

Test Plan:
CI

Reviwers: #sentinel, dhruvbird

Reviewed By: dskhudia

Differential Revision: D30412269

fbshipit-source-id: 380b9aa2f9aabc4107188b6b209d2afc1769c0ee
---
 aten/src/ATen/cpu/vec/vec256/vec256_int.h     |  6 -----
 aten/src/ATen/cpu/vec/vec256/vec256_qint.h    | 25 -------------------
 .../cpu/vec/vec256/vsx/vec256_qint32_vsx.h    | 12 ---------
 .../cpu/vec/vec256/vsx/vec256_qint8_vsx.h     |  9 -------
 aten/src/ATen/cpu/vec/vec512/vec512_int.h     |  6 -----
 aten/src/ATen/cpu/vec/vec512/vec512_qint.h    | 25 -------------------
 6 files changed, 83 deletions(-)

diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
index 86cf42556d192..ab8e1d0252fd4 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
@@ -237,12 +237,6 @@ class Vectorized<int32_t> : public Vectorizedi {
       std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
     }
   }
-  void dump() const {
-      for (size_t i = 0; i < size(); ++i) {
-          std::cout << (int)((value_type*)&values)[i] << " ";
-      }
-      std::cout << std::endl;
-  }
   const int32_t& operator[](int idx) const  = delete;
   int32_t& operator[](int idx)  = delete;
   Vectorized<int32_t> abs() const {
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
index dc5e833127327..b247d46fff9bb 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
@@ -309,12 +309,6 @@ struct Vectorized<c10::qint32> : public Vectorizedqi {
       return _mm256_add_epi32(rounded, zero_point_v);
     }
 
-    void dump() const {
-        for (size_t i = 0; i < 8; ++i) {
-          std::cout << ((int32_t*)&vals)[i] << " ";
-        }
-        std::cout << std::endl;
-    }
  private:
     // Load from memory constructor
     Vectorized(const void* ptr) {
@@ -537,12 +531,6 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
       return RequantizeAvx2<value_type>(inp, multiplier_v, zero_point_v);
     }
 
-    void dump() const {
-        for (size_t i = 0; i < size(); ++i) {
-            std::cout << (int)((value_type*)&vals)[i] << " ";
-        }
-        std::cout << std::endl;
-    }
  private:
     // Load from memory constructor
     Vectorized(const void* ptr) {
@@ -702,12 +690,6 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
       return RequantizeAvx2<value_type>(inp, multiplier_v, zero_point_v);
     }
 
-    void dump() const {
-        for (size_t i = 0; i < size(); ++i) {
-            std::cout << (int)((value_type*)&vals)[i] << " ";
-        }
-        std::cout << std::endl;
-    }
  private:
 
     // Load from memory constructor
@@ -792,13 +774,6 @@ struct VectorizedQuantizedConverter {
     return rv;
   }
 
-  void dump() const {
-      for (int i = 0; i < size(); ++i) {
-          std::cout << vals[i] << " ";
-      }
-      std::cout << std::endl;
-  }
-
  protected:
   VectorizedQuantizedConverter() {}
 };
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h
index ed457b9adefc8..5b1622e825cb0 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h
@@ -196,18 +196,6 @@ struct Vectorized<c10::qint32> {
     return {veci0, veci1};
   }
 
-  void dump() const {
-    std::cout << _vec0[0] << " ";
-    std::cout << _vec0[1] << " ";
-    std::cout << _vec0[2] << " ";
-    std::cout << _vec0[3] << " ";
-    std::cout << _vec1[0] << " ";
-    std::cout << _vec1[1] << " ";
-    std::cout << _vec1[2] << " ";
-    std::cout << _vec1[3] << " ";
-    std::cout << std::endl;
-  }
-
   DEFINE_MEMBER_OP(operator==, c10::qint32, vec_cmpeq)
   DEFINE_MEMBER_OP(operator!=, c10::qint32, vec_cmpne)
   DEFINE_MEMBER_OP(operator<, c10::qint32, vec_cmplt)
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h
index f2a8446cd0ed9..82b2530b7ef3f 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h
@@ -361,15 +361,6 @@ struct Vectorized<c10::qint8> {
     return {vec0, vec1};
   }
 
-  void dump() const {
-    value_type vals[size()];
-    store((void*)vals);
-    for (int i = 0; i < size(); ++i) {
-      std::cout << (int)(vals[i]) << " ";
-    }
-    std::cout << std::endl;
-  }
-
   DEFINE_MEMBER_OP(operator==, c10::qint8, vec_cmpeq)
   DEFINE_MEMBER_OP(operator!=, c10::qint8, vec_cmpne)
   DEFINE_MEMBER_OP(operator<, c10::qint8, vec_cmplt)
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
index cc866c065bfba..f28c14ed3f73f 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
@@ -270,12 +270,6 @@ class Vectorized<int32_t> : public Vectorizedi {
       std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
     }
   }
-  void dump() const {
-      for (size_t i = 0; i < size(); ++i) {
-          std::cout << (int)((value_type*)&values)[i] << " ";
-      }
-      std::cout << std::endl;
-  }
   const int32_t& operator[](int idx) const  = delete;
   int32_t& operator[](int idx)  = delete;
   Vectorized<int32_t> abs() const {
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
index 5b5ac195f3caa..3a1eda8874f1a 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
@@ -321,12 +321,6 @@ struct Vectorized<c10::qint32> : public Vectorizedqi {
       return _mm512_add_epi32(rounded, zero_point_v);
     }
 
-    void dump() const {
-        for (size_t i = 0; i < 16; ++i) {
-          std::cout << ((int32_t*)&vals)[i] << " ";
-        }
-        std::cout << std::endl;
-    }
  private:
     // Load from memory constructor
     Vectorized(const void* ptr) {
@@ -549,12 +543,6 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
       return RequantizeAvx512<value_type>(inp, multiplier_v, zero_point_v);
     }
 
-    void dump() const {
-        for (size_t i = 0; i < size(); ++i) {
-            std::cout << (int)((value_type*)&vals)[i] << " ";
-        }
-        std::cout << std::endl;
-    }
  private:
     // Load from memory constructor
     Vectorized(const void* ptr) {
@@ -714,12 +702,6 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
       return RequantizeAvx512<value_type>(inp, multiplier_v, zero_point_v);
     }
 
-    void dump() const {
-        for (size_t i = 0; i < size(); ++i) {
-            std::cout << (int)((value_type*)&vals)[i] << " ";
-        }
-        std::cout << std::endl;
-    }
  private:
 
     // Load from memory constructor
@@ -806,13 +788,6 @@ struct VectorizedQuantizedConverter {
     return rv;
   }
 
-  void dump() const {
-      for (int i = 0; i < size(); ++i) {
-          std::cout << vals[i] << " ";
-      }
-      std::cout << std::endl;
-  }
-
  protected:
   VectorizedQuantizedConverter() {}
 };

From 0a66d5b3253fd2d2304f3897526db3c8fb139376 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Thu, 19 Aug 2021 18:52:33 -0700
Subject: [PATCH 088/530] [PyTorch] Remove unnecessary iostream includes in
 headers (#61500)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61500

libstdc++ defines a static variable called `std::__ioinit` in iostream that adds global constructor size overhead to each translation that includes iostream. To reduce the size overhead from that, we can often include ostream instead.
ghstack-source-id: 136163529

Test Plan: buildsizebot some mobile apps

Reviewed By: dhruvbird

Differential Revision: D29648016

fbshipit-source-id: 9c3139712c71248513cc5032d21e77f3ecbae8fe
---
 aten/src/ATen/core/Formatting.cpp             |  3 +++
 aten/src/ATen/core/Formatting.h               |  6 ++----
 aten/src/ATen/core/Vitals.cpp                 |  1 +
 aten/src/ATen/core/Vitals.h                   |  2 +-
 aten/src/ATen/core/function_schema.cpp        |  2 ++
 aten/src/ATen/core/interned_strings_class.h   |  2 --
 aten/src/ATen/core/ivalue.cpp                 |  1 +
 aten/src/ATen/core/jit_type.h                 |  5 +++--
 aten/src/ATen/cpu/vec/vec256/vec256.h         |  2 +-
 aten/src/ATen/cpu/vec/vec256/vec256_int.h     |  1 +
 aten/src/ATen/cpu/vec/vec256/vec256_qint.h    |  1 +
 aten/src/ATen/cpu/vml.h                       |  1 -
 aten/src/ATen/cudnn/Descriptors.cpp           |  2 +-
 aten/src/ATen/miopen/Descriptors.cpp          |  2 ++
 .../cpu/qnnpack/include/pack_block_sparse.h   | 21 +------------------
 .../cpu/qnnpack/src/pack_block_sparse.cc      | 21 +++++++++++++++++++
 c10/core/DispatchKey.h                        |  2 +-
 c10/core/Layout.h                             |  2 +-
 c10/core/MemoryFormat.h                       |  2 +-
 c10/core/ScalarType.h                         |  2 +-
 c10/util/Bitset.h                             |  1 -
 c10/util/complex.h                            |  1 -
 c10/util/either.h                             |  1 -
 c10/util/typeid.h                             |  1 -
 caffe2/core/init.cc                           |  1 +
 caffe2/core/operator.cc                       | 12 +++++++++++
 caffe2/core/operator.h                        | 11 +---------
 caffe2/core/operator_schema.cc                | 18 ++++++++++++++++
 caffe2/core/operator_schema.h                 | 16 +-------------
 caffe2/operators/utility_ops.cc               |  1 +
 30 files changed, 79 insertions(+), 65 deletions(-)

diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp
index baf1691bd1d53..dbbed6e3b0785 100644
--- a/aten/src/ATen/core/Formatting.cpp
+++ b/aten/src/ATen/core/Formatting.cpp
@@ -232,6 +232,9 @@ void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize)
   }
 }
 
+void print(const Tensor & t, int64_t linesize) {
+  print(std::cout,t,linesize);
+}
 std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesize) {
   FormatGuard guard(stream);
   if(!tensor_.defined()) {
diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h
index 86ea603951613..55cfe7b3bdf7e 100644
--- a/aten/src/ATen/core/Formatting.h
+++ b/aten/src/ATen/core/Formatting.h
@@ -2,7 +2,7 @@
 
 #include <c10/core/Scalar.h>
 #include <ATen/core/Tensor.h>
-#include <iostream>
+#include <ostream>
 
 
 namespace c10 {
@@ -18,9 +18,7 @@ TORCH_API std::ostream& print(
 static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) {
   return print(out,t,80);
 }
-static inline void print(const Tensor & t, int64_t linesize=80) {
-  print(std::cout,t,linesize);
-}
+TORCH_API void print(const Tensor & t, int64_t linesize=80);
 
 static inline std::ostream& operator<<(std::ostream & out, Scalar s) {
   if (s.isFloatingPoint()) {
diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp
index edff5211ea0f0..76fc652f9407e 100644
--- a/aten/src/ATen/core/Vitals.cpp
+++ b/aten/src/ATen/core/Vitals.cpp
@@ -1,5 +1,6 @@
 #include <ATen/core/Vitals.h>
 #include <cstdlib>
+#include <iostream>
 
 namespace at {
 namespace vitals {
diff --git a/aten/src/ATen/core/Vitals.h b/aten/src/ATen/core/Vitals.h
index c64cf7e629210..48913c54185f3 100644
--- a/aten/src/ATen/core/Vitals.h
+++ b/aten/src/ATen/core/Vitals.h
@@ -1,8 +1,8 @@
 #pragma once
 #include <cstring>
-#include <iostream>
 #include <map>
 #include <memory>
+#include <ostream>
 #include <sstream>
 #include <unordered_map>
 
diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp
index cc6de61dccead..a4319f03132cc 100644
--- a/aten/src/ATen/core/function_schema.cpp
+++ b/aten/src/ATen/core/function_schema.cpp
@@ -1,5 +1,7 @@
 #include <ATen/core/function_schema.h>
 
+#include <iostream>
+
 namespace c10 {
 
 void FunctionSchema::dump() const {
diff --git a/aten/src/ATen/core/interned_strings_class.h b/aten/src/ATen/core/interned_strings_class.h
index 54303e0384d28..8bbf3294844a5 100644
--- a/aten/src/ATen/core/interned_strings_class.h
+++ b/aten/src/ATen/core/interned_strings_class.h
@@ -1,8 +1,6 @@
 #include <cstdint>
 #include <cstring>
-#include <iostream>
 #include <mutex>
-#include <sstream>
 #include <string>
 #include <unordered_map>
 #include <vector>
diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp
index 6fab54ff9dd82..1404e01fa2434 100644
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@@ -8,6 +8,7 @@
 #include <c10/util/StringUtil.h>
 #include <c10/util/hash.h>
 #include <cmath>
+#include <iostream>
 
 namespace c10 {
 bool _fastEqualsForContainer(const IValue& lhs, const IValue& rhs) {
diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h
index d733fbd2da5b1..eee5acaccd655 100644
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@@ -9,10 +9,11 @@
 #include <c10/util/TypeList.h>
 #include <c10/util/Optional.h>
 
-#include <iostream>
+#include <array>
 #include <memory>
+#include <ostream>
+#include <sstream>
 #include <type_traits>
-#include <array>
 
 struct ClassType;
 namespace torch {
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h
index 0d13458bc4c1c..906d8a8653661 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
@@ -23,7 +23,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
-#include <iostream>
+#include <ostream>
 
 namespace at {
 namespace vec {
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
index ab8e1d0252fd4..5ee9919abca02 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
@@ -6,6 +6,7 @@
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #include <c10/macros/Macros.h>
+#include <iostream>
 
 namespace at {
 namespace vec {
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
index b247d46fff9bb..8cde485c90d7d 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
@@ -11,6 +11,7 @@
 #include <c10/util/quint8.h>
 
 #include <array>
+#include <iostream>
 
 // This file defines Vectorized<> for the quantized types.
 //
diff --git a/aten/src/ATen/cpu/vml.h b/aten/src/ATen/cpu/vml.h
index b9cc47f3fe73b..dbdef0b459928 100644
--- a/aten/src/ATen/cpu/vml.h
+++ b/aten/src/ATen/cpu/vml.h
@@ -28,7 +28,6 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
-#include <iostream>
 #include <type_traits>
 
 #if AT_MKL_ENABLED() && !defined(__APPLE__)
diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp
index 873431c1d96e5..f52280e9d2401 100644
--- a/aten/src/ATen/cudnn/Descriptors.cpp
+++ b/aten/src/ATen/cudnn/Descriptors.cpp
@@ -2,7 +2,7 @@
 
 #include <ATen/ATen.h>
 
-#include <ostream>
+#include <iostream>
 #include <sstream>
 
 namespace at { namespace native {
diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp
index 6a6476706ac6f..38875191b448b 100644
--- a/aten/src/ATen/miopen/Descriptors.cpp
+++ b/aten/src/ATen/miopen/Descriptors.cpp
@@ -1,6 +1,8 @@
 #include <ATen/miopen/Descriptors.h>
 #include <ATen/ATen.h>
 
+#include <iostream>
+
 namespace at { namespace native {
 
 namespace {
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/include/pack_block_sparse.h b/aten/src/ATen/native/quantized/cpu/qnnpack/include/pack_block_sparse.h
index 0f329296bc18b..62fdef2cdf9b2 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/include/pack_block_sparse.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/include/pack_block_sparse.h
@@ -8,7 +8,6 @@
 
 #pragma once
 #include <cstdint>
-#include <iostream>
 #include <memory>
 #include <vector>
 #include <cassert>
@@ -33,25 +32,7 @@ typedef struct BCSRMatrix {
 #endif
   uint32_t col_block_size;  // input features block size
   uint32_t row_block_size;  // output features block size
-  void print() {
-    std::cout << "row block size:" << row_block_size << std::endl;
-    std::cout << "col block size:" << col_block_size << std::endl;
-    std::cout << "row ptr\n";
-    for (const auto& t : row_values) {
-      std::cout << t << ", ";
-    }
-    std::cout << std::endl;
-    std::cout << "col indices\n";
-    for (const auto& t : col_indices) {
-      std::cout << t << ", ";
-    }
-    std::cout << std::endl;
-    std::cout << "Actual values\n";
-    for (const auto& t : values) {
-      std::cout << (uint32_t)t << ", ";
-    }
-    std::cout << std::endl;
-  }
+  void print() const;
 } BCSRMatrix;
 
 std::unique_ptr<BCSRMatrix> generateBlockCSRMatrix(
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/pack_block_sparse.cc b/aten/src/ATen/native/quantized/cpu/qnnpack/src/pack_block_sparse.cc
index ca694df3aba45..6a6134023bfc8 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/pack_block_sparse.cc
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/pack_block_sparse.cc
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 #include <cassert>
+#include <iostream>
 
 #include <pack_block_sparse.h>
 
@@ -78,4 +79,24 @@ std::unique_ptr<BCSRMatrix> generateBlockCSRMatrix(
   bcsr_mat.col_block_size = col_block_size;
   return bcsr_mat_ptr;
 }
+
+void BCSRMatrix::print() const {
+  std::cout << "row block size:" << row_block_size << std::endl;
+  std::cout << "col block size:" << col_block_size << std::endl;
+  std::cout << "row ptr\n";
+  for (const auto& t : row_values) {
+    std::cout << t << ", ";
+  }
+  std::cout << std::endl;
+  std::cout << "col indices\n";
+  for (const auto& t : col_indices) {
+    std::cout << t << ", ";
+  }
+  std::cout << std::endl;
+  std::cout << "Actual values\n";
+  for (const auto& t : values) {
+    std::cout << (uint32_t)t << ", ";
+  }
+  std::cout << std::endl;
+}
 } // namsepace qnnpack
diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h
index 9f21838ddb4a3..5b20a1ca327df 100644
--- a/c10/core/DispatchKey.h
+++ b/c10/core/DispatchKey.h
@@ -3,7 +3,7 @@
 #include <c10/macros/Macros.h>
 #include <c10/util/ArrayRef.h>
 #include <c10/util/Exception.h>
-#include <iostream>
+#include <ostream>
 #include <string>
 #include <vector>
 
diff --git a/c10/core/Layout.h b/c10/core/Layout.h
index 44168ebca4360..f37ceb18a835d 100644
--- a/c10/core/Layout.h
+++ b/c10/core/Layout.h
@@ -3,7 +3,7 @@
 #include <c10/core/Backend.h>
 #include <c10/util/Exception.h>
 
-#include <iostream>
+#include <ostream>
 
 namespace c10 {
 enum class Layout : int8_t { Strided, Sparse, SparseCsr, Mkldnn, NumOptions };
diff --git a/c10/core/MemoryFormat.h b/c10/core/MemoryFormat.h
index ba4e056e1e6c8..8cafde1b5c5e7 100644
--- a/c10/core/MemoryFormat.h
+++ b/c10/core/MemoryFormat.h
@@ -4,7 +4,7 @@
 #include <c10/util/ArrayRef.h>
 #include <c10/util/Exception.h>
 
-#include <iostream>
+#include <ostream>
 
 // Memory format is not the property of a Tensor. It is the way to tell an
 // operator how the result should be organized in memory and nothing more. That
diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h
index d652db5a215c6..f7b07100365fa 100644
--- a/c10/core/ScalarType.h
+++ b/c10/core/ScalarType.h
@@ -12,7 +12,7 @@
 
 #include <complex>
 #include <cstdint>
-#include <iostream>
+#include <ostream>
 
 namespace c10 {
 
diff --git a/c10/util/Bitset.h b/c10/util/Bitset.h
index 6f7c4b9a1d78b..bed04a438abea 100644
--- a/c10/util/Bitset.h
+++ b/c10/util/Bitset.h
@@ -3,7 +3,6 @@
 #include <c10/macros/Macros.h>
 #include <c10/util/C++17.h>
 #include <c10/util/Optional.h>
-#include <iostream>
 #if defined(_MSC_VER)
 #include <intrin.h>
 #endif
diff --git a/c10/util/complex.h b/c10/util/complex.h
index 2a565f8f2bf8f..67ed463febd94 100644
--- a/c10/util/complex.h
+++ b/c10/util/complex.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <complex>
-#include <iostream>
 
 #include <c10/macros/Macros.h>
 
diff --git a/c10/util/either.h b/c10/util/either.h
index da765b9a9bb17..757663f5896fb 100644
--- a/c10/util/either.h
+++ b/c10/util/either.h
@@ -6,7 +6,6 @@
 #include <c10/macros/Macros.h>
 #include <c10/util/C++17.h>
 #include <c10/util/Optional.h>
-#include <iostream>
 
 namespace c10 {
 /**
diff --git a/c10/util/typeid.h b/c10/util/typeid.h
index e6a5822a3e7ce..240c69e92400e 100644
--- a/c10/util/typeid.h
+++ b/c10/util/typeid.h
@@ -4,7 +4,6 @@
 #include <cassert>
 #include <complex>
 #include <cstdlib>
-#include <iostream>
 #include <memory>
 #include <mutex>
 #include <type_traits>
diff --git a/caffe2/core/init.cc b/caffe2/core/init.cc
index 529665869b3e1..bafbc825f8b79 100644
--- a/caffe2/core/init.cc
+++ b/caffe2/core/init.cc
@@ -3,6 +3,7 @@
 #include "caffe2/core/scope_guard.h"
 
 #include <iomanip>
+#include <iostream>
 #include <mutex>
 
 C10_DEFINE_bool(
diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc
index 846ab8ab55b46..ca66f7846c300 100644
--- a/caffe2/core/operator.cc
+++ b/caffe2/core/operator.cc
@@ -1,6 +1,7 @@
 #include "caffe2/core/operator.h"
 
 #include <algorithm>
+#include <iostream>
 
 #include "caffe2/core/init.h"
 #include "caffe2/core/logging.h"
@@ -355,6 +356,17 @@ void SetOpEnginePref(
   }
 }
 
+DeviceTypeRegisterer::DeviceTypeRegisterer(DeviceType type, RegistryFunction func) {
+  if (gDeviceTypeRegistry()->count(type)) {
+    std::cerr << "Device type " << DeviceTypeName(type)
+              << "registered twice. This should not happen. Did you have "
+      "duplicated numbers assigned to different devices?";
+    std::exit(1);
+  }
+  // Calling the registry function to get the actual registry pointer.
+  gDeviceTypeRegistry()->emplace(type, func());
+}
+
 unique_ptr<OperatorBase> CreateOperator(
     const OperatorDef& operator_def,
     Workspace* ws,
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
index fc9a6769c4e65..b840254612929 100644
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@@ -1330,16 +1330,7 @@ typedef c10::Registry<
 TORCH_API std::map<DeviceType, OperatorRegistry*>* gDeviceTypeRegistry();
 
 struct TORCH_API DeviceTypeRegisterer {
-  explicit DeviceTypeRegisterer(DeviceType type, RegistryFunction func) {
-    if (gDeviceTypeRegistry()->count(type)) {
-      std::cerr << "Device type " << DeviceTypeName(type)
-                << "registered twice. This should not happen. Did you have "
-                   "duplicated numbers assigned to different devices?";
-      std::exit(1);
-    }
-    // Calling the registry function to get the actual registry pointer.
-    gDeviceTypeRegistry()->emplace(type, func());
-  }
+  explicit DeviceTypeRegisterer(DeviceType type, RegistryFunction func);
 };
 
 #if defined(_MSC_VER)
diff --git a/caffe2/core/operator_schema.cc b/caffe2/core/operator_schema.cc
index fbfb8f404d359..29d0b3e78d9a4 100644
--- a/caffe2/core/operator_schema.cc
+++ b/caffe2/core/operator_schema.cc
@@ -1,6 +1,8 @@
 #include "caffe2/core/operator_schema.h"
 #include "caffe2/core/logging.h"
 
+#include <iostream>
+
 #include <c10/util/irange.h>
 
 namespace caffe2 {
@@ -520,6 +522,22 @@ C10_EXPORT std::ostream& operator<<(std::ostream& out, const OpSchema& schema) {
   return out;
 }
 
+OpSchema& OpSchemaRegistry::NewSchema(const string& key, const string& file, const int line) {
+  auto& m = map();
+  auto it = m.find(key);
+  if (it != m.end()) {
+    const auto& schema = it->second;
+    std::ios_base::Init init;
+    std::cerr << "Trying to register schema with name " << key
+              << " from file " << file << " line " << line
+              << ", but it is already registered from file " << schema.file()
+              << " line " << schema.line();
+    abort();
+  }
+  m.emplace(key, OpSchema(key, file, line));
+  return m[key];
+}
+
 CaffeMap<string, OpSchema>& OpSchemaRegistry::map() {
   static CaffeMap<string, OpSchema> map;
   return map;
diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h
index b19d5be079af2..64f5ef3ed883a 100644
--- a/caffe2/core/operator_schema.h
+++ b/caffe2/core/operator_schema.h
@@ -460,21 +460,7 @@ class TORCH_API OpSchema {
 class TORCH_API OpSchemaRegistry {
  public:
   static OpSchema&
-  NewSchema(const string& key, const string& file, const int line) {
-    auto& m = map();
-    auto it = m.find(key);
-    if (it != m.end()) {
-      const auto& schema = it->second;
-      std::ios_base::Init init;
-      std::cerr << "Trying to register schema with name " << key
-                << " from file " << file << " line " << line
-                << ", but it is already registered from file " << schema.file()
-                << " line " << schema.line();
-      abort();
-    }
-    m.emplace(key, OpSchema(key, file, line));
-    return m[key];
-  }
+  NewSchema(const string& key, const string& file, const int line);
 
   static const OpSchema* Schema(const string& key) {
     auto& m = map();
diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc
index 7b2a02fae696b..8b5e116024b81 100644
--- a/caffe2/operators/utility_ops.cc
+++ b/caffe2/operators/utility_ops.cc
@@ -1,5 +1,6 @@
 #include "caffe2/operators/utility_ops.h"
 #include <cmath>
+#include <iostream>
 #include "caffe2/utils/eigen_utils.h"
 
 namespace caffe2 {

From 5e31a3b9044f7724a36bd7e491a63914279ef259 Mon Sep 17 00:00:00 2001
From: Raghavan Raman <raghavanr@fb.com>
Date: Thu, 19 Aug 2021 22:50:32 -0700
Subject: [PATCH 089/530] [nnc] Updated sliceHead to do inplace mutation
 (#63531)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63531

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30412183

Pulled By: navahgar

fbshipit-source-id: 47ee9482a36e606788d28d22eee4edaca45ffa50
---
 test/cpp/tensorexpr/test_loopnest.cpp  | 2 +-
 torch/csrc/jit/tensorexpr/loopnest.cpp | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index 4a2a1d07db12e..b550f4819e970 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -284,7 +284,7 @@ TEST(LoopNest, ExprSliceHead) {
   ASSERT_NE(head, nullptr);
   ASSERT_NE(head, loops[0]);
   ASSERT_NE(tail, nullptr);
-  ASSERT_NE(tail, loops[0]);
+  ASSERT_EQ(tail, loops[0]);
 
   BlockPtr body = getSimplifiedBody(l);
   assertForRanges(body, {{0, 4}, {4, 10}});
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index 2256369e2e9e2..3c39dcde82e46 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -1306,11 +1306,10 @@ void LoopNest::sliceHead(ForPtr f, int factor, ForPtr* head, ForPtr* tail) {
   ExprPtr head_end = alloc<Min>(
       alloc<Add>(f->start(), alloc<IntImm>(factor)), f->stop(), true);
   *head = alloc<For>(f->var(), f->start(), head_end, Stmt::clone(f->body()));
-  *tail = alloc<For>(
-      f->var(), head_end, f->stop(), Stmt::clone(f->body()), f->loop_options());
+  p->insert_stmt_before(*head, f);
 
-  p->replace_stmt(f, *head);
-  p->insert_stmt_after(*tail, *head);
+  f->set_start(head_end);
+  *tail = f;
 
   if (f->loop_options().is_gpu_block_index() ||
       f->loop_options().is_gpu_thread_index()) {

From d82667f7e2cd812d98b9cc4f40df46b37a9ef653 Mon Sep 17 00:00:00 2001
From: Raghavan Raman <raghavanr@fb.com>
Date: Thu, 19 Aug 2021 22:50:32 -0700
Subject: [PATCH 090/530] [nnc] Updated sliceTail to do inplace mutation
 (#63532)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63532

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30412184

Pulled By: navahgar

fbshipit-source-id: e7669d3b9d24e14501f3feb6505c88d1d42030c6
---
 test/cpp/tensorexpr/test_loopnest.cpp  |  2 +-
 torch/csrc/jit/tensorexpr/loopnest.cpp | 11 +++--------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index b550f4819e970..898ee5293edab 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -380,7 +380,7 @@ TEST(LoopNest, ExprSliceTail) {
   LoopNest::sliceTail(loops[0], 4, &head, &tail);
 
   ASSERT_NE(head, nullptr);
-  ASSERT_NE(head, loops[0]);
+  ASSERT_EQ(head, loops[0]);
   ASSERT_NE(tail, nullptr);
   ASSERT_NE(tail, loops[0]);
 
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index 3c39dcde82e46..a296d8c7af79b 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -1345,16 +1345,11 @@ void LoopNest::sliceTail(ForPtr f, int factor, ForPtr* head, ForPtr* tail) {
 
   ExprPtr tail_start = alloc<Max>(
       f->start(), alloc<Sub>(f->stop(), alloc<IntImm>(factor)), true);
-  *head = alloc<For>(
-      f->var(),
-      f->start(),
-      tail_start,
-      Stmt::clone(f->body()),
-      f->loop_options());
   *tail = alloc<For>(f->var(), tail_start, f->stop(), Stmt::clone(f->body()));
+  p->insert_stmt_after(*tail, f);
 
-  p->replace_stmt(f, *head);
-  p->insert_stmt_after(*tail, *head);
+  f->set_stop(tail_start);
+  *head = f;
 
   if (f->loop_options().is_gpu_block_index() ||
       f->loop_options().is_gpu_thread_index()) {

From bec75daa77ade04ccef4f3de67fcae216bfd122d Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Thu, 19 Aug 2021 23:42:24 -0700
Subject: [PATCH 091/530] Update protobuf to 3.13.1 (#62571)

Summary:
Update bazel to 4.10.0

Update ASAN_SYMBOLIZER_PATH to llvm-7
Suppress `vptr` ubsan violations in `test_jit`
Fix ProtoBuf patching for ONNX which caused Windows builds to crash while attempting to free `std::string` allocated on stack

Fixes https://github.com/pytorch/pytorch/issues/62569

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62571

Reviewed By: walterddr

Differential Revision: D30048685

Pulled By: malfet

fbshipit-source-id: 6462c1bef9c42318551d2cf906bbab41e1d4e1cd
---
 .jenkins/pytorch/common_utils.sh | 4 ++--
 .jenkins/pytorch/test.sh         | 2 +-
 cmake/ProtoBuf.cmake             | 4 ++--
 cmake/ProtoBufPatch.cmake        | 4 ++--
 third_party/protobuf             | 2 +-
 ubsan.supp                       | 1 +
 6 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh
index fd94ce14a1c5f..49db051a0f484 100644
--- a/.jenkins/pytorch/common_utils.sh
+++ b/.jenkins/pytorch/common_utils.sh
@@ -59,9 +59,9 @@ function file_diff_from_base() {
 
 function get_bazel() {
   # download bazel version
-  wget https://github.com/bazelbuild/bazel/releases/download/3.1.0/bazel-3.1.0-linux-x86_64 -O tools/bazel
+  wget https://github.com/bazelbuild/bazel/releases/download/4.1.0/bazel-4.1.0-linux-x86_64 -O tools/bazel
   # verify content
-  echo '753434f4fa730266cf5ce21d1fdd425e1e167dd9347ad3e8adc19e8c0d54edca  tools/bazel' | sha256sum --quiet -c
+  echo '0eb2e378d2782e7810753e2162245ad1179c1bb12f848c692b4a595b4edf779b tools/bazel' | sha256sum --quiet -c
 
   chmod +x tools/bazel
 }
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 124fd7c8cdb3e..9f3e378a45fb8 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -91,7 +91,7 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
     export PYTORCH_TEST_WITH_ASAN=1
     export PYTORCH_TEST_WITH_UBSAN=1
     # TODO: Figure out how to avoid hard-coding these paths
-    export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-5.0/bin/llvm-symbolizer
+    export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-7/bin/llvm-symbolizer
     export TORCH_USE_RTLD_GLOBAL=1
     # NB: We load libtorch.so with RTLD_GLOBAL for UBSAN, unlike our
     # default behavior.
diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake
index d8a2c279aee47..8d7633c4ab037 100644
--- a/cmake/ProtoBuf.cmake
+++ b/cmake/ProtoBuf.cmake
@@ -196,7 +196,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var)
 
         # If we remove all reference to these pb.h files from external
         # libraries and binaries this rewrite can be removed.
-        COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -DLOCAL_PROTOBUF=${CAFFE2_LINK_LOCAL_PROTOBUF} -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake
+        COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake
 
         DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil}
         COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM )
@@ -209,7 +209,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var)
         COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
         COMMAND ${CAFFE2_PROTOC_EXECUTABLE} -I${PROJECT_SOURCE_DIR} --cpp_out=${DLLEXPORT_STR}${PROJECT_BINARY_DIR} ${abs_fil}
         COMMAND ${CAFFE2_PROTOC_EXECUTABLE} -I${PROJECT_SOURCE_DIR} --python_out "${PROJECT_BINARY_DIR}" ${abs_fil}
-        COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -DLOCAL_PROTOBUF=${CAFFE2_LINK_LOCAL_PROTOBUF} -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake
+        COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -DSYSTEM_PROTOBUF=YES -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake
         DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil}
         COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM )
     endif()
diff --git a/cmake/ProtoBufPatch.cmake b/cmake/ProtoBufPatch.cmake
index 704dcd7da1545..7f1de9a4a1de9 100644
--- a/cmake/ProtoBufPatch.cmake
+++ b/cmake/ProtoBufPatch.cmake
@@ -4,7 +4,7 @@
 
 file(READ ${FILENAME} content)
 
-if(LOCAL_PROTOBUF)
+if(NOT SYSTEM_PROTOBUF)
   # protobuf-3.6.0 pattern
   string(
     REPLACE
@@ -77,7 +77,7 @@ if(LOCAL_PROTOBUF)
 
     file(WRITE ${SOURCE_FILENAME} "${content_cc}")
   endif()
-endif()
+endif(NOT SYSTEM_PROTOBUF)
 
 # constexpr int TensorBoundShape_DimType_DimType_ARRAYSIZE = TensorBoundShape_DimType_DimType_MAX + 1;
 # throws
diff --git a/third_party/protobuf b/third_party/protobuf
index d0bfd5221182d..d1eca4e4b421c 160000
--- a/third_party/protobuf
+++ b/third_party/protobuf
@@ -1 +1 @@
-Subproject commit d0bfd5221182da1a7cc280f3337b5e41a89539cf
+Subproject commit d1eca4e4b421cd2997495c4b4e65cea6be4e9b8a
diff --git a/ubsan.supp b/ubsan.supp
index 62e64b785b94c..395f5208c8437 100644
--- a/ubsan.supp
+++ b/ubsan.supp
@@ -1 +1,2 @@
 vptr:libtorch_python.so
+vptr:test_jit

From 913c1f83f49f9e1e2a494186cc0069d780cee852 Mon Sep 17 00:00:00 2001
From: Don Jang <djang@fb.com>
Date: Fri, 20 Aug 2021 00:43:40 -0700
Subject: [PATCH 092/530] [Static Runtime] Add native op for aten::detach
 (#63625)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63625

This change adds a static runtime's native op implementation for `aten::detach` op.

See the standard  `aten::detach`'s implementation (https://codebrowser.bddppq.com/pytorch/pytorch/aten/src/ATen/native/TensorShape.cpp.html#_ZN2at6native6detachERKNS_6TensorE ) for comparison.

Test Plan:
- Added `StaticRuntime.IndividualOps_Detach`.

- Observed

```
V0819 18:55:33.181188 3092034 impl.cpp:1398] Switch to native impl for node: %a.1 : Tensor = aten::detach(%input.1)
```

Reviewed By: hlu1

Differential Revision: D30443187

fbshipit-source-id: d6e0eadb1b817e0a126c4fc97526abc276ee8a17
---
 benchmarks/static_runtime/test_scripts.h         | 12 ++++++++++++
 benchmarks/static_runtime/test_static_runtime.cc | 11 +++++++++++
 torch/csrc/jit/runtime/static/native_ops.cpp     | 15 +++++++++++++++
 3 files changed, 38 insertions(+)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 8db8da2887799..9946c7af02e5a 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -286,6 +286,18 @@ const auto to_script_4 = R"JIT(
       return (c)
 )JIT";
 
+const auto detach_script_0 = R"JIT(
+  def forward(self, input: Tensor):
+      a = input.detach()
+      return input is a
+)JIT";
+
+const auto detach_script_1 = R"JIT(
+  def forward(self, input: Tensor):
+      a = input.detach()
+      return a.clone()
+)JIT";
+
 const std::string embedding_bag_default = R"JIT(
   def forward(self, a: Tensor, b: Tensor, c: Tensor):
       return torch.embedding_bag(a, b, c)
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 14d613f074858..ec703ef8a2ec1 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -589,6 +589,17 @@ TEST(StaticRuntime, IndividualOps_to) {
   test_to(at::ScalarType::Half, false, true, c10::MemoryFormat::ChannelsLast);
 }
 
+TEST(StaticRuntime, IndividualOps_Detach) {
+  auto a = at::randn({4, 3, 1, 2});
+  auto b = at::randn({3, 2, 2});
+  std::vector<IValue> args{a};
+  std::vector<IValue> args2{b};
+  testStaticRuntime(detach_script_0, args);
+  testStaticRuntime(detach_script_0, args, args2);
+  testStaticRuntime(detach_script_1, args);
+  testStaticRuntime(detach_script_1, args, args2);
+}
+
 TEST(StaticRuntime, IndividualOps_Full) {
   auto dtype = at::ScalarType::Int;
   auto cpu = at::Device(DeviceType::CPU);
diff --git a/torch/csrc/jit/runtime/static/native_ops.cpp b/torch/csrc/jit/runtime/static/native_ops.cpp
index 616ad87119ab3..61a6554a3c5cc 100644
--- a/torch/csrc/jit/runtime/static/native_ops.cpp
+++ b/torch/csrc/jit/runtime/static/native_ops.cpp
@@ -355,6 +355,21 @@ REGISTER_NATIVE_OPERATOR_FUNCTOR(aten::to, aten_to, [](Node* n) -> SROperator {
   };
 });
 
+REGISTER_NATIVE_OPERATOR_FUNCTOR(
+    aten::detach,
+    aten_detach,
+    [](Node* n) -> SROperator {
+      if (!n->matches(
+              torch::schema("aten::detach(Tensor(a) self) -> Tensor(a)"))) {
+        LogAndDumpSchema(n);
+        return nullptr;
+      }
+      return [](ProcessedNode* p_node) {
+        const auto& in0_t = p_node->Input(0).toTensor();
+        p_node->Output(0) = at::native::alias(in0_t);
+      };
+    });
+
 REGISTER_NATIVE_OPERATOR_FUNCTOR(
     prim::isinstance,
     prim_isinstance,

From ae901e372e7b05fe1802e44fe2f1f6aa015710af Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Fri, 20 Aug 2021 06:14:13 -0700
Subject: [PATCH 093/530] [Static Runtime] Enable RemoveListMutation (#63536)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63536

Enable a pass that transforms sequences like this:
```
li = []
li.append(1)
li.append(2)
```
into this:
```
li = [1, 2]
```
Initially I implemented this pass myself (D30387213), but I discovered that there is an existing pass that does the same thing.

Reviewed By: hlu1

Differential Revision: D30412970

fbshipit-source-id: 0810ef03480878d5039bd800a40f5fd31c2652ec
---
 torch/csrc/jit/passes/freeze_module.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp
index 063b867319629..df1c64bcc4740 100644
--- a/torch/csrc/jit/passes/freeze_module.cpp
+++ b/torch/csrc/jit/passes/freeze_module.cpp
@@ -7,6 +7,7 @@
 #include <torch/csrc/jit/passes/clear_profiling.h>
 #include <torch/csrc/jit/passes/inliner.h>
 #include <torch/csrc/jit/passes/lower_tuples.h>
+#include <torch/csrc/jit/passes/remove_mutation.h>
 #include <torch/csrc/jit/runtime/graph_executor_impl.h>
 
 #include <stack>
@@ -90,6 +91,7 @@ class AttributePropagator {
     auto applyOptimizations = [](std::shared_ptr<Graph>& subgraph) {
       runOptimization(
           subgraph, /* unroll? */ false, /* const_prop_user_classes? */ false);
+      RemoveListMutation(subgraph);
       LowerSimpleTuples(subgraph);
     };
 

From 0f2c60f0e37fe1738d30d3db3fd48e88b8087af7 Mon Sep 17 00:00:00 2001
From: Vitaly Fedyunin <vitaly.fedyunin@gmail.com>
Date: Fri, 20 Aug 2021 08:36:14 -0700
Subject: [PATCH 094/530] Adding IterableAsDataPipe IterDataPipe (#63522)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63522

Supports sharding and batching on loader level
* **#63522 Adding IterableAsDataPipe IterDataPipe
usefull for tests and simple cases**

usefull for tests and simple cases

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30426528

Pulled By: VitalyFedyunin

fbshipit-source-id: 535b5cc1505bb58731fcca8170541ac5ee7bd417
---
 torch/utils/data/datapipes/iter/__init__.py |  4 ++++
 torch/utils/data/datapipes/iter/utils.py    | 10 ++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 torch/utils/data/datapipes/iter/utils.py

diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index 0bcfdc44c31cf..bdaef95e9fa56 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -43,6 +43,9 @@
 from torch.utils.data.datapipes.iter.tobytes import (
     ToBytesIterDataPipe as ToBytes,
 )
+from torch.utils.data.datapipes.iter.utils import (
+    IterableAsDataPipeIterDataPipe as IterableAsDataPipe,
+)
 
 __all__ = ['Batch',
            'BucketBatcher',
@@ -51,6 +54,7 @@
            'Filter',
            'GroupByKey',
            'HttpReader',
+           'IterableAsDataPipe',
            'ListDirFiles',
            'LoadFilesFromDisk',
            'Map',
diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py
new file mode 100644
index 0000000000000..ea241d9f2716c
--- /dev/null
+++ b/torch/utils/data/datapipes/iter/utils.py
@@ -0,0 +1,10 @@
+from torch.utils.data import IterDataPipe
+
+
+class IterableAsDataPipeIterDataPipe(IterDataPipe):
+    def __init__(self, iterable):
+        self.iterable = iterable
+
+    def __iter__(self):
+        for data in self.iterable:
+            yield data

From 99e28baeba4f1ffb2623e64694b2aac13df5e0fb Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Fri, 20 Aug 2021 08:42:31 -0700
Subject: [PATCH 095/530] Small custom function refactor which doesn't change
 anything (#63433)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63433

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D30431970

Pulled By: albanD

fbshipit-source-id: 905fa4d2ddeca18005b1bcb13dd6f8a080327e7c
---
 torch/csrc/autograd/custom_function.cpp | 34 ++++++++++++++++++-------
 torch/csrc/autograd/custom_function.h   |  2 ++
 torch/csrc/autograd/python_function.cpp | 19 ++++++++++++--
 torch/csrc/autograd/python_function.h   | 11 --------
 4 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp
index 502919ff3a6a4..fdcf9971a0606 100644
--- a/torch/csrc/autograd/custom_function.cpp
+++ b/torch/csrc/autograd/custom_function.cpp
@@ -26,17 +26,13 @@ Variable VariableInfo::zeros(at::OptionalDeviceGuard& device_guard) const {
   }
 }
 
-std::vector<c10::optional<Variable>> _wrap_outputs(const variable_list &input_vars,
+optional_variable_list _process_backward_mode_ad(
+  const std::unordered_set<at::TensorImpl*> &inputs_set,
   const std::unordered_set<at::TensorImpl*> &non_differentiable,
   const std::unordered_set<at::TensorImpl*> &dirty_inputs,
   const at::ArrayRef<c10::optional<Variable>> raw_outputs,
   const std::shared_ptr<Node> &cdata) {
 
-  std::unordered_set<at::TensorImpl*> inputs;
-  inputs.reserve(input_vars.size());
-  for (auto& var : input_vars) {
-    inputs.emplace(var.unsafeGetTensorImpl());
-  }
 
   int num_outputs = raw_outputs.size();
 
@@ -63,7 +59,7 @@ std::vector<c10::optional<Variable>> _wrap_outputs(const variable_list &input_va
       // Here, `y` requires_grad (!).
     } else if (is_modified) {
       if (var.is_leaf() && var.requires_grad()) {
-        throw std::runtime_error("a leaf Variable that requires grad has been used in an in-place operation.");
+        TORCH_CHECK(false, "a leaf Variable that requires grad has been used in an in-place operation.");
       }
       // No need to mark as modified Tensors that are not inputs.
       if (!is_input) {
@@ -105,7 +101,7 @@ std::vector<c10::optional<Variable>> _wrap_outputs(const variable_list &input_va
     }
   };
 
-  std::vector<c10::optional<Variable>> outputs;
+  optional_variable_list outputs;
   std::unordered_set<at::TensorImpl*> outputs_impl; // For dirty_inputs check
   outputs.reserve(num_outputs);
   int num_diff_outputs = 0;
@@ -125,7 +121,7 @@ std::vector<c10::optional<Variable>> _wrap_outputs(const variable_list &input_va
     Variable var = raw_outputs[i].value();
 
     auto out_tensor_impl = var.unsafeGetTensorImpl();
-    bool is_input = inputs.count(out_tensor_impl) > 0;
+    bool is_input = inputs_set.count(out_tensor_impl) > 0;
     bool is_modified = dirty_inputs.count(out_tensor_impl) > 0;
     bool is_differentiable = cdata && non_differentiable.count(out_tensor_impl) == 0
                               && isDifferentiableType(var.scalar_type());
@@ -177,6 +173,26 @@ std::vector<c10::optional<Variable>> _wrap_outputs(const variable_list &input_va
   return outputs;
 }
 
+
+
+optional_variable_list _wrap_outputs(const variable_list &input_vars,
+  const std::unordered_set<at::TensorImpl*> &non_differentiable,
+  const std::unordered_set<at::TensorImpl*> &dirty_inputs,
+  const at::ArrayRef<c10::optional<Variable>> raw_outputs,
+  const std::shared_ptr<Node> &cdata) {
+
+  std::unordered_set<at::TensorImpl*> inputs_set;
+  inputs_set.reserve(input_vars.size());
+  for (auto& var : input_vars) {
+    inputs_set.emplace(var.unsafeGetTensorImpl());
+  }
+
+  auto outputs = _process_backward_mode_ad(inputs_set, non_differentiable, dirty_inputs, raw_outputs, cdata);
+
+
+  return outputs;
+}
+
 void check_variable_result(const Variable& original, const Variable& result, std::string hook_name) {
   if (!original.options().type_equal(result.options())) {
     std::stringstream ss;
diff --git a/torch/csrc/autograd/custom_function.h b/torch/csrc/autograd/custom_function.h
index 243622f650666..376cab693e453 100644
--- a/torch/csrc/autograd/custom_function.h
+++ b/torch/csrc/autograd/custom_function.h
@@ -9,6 +9,8 @@
 
 namespace torch { namespace autograd {
 
+using optional_variable_list = std::vector<c10::optional<Variable>>;
+
 TORCH_API std::vector<c10::optional<Variable>> _wrap_outputs(
   const variable_list &input_vars,
   const std::unordered_set<at::TensorImpl*> &non_differentiable,
diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp
index dd58a68134b8f..14874186d6f22 100644
--- a/torch/csrc/autograd/python_function.cpp
+++ b/torch/csrc/autograd/python_function.cpp
@@ -45,14 +45,29 @@ PyObject *THPFunctionClass = nullptr;
 #define THPFunction_assert(condition, ...)                                     \
   if (!(condition)) { THPUtils_setError(__VA_ARGS__); throw python_error(); }
 
-namespace torch { namespace autograd {
+// Anonymous namespace for helpful functions used in this file
+namespace {
 
-void PyNode::throw_python_error() {
+// Throw a python_error with the PyErr state persisted, so that we
+// don't lose the error state if the GIL is released when we don't
+// have a PyThreadState created beforehand, this is made so that
+// even for pure C++ thread without a pre-created PyThreadState could
+// also capture the correct error message.
+// TODO: This is a temporary approach to allow C++ thread to correctly
+// capture Python Error in autograd, remove this when c10 thread pool
+// allow to do one time initialization.
+// see discussion in https://github.com/pytorch/pytorch/pull/34845
+// Follow up issue: https://github.com/pytorch/pytorch/issues/35006
+void throw_python_error() {
   python_error err;
   err.persist();
   throw err;
 }
 
+}
+
+namespace torch { namespace autograd {
+
 // NOTE: this function is written in a way that assumes it's only called for backward;
 // it's used by engine.cpp.  This is responsible for forwarding a call from
 // C++'s Node::apply to a Python method "apply".
diff --git a/torch/csrc/autograd/python_function.h b/torch/csrc/autograd/python_function.h
index 8f4d12ba640fc..3657807f35964 100644
--- a/torch/csrc/autograd/python_function.h
+++ b/torch/csrc/autograd/python_function.h
@@ -27,17 +27,6 @@ struct PyNode : public Node {
 
   variable_list apply(variable_list&& inputs) override;
 
-  // Throw a python_error with the PyErr state persisted, so that we
-  // don't lose the error state if the GIL is released when we don't
-  // have a PyThreadState created beforehand, this is made so that
-  // even for pure C++ thread without a pre-created PyThreadState could
-  // also capture the correct error message.
-  // TODO: This is a temporary approach to allow C++ thread to correctly
-  // capture Python Error in autograd, remove this when c10 thread pool
-  // allow to do one time initialization.
-  // see discussion in https://github.com/pytorch/pytorch/pull/34845
-  // Follow up issue: https://github.com/pytorch/pytorch/issues/35006
-  void throw_python_error();
   void release_variables() override;
   std::string name() const override;
   bool is_traceable() override;

From 5a7133b87fe2fd7d025d36855ed4cc06539a9299 Mon Sep 17 00:00:00 2001
From: Vitaly Fedyunin <vitaly.fedyunin@gmail.com>
Date: Fri, 20 Aug 2021 09:00:23 -0700
Subject: [PATCH 096/530] Adding DataLoader2 class as future replacement of
 DataLoader (#63523)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63523

Supports sharding and batching on loader level**
* #63522 Adding IterableAsDataPipe IterDataPipe
usefull for tests and simple cases

Supports sharding and batching on loader level

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30426527

Pulled By: VitalyFedyunin

fbshipit-source-id: e5905d3364c4880e720dd62fb066f08881c71a6e
---
 test/test_dataloader.py                     | 25 +++++-
 torch/utils/data/__init__.py                |  6 +-
 torch/utils/data/dataloader_experimental.py | 89 +++++++++++++++++++++
 3 files changed, 117 insertions(+), 3 deletions(-)
 create mode 100644 torch/utils/data/dataloader_experimental.py

diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index c68d7e2e14b33..71230cfbb7a67 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -13,9 +13,20 @@
 import warnings
 import tempfile
 from torch import multiprocessing as mp
-from torch.utils.data import _utils, Dataset, IterableDataset, TensorDataset, DataLoader, ConcatDataset, ChainDataset, Subset
+from torch.utils.data import (
+    ChainDataset,
+    ConcatDataset,
+    DataLoader,
+    DataLoader2,
+    Dataset,
+    IterableDataset,
+    Subset,
+    TensorDataset,
+    _utils
+)
 from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL
 from torch.utils.data.dataset import random_split
+from torch.utils.data.datapipes.iter import IterableAsDataPipe
 from torch._utils import ExceptionWrapper
 from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS,
                                                   IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest,
@@ -1934,6 +1945,18 @@ def test_excessive_thread_creation_warning(self):
             dataloader = DataLoader(self.dataset, batch_size=2, num_workers=1000)
 
 
+@unittest.skipIf(
+    TEST_WITH_TSAN,
+    "Fails with TSAN with the following error: starting new threads after multi-threaded "
+    "fork is not supported. Dying (set die_after_fork=0 to override)")
+class TestDataLoader2(TestCase):
+    def test_basics(self):
+        dp = IterableAsDataPipe(list(range(10)))
+        dl = DataLoader(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2)
+        dl2 = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2)
+        self.assertEquals(list(dl), list(dl2))
+
+
 class StringDataset(Dataset):
     def __init__(self):
         self.s = '12345'
diff --git a/torch/utils/data/__init__.py b/torch/utils/data/__init__.py
index 1d18b7b030894..0af9e6193af3d 100644
--- a/torch/utils/data/__init__.py
+++ b/torch/utils/data/__init__.py
@@ -11,9 +11,9 @@
 from torch.utils.data.dataset import (
     ChainDataset,
     ConcatDataset,
+    DataChunk,
     Dataset,
     Dataset as MapDataPipe,
-    DataChunk,
     IterableDataset,
     IterableDataset as IterDataPipe,
     Subset,
@@ -34,11 +34,14 @@
     runtime_validation,
     runtime_validation_disabled,
 )
+from torch.utils.data.dataloader_experimental import DataLoader2
+
 
 __all__ = ['BatchSampler',
            'ChainDataset',
            'ConcatDataset',
            'DataLoader',
+           'DataLoader2',
            'Dataset',
            'DistributedSampler',
            'IterDataPipe',
@@ -68,4 +71,3 @@
 ################################################################################
 # import subpackage
 ################################################################################
-from torch.utils.data import datapipes
diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py
new file mode 100644
index 0000000000000..85028afd22124
--- /dev/null
+++ b/torch/utils/data/dataloader_experimental.py
@@ -0,0 +1,89 @@
+
+import functools
+
+import torch.utils.data.backward_compatibility
+from torch.utils.data import DataLoader, IterDataPipe
+from torch.utils.data.datapipes.iter import IterableAsDataPipe
+
+class DataLoader2:
+    def __new__(cls,
+                dataset,
+                batch_size=1,
+                shuffle=False,
+                sampler=None,
+                batch_sampler=None,
+                num_workers=0,
+                collate_fn=None,
+                pin_memory=False,
+                drop_last=False,
+                timeout=0,
+                worker_init_fn=None,
+                *,
+                prefetch_factor=2,
+                persistent_workers=False,
+                batch_outside_worker=False):
+        if isinstance(dataset, IterDataPipe):
+            datapipe = dataset
+            if batch_sampler is not None:
+                raise Exception(
+                    'batch_sampler is not yet supported for DataPipes')
+            if sampler is not None:
+                raise Exception(
+                    'sampler is not yet supported for DataPipes')
+            if shuffle:
+                datapipe = datapipe.shuffle()
+            if batch_outside_worker and pin_memory:
+                raise Exception(
+                    'pin_memory is not yet compatible with batch_outside_worker')
+            if not batch_outside_worker:
+                if batch_size is not None:
+                    datapipe = datapipe.batch(batch_size, drop_last=drop_last)
+                    if collate_fn is None:
+                        collate_fn = torch.utils.data._utils.collate.default_collate
+
+            def sharding_worker_init_fn(worker_init_fn, worker_id):
+                if worker_init_fn is not None:
+                    worker_init_fn(worker_id)
+                torch.utils.data.backward_compatibility.worker_init_fn(
+                    worker_id)
+
+            my_worker_init_fn = functools.partial(
+                sharding_worker_init_fn, worker_init_fn)
+
+            data_loader = DataLoader(datapipe,
+                                     batch_size=None,  # Replaced by .batch DataPipe
+                                     shuffle=False,  # Replaced by .shuffle DataPipe
+                                     sampler=None,
+                                     batch_sampler=None,
+                                     num_workers=num_workers,
+                                     collate_fn=collate_fn,
+                                     pin_memory=pin_memory,
+                                     drop_last=False,  # Replaced by .batch DataPipe
+                                     timeout=timeout,
+                                     worker_init_fn=my_worker_init_fn,
+                                     prefetch_factor=prefetch_factor,
+                                     persistent_workers=persistent_workers)
+
+            if not batch_outside_worker:
+                return data_loader
+            else:
+                if collate_fn is None:
+                    collate_fn = torch.utils.data._utils.collate.default_collate
+                datapipe = IterableAsDataPipe(data_loader).batch(
+                    batch_size, drop_last=drop_last).map(collate_fn)
+                return datapipe
+
+        else:
+            return DataLoader(dataset,
+                              batch_size=batch_size,
+                              shuffle=shuffle,
+                              sampler=sampler,
+                              batch_sampler=batch_sampler,
+                              num_workers=num_workers,
+                              collate_fn=collate_fn,
+                              pin_memory=pin_memory,
+                              drop_last=drop_last,
+                              timeout=timeout,
+                              worker_init_fn=worker_init_fn,
+                              prefetch_factor=prefetch_factor,
+                              persistent_workers=persistent_workers)

From 5f997a7d2fcd81584d1c9f6e173e30c867892ee8 Mon Sep 17 00:00:00 2001
From: Pavithran Ramachandran <pavithran@fb.com>
Date: Fri, 20 Aug 2021 09:34:53 -0700
Subject: [PATCH 097/530] [PyTorch][Edge] Improve InflatableArgs for Bundled
 Inputs (#62368)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62368

# Context
The bundled inputs accepts an expression in the form of string InflatableArg.fmt that can be applied on the inputs to inflate. The InflatableArg.fmt provides flexibility to have custom transformation to inflate. When the input arguments to a function are not Tensor type, TorchScript casts the inputs from type T to Optional[T] expects the function to handle Nullable (None) clause as well. This becomes tricky to handle in one line code or lambda functions.

We propose an alternative way which allows InflatableArg to include the text of a TorchScript function that would be defined on the module as a helper, then use that in its inflation expression. This can be provided by InflatableArg.fmt_fn. Please refer to pytorch/test/test_bundled_inputs.py for example on how to use the same.

Also refer JacobSzwejbka comment on the same [here](https://github.com/pytorch/pytorch/pull/62368#issuecomment-892012812)

# Mitigation
Allow InflatedArg to include the text of a TorchScript function that would be defined on the module as a helper, then use that in its inflation expression.
ghstack-source-id: 135158680

Test Plan:
To run `test_dict_args`

```
(base) [pavithran@devvm1803.vll0 /data/users/pavithran/fbsource/fbcode] buck test //caffe2/test:test_bundled_inputs -- test_dict_args
Action graph will be rebuilt because files have been added or removed.
Building: finished in 5.4 sec (100%) 12180/12180 jobs, 0/12180 updated
  Total time: 5.8 sec
More details at https://www.internalfb.com/intern/buck/build/fafcf277-1095-4cba-978d-6022f0d391ad
Tpx test run coordinator for Facebook. See https://fburl.com/tpx for details.
Running with tpx session id: 5ef9de71-c1b1-406b-a6c0-3321c2368b8d
Trace available for this run at /tmp/tpx-20210727-163946.454212/trace.log
Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/7036874465805934
    ✓ ListingSuccess: caffe2/test:test_bundled_inputs - main (11.365)
    ✓ Pass: caffe2/test:test_bundled_inputs - test_dict_args (test_bundled_inputs.TestBundledInputs) (12.307)
Summary
  Pass: 1
  ListingSuccess: 1
If you need help understanding your runs, please follow the wiki: https://fburl.com/posting_in_tpx_users
Finished test run: https://www.internalfb.com/intern/testinfra/testrun/7036874465805934
```

To check the py code of TS module:
P433043973

Reviewed By: dreiss

Differential Revision: D29950421

fbshipit-source-id: c819ec5c94429b7fbf6c4beb0259457f169b08ec
---
 test/test_bundled_inputs.py   | 115 +++++++++++++++++++++++++++++++++-
 torch/utils/bundled_inputs.py |  75 +++++++++++++++++++---
 2 files changed, 180 insertions(+), 10 deletions(-)

diff --git a/test/test_bundled_inputs.py b/test/test_bundled_inputs.py
index a0fb535da8a86..62263e130fd8b 100644
--- a/test/test_bundled_inputs.py
+++ b/test/test_bundled_inputs.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 import io
 import textwrap
-from typing import List
+from typing import List, Optional, Dict
 
 import torch
 import torch.utils.bundled_inputs
@@ -324,5 +324,118 @@ def forward(self, arg):
         )
         self.assertEqual(bundled_model2.get_all_bundled_inputs(), [(torch.ones(2),)])
 
+
+    def test_dict_args(self):
+        class MyModel(torch.nn.Module):
+            def forward(
+                self,
+                arg1: Optional[Dict[str, torch.Tensor]],
+                arg2: Optional[List[torch.Tensor]],
+                arg3: torch.Tensor,
+            ):
+                if arg1 is None:
+                    return arg3
+                elif arg2 is None:
+                    return arg1["a"] + arg1["b"]
+                else:
+                    return arg1["a"] + arg1["b"] + arg2[0]
+
+        small_sample = dict(
+            a=torch.zeros([10, 20]),
+            b=torch.zeros([1, 1]),
+            c=torch.zeros([10, 20]),
+        )
+        small_list = [torch.zeros([10, 20])]
+
+        big_sample = dict(
+            a=torch.zeros([1 << 5, 1 << 8, 1 << 10]),
+            b=torch.zeros([1 << 5, 1 << 8, 1 << 10]),
+            c=torch.zeros([1 << 5, 1 << 8, 1 << 10]),
+        )
+        big_list = [torch.zeros([1 << 5, 1 << 8, 1 << 10])]
+
+        def condensed(t):
+            ret = torch.empty_like(t).flatten()[0].clone().expand(t.shape)
+            assert ret.storage().size() == 1
+            # ret.storage()[0] = 0
+            return ret
+
+        def bundle_optional_dict_of_randn(template):
+            return torch.utils.bundled_inputs.InflatableArg(
+                value=(
+                    None
+                    if template is None
+                    else {k: condensed(v) for (k, v) in template.items()}
+                ),
+                fmt="{}",
+                fmt_fn="""
+                def {}(self, value: Optional[Dict[str, Tensor]]):
+                    if value is None:
+                        return None
+                    output = {{}}
+                    for k, v in value.items():
+                        output[k] = torch.randn_like(v)
+                    return output
+                """,
+            )
+
+        def bundle_optional_list_of_randn(template):
+            return torch.utils.bundled_inputs.InflatableArg(
+                value=(None if template is None else [condensed(v) for v in template]),
+                fmt="{}",
+                fmt_fn="""
+                def {}(self, value: Optional[List[Tensor]]):
+                    if value is None:
+                        return None
+                    output = []
+                    for v in value:
+                        output.append(torch.randn_like(v))
+                    return output
+                """,
+            )
+
+        out : List[str] = []
+        sm = torch.jit.script(MyModel())
+        original_size = model_size(sm)
+        small_inputs = (
+            bundle_optional_dict_of_randn(small_sample),
+            bundle_optional_list_of_randn(small_list),
+            torch.zeros([3, 4]),
+        )
+        big_inputs = (
+            bundle_optional_dict_of_randn(big_sample),
+            bundle_optional_list_of_randn(big_list),
+            torch.zeros([1 << 5, 1 << 8, 1 << 10]),
+        )
+
+        torch.utils.bundled_inputs.augment_model_with_bundled_inputs(
+            sm,
+            [
+                big_inputs,
+                small_inputs,
+            ],
+            _receive_inflate_expr=out,
+        )
+        augmented_size = model_size(sm)
+        # assert the size has not increased more than 8KB
+        self.assertLess(augmented_size, original_size + (1 << 13))
+
+        loaded = save_and_load(sm)
+        inflated = loaded.get_all_bundled_inputs()
+        self.assertEqual(len(inflated[0]), len(small_inputs))
+
+        methods, _ = torch.utils.bundled_inputs._get_bundled_inputs_attributes_and_methods(
+            loaded
+        )
+
+        # One Function (forward)
+        # two bundled inputs (big_inputs and small_inputs)
+        # two args which have InflatableArg with fmt_fn
+        # 1 * 2 * 2 = 4
+        self.assertEqual(
+            sum([method.startswith("_inflate_helper") for method in methods]), 4
+        )
+
+
 if __name__ == '__main__':
     run_tests()
diff --git a/torch/utils/bundled_inputs.py b/torch/utils/bundled_inputs.py
index bce658b997255..8a6d466f20da4 100644
--- a/torch/utils/bundled_inputs.py
+++ b/torch/utils/bundled_inputs.py
@@ -21,13 +21,18 @@ class InflatableArg(NamedTuple):
         the appropriate input. It can use 'value' as an input to the format str. It must result
         in a value of the same type as 'value'.
 
+        'fmt_fn' is a formatable function code string that is executed to inflate the compressed
+        data into the appropriate input. It must result in a value of the same type as 'value'.
+        The function name should be the formatable part of the string.
+
     Note: Only top level InflatableArgs can be inflated. i.e. you cannot place
     an inflatable arg inside of some other structure. You should instead create
     an inflatable arg such that the fmt code string returns the full structure
     of your input.
     """
     value: Any
-    fmt: str
+    fmt: str = "{}"
+    fmt_fn: str = ""
 
 
 def bundle_inputs(
@@ -279,13 +284,21 @@ def augment_many_model_functions_with_bundled_inputs(
                 deflated_args = []
                 parts.append("(")
                 for arg_idx, arg in enumerate(args):
-                    deflated, inflater = _inflate_expr(arg, f"deflated[{inp_idx}][{arg_idx}]")
+                    inflate_helper_fn_name = _get_inflate_helper_fn_name(arg_idx, inp_idx, function_name)
+                    deflated, inflater, helper_definition = _inflate_expr(
+                        arg,
+                        f"deflated[{inp_idx}][{arg_idx}]",
+                        inflate_helper_fn_name,
+                    )
                     deflated_args.append(deflated)
                     parts.append(f"    {inflater},")
+                    if helper_definition:
+                        model.define(textwrap.dedent(helper_definition))
                 deflated_inputs.append(tuple(deflated_args))
                 parts.append("),")
             parts.append("")
             expr = "\n".join(parts)
+
             # Back-channel return this expr for debugging.
             if _receive_inflate_expr is not None:
                 _receive_inflate_expr.append(expr)
@@ -332,7 +345,6 @@ def get_num_bundled_inputs(self):
                     return len(self.get_all_bundled_inputs_for_forward())
                 """))
 
-
     # Define some high level helper methods that act on all bundled inputs
     model.define(textwrap.dedent("""
         def get_bundled_inputs_functions_and_info(self):
@@ -341,27 +353,44 @@ def get_bundled_inputs_functions_and_info(self):
             return all_inputs
         """.format(template=get_bundled_inputs_functions_and_info_template)))
 
-def _inflate_expr(arg: T, ref: str) -> Tuple[Union[T, torch.Tensor], str]:
+def _inflate_expr(
+    arg: T, ref: str, inflate_helper_fn_name: str
+) -> Tuple[Union[T, torch.Tensor], str, Optional[str]]:
     # Allow custom inflation expressions any object.
     # For example, calling custom image-decoding ops.
     # Or just use "{}" as the format string to ignore size limits.
     if isinstance(arg, InflatableArg):
-        return arg.value, arg.fmt.format(ref)
+        if arg.fmt_fn:
+            if arg.fmt not in ["{}", ""]:
+                raise Exception(
+                    f"Bundled input argument at position '{ref}' has "
+                    f"both arg.fmt_fn => \n{arg.fmt_fn} "
+                    f"\n and arg.fmt  => {arg.fmt}. "
+                    "Please choose `arg.fmt` if the deflater is straightforward or "
+                    "`arg.fmt_fn` if you need a function."
+                )
+
+            helper_definition = arg.fmt_fn.format(inflate_helper_fn_name)
+            expr = f"self.{inflate_helper_fn_name}({ref})"
+
+            return arg.value, expr, helper_definition
+        else:
+            return arg.value, arg.fmt.format(ref), None
 
     if isinstance(arg, torch.Tensor):
         # Small-storage tensors can just be saved directly.
         if arg.storage().size() <= MAX_RAW_TENSOR_SIZE:
-            return arg, ref
+            return arg, ref, None
         # Small contiguous tensors can be cloned to have small storage.
         # TODO: Should we do this even for non-contiguous tensors?
         if arg.is_contiguous() and arg.numel() <= MAX_RAW_TENSOR_SIZE:
-            return arg.clone(), ref
+            return arg.clone(), ref, None
         # Example inputs commonly come from torch.zeros, torch.ones, or torch.full.
         # These can be represented compactly.
         for fmt in [torch.contiguous_format, torch.channels_last]:
             if arg.is_contiguous(memory_format=fmt) and (arg == arg.flatten()[0]).all().item():
                 return (arg.flatten()[0].clone().expand(*arg.size()),
-                        f"{ref}.contiguous(memory_format={fmt})")
+                        f"{ref}.contiguous(memory_format={fmt})", None)
         # Prevent big tensors from being bundled by default.
         # TODO: Provide more useful diagnostics.
         raise Exception(
@@ -370,7 +399,7 @@ def _inflate_expr(arg: T, ref: str) -> Tuple[Union[T, torch.Tensor], str]:
             f"You probably don't want to bundle this as an input. "
         )
     else:
-        return arg, ref
+        return arg, ref, None
 
 def _get_bundled_inputs_attributes_and_methods(script_module: torch.jit.ScriptModule) -> Tuple[List[str], List[str]]:
     methods: List[str] = []
@@ -389,9 +418,37 @@ def _get_bundled_inputs_attributes_and_methods(script_module: torch.jit.ScriptMo
             methods.append("get_all_bundled_inputs_for_" + function_name)
             methods.append("_generate_bundled_inputs_for_" + function_name)
             attributes.append("_bundled_inputs_deflated_" + function_name)
+
+            bundled_inputs_fn = getattr(
+                script_module,
+                f"get_all_bundled_inputs_for_{function_name}"
+            )
+            num_bundled_inputs: int = len(bundled_inputs_fn())
+
+            # Check inflate helper functions for each function, argument and bundled input
+            func = getattr(script_module, function_name, None)
+            for arg_idx in range(len(func.schema.arguments) - 1):
+                for input_idx in range(num_bundled_inputs):
+                    helper_fn_name = _get_inflate_helper_fn_name(
+                        arg_idx=arg_idx,
+                        input_idx=input_idx,
+                        function_name=function_name
+                    )
+                    # if the arg has an InflatableArg with fmt_fn, add the helper function name
+                    if hasattr(script_module, helper_fn_name):
+                        methods.append(helper_fn_name)
+
     return (methods, attributes)
 
 
+def _get_inflate_helper_fn_name(
+    arg_idx: int,
+    input_idx: int,
+    function_name: str,
+) -> str:
+    return f"_inflate_helper_for_{function_name}_input_{input_idx}_arg_{arg_idx}"
+
+
 
 def bundle_randn(*size, dtype=None):
     """Generate a tensor that will be inflated with torch.randn."""

From 03cc46a0acadcba618402a5b366f1d02bc3e21af Mon Sep 17 00:00:00 2001
From: Shiyan Deng <dsy842974287@fb.com>
Date: Fri, 20 Aug 2021 10:49:21 -0700
Subject: [PATCH 098/530] [fx2trt] Add layernorm plugin for dynamic shape
 (#63620)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63620

Added layernorm dynamic plugin, so that it works when explicit batch dim is required. Needed for ig model.

Changed the way of how we creating a plugin layer from instantiating the plugin directly to use plugin creator with `PluginFieldCollection`.

Follow ups:
Another way to convert layernorm is by breaking it down to supported trt layers. T97398182

Test Plan: layernorm unittest

Reviewed By: yinghai

Differential Revision: D30138205

fbshipit-source-id: aebe021d8de818e20376634f30e84579b9807f9b
---
 .../fx2trt/converters/acc_ops_converters.py   | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
index e7fcb94475b3e..eddb079afcac5 100644
--- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
+++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
@@ -414,6 +414,66 @@ def acc_ops_batch_norm(network, target, args, kwargs, name):
 
     return layer.get_output(0)
 
+@tensorrt_converter(acc_ops.layer_norm)
+def acc_ops_layer_norm(network, target, args, kwargs, name):
+    input_val = kwargs["input"]
+
+    if not isinstance(input_val, trt.tensorrt.ITensor):
+        raise RuntimeError(f"LayerNorm received input {input_val} that is not part "
+                           "of the TensorRT region!")
+
+    shape = kwargs["weight"].shape
+    broadcasted_shape = (1,) * (len(input_val.shape) - len(shape)) + shape
+    gamma = to_numpy(kwargs["weight"].reshape(*shape))
+    beta = to_numpy(kwargs["bias"].reshape(*shape))
+    eps = kwargs["eps"]
+    normalized_shape = kwargs["normalized_shape"]
+
+    axes = 0
+    for d in range(len(normalized_shape)):
+        axes |= 1 << (len(input_val.shape) - d - 1)
+
+    # E[x]
+    mean_expected_layer = network.add_reduce(input_val, trt.ReduceOperation.AVG, axes, keep_dims=True)
+    mean_expected_layer.name = f"{name}_mean_expected"
+    # X-E[x]
+    sub_trt = add_binary_elementwise_layer(
+        network, input_val, mean_expected_layer.get_output(0), trt.ElementWiseOperation.SUB, f"{name}_sub"
+    )
+    # Variance = mean(pow(x_sub_mean,2))
+    pow_tensor = network.add_constant(
+        (1,) * len(input_val.shape), trt.Weights(np.ascontiguousarray([2.0], dtype=np.float32))
+    )
+    pow_tensor.name = f"{name}_power"
+    pow_var = add_binary_elementwise_layer(
+        network, sub_trt, pow_tensor.get_output(0), trt.ElementWiseOperation.POW, f"{name}_pow_var"
+    )
+    mean_trt_layer = network.add_reduce(pow_var, trt.ReduceOperation.AVG, axes, keep_dims=True)
+    mean_trt_layer.name = f"{name}_mean"
+    # Variance + eps
+    eps_tensor = network.add_constant(
+        (1,) * len(input_val.shape), trt.Weights(np.ascontiguousarray([eps], dtype=np.float32))
+    )
+    eps_tensor.name = f"{name}_eps"
+    add_trt = add_binary_elementwise_layer(
+        network, mean_trt_layer.get_output(0), eps_tensor.get_output(0), trt.ElementWiseOperation.SUM, f"{name}_add"
+    )
+    # SQRT((Var + eps))
+    sqrt_trt = add_unary_layer(network, add_trt, trt.UnaryOperation.SQRT, f"{name}_sqrt")
+    # (x - E[x]) / sqrt((var + eps))
+    div_trt = add_binary_elementwise_layer(network, sub_trt, sqrt_trt, trt.ElementWiseOperation.DIV, f"{name}_div_trt")
+
+    gamma_tensor = network.add_constant(gamma.shape, trt.Weights(np.ascontiguousarray(gamma)))
+    gamma_tensor.name = f"{name}_gamma"
+    beta_tensor = network.add_constant(gamma.shape, trt.Weights(np.ascontiguousarray(beta)))
+    beta_tensor.name = f"{name}_beta"
+    # y * gamma + beta
+    scale_layer = add_binary_elementwise_layer(
+        network, div_trt, gamma_tensor.get_output(0), trt.ElementWiseOperation.PROD, f"{name}_scale"
+    )
+    return add_binary_elementwise_layer(
+        network, scale_layer, beta_tensor.get_output(0), trt.ElementWiseOperation.SUM, name
+    )
 
 @tensorrt_converter(acc_ops.softmax)
 def acc_ops_softmax(network, target, args, kwargs, name):

From b95ce1591d56d545391ad5651f17ceb3b398a666 Mon Sep 17 00:00:00 2001
From: Victor Quach <quach@fb.com>
Date: Fri, 20 Aug 2021 11:07:22 -0700
Subject: [PATCH 099/530] Add docs describing saved tensor hooks (#62362)

Summary:
Add section to the Autograd mechanics docs to describe the recently
exposed saved tensors (https://github.com/pytorch/pytorch/issues/52451), how to register packing / unpacking
hooks (https://github.com/pytorch/pytorch/issues/60975) and how to use default hooks (https://github.com/pytorch/pytorch/issues/61834)

Sister PR: https://github.com/pytorch/pytorch/issues/62361 (will add a link from autograd.rst to notes/autograd in whatever PR does not land first)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62362

Reviewed By: soulitzer

Differential Revision: D30453177

Pulled By: Varal7

fbshipit-source-id: f5759977b069ff0ef36a47b08856d297691a6caa
---
 docs/source/autograd.rst       |   1 +
 docs/source/notes/autograd.rst | 199 +++++++++++++++++++++++++++++++++
 2 files changed, 200 insertions(+)

diff --git a/docs/source/autograd.rst b/docs/source/autograd.rst
index 6423d5d6d088c..8aace1ef12ab8 100644
--- a/docs/source/autograd.rst
+++ b/docs/source/autograd.rst
@@ -252,6 +252,7 @@ You can define how these saved tensors should be packed / unpacked using hooks.
 A common application is to trade compute for memory by saving those intermediary results
 to disk or to CPU instead of leaving them on the GPU. This is especially useful if you
 notice your model fits on GPU during evaluation, but not training.
+Also see :ref:`saved-tensors-hooks-doc`.
 
 .. autoclass:: torch.autograd.graph.saved_tensors_hooks
 
diff --git a/docs/source/notes/autograd.rst b/docs/source/notes/autograd.rst
index 0c1eed3f42457..2a59d976e9a6a 100644
--- a/docs/source/notes/autograd.rst
+++ b/docs/source/notes/autograd.rst
@@ -36,6 +36,57 @@ flow statements, that can change the overall shape and size of the graph at
 every iteration. You don't have to encode all possible paths before you
 launch the training - what you run is what you differentiate.
 
+.. _saved-tensors-doc:
+
+Saved tensors
+^^^^^^^^^^^^^
+
+Some operations need intermediary results to be saved during the forward pass
+in order to execute the backward pass. For example, the function
+:math:`x\mapsto x^2` saves the input :math:`x` to compute the gradient.
+
+When defining a custom Python :class:`~torch.autograd.Function`, you can use
+:func:`~torch.autograd.function._ContextMethodMixin.save_for_backward` to save
+tensors during the forward pass and
+:attr:`~torch.autograd.function.Function.saved_tensors` to retrieve them
+during the backward pass. See :doc:`/notes/extending` for more information.
+
+For operations that PyTorch defines (e.g. :func:`torch.pow`), tensors are
+automatically saved as needed. You can explore (for educational or debugging
+purposes) which tensors are saved by a certain ``grad_fn`` by looking for its
+attributes starting with the prefix ``_saved``.
+
+.. code::
+
+    x = torch.randn(5, requires_grad=True)
+    y = x.pow(2)
+    print(x.equal(y.grad_fn._saved_self))  # True
+    print(x is y.grad_fn._saved_self)  # True
+
+
+In the previous code, ``y.grad_fn._saved_self`` refers to the same Tensor object as `x`.
+But that may not always be the case. For instance:
+
+.. code::
+
+    x = torch.randn(5, requires_grad=True)
+    y = x.exp()
+    print(y.equal(y.grad_fn._saved_result))  # True
+    print(y is y.grad_fn._saved_result)  # False
+
+
+Under the hood, to prevent reference cycles, PyTorch has *packed* the tensor
+upon saving and *unpacked* it into a different tensor for reading. Here, the
+tensor you get from accessing ``y.grad_fn._saved_result`` is a different tensor
+object than ``x`` (but they still share the same storage).
+
+Whether a tensor will be packed into a different tensor object depends on
+whether it is an output of its own `grad_fn`, which is an implementation detail
+subject to change and that users should not rely on.
+
+You can control how PyTorch does packing / unpacking with :ref:`saved-tensors-hooks-doc`.
+
+
 .. _locally-disable-grad-doc:
 
 Locally disabling gradient computation
@@ -598,3 +649,151 @@ chain rule:
 
         .. math::
             \frac{\partial L}{\partial z^*} = 2 * Re(grad\_out^* * \frac{\partial s}{\partial z^{*}})
+
+.. _saved-tensors-hooks-doc:
+
+Hooks for saved tensors
+-----------------------
+
+You can control :ref:`how saved tensors are packed / unpacked
+<saved-tensors-doc>` by defining a pair of ``pack_hook`` / ``unpack_hook``
+hooks.  The ``pack_hook`` function should take a tensor as its single argument
+but can return any python object (e.g. another tensor, a tuple, or even a
+string containing a filename). The ``unpack_hook`` function takes as its single
+argument the output of ``pack_hook`` and should return a tensor to be used in
+the backward pass. The tensor returned by ``unpack_hook`` only needs to have
+the same content as the tensor passed as input to ``pack_hook``. In particular,
+any autograd-related metadata can be ignored as they will be overwritten during
+unpacking.
+
+An example of such pair is:
+
+.. code::
+
+    class SelfDeletingTempFile():
+        def __init__(self):
+            self.name = os.path.join(tmp_dir, str(uuid.uuid4()))
+
+        def __del__(self):
+            os.remove(self.name)
+
+    def pack_hook(tensor):
+        temp_file = SelfDeletingTempFile()
+        torch.save(tensor, temp_file.name)
+        return temp_file
+
+    def unpack_hook(temp_file):
+        return torch.load(temp_file.name)
+
+Notice that the ``unpack_hook`` should not delete the temporary file because it
+might be called multiple times: the temporary file should be alive for as long
+as the returned `SelfDeletingTempFile` object is alive.  In the above example,
+we prevent leaking the temporary file by closing it when it is no longer needed
+(on deletion of the `SelfDeletingTempFile` object).
+
+.. note::
+
+    We guarantee that ``pack_hook`` will only be called once but ``unpack_hook`` can
+    be called as many times as the backward pass requires it and we expect it to
+    return the same data each time.
+
+.. warning::
+
+    Performing inplace operations on the input of any of the functions is forbidden
+    as they may lead to unexpected side-effects. PyTorch will throw an error if the
+    input to a pack hook is modified inplace but does not catch the case where the
+    input to an unpack hook is modified inplace.
+
+
+Registering hooks for a saved tensor
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can register a pair of hooks on a saved tensor by calling the
+:meth:`~torch.autograd.SavedTensor.register_hooks` method on a
+:class:`SavedTensor` object. Those objects are exposed as attributes of a
+``grad_fn`` and start with the ``_raw_saved_`` prefix.
+
+.. code::
+
+    x = torch.randn(5, requires_grad=True)
+    y = x.pow(2)
+    y.grad_fn._raw_saved_self.register_hooks(pack_hook, unpack_hook)
+
+The ``pack_hook`` method is called as soon as the pair is registered.
+The ``unpack_hook`` method is called each time the saved tensor needs to be
+accessed, either by means of ``y.grad_fn._saved_self`` or during the backward
+pass.
+
+.. warning::
+
+    If you maintain a reference to a :class:`SavedTensor` after the saved
+    tensors have been released (i.e. after backward has been called), calling
+    its :meth:`~torch.autograd.SavedTensor.register_hooks` is forbidden.
+    PyTorch will throw an error most of the time but it may fail
+    to do so in some cases and undefined behavior may arise.
+
+Registering default hooks for saved tensors
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Alternatively, you can use the context-manager
+:class:`~torch.autograd.graph.saved_tensors_hooks` to register a pair of
+hooks which will be applied to *all* saved tensors that are created in
+that context.
+
+Example:
+
+.. code::
+
+    # Only save on disk tensors that have size >= 1000
+    SAVE_ON_DISK_THRESHOLD = 1000
+
+    def pack_hook(x):
+        if x.numel() < SAVE_ON_DISK_THRESHOLD:
+            return x
+        temp_file = SelfDeletingTempFile()
+        torch.save(tensor, temp_file.name)
+        return temp_file
+
+    def unpack_hook(tensor_or_sctf):
+        if isinstance(tensor_or_sctf, torch.Tensor):
+            return tensor_or_sctf
+        return torch.load(tensor_or_sctf.name)
+
+    class Model(nn.Module):
+        def forward(self, x):
+            with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
+              # ... compute output
+              output = x
+            return output
+
+    model = Model()
+    net = nn.DataParallel(model)
+
+
+
+The hooks defined with this context manager are thread-local.
+Hence, the following code will not produce the desired effects because the hooks do not go
+through `DataParallel`.
+
+.. code::
+
+      # Example what NOT to do
+
+      net = nn.DataParallel(model)
+      with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
+          output = net(input)
+
+
+Note that using those hooks disables all the optimization in place to reduce
+Tensor object creation. For example:
+
+.. code::
+
+    with torch.autograd.graph.saved_tensors_hooks(lambda x: x, lambda x: x):
+        x = torch.randn(5, requires_grad=True)
+        y = x * x
+
+Without the hooks, ``x``, ``y.grad_fn._saved_self`` and
+``y.grad_fn._saved_other`` all refer to the same tensor object.
+With the hooks, PyTorch will pack and unpack `x` into two new tensor objects
+that share the same storage with the original `x` (no copy performed).

From c78ab28441f5616899d363d57317f4de29147e5c Mon Sep 17 00:00:00 2001
From: Aaron Bockover <abock@microsoft.com>
Date: Fri, 20 Aug 2021 11:11:47 -0700
Subject: [PATCH 100/530] Add support for the ONNX Runtime Eager Mode backend
 (#58248)

Summary:
This PR implements the necessary hooks/stubs/enums/etc for complete ONNX Runtime (ORT) Eager Mode integration. The actual extension will live out of tree at https://github.com/pytorch/ort.

We have been [working on this at Microsoft](https://github.com/microsoft/onnxruntime-pytorch/tree/eager-ort/torch_onnxruntime) for the last few months, and are finally ready to contribute the PyTorch core changes upstream (nothing major or exciting, just the usual boilerplate for adding new backends).

The ORT backend will allow us to ferry [almost] all torch ops into granular ONNX kernels that ORT will eagerly execute against any devices it supports (therefore, we only need a single ORT backend from a PyTorch perspective).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/58248

Reviewed By: astaff

Differential Revision: D30344992

Pulled By: albanD

fbshipit-source-id: 69082b32121246340d686e16653626114b7714b2
---
 aten/src/ATen/Context.h                       |  8 ++++
 aten/src/ATen/Version.cpp                     |  4 ++
 aten/src/ATen/core/aten_interned_strings.h    |  1 +
 aten/src/ATen/core/op_registration/README.md  |  4 +-
 aten/src/ATen/detail/ORTHooksInterface.cpp    | 31 +++++++++++++++
 aten/src/ATen/detail/ORTHooksInterface.h      | 36 ++++++++++++++++++
 aten/src/ATen/templates/TensorBody.h          |  6 +++
 aten/src/ATen/test/extension_backend_test.cpp | 23 ++++++-----
 c10/core/Backend.h                            | 18 ++++-----
 c10/core/Device.cpp                           |  4 +-
 c10/core/DeviceType.cpp                       |  6 +--
 c10/core/DeviceType.h                         |  4 +-
 c10/core/DispatchKey.cpp                      |  4 +-
 c10/core/DispatchKey.h                        | 13 +++++--
 c10/core/DispatchKeySet.cpp                   |  1 +
 c10/core/DispatchKeySet.h                     |  2 +-
 c10/core/TensorImpl.h                         |  4 ++
 c10/core/TensorOptions.h                      | 10 ++---
 caffe2/proto/caffe2.proto                     |  2 +-
 caffe2/proto/caffe2_pb2.pyi                   |  4 +-
 ...{msnpu_extension.cpp => ort_extension.cpp} | 38 +++++++++----------
 test/cpp_extensions/setup.py                  |  2 +-
 test/test_cpp_extensions_aot.py               | 38 +++++++++----------
 test/test_gen_backend_stubs.py                | 12 +++---
 test/test_torch.py                            |  6 +--
 tools/build_variables.bzl                     |  1 +
 tools/codegen/model.py                        |  2 +-
 tools/pyi/gen_pyi.py                          |  1 +
 torch/_C/_autograd.pyi                        |  2 +-
 torch/_tensor.py                              | 27 +++++--------
 torch/_utils.py                               |  9 ++---
 torch/csrc/Device.h                           |  4 +-
 torch/csrc/autograd/init.cpp                  |  2 +-
 torch/csrc/autograd/python_variable.cpp       | 12 ++++++
 torch/csrc/jit/frontend/sugared_value.cpp     |  2 +-
 torch/csrc/jit/runtime/register_prim_ops.cpp  |  8 ++++
 torch/library.h                               |  4 +-
 torch/overrides.py                            |  1 +
 38 files changed, 236 insertions(+), 120 deletions(-)
 create mode 100644 aten/src/ATen/detail/ORTHooksInterface.cpp
 create mode 100644 aten/src/ATen/detail/ORTHooksInterface.h
 rename test/cpp_extensions/{msnpu_extension.cpp => ort_extension.cpp} (78%)

diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
index 26f1d11f92b48..4a45ac6f8ac18 100644
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@@ -9,6 +9,7 @@
 #include <ATen/core/LegacyTypeDispatch.h>
 #include <ATen/detail/CUDAHooksInterface.h>
 #include <ATen/detail/HIPHooksInterface.h>
+#include <ATen/detail/ORTHooksInterface.h>
 #include <c10/util/Exception.h>
 #include <c10/core/impl/DeviceGuardImplInterface.h>
 #include <c10/core/QEngine.h>
@@ -79,6 +80,9 @@ class TORCH_API Context {
   static bool hasMLC() {
     return c10::impl::hasDeviceGuardImpl(at::DeviceType::MLC);
   }
+  static bool hasORT() {
+    return c10::impl::hasDeviceGuardImpl(at::DeviceType::ORT);
+  }
   // defined in header so that getNonVariableType has ability to inline
   // call_once check. getNonVariableType is called fairly frequently
   THCState* lazyInitCUDA() {
@@ -292,6 +296,10 @@ static inline bool hasMLC() {
   return globalContext().hasMLC();
 }
 
+static inline bool hasORT() {
+  return globalContext().hasORT();
+}
+
 // Despite its name, this function returns the number of *CUDA* GPUs.
 static inline size_t getNumGPUs() {
   // WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS
diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp
index 750c90bb4c59f..0c0ea61ceb3c2 100644
--- a/aten/src/ATen/Version.cpp
+++ b/aten/src/ATen/Version.cpp
@@ -184,6 +184,10 @@ std::string show_config() {
     ss << detail::getCUDAHooks().showConfig();
   }
 
+  if (hasORT()) {
+    ss << detail::getORTHooks().showConfig();
+  }
+
   ss << "  - Build settings: ";
   for (const auto& pair : caffe2::GetBuildOptions()) {
     if (!pair.second.empty()) {
diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h
index 584e3db9ee193..abdf397544468 100644
--- a/aten/src/ATen/core/aten_interned_strings.h
+++ b/aten/src/ATen/core/aten_interned_strings.h
@@ -405,6 +405,7 @@ _(aten, is_complex) \
 _(aten, is_contiguous) \
 _(aten, is_cuda) \
 _(aten, is_mlc) \
+_(aten, is_ort) \
 _(aten, is_distributed) \
 _(aten, is_floating_point) \
 _(aten, is_inference) \
diff --git a/aten/src/ATen/core/op_registration/README.md b/aten/src/ATen/core/op_registration/README.md
index edd9f911cd0e1..5605e962a6e5e 100644
--- a/aten/src/ATen/core/op_registration/README.md
+++ b/aten/src/ATen/core/op_registration/README.md
@@ -13,13 +13,13 @@ There’s four main use cases
 * You’re writing a new operator that isn’t supposed to be part of the public PyTorch API.
 * You’re writing a new operator but don’t want to change the core pytorch code base, say you’re developing a shared library with operators.
 * You’re writing a C++ extension for PyTorch or you’re using inline c++ in your .py model files.
-* You’re writing a backend library like XLA or MSNPU that adds new kernels to all operators defined in `native_functions.yaml`.
+* You’re writing a backend library like XLA or ORT that adds new kernels to all operators defined in `native_functions.yaml`.
 
 For these use cases, the custom operator API is the better solution.
 
 ### What is the price for using the custom operator API instead of `native_functions.yaml`?
 
-If you’re just using the custom operator API to add new kernels for existing operators (e.g. the XLA/MSNPU example above), then you’re fine and don’t pay any price. If, however, you define a new operator purely using the custom op API, i.e. your operator never shows up in `native_functions.yaml`, then you need to be aware of a few caveats.
+If you’re just using the custom operator API to add new kernels for existing operators (e.g. the XLA/ORT example above), then you’re fine and don’t pay any price. If, however, you define a new operator purely using the custom op API, i.e. your operator never shows up in `native_functions.yaml`, then you need to be aware of a few caveats.
 
 * It will not get a C++ API generated. There will not be `Tensor::your_op()` methods or `at::your_op()` functions to call your operator.
 * The API for calling the operator from Python looks a little bit different. It needs to be called through `torch.ops.your_op()` instead of `torch._C`.
diff --git a/aten/src/ATen/detail/ORTHooksInterface.cpp b/aten/src/ATen/detail/ORTHooksInterface.cpp
new file mode 100644
index 0000000000000..33f70935a04d0
--- /dev/null
+++ b/aten/src/ATen/detail/ORTHooksInterface.cpp
@@ -0,0 +1,31 @@
+#include <ATen/detail/ORTHooksInterface.h>
+
+#include <c10/util/Exception.h>
+
+#include <cstddef>
+#include <memory>
+#include <mutex>
+
+namespace at {
+namespace detail {
+
+// See getCUDAHooks for some more commentary
+const ORTHooksInterface& getORTHooks() {
+  static std::unique_ptr<ORTHooksInterface> ort_hooks;
+  static std::once_flag once;
+  std::call_once(once, [] {
+    ort_hooks = ORTHooksRegistry()->Create("ORTHooks", {});
+    if (!ort_hooks) {
+      ort_hooks =
+          // NOLINTNEXTLINE(modernize-make-unique)
+          std::unique_ptr<ORTHooksInterface>(new ORTHooksInterface());
+    }
+  });
+  return *ort_hooks;
+}
+} // namespace detail
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+C10_DEFINE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs)
+
+} // namespace at
diff --git a/aten/src/ATen/detail/ORTHooksInterface.h b/aten/src/ATen/detail/ORTHooksInterface.h
new file mode 100644
index 0000000000000..caee55cdfaf99
--- /dev/null
+++ b/aten/src/ATen/detail/ORTHooksInterface.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <c10/util/Exception.h>
+#include <c10/util/Registry.h>
+
+constexpr const char* ORT_HELP =
+  " You need to 'import torch_ort' to use the 'ort' device in PyTorch. "
+  "The 'torch_ort' module is provided by the ONNX Runtime itself "
+  "(https://onnxruntime.ai).";
+
+// NB: Class must live in `at` due to limitations of Registry.h.
+namespace at {
+
+struct TORCH_API ORTHooksInterface {
+  // This should never actually be implemented, but it is used to
+  // squelch -Werror=non-virtual-dtor
+  virtual ~ORTHooksInterface() {}
+
+  virtual std::string showConfig() const {
+    TORCH_CHECK(false, "Cannot query detailed ORT version information.", ORT_HELP);
+  }
+};
+
+// NB: dummy argument to suppress "ISO C++11 requires at least one argument
+// for the "..." in a variadic macro"
+struct TORCH_API ORTHooksArgs {};
+
+C10_DECLARE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs);
+#define REGISTER_ORT_HOOKS(clsname) \
+  C10_REGISTER_CLASS(ORTHooksRegistry, clsname, clsname)
+
+namespace detail {
+TORCH_API const ORTHooksInterface& getORTHooks();
+} // namespace detail
+
+} // namespace at
diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h
index be14980fb2d14..a6e6583c7b19c 100644
--- a/aten/src/ATen/templates/TensorBody.h
+++ b/aten/src/ATen/templates/TensorBody.h
@@ -492,6 +492,12 @@ class TORCH_API Tensor {
     return impl_->is_mlc();
   }
 
+  /// Returns if a `Tensor` is ort tensor.
+  bool is_ort() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_ort();
+  }
+
   /// Returns if a `Tensor` is vulkan tensor.
   bool is_vulkan() const {
     // NB: this is not a native function to avoid dispatching overhead.
diff --git a/aten/src/ATen/test/extension_backend_test.cpp b/aten/src/ATen/test/extension_backend_test.cpp
index 531507e96697e..9b215a90ae74a 100644
--- a/aten/src/ATen/test/extension_backend_test.cpp
+++ b/aten/src/ATen/test/extension_backend_test.cpp
@@ -6,6 +6,11 @@
 
 #include <torch/csrc/jit/runtime/operator.h>
 
+// NB. These tests use the ORT dispatch key to test backend dispatching
+// machinery, but these tests are not specific to ORT at all. The ORT
+// backend is fully out-of-tree, so it's safe to use this key for
+// in-tree tests.
+
 using namespace at;
 
 static int test_int;
@@ -17,16 +22,16 @@ Tensor empty_override(IntArrayRef size, c10::optional<ScalarType> dtype, c10::op
       Storage(
           Storage::use_byte_size_t(),
           0,
-          at::DataPtr(nullptr, Device(DeviceType::MSNPU, 1)),
+          at::DataPtr(nullptr, Device(DeviceType::ORT, 1)),
           nullptr,
           false),
-      DispatchKey::MSNPU,
+      DispatchKey::ORT,
       caffe2::TypeMeta::Make<float>());
   return Tensor(std::move(tensor_impl));
 }
 
 Tensor add_override(const Tensor & a, const Tensor & b , const Scalar& c) {
-  auto out = empty({5, 5}, at::kMSNPU);  // Don't return self as-is
+  auto out = empty({5, 5}, at::kORT);  // Don't return self as-is
   test_int = 2;
   return out;
 }
@@ -42,28 +47,28 @@ Tensor empty_strided_override(
   return empty_override(size, dtype, layout, device, pin_memory, c10::nullopt);
 }
 
-TORCH_LIBRARY_IMPL(aten, MSNPU, m) {
+TORCH_LIBRARY_IMPL(aten, ORT, m) {
   m.impl("aten::empty.memory_format",  empty_override);
   m.impl("aten::empty_strided",        empty_strided_override);
   m.impl("aten::add.Tensor",           add_override);
 }
 
 TEST(BackendExtensionTest, TestRegisterOp) {
-  Tensor a = empty({5, 5}, at::kMSNPU);
-  ASSERT_EQ(a.device().type(), at::kMSNPU);
+  Tensor a = empty({5, 5}, at::kORT);
+  ASSERT_EQ(a.device().type(), at::kORT);
   ASSERT_EQ(a.device().index(), 1);
   ASSERT_EQ(a.dtype(), caffe2::TypeMeta::Make<float>());
   ASSERT_EQ(test_int, 1);
 
-  Tensor b = empty_like(a, at::kMSNPU);
-  ASSERT_EQ(b.device().type(), at::kMSNPU);
+  Tensor b = empty_like(a, at::kORT);
+  ASSERT_EQ(b.device().type(), at::kORT);
   ASSERT_EQ(b.device().index(), 1);
   ASSERT_EQ(b.dtype(), caffe2::TypeMeta::Make<float>());
 
   add(a, b);
   ASSERT_EQ(test_int, 2);
 
-  // Ensure that non-MSNPU operator still works
+  // Ensure that non-ORT operator still works
   Tensor d = empty({5, 5}, at::kCPU);
   ASSERT_EQ(d.device().type(), at::kCPU);
 }
diff --git a/c10/core/Backend.h b/c10/core/Backend.h
index 2f071345311f2..e17a1bc4226c6 100644
--- a/c10/core/Backend.h
+++ b/c10/core/Backend.h
@@ -40,7 +40,7 @@ enum class Backend {
   SparseHIP,
   SparseVE,
   SparseXPU,
-  MSNPU,
+  ORT,
   XLA,
   Vulkan,
   Metal,
@@ -66,8 +66,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) {
     return Backend::VE;
   } else if (t == DispatchKey::FPGA) {
     return Backend::FPGA;
-  } else if (t == DispatchKey::MSNPU) {
-    return Backend::MSNPU;
+  } else if (t == DispatchKey::ORT) {
+    return Backend::ORT;
   } else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) {
     return Backend::XLA;
   } else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) {
@@ -123,8 +123,8 @@ static inline DispatchKey backendToDispatchKey(Backend b) {
       return DispatchKey::VE;
     case Backend::FPGA:
       return DispatchKey::FPGA;
-    case Backend::MSNPU:
-      return DispatchKey::MSNPU;
+    case Backend::ORT:
+      return DispatchKey::ORT;
     case Backend::XLA:
       return DispatchKey::XLA;
     case Backend::Lazy:
@@ -178,8 +178,8 @@ static inline DeviceType backendToDeviceType(Backend b) {
       return DeviceType::VE;
     case Backend::FPGA:
       return DeviceType::FPGA;
-    case Backend::MSNPU:
-      return DeviceType::MSNPU;
+    case Backend::ORT:
+      return DeviceType::ORT;
     case Backend::XLA:
       return DeviceType::XLA;
     case Backend::Lazy:
@@ -235,8 +235,8 @@ static inline const char* toString(Backend b) {
       return "FPGA";
     case Backend::XPU:
       return "XPU";
-    case Backend::MSNPU:
-      return "MSNPU";
+    case Backend::ORT:
+      return "ORT";
     case Backend::XLA:
       return "XLA";
     case Backend::Lazy:
diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp
index 2709c29ce8460..2531e3942271a 100644
--- a/c10/core/Device.cpp
+++ b/c10/core/Device.cpp
@@ -28,7 +28,7 @@ DeviceType parse_type(const std::string& device_string) {
           {"hip", DeviceType::HIP},
           {"ve", DeviceType::VE},
           {"fpga", DeviceType::FPGA},
-          {"msnpu", DeviceType::MSNPU},
+          {"ort", DeviceType::ORT},
           {"xla", DeviceType::XLA},
           {"lazy", DeviceType::Lazy},
           {"vulkan", DeviceType::Vulkan},
@@ -47,7 +47,7 @@ DeviceType parse_type(const std::string& device_string) {
   }
   TORCH_CHECK(
       false,
-      "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, msnpu, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ",
+      "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, ort, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ",
       device_string);
 }
 enum DeviceStringParsingState { START, INDEX_START, INDEX_REST, ERROR };
diff --git a/c10/core/DeviceType.cpp b/c10/core/DeviceType.cpp
index 4ff939806f980..4635acdb148c2 100644
--- a/c10/core/DeviceType.cpp
+++ b/c10/core/DeviceType.cpp
@@ -25,8 +25,8 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) {
       return lower_case ? "ve" : "VE";
     case DeviceType::FPGA:
       return lower_case ? "fpga" : "FPGA";
-    case DeviceType::MSNPU:
-      return lower_case ? "msnpu" : "MSNPU";
+    case DeviceType::ORT:
+      return lower_case ? "ort" : "ORT";
     case DeviceType::XLA:
       return lower_case ? "xla" : "XLA";
     case DeviceType::Lazy:
@@ -75,7 +75,7 @@ bool isValidDeviceType(DeviceType d) {
     case DeviceType::HIP:
     case DeviceType::VE:
     case DeviceType::FPGA:
-    case DeviceType::MSNPU:
+    case DeviceType::ORT:
     case DeviceType::XLA:
     case DeviceType::Lazy:
     case DeviceType::MLC:
diff --git a/c10/core/DeviceType.h b/c10/core/DeviceType.h
index 2ae028d144026..c6bd56914d6d1 100644
--- a/c10/core/DeviceType.h
+++ b/c10/core/DeviceType.h
@@ -21,7 +21,7 @@ enum class DeviceType : int8_t {
   IDEEP = 5, // IDEEP.
   HIP = 6, // AMD HIP
   FPGA = 7, // FPGA
-  MSNPU = 8, // MSNPU
+  ORT = 8, // ONNX Runtime / Microsoft
   XLA = 9, // XLA / TPU
   Vulkan = 10, // Vulkan
   Metal = 11, // Metal
@@ -42,7 +42,7 @@ constexpr DeviceType kCPU = DeviceType::CPU;
 constexpr DeviceType kCUDA = DeviceType::CUDA;
 constexpr DeviceType kHIP = DeviceType::HIP;
 constexpr DeviceType kFPGA = DeviceType::FPGA;
-constexpr DeviceType kMSNPU = DeviceType::MSNPU;
+constexpr DeviceType kORT = DeviceType::ORT;
 constexpr DeviceType kXLA = DeviceType::XLA;
 constexpr DeviceType kMLC = DeviceType::MLC;
 constexpr DeviceType kMeta = DeviceType::Meta;
diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp
index 5c414484b38fd..18aa4fc32fb64 100644
--- a/c10/core/DispatchKey.cpp
+++ b/c10/core/DispatchKey.cpp
@@ -19,8 +19,8 @@ const char* toString(DispatchKey t) {
       return "FPGA";
     case DispatchKey::XPU:
       return "XPU";
-    case DispatchKey::MSNPU:
-      return "MSNPU";
+    case DispatchKey::ORT:
+      return "ORT";
     case DispatchKey::XLA:
       return "XLA";
     case DispatchKey::Lazy:
diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h
index 5b20a1ca327df..07222b79ee964 100644
--- a/c10/core/DispatchKey.h
+++ b/c10/core/DispatchKey.h
@@ -59,8 +59,15 @@ enum class DispatchKey : uint8_t {
   // CUDA]
   FPGA, // Xilinx support lives out of tree at
   // https://gitlab.com/pytorch-complex/vitis_kernels
-  MSNPU, // unused externally, but tested at
-  // test/cpp_extensions/msnpu_extension.cpp
+
+  // ONNX Runtime, lives out of tree at https://github.com/pytorch/ort and
+  // https://github.com/microsoft/onnxruntime, and is also used to test general
+  // backend/extension machinery in the core. cf:
+  // - test/cpp_extensions/ort_extension.cpp
+  // - test/test_torch.py
+  // - aten/src/ATen/test/extension_backend_test.cpp
+  ORT,
+
   XLA, // lives out of tree at https://github.com/pytorch/xla
   MLC, // lives out of tree at https://github.com/pytorch/MLCompute
   Vulkan,
@@ -114,7 +121,7 @@ enum class DispatchKey : uint8_t {
 
   // Here are reserved backends for user-defined backends, see Note [Private use
   // DispatchKey]
-  // To see some example about how to use this, check out MSNPU
+  // To see some example about how to use this, check out ORT
   PrivateUse1,
   PrivateUse2,
   PrivateUse3,
diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp
index b796114d4a608..404acc7cb1db3 100644
--- a/c10/core/DispatchKeySet.cpp
+++ b/c10/core/DispatchKeySet.cpp
@@ -19,6 +19,7 @@ constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends |
         DispatchKey::PrivateUse3,
         DispatchKey::MLC,
         DispatchKey::HPU,
+        DispatchKey::ORT,
         DispatchKey::Meta,
     });
 
diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h
index 0d3a25ea9d8d1..b1f5f04524d19 100644
--- a/c10/core/DispatchKeySet.h
+++ b/c10/core/DispatchKeySet.h
@@ -248,7 +248,7 @@ constexpr DispatchKeySet autogradother_backends = DispatchKeySet(
     {DispatchKey::HIP,
      DispatchKey::VE,
      DispatchKey::FPGA,
-     DispatchKey::MSNPU,
+     DispatchKey::ORT,
      DispatchKey::Vulkan,
      DispatchKey::Metal,
      DispatchKey::QuantizedCPU,
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index 65d7af38e3599..7051e36b35516 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -873,6 +873,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
     return key_set_.has(DispatchKey::MLC);
   }
 
+  bool is_ort() const {
+    return key_set_.has(DispatchKey::ORT);
+  }
+
   // TODO: remove this once we don't automatically enabled Autograd dispatch
   // keys
   //       in TensorImpl constructor.
diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h
index fff9433e270f7..287b2fa41b2a3 100644
--- a/c10/core/TensorOptions.h
+++ b/c10/core/TensorOptions.h
@@ -663,8 +663,8 @@ inline DispatchKey computeDispatchKey(
           return DispatchKey::VE;
         case DeviceType::FPGA:
           return DispatchKey::FPGA;
-        case DeviceType::MSNPU:
-          return DispatchKey::MSNPU;
+        case DeviceType::ORT:
+          return DispatchKey::ORT;
         case DeviceType::XLA:
           return DispatchKey::XLA;
         case DeviceType::Lazy:
@@ -790,10 +790,8 @@ inline DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) {
     case DispatchKey::HPU:
     case DispatchKey::AutogradHPU:
       return DeviceType::HPU;
-
-    // stuff that isn't real
-    case DispatchKey::MSNPU:
-      return DeviceType::MSNPU;
+    case DispatchKey::ORT:
+      return DeviceType::ORT;
     default:
       TORCH_CHECK(
           false,
diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto
index 6e055778578ab..90a2020195f60 100644
--- a/caffe2/proto/caffe2.proto
+++ b/caffe2/proto/caffe2.proto
@@ -219,7 +219,7 @@ enum DeviceTypeProto {
   PROTO_IDEEP = 5;                  // IDEEP.
   PROTO_HIP = 6;                    // AMD HIP
   PROTO_FPGA = 7;                   // FPGA
-  PROTO_MSNPU = 8;                  // MSNPU
+  PROTO_ORT = 8;                    // ONNX Runtime
   PROTO_XLA = 9;                    // XLA / TPU
   PROTO_MLC = 10;                   // ML Compute
   // Change the following number if you add more devices in the code.
diff --git a/caffe2/proto/caffe2_pb2.pyi b/caffe2/proto/caffe2_pb2.pyi
index 1258664bee165..f7f4430d7b761 100644
--- a/caffe2/proto/caffe2_pb2.pyi
+++ b/caffe2/proto/caffe2_pb2.pyi
@@ -23,7 +23,7 @@ class _DeviceTypeProto(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapp
     PROTO_IDEEP = DeviceTypeProto.V(5)
     PROTO_HIP = DeviceTypeProto.V(6)
     PROTO_FPGA = DeviceTypeProto.V(7)
-    PROTO_MSNPU = DeviceTypeProto.V(8)
+    PROTO_ORT = DeviceTypeProto.V(8)
     PROTO_XLA = DeviceTypeProto.V(9)
     PROTO_MLC = DeviceTypeProto.V(10)
     PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = DeviceTypeProto.V(11)
@@ -37,7 +37,7 @@ PROTO_OPENCL = DeviceTypeProto.V(4)
 PROTO_IDEEP = DeviceTypeProto.V(5)
 PROTO_HIP = DeviceTypeProto.V(6)
 PROTO_FPGA = DeviceTypeProto.V(7)
-PROTO_MSNPU = DeviceTypeProto.V(8)
+PROTO_ORT = DeviceTypeProto.V(8)
 PROTO_XLA = DeviceTypeProto.V(9)
 PROTO_MLC = DeviceTypeProto.V(10)
 PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = DeviceTypeProto.V(11)
diff --git a/test/cpp_extensions/msnpu_extension.cpp b/test/cpp_extensions/ort_extension.cpp
similarity index 78%
rename from test/cpp_extensions/msnpu_extension.cpp
rename to test/cpp_extensions/ort_extension.cpp
index e47347c40fbfa..b646f3b14939d 100644
--- a/test/cpp_extensions/msnpu_extension.cpp
+++ b/test/cpp_extensions/ort_extension.cpp
@@ -10,10 +10,10 @@ Tensor get_tensor(caffe2::TypeMeta dtype, IntArrayRef size) {
       Storage(
           Storage::use_byte_size_t(),
           0,
-          at::DataPtr(nullptr, Device(DeviceType::MSNPU, 0)),
+          at::DataPtr(nullptr, Device(DeviceType::ORT, 0)),
           nullptr,
           false),
-      DispatchKey::MSNPU,
+      DispatchKey::ORT,
       dtype);
   // This is a hack to workaround the shape checks in _convolution.
   tensor_impl->set_sizes_contiguous(size);
@@ -52,7 +52,7 @@ std::tuple<Tensor,Tensor,Tensor> fake_convolution_backward(
             get_tensor(input.dtype(), {}));
 }
 
-TORCH_LIBRARY_IMPL(aten, MSNPU, m) {
+TORCH_LIBRARY_IMPL(aten, ORT, m) {
   m.impl("empty.memory_format",                empty_override);
   m.impl("add.out",                            add_out_override);
   m.impl("convolution_overrideable",           fake_convolution);
@@ -61,34 +61,34 @@ TORCH_LIBRARY_IMPL(aten, MSNPU, m) {
 
 // TODO: Extend this to exercise multi-device setting.  In that case,
 // we need to add a thread local variable to track the current device.
-struct MSNPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
-  static constexpr DeviceType static_type = DeviceType::MSNPU;
-  MSNPUGuardImpl() {}
-  MSNPUGuardImpl(DeviceType t) {
-    AT_ASSERT(t == DeviceType::MSNPU);
+struct ORTGuardImpl final : public c10::impl::DeviceGuardImplInterface {
+  static constexpr DeviceType static_type = DeviceType::ORT;
+  ORTGuardImpl() {}
+  ORTGuardImpl(DeviceType t) {
+    AT_ASSERT(t == DeviceType::ORT);
   }
   DeviceType type() const override {
-    return DeviceType::MSNPU;
+    return DeviceType::ORT;
   }
   Device exchangeDevice(Device d) const override {
-    AT_ASSERT(d.type() == DeviceType::MSNPU);
+    AT_ASSERT(d.type() == DeviceType::ORT);
     AT_ASSERT(d.index() == 0);
     return d;
   }
   Device getDevice() const override {
-    return Device(DeviceType::MSNPU, 0);
+    return Device(DeviceType::ORT, 0);
   }
   void setDevice(Device d) const override {
-    AT_ASSERT(d.type() == DeviceType::MSNPU);
+    AT_ASSERT(d.type() == DeviceType::ORT);
     AT_ASSERT(d.index() == 0);
   }
   void uncheckedSetDevice(Device d) const noexcept override {
   }
   Stream getStream(Device d) const noexcept override {
-    return Stream(Stream::DEFAULT, Device(DeviceType::MSNPU, 0));
+    return Stream(Stream::DEFAULT, Device(DeviceType::ORT, 0));
   }
   Stream exchangeStream(Stream s) const noexcept override {
-    return Stream(Stream::DEFAULT, Device(DeviceType::MSNPU, 0));
+    return Stream(Stream::DEFAULT, Device(DeviceType::ORT, 0));
   }
   DeviceIndex deviceCount() const noexcept override {
     return 1;
@@ -99,23 +99,23 @@ struct MSNPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
     const Stream& stream,
     const DeviceIndex device_index,
     const EventFlag flag) const override {
-    TORCH_CHECK(false, "MSNPU backend doesn't support events.");
+    TORCH_CHECK(false, "ORT backend doesn't support events.");
   }
   void block(
     void* event,
     const Stream& stream) const override {
-    TORCH_CHECK(false, "MSNPU backend doesn't support events.");
+    TORCH_CHECK(false, "ORT backend doesn't support events.");
   }
   bool queryEvent(void* event) const override {
-    TORCH_CHECK(false, "MSNPU backend doesn't support events.");
+    TORCH_CHECK(false, "ORT backend doesn't support events.");
   }
   void destroyEvent(
     void* event,
     const DeviceIndex device_index) const noexcept override { }
 };
 
-constexpr DeviceType MSNPUGuardImpl::static_type;
-C10_REGISTER_GUARD_IMPL(MSNPU, MSNPUGuardImpl);
+constexpr DeviceType ORTGuardImpl::static_type;
+C10_REGISTER_GUARD_IMPL(ORT, ORTGuardImpl);
 
 int get_test_int() {
   return test_int;
diff --git a/test/cpp_extensions/setup.py b/test/cpp_extensions/setup.py
index 8f77938ae3226..7888d0e3a88bb 100644
--- a/test/cpp_extensions/setup.py
+++ b/test/cpp_extensions/setup.py
@@ -21,7 +21,7 @@
         'torch_test_cpp_extension.cpp', ['extension.cpp'],
         extra_compile_args=CXX_FLAGS),
     CppExtension(
-        'torch_test_cpp_extension.msnpu', ['msnpu_extension.cpp'],
+        'torch_test_cpp_extension.ort', ['ort_extension.cpp'],
         extra_compile_args=CXX_FLAGS),
     CppExtension(
         'torch_test_cpp_extension.rng', ['rng_extension.cpp'],
diff --git a/test/test_cpp_extensions_aot.py b/test/test_cpp_extensions_aot.py
index 307df0eed5e9a..cf35e6b13265d 100644
--- a/test/test_cpp_extensions_aot.py
+++ b/test/test_cpp_extensions_aot.py
@@ -19,11 +19,11 @@
 try:
     if HAS_PYTEST:
         cpp_extension = pytest.importorskip("torch_test_cpp_extension.cpp")
-        msnpu_extension = pytest.importorskip("torch_test_cpp_extension.msnpu")
+        ort_extension = pytest.importorskip("torch_test_cpp_extension.ort")
         rng_extension = pytest.importorskip("torch_test_cpp_extension.rng")
     else:
         import torch_test_cpp_extension.cpp as cpp_extension
-        import torch_test_cpp_extension.msnpu as msnpu_extension
+        import torch_test_cpp_extension.ort as ort_extension
         import torch_test_cpp_extension.rng as rng_extension
 except ImportError as e:
     raise RuntimeError(
@@ -100,45 +100,45 @@ def test_optional(self):
         self.assertFalse(has_value)
 
 
-class TestMSNPUTensor(common.TestCase):
+class TestORTTensor(common.TestCase):
     def test_unregistered(self):
         a = torch.arange(0, 10, device='cpu')
         with self.assertRaisesRegex(RuntimeError, "Could not run"):
-            b = torch.arange(0, 10, device='msnpu')
+            b = torch.arange(0, 10, device='ort')
 
     def test_zeros(self):
         a = torch.empty(5, 5, device='cpu')
         self.assertEqual(a.device, torch.device('cpu'))
 
-        b = torch.empty(5, 5, device='msnpu')
-        self.assertEqual(b.device, torch.device('msnpu', 0))
-        self.assertEqual(msnpu_extension.get_test_int(), 0)
+        b = torch.empty(5, 5, device='ort')
+        self.assertEqual(b.device, torch.device('ort', 0))
+        self.assertEqual(ort_extension.get_test_int(), 0)
         self.assertEqual(torch.get_default_dtype(), b.dtype)
 
-        c = torch.empty((5, 5), dtype=torch.int64, device='msnpu')
-        self.assertEqual(msnpu_extension.get_test_int(), 0)
+        c = torch.empty((5, 5), dtype=torch.int64, device='ort')
+        self.assertEqual(ort_extension.get_test_int(), 0)
         self.assertEqual(torch.int64, c.dtype)
 
     def test_add(self):
-        a = torch.empty(5, 5, device='msnpu', requires_grad=True)
-        self.assertEqual(msnpu_extension.get_test_int(), 0)
+        a = torch.empty(5, 5, device='ort', requires_grad=True)
+        self.assertEqual(ort_extension.get_test_int(), 0)
 
-        b = torch.empty(5, 5, device='msnpu')
-        self.assertEqual(msnpu_extension.get_test_int(), 0)
+        b = torch.empty(5, 5, device='ort')
+        self.assertEqual(ort_extension.get_test_int(), 0)
 
         c = a + b
-        self.assertEqual(msnpu_extension.get_test_int(), 1)
+        self.assertEqual(ort_extension.get_test_int(), 1)
 
     def test_conv_backend_override(self):
         # To simplify tests, we use 4d input here to avoid doing view4d( which
         # needs more overrides) in _convolution.
-        input = torch.empty(2, 4, 10, 2, device='msnpu', requires_grad=True)
-        weight = torch.empty(6, 4, 2, 2, device='msnpu', requires_grad=True)
-        bias = torch.empty(6, device='msnpu')
+        input = torch.empty(2, 4, 10, 2, device='ort', requires_grad=True)
+        weight = torch.empty(6, 4, 2, 2, device='ort', requires_grad=True)
+        bias = torch.empty(6, device='ort')
 
         # Make sure forward is overriden
         out = torch.nn.functional.conv1d(input, weight, bias, 2, 0, 1, 1)
-        self.assertEqual(msnpu_extension.get_test_int(), 2)
+        self.assertEqual(ort_extension.get_test_int(), 2)
         self.assertEqual(out.shape[0], input.shape[0])
         self.assertEqual(out.shape[1], weight.shape[0])
 
@@ -146,7 +146,7 @@ def test_conv_backend_override(self):
         # Double backward is dispatched to _convolution_double_backward.
         # It is not tested here as it involves more computation/overrides.
         grad = torch.autograd.grad(out, input, out, create_graph=True)
-        self.assertEqual(msnpu_extension.get_test_int(), 3)
+        self.assertEqual(ort_extension.get_test_int(), 3)
         self.assertEqual(grad[0].shape, input.shape)
 
 
diff --git a/test/test_gen_backend_stubs.py b/test/test_gen_backend_stubs.py
index e1a66c69fe6f5..f788a8f34c761 100644
--- a/test/test_gen_backend_stubs.py
+++ b/test/test_gen_backend_stubs.py
@@ -138,11 +138,11 @@ def test_supported_invalid_op(self):
         self.assertExpectedInline(output_error, '''Found an invalid operator name: abs_BAD''')
 
     # The backend is valid, but doesn't have a valid autograd key. They can't override autograd kernels in that case.
-    # Only using MSNPU here because it has a valid backend key but not an autograd key- if this changes we can update the test.
+    # Only using Vulkan here because it has a valid backend key but not an autograd key- if this changes we can update the test.
     def test_backend_has_no_autograd_key_but_provides_entries(self):
         yaml_str = '''\
-backend: MSNPU
-cpp_namespace: torch_msnpu
+backend: Vulkan
+cpp_namespace: torch_vulkan
 supported:
 - add
 autograd:
@@ -155,7 +155,7 @@ def test_backend_has_no_autograd_key_but_provides_entries(self):
     def test_backend_autograd_kernel_mismatch_out_functional(self):
         yaml_str = '''\
 backend: XLA
-cpp_namespace: torch_msnpu
+cpp_namespace: torch_xla
 supported:
 - add.Tensor
 autograd:
@@ -168,7 +168,7 @@ def test_backend_autograd_kernel_mismatch_out_functional(self):
     def test_backend_autograd_kernel_mismatch_functional_inplace(self):
         yaml_str = '''\
 backend: XLA
-cpp_namespace: torch_msnpu
+cpp_namespace: torch_xla
 supported:
 - add.Tensor
 autograd:
@@ -182,7 +182,7 @@ def test_backend_autograd_kernel_mismatch_functional_inplace(self):
     def test_op_appears_in_supported_and_autograd_lists(self):
         yaml_str = '''\
 backend: XLA
-cpp_namespace: torch_msnpu
+cpp_namespace: torch_xla
 supported:
 - add.Tensor
 autograd:
diff --git a/test/test_torch.py b/test/test_torch.py
index 515052ae5ad67..d0f631a2eab52 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -221,10 +221,10 @@ def test_namespace(ns, *skips):
             # TODO: add torch.* tests when we have proper namespacing on ATen functions
             # test_namespace(torch)
 
-        def test_msnpu_error(self):
+        def test_ort_error(self):
             with self.assertRaisesRegex(RuntimeError,
-                                        "Could not run 'aten::empty.memory_format' with arguments from the 'MSNPU' backend"):
-                torch.zeros(1, device=torch.device('msnpu'))
+                                        "Could not run 'aten::empty.memory_format' with arguments from the 'ORT' backend"):
+                torch.zeros(1, device=torch.device('ort'))
 
         def test_has_storage(self):
             self.assertIsNotNone(torch.tensor([]).storage())
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 89697b4428ca1..e20d97333c83e 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -829,6 +829,7 @@ aten_cpu_source_non_codegen_list = [
     "aten/src/ATen/detail/CPUGuardImpl.cpp",
     "aten/src/ATen/detail/CUDAHooksInterface.cpp",
     "aten/src/ATen/detail/HIPHooksInterface.cpp",
+    "aten/src/ATen/detail/ORTHooksInterface.cpp",
     "aten/src/ATen/metal/Context.cpp",
     "aten/src/ATen/native/AutogradComposite.cpp",
     "aten/src/ATen/native/BatchLinearAlgebraKernel.cpp",
diff --git a/tools/codegen/model.py b/tools/codegen/model.py
index d6f02d5a6898d..4f82b70ee31f2 100644
--- a/tools/codegen/model.py
+++ b/tools/codegen/model.py
@@ -56,7 +56,7 @@ class DispatchKey(Enum):
     CUDA = auto()
     HIP = auto()
     FPGA = auto()
-    MSNPU = auto()
+    ORT = auto()
     XLA = auto()
     Lazy = auto()
     Vulkan = auto()
diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py
index 4f39fec2188fc..882b7f114e2e3 100644
--- a/tools/pyi/gen_pyi.py
+++ b/tools/pyi/gen_pyi.py
@@ -469,6 +469,7 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) -
         'is_sparse_csr' : ['is_sparse_csr: _bool'],
         'is_quantized': ['is_quantized: _bool'],
         'is_meta': ['is_meta: _bool'],
+        'is_ort': ['is_ort: _bool'],
         'is_mkldnn': ['is_mkldnn: _bool'],
         'is_vulkan': ['is_vulkan: _bool'],
         'storage_offset': ['def storage_offset(self) -> _int: ...'],
diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi
index 6468eb551f9cd..7ffb618e3f072 100644
--- a/torch/_C/_autograd.pyi
+++ b/torch/_C/_autograd.pyi
@@ -24,7 +24,7 @@ class DeviceType(Enum):
     IDEEP = ...
     HIP = ...
     FPGA = ...
-    MSNPU = ...
+    ORT = ...
     XLA = ...
     MLC = ...
     HPU = ...
diff --git a/torch/_tensor.py b/torch/_tensor.py
index 2bd617d3971a9..b4cee9aa2a32c 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -90,7 +90,7 @@ def __deepcopy__(self, memo):
             # does accurate alias tracking; however, the code below
             # doesn't work because of
             # https://github.com/pytorch/pytorch/issues/47442
-            if self.is_sparse or self.device.type in ['xla', 'mlc', 'meta']:
+            if self.is_sparse or self.device.type in ['xla', 'mlc', 'ort', 'meta']:
                 new_tensor = self.clone()
             else:
                 new_storage = self.storage().__deepcopy__(memo)
@@ -153,28 +153,21 @@ def _reduce_ex_internal(self, proto):
         # See Note [Don't serialize hooks]
         torch.utils.hooks.warn_if_has_hooks(self)
         backward_hooks: Dict[Any, Any] = OrderedDict()
-        # Note: Numpy array is chosen to be the rebuild component for XLA Tensor.
+        # Note: Numpy array is chosen to be the rebuild component for XLA, ORT, MLC Tensors.
         # We considered a few options:
         # 1. CPU tensor can't be used here.
         #    Otherwise in torch.load CPU storage is reconstructed with randomly
-        #    initialized data, moved onto XLA device, and then storage is updated
-        #    to the serialized content. This works perfectly for CPU/CUDA but not XLA.
-        #    XLA tensor is disconnected with storage so it doesn't get the update.
+        #    initialized data, moved onto backend device, and then storage is updated
+        #    to the serialized content. This works perfectly for CPU/CUDA but not these backends;
+        #    their tensors are disconnected with storage so they don't get the update.
         # 2. Python list is not a good fit due to performance reason.
         #    `tolist()` converts every single element in the tensor into python objects
         #    and serialize them one by one.
-        if self.device.type == 'xla':
-            arg_xla = (self.cpu().numpy(),
-                       self.dtype,
-                       str(self.device),
-                       self.requires_grad)
-            return (torch._utils._rebuild_xla_tensor, arg_xla)
-        if self.device.type == 'mlc':
-            arg_mlc = (self.cpu().numpy(),
-                       self.dtype,
-                       str(self.device),
-                       self.requires_grad)
-            return (torch._utils._rebuild_mlc_tensor, arg_mlc)
+        if self.device.type in ['xla', 'ort', 'mlc']:
+            return (torch._utils._rebuild_device_tensor_from_numpy, (self.cpu().numpy(),
+                                                                     self.dtype,
+                                                                     str(self.device),
+                                                                     self.requires_grad))
         if self.device.type == 'meta':
             # NB: This implementation BREAKS storage sharing.  Current
             # hypothesis is that no one cares for meta tensors.
diff --git a/torch/_utils.py b/torch/_utils.py
index 210b0cde793a6..75e9075e4250f 100644
--- a/torch/_utils.py
+++ b/torch/_utils.py
@@ -173,16 +173,15 @@ def _rebuild_sparse_tensor(layout, data):
     raise NotImplementedError("rebuilding sparse tensor for layout %s" % (layout))
 
 
-def _rebuild_xla_tensor(data, dtype, device, requires_grad):
+def _rebuild_device_tensor_from_numpy(data, dtype, device, requires_grad):
     tensor = torch.from_numpy(data).to(dtype=dtype, device=device)
     tensor.requires_grad = requires_grad
     return tensor
 
 
-def _rebuild_mlc_tensor(data, dtype, device, requires_grad):
-    tensor = torch.from_numpy(data).to(dtype=dtype, device=device)
-    tensor.requires_grad = requires_grad
-    return tensor
+# Should not be used, only here to be able to load Tensors serialized with older versions of pytorch
+_rebuild_xla_tensor = _rebuild_device_tensor_from_numpy
+_rebuild_mlc_tensor = _rebuild_device_tensor_from_numpy
 
 
 def _rebuild_meta_tensor_no_storage(dtype, size, stride, requires_grad):
diff --git a/torch/csrc/Device.h b/torch/csrc/Device.h
index b1f18dcebd1ab..32868120c06a1 100644
--- a/torch/csrc/Device.h
+++ b/torch/csrc/Device.h
@@ -17,6 +17,6 @@ inline bool THPDevice_Check(PyObject *obj) {
   return Py_TYPE(obj) == &THPDeviceType;
 }
 
-PyObject * THPDevice_New(const at::Device& device);
+TORCH_API PyObject * THPDevice_New(const at::Device& device);
 
-void THPDevice_init(PyObject *module);
+TORCH_API void THPDevice_init(PyObject *module);
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 2eacbf1cd3839..697ca871f83c5 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -114,7 +114,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
       .value("IDEEP", c10::DeviceType::IDEEP)
       .value("HIP", c10::DeviceType::HIP)
       .value("FPGA", c10::DeviceType::FPGA)
-      .value("MSNPU", c10::DeviceType::MSNPU)
+      .value("ORT", c10::DeviceType::ORT)
       .value("XLA", c10::DeviceType::XLA)
       .value("Lazy", c10::DeviceType::Lazy)
       .value("MLC", c10::DeviceType::MLC)
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index 303584603aaa0..50d6eb9ab7e05 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -834,6 +834,17 @@ PyObject *THPVariable_is_mlc(THPVariable *self, void *unused)
   END_HANDLE_TH_ERRORS
 }
 
+PyObject *THPVariable_is_ort(THPVariable *self, void *unused)
+{
+  HANDLE_TH_ERRORS
+  if (check_has_torch_function((PyObject *)self)) {
+    return handle_torch_function_getter(self, "is_ort");
+  }
+  auto& self_ = THPVariable_Unpack(self);
+  return torch::autograd::utils::wrap(self_.is_ort());
+  END_HANDLE_TH_ERRORS
+}
+
 PyObject *THPVariable_is_vulkan(THPVariable *self, void *unused)
 {
   HANDLE_TH_ERRORS
@@ -980,6 +991,7 @@ static struct PyGetSetDef THPVariable_properties[] = {
   {"is_sparse_csr", (getter)THPVariable_is_sparse_csr, nullptr, nullptr, nullptr},
   {"is_mkldnn", (getter)THPVariable_is_mkldnn, nullptr, nullptr, nullptr},
   {"is_mlc", (getter)THPVariable_is_mlc, nullptr, nullptr, nullptr},
+  {"is_ort", (getter)THPVariable_is_ort, nullptr, nullptr, nullptr},
   {"is_vulkan", (getter)THPVariable_is_vulkan, nullptr, nullptr, nullptr},
   {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr},
   {"is_quantized", (getter)THPVariable_is_quantized, nullptr, nullptr, nullptr},
diff --git a/torch/csrc/jit/frontend/sugared_value.cpp b/torch/csrc/jit/frontend/sugared_value.cpp
index ab70d6c6f326a..a5f000769badc 100644
--- a/torch/csrc/jit/frontend/sugared_value.cpp
+++ b/torch/csrc/jit/frontend/sugared_value.cpp
@@ -119,7 +119,7 @@ std::shared_ptr<SugaredValue> SimpleValue::attr(
            {"layout", "prim"},        {"T", "prim"},
            {"ndim", "prim"},          {"name", "prim"},
            {"real", "aten"},          {"imag", "aten"},
-           {"retains_grad", "aten"},
+           {"retains_grad", "aten"},  {"is_ort", "prim"},
        }},
       {TypeKind::DeviceObjType, {{"type", "prim"}, {"index", "prim"}}}};
   auto kind = value_->type()->kind();
diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp
index a61cb48b1ddce..984073fbf72c1 100644
--- a/torch/csrc/jit/runtime/register_prim_ops.cpp
+++ b/torch/csrc/jit/runtime/register_prim_ops.cpp
@@ -2211,6 +2211,14 @@ RegisterOperators reg1(
            push(stack, a.is_meta());
          },
          aliasAnalysisFromSchema()),
+     OperatorGenerator(
+         TORCH_SELECTIVE_SCHEMA("prim::is_ort(Tensor a) -> bool"),
+         [](Stack* stack) {
+           at::Tensor a;
+           pop(stack, a);
+           push(stack, a.is_ort());
+         },
+         aliasAnalysisFromSchema()),
      OperatorGenerator(
          TORCH_SELECTIVE_SCHEMA("prim::name(Tensor a) -> str?"),
          [](Stack* stack) {
diff --git a/torch/library.h b/torch/library.h
index ce2bb92e5723e..a873b4226dbca 100644
--- a/torch/library.h
+++ b/torch/library.h
@@ -317,8 +317,8 @@ inline CppFunction dispatch(c10::DeviceType type, Func&& raw_f) {
         return c10::DispatchKey::Meta;
       case c10::DeviceType::HIP:
         return c10::DispatchKey::HIP;
-      case c10::DeviceType::MSNPU:
-        return c10::DispatchKey::MSNPU;
+      case c10::DeviceType::ORT:
+        return c10::DispatchKey::ORT;
       case c10::DeviceType::HPU:
         return c10::DispatchKey::HPU;
       default:
diff --git a/torch/overrides.py b/torch/overrides.py
index 5a0ea6ca81737..09748b982b428 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -1030,6 +1030,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         Tensor.retains_grad.__get__: lambda self: -1,
         Tensor.is_meta.__get__: lambda self: -1,
         Tensor.is_mlc.__get__: lambda self: -1,
+        Tensor.is_ort.__get__: lambda self: -1,
         Tensor.is_mkldnn.__get__: lambda self: -1,
         Tensor.is_quantized.__get__: lambda self: -1,
         Tensor.is_sparse.__get__: lambda self: -1,

From d6d86efb1c839ddafd1398d6dab9caa4f31a9f0b Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Fri, 20 Aug 2021 11:11:49 -0700
Subject: [PATCH 101/530] [nnc] Support thread level parallelism in fused
 kernels (#63386)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63386

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30360382

Pulled By: bertmaher

fbshipit-source-id: 29acf4e932c669ce0f35823faea9099bcd8119b6
---
 test/cpp/tensorexpr/test_kernel.cpp        | 30 ++++++++
 torch/csrc/jit/tensorexpr/kernel.cpp       | 87 ++++++++++++++++++++++
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 24 ++++--
 torch/csrc/jit/tensorexpr/llvm_jit.h       |  8 +-
 torch/csrc/jit/tensorexpr/loopnest.cpp     |  7 ++
 5 files changed, 148 insertions(+), 8 deletions(-)

diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp
index 8f36f54395f49..8d4e48c4a0bff 100644
--- a/test/cpp/tensorexpr/test_kernel.cpp
+++ b/test/cpp/tensorexpr/test_kernel.cpp
@@ -206,6 +206,36 @@ TEST_F(Kernel, _3) {
   }
 }
 
+TEST_F(Kernel, ParallelStrided) {
+  KernelScope kernel_scope;
+
+  const auto graph_string = R"IR(
+      graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu),
+            %1 : Float(5, 3, 40005, strides=[960120, 160020, 2], device=cpu)):
+        %2 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %1)
+        %3 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %2)
+        return (%3))IR";
+  auto graph = std::make_shared<Graph>();
+  parseIR(graph_string, &*graph);
+
+  auto a = at::rand({5, 3, 40005}, TensorOptions(kCPU).dtype(at::kFloat));
+  auto b = at::rand({10, 6, 80010}, TensorOptions(kCPU).dtype(at::kFloat))
+               .index(
+                   {Slice(None, None, 2),
+                    Slice(None, None, 2),
+                    Slice(None, None, 2)});
+  auto ref = a * (a * b);
+  auto o = at::zeros_like(ref);
+  TensorExprKernel k(graph);
+  std::vector<at::Tensor> inputs = {a, b};
+  std::vector<IValue> stack = fmap<IValue>(inputs);
+  k.run(stack);
+  o = stack[0].toTensor();
+  for (size_t i = 0; i < 5 * 3; i++) {
+    CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]);
+  }
+}
+
 TEST_F(Kernel, DISABLED_Shape_Inference) {
   // disabled: doesn't do stride propagation, and isn't being used currently
 
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index faacd022e7e0b..c5333b2010610 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -2,6 +2,7 @@
 #include <torch/csrc/jit/tensorexpr/kernel.h>
 
 #include <ATen/ExpandUtils.h>
+#include <ATen/Parallel.h>
 #include <ATen/TensorGeometry.h>
 #include <c10/util/irange.h>
 #include <c10/util/string_utils.h>
@@ -2487,6 +2488,86 @@ void fuseAllLoops(StmtPtr st) {
   }
 }
 
+// Compute the trip count of a loop if it is a constant.
+c10::optional<int64_t> tripCount(ForPtr loop) {
+  auto tc = IRSimplifier::simplify(
+      cast<int64_t>(ExprHandle(loop->stop()) - ExprHandle(loop->start())));
+  if (auto val = to<LongImm>(tc.node())) {
+    return val->value();
+  }
+  return c10::nullopt;
+}
+
+// Prune innermost loops until iterations satisfies a minimum grain size.
+static void pruneByGrainSize(std::vector<ForPtr>& loops) {
+  constexpr int64_t minGrainSize = 32768;
+  int64_t grainSize = 1;
+  for (int64_t i = loops.size(); i > 0; i--) {
+    auto tc = tripCount(loops[i - 1]);
+    if (!tc) {
+      break;
+    }
+    grainSize *= *tc;
+    if (grainSize < minGrainSize) {
+      loops.pop_back();
+    }
+  }
+}
+
+// Retain enough outermost loops to fill the number of threads.
+static void pruneByThreadCount(std::vector<ForPtr>& loops) {
+  int64_t trips = 1;
+  auto threads = at::get_num_threads();
+  auto it = loops.begin();
+  for (; it != loops.end(); it++) {
+    if (trips >= threads) {
+      break;
+    }
+    auto tc = tripCount(*it);
+    if (!tc) {
+      break;
+    }
+    trips *= *tc;
+  }
+  loops.erase(it, loops.end());
+}
+
+// Flatten and parallelize outer loops, subject to a minimum number of elements
+// in the inner loop, and a maximum level of thread-level parallelism in the
+// outer loops.
+template <typename Bufs>
+static void parallelizeOuterLoops(LoopNest& l, Bufs&& bufs) {
+  for (auto const& buf : bufs) {
+    auto loops = l.getLoopStmtsFor(buf);
+    pruneByGrainSize(loops);
+    pruneByThreadCount(loops);
+
+    // There are no loops to parallelize; give up.
+    if (loops.size() == 0) {
+      continue;
+    }
+    // The loop nest contains a reduction; give up.
+    auto reductions = NodeFinder<ReduceOp>::find(loops[0]);
+    if (reductions.size() > 0) {
+      continue;
+    }
+    // The loop nest has loop carried dependences; give up.
+    if (LoopNest::hasLoopCarriedDependence(loops[0])) {
+      continue;
+    }
+    // Try to flatten the outer loops and parallelize them if successful.
+    ForPtr flattened = nullptr;
+    if (loops.size() == 1) {
+      flattened = loops[0];
+    } else {
+      LoopNest::flatten(loops, &flattened);
+    }
+    if (flattened) {
+      flattened->set_parallel();
+    }
+  }
+}
+
 StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
   torch::jit::tensorexpr::LoopNest l(st, bufOutputs_);
   GRAPH_DEBUG("Original Stmt:\n", std::to_string(l.root_stmt()), "\n");
@@ -2528,6 +2609,8 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
   if (backendType == kLLVMCodeGen) {
     fuseAllLoops(l.root_stmt());
     GRAPH_DEBUG("after fuse", *l.root_stmt());
+    parallelizeOuterLoops(l, bufOutputs_);
+    GRAPH_DEBUG("after parallelize", *l.root_stmt());
   }
 
   if (backendType == kCudaCodeGen) {
@@ -2602,9 +2685,13 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
   }
 
   l.prepareForCodegen();
+  GRAPH_DEBUG("after prepareForCodegen", *l.root_stmt());
+  l.simplify();
+  GRAPH_DEBUG("after simplification", *l.root_stmt());
 
   if (backendType == kLLVMCodeGen && !hasReduction) {
     l.vectorizeInnerLoops();
+    GRAPH_DEBUG("after vectorization", *l.root_stmt());
   }
 
   StmtPtr stmt = l.root_stmt();
diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index eac1f82f25c4b..d5a95bc4cf886 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -274,15 +274,24 @@ class LLVMCodeGenImpl : public IRVisitor {
   }
 };
 
+extern "C" {
 typedef void (*ParallelCallee)(int index, int8_t* packed_data);
-void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data) {
+void DispatchParallel(
+    int8_t* func,
+    int start,
+    int stop,
+    int8_t* packed_data) noexcept {
   // TODO: preserve the func type.
-  ParallelCallee callee = reinterpret_cast<ParallelCallee>(func);
-  at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) {
-    for (int index = f_begin; index < f_end; index++) {
-      callee(index, packed_data);
-    }
-  });
+  try {
+    ParallelCallee callee = reinterpret_cast<ParallelCallee>(func);
+    at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) {
+      for (int index = f_begin; index < f_end; index++) {
+        callee(index, packed_data);
+      }
+    });
+  } catch (...) {
+  }
+}
 }
 
 } // namespace tensorexpr
@@ -1287,6 +1296,7 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) {
       module_->getOrInsertFunction("DispatchParallel", dispatcher_fntype);
   llvm::Function* dispatcher =
       llvm::cast<llvm::Function>(dispatcher_callee.getCallee());
+  dispatcher->addFnAttr(llvm::Attribute::NoUnwind);
   irb_.CreateCall(
       dispatcher, {func_value, start, stop, packed_caller_args_ptr});
   value_ = llvm::ConstantInt::get(IntTy_, 0);
diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.h b/torch/csrc/jit/tensorexpr/llvm_jit.h
index 30ad5317a1b3c..8585900abc8d6 100644
--- a/torch/csrc/jit/tensorexpr/llvm_jit.h
+++ b/torch/csrc/jit/tensorexpr/llvm_jit.h
@@ -17,7 +17,13 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data);
+extern "C" {
+void DispatchParallel(
+    int8_t* func,
+    int start,
+    int stop,
+    int8_t* packed_data) noexcept;
+}
 
 inline std::string formatError(llvm::Error&& err, const char* msg) {
   static constexpr char* defaultErrorMsg = "Unexpected failure in LLVM JIT";
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index a296d8c7af79b..7bcdd1a666f7b 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -179,6 +179,13 @@ class Vectorizer : public IRMutator {
     });
   }
 
+  ExprPtr mutate(ModPtr v) override {
+    std::vector<ExprPtr> inputs = {v->lhs(), v->rhs()};
+    return try_vectorize(v, inputs, [&]() {
+      return ExprHandle(inputs[0]) % ExprHandle(inputs[1]);
+    });
+  }
+
   ExprPtr mutate(AndPtr v) override {
     std::vector<ExprPtr> inputs = {v->lhs(), v->rhs()};
     return try_vectorize(v, inputs, [&]() {

From 6600bc96517269c608ea47b76b6bda9476c7bcef Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Fri, 20 Aug 2021 11:11:49 -0700
Subject: [PATCH 102/530] Remove flag to toggle CPU fusion in the presence of
 parallelism (#63514)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63514

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30417127

Pulled By: bertmaher

fbshipit-source-id: b77d7c68364f2af73570740540f3b1152313016e
---
 test/cpp/tensorexpr/test_te_fuser_pass.cpp |  6 +-----
 test/jit/test_profiler.py                  |  3 ---
 test/test_jit_fuser_te.py                  |  5 -----
 test/test_tensorexpr.py                    |  4 ----
 torch/csrc/jit/passes/tensorexpr_fuser.cpp | 19 +------------------
 torch/csrc/jit/passes/tensorexpr_fuser.h   |  2 --
 torch/csrc/jit/python/init.cpp             |  2 --
 7 files changed, 2 insertions(+), 39 deletions(-)

diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp
index 8dd616453362b..91fb4c2b7582c 100644
--- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp
+++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp
@@ -15,19 +15,15 @@ namespace jit {
 using namespace torch::jit::tensorexpr;
 
 struct WithCPUFuser {
-  WithCPUFuser(bool val = true)
-      : cpuFuserEnabled(canFuseOnCPU()), parallel(texprParallelCPUEnabled()) {
+  WithCPUFuser(bool val = true) : cpuFuserEnabled(canFuseOnCPU()) {
     overrideCanFuseOnCPU(val);
-    setTexprParallelCPUEnabled(true);
   }
 
   ~WithCPUFuser() {
     overrideCanFuseOnCPU(cpuFuserEnabled);
-    setTexprParallelCPUEnabled(parallel);
   }
 
   bool cpuFuserEnabled;
-  bool parallel;
 };
 
 TEST(TEFuserPass, FuserPass_1) {
diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py
index aa8be0518385f..b9ed9d0b78eb5 100644
--- a/test/jit/test_profiler.py
+++ b/test/jit/test_profiler.py
@@ -29,8 +29,6 @@ def setUp(self):
         torch._C._debug_set_fusion_group_inlining(False)
         self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu()
         torch._C._jit_set_te_must_use_llvm_cpu(False)
-        self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled()
-        torch._C._jit_set_texpr_parallel_cpu_enabled(True)
 
     def tearDown(self):
         torch._C._jit_set_profiling_executor(self.prev_exec)
@@ -42,7 +40,6 @@ def tearDown(self):
         torch._C._jit_set_texpr_reductions_enabled(self.old_reduction_enabled)
         torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining)
         torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu)
-        torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel)
 
     def test_tensor_type_not_determined_by_inputs(self):
         @torch.jit.script
diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index 64c26b7936b54..614226ff871ba 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -85,10 +85,6 @@ def setUp(self):
         self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu()
         torch._C._jit_set_te_must_use_llvm_cpu(False)
 
-        # TODO: CPU fuser currently is disabled when multithreading.
-        self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled()
-        torch._C._jit_set_texpr_parallel_cpu_enabled(True)
-
         self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
         self.int_dtypes = [
             torch.int8,
@@ -116,7 +112,6 @@ def tearDown(self):
 
         torch._C._jit_set_texpr_fuser_enabled(self.texpr_fuser_state)
         torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu)
-        torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel)
 
     def assertLastGraphAllFused(self):
         self.assertAllFused(torch.jit.last_executed_optimized_graph())
diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py
index 6353113a1ec4c..47c7e689aa6a4 100644
--- a/test/test_tensorexpr.py
+++ b/test/test_tensorexpr.py
@@ -24,9 +24,6 @@ def setUp(self):
         torch._C._debug_set_fusion_group_inlining(False)
         self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu()
         torch._C._jit_set_te_must_use_llvm_cpu(False)
-        # TODO: CPU fuser currently is disabled when multithreading.
-        self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled()
-        torch._C._jit_set_texpr_parallel_cpu_enabled(True)
 
         self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
 
@@ -39,7 +36,6 @@ def tearDown(self):
         torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuser_state)
         torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining)
         torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu)
-        torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel)
 
     def assertLastGraphAllFused(self):
         self.assertAllFused(torch.jit.last_executed_optimized_graph())
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index d4add03506c4f..52bf4539479df 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -1,6 +1,5 @@
 #include <torch/csrc/jit/passes/tensorexpr_fuser.h>
 
-#include <ATen/Parallel.h>
 #include <ATen/core/interned_strings.h>
 #include <ATen/record_function.h>
 #include <c10/util/FunctionRef.h>
@@ -250,15 +249,6 @@ bool isSupported(Node* node) {
 } // namespace tensorexpr
 
 static bool texpr_fuser_enabled_ = true;
-static bool texpr_parallel_cpu_enabled = false;
-
-bool texprParallelCPUEnabled() {
-  return texpr_parallel_cpu_enabled;
-}
-
-void setTexprParallelCPUEnabled(bool val) {
-  texpr_parallel_cpu_enabled = val;
-}
 
 void setTensorExprFuserEnabled(bool val) {
   texpr_fuser_enabled_ = val;
@@ -898,14 +888,7 @@ class TensorExprFuser {
       return false;
     }
     if (device->is_cpu()) {
-      // CPU fusion is only supported for single-thread.
-      if (!canFuseOnCPU()) {
-        return false;
-      }
-      if (at::get_num_threads() == 1 || texprParallelCPUEnabled()) {
-        return true;
-      }
-      return false;
+      return canFuseOnCPU();
     } else if (device->is_cuda()) {
       return canFuseOnGPU();
     } else if (device->is_xpu()) {
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.h b/torch/csrc/jit/passes/tensorexpr_fuser.h
index 3f6538b7e587a..254aebd91d12f 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.h
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.h
@@ -24,8 +24,6 @@ TORCH_API void setTensorExprFuserEnabled(bool val);
 TORCH_API bool tensorExprFuserEnabled();
 TORCH_API bool setTexprReductionsEnabled(bool value);
 TORCH_API bool texprReductionsEnabled();
-TORCH_API bool texprParallelCPUEnabled();
-TORCH_API void setTexprParallelCPUEnabled(bool val);
 
 TORCH_API void RemoveProfileNodesAndSpecializeTypes(
     std::shared_ptr<Graph>& graph);
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 5fca575593551..992e60edd7d19 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -711,8 +711,6 @@ void initJITBindings(PyObject* module) {
       .def("_jit_texpr_set_fallback_allowed", &tensorexpr::setFallbackAllowed)
       .def("_jit_set_texpr_reductions_enabled", &setTexprReductionsEnabled)
       .def("_jit_texpr_reductions_enabled", &texprReductionsEnabled)
-      .def("_jit_set_texpr_parallel_cpu_enabled", &setTexprParallelCPUEnabled)
-      .def("_jit_texpr_parallel_cpu_enabled", &texprParallelCPUEnabled)
       .def(
           "_jit_set_te_generate_block_code",
           [](bool gen_block_code) {

From b9fc656cf26d60127bd695e4e5a7d27622f2563d Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Fri, 20 Aug 2021 11:11:49 -0700
Subject: [PATCH 103/530] [nnc] Enable CPU fusion (#63545)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63545

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30417370

Pulled By: bertmaher

fbshipit-source-id: 84ce7a578a3678d5562bab99d1dc00330c4f72d1
---
 torch/csrc/jit/codegen/fuser/interface.cpp |  8 ++------
 torch/csrc/jit/passes/graph_fuser.cpp      | 12 +++++++++++-
 torch/csrc/jit/passes/graph_fuser.h        |  3 +++
 torch/csrc/jit/python/init.cpp             |  2 ++
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/torch/csrc/jit/codegen/fuser/interface.cpp b/torch/csrc/jit/codegen/fuser/interface.cpp
index ec67c4bd83773..ef7e9e0b629d5 100644
--- a/torch/csrc/jit/codegen/fuser/interface.cpp
+++ b/torch/csrc/jit/codegen/fuser/interface.cpp
@@ -8,15 +8,12 @@
 #include <c10/util/Flags.h>
 #include <stdexcept>
 
-C10_DEFINE_bool(torch_jit_enable_cpu_fusion, false, "enable cpu fusion");
-
 namespace torch {
 namespace jit {
 
 namespace detail {
 
-// Note: CPU fusion is currently disabled due to test flakiness
-#if defined(FBCODE_CAFFE2)
+#ifdef TORCH_ENABLE_LLVM
 bool cpu_fuser_enabled = true;
 #else
 bool cpu_fuser_enabled = false;
@@ -37,8 +34,7 @@ void runFusion(const int64_t key, Stack& stack) {
 }
 
 bool canFuseOnCPU() {
-  return fuser::hasFusionBackend(DeviceType::CPU) &&
-      (detail::cpu_fuser_enabled || FLAGS_torch_jit_enable_cpu_fusion);
+  return fuser::hasFusionBackend(DeviceType::CPU) && detail::cpu_fuser_enabled;
 }
 
 bool canFuseOnGPU() {
diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp
index f7dd466de4ff4..653f9fec08b32 100644
--- a/torch/csrc/jit/passes/graph_fuser.cpp
+++ b/torch/csrc/jit/passes/graph_fuser.cpp
@@ -183,7 +183,7 @@ struct GraphFuser {
       return !strict_fuser_check;
     }
     if ((*device).is_cpu()) {
-      return canFuseOnCPU();
+      return canFuseOnCPULegacy();
     } else if ((*device).is_cuda()) {
       return canFuseOnGPU();
     } else if ((*device).is_xpu()) {
@@ -1244,6 +1244,16 @@ void PeepholeOptimizeShapeExpressions(Block* block, AliasDb* db) {
 
 } // anonymous namespace
 
+static bool cpu_fuser_enabled_legacy = false;
+
+bool canFuseOnCPULegacy() {
+  return cpu_fuser_enabled_legacy;
+}
+
+void overrideCanFuseOnCPULegacy(bool value) {
+  cpu_fuser_enabled_legacy = value;
+}
+
 void FuseGraph(std::shared_ptr<Graph>& graph, bool strict_fuser_check) {
   AliasDb db(graph);
   GraphFuser(&db, graph->block(), strict_fuser_check).run();
diff --git a/torch/csrc/jit/passes/graph_fuser.h b/torch/csrc/jit/passes/graph_fuser.h
index 0cdcc2e20f469..d710e5a098098 100644
--- a/torch/csrc/jit/passes/graph_fuser.h
+++ b/torch/csrc/jit/passes/graph_fuser.h
@@ -5,6 +5,9 @@
 namespace torch {
 namespace jit {
 
+TORCH_API bool canFuseOnCPULegacy();
+TORCH_API void overideCanFuseOnCPULegacy(bool value);
+
 // NB: Be sure to run DCE before fusion, because dead instructions
 // can prevent fusion opportunities from being exploited.
 // On Windows will noop, NYI
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 992e60edd7d19..f5da7b30c29d7 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -589,6 +589,8 @@ void initJITBindings(PyObject* module) {
       .def("_jit_override_can_fuse_on_gpu", &overrideCanFuseOnGPU)
       .def("_jit_can_fuse_on_cpu", &canFuseOnCPU)
       .def("_jit_can_fuse_on_gpu", &canFuseOnGPU)
+      .def("_jit_can_fuse_on_cpu_legacy", &canFuseOnCPULegacy)
+      .def("_jit_override_can_fuse_on_cpu_legacy", &canFuseOnCPULegacy)
       .def(
           "_jit_differentiate",
           [](Graph& g) {

From 70a3210ecaa0162b4673f53faa17675a9d3ca8de Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Fri, 20 Aug 2021 11:43:07 -0700
Subject: [PATCH 104/530] Add `BinaryUfuncOpInfo` and broadcasting tests
 (#61964)

Summary:
As proof of concept, this PR uses the new `BinaryUfuncOpInfo` in broadcasting tests for `add`, `sub`, `mul`, `div`, `floor_div`, and `true_div`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/61964

Reviewed By: ngimel

Differential Revision: D30407734

Pulled By: mruberry

fbshipit-source-id: ada28994f43b0635f279f45a02ecba18bc8ee033
---
 test/test_binary_ufuncs.py                    |  80 ++++-
 test/test_jit_fuser_te.py                     |   2 +
 .../_internal/common_methods_invocations.py   | 333 +++++++++++-------
 3 files changed, 287 insertions(+), 128 deletions(-)

diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py
index f952911d206f6..4995e0dfc6cc7 100644
--- a/test/test_binary_ufuncs.py
+++ b/test/test_binary_ufuncs.py
@@ -17,8 +17,9 @@
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA,
     dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA,
-    skipCUDAIfRocm, skipIf)
+    skipCUDAIfRocm, skipIf, ops)
 from torch.testing import all_types_and_complex_and, integral_types_and
+from torch.testing._internal.common_methods_invocations import binary_ufuncs
 
 if TEST_SCIPY:
     import scipy.special
@@ -89,6 +90,74 @@ def _make_tensor(shape, dtype, device, fill_ones=False) -> torch.Tensor:
 
 # TODO: update to use opinfos consistently
 class TestBinaryUfuncs(TestCase):
+    @ops(binary_ufuncs, allowed_dtypes=(torch.float32,))
+    def test_broadcasting(self, device, dtype, op):
+        for shape_lhs, shape_rhs in (
+            ((1,), ()),
+            ((2,), ()),
+            ((1,), (2,)),
+            ((2,), (2,)),
+            ((2, 1), (2,)),
+            ((1, 2), (2,)),
+            ((3, 2), (2,)),
+            ((3, 2), (3, 2)),
+            ((1, 3, 2), (2,)),
+            ((1, 3, 2), (3, 2)),
+            ((3, 1, 2), (3, 2)),
+            ((1, 3, 2), (1, 3, 2)),
+            ((2, 3, 2), ()),
+            ((2, 3, 2), (2, 3, 2)),
+            ((3, 1, 2), (1, 3, 2)),
+        ):
+            lhs = make_tensor(shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs)
+            rhs = make_tensor(shape_rhs, device=device, dtype=dtype, **op.rhs_make_tensor_kwargs)
+
+            actual = op(lhs, rhs).shape
+            expected = torch.broadcast_shapes(shape_lhs, shape_rhs)
+
+            msg = (
+                f"On {device}, torch.{op.name} broadcasts inputs of shapes {shape_lhs} and {shape_rhs} incorrectly: "
+                f"{actual} != {expected}"
+            )
+            self.assertEqual(actual, expected, msg=msg)
+
+    @ops(binary_ufuncs, allowed_dtypes=(torch.float32,))
+    def test_broadcast_python_scalar(self, device, dtype, op):
+        for shape_lhs in ((), (1,), (2,), (1, 2, 3),):
+            lhs = make_tensor(shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs)
+            rhs_tensor = make_tensor((), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs)
+            rhs_python = rhs_tensor.item()
+
+            actual = op(lhs, rhs_python)
+            expected = op(lhs, rhs_tensor)
+
+            self.assertEqual(
+                actual.shape,
+                expected.shape,
+                msg=f"On {device}, torch.{op.name} broadcasts Python scalars different than 0d tensors.",
+            )
+
+    @ops(binary_ufuncs, allowed_dtypes=(torch.float32,))
+    def test_not_broadcastable(self, device, dtype, op):
+        for shape_lhs, shape_rhs in (
+                ((2,), (3,)),
+                ((3, 1), (2, 1)),
+                ((1, 3, 2), (3,)),
+                ((3, 1, 2), (2, 1, 2)),
+        ):
+            lhs = make_tensor(shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs)
+            rhs = make_tensor(shape_rhs, device=device, dtype=dtype, **op.rhs_make_tensor_kwargs)
+
+            try:
+                broadcasted_shape = op(lhs, rhs).shape
+            except RuntimeError:
+                continue
+
+            msg = (
+                f"On {device}, torch.{op.name} broadcasts inputs shapes {shape_lhs} and {shape_rhs} into "
+                f"{broadcasted_shape}, although they are not broadcastable."
+            )
+            raise AssertionError(msg)
 
     def test_add_broadcast_empty(self, device):
         # empty + empty
@@ -1184,11 +1253,10 @@ def _wrapped_ifloordiv_scalar(a):
     # Also tests that reverse operations are equivalent to forward ops
     # NOTE: division ops are tested separately above
     def test_binary_ops_with_scalars(self, device):
-        for ops in ((operator.add, torch.add),
-                    (operator.sub, torch.sub),
-                    (operator.mul, torch.mul),
-                    (operator.truediv, torch.div)):
-            python_op, torch_op = ops
+        for python_op, torch_op in ((operator.add, torch.add),
+                                    (operator.sub, torch.sub),
+                                    (operator.mul, torch.mul),
+                                    (operator.truediv, torch.div)):
 
             for a, b in product(range(-10, 10), range(-10, 10)):
                 for op in (lambda x: x * .5, lambda x: math.floor(x)):
diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index 614226ff871ba..b89caca44a1b2 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -1929,6 +1929,8 @@ def eager(x):
     'cosh',
     'div.no_rounding_mode',
     'div.true_rounding',
+    'div.floor_rounding',
+    'div.trunc_rounding',
     'eq',
     'erf',
     'erfc',
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 873d91c0e1293..617b102642d05 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -1268,53 +1268,151 @@ def sample_inputs_linalg_vector_norm(op_info, device, dtype, requires_grad, **kw
 
     return inputs
 
-# In order to use the kwarg alpha, partials should be used in an OpInfo's sample_inputs_func
-# eg. sample_inputs_func=partial(sample_inputs_binary_pwise, alpha=2)
-# Then one sample input would also be generated corresponding to the value of alpha provided.
-# In the future, kwargs 'alpha_floating', 'alpha_integral' & 'alpha_complex' can be used to
-# specify scalars of floating, integral & complex types as values for "alpha".
-# Keyword argument `rhs_exclude_zero` is used to exclude zero values from rhs tensor argument
-# This is necessary for operations like `true_divide`, where divide by zero throws an exception.
-def sample_inputs_binary_pwise(op_info, device, dtype, requires_grad, extra_kwargs=None, **kwargs):
-    if extra_kwargs is None:
-        extra_kwargs = {}
-
-    scalar = 3.14 + 3.14j if dtype.is_complex else (3.14 if dtype.is_floating_point else 3)
-    scalar = 1 if dtype is torch.bool else scalar
-    tests_list = [
-        ((S, S, S), (S, S, S), False),
-        ((S, S, S), (S, S), False),
-        ((), (), False),
-        ((S, S, S), (), False),
-        ((S, S, S), scalar, False),
-        ((), scalar, False)
-    ]
-    tests_with_lhs_broadcasting = [
-        ((S, S), (S, S, S), True),
-        ((), (S, S, S), True),
-        ((S, 1, S), (M, S), True),
+
+# Metadata class for binary "universal functions (ufuncs)" that accept two
+# tensor and have common properties
+class BinaryUfuncInfo(OpInfo):
+    """Operator information for 'universal binary functions (binary ufuncs).'
+    These are functions of two tensors with common properties like:
+      - they are elementwise functions
+      - the output shape is determined by the input shape
+      - they typically have method and inplace variants
+      - they typically support the out kwarg
+      - they typically have NumPy or SciPy references
+    See NumPy's universal function documentation
+    (https://numpy.org/doc/stable/reference/ufuncs.html) for more details
+    about the concept of ufuncs.
+    """
+    def __init__(self, name, *, lhs_make_tensor_kwargs=None, rhs_make_tensor_kwargs=None, **kwargs):
+        super().__init__(name, **kwargs)
+
+        # [lr]hs_make_tensor_kwargs are part of the OpInfo to be able to dynamically generate valid samples later on.
+        if lhs_make_tensor_kwargs is None:
+            lhs_make_tensor_kwargs = {}
+        self.lhs_make_tensor_kwargs = lhs_make_tensor_kwargs
+
+        if rhs_make_tensor_kwargs is None:
+            rhs_make_tensor_kwargs = {}
+        self.rhs_make_tensor_kwargs = rhs_make_tensor_kwargs
+
+
+def _resolve_binay_pwise_kwargs(
+        op_info, *, op_kwargs=None, lhs_make_tensor_kwargs=None, rhs_make_tensor_kwargs=None
+):
+    """Resolves default values for :func:`sample_inputs_binary_pwise`.
+
+    By default :attr:`op_kwargs`, :attr:`lhs_make_tensor_kwargs`, and :attr:`rhs_make_tensor_kwargs` are just empty
+    dictionaries. In case :attr:`op_info` is a :class:`BinaryUfuncInfo`, :attr:`BinaryUfuncInfo.lhs_make_tensor_kwargs`
+    and :attr:`BinaryUfuncInfo.rhs_make_tensor_kwargs` will be used as defaults.
+    """
+    if op_kwargs is None:
+        op_kwargs = {}
+    if lhs_make_tensor_kwargs is None:
+        lhs_make_tensor_kwargs = op_info.lhs_make_tensor_kwargs if isinstance(op_info, BinaryUfuncInfo) else {}
+    if rhs_make_tensor_kwargs is None:
+        rhs_make_tensor_kwargs = op_info.rhs_make_tensor_kwargs if isinstance(op_info, BinaryUfuncInfo) else {}
+
+    return op_kwargs, lhs_make_tensor_kwargs, rhs_make_tensor_kwargs
+
+
+def sample_inputs_binary_pwise(
+    op_info,
+    device,
+    dtype,
+    requires_grad,
+    *,
+    python_scalars=False,
+    op_kwargs=None,
+    lhs_make_tensor_kwargs=None,
+    rhs_make_tensor_kwargs=None,
+    **kwargs,
+):
+    op_kwargs, lhs_make_tensor_kwargs, rhs_make_tensor_kwargs = _resolve_binay_pwise_kwargs(
+        op_info,
+        op_kwargs=op_kwargs,
+        lhs_make_tensor_kwargs=lhs_make_tensor_kwargs,
+        rhs_make_tensor_kwargs=rhs_make_tensor_kwargs,
+    )
+
+    scalar = make_tensor((), device=device, dtype=dtype, **rhs_make_tensor_kwargs)
+    if python_scalars:
+        scalar = scalar.item()  # type: ignore[assignment]
+
+    shapes = [
+        ((), scalar),
+        ((S,), scalar),
+        ((S, 1), (S,)),
+        ((M, S), scalar),
+        ((S, M, S), (M, S)),
+        ((S, M, S), (S, M, S)),
+        ((M, 1, S), (M, S)),
+        ((M, 1, S), (1, M, S)),
     ]
-    test_cases = tests_list + tests_with_lhs_broadcasting  # type: ignore[operator]
-    samples = []
-    for first_shape, shape_or_scalar, broadcasts_input in test_cases:
-        arg = shape_or_scalar
-
-        if isinstance(shape_or_scalar, tuple):
-            exclude_zero = kwargs.get('rhs_exclude_zero', False)
-            arg = make_tensor(shape_or_scalar, device=device, dtype=dtype,
-                              requires_grad=requires_grad, exclude_zero=exclude_zero)
-        samples.append(SampleInput(make_tensor(first_shape, device=device, dtype=dtype,
-                                               requires_grad=requires_grad),
-                                   args=(arg,), kwargs=extra_kwargs,
-                                   broadcasts_input=broadcasts_input))
-    # Adds an extra sample using "alpha" if it's passed in kwargs
-    if 'alpha' in kwargs:
-        a = make_tensor((S, S, S), device=device, dtype=dtype, requires_grad=requires_grad)
-        b = make_tensor((S, S, S), device=device, dtype=dtype, requires_grad=requires_grad)
-        extra_kwargs['alpha'] = kwargs['alpha']
-        sample = SampleInput(a, args=(b,), kwargs=extra_kwargs)
-        samples.append(sample)
-    return tuple(samples)
+
+    sample_inputs = []
+    for shape_lhs, shape_rhs_or_scalar in shapes:
+        lhs = make_tensor(
+            shape_lhs,
+            device=device,
+            dtype=dtype,
+            requires_grad=requires_grad,
+            **lhs_make_tensor_kwargs,
+        )
+        if isinstance(shape_rhs_or_scalar, tuple):
+            # shape
+            rhs = make_tensor(
+                shape_rhs_or_scalar,
+                device=device,
+                dtype=dtype,
+                requires_grad=requires_grad,
+                **rhs_make_tensor_kwargs,
+            )
+            broadcasts_input = torch.broadcast_shapes(shape_lhs, shape_rhs_or_scalar) != shape_lhs
+        else:
+            # scalar
+            rhs = shape_rhs_or_scalar  # type: ignore[assignment]
+            broadcasts_input = False
+
+        sample_inputs.append(SampleInput(lhs, args=(rhs,), kwargs=op_kwargs, broadcasts_input=broadcasts_input))
+    return sample_inputs
+
+
+def sample_inputs_add_sub(
+    op_info,
+    device,
+    dtype,
+    requires_grad,
+    python_scalars=False,
+    alpha=1,
+    op_kwargs=None,
+    lhs_make_tensor_kwargs=None,
+    rhs_make_tensor_kwargs=None,
+    **kwargs,
+):
+    op_kwargs, lhs_make_tensor_kwargs, rhs_make_tensor_kwargs = _resolve_binay_pwise_kwargs(
+        op_info,
+        op_kwargs=op_kwargs,
+        lhs_make_tensor_kwargs=lhs_make_tensor_kwargs,
+        rhs_make_tensor_kwargs=rhs_make_tensor_kwargs,
+    )
+
+    sample_inputs = sample_inputs_binary_pwise(
+        op_info,
+        device,
+        dtype,
+        requires_grad,
+        python_scalars=python_scalars,
+        op_kwargs=op_kwargs,
+        lhs_make_tensor_kwargs=lhs_make_tensor_kwargs,
+        rhs_make_tensor_kwargs=rhs_make_tensor_kwargs,
+        **kwargs,
+    )
+
+    lhs = make_tensor((S, S), device=device, dtype=dtype, requires_grad=requires_grad, **lhs_make_tensor_kwargs)
+    rhs = make_tensor((S, S), device=device, dtype=dtype, requires_grad=requires_grad, **rhs_make_tensor_kwargs)
+    sample_inputs.append(SampleInput(lhs, args=(rhs,), kwargs=dict(op_kwargs, alpha=alpha), broadcasts_input=False))
+
+    return sample_inputs
 
 
 def sample_inputs_t(op_info, device, dtype, requires_grad, **kwargs):
@@ -4045,19 +4143,6 @@ def sample_inputs_logit(op_info, device, dtype, requires_grad, **kwargs):
 
     return samples
 
-def sample_inputs_floor_divide(op_info, device, dtype, requires_grad, **kwargs):
-    lhs = make_tensor((S, S, S), device, dtype, low=None, high=None, requires_grad=requires_grad)
-    rhs = make_tensor((S, S, S), device, dtype, low=None, high=None, requires_grad=requires_grad)
-    # Avoid integer divide by 0
-    if not (dtype.is_floating_point or dtype.is_complex):
-        rhs[rhs == 0] = 1
-
-    return [
-        SampleInput(lhs, args=(rhs,)),
-        SampleInput(lhs, args=(rhs[0],)),
-        SampleInput(lhs, args=(3.14,)),
-    ]
-
 def sample_inputs_isin(op_info, device, dtype, requires_grad):
     element = make_tensor((L,), device, dtype, low=None, high=None, requires_grad=requires_grad)
     indices = torch.randint(0, L, size=[S])
@@ -5452,29 +5537,29 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                        SkipInfo('TestGradients', 'test_forward_mode_AD',
                                 dtypes=[torch.cdouble]),
                    )),
-    OpInfo('add',
-           # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate
-           ref=lambda input, other, *, alpha=1: np.add(input, np.multiply(alpha, other)),
-           dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
-           assert_autodiffed=True,
-           sample_inputs_func=partial(sample_inputs_binary_pwise, alpha=2),
-           supports_inplace_autograd=False,
-           supports_forward_ad=True),
-    OpInfo('mul',
-           aliases=('multiply',),
-           dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16, torch.bool),
-           assert_autodiffed=True,
-           supports_forward_ad=True,
-           sample_inputs_func=sample_inputs_binary_pwise),
-    OpInfo('sub',
-           # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate
-           ref=lambda input, other, *, alpha=1: np.subtract(input, np.multiply(alpha, other)),
-           aliases=('subtract',),
-           dtypes=all_types_and_complex_and(torch.bfloat16, torch.float16),
-           assert_autodiffed=True,
-           supports_forward_ad=True,
-           sample_inputs_func=partial(sample_inputs_binary_pwise, alpha=2),
-           supports_inplace_autograd=False),
+    BinaryUfuncInfo('add',
+                    # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate
+                    ref=lambda input, other, *, alpha=1: np.add(input, np.multiply(alpha, other)),
+                    dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
+                    assert_autodiffed=True,
+                    sample_inputs_func=partial(sample_inputs_add_sub, alpha=2),
+                    supports_inplace_autograd=False,
+                    supports_forward_ad=True),
+    BinaryUfuncInfo('mul',
+                    aliases=('multiply',),
+                    dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16, torch.bool),
+                    assert_autodiffed=True,
+                    supports_forward_ad=True,
+                    sample_inputs_func=sample_inputs_binary_pwise),
+    BinaryUfuncInfo('sub',
+                    # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate
+                    ref=lambda input, other, *, alpha=1: np.subtract(input, np.multiply(alpha, other)),
+                    aliases=('subtract',),
+                    dtypes=all_types_and_complex_and(torch.bfloat16, torch.float16),
+                    assert_autodiffed=True,
+                    supports_forward_ad=True,
+                    sample_inputs_func=partial(sample_inputs_add_sub, alpha=2),
+                    supports_inplace_autograd=False),
     OpInfo('addmm',
            # This addmm OpInfo is for when alpha and beta are not both equal to 1.
            # alpha=beta=1 is tested in the following opinfo, because that special case will
@@ -6029,41 +6114,43 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
            supports_forward_ad=True,
            sample_inputs_func=sample_inputs_diff),
-    OpInfo('div',
-           aliases=('divide',),
-           variant_test_name='no_rounding_mode',
-           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
-           sample_inputs_func=partial(sample_inputs_binary_pwise, rhs_exclude_zero=True),
-           supports_forward_ad=True,
-           assert_autodiffed=True),
-    OpInfo('div',
-           aliases=('divide',),
-           variant_test_name='trunc_rounding',
-           dtypes=all_types_and(torch.half, torch.bfloat16),
-           sample_inputs_func=partial(sample_inputs_binary_pwise, extra_kwargs={
-                                      "rounding_mode": 'trunc'}, rhs_exclude_zero=True),
-           supports_forward_ad=True,
-           skips=(
-               # Reference: https://github.com/pytorch/pytorch/issues/59174
-               SkipInfo('TestJit', 'test_variant_consistency_jit'),
-           ),
-           assert_autodiffed=True),
-    OpInfo('div',
-           aliases=('divide',),
-           variant_test_name='floor_rounding',
-           dtypes=all_types_and(torch.half, torch.bfloat16),
-           sample_inputs_func=partial(sample_inputs_binary_pwise, extra_kwargs={
-                                      "rounding_mode": 'floor'}, rhs_exclude_zero=True),
-           supports_forward_ad=True,
-           skips=(
-               # Reference: https://github.com/pytorch/pytorch/issues/59174
-               SkipInfo('TestJit', 'test_variant_consistency_jit'),
-           ),
-           assert_autodiffed=True),
-    OpInfo('true_divide',
-           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
-           supports_forward_ad=True,
-           sample_inputs_func=partial(sample_inputs_binary_pwise, rhs_exclude_zero=True)),
+    BinaryUfuncInfo('div',
+                    aliases=('divide',),
+                    variant_test_name='no_rounding_mode',
+                    dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
+                    sample_inputs_func=sample_inputs_binary_pwise,
+                    supports_forward_ad=True,
+                    assert_autodiffed=True,
+                    rhs_make_tensor_kwargs=dict(exclude_zero=True)),
+    BinaryUfuncInfo('div',
+                    aliases=('divide',),
+                    variant_test_name='trunc_rounding',
+                    dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
+                    sample_inputs_func=partial(sample_inputs_binary_pwise, rounding_mode="trunc"),
+                    supports_forward_ad=True,
+                    skips=(
+                        # Reference: https://github.com/pytorch/pytorch/issues/59174
+                        SkipInfo('TestJit', 'test_variant_consistency_jit'),
+                    ),
+                    assert_autodiffed=True,
+                    rhs_make_tensor_kwargs=dict(exclude_zero=True)),
+    BinaryUfuncInfo('div',
+                    aliases=('divide',),
+                    variant_test_name='floor_rounding',
+                    dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
+                    sample_inputs_func=partial(sample_inputs_binary_pwise, rounding_mode="floor"),
+                    supports_forward_ad=True,
+                    skips=(
+                        # Reference: https://github.com/pytorch/pytorch/issues/59174
+                        SkipInfo('TestJit', 'test_variant_consistency_jit'),
+                    ),
+                    assert_autodiffed=True,
+                    rhs_make_tensor_kwargs=dict(exclude_zero=True)),
+    BinaryUfuncInfo('true_divide',
+                    dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
+                    supports_forward_ad=True,
+                    sample_inputs_func=sample_inputs_binary_pwise,
+                    rhs_make_tensor_kwargs=dict(exclude_zero=True)),
     UnaryUfuncInfo('exp',
                    ref=np_unary_ufunc_integer_promotion_wrapper(np.exp),
                    dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
@@ -6316,11 +6403,12 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                    dtypes=all_types_and(torch.bool, torch.bfloat16),
                    dtypesIfCUDA=all_types_and(torch.bool, torch.bfloat16, torch.float16),
                    safe_casts_outputs=True),
-    OpInfo('floor_divide',
-           dtypes=all_types_and(torch.half, torch.bfloat16),
-           sample_inputs_func=sample_inputs_floor_divide,
-           supports_autograd=False,
-           ),
+    BinaryUfuncInfo('floor_divide',
+                    dtypes=all_types_and(torch.half, torch.bfloat16),
+                    sample_inputs_func=sample_inputs_binary_pwise,
+                    supports_autograd=False,
+                    rhs_make_tensor_kwargs=dict(exclude_zero=True),
+                    ),
     UnaryUfuncInfo('frexp',
                    op=torch.frexp,
                    ref=np.frexp,
@@ -8752,6 +8840,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
 
 # Common operator groupings
 unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo)]
+binary_ufuncs = [op for op in op_db if isinstance(op, BinaryUfuncInfo)]
 spectral_funcs = [op for op in op_db if isinstance(op, SpectralFuncInfo)]
 sparse_unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo) and op.supports_sparse is True]
 shape_funcs = [op for op in op_db if isinstance(op, ShapeFuncInfo)]

From 71da1144126d07f4ac26aa372ed4f86fa3c0ba62 Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Fri, 20 Aug 2021 12:05:32 -0700
Subject: [PATCH 105/530] Revert D30426527: Adding DataLoader2 class as future
 replacement of DataLoader

Test Plan: revert-hammer

Differential Revision:
D30426527 (https://github.com/pytorch/pytorch/commit/5a7133b87fe2fd7d025d36855ed4cc06539a9299)

Original commit changeset: e5905d3364c4

fbshipit-source-id: 794d8a4e9256ccff8cf894aee10eff6adc30d502
---
 test/test_dataloader.py                     | 25 +-----
 torch/utils/data/__init__.py                |  6 +-
 torch/utils/data/dataloader_experimental.py | 89 ---------------------
 3 files changed, 3 insertions(+), 117 deletions(-)
 delete mode 100644 torch/utils/data/dataloader_experimental.py

diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 71230cfbb7a67..c68d7e2e14b33 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -13,20 +13,9 @@
 import warnings
 import tempfile
 from torch import multiprocessing as mp
-from torch.utils.data import (
-    ChainDataset,
-    ConcatDataset,
-    DataLoader,
-    DataLoader2,
-    Dataset,
-    IterableDataset,
-    Subset,
-    TensorDataset,
-    _utils
-)
+from torch.utils.data import _utils, Dataset, IterableDataset, TensorDataset, DataLoader, ConcatDataset, ChainDataset, Subset
 from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL
 from torch.utils.data.dataset import random_split
-from torch.utils.data.datapipes.iter import IterableAsDataPipe
 from torch._utils import ExceptionWrapper
 from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS,
                                                   IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest,
@@ -1945,18 +1934,6 @@ def test_excessive_thread_creation_warning(self):
             dataloader = DataLoader(self.dataset, batch_size=2, num_workers=1000)
 
 
-@unittest.skipIf(
-    TEST_WITH_TSAN,
-    "Fails with TSAN with the following error: starting new threads after multi-threaded "
-    "fork is not supported. Dying (set die_after_fork=0 to override)")
-class TestDataLoader2(TestCase):
-    def test_basics(self):
-        dp = IterableAsDataPipe(list(range(10)))
-        dl = DataLoader(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2)
-        dl2 = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2)
-        self.assertEquals(list(dl), list(dl2))
-
-
 class StringDataset(Dataset):
     def __init__(self):
         self.s = '12345'
diff --git a/torch/utils/data/__init__.py b/torch/utils/data/__init__.py
index 0af9e6193af3d..1d18b7b030894 100644
--- a/torch/utils/data/__init__.py
+++ b/torch/utils/data/__init__.py
@@ -11,9 +11,9 @@
 from torch.utils.data.dataset import (
     ChainDataset,
     ConcatDataset,
-    DataChunk,
     Dataset,
     Dataset as MapDataPipe,
+    DataChunk,
     IterableDataset,
     IterableDataset as IterDataPipe,
     Subset,
@@ -34,14 +34,11 @@
     runtime_validation,
     runtime_validation_disabled,
 )
-from torch.utils.data.dataloader_experimental import DataLoader2
-
 
 __all__ = ['BatchSampler',
            'ChainDataset',
            'ConcatDataset',
            'DataLoader',
-           'DataLoader2',
            'Dataset',
            'DistributedSampler',
            'IterDataPipe',
@@ -71,3 +68,4 @@
 ################################################################################
 # import subpackage
 ################################################################################
+from torch.utils.data import datapipes
diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py
deleted file mode 100644
index 85028afd22124..0000000000000
--- a/torch/utils/data/dataloader_experimental.py
+++ /dev/null
@@ -1,89 +0,0 @@
-
-import functools
-
-import torch.utils.data.backward_compatibility
-from torch.utils.data import DataLoader, IterDataPipe
-from torch.utils.data.datapipes.iter import IterableAsDataPipe
-
-class DataLoader2:
-    def __new__(cls,
-                dataset,
-                batch_size=1,
-                shuffle=False,
-                sampler=None,
-                batch_sampler=None,
-                num_workers=0,
-                collate_fn=None,
-                pin_memory=False,
-                drop_last=False,
-                timeout=0,
-                worker_init_fn=None,
-                *,
-                prefetch_factor=2,
-                persistent_workers=False,
-                batch_outside_worker=False):
-        if isinstance(dataset, IterDataPipe):
-            datapipe = dataset
-            if batch_sampler is not None:
-                raise Exception(
-                    'batch_sampler is not yet supported for DataPipes')
-            if sampler is not None:
-                raise Exception(
-                    'sampler is not yet supported for DataPipes')
-            if shuffle:
-                datapipe = datapipe.shuffle()
-            if batch_outside_worker and pin_memory:
-                raise Exception(
-                    'pin_memory is not yet compatible with batch_outside_worker')
-            if not batch_outside_worker:
-                if batch_size is not None:
-                    datapipe = datapipe.batch(batch_size, drop_last=drop_last)
-                    if collate_fn is None:
-                        collate_fn = torch.utils.data._utils.collate.default_collate
-
-            def sharding_worker_init_fn(worker_init_fn, worker_id):
-                if worker_init_fn is not None:
-                    worker_init_fn(worker_id)
-                torch.utils.data.backward_compatibility.worker_init_fn(
-                    worker_id)
-
-            my_worker_init_fn = functools.partial(
-                sharding_worker_init_fn, worker_init_fn)
-
-            data_loader = DataLoader(datapipe,
-                                     batch_size=None,  # Replaced by .batch DataPipe
-                                     shuffle=False,  # Replaced by .shuffle DataPipe
-                                     sampler=None,
-                                     batch_sampler=None,
-                                     num_workers=num_workers,
-                                     collate_fn=collate_fn,
-                                     pin_memory=pin_memory,
-                                     drop_last=False,  # Replaced by .batch DataPipe
-                                     timeout=timeout,
-                                     worker_init_fn=my_worker_init_fn,
-                                     prefetch_factor=prefetch_factor,
-                                     persistent_workers=persistent_workers)
-
-            if not batch_outside_worker:
-                return data_loader
-            else:
-                if collate_fn is None:
-                    collate_fn = torch.utils.data._utils.collate.default_collate
-                datapipe = IterableAsDataPipe(data_loader).batch(
-                    batch_size, drop_last=drop_last).map(collate_fn)
-                return datapipe
-
-        else:
-            return DataLoader(dataset,
-                              batch_size=batch_size,
-                              shuffle=shuffle,
-                              sampler=sampler,
-                              batch_sampler=batch_sampler,
-                              num_workers=num_workers,
-                              collate_fn=collate_fn,
-                              pin_memory=pin_memory,
-                              drop_last=drop_last,
-                              timeout=timeout,
-                              worker_init_fn=worker_init_fn,
-                              prefetch_factor=prefetch_factor,
-                              persistent_workers=persistent_workers)

From 2d671ca41b437ebe0a183d39f4c70ecd19c76a78 Mon Sep 17 00:00:00 2001
From: Pritam Damania <pritam.damania@fb.com>
Date: Fri, 20 Aug 2021 12:09:49 -0700
Subject: [PATCH 106/530] [8/N] Remove c10d/ddp fork tests. (#63454)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63454

Continuation of https://github.com/pytorch/pytorch/pull/63443, this
PR removes all fork tests from torch.distributed.
ghstack-source-id: 136285511

Test Plan: waitforbuildbot

Reviewed By: SciPioneer

Differential Revision: D30387872

fbshipit-source-id: f6d6313db126ae7b95b86f78a1e0726887c5c513
---
 .jenkins/pytorch/multigpu-test.sh             |   1 -
 .../ddp_comm_hooks/test_ddp_hooks.py          |  10 +-
 .../server/test/local_elastic_agent_test.py   |  99 +++++-----
 .../elastic/multiprocessing/api_test.py       |  28 ++-
 .../multiprocessing/errors/api_test.py        |   5 -
 .../elastic/timer/local_timer_example.py      |  11 +-
 .../elastic/timer/local_timer_test.py         |  34 ++--
 test/distributed/launcher/api_test.py         |  15 +-
 test/distributed/launcher/launch_test.py      |   9 +-
 test/distributed/launcher/run_test.py         |  53 +++---
 test/distributed/test_c10d_common.py          | 133 +++++++-------
 test/distributed/test_c10d_gloo.py            |  25 +--
 test/distributed/test_c10d_nccl.py            |   8 -
 test/distributed/test_c10d_spawn_gloo.py      | 172 +++++++++--------
 test/distributed/test_distributed_fork.py     | 113 ------------
 test/distributed/test_jit_c10d.py             |   6 +-
 test/distributed/test_launcher.py             |   7 +-
 test/distributed/test_pg_wrapper.py           | 173 +++++++++---------
 test/run_test.py                              |   6 +-
 test/test_determination.py                    |   2 -
 tools/stats/print_test_stats.py               |   1 -
 torch/distributed/CONTRIBUTING.md             |   1 -
 torch/testing/_internal/common_distributed.py |   5 +
 23 files changed, 366 insertions(+), 551 deletions(-)
 delete mode 100644 test/distributed/test_distributed_fork.py

diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh
index 2b918dad31385..76975310843c4 100755
--- a/.jenkins/pytorch/multigpu-test.sh
+++ b/.jenkins/pytorch/multigpu-test.sh
@@ -19,7 +19,6 @@ fi
 python tools/download_mnist.py --quiet -d test/cpp/api/mnist
 OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api
 time python test/run_test.py --verbose -i distributed/test_jit_c10d
-time python test/run_test.py --verbose -i distributed/test_distributed_fork
 time python test/run_test.py --verbose -i distributed/test_c10d_common
 time python test/run_test.py --verbose -i distributed/test_c10d_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_nccl
diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
index 1f78d50b604e8..7b889fdc3f1bb 100644
--- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
+++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
@@ -21,8 +21,14 @@
     requires_nccl,
     skip_if_lt_x_gpu,
 )
-from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.common_utils import (
+    run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
+)
 
+if TEST_WITH_DEV_DBG_ASAN:
+    print("Multiprocessing spawn is not compatible with dev/dbg asan", file=sys.stderr)
+    sys.exit(0)
 
 def gpus_for_rank(world_size):
     visible_devices = list(range(torch.cuda.device_count()))
@@ -57,7 +63,7 @@ def forward(self, x, rank):
 class DistributedDataParallelCommHookTest(MultiProcessTestCase):
     def setUp(self):
         super(DistributedDataParallelCommHookTest, self).setUp()
-        self._fork_processes()
+        self._spawn_processes()
 
     def tearDown(self):
         try:
diff --git a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py
index 2536b1033d56c..f8972a2be73cf 100644
--- a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py
+++ b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py
@@ -37,7 +37,6 @@
 from torch.distributed.rpc.backend_registry import BackendType
 from torch.testing._internal.common_utils import (
     TEST_WITH_DEV_DBG_ASAN,
-    TEST_WITH_TSAN,
     sandcastle_skip_if,
 )
 
@@ -406,19 +405,19 @@ def dummy_compute(self):
             self.assertEqual((100, 100), return_value.shape)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_dummy_compute_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.dummy_compute)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_dummy_compute_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.dummy_compute)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_dummy_compute_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.dummy_compute)
@@ -431,19 +430,19 @@ def run_happy_function(self):
         self.assertIsNone(res.return_values[1])
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_happy_function_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_happy_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_happy_function_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_happy_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_happy_function_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_happy_function)
@@ -465,13 +464,13 @@ def check_master_addr_port_override(self):
         self.assertIsNone(res.return_values[0])
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_check_master_addr_port_override_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.check_master_addr_port_override)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_check_master_addr_port_override_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.check_master_addr_port_override)
@@ -484,7 +483,7 @@ def run_check_env_function(self):
         self.assertFalse(res.is_failed())
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_check_env_function_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_check_env_function)
@@ -497,19 +496,19 @@ def run_function_with_return_value(self):
         self.assertEqual("foo", res.return_values[1])
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_function_with_return_value_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_function_with_return_value)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_function_with_return_value_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_function_with_return_value)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_function_with_return_value_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_function_with_return_value)
@@ -520,19 +519,19 @@ def simple_dist_sum(self):
         # _dist_sum internally checks that the sum computed is valid
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_simple_dist_sum_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.simple_dist_sum)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_simple_dist_sum_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.simple_dist_sum)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_simple_dist_sum_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.simple_dist_sum)
@@ -556,19 +555,19 @@ def run_distributed_sum_homogeneous(self):
         self.assertSetEqual(set(range(4 + 4)), ranks)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_distributed_sum_homogeneous_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_homogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_distributed_sum_homogeneous_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_homogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_distributed_sum_homogeneous_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_homogeneous)
@@ -596,19 +595,19 @@ def run_distributed_sum_heterogeneous(self):
         self.assertSetEqual(set(range(1 + 2 + 3)), ranks)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_distributed_sum_heterogeneous_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_heterogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_distributed_sum_heterogeneous_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_heterogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_distributed_sum_heterogeneous_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_heterogeneous)
@@ -636,19 +635,19 @@ def run_sad_function(self):
                 self.assertEqual(int(data["extraInfo"]["timestamp"]), failure.timestamp)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_sad_function_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_sad_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_sad_function_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_sad_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_sad_function_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_sad_function)
@@ -668,19 +667,19 @@ def run_bipolar_function(self):
         self.assertTrue(agent._total_execution_time > 0)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_bipolar_function_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.run_bipolar_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_bipolar_function_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.run_bipolar_function)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_run_bipolar_function_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_bipolar_function)
@@ -711,13 +710,13 @@ def correct_rank_assignment_heterogeneous(self):
         )
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_correct_rank_assignment_heterogeneous_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_heterogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_correct_rank_assignment_heterogeneous_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_heterogeneous)
@@ -744,13 +743,13 @@ def correct_rank_assignment_homogeneous(self):
         )
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_correct_rank_assignment_homogeneous_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_homogeneous)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_correct_rank_assignment_homogeneous_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_homogeneous)
@@ -852,13 +851,13 @@ def double_agent_fault_tolerance(self):
             self.assertEqual(0, p.exitcode)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_double_agent_fault_tolerance_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_fault_tolerance)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_double_agent_fault_tolerance_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_fault_tolerance)
@@ -905,19 +904,19 @@ def double_agent_elastic(self):
                 self.assertEqual(-signal.SIGKILL, p.exitcode)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_double_agent_elastic_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.double_agent_elastic)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_double_agent_elastic_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_elastic)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_double_agent_elastic_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_elastic)
@@ -955,19 +954,19 @@ def torch_rpc(self):
         self.assertEqual([f"{msg} from worker"], list(master_retvals.values()))
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_torch_rpc_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.torch_rpc)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_torch_rpc_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.torch_rpc)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_torch_rpc_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.torch_rpc)
@@ -993,13 +992,13 @@ def workers_drift_success(self):
                 self.assertEqual(rank, output)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_workers_drift_success_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.workers_drift_success)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_workers_drift_success_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.workers_drift_success)
@@ -1024,13 +1023,13 @@ def workers_drift_fail(self):
                 self.assertEqual(rank, output)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_workers_drift_fail_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.workers_drift_fail)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_workers_drift_fail_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.workers_drift_fail)
@@ -1047,19 +1046,19 @@ def barrier_failed(self, barrier_mock):
         barrier_mock.assert_called_once()
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_barrier_failed_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.barrier_failed)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_barrier_failed_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.barrier_failed)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_barrier_failed_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.barrier_failed)
@@ -1081,19 +1080,19 @@ def shutdown_called(self, start_processes_mock):
         pcontext_mock.close.assert_called_once()
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_shutdown_called_c10d(self):
         self.run_test_with_backend(backend="c10d", test_to_run=self.shutdown_called)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_shutdown_called_etcd(self):
         self.run_test_with_backend(backend="etcd", test_to_run=self.shutdown_called)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_shutdown_called_etcd_v2(self):
         self.run_test_with_backend(backend="etcd-v2", test_to_run=self.shutdown_called)
diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py
index c27d932e43cb9..811137a8d83b4 100644
--- a/test/distributed/elastic/multiprocessing/api_test.py
+++ b/test/distributed/elastic/multiprocessing/api_test.py
@@ -35,8 +35,8 @@
 from torch.testing._internal.common_utils import (
     NO_MULTIPROCESSING_SPAWN,
     TEST_WITH_ASAN,
-    TEST_WITH_DEV_DBG_ASAN,
     TEST_WITH_TSAN,
+    TEST_WITH_DEV_DBG_ASAN,
     IS_IN_CI,
     IS_WINDOWS,
     IS_MACOS,
@@ -223,15 +223,11 @@ def start_processes_zombie_test(
 
 
 # tests incompatible with tsan or asan
-if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS):
+if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS):
     class StartProcessesTest(unittest.TestCase):
         def setUp(self):
             self.test_dir = tempfile.mkdtemp(prefix=f"{self.__class__.__name__}_")
-
-            if NO_MULTIPROCESSING_SPAWN:  # python 2.7 doesn't have spawn
-                self._start_methods = ["fork"]
-            else:
-                self._start_methods = ["fork", "spawn"]
+            self._start_methods = ["spawn"]
 
         def tearDown(self):
             shutil.rmtree(self.test_dir)
@@ -317,7 +313,7 @@ def test_pcontext_wait(self):
                 args={0: (1,)},
                 envs={0: {}},
                 log_dir=self.log_dir(),
-                start_method="fork",
+                start_method="spawn",
             )
 
             self.assertIsNone(pc.wait(timeout=0.1, period=0.01))
@@ -332,7 +328,7 @@ def test_multiprocess_context_close(self):
                 args={0: (1,)},
                 envs={0: {}},
                 log_dir=self.log_dir(),
-                start_method="fork",
+                start_method="spawn",
             )
 
             pids = pc.pids()
@@ -387,7 +383,7 @@ def test_void_function(self):
                     self.assertEqual({0: None, 1: None}, results.return_values)
 
         @sandcastle_skip_if(
-            TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan or asan"
+            TEST_WITH_DEV_DBG_ASAN, "tests incompatible with asan"
         )
         def test_function_large_ret_val(self):
             # python multiprocessing.queue module uses pipes and actually PipedQueues
@@ -549,7 +545,7 @@ def test_multiprocessing_context_poll_raises_exception(self):
 
 
 # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows
-if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS):
+if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS):
     class StartProcessesListTest(StartProcessesTest):
         ########################################
         # start_processes as binary tests
@@ -630,7 +626,7 @@ def test_binary_redirect_and_tee(self):
                 args={0: ("hello",), 1: ("world",)},
                 envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
                 log_dir=self.log_dir(),
-                start_method="fork",
+                start_method="spawn",
                 redirects={0: Std.ERR, 1: Std.NONE},
                 tee={0: Std.OUT, 1: Std.ERR},
             )
@@ -647,7 +643,7 @@ def test_binary_redirect_and_tee(self):
 
 
 # tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows
-if not (TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN or IS_WINDOWS or IS_MACOS or IS_IN_CI):
+if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS or IS_IN_CI):
     class StartProcessesNotCITest(StartProcessesTest):
         def test_wrap_bad(self):
             none = ""
@@ -697,8 +693,8 @@ def test_binary_signal(self):
 
             failure = results.failures[0]
             self.assertNotEqual(signal.SIGSEGV, failure.exitcode)
-            if TEST_WITH_ASAN:
-                # ASAN exit code is 1.
+            if TEST_WITH_ASAN or TEST_WITH_TSAN:
+                # ASAN/TSAN exit code is 1.
                 self.assertEqual("<N/A>", failure.signal_name())
             else:
                 self.assertEqual("SIGSEGV", failure.signal_name())
@@ -714,7 +710,7 @@ def test_function_redirect_and_tee(self):
                         args={0: ("hello",), 1: ("world",)},
                         envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
                         log_dir=log_dir,
-                        start_method="fork",
+                        start_method="spawn",
                         redirects={0: Std.ERR, 1: Std.NONE},
                         tee={0: Std.OUT, 1: Std.ERR},
                     )
diff --git a/test/distributed/elastic/multiprocessing/errors/api_test.py b/test/distributed/elastic/multiprocessing/errors/api_test.py
index 14b7ab1d13970..859069004ae71 100644
--- a/test/distributed/elastic/multiprocessing/errors/api_test.py
+++ b/test/distributed/elastic/multiprocessing/errors/api_test.py
@@ -13,7 +13,6 @@
     record,
 )
 from torch.distributed.elastic.multiprocessing.errors.error_handler import _write_error
-from torch.testing._internal.common_utils import TEST_WITH_TSAN
 
 
 class SentinelError(Exception):
@@ -45,10 +44,6 @@ def read_resource_file(resource_file: str) -> str:
         return "".join(fp.readlines())
 
 
-if TEST_WITH_TSAN:
-    print("test incompatible with tsan", file=sys.stderr)
-    sys.exit(0)
-
 class ApiTest(unittest.TestCase):
     def setUp(self):
         self.test_dir = tempfile.mkdtemp(prefix=self.__class__.__name__)
diff --git a/test/distributed/elastic/timer/local_timer_example.py b/test/distributed/elastic/timer/local_timer_example.py
index 7845c4b5001e5..b52c64752e413 100644
--- a/test/distributed/elastic/timer/local_timer_example.py
+++ b/test/distributed/elastic/timer/local_timer_example.py
@@ -15,7 +15,6 @@
 import torch.multiprocessing as torch_mp
 from torch.testing._internal.common_utils import (
     TEST_WITH_DEV_DBG_ASAN,
-    TEST_WITH_TSAN,
     run_tests,
     IS_WINDOWS,
     IS_MACOS,
@@ -55,7 +54,7 @@ class LocalTimerExample(unittest.TestCase):
         unittest. As of now this will SIGSEGV.
         """
 
-        @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible")
+        @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test is asan incompatible")
         def test_torch_mp_example(self):
             # in practice set the max_interval to a larger value (e.g. 60 seconds)
             mp_queue = mp.get_context("spawn").Queue()
@@ -80,18 +79,14 @@ def test_torch_mp_example(self):
 
             server.stop()
 
-        @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible")
+        @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test is asan incompatible")
         def test_example_start_method_spawn(self):
             self._run_example_with(start_method="spawn")
 
-        # @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test is a/tsan incompatible")
+        # @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test is asan incompatible")
         # def test_example_start_method_forkserver(self):
         #     self._run_example_with(start_method="forkserver")
 
-        @sandcastle_skip_if(TEST_WITH_TSAN, "test is tsan incompatible")
-        def test_example_start_method_fork(self):
-            self._run_example_with(start_method="fork")
-
         def _run_example_with(self, start_method):
             spawn_ctx = mp.get_context(start_method)
             mp_queue = spawn_ctx.Queue()
diff --git a/test/distributed/elastic/timer/local_timer_test.py b/test/distributed/elastic/timer/local_timer_test.py
index 4c977113aa42e..f27e5939660e5 100644
--- a/test/distributed/elastic/timer/local_timer_test.py
+++ b/test/distributed/elastic/timer/local_timer_test.py
@@ -13,19 +13,28 @@
 from torch.distributed.elastic.timer.api import TimerRequest
 from torch.distributed.elastic.timer.local_timer import MultiprocessingRequestQueue
 from torch.testing._internal.common_utils import (
-    TEST_WITH_TSAN,
     run_tests,
     IS_WINDOWS,
     IS_MACOS,
-    sandcastle_skip_if,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 
 # timer is not supported on windows or macos
-if not (IS_WINDOWS or IS_MACOS):
+if not (IS_WINDOWS or IS_MACOS or TEST_WITH_DEV_DBG_ASAN):
+    # func2 should time out
+    def func2(n, mp_queue):
+        if mp_queue is not None:
+            timer.configure(timer.LocalTimerClient(mp_queue))
+        if n > 0:
+            with timer.expires(after=0.1):
+                func2(n - 1, None)
+                time.sleep(0.2)
+
     class LocalTimerTest(unittest.TestCase):
         def setUp(self):
-            self.mp_queue = mp.Queue()
+            self.ctx = mp.get_context('spawn')
+            self.mp_queue = self.ctx.Queue()
             self.max_interval = 0.01
             self.server = timer.LocalTimerServer(self.mp_queue, self.max_interval)
             self.server.start()
@@ -62,7 +71,6 @@ def test_happy_path(self):
             with timer.expires(after=0.5):
                 time.sleep(0.1)
 
-        @sandcastle_skip_if(TEST_WITH_TSAN, "test is tsan incompatible")
         def test_get_timer_recursive(self):
             """
             If a function acquires a countdown timer with default scope,
@@ -82,14 +90,7 @@ def func(n):
 
             func(4)
 
-            # func2 should time out
-            def func2(n):
-                if n > 0:
-                    with timer.expires(after=0.1):
-                        func2(n - 1)
-                        time.sleep(0.2)
-
-            p = mp.Process(target=func2, args=(2,))
+            p = self.ctx.Process(target=func2, args=(2, self.mp_queue))
             p.start()
             p.join()
             self.assertEqual(-signal.SIGKILL, p.exitcode)
@@ -102,7 +103,6 @@ def _run(mp_queue, timeout, duration):
             with timer.expires(after=timeout):
                 time.sleep(duration)
 
-        @sandcastle_skip_if(TEST_WITH_TSAN, "test is tsan incompatible")
         def test_timer(self):
             timeout = 0.1
             duration = 1
@@ -124,7 +124,7 @@ def _enqueue_on_interval(mp_queue, n, interval, sem):
 
 
 # timer is not supported on windows or macos
-if not (IS_WINDOWS or IS_MACOS):
+if not (IS_WINDOWS or IS_MACOS or TEST_WITH_DEV_DBG_ASAN):
     class MultiprocessingRequestQueueTest(unittest.TestCase):
         def test_get(self):
             mp_queue = mp.Queue()
@@ -183,7 +183,7 @@ def test_get_less_than_size(self):
 
 
 # timer is not supported on windows or macos
-if not (IS_WINDOWS or IS_MACOS):
+if not (IS_WINDOWS or IS_MACOS or TEST_WITH_DEV_DBG_ASAN):
     class LocalTimerServerTest(unittest.TestCase):
         def setUp(self):
             self.mp_queue = mp.Queue()
@@ -193,7 +193,6 @@ def setUp(self):
         def tearDown(self):
             self.server.stop()
 
-        @sandcastle_skip_if(TEST_WITH_TSAN, "test is tsan incompatible")
         def test_watchdog_call_count(self):
             """
             checks that the watchdog function ran wait/interval +- 1 times
@@ -226,7 +225,6 @@ def _valid_timer(self, pid, scope):
         def _release_timer(self, pid, scope):
             return TimerRequest(worker_id=pid, scope_id=scope, expiration_time=-1)
 
-        @sandcastle_skip_if(TEST_WITH_TSAN, "test is tsan incompatible")
         @mock.patch("os.kill")
         def test_expired_timers(self, mock_os_kill):
             """
diff --git a/test/distributed/launcher/api_test.py b/test/distributed/launcher/api_test.py
index d2bfd360f9c31..685e843c10653 100644
--- a/test/distributed/launcher/api_test.py
+++ b/test/distributed/launcher/api_test.py
@@ -31,7 +31,6 @@
 )
 from torch.testing._internal.common_utils import (
     TEST_WITH_DEV_DBG_ASAN,
-    TEST_WITH_TSAN,
     sandcastle_skip_if,
 )
 
@@ -117,7 +116,7 @@ def get_test_launch_config(
             rdzv_endpoint=endpoint,
             monitor_interval=1,
             rdzv_backend=rdzv_backend,
-            start_method="fork",
+            start_method="spawn",
             max_restarts=0,
             rdzv_configs=rdzv_configs,
         )
@@ -128,7 +127,7 @@ def check_works_ran(self, world_size: int):
         )
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_launch_script_python(self):
         nnodes = 1
@@ -145,7 +144,7 @@ def test_launch_script_python(self):
         self.check_works_ran(world_size)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_launch_script_python_local_rank_transfer(self):
         nnodes = 1
@@ -162,7 +161,7 @@ def test_launch_script_python_local_rank_transfer(self):
         self.check_works_ran(world_size)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_launch_script_bash(self):
         nnodes = 1
@@ -177,7 +176,7 @@ def test_launch_script_bash(self):
         self.check_works_ran(world_size)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_launch_function(self):
         nnodes = 1
@@ -193,7 +192,7 @@ def test_launch_function(self):
         self.assertEqual(expected_res, actual_res)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_launch_dist_sum_with_static_rdzv(self):
         nnodes = 1
@@ -224,7 +223,7 @@ def test_launch_dist_sum_with_static_rdzv(self):
         self.assertEqual(expected_res, actual_res)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_launch_elastic(self):
         nproc_per_node = 4
diff --git a/test/distributed/launcher/launch_test.py b/test/distributed/launcher/launch_test.py
index 73aed1a4ea59f..d79a18d39b995 100644
--- a/test/distributed/launcher/launch_test.py
+++ b/test/distributed/launcher/launch_test.py
@@ -15,7 +15,6 @@
 from torch.distributed.elastic.utils import get_socket_with_port
 from torch.testing._internal.common_utils import (
     TEST_WITH_DEV_DBG_ASAN,
-    TEST_WITH_TSAN,
     sandcastle_skip_if,
 )
 
@@ -36,7 +35,7 @@ def tearDown(self):
         shutil.rmtree(self.test_dir)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_launch_without_env(self):
         nnodes = 1
@@ -49,7 +48,7 @@ def test_launch_without_env(self):
             f"--nnodes={nnodes}",
             f"--nproc_per_node={nproc_per_node}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             "--master_addr=localhost",
             f"--master_port={master_port}",
             "--node_rank=0",
@@ -58,7 +57,7 @@ def test_launch_without_env(self):
         launch.main(args)
 
     @sandcastle_skip_if(
-        TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "tests incompatible with tsan and dev/dbg asan"
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
     )
     def test_launch_with_env(self):
         nnodes = 1
@@ -71,7 +70,7 @@ def test_launch_with_env(self):
             f"--nnodes={nnodes}",
             f"--nproc_per_node={nproc_per_node}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             "--master_addr=localhost",
             f"--master_port={master_port}",
             "--node_rank=0",
diff --git a/test/distributed/launcher/run_test.py b/test/distributed/launcher/run_test.py
index a63ec0382dfa4..079fea792ed02 100644
--- a/test/distributed/launcher/run_test.py
+++ b/test/distributed/launcher/run_test.py
@@ -23,7 +23,6 @@
 from torch.distributed.elastic.utils import get_socket_with_port
 from torch.testing._internal.common_utils import (
     TEST_WITH_DEV_DBG_ASAN,
-    TEST_WITH_TSAN,
     sandcastle_skip_if,
 )
 
@@ -100,7 +99,7 @@ def test_launch_user_script_python(self):
             f"--rdzv_endpoint={self._etcd_endpoint}",
             f"--rdzv_id={run_id}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             path("bin/test_script.py"),
             f"--touch_file_dir={self.test_dir}",
         ]
@@ -123,7 +122,7 @@ def test_launch_user_script_python_caffe2_bc(self):
             f"--nnodes={nnodes}",
             f"--nproc_per_node={nproc_per_node}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             "--master_addr=localhost",
             f"--master_port={master_port}",
             "--node_rank=0",
@@ -138,7 +137,7 @@ def test_launch_user_script_python_caffe2_bc(self):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_launch_user_script_bash(self):
         run_id = str(uuid.uuid4().int)
         nnodes = 1
@@ -151,7 +150,7 @@ def test_launch_user_script_bash(self):
             f"--rdzv_endpoint={self._etcd_endpoint}",
             f"--rdzv_id={run_id}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             "--no_python",
         ]
 
@@ -169,7 +168,7 @@ def test_launch_user_script_bash(self):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_launch_user_script_default_nproc(self):
         run_id = str(uuid.uuid4().int)
         nnodes = 1
@@ -180,7 +179,7 @@ def test_launch_user_script_default_nproc(self):
             f"--rdzv_endpoint={self._etcd_endpoint}",
             f"--rdzv_id={run_id}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             "--no_python",
         ]
 
@@ -198,7 +197,7 @@ def test_launch_user_script_default_nproc(self):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_launch_with_env_vars(self):
         run_id = str(uuid.uuid4().int)
         nnodes = 1
@@ -211,7 +210,7 @@ def test_launch_with_env_vars(self):
         os.environ["PET_RDZV_ENDPOINT"] = self._etcd_endpoint
         os.environ["PET_RDZV_ID"] = run_id
         os.environ["PET_MONITOR_INTERVAL"] = "1"
-        os.environ["PET_START_METHOD"] = "fork"
+        os.environ["PET_START_METHOD"] = "spawn"
         os.environ["PET_NO_PYTHON"] = "1"
 
         script_args = [path("bin/test_script.sh"), f"{self.test_dir}"]
@@ -241,7 +240,7 @@ def _test_nproc_launch_configuration(self, nproc_type, expected_number):
             f"--rdzv_endpoint={self._etcd_endpoint}",
             f"--rdzv_id={run_id}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             "--no_python",
         ]
 
@@ -256,27 +255,27 @@ def _test_nproc_launch_configuration(self, nproc_type, expected_number):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_nproc_launch_auto_configurations(self):
         self._test_nproc_launch_configuration("auto", os.cpu_count())
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_nproc_launch_number_configurations(self):
         self._test_nproc_launch_configuration("4", 4)
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_nproc_launch_unknown_configurations(self):
         with self.assertRaises(ValueError):
             self._test_nproc_launch_configuration("unknown", 4)
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     @patch("torch.cuda.is_available", return_value=True)
     @patch("torch.cuda.device_count", return_value=3)
     def test_nproc_gpu_launch_configurations(self, _mock1, _mock2):
         self._test_nproc_launch_configuration("auto", 3)
         self._test_nproc_launch_configuration("gpu", 3)
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_launch_elastic(self):
         run_id = str(uuid.uuid4().int)
         min_nodes = 1
@@ -291,7 +290,7 @@ def test_launch_elastic(self):
             f"--rdzv_endpoint={self._etcd_endpoint}",
             f"--rdzv_id={run_id}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             path("bin/test_script.py"),
             f"--touch_file_dir={self.test_dir}",
         ]
@@ -304,7 +303,7 @@ def test_launch_elastic(self):
         )
 
     @mock.patch("torch.distributed.elastic.events.record")
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_launch_elastic_worker_raise_exception(self, record_mock):
         """
         Asserts that when the worker program fails and lancher raieses exception
@@ -323,7 +322,7 @@ def test_launch_elastic_worker_raise_exception(self, record_mock):
             f"--rdzv_id={run_id}",
             "--monitor_interval=1",
             "--max_restarts=0",
-            "--start_method=fork",
+            "--start_method=spawn",
             path("bin/test_script.py"),
             "--fail",
         ]
@@ -332,7 +331,7 @@ def test_launch_elastic_worker_raise_exception(self, record_mock):
 
         record_mock.assert_called_once()
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     @mock.patch(
         "torch.distributed.elastic.agent.server.local_elastic_agent.LocalElasticAgent.run"
     )
@@ -354,7 +353,7 @@ def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run)
             f"--rdzv_id={run_id}",
             "--monitor_interval=1",
             "--max_restarts=0",
-            "--start_method=fork",
+            "--start_method=spawn",
             path("bin/test_script.py"),
             f"--touch_file_dir={self.test_dir}",
         ]
@@ -364,7 +363,7 @@ def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run)
             launch.main(args)
         record_mock.assert_called_once()
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_launch_standalone(self):
         nnodes = 1
         nproc_per_node = 4
@@ -374,7 +373,7 @@ def test_launch_standalone(self):
             f"--nproc_per_node={nproc_per_node}",
             "--standalone",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             path("bin/test_script.py"),
             f"--touch_file_dir={self.test_dir}",
         ]
@@ -386,7 +385,7 @@ def test_launch_standalone(self):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_launch_run_path(self):
         nnodes = 1
         nproc_per_node = 4
@@ -396,7 +395,7 @@ def test_launch_run_path(self):
             f"--nnodes={nnodes}",
             f"--nproc_per_node={nproc_per_node}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             path("bin/test_script.py"),
             f"--touch_file_dir={self.test_dir}",
         ]
@@ -408,7 +407,7 @@ def test_launch_run_path(self):
             {str(i) for i in range(world_size)}, set(os.listdir(self.test_dir))
         )
 
-    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with tsan and dev/dbg asan")
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
     def test_launch_elastic_multiple_agents(self):
         run_id = str(uuid.uuid4().int)
         min_nodes = 1
@@ -423,7 +422,7 @@ def test_launch_elastic_multiple_agents(self):
             f"--rdzv_endpoint={self._etcd_endpoint}",
             f"--rdzv_id={run_id}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             path("bin/test_script.py"),
             f"--touch_file_dir={self.test_dir}",
         ]
@@ -462,7 +461,7 @@ def test_launch_shutdown(self, agent_mock_cls):
             f"--nnodes={nnodes}",
             f"--nproc_per_node={nproc_per_node}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             path("bin/test_script.py"),
             f"--touch_file_dir={self.test_dir}",
         ]
diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index 6aa5c64658415..33939d093ca3f 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -28,9 +28,13 @@
     TestCase,
     load_tests,
     run_tests,
-    TEST_WITH_TSAN,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
+if TEST_WITH_DEV_DBG_ASAN:
+    print("Multiprocessing spawn is not compatible with dev/dbg asan", file=sys.stderr)
+    sys.exit(0)
+
 # load_tests from common_utils is used to automatically filter tests for
 # sharding on sandcastle. This line silences flake warnings
 load_tests = load_tests
@@ -438,37 +442,31 @@ def fut_then(fut):
         return fut.then(fut_then)
 
 
-# TSAN is not fork-safe since we're forking in a multi-threaded environment
-if not TEST_WITH_TSAN:
-
-    class DistributedDataParallelTest(
-        AbstractDistributedDataParallelTest, MultiProcessTestCase
-    ):
-        def setUp(self):
-            super(DistributedDataParallelTest, self).setUp()
-            if sys.platform == "win32":
-                self._spawn_processes()
-            else:
-                self._fork_processes()
-
-        def test_invalid_powerSGD_state(self):
-            for start_powerSGD_iter, use_error_feedback, warm_start in product(
-                [0, 1], [True, False], [True, False]
+class DistributedDataParallelTest(
+    AbstractDistributedDataParallelTest, MultiProcessTestCase
+):
+    def setUp(self):
+        super(DistributedDataParallelTest, self).setUp()
+        self._spawn_processes()
+
+    def test_invalid_powerSGD_state(self):
+        for start_powerSGD_iter, use_error_feedback, warm_start in product(
+            [0, 1], [True, False], [True, False]
+        ):
+            if not use_error_feedback and not warm_start:
+                continue
+            with self.assertRaisesRegex(
+                ValueError,
+                "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, "
+                "because PowerSGD can only be applied after the first two iterations in DDP.",
             ):
-                if not use_error_feedback and not warm_start:
-                    continue
-                with self.assertRaisesRegex(
-                    ValueError,
-                    "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, "
-                    "because PowerSGD can only be applied after the first two iterations in DDP.",
-                ):
-                    state = powerSGD.PowerSGDState(
-                        process_group=None,
-                        matrix_approximation_rank=1,
-                        start_powerSGD_iter=start_powerSGD_iter,
-                        use_error_feedback=use_error_feedback,
-                        warm_start=warm_start,
-                    )
+                state = powerSGD.PowerSGDState(
+                    process_group=None,
+                    matrix_approximation_rank=1,
+                    start_powerSGD_iter=start_powerSGD_iter,
+                    use_error_feedback=use_error_feedback,
+                    warm_start=warm_start,
+                )
 
 
 class ComputeBucketAssignmentTest(TestCase):
@@ -656,49 +654,42 @@ def _test_sequence_num_set_new_group(self, backend):
             dist.all_gather_object(obj_list, subgroup_seq, group=subgroup)
             self.assertEqual(len(set(obj_list)), 1)
 
+class CommTest(AbstractCommTest, MultiProcessTestCase):
+    def setUp(self):
+        super(CommTest, self).setUp()
+        self._spawn_processes()
 
-# TSAN is not fork-safe since we're forking in a multi-threaded environment
-if not TEST_WITH_TSAN:
+    def tearDown(self):
+        super(CommTest, self).tearDown()
+        try:
+            os.remove(self.file_name)
+        except OSError:
+            pass
 
-    class CommTest(AbstractCommTest, MultiProcessTestCase):
-        def setUp(self):
-            super(CommTest, self).setUp()
-            if sys.platform == "win32":
-                self._spawn_processes()
-            else:
-                self._fork_processes()
-
-        def tearDown(self):
-            super(CommTest, self).tearDown()
-            try:
-                os.remove(self.file_name)
-            except OSError:
-                pass
-
-        def test_distributed_debug_mode(self):
-            # Default should be off
-            default_debug_mode = dist._get_debug_mode()
-            self.assertEqual(default_debug_mode, dist._DistributedDebugLevel.OFF)
-            mapping = {
-                "OFF": dist._DistributedDebugLevel.OFF,
-                "INFO": dist._DistributedDebugLevel.INFO,
-                "DETAIL": dist._DistributedDebugLevel.DETAIL,
-            }
-            invalid_debug_modes = ["foo", 0, 1, -1]
-
-            for mode in mapping.keys():
-                os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode)
-                set_debug_mode = dist._get_debug_mode()
-                self.assertEqual(
-                    set_debug_mode,
-                    mapping[mode],
-                    f"Expected {mode} to map to {mapping[mode]} but got {set_debug_mode}",
-                )
+    def test_distributed_debug_mode(self):
+        # Default should be off
+        default_debug_mode = dist._get_debug_mode()
+        self.assertEqual(default_debug_mode, dist._DistributedDebugLevel.OFF)
+        mapping = {
+            "OFF": dist._DistributedDebugLevel.OFF,
+            "INFO": dist._DistributedDebugLevel.INFO,
+            "DETAIL": dist._DistributedDebugLevel.DETAIL,
+        }
+        invalid_debug_modes = ["foo", 0, 1, -1]
+
+        for mode in mapping.keys():
+            os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode)
+            set_debug_mode = dist._get_debug_mode()
+            self.assertEqual(
+                set_debug_mode,
+                mapping[mode],
+                f"Expected {mode} to map to {mapping[mode]} but got {set_debug_mode}",
+            )
 
-            for mode in invalid_debug_modes:
-                os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode)
-                with self.assertRaisesRegex(RuntimeError, "to be one of"):
-                    dist._get_debug_mode()
+        for mode in invalid_debug_modes:
+            os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode)
+            with self.assertRaisesRegex(RuntimeError, "to be one of"):
+                dist._get_debug_mode()
 
 
 if __name__ == "__main__":
diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index 54f29f3b11a7b..55b2948b93b71 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -43,17 +43,9 @@
     TestCase,
     run_tests,
     retry_on_connect_failures,
-    TEST_WITH_TSAN,
     sandcastle_skip,
 )
 
-if TEST_WITH_TSAN:
-    print(
-        "Skip as TSAN is not fork-safe since we're forking in a multi-threaded environment",
-        file=sys.stderr,
-    )
-    sys.exit(0)
-
 
 def simple_reduce_tests(rank, world_size):
     tests = [
@@ -218,12 +210,7 @@ def _create_process_group_gloo(self, store, rank, world_size, opts):
 
     def setUp(self):
         super(ProcessGroupGlooTest, self).setUp()
-
-        # For Windows platform, Python does not support fork, change it to spawn here.
-        if sys.platform == "win32":
-            self._spawn_processes()
-        else:
-            self._fork_processes()
+        self._spawn_processes()
 
     def opts(self, threads=2):
         opts = c10d.ProcessGroupGloo._Options()
@@ -1425,10 +1412,7 @@ class DistributedDataParallelTest(
 ):
     def setUp(self):
         super(DistributedDataParallelTest, self).setUp()
-        if sys.platform == "win32":
-            self._spawn_processes()
-        else:
-            self._fork_processes()
+        self._spawn_processes()
 
     def _test_gloo_backend(
         self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False
@@ -2197,10 +2181,7 @@ def test_forward_backward_optimizer(self):
 class CommTest(test_c10d_common.AbstractCommTest, MultiProcessTestCase):
     def setUp(self):
         super(CommTest, self).setUp()
-        if sys.platform == "win32":
-            self._spawn_processes()
-        else:
-            self._fork_processes()
+        self._spawn_processes()
 
     def tearDown(self):
         super(CommTest, self).tearDown()
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 9efebc94a9288..e42c5c6be1759 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -45,7 +45,6 @@
     retry_on_connect_failures,
     TEST_WITH_DEV_DBG_ASAN,
     TEST_WITH_ROCM,
-    TEST_WITH_TSAN,
     sandcastle_skip,
     sandcastle_skip_if,
 )
@@ -57,13 +56,6 @@
     from torch.distributed.optim.functional_adam import _FunctionalAdam
     from torch.distributed.optim.functional_adamw import _FunctionalAdamW
 
-if TEST_WITH_TSAN:
-    print(
-        "Skip as TSAN is not fork-safe since we're forking in a multi-threaded environment",
-        file=sys.stderr,
-    )
-    sys.exit(0)
-
 if TEST_WITH_DEV_DBG_ASAN:
     print(
         "Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr
diff --git a/test/distributed/test_c10d_spawn_gloo.py b/test/distributed/test_c10d_spawn_gloo.py
index 8e5e0519356cf..21f43f7ca95f6 100644
--- a/test/distributed/test_c10d_spawn_gloo.py
+++ b/test/distributed/test_c10d_spawn_gloo.py
@@ -11,7 +11,7 @@
 from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU
 from torch.testing._internal.common_distributed import requires_gloo, \
     create_device, MultiProcessTestCase, skip_if_lt_x_gpu
-from torch.testing._internal.common_utils import TestCase, run_tests, sandcastle_skip_if, TEST_WITH_TSAN, TEST_WITH_DEV_DBG_ASAN
+from torch.testing._internal.common_utils import TestCase, run_tests, sandcastle_skip_if, TEST_WITH_DEV_DBG_ASAN
 
 # Fails on Python-3.9, see https://github.com/pytorch/pytorch/issues/51619
 if sys.version_info < (3, 9):
@@ -76,102 +76,100 @@ def test_shared_allgather_chunk_gloo(self):
                 self.world_size)
 
 
-# TSAN is not fork-safe since we're forking in a multi-threaded environment
-if not TEST_WITH_TSAN:
-    class DistributedDataParallelSingleProcessTest(TestCase):
-        def setUp(self):
-            self.rank = 0
-            self.world_size = 1
-            self.file = tempfile.NamedTemporaryFile(delete=False)  # noqa: P201
-
-        def tearDown(self):
-            try:
-                os.remove(self.file.name)
-            except OSError:
-                pass
-
-        def _test_base(self, net, inp, check_allclose=True):
-            store = c10d.FileStore(self.file.name, self.world_size)
-            process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
-            if inp[0].is_cuda:
-                device_ids = [torch.cuda.current_device()]
-            else:
-                device_ids = None
+class DistributedDataParallelSingleProcessTest(TestCase):
+    def setUp(self):
+        self.rank = 0
+        self.world_size = 1
+        self.file = tempfile.NamedTemporaryFile(delete=False)  # noqa: P201
 
-            ddp = nn.parallel.DistributedDataParallel(
-                copy.deepcopy(net),
-                device_ids=device_ids,
-                process_group=process_group
-            )
+    def tearDown(self):
+        try:
+            os.remove(self.file.name)
+        except OSError:
+            pass
 
-            net_opt = torch.optim.Adam(net.parameters(), lr=0.001)
-            ddp_opt = torch.optim.Adam(ddp.parameters(), lr=0.001)
+    def _test_base(self, net, inp, check_allclose=True):
+        store = c10d.FileStore(self.file.name, self.world_size)
+        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
+        if inp[0].is_cuda:
+            device_ids = [torch.cuda.current_device()]
+        else:
+            device_ids = None
 
-            for i, j in zip(ddp.parameters(), net.parameters()):
-                self.assertTrue(i.allclose(j))
+        ddp = nn.parallel.DistributedDataParallel(
+            copy.deepcopy(net),
+            device_ids=device_ids,
+            process_group=process_group
+        )
 
-            for _ in range(10):
-                net_out = net(*inp)
-                ddp_out = ddp(*inp)
+        net_opt = torch.optim.Adam(net.parameters(), lr=0.001)
+        ddp_opt = torch.optim.Adam(ddp.parameters(), lr=0.001)
 
-                net_out.sum().backward()
-                ddp_out.sum().backward()
+        for i, j in zip(ddp.parameters(), net.parameters()):
+            self.assertTrue(i.allclose(j))
 
-                net_opt.step()
-                ddp_opt.step()
+        for _ in range(10):
+            net_out = net(*inp)
+            ddp_out = ddp(*inp)
 
-            if check_allclose:
-                for i, j in zip(ddp.parameters(), net.parameters()):
-                    self.assertTrue(i.allclose(j))
+            net_out.sum().backward()
+            ddp_out.sum().backward()
 
-        @requires_gloo()
-        def test_cpu(self):
-            self._test_base(nn.Linear(2, 2), [torch.randn(30, 2)])
+            net_opt.step()
+            ddp_opt.step()
 
-        @requires_gloo()
-        @sandcastle_skip_if(not TEST_CUDA, "At least 1 CUDA GPUS needed")
-        def test_cuda(self):
-            self._test_base(nn.Linear(2, 2).to(0), [torch.randn(30, 2).to(0)])
+        if check_allclose:
+            for i, j in zip(ddp.parameters(), net.parameters()):
+                self.assertTrue(i.allclose(j))
 
-        @requires_gloo()
-        @sandcastle_skip_if(not TEST_CUDA, "At least 1 CUDA GPUS needed")
-        def test_rnn(self):
-            # This test is inspired by the bug reported in
-            # https://github.com/pytorch/pytorch/issues/36268
-            BATCH_SIZE = 12  # Divisible by 2, 3, 4
-            INPUT_DIM = 256
-            OUTPUT_DIM = 256
-            HIDDEN_DIM = 256
-            N_LAYERS = 3
-            SEQ_LEN = 100
-
-            class Net(nn.Module):
-                def __init__(self, input_dim, hidden_dim, output_dim, hidden_layers):
-                    super(Net, self).__init__()
-                    self.input_dim = input_dim
-                    self.hidden_dim = hidden_dim
-                    self.output_dim = output_dim
-                    self.hidden_layers = hidden_layers
-
-                    self.lstm = nn.LSTM(input_dim, hidden_dim, hidden_layers, batch_first=True)
-                    self.h2o = nn.Linear(hidden_dim, output_dim)
-
-                def forward(self, x, y):
-                    self.lstm.flatten_parameters()
-                    h_t, _ = self.lstm(x)
-                    output = self.h2o(h_t)
-                    loss = nn.functional.mse_loss(output, y)
-                    return loss
-
-            net = Net(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS).to(0)
-            inp = [
-                torch.randn((BATCH_SIZE, SEQ_LEN, INPUT_DIM)).to(0),
-                torch.rand((BATCH_SIZE, SEQ_LEN, OUTPUT_DIM)).to(0)
-            ]
-
-            # Not checking result allclose as the parameter inconsistency exist
-            # prior to this change. See #37079
-            self._test_base(net, inp, check_allclose=False)
+    @requires_gloo()
+    def test_cpu(self):
+        self._test_base(nn.Linear(2, 2), [torch.randn(30, 2)])
+
+    @requires_gloo()
+    @sandcastle_skip_if(not TEST_CUDA, "At least 1 CUDA GPUS needed")
+    def test_cuda(self):
+        self._test_base(nn.Linear(2, 2).to(0), [torch.randn(30, 2).to(0)])
+
+    @requires_gloo()
+    @sandcastle_skip_if(not TEST_CUDA, "At least 1 CUDA GPUS needed")
+    def test_rnn(self):
+        # This test is inspired by the bug reported in
+        # https://github.com/pytorch/pytorch/issues/36268
+        BATCH_SIZE = 12  # Divisible by 2, 3, 4
+        INPUT_DIM = 256
+        OUTPUT_DIM = 256
+        HIDDEN_DIM = 256
+        N_LAYERS = 3
+        SEQ_LEN = 100
+
+        class Net(nn.Module):
+            def __init__(self, input_dim, hidden_dim, output_dim, hidden_layers):
+                super(Net, self).__init__()
+                self.input_dim = input_dim
+                self.hidden_dim = hidden_dim
+                self.output_dim = output_dim
+                self.hidden_layers = hidden_layers
+
+                self.lstm = nn.LSTM(input_dim, hidden_dim, hidden_layers, batch_first=True)
+                self.h2o = nn.Linear(hidden_dim, output_dim)
+
+            def forward(self, x, y):
+                self.lstm.flatten_parameters()
+                h_t, _ = self.lstm(x)
+                output = self.h2o(h_t)
+                loss = nn.functional.mse_loss(output, y)
+                return loss
+
+        net = Net(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS).to(0)
+        inp = [
+            torch.randn((BATCH_SIZE, SEQ_LEN, INPUT_DIM)).to(0),
+            torch.rand((BATCH_SIZE, SEQ_LEN, OUTPUT_DIM)).to(0)
+        ]
+
+        # Not checking result allclose as the parameter inconsistency exist
+        # prior to this change. See #37079
+        self._test_base(net, inp, check_allclose=False)
 
 
 # Skip dev-asan as torch + multiprocessing spawn have known issues
diff --git a/test/distributed/test_distributed_fork.py b/test/distributed/test_distributed_fork.py
deleted file mode 100644
index c707a313a5e47..0000000000000
--- a/test/distributed/test_distributed_fork.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import os
-import sys
-import tempfile
-from functools import wraps
-import torch
-import torch.cuda
-import torch.distributed as dist
-from torch.testing._internal.common_utils import TEST_WITH_TSAN
-
-if not dist.is_available():
-    print("Distributed not available, skipping tests", file=sys.stderr)
-    sys.exit(0)
-
-from torch.testing._internal.common_utils import TestCase, find_free_port, run_tests
-from torch.distributed.distributed_c10d import _get_default_group
-from torch.testing._internal.distributed.distributed_test import (
-    DistributedTest, TestDistBackend
-)
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-CPP_EXTENSIONS_WARNING = """
-Ninja (https://ninja-build.org) must be available to run C++ extensions tests,
-but it could not be found. Install ninja with `pip install ninja`
-or `conda install ninja`.
-"""
-
-BACKEND = os.environ["BACKEND"]
-INIT_METHOD = os.getenv("INIT_METHOD", "env://")
-
-
-def skip_if_no_ninja(func):
-
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        try:
-            import torch.utils.cpp_extension
-            torch.utils.cpp_extension.verify_ninja_availability()
-        except RuntimeError:
-            print(CPP_EXTENSIONS_WARNING)
-            return 0
-
-        return func(*args, **kwargs)
-
-    return wrapper
-
-if TEST_WITH_TSAN:
-    print("Skip as TSAN is not fork-safe since we're forking in a multi-threaded environment", file=sys.stderr)
-    sys.exit(0)
-
-if BACKEND == "gloo" or BACKEND == "nccl":
-
-    class TestDistBackendWithFork(TestDistBackend, DistributedTest._DistTestBase):
-
-        def setUp(self):
-            super().setUp()
-            self._fork_processes()
-            torch.backends.cudnn.flags(allow_tf32=False).__enter__()
-
-
-elif BACKEND == "mpi":
-    WORLD_SIZE = os.environ["WORLD_SIZE"]
-    dist.init_process_group(init_method=INIT_METHOD, backend="mpi")
-
-    class TestMPIWithFork(TestCase, DistributedTest._DistTestBase):
-        pass
-
-elif BACKEND == "test":
-    class TestBackendDynamicLoad(TestCase):
-        def setUp(self):
-            super(TestBackendDynamicLoad, self).setUp()
-
-        def _load_test_backend(self):
-            temp_dir = tempfile.mkdtemp()
-            src = "{}/../cpp_extensions/cpp_c10d_extension.cpp".format(os.path.abspath(os.path.dirname(__file__)))
-            extension = torch.utils.cpp_extension.load(
-                name="torch_test",
-                sources=[src],
-                build_directory=temp_dir
-            )
-
-        @skip_if_no_ninja
-        def test_backend_apis(self):
-            self._load_test_backend()
-
-            os.environ['WORLD_SIZE'] = '1'
-            os.environ['MASTER_ADDR'] = '127.0.0.1'
-            os.environ['MASTER_PORT'] = str(find_free_port())
-            os.environ['RANK'] = '0'
-
-            dist.init_process_group(backend='test', init_method='env://', world_size=1, rank=0)
-            self.assertEqual(dist.get_rank(), 0)
-            self.assertEqual(dist.get_world_size(), 1)
-
-            process_group = _get_default_group()
-            work = process_group.allreduce([torch.rand(1), torch.rand(1)])
-            self.assertTrue(work.wait())
-            self.assertTrue(work.is_completed())
-            self.assertTrue(work.is_success())
-
-            work = process_group.broadcast([torch.rand(1)])
-            self.assertTrue(work.wait())
-            self.assertTrue(work.is_completed())
-            self.assertTrue(work.is_success())
-
-            dist.destroy_process_group()
-
-if __name__ == "__main__":
-    assert (
-        not torch.cuda._initialized
-    ), "test_distributed must not have initialized CUDA context on main process"
-
-    run_tests()
diff --git a/test/distributed/test_jit_c10d.py b/test/distributed/test_jit_c10d.py
index be392730b3fad..65d82fb033b7d 100644
--- a/test/distributed/test_jit_c10d.py
+++ b/test/distributed/test_jit_c10d.py
@@ -6,7 +6,7 @@
 from typing import List
 
 from torch.testing._internal.common_distributed import requires_nccl, create_tcp_store
-from torch.testing._internal.common_utils import load_tests, TEST_WITH_TSAN, run_tests, sandcastle_skip_if
+from torch.testing._internal.common_utils import load_tests, run_tests, sandcastle_skip_if
 from torch.testing._internal.jit_utils import JitTestCase
 
 # load_tests from common_utils is used to automatically filter tests for
@@ -29,10 +29,6 @@ def unique_process_group_name(prefix):
     now = int(time.time() * 1000)
     return "%s_%d" % (prefix, now)
 
-if TEST_WITH_TSAN:
-    print("Skip as TSAN is not fork-safe since we're forking in a multi-threaded environment", file=sys.stderr)
-    sys.exit(0)
-
 class ProcessGroupNCCLJitTest(JitTestCase):
     MAIN_PROCESS_RANK = 0
 
diff --git a/test/distributed/test_launcher.py b/test/distributed/test_launcher.py
index 53faefba95f95..4565a266bc9ec 100644
--- a/test/distributed/test_launcher.py
+++ b/test/distributed/test_launcher.py
@@ -12,7 +12,6 @@
 
 from torch.testing._internal.common_utils import (
     TEST_WITH_DEV_DBG_ASAN,
-    TEST_WITH_TSAN,
     TestCase,
     run_tests,
 )
@@ -25,10 +24,6 @@ def path(script):
     print("Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr)
     sys.exit(0)
 
-if TEST_WITH_TSAN:
-    print("Skip as TSAN is not fork-safe since we're forking in a multi-threaded environment", file=sys.stderr)
-    sys.exit(0)
-
 class TestDistributedLaunch(TestCase):
     def test_launch_user_script(self):
         nnodes = 1
@@ -41,7 +36,7 @@ def test_launch_user_script(self):
             f"--nnodes={nnodes}",
             f"--nproc_per_node={nproc_per_node}",
             "--monitor_interval=1",
-            "--start_method=fork",
+            "--start_method=spawn",
             "--master_addr=localhost",
             f"--master_port={master_port}",
             "--node_rank=0",
diff --git a/test/distributed/test_pg_wrapper.py b/test/distributed/test_pg_wrapper.py
index de3a66712bffe..abf77d4fdaa02 100644
--- a/test/distributed/test_pg_wrapper.py
+++ b/test/distributed/test_pg_wrapper.py
@@ -20,7 +20,6 @@
 )
 from torch.testing._internal.common_utils import (
     run_tests,
-    TEST_WITH_TSAN,
     TEST_WITH_DEV_DBG_ASAN,
 )
 
@@ -28,11 +27,7 @@
 class AbstractProcessGroupWrapperTest(MultiProcessTestCase):
     def setUp(self):
         super(AbstractProcessGroupWrapperTest, self).setUp()
-        # For Windows platform, Python does not support fork, change it to spawn here.
-        if sys.platform == "win32":
-            self._spawn_processes()
-        else:
-            self._fork_processes()
+        self._spawn_processes()
 
     def _validate_error(self, exception, op_type, rank, tensor):
         err = str(exception)
@@ -291,91 +286,89 @@ def test_collective_shape_mismatch(self):
             self._test_collective_shape_mismatch(pg, use_cuda=True)
 
 
-# TSAN is not fork-safe since we're forking in a multi-threaded environment
-if not TEST_WITH_TSAN:
-    @requires_gloo()
-    class ProcessGroupGlooWrapperTest(AbstractProcessGroupWrapperTest):
-        def setUp(self):
-            super(ProcessGroupGlooWrapperTest, self).setUp()
-
-        def opts(self, threads=2, timeout=10.0):
-            opts = c10d.ProcessGroupGloo._Options()
-            opts._timeout = timeout
-            opts._devices = [create_device(interface=LOOPBACK)]
-            opts._threads = threads
-            return opts
-
-        def _create_wrapper_pg(self, with_new_group=False, timeout=10.0):
-            store = c10d.FileStore(self.file_name, self.world_size)
-            c10d.init_process_group(
-                backend="gloo", rank=self.rank, world_size=self.world_size, store=store
+@requires_gloo()
+class ProcessGroupGlooWrapperTest(AbstractProcessGroupWrapperTest):
+    def setUp(self):
+        super(ProcessGroupGlooWrapperTest, self).setUp()
+
+    def opts(self, threads=2, timeout=10.0):
+        opts = c10d.ProcessGroupGloo._Options()
+        opts._timeout = timeout
+        opts._devices = [create_device(interface=LOOPBACK)]
+        opts._threads = threads
+        return opts
+
+    def _create_wrapper_pg(self, with_new_group=False, timeout=10.0):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        c10d.init_process_group(
+            backend="gloo", rank=self.rank, world_size=self.world_size, store=store
+        )
+        if with_new_group:
+            pg = c10d.new_group(backend="gloo")
+        else:
+            _pg = c10d.ProcessGroupGloo(
+                store, self.rank, self.world_size, self.opts(timeout=timeout)
             )
-            if with_new_group:
-                pg = c10d.new_group(backend="gloo")
-            else:
-                _pg = c10d.ProcessGroupGloo(
-                    store, self.rank, self.world_size, self.opts(timeout=timeout)
-                )
-                pg = c10d._create_process_group_wrapper(
-                    _pg,
-                    "unused",
-                    store,
-                    self.rank,
-                    self.world_size,
-                    timeout=timeout,
-                )
-            return pg
-
-        def test_collective_hang(self):
-            pg = self._create_wrapper_pg(timeout=2.0)
-            self._test_collective_hang(pg)
-
-        # NOTE: these tests are separated by debug level instead of combined into
-        # one due to https://github.com/pytorch/pytorch/issues/55967, they can be
-        # combined after that is resolved.
-        @with_dist_debug_levels(levels=["DETAIL"])
-        def test_collectives_op_mismatch_debug_mode(self):
-            pg = self._create_wrapper_pg(with_new_group=True)
-            self._test_collectives_op_mismatch(pg)
-
-        @with_dist_debug_levels(levels=["OFF"])
-        def test_collectives_op_mismatch(self):
-            pg = self._create_wrapper_pg(with_new_group=False)
-            self._test_collectives_op_mismatch(pg)
-
-        @with_dist_debug_levels(levels=["DETAIL"])
-        def test_collective_shape_mismatch_debug_mode(self):
-            pg = self._create_wrapper_pg(with_new_group=True)
-            self._test_collective_shape_mismatch(pg)
-
-        @with_dist_debug_levels(levels=["OFF"])
-        def test_collective_shape_mismatch(self):
-            pg = self._create_wrapper_pg(with_new_group=False)
-            self._test_collective_shape_mismatch(pg)
-
-        @skip_if_lt_x_gpu(4)
-        @with_dist_debug_levels(levels=["DETAIL"])
-        def test_collectives_op_mismatch_cuda_debug_mode(self):
-            pg = self._create_wrapper_pg(with_new_group=True)
-            self._test_collectives_op_mismatch(pg, use_cuda=True)
-
-        @skip_if_lt_x_gpu(4)
-        @with_dist_debug_levels(levels=["OFF"])
-        def test_collectives_op_mismatch_cuda(self):
-            pg = self._create_wrapper_pg(with_new_group=False)
-            self._test_collectives_op_mismatch(pg, use_cuda=True)
-
-        @skip_if_lt_x_gpu(4)
-        @with_dist_debug_levels(levels=["DETAIL"])
-        def test_collective_shape_mismatch_cuda_debug_mode(self):
-            pg = self._create_wrapper_pg(with_new_group=True)
-            self._test_collective_shape_mismatch(pg, use_cuda=True)
-
-        @skip_if_lt_x_gpu(4)
-        @with_dist_debug_levels(levels=["OFF"])
-        def test_collective_shape_mismatch_cuda(self):
-            pg = self._create_wrapper_pg(with_new_group=False)
-            self._test_collective_shape_mismatch(pg, use_cuda=True)
+            pg = c10d._create_process_group_wrapper(
+                _pg,
+                "unused",
+                store,
+                self.rank,
+                self.world_size,
+                timeout=timeout,
+            )
+        return pg
+
+    def test_collective_hang(self):
+        pg = self._create_wrapper_pg(timeout=2.0)
+        self._test_collective_hang(pg)
+
+    # NOTE: these tests are separated by debug level instead of combined into
+    # one due to https://github.com/pytorch/pytorch/issues/55967, they can be
+    # combined after that is resolved.
+    @with_dist_debug_levels(levels=["DETAIL"])
+    def test_collectives_op_mismatch_debug_mode(self):
+        pg = self._create_wrapper_pg(with_new_group=True)
+        self._test_collectives_op_mismatch(pg)
+
+    @with_dist_debug_levels(levels=["OFF"])
+    def test_collectives_op_mismatch(self):
+        pg = self._create_wrapper_pg(with_new_group=False)
+        self._test_collectives_op_mismatch(pg)
+
+    @with_dist_debug_levels(levels=["DETAIL"])
+    def test_collective_shape_mismatch_debug_mode(self):
+        pg = self._create_wrapper_pg(with_new_group=True)
+        self._test_collective_shape_mismatch(pg)
+
+    @with_dist_debug_levels(levels=["OFF"])
+    def test_collective_shape_mismatch(self):
+        pg = self._create_wrapper_pg(with_new_group=False)
+        self._test_collective_shape_mismatch(pg)
+
+    @skip_if_lt_x_gpu(4)
+    @with_dist_debug_levels(levels=["DETAIL"])
+    def test_collectives_op_mismatch_cuda_debug_mode(self):
+        pg = self._create_wrapper_pg(with_new_group=True)
+        self._test_collectives_op_mismatch(pg, use_cuda=True)
+
+    @skip_if_lt_x_gpu(4)
+    @with_dist_debug_levels(levels=["OFF"])
+    def test_collectives_op_mismatch_cuda(self):
+        pg = self._create_wrapper_pg(with_new_group=False)
+        self._test_collectives_op_mismatch(pg, use_cuda=True)
+
+    @skip_if_lt_x_gpu(4)
+    @with_dist_debug_levels(levels=["DETAIL"])
+    def test_collective_shape_mismatch_cuda_debug_mode(self):
+        pg = self._create_wrapper_pg(with_new_group=True)
+        self._test_collective_shape_mismatch(pg, use_cuda=True)
+
+    @skip_if_lt_x_gpu(4)
+    @with_dist_debug_levels(levels=["OFF"])
+    def test_collective_shape_mismatch_cuda(self):
+        pg = self._create_wrapper_pg(with_new_group=False)
+        self._test_collective_shape_mismatch(pg, use_cuda=True)
 
 
 if __name__ == "__main__":
diff --git a/test/run_test.py b/test/run_test.py
index e043bcd0ad152..ad3cbb90b6d30 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -65,7 +65,6 @@
     'test_dataloader',
     'test_datapipe',
     'distributed/test_data_parallel',
-    'distributed/test_distributed_fork',
     'distributed/test_distributed_spawn',
     'distributions/test_constraints',
     'distributions/test_distributions',
@@ -212,7 +211,6 @@
     'distributed/rpc/test_faulty_agent',
     'distributed/rpc/test_tensorpipe_agent',
     'distributed/rpc/cuda/test_tensorpipe_agent',
-    'distributed/test_distributed_fork',
     'distributed/pipeline/sync/skip/test_api',
     'distributed/pipeline/sync/skip/test_gpipe',
     'distributed/pipeline/sync/skip/test_inspect_skip_layout',
@@ -294,7 +292,6 @@
     'test_testing',
     'test_view_ops',
     'distributed/nn/jit/test_instantiator',
-    'distributed/test_distributed_fork',
     'distributed/rpc/test_tensorpipe_agent',
     'distributed/rpc/cuda/test_tensorpipe_agent',
     'distributed/algorithms/ddp_comm_hooks/test_ddp_hooks',
@@ -576,7 +573,7 @@ def test_distributed(test_module, test_directory, options):
             os.environ['INIT_METHOD'] = 'env://'
             os.environ.update(env_vars)
             if with_init_file:
-                if test_module in ["test_distributed_fork", "test_distributed_spawn"]:
+                if test_module == "test_distributed_spawn":
                     init_method = f'{FILE_SCHEMA}{tmp_dir}/'
                 else:
                     init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file'
@@ -611,7 +608,6 @@ def test_distributed(test_module, test_directory, options):
     'test_cuda_primary_ctx': test_cuda_primary_ctx,
     'test_cpp_extensions_aot_no_ninja': test_cpp_extensions_aot_no_ninja,
     'test_cpp_extensions_aot_ninja': test_cpp_extensions_aot_ninja,
-    'distributed/test_distributed_fork': test_distributed,
     'distributed/test_distributed_spawn': test_distributed,
 }
 
diff --git a/test/test_determination.py b/test/test_determination.py
index 6d338af4b6c8f..6b7fcc0f0d242 100644
--- a/test/test_determination.py
+++ b/test/test_determination.py
@@ -16,7 +16,6 @@ class DeterminationTest(unittest.TestCase):
         "test_jit_profiling",
         "test_jit",
         "test_torch",
-        "distributed/test_distributed_fork",
         "distributed/test_distributed_spawn",
         "test_cpp_extensions_aot_ninja",
         "test_cpp_extensions_aot_no_ninja",
@@ -104,7 +103,6 @@ def test_torch_file(self):
         self.assertEqual(
             self.determined_tests(["torch/utils/cpp_extension.py"]),
             [
-                "distributed/test_distributed_fork",
                 "test_cpp_extensions_aot_ninja",
                 "test_cpp_extensions_aot_no_ninja",
                 "test_utils",
diff --git a/tools/stats/print_test_stats.py b/tools/stats/print_test_stats.py
index 7cc853e925181..71df463b14516 100755
--- a/tools/stats/print_test_stats.py
+++ b/tools/stats/print_test_stats.py
@@ -630,7 +630,6 @@ def __init__(self, name: str) -> None:
 
     def append(self, test_case: TestCase, test_type: str) -> None:
         is_multi_test = self.name == 'test_cpp_extensions_aot' or \
-            self.name == 'distributed/test_distributed_fork' or \
             self.name == 'distributed/test_distributed_spawn' or \
             self.name == 'distributed/test_c10d_gloo' or \
             self.name == 'cpp'  # The caffe2 cpp tests spawn duplicate test cases as well.
diff --git a/torch/distributed/CONTRIBUTING.md b/torch/distributed/CONTRIBUTING.md
index 5e426466ec67d..6cbaea694f215 100644
--- a/torch/distributed/CONTRIBUTING.md
+++ b/torch/distributed/CONTRIBUTING.md
@@ -85,7 +85,6 @@ python test/distributed/test_store.py
 python test/distributed/test_pg_wrapper.py
 
 # Run distributed tests, including tests for Distributed Data Parallel.
-python test/run_test.py --verbose -i distributed/test_distributed_fork
 python test/run_test.py --verbose -i distributed/test_distributed_spawn
 
 # Run the RPC test suite for the TensorPipeAgent.
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 74ed9a069604c..fb505d105980c 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -9,6 +9,7 @@
 import traceback
 import types
 import unittest
+import warnings
 from contextlib import contextmanager
 from datetime import timedelta
 from enum import Enum
@@ -468,6 +469,10 @@ def _start_processes(self, proc) -> None:
             self.processes.append(process)
 
     def _fork_processes(self) -> None:
+        warnings.warn(
+            "Fork based multiprocessing is dangerous and should not"
+            " be used, for tests with ASAN consider using opt-asan",
+            DeprecationWarning)
         proc = torch.multiprocessing.get_context("fork").Process
         self._start_processes(proc)
 

From 125e2d02e575612eb427104e7c67f1c28f090db8 Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Fri, 20 Aug 2021 12:26:58 -0700
Subject: [PATCH 107/530] Revert D30417370: [nnc] Enable CPU fusion

Test Plan: revert-hammer

Differential Revision:
D30417370 (https://github.com/pytorch/pytorch/commit/b9fc656cf26d60127bd695e4e5a7d27622f2563d)

Original commit changeset: 84ce7a578a36

fbshipit-source-id: cd23774cdc3273fd72f8a05f1900eaf36f373e6b
---
 torch/csrc/jit/codegen/fuser/interface.cpp |  8 ++++++--
 torch/csrc/jit/passes/graph_fuser.cpp      | 12 +-----------
 torch/csrc/jit/passes/graph_fuser.h        |  3 ---
 torch/csrc/jit/python/init.cpp             |  2 --
 4 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/torch/csrc/jit/codegen/fuser/interface.cpp b/torch/csrc/jit/codegen/fuser/interface.cpp
index ef7e9e0b629d5..ec67c4bd83773 100644
--- a/torch/csrc/jit/codegen/fuser/interface.cpp
+++ b/torch/csrc/jit/codegen/fuser/interface.cpp
@@ -8,12 +8,15 @@
 #include <c10/util/Flags.h>
 #include <stdexcept>
 
+C10_DEFINE_bool(torch_jit_enable_cpu_fusion, false, "enable cpu fusion");
+
 namespace torch {
 namespace jit {
 
 namespace detail {
 
-#ifdef TORCH_ENABLE_LLVM
+// Note: CPU fusion is currently disabled due to test flakiness
+#if defined(FBCODE_CAFFE2)
 bool cpu_fuser_enabled = true;
 #else
 bool cpu_fuser_enabled = false;
@@ -34,7 +37,8 @@ void runFusion(const int64_t key, Stack& stack) {
 }
 
 bool canFuseOnCPU() {
-  return fuser::hasFusionBackend(DeviceType::CPU) && detail::cpu_fuser_enabled;
+  return fuser::hasFusionBackend(DeviceType::CPU) &&
+      (detail::cpu_fuser_enabled || FLAGS_torch_jit_enable_cpu_fusion);
 }
 
 bool canFuseOnGPU() {
diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp
index 653f9fec08b32..f7dd466de4ff4 100644
--- a/torch/csrc/jit/passes/graph_fuser.cpp
+++ b/torch/csrc/jit/passes/graph_fuser.cpp
@@ -183,7 +183,7 @@ struct GraphFuser {
       return !strict_fuser_check;
     }
     if ((*device).is_cpu()) {
-      return canFuseOnCPULegacy();
+      return canFuseOnCPU();
     } else if ((*device).is_cuda()) {
       return canFuseOnGPU();
     } else if ((*device).is_xpu()) {
@@ -1244,16 +1244,6 @@ void PeepholeOptimizeShapeExpressions(Block* block, AliasDb* db) {
 
 } // anonymous namespace
 
-static bool cpu_fuser_enabled_legacy = false;
-
-bool canFuseOnCPULegacy() {
-  return cpu_fuser_enabled_legacy;
-}
-
-void overrideCanFuseOnCPULegacy(bool value) {
-  cpu_fuser_enabled_legacy = value;
-}
-
 void FuseGraph(std::shared_ptr<Graph>& graph, bool strict_fuser_check) {
   AliasDb db(graph);
   GraphFuser(&db, graph->block(), strict_fuser_check).run();
diff --git a/torch/csrc/jit/passes/graph_fuser.h b/torch/csrc/jit/passes/graph_fuser.h
index d710e5a098098..0cdcc2e20f469 100644
--- a/torch/csrc/jit/passes/graph_fuser.h
+++ b/torch/csrc/jit/passes/graph_fuser.h
@@ -5,9 +5,6 @@
 namespace torch {
 namespace jit {
 
-TORCH_API bool canFuseOnCPULegacy();
-TORCH_API void overideCanFuseOnCPULegacy(bool value);
-
 // NB: Be sure to run DCE before fusion, because dead instructions
 // can prevent fusion opportunities from being exploited.
 // On Windows will noop, NYI
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index f5da7b30c29d7..992e60edd7d19 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -589,8 +589,6 @@ void initJITBindings(PyObject* module) {
       .def("_jit_override_can_fuse_on_gpu", &overrideCanFuseOnGPU)
       .def("_jit_can_fuse_on_cpu", &canFuseOnCPU)
       .def("_jit_can_fuse_on_gpu", &canFuseOnGPU)
-      .def("_jit_can_fuse_on_cpu_legacy", &canFuseOnCPULegacy)
-      .def("_jit_override_can_fuse_on_cpu_legacy", &canFuseOnCPULegacy)
       .def(
           "_jit_differentiate",
           [](Graph& g) {

From a65d1ae7cc8c104d041f554d460da0a84c0f2d4e Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Fri, 20 Aug 2021 12:44:29 -0700
Subject: [PATCH 108/530] [ONNX] Fix controlflow shape inference with contrib
 op (#60707) (#62762)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62762

`ONNXShapeTypeInference` for node `n` is skipped if `n` is non ONNX namespace, or if `n` contains any non ONNX namespace nodes. This prevents controlflow nodes containing contrib ops from running `SpecialPostProcess`, which sets up correct node output shape/type information in rare cases.

This PR depends on opset 14 export https://github.com/pytorch/pytorch/pull/59486

Test Plan: Imported from OSS

Reviewed By: SplitInfinity

Differential Revision: D30375180

Pulled By: msaroufim

fbshipit-source-id: 5deacec39f091deb4d75ddd9e660e12fca7f16c5

Co-authored-by: BowenBao <bowbao@microsoft.com>
---
 test/onnx/test_custom_ops.py                  |  32 ++++
 .../passes/onnx/fixup_onnx_controlflow.cpp    | 145 +++++++++++++----
 .../jit/passes/onnx/fixup_onnx_controlflow.h  |   1 +
 .../jit/passes/onnx/shape_type_inference.cpp  | 150 +++++++++++-------
 4 files changed, 240 insertions(+), 88 deletions(-)

diff --git a/test/onnx/test_custom_ops.py b/test/onnx/test_custom_ops.py
index 739f267f90a95..04ac9a0066876 100644
--- a/test/onnx/test_custom_ops.py
+++ b/test/onnx/test_custom_ops.py
@@ -125,5 +125,37 @@ def symbolic_pythonop(g, n, *args, **kwargs):
         model = MyModule()
         run_model_test(self, model, input=(x, ))
 
+class TestExportAsContribOps(unittest.TestCase):
+    opset_version = 14
+    keep_initializers_as_inputs = False
+    onnx_shape_inference = True
+
+    def test_contrib_op_with_loop(self):
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.gelu = torch.nn.GELU()
+
+            def forward(self, x):
+                res = []
+                res2 = []
+                for i in range(x.size(0)):
+                    if len(res) > 0:
+                        res2.append(res[0])
+                    else:
+                        res2.append(self.gelu(x[0]))
+                    res.append(x[0])
+                return torch.stack(res), torch.stack(res2)
+
+        def symbolic_custom_gelu(g, input):
+            return g.op("com.microsoft::Gelu", input).setType(input.type())
+
+        from torch.onnx import register_custom_op_symbolic
+        register_custom_op_symbolic("::gelu", symbolic_custom_gelu, 1)
+
+        x = torch.randn(3, 3, 4, requires_grad=True)
+        model = torch.jit.script(M())
+        run_model_test(self, model, input=(x, ))
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp
index abfb547ed5e94..b0a310bfe20ad 100644
--- a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp
+++ b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp
@@ -238,9 +238,7 @@ std::vector<Value*> FixupONNXLoopNode(Node* node, int opset_version) {
   auto new_outputs = ConvertSequenceDependencies(node, opset_version);
 
   // Copy type of block output to node output.
-  for (size_t i = 0; i < node->outputs().size(); ++i) {
-    node->output(i)->setType(node->blocks().at(0)->outputs().at(i + 1)->type());
-  }
+  FixupONNXControlflowNodeOutputs(node);
   TORCH_INTERNAL_ASSERT(output_size == new_outputs.size());
   return new_outputs;
 }
@@ -347,25 +345,90 @@ void ONNXFixupUninitializedOutput(Node* node) {
           graph, else_block, else_block_output, then_block_output);
       if_node->outputs()[i]->setType(else_block->outputs()[i]->type());
     }
-    auto then_tensor_type =
-        then_block->outputs().at(i)->type()->castRaw<TensorType>();
-    auto else_tensor_type =
-        else_block->outputs().at(i)->type()->castRaw<TensorType>();
-    if (then_tensor_type && else_tensor_type) {
-      const auto& then_shape = then_tensor_type->symbolic_sizes();
-      const auto& else_shape = else_tensor_type->symbolic_sizes();
-      std::vector<::c10::ShapeSymbol> dims;
-      if (then_shape.rank() && else_shape.rank() &&
-          then_shape.rank() == else_shape.rank()) {
-        for (const auto j : c10::irange(then_shape.rank().value())) {
-          if (then_shape[j] == else_shape[j]) {
-            dims.emplace_back(then_shape[j]);
-          } else {
-            dims.emplace_back(::c10::ShapeSymbol::newSymbol());
-          }
+  }
+}
+
+void ONNXMergeIfBlockOutputShapes(Node* node) {
+  TORCH_INTERNAL_ASSERT(node->kind() == ::c10::onnx::If);
+  Block* then_block = node->blocks().at(0);
+  Block* else_block = node->blocks().at(1);
+
+  TORCH_INTERNAL_ASSERT(
+      then_block->outputs().size() == else_block->outputs().size())
+
+  auto findCommonShape =
+      [](const ::c10::SymbolicShape& a,
+         const ::c10::SymbolicShape& b) -> ::c10::SymbolicShape {
+    std::vector<::c10::ShapeSymbol> dims;
+    if (a.rank() && b.rank() && a.rank() == b.rank()) {
+      for (const auto j : c10::irange(a.rank().value())) {
+        if (a[j] == b[j]) {
+          dims.emplace_back(a[j]);
+        } else {
+          dims.emplace_back(::c10::ShapeSymbol::newSymbol());
         }
-        if_node->output(i)->setType(
-            then_tensor_type->withSymbolicShapes(::c10::SymbolicShape(dims)));
+      }
+      return ::c10::SymbolicShape(dims);
+    }
+    if (a.rank() && a.rank().value() > 0) {
+      return a;
+    }
+    if (b.rank() && b.rank().value() > 0) {
+      return b;
+    }
+
+    return ::c10::SymbolicShape();
+  };
+
+  auto mergeTensorType =
+      [&findCommonShape](TensorTypePtr a, TensorTypePtr b) -> TensorTypePtr {
+    if (a && b) {
+      const auto& a_shape = a->symbolic_sizes();
+      const auto& b_shape = b->symbolic_sizes();
+      auto commonShape = findCommonShape(a_shape, b_shape);
+      return a->withSymbolicShapes(commonShape);
+    } else if (a) {
+      return a;
+    } else if (b) {
+      return b;
+    }
+    return nullptr;
+  };
+
+  auto mergeListType = [&mergeTensorType](
+                           ListTypePtr a, ListTypePtr b) -> ListTypePtr {
+    if (a && b) {
+      auto a_tensor_type = a->getElementType()->cast<TensorType>();
+      auto b_tensor_type = b->getElementType()->cast<TensorType>();
+      auto tensor_type = mergeTensorType(a_tensor_type, b_tensor_type);
+      if (tensor_type) {
+        return a->withContained({tensor_type})->cast<ListType>();
+      }
+      // Both branches produce ListType without tensor shape.
+      return a;
+    } else if (a) {
+      return a;
+    } else if (b) {
+      return b;
+    }
+    return nullptr;
+  };
+
+  for (const auto i : c10::irange(else_block->outputs().size())) {
+    auto then_type = then_block->outputs().at(i)->type();
+    auto else_type = else_block->outputs().at(i)->type();
+    auto then_tensor_type = then_type->cast<TensorType>();
+    auto else_tensor_type = else_type->cast<TensorType>();
+    auto then_list_type = then_type->cast<ListType>();
+    auto else_list_type = else_type->cast<ListType>();
+    if (then_tensor_type || else_tensor_type) {
+      if (auto tensor_type =
+              mergeTensorType(then_tensor_type, else_tensor_type)) {
+        node->output(i)->setType(tensor_type);
+      }
+    } else if (then_list_type || else_list_type) {
+      if (auto list_type = mergeListType(then_list_type, else_list_type)) {
+        node->output(i)->setType(list_type);
       }
     }
   }
@@ -376,16 +439,13 @@ std::vector<Value*> FixupONNXIfNode(Node* node, int opset_version) {
     return node->outputs().vec();
   }
   GRAPH_DUMP("Graph before fixing controlflow: ", node->owningGraph());
-  auto* if_node = node;
   FixupONNXSubblockOutputs(node);
-  ONNXFixupUninitializedOutput(if_node);
+  ONNXFixupUninitializedOutput(node);
   // Copy type of block output to node output.
-  for (size_t i = 0; i < node->outputs().size(); ++i) {
-    node->output(i)->setType(node->blocks().at(0)->outputs().at(i)->type());
-  }
+  ONNXMergeIfBlockOutputShapes(node);
 
   GRAPH_DUMP("Graph after fixing controlflow: ", node->owningGraph());
-  return if_node->outputs().vec();
+  return node->outputs().vec();
 }
 
 std::vector<Value*> FixupONNXControlflowNode(Node* n, int opset_version) {
@@ -401,5 +461,36 @@ std::vector<Value*> FixupONNXControlflowNode(Node* n, int opset_version) {
   }
 }
 
+void FixupONNXControlflowNodeOutputs(Node* n) {
+  switch (n->kind()) {
+    case ::c10::onnx::Loop: {
+      auto loop_carried_output_size = n->blocks().at(0)->inputs().size() - 2;
+      for (auto i : c10::irange(n->outputs().size())) {
+        auto type = n->blocks().at(0)->outputs().at(i + 1)->type();
+        if (i < loop_carried_output_size) {
+          n->output(i)->setType(type);
+        } else {
+          if (auto t_type = type->cast<TensorType>()) {
+            auto sizes = t_type->symbolic_sizes().sizes();
+            if (sizes.has_value()) {
+              sizes.value().emplace(
+                  sizes.value().begin(), c10::ShapeSymbol::newSymbol());
+              type = t_type->withSymbolicShapes(sizes.value());
+            }
+          }
+          n->output(i)->setType(type);
+        }
+      }
+      break;
+    }
+    case ::c10::onnx::If: {
+      ONNXMergeIfBlockOutputShapes(n);
+      break;
+    }
+    default:
+      break;
+  }
+}
+
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.h b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.h
index fad7611085223..8d33c2dd1fb5e 100644
--- a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.h
+++ b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.h
@@ -6,6 +6,7 @@ namespace torch {
 namespace jit {
 
 std::vector<Value*> FixupONNXControlflowNode(Node* n, int opset_version);
+void FixupONNXControlflowNodeOutputs(Node* n);
 
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
index f630cf023f7b5..8ade722fb8bd9 100644
--- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
+++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
@@ -4,6 +4,7 @@
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/onnx/constant_fold.h>
 #include <torch/csrc/jit/passes/onnx/constant_map.h>
+#include <torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.h>
 #include <torch/csrc/jit/passes/onnx/fold_if_node.h>
 #include <torch/csrc/jit/passes/onnx/helper.h>
 #include <torch/csrc/jit/passes/onnx/scalar_type_analysis.h>
@@ -180,7 +181,21 @@ void UpdateTorchValueByOnnxValueInfo(
   }
 }
 
-bool IsSupportedNode(const Node* n) {
+bool IsValidONNXControlflowNode(const Node* n) {
+  // Skip when block size is zero. This is when the node is being created,
+  // and doesn't have subblocks attached yet. Run shape inference for these
+  // nodes later, when the subgraph has already completed shape inferencing.
+  auto node_kind = n->kind();
+  if (node_kind == ::c10::onnx::Loop || node_kind == ::c10::onnx::If) {
+    if (n->blocks().size() == 0) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool IsValidONNXNode(const Node* n) {
   auto node_kind = n->kind();
 
   if (!node_kind.is_onnx()) {
@@ -188,18 +203,14 @@ bool IsSupportedNode(const Node* n) {
     return false;
   }
 
-  // Skip when block size is zero. This is when the node is first created,
-  // doesn't have subblocks attached yet. Run shape inference for these nodes
-  // when the subgraph has already completed shape inferencing.
-  if (node_kind == ::c10::onnx::Loop || node_kind == ::c10::onnx::If) {
-    if (n->blocks().size() == 0) {
-      return false;
-    }
-    for (auto b : n->blocks()) {
-      for (auto b_n : b->nodes()) {
-        if (!IsSupportedNode(b_n)) {
-          return false;
-        }
+  if (!IsValidONNXControlflowNode(n)) {
+    return false;
+  }
+
+  for (auto b : n->blocks()) {
+    for (auto b_n : b->nodes()) {
+      if (!IsValidONNXNode(b_n)) {
+        return false;
       }
     }
   }
@@ -1310,6 +1321,20 @@ void SpecialPostProcess(Node* n) {
       }
       break;
     }
+    case ::c10::onnx::If: {
+      if (!IsValidONNXControlflowNode(n)) {
+        break;
+      }
+      FixupONNXControlflowNodeOutputs(n);
+      break;
+    }
+    case ::c10::onnx::Loop: {
+      if (!IsValidONNXControlflowNode(n)) {
+        break;
+      }
+      FixupONNXControlflowNodeOutputs(n);
+      break;
+    }
   }
 }
 
@@ -1391,64 +1416,67 @@ void ONNXShapeTypeInference(
     int opset_version) {
   GRAPH_UPDATE(
       "Running ONNX shape inference for node: ", n->kind().toDisplayString());
-  if (!IsSupportedNode(n)) {
-    return;
-  }
-  // Create a Graph containing only the single node n.
-  // This graph is later converted to ONNX to run shape inference.
-  auto n_graph = std::make_shared<Graph>();
-  auto clone_node = CloneNodeToGraph(n, n_graph, params_dict, opset_version);
-  n_graph->insertNode(clone_node);
+  if (IsValidONNXNode(n)) {
+    // Create a Graph containing only the single node n.
+    // This graph is later converted to ONNX to run shape inference.
+    auto n_graph = std::make_shared<Graph>();
+    auto clone_node = CloneNodeToGraph(n, n_graph, params_dict, opset_version);
+    n_graph->insertNode(clone_node);
 
-  // Register all node outputs as graph outputs.
-  for (auto output : clone_node->outputs()) {
-    n_graph->registerOutput(output);
-  }
+    // Register all node outputs as graph outputs.
+    for (auto output : clone_node->outputs()) {
+      n_graph->registerOutput(output);
+    }
 
-  // Use scalar_type_analysis without low precision cast
-  ScalarTypeAnalysisForONNX(n_graph, false, opset_version);
+    // Use scalar_type_analysis without low precision cast
+    ScalarTypeAnalysisForONNX(n_graph, false, opset_version);
 
-  GRAPH_DEBUG("Original torch graph: ", n->owningGraph()->toString());
-  GRAPH_DEBUG(
-      "Cloned torch graph to run shape inference: ", n_graph->toString());
-
-  if (IsGraphValidForInference(n_graph)) {
-    // TODO: Some ops have conversion happen at Peephole pass.
-    //       The conversion here is incomplete for these ops.
-    //       e.g: ListConstruct, ListUnpack, etc.
-    std::shared_ptr<onnx::ModelProto> model_proto;
-    SymbolDimMap symbol_map;
-    ConvertGraphToONNXProto(n_graph, model_proto, symbol_map, opset_version);
+    GRAPH_DEBUG("Original torch graph: ", n->owningGraph()->toString());
     GRAPH_DEBUG(
-        "ONNX graph to run shape inference: ", prettyPrint(*model_proto));
-
-    // infer shape
-    try {
-      onnx::shape_inference::InferShapes(*model_proto);
-      UpdateOutputTypeByONNXProto(n, clone_node, *model_proto, symbol_map);
-    } catch (std::runtime_error& ex) {
-      // TODO: include this as warning once we have a more consolidated warning
-      // system.
+        "Cloned torch graph to run shape inference: ", n_graph->toString());
+
+    if (IsGraphValidForInference(n_graph)) {
+      // TODO: Some ops have conversion happen at Peephole pass.
+      //       The conversion here is incomplete for these ops.
+      //       e.g: ListConstruct, ListUnpack, etc.
+      std::shared_ptr<onnx::ModelProto> model_proto;
+      SymbolDimMap symbol_map;
+      ConvertGraphToONNXProto(n_graph, model_proto, symbol_map, opset_version);
       GRAPH_DEBUG(
-          "ONNX shape inference fails with: ",
-          ex.what(),
-          " on graph: ",
-          n_graph->toString());
-      // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
-      const char shape_err[] = "ShapeInferenceError";
-      // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
-      const char type_err[] = "TypeInferenceError";
-      if ((strstr(ex.what(), shape_err) == nullptr) &&
-          (strstr(ex.what(), type_err) == nullptr)) {
-        throw;
+          "ONNX graph to run shape inference: ", prettyPrint(*model_proto));
+
+      // infer shape
+      try {
+        onnx::shape_inference::InferShapes(*model_proto);
+        UpdateOutputTypeByONNXProto(n, clone_node, *model_proto, symbol_map);
+      } catch (std::runtime_error& ex) {
+        // TODO: include this as warning once we have a more consolidated
+        // warning system.
+        GRAPH_DEBUG(
+            "ONNX shape inference fails with: ",
+            ex.what(),
+            " on graph: ",
+            n_graph->toString());
+        // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
+        const char shape_err[] = "ShapeInferenceError";
+        // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
+        const char type_err[] = "TypeInferenceError";
+        // NOLINTNEXTLINE(modernize-use-nullptr)
+        if ((strstr(ex.what(), shape_err) == NULL) &&
+            // NOLINTNEXTLINE(modernize-use-nullptr)
+            (strstr(ex.what(), type_err) == NULL)) {
+          throw;
+        }
       }
+      GRAPH_DEBUG(
+          "ONNX graph after shape inference: ", prettyPrint(*model_proto));
     }
-    GRAPH_DEBUG(
-        "ONNX graph after shape inference: ", prettyPrint(*model_proto));
   }
 
   SpecialPostProcess(n);
-  ProcessConstantValueMap(n, opset_version);
+  if (IsValidONNXNode(n)) {
+    ProcessConstantValueMap(n, opset_version);
+  }
   GRAPH_DEBUG(
       "Torch graph after shape inference:", n->owningGraph()->toString());
 }

From 87602549112aac84c6f36fa2b2fd18902ec6bef6 Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Fri, 20 Aug 2021 12:44:29 -0700
Subject: [PATCH 109/530] [ONNX] Fix an issue that optimizations might adjust
 graph inputs unexpectedly. (#61280) (#62763)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62763

This PR is to fix the issue that the graph inputs might be updated when we export the model in inference mode.

When a model is export in inference mode, some optimizations will be made. One side effect of these optimizations is: the inputs of graph might be adjusted. Such optimizatiosn include:

	1. Conv and BatchNorm op fusion.
	2. Do constant folding.

If the user sets export_params=False, or set keep_initializers_as_inputs=True, it's highly possible that the user wants to provide the corresponding parameters or initiliazers as the inputs of the graph.
In such situation, no matter the model is export in inference mode or training mode, exporter needs to prevent above optimizations from adjusting the graph inputs. By this, the inputs of graph could match inputs that users provided.

The changes in this PR, add an additional common judgement to see if the above optimizations needs to be done or not. From the value of export_params and keep_initializers_as_inputs arguments, infer if the graph inputs are allowed to be adjusted.
If no, these optimizations will be ignored, even other requirements are matched.

Besides these code changes, the comments of some parameters below have been updated so that users have more thoughts when they consider how to leverage these parameters for different purposes:

	1. export_params
	2. training
	3. do_constant_folding
	4. keep_initializers_as_inputs

Test Plan: Imported from OSS

Reviewed By: SplitInfinity

Differential Revision: D30375183

Pulled By: msaroufim

fbshipit-source-id: 4db8b9695649eb32a3a0fefa950ee2e5651bdba0

Co-authored-by: fatcat-z <jiz@microsoft.com>
---
 .../expect/TestOperators.test_prelu.expect    | 28 +++++-----
 ...ors.test_retain_param_name_disabled.expect | 52 ++++++++++++++-----
 torch/_C/__init__.pyi.in                      |  2 +-
 torch/csrc/jit/passes/onnx/eval_peephole.cpp  | 21 ++++++--
 torch/csrc/jit/passes/onnx/eval_peephole.h    |  3 +-
 torch/csrc/jit/python/init.cpp                |  5 +-
 torch/onnx/__init__.py                        | 22 ++++++--
 torch/onnx/utils.py                           | 19 ++++---
 8 files changed, 107 insertions(+), 45 deletions(-)

diff --git a/test/onnx/expect/TestOperators.test_prelu.expect b/test/onnx/expect/TestOperators.test_prelu.expect
index e19623cfd4460..be0328e5c61b7 100644
--- a/test/onnx/expect/TestOperators.test_prelu.expect
+++ b/test/onnx/expect/TestOperators.test_prelu.expect
@@ -2,20 +2,30 @@ ir_version: 6
 producer_name: "pytorch"
 producer_version: "CURRENT_VERSION"
 graph {
+  node {
+    input: "weight"
+    output: "2"
+    name: "Unsqueeze_0"
+    op_type: "Unsqueeze"
+    attribute {
+      name: "axes"
+      ints: 1
+      ints: 2
+      type: INTS
+    }
+  }
   node {
     input: "input"
-    input: "4"
+    input: "2"
     output: "3"
-    name: "PRelu_0"
+    name: "PRelu_1"
     op_type: "PRelu"
   }
   name: "torch-jit-export"
   initializer {
     dims: 2
-    dims: 1
-    dims: 1
     data_type: 1
-    name: "4"
+    name: "weight"
     raw_data: "\000\000\200>\000\000\200>"
   }
   input {
@@ -41,7 +51,7 @@ graph {
     }
   }
   input {
-    name: "4"
+    name: "weight"
     type {
       tensor_type {
         elem_type: 1
@@ -49,12 +59,6 @@ graph {
           dim {
             dim_value: 2
           }
-          dim {
-            dim_value: 1
-          }
-          dim {
-            dim_value: 1
-          }
         }
       }
     }
diff --git a/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect b/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect
index 5eeaa875feb0c..aa9499e27ac49 100644
--- a/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect
+++ b/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect
@@ -2,33 +2,57 @@ ir_version: 6
 producer_name: "pytorch"
 producer_version: "CURRENT_VERSION"
 graph {
+  node {
+    input: "1"
+    output: "3"
+    name: "Transpose_0"
+    op_type: "Transpose"
+    attribute {
+      name: "perm"
+      ints: 1
+      ints: 0
+      type: INTS
+    }
+  }
   node {
     input: "input.1"
-    input: "7"
+    input: "3"
     output: "4"
-    name: "MatMul_0"
+    name: "MatMul_1"
     op_type: "MatMul"
   }
+  node {
+    input: "2"
+    output: "5"
+    name: "Transpose_2"
+    op_type: "Transpose"
+    attribute {
+      name: "perm"
+      ints: 1
+      ints: 0
+      type: INTS
+    }
+  }
   node {
     input: "4"
-    input: "8"
+    input: "5"
     output: "6"
-    name: "MatMul_1"
+    name: "MatMul_3"
     op_type: "MatMul"
   }
   name: "torch-jit-export"
   initializer {
-    dims: 4
     dims: 5
+    dims: 4
     data_type: 1
-    name: "7"
+    name: "1"
     raw_data: "\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@"
   }
   initializer {
-    dims: 5
     dims: 6
+    dims: 5
     data_type: 1
-    name: "8"
+    name: "2"
     raw_data: "\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@"
   }
   input {
@@ -48,32 +72,32 @@ graph {
     }
   }
   input {
-    name: "7"
+    name: "1"
     type {
       tensor_type {
         elem_type: 1
         shape {
           dim {
-            dim_value: 4
+            dim_value: 5
           }
           dim {
-            dim_value: 5
+            dim_value: 4
           }
         }
       }
     }
   }
   input {
-    name: "8"
+    name: "2"
     type {
       tensor_type {
         elem_type: 1
         shape {
           dim {
-            dim_value: 5
+            dim_value: 6
           }
           dim {
-            dim_value: 6
+            dim_value: 5
           }
         }
       }
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index b683a60615dc5..4d0245c7786af 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -324,7 +324,7 @@ def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ...
 def _jit_pass_onnx_fold_if(graph: Graph) -> None: ...
 def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ...
 def _jit_pass_inline_fork_wait(graph: Graph) -> None: ...
-def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ...
+def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue], isAllowedToAdjustGraphInputs: _bool) -> Dict[str, IValue]: ...
 def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ...
 def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ...
 def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ...
diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.cpp b/torch/csrc/jit/passes/onnx/eval_peephole.cpp
index 05afb69ef0f23..4bad9367af444 100644
--- a/torch/csrc/jit/passes/onnx/eval_peephole.cpp
+++ b/torch/csrc/jit/passes/onnx/eval_peephole.cpp
@@ -141,14 +141,27 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) {
   }
 }
 
-void EvalPeepholeONNX(Block* b, ParamMap& paramsDict) {
+void EvalPeepholeONNX(
+    Block* b,
+    ParamMap& paramsDict,
+    bool isAllowedToAdjustGraphInputs) {
   auto valsToParamsMap = buildValueToParamsMap(b, paramsDict);
-  fuseConvBatchNorm(b, valsToParamsMap);
+
+  // Optimizations like fusing Conv and BatchNorm ops may adjust the graph
+  // inputs. If the graph inputs are not allowed to be adjusted, for example
+  // export_params is False, such optimizations will be skipped.
+  if (isAllowedToAdjustGraphInputs) {
+    fuseConvBatchNorm(b, valsToParamsMap);
+  }
+
   buildParamsMapFromValueToParamsMap(valsToParamsMap, paramsDict);
 }
 
-void EvalPeepholeONNX(std::shared_ptr<Graph>& g, ParamMap& paramsDict) {
-  EvalPeepholeONNX(g->block(), paramsDict);
+void EvalPeepholeONNX(
+    std::shared_ptr<Graph>& g,
+    ParamMap& paramsDict,
+    bool isAllowedToAdjustGraphInputs) {
+  EvalPeepholeONNX(g->block(), paramsDict, isAllowedToAdjustGraphInputs);
   GRAPH_DUMP("After EvalPeepholeONNX:", g);
 }
 
diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.h b/torch/csrc/jit/passes/onnx/eval_peephole.h
index 6f8961d08fd5e..d953f2c2e5bda 100644
--- a/torch/csrc/jit/passes/onnx/eval_peephole.h
+++ b/torch/csrc/jit/passes/onnx/eval_peephole.h
@@ -9,7 +9,8 @@ namespace jit {
 
 void EvalPeepholeONNX(
     std::shared_ptr<Graph>& g,
-    std::map<std::string, IValue>& paramDict);
+    std::map<std::string, IValue>& paramDict,
+    bool isAllowedToAdjustGraphInputs);
 
 } // namespace jit
 
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 992e60edd7d19..86b64b8342a7d 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -203,8 +203,9 @@ void initJITBindings(PyObject* module) {
       .def(
           "_jit_pass_onnx_eval_peephole",
           [](std::shared_ptr<Graph>& graph,
-             std::map<std::string, IValue>& paramsDict) {
-            EvalPeepholeONNX(graph, paramsDict);
+             std::map<std::string, IValue>& paramsDict,
+             bool isAllowedToAdjustGraphInputs) {
+            EvalPeepholeONNX(graph, paramsDict, isAllowedToAdjustGraphInputs);
             return paramsDict;
           },
           pybind11::return_value_policy::move)
diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py
index b726b2b55e8b6..e058acce1947d 100644
--- a/torch/onnx/__init__.py
+++ b/torch/onnx/__init__.py
@@ -103,11 +103,17 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM
         export_params (bool, default True): if True, all parameters will
             be exported. Set this to False if you want to export an untrained model.
             In this case, the exported model will first take all of its parameters
-            as arguments, with the ordering as specified by ``model.state_dict().values()``
+            as arguments, with the ordering as specified by ``model.state_dict().values()``.
+            This helps in stripping parameters from the model which is useful for training.
+            Besides, if this is False, any optimization that may adjust graph inputs will
+            be skipped - for example, Conv and BatchNorm fusion.
         verbose (bool, default False): if True, prints a description of the
             model being exported to stdout.
         training (enum, default TrainingMode.EVAL):
-            * ``TrainingMode.EVAL``: export the model in inference mode.
+            * ``TrainingMode.EVAL``: export the model in inference mode. In this case, optimizations
+              (e.g., fusing Conv and BatchNorm ops) may adjust graph inputs by modifying model params
+              and model param names. Such adjustment could be skipped by setting export_params = False
+              or keep_initializers_as_inputs = True.
             * ``TrainingMode.PRESERVE``: export the model in inference mode if model.training is
               False and in training mode if model.training is True.
             * ``TrainingMode.TRAINING``: export the model in training mode. Disables optimizations
@@ -184,6 +190,8 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM
         do_constant_folding (bool, default False): Apply the constant-folding optimization.
             Constant-folding will replace some of the ops that have all constant inputs
             with pre-computed constant nodes.
+            Since this optimization adjusts model initializers, it will be disabled if
+            export_params = False or keep_initializers_as_inputs = True.
         example_outputs (T or a tuple of T, where T is Tensor or convertible to Tensor, default None):
             Must be provided when exporting a ScriptModule or ScriptFunction, ignored otherwise.
             Used to determine the type and shape of the outputs without tracing the execution of
@@ -265,9 +273,13 @@ def forward(self, x):
 
         keep_initializers_as_inputs (bool, default None): If True, all the
             initializers (typically corresponding to parameters) in the
-            exported graph will also be added as inputs to the graph. If False,
-            then initializers are not added as inputs to the graph, and only
-            the non-parameter inputs are added as inputs.
+            exported graph will also be added as inputs to the graph.
+
+            If False, then initializers are not added as inputs to the graph, and only
+            the non-parameter inputs are added as inputs. Meanwhile, the optimization
+            that might adjust graph inputs will be skipped (e.g., fusing Conv and
+            BatchNorm ops), even when the user export this model in inference mode.
+
             This may allow for better optimizations (e.g. constant folding) by
             backends/runtimes.
 
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index 41ba20f3ad102..7860e38034028 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -439,7 +439,8 @@ def _model_to_graph(model, args, verbose=False,
                     example_outputs=None,
                     _retain_param_name=False, do_constant_folding=True,
                     _disable_torch_constant_prop=False, fixed_batch_size=False,
-                    training=None, dynamic_axes=None):
+                    training=None, dynamic_axes=None, export_params=True,
+                    keep_initializers_as_inputs=False):
     r"""Converts model into an ONNX graph.
 
     Returns:
@@ -498,10 +499,12 @@ def _model_to_graph(model, args, verbose=False,
 
     params_dict = _get_named_param_dict(graph, params)
 
-    if training is None or training == TrainingMode.EVAL:
-        params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict)
+    allow_adjust_graph_inputs = (export_params and not keep_initializers_as_inputs)
+    if (training is None or training == TrainingMode.EVAL):
+        params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict, allow_adjust_graph_inputs)
 
-    if do_constant_folding and _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions:
+    if do_constant_folding and allow_adjust_graph_inputs and \
+            _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions:
         params_dict = torch._C._jit_pass_onnx_constant_fold(graph, params_dict,
                                                             _export_onnx_opset_version)
         torch._C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph)
@@ -569,7 +572,9 @@ def _export_to_pretty_string(model, args, f, export_params=True, verbose=False,
                                                         output_names, operator_export_type,
                                                         example_outputs, _retain_param_name,
                                                         val_do_constant_folding, fixed_batch_size=fixed_batch_size,
-                                                        training=training)
+                                                        training=training,
+                                                        export_params=export_params,
+                                                        keep_initializers_as_inputs=val_keep_init_as_ip)
 
         return graph._pretty_print_onnx(params_dict, opset_version, False,
                                         operator_export_type, google_printer,
@@ -685,7 +690,9 @@ def _export(model, args, f, export_params=True, verbose=False, training=None,
                                 val_do_constant_folding,
                                 fixed_batch_size=fixed_batch_size,
                                 training=training,
-                                dynamic_axes=dynamic_axes)
+                                dynamic_axes=dynamic_axes,
+                                export_params=export_params,
+                                keep_initializers_as_inputs=val_keep_init_as_ip)
 
             # TODO: Don't allocate a in-memory string for the protobuf
             defer_weight_export = export_type is not ExportTypes.PROTOBUF_FILE

From db0771b05d81e9ca5e46740b09589a8ff0bc3ec0 Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Fri, 20 Aug 2021 12:44:29 -0700
Subject: [PATCH 110/530] [ONNX] Update repeat_interleave for dynamic repeats
 (#59979) (#62764)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62764

Fixes #58733

- Support dynamic interleave for cases with dynamic repeat values
- Moved repeat_interleave symbolic from opset 11 to opset 13, as sequence as output types for loop outputs is needed for this change

Test Plan: Imported from OSS

Reviewed By: SplitInfinity

Differential Revision: D30375179

Pulled By: msaroufim

fbshipit-source-id: 787f96bf91d124fd0483761088c5f4ae930d96a9

Co-authored-by: Shubham Bhokare <shubhambhokare@gmail.com>
---
 test/onnx/test_pytorch_onnx_onnxruntime.py |  61 ++++++++---
 torch/onnx/symbolic_opset11.py             | 104 ------------------
 torch/onnx/symbolic_opset13.py             | 118 ++++++++++++++++++++-
 torch/onnx/symbolic_opset9.py              |   7 +-
 4 files changed, 171 insertions(+), 119 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 67903fb0bd94c..fd1062946941c 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -4323,7 +4323,7 @@ def forward(self, x):
         x = torch.tensor([[1, 2], [3, 4]])
         self.run_test(RepeatsDimsModel2(), (x,))
 
-    @skipIfUnsupportedMinOpsetVersion(11)
+    @skipIfUnsupportedMinOpsetVersion(13)
     def test_dynamic_repeat_interleave(self):
         class SingleDynamicModel(torch.nn.Module):
             def forward(self, x):
@@ -4345,25 +4345,62 @@ def forward(self, x):
         self.run_test(NegDynamicModel(), x, test_with_inputs=[another_x],
                       input_names=["input_1"], dynamic_axes={"input_1" : {1 : "w"}})
 
-        class SingleDynamicModel2(torch.nn.Module):
+        class SingleDynamicModelFloat(torch.nn.Module):
             def forward(self, x):
                 repeats = torch.tensor([4])
                 return torch.repeat_interleave(x, repeats, dim=0)
 
-        x = torch.tensor([[1, 2], [3, 4]])
-        another_x = torch.tensor([[7, 8], [5, 6]])
-        self.run_test(SingleDynamicModel2(), x, test_with_inputs=[another_x],
+        x = torch.tensor([[1.1, 2.1], [3.1, 4.1]])
+        another_x = torch.tensor([[7.1, 8.1], [5.1, 6.1]])
+        self.run_test(SingleDynamicModelFloat(), x, test_with_inputs=[another_x],
                       input_names=["input_1"], dynamic_axes={"input_1" : {0 : "h"}})
 
-        class AllDynamicModel(torch.nn.Module):
-            def forward(self, x):
-                repeats = torch.tensor([4])
-                return torch.repeat_interleave(x, repeats, dim=0)
+        class DynamicRepeatsModel(torch.nn.Module):
+            def forward(self, x, repeats):
+                return torch.repeat_interleave(x, repeats, dim=1)
 
-        x = torch.tensor([[1, 2, 4, 16], [3, 9, 27, 81], [2, 3, 5, 7]])
+        x = torch.tensor([[1, 2, 4], [3, 4, 7]])
         another_x = torch.tensor([[7, 8], [5, 6]])
-        self.run_test(AllDynamicModel(), x, test_with_inputs=[another_x],
-                      input_names=["input_1"], dynamic_axes={"input_1" : {0 : "h", 1 : "w"}})
+        repeats = torch.tensor([2])
+        another_repeats = torch.tensor([4])
+        self.run_test(DynamicRepeatsModel(), (x, repeats), test_with_inputs=[(another_x, another_repeats)],
+                      input_names=["input_1", "repeats_1"],
+                      dynamic_axes={"input_1" : {1 : "w"}, "repeats_1" : {0 : "r"}})
+
+        class DynamicRepeatsModel2(torch.nn.Module):
+            def forward(self, x, repeats):
+                return torch.repeat_interleave(x, repeats, dim=1)
+
+        x = torch.tensor([[1, 2, 4], [3, 4, 7]])
+        repeats = torch.tensor([2])
+        another_repeats = torch.tensor([4])
+        self.run_test(DynamicRepeatsModel2(), (x, repeats), test_with_inputs=[(x, another_repeats)],
+                      input_names=["input_1", "repeats_1"],
+                      dynamic_axes={"repeats_1" : {0 : "r"}})
+
+    @skipIfUnsupportedMinOpsetVersion(13)
+    def test_multiple_dynamic_repeat_interleave(self):
+        class DynamicRepeatsModel(torch.nn.Module):
+            def forward(self, x, repeats):
+                return torch.repeat_interleave(x, repeats, dim=1)
+
+        x = torch.tensor([[1, 2, 4], [3, 4, 7]])
+        repeats = torch.tensor([2, 3, 4])
+        another_repeats = torch.tensor([4, 3, 2])
+        self.run_test(DynamicRepeatsModel(), (x, repeats), test_with_inputs=[(x, another_repeats)],
+                      input_names=["input_1", "repeats_1"],
+                      dynamic_axes={"repeats_1" : {0 : "r"}})
+
+        class DynamicRepeatsModel2(torch.nn.Module):
+            def forward(self, x, repeats):
+                return torch.repeat_interleave(x, repeats, dim=0)
+
+        x = torch.tensor([[1, 2, 4], [3, 4, 7]])
+        repeats = torch.tensor([2, 3])
+        another_repeats = torch.tensor([4, 3])
+        self.run_test(DynamicRepeatsModel2(), (x, repeats), test_with_inputs=[(x, another_repeats)],
+                      input_names=["input_1", "repeats_1"],
+                      dynamic_axes={"repeats_1" : {0 : "r"}})
 
     def test_view(self):
         class ViewModel(torch.nn.Module):
diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py
index ed7abf263f31c..53440f15928ee 100644
--- a/torch/onnx/symbolic_opset11.py
+++ b/torch/onnx/symbolic_opset11.py
@@ -894,110 +894,6 @@ def chunk(g, self, chunks, dim):
     chunk_vec = g.op("Concat", *chunk_vec, axis_i=0)
     return split(g, self, chunk_vec, dim)
 
-def repeat_interleave(g, self, repeats, dim=None, output_size=None):
-    input = self
-    final_dim = dim
-    # if dim is None flatten
-    # By default, use the flattened input array, and return a flat output array
-    if sym_help._is_none(dim):
-        input = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1])))
-        dim = 0
-    else:
-        dim = sym_help._maybe_get_scalar(dim)
-
-    repeats_dim = sym_help._get_tensor_rank(repeats)
-    repeats_sizes = sym_help._get_tensor_sizes(repeats)
-    input_sizes = sym_help._get_tensor_sizes(input)
-    if repeats_dim is None:
-        raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown "
-                           "repeats rank.")
-    if repeats_sizes is None:
-        raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown "
-                           "repeats size.")
-    if input_sizes is None:
-        raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown "
-                           "input size.")
-    # Handle cases where dim is negative
-    if dim < 0:
-        dim += len(input_sizes)
-
-    output_sizes = input_sizes.copy()
-    perm_i = [0]
-    for idx, input_size in enumerate(input_sizes):
-        perm_i.append(idx + 1)
-        if input_size is None:
-            output_sizes[idx], input_sizes[idx] = 0, -1
-    perm_i[0], perm_i[dim] = perm_i[dim], perm_i[0]
-
-    # Cases when repeats is a single value tensor and dim has unknown input size
-    if (repeats_dim == 0 or (repeats_dim == 1 and repeats_sizes[0] == 1)) and output_sizes[dim] == 0:
-        if not sym_help._is_tensor(repeats):
-            repeats = g.op("Constant", value_t=torch.LongTensor(repeats))
-        reps = sym_help._size_helper(g, input, dim)
-        reps = unsqueeze(g, reps, 0)
-        repeats = g.op("Expand", repeats, reps)
-    # There are cases when the repeats are 1-d tensor with multiple repeats, but dim
-    # provided along one of the dynamic axes provided. A simple example would be
-    # input.shape -> [1, 1, *] where * represents the dynamic axes, and dim = 2
-    # Now, repeat interleaving can be performed in pytorch when the value of * matches
-    # with the number of elements in repeat, for example if * -> 2, number of repeats
-    # should be 2 as well.
-    else:
-        return torch.onnx.symbolic_opset9.repeat_interleave(g, self, repeats, final_dim)
-
-    reps_like = g.op("ConstantOfShape", g.op("Shape", repeats),
-                     value_t=torch.tensor([1], dtype=torch.long))
-    r_splits = split(g, repeats, reps_like, 0)
-    i_splits = split(g, input, reps_like, dim)
-
-    output_sizes[dim], input_sizes[dim] = -1, 1
-
-    # Create a loop to iterate over each value along the dimension
-    # and perform individual interleaving using the repeats tensor
-    # Loop is of the following pattern
-    # input (trip_count, cond)
-    #   int trip_count = ...;
-    #   bool cond = ...;
-    #   for (int i=0; i < trip_count && cond; ++i) {
-    #     cond = ...;
-    #   }
-
-    # Loop conditions
-    loop_condition = g.op("Constant", value_t=torch.tensor(1))
-    loop_condition = g.op("Cast", loop_condition, to_i=9)
-    loop_len = reps
-    loop = g.op("Loop", loop_len, loop_condition)
-
-    # Loop inputs
-    loop_block = _add_block(loop.node())
-    block_input_iter = _add_input_to_block(loop_block)
-    cond = _add_input_to_block(loop_block)
-
-    r_split = loop_block.op("SequenceAt", r_splits, block_input_iter)
-    i_split = loop_block.op("SequenceAt", i_splits, block_input_iter)
-
-    i_split = unsqueeze(loop_block, i_split, dim + 1)
-    r_concat = [loop_block.op("Constant", value_t=torch.LongTensor(input_sizes[:dim + 1])),
-                r_split,
-                loop_block.op("Constant", value_t=torch.LongTensor(input_sizes[dim + 1:]))]
-    r_concat = loop_block.op("Concat", *r_concat, axis_i=0)
-    i_split = expand(loop_block, i_split, r_concat, None)
-    i_split = sym_help._reshape_helper(loop_block, i_split,
-                                       g.op("Constant", value_t=torch.LongTensor(output_sizes)))
-
-    # Loop outputs
-    cond_out = loop_block.op("Cast", loop_condition, to_i=9)
-    _add_output_to_block(loop_block, cond_out)
-    _add_output_to_block(loop_block, i_split)
-    loop_out = loop.node().output()
-
-    # In this loop, the outputs are scan outputs and are concatenated along
-    # the zero'th dimension (by default). In order to avoid this and concatenate
-    # along the dimension provided, some post-processing is required
-    loop_out = g.op("Transpose", loop_out, perm_i=perm_i)
-    return sym_help._reshape_helper(g, loop_out,
-                                    g.op("Constant", value_t=torch.LongTensor(output_sizes)))
-
 
 def normal(g, loc, scale, seed):
     # If you can sample from a given distribution with mean 0 and variance 1, then you can easily sample from a
diff --git a/torch/onnx/symbolic_opset13.py b/torch/onnx/symbolic_opset13.py
index 7f20833571a53..0baf785757702 100644
--- a/torch/onnx/symbolic_opset13.py
+++ b/torch/onnx/symbolic_opset13.py
@@ -5,7 +5,9 @@
 import torch
 import torch.onnx.symbolic_helper as sym_help
 from torch.onnx.symbolic_helper import parse_args, _unimplemented
-from torch.onnx.symbolic_opset9 import overload_by_arg_count, _maybe_cast_reduce_op_input, nonzero
+from torch.onnx.symbolic_opset9 import overload_by_arg_count, _maybe_cast_reduce_op_input, nonzero, expand
+from torch.onnx.symbolic_opset11 import unsqueeze
+from torch.onnx.utils import _add_block, _add_input_to_block, _add_output_to_block
 
 
 # EDITING THIS FILE? READ THIS FIRST!
@@ -196,3 +198,117 @@ def unsafe_chunk(g, self, chunks, dim, _outputs=None):
     # user's modules.
     splits = g.op("Constant", value_t=torch.tensor(splits, dtype=torch.long))
     return g.op("Split", self, splits, axis_i=dim, outputs=_outputs)
+
+def repeat_interleave(g, self, repeats, dim=None, output_size=None):
+    input = self
+    final_dim = dim
+    # if dim is None flatten
+    # By default, use the flattened input array, and return a flat output array
+    if sym_help._is_none(dim):
+        input = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1])))
+        dim = 0
+    else:
+        dim = sym_help._maybe_get_scalar(dim)
+
+    repeats_dim = sym_help._get_tensor_rank(repeats)
+    repeats_sizes = sym_help._get_tensor_sizes(repeats)
+    input_sizes = sym_help._get_tensor_sizes(input)
+    if repeats_dim is None:
+        raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown "
+                           "repeats rank.")
+    if repeats_sizes is None:
+        raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown "
+                           "repeats size.")
+    if input_sizes is None:
+        raise RuntimeError("Unsupported: ONNX export of repeat_interleave for unknown "
+                           "input size.")
+    # Handle cases where dim is negative
+    if dim < 0:
+        dim += len(input_sizes)
+
+    output_sizes = input_sizes.copy()
+    for idx, input_size in enumerate(input_sizes):
+        if input_size is None:
+            output_sizes[idx], input_sizes[idx] = 0, -1
+    print(output_sizes, input_sizes)
+
+    cond_dynamic_repeats = (repeats_dim == 1 and repeats_sizes[0] is None)
+    # If input size is dynamic or repeats vector is dynamic
+    if output_sizes[dim] == 0 or cond_dynamic_repeats:
+        reps = sym_help._size_helper(g, input, dim)
+        reps = unsqueeze(g, reps, 0)
+        # Check if repeats vector is a single integer value
+        # or a single dimension tensor with non-dynamic values
+        if repeats_dim == 0 or (repeats_dim == 1 and repeats_sizes[0] == 1):
+            if not sym_help._is_tensor(repeats):
+                repeats = g.op("Constant", value_t=torch.LongTensor(repeats))
+            repeats = g.op("Expand", repeats, reps)
+        # Check if repeats is dynamic
+        # As repeats is dynamic, we use a where node as a substitute for the if statement
+        # If repests_dim = 1, expand repeats otherwise use original tensor
+        elif cond_dynamic_repeats:
+            repeat_dim = sym_help._size_helper(g, repeats, g.op("Constant", value_t=torch.LongTensor([0])))
+            repeat_cond = g.op("Equal", repeat_dim, g.op("Constant", value_t=torch.LongTensor([1])))
+            repeats = where(g, repeat_cond, g.op("Expand", repeats, reps), repeats)
+    # There are cases when the repeats are 1-d tensor with multiple repeats, but dim
+    # provided along one of the dynamic axes provided. A simple example would be
+    # input.shape -> [1, 1, *] where * represents the dynamic axes, and dim = 2
+    # Now, repeat interleaving can be performed in pytorch when the value of * matches
+    # with the number of elements in repeat, for example if * -> 2, number of repeats
+    # should be 2 as well.
+    else:
+        return torch.onnx.symbolic_opset9.repeat_interleave(g, self, repeats, final_dim)
+
+    reps_like = g.op("ConstantOfShape", g.op("Shape", repeats),
+                     value_t=torch.tensor([1], dtype=torch.long))
+    r_splits = split(g, repeats, reps_like, 0)
+    i_splits = split(g, input, reps_like, dim)
+
+    output_sizes[dim], input_sizes[dim] = -1, 1
+
+    # Create a loop to iterate over each value along the dimension
+    # and perform individual interleaving using the repeats tensor
+    # Loop is of the following pattern
+    # input (trip_count, cond)
+    #   int trip_count = ...;
+    #   bool cond = ...;
+    #   for (int i=0; i < trip_count && cond; ++i) {
+    #     cond = ...;
+    #   }
+
+    # Loop conditions
+    loop_condition = g.op("Constant", value_t=torch.tensor(1))
+    loop_condition = g.op("Cast", loop_condition, to_i=9)
+    loop_len = reps
+
+    # Create an empty sequence to store final expansions
+    final_splits = g.op("SequenceEmpty")
+    loop = g.op("Loop", loop_len, loop_condition, final_splits)
+
+    # Loop inputs
+    loop_block = _add_block(loop.node())
+    block_input_iter = _add_input_to_block(loop_block)
+    cond = _add_input_to_block(loop_block)
+    final_splits = _add_input_to_block(loop_block)
+
+    r_split = loop_block.op("SequenceAt", r_splits, block_input_iter)
+    i_split = loop_block.op("SequenceAt", i_splits, block_input_iter)
+
+    i_split = unsqueeze(loop_block, i_split, dim + 1)
+    r_concat = [loop_block.op("Constant", value_t=torch.LongTensor(input_sizes[:dim + 1])),
+                r_split,
+                loop_block.op("Constant", value_t=torch.LongTensor(input_sizes[dim + 1:]))]
+    r_concat = loop_block.op("Concat", *r_concat, axis_i=0)
+    i_split = expand(loop_block, i_split, r_concat, None)
+    i_split = sym_help._reshape_helper(loop_block, i_split,
+                                       g.op("Constant", value_t=torch.LongTensor(output_sizes)))
+    final_splits = loop_block.op("SequenceInsert", final_splits, i_split)
+
+    # Loop outputs
+    cond_out = loop_block.op("Cast", loop_condition, to_i=9)
+    _add_output_to_block(loop_block, cond_out)
+    _add_output_to_block(loop_block, final_splits)
+
+    loop_out = loop.node().output()
+    loop_out = g.op("ConcatFromSequence", loop_out, axis_i=dim)
+    return loop_out
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 993284a292a96..ce59e15bb354d 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -2058,7 +2058,7 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None):
         if not sym_help._is_tensor(repeats):
             repeats = g.op("Constant", value_t=torch.LongTensor(repeats))
         if input_sizes[dim] == 0:
-            return sym_help._onnx_opset_unsupported_detailed("repeat_interleave", 9, 11,
+            return sym_help._onnx_opset_unsupported_detailed("repeat_interleave", 9, 13,
                                                              "Unsupported along dimension with unknown input size")
         else:
             reps = input_sizes[dim]
@@ -2067,8 +2067,11 @@ def repeat_interleave(g, self, repeats, dim=None, output_size=None):
     # Cases where repeats is a 1 dim Tensor
     elif repeats_dim == 1:
         if input_sizes[dim] == 0:
-            return sym_help._onnx_opset_unsupported_detailed("repeat_interleave", 9, 11,
+            return sym_help._onnx_opset_unsupported_detailed("repeat_interleave", 9, 13,
                                                              "Unsupported along dimension with unknown input size")
+        if repeats_sizes[0] is None:
+            return sym_help._onnx_opset_unsupported_detailed("repeat_interleave", 9, 13,
+                                                             "Unsupported for cases with dynamic repeats")
         assert repeats_sizes[0] == input_sizes[dim], "repeats must have the same size as input along dim"
         reps = repeats_sizes[0]
     else:

From 1dd648f1c40c24a3d5a151581a8129652191fa86 Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Fri, 20 Aug 2021 12:44:29 -0700
Subject: [PATCH 111/530] [ONNX] Suppport torch.dot and
 torch.nn.utils.spectral_norm (#62596) (#62765)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62765

Fixes #27723

Test Plan: Imported from OSS

Reviewed By: SplitInfinity

Differential Revision: D30375181

Pulled By: msaroufim

fbshipit-source-id: 715f4745899757ec405877980cd20c826028eb2c

Co-authored-by: BowenBao <bowbao@microsoft.com>
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 21 +++++++++++++++++++++
 torch/onnx/symbolic_opset9.py              |  4 ++++
 2 files changed, 25 insertions(+)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index fd1062946941c..865b3656dbbdf 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -5722,6 +5722,27 @@ def forward(self, input, other):
         y = torch.randint(10, (5, ))
         self.run_test(MatmulModel(), (x, y))
 
+    @skipIfUnsupportedMinOpsetVersion(9)  # MatMul long inputs is added in ONNX opset 9.
+    def test_dot(self):
+        class MatmulModel(torch.nn.Module):
+            def forward(self, input, other):
+                return torch.dot(input, other)
+
+        x = torch.randn(5, requires_grad=True)
+        y = torch.randn(5, requires_grad=True)
+        self.run_test(MatmulModel(), (x, y))
+
+        x = torch.randint(10, (5, ))
+        y = torch.randint(10, (5, ))
+        self.run_test(MatmulModel(), (x, y))
+
+    @disableScriptTest()  # SpectralNorm not TorchScript compatible.
+    def test_spectral_norm(self):
+        m = torch.nn.utils.spectral_norm(torch.nn.Linear(2, 4))
+
+        x = torch.randn(6, 2)
+        self.run_test(m, (x, ))
+
     def test_prelu(self):
         class PReluModel(torch.nn.Module):
             def __init__(self):
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index ce59e15bb354d..70bb8282570e2 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -3138,6 +3138,10 @@ def mv(g, self, vec):
     return matmul(g, self, vec)
 
 
+def dot(g, self, other):
+    return matmul(g, self, other)
+
+
 @parse_args('v', 'v')
 def fill(g, self, value):
     dtype = self.type().scalarType()

From 07e41cf2d7e4cd36443c1401e7dbb9970a50df82 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Fri, 20 Aug 2021 12:56:01 -0700
Subject: [PATCH 112/530] [easy]Unbreak caffe2benchmarking build (#63655)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63655

ghstack-source-id: 136324310

Test Plan: buck build //fbobjc/Apps/Internal/Caffe2Benchmarking:Caffe2Benchmarking fbobjc/mode/iphonesimulator

Reviewed By: hl475, JacobSzwejbka

Differential Revision: D30455659

fbshipit-source-id: b6da6be4f89b6e84753ef0849ffedea04785034a
---
 binaries/benchmark_helper.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc
index b0e1ae06be8d8..7690e356adaa0 100644
--- a/binaries/benchmark_helper.cc
+++ b/binaries/benchmark_helper.cc
@@ -16,6 +16,7 @@
 
 #include <chrono>
 #include <fstream>
+#include <iostream>
 #include <string>
 #include <thread>
 #ifdef _WIN32

From b008bb4443250276f9bfc50bb338c368f1a414cb Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Fri, 20 Aug 2021 13:13:54 -0700
Subject: [PATCH 113/530] Include iostream in ProcessGroupMPI.cpp (#63656)

Summary:
As it uses `std::cerr`, which in turn results in compilation regression introduced by https://github.com/pytorch/pytorch/pull/61500
Fixes https://github.com/pytorch/pytorch/issues/63653

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63656

Reviewed By: ejguan

Differential Revision: D30455824

Pulled By: malfet

fbshipit-source-id: 29f316e7f7fd8e7dcbee2666e7a985f25bf56515
---
 torch/csrc/distributed/c10d/ProcessGroupMPI.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp
index aa6d81bbe4a13..b75f4417e832a 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp
@@ -4,6 +4,7 @@
 
 #include <limits>
 #include <map>
+#include <iostream>
 
 #include <c10/core/DeviceGuard.h>
 #include <c10/util/irange.h>

From a8de0d83fed2d68512c0b0e20716bd63e6769469 Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Fri, 20 Aug 2021 14:00:20 -0700
Subject: [PATCH 114/530] empty caching allocator before test_avg_pool2d large
 subtest (#63528)

Summary:
Otherwise, unrecoverable OOM occurs on MI25.  Fixes broken ROCm CI test1.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63528

Reviewed By: malfet, zhouzhuojie

Differential Revision: D30459151

Pulled By: walterddr

fbshipit-source-id: 63e205c4f486fcbdd514cfb0ed8e38584f894585
---
 test/test_nn.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/test_nn.py b/test/test_nn.py
index d21e0477715db..bb109cf20e459 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -13936,6 +13936,9 @@ def helper(n, c, h, w, kernel_size, stride=None,
         helper(4, 8, 8, 8, 3, count_include_pad=False, padding=2, stride=2)
         helper(4, 8, 8, 8, 3, divisor_override=42)
         helper(4, 8, 8, 8, 7)
+        # ROCm 16GB MI25 hits OOM error. Clear caching allocator prior to running large subtest.
+        if TEST_WITH_ROCM and 'cuda' in device:
+            torch.cuda.empty_cache()
         helper(200, 512, 28, 28, 2)
         helper(4, 8, 7, 7, 3, stride=1)
         helper(4, 8, 7, 7, 3, padding=2, stride=1)

From da0820e553a1ff89dbfd37c591154e8326748fab Mon Sep 17 00:00:00 2001
From: jiayisun <jiayi.sun@intel.com>
Date: Fri, 20 Aug 2021 14:54:51 -0700
Subject: [PATCH 115/530] add BFloat16 operators on CPU: range, sinh, cosh,
 frexp, nan_to_num (#61826)

Summary:
Added BFloat16 support for range, sinh, cosh, frexp, and nan_to_num on CPU, and collected the benchmark data of these OPs(range, sinh, cosh, frexp, and nan_to_num) for BFloat16 and Float32 data type by using the operator_benchmark tool of PyTorch on the platform of Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz

Number of cores: 1 core, 28 cores(1 socket)
[cosh_sinh_benchmark.txt](https://github.com/pytorch/pytorch/files/6974313/cosh_sinh_benchmark.txt)
[frexp_benchmark.txt](https://github.com/pytorch/pytorch/files/6974315/frexp_benchmark.txt)
[nan_to_num_benchmark.txt](https://github.com/pytorch/pytorch/files/6974317/nan_to_num_benchmark.txt)
[range_benchmark.txt](https://github.com/pytorch/pytorch/files/6974318/range_benchmark.txt)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/61826

Reviewed By: saketh-are

Differential Revision: D30257259

Pulled By: VitalyFedyunin

fbshipit-source-id: 394cd713e6394050a8c90b2160633beb675d71dd
---
 aten/src/ATen/native/RangeFactories.cpp               | 4 ++--
 aten/src/ATen/native/cpu/UnaryOpsKernel.cpp           | 8 ++++----
 c10/util/BFloat16-math.h                              | 6 ++++++
 torch/testing/_internal/common_methods_invocations.py | 4 ++++
 4 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/RangeFactories.cpp b/aten/src/ATen/native/RangeFactories.cpp
index 508c157965edc..7d48c63b755ce 100644
--- a/aten/src/ATen/native/RangeFactories.cpp
+++ b/aten/src/ATen/native/RangeFactories.cpp
@@ -113,7 +113,7 @@ Tensor& logspace_cpu_out(const Scalar& start, const Scalar& end, c10::optional<i
 }
 
 Tensor& range_cpu_out(const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result) {
-  AT_DISPATCH_ALL_TYPES(result.scalar_type(), "range_cpu", [&]() {
+  AT_DISPATCH_ALL_TYPES_AND(kBFloat16, result.scalar_type(), "range_cpu", [&]() {
     using accscalar_t = at::acc_type<scalar_t, false>;
     auto xstart = start.to<accscalar_t>();
     auto xend = end.to<accscalar_t>();
@@ -133,7 +133,7 @@ Tensor& range_cpu_out(const Scalar& start, const Scalar& end, const Scalar& step
     scalar_t *data_ptr = r.data_ptr<scalar_t>();
 
     at::parallel_for(0, size, internal::GRAIN_SIZE, [&](int64_t p_begin, int64_t p_end) {
-      scalar_t is = p_begin;
+      accscalar_t is = p_begin;
       for (int64_t i = p_begin; i < p_end; ++i, ++is) {
         data_ptr[i] = xstart + is * xstep;
       }
diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
index 6288cec2ea3b3..a867a2a0ce519 100644
--- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
@@ -322,7 +322,7 @@ static void sinc_kernel(TensorIteratorBase& iter) {
 }
 
 static void sinh_kernel(TensorIteratorBase& iter) {
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "sinh_cpu", [&]() {
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.dtype(), "sinh_cpu", [&]() {
     cpu_kernel_vec(
         iter,
         [=](scalar_t a) -> scalar_t { return std::sinh(a); },
@@ -331,7 +331,7 @@ static void sinh_kernel(TensorIteratorBase& iter) {
 }
 
 static void cosh_kernel(TensorIteratorBase& iter) {
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "cosh_cpu", [&]() {
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.dtype(), "cosh_cpu", [&]() {
     cpu_kernel_vec(
         iter,
         [=](scalar_t a) -> scalar_t { return std::cosh(a); },
@@ -407,7 +407,7 @@ static void nan_to_num_kernel(
     c10::optional<double> nan,
     c10::optional<double> pos_inf,
     c10::optional<double> neg_inf) {
-  AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "nan_to_num", [&]() {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "nan_to_num", [&]() {
     scalar_t nan_replacement = static_cast<scalar_t>(nan.value_or(0.));
     scalar_t pos_inf_replacement = pos_inf.has_value()
         ? static_cast<scalar_t>(pos_inf.value())
@@ -586,7 +586,7 @@ static void entr_kernel(TensorIteratorBase& iter) {
 }
 
 static void frexp_kernel(TensorIteratorBase& iter) {
-  AT_DISPATCH_FLOATING_TYPES_AND(kHalf,
+  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf,
     // The iter.dtype() here is the dtype of mantissa output.
     // It's a floating point type and must be the same as the input's dtype.
     iter.dtype(),
diff --git a/c10/util/BFloat16-math.h b/c10/util/BFloat16-math.h
index 2760100db6e98..a7b8426ced36a 100644
--- a/c10/util/BFloat16-math.h
+++ b/c10/util/BFloat16-math.h
@@ -57,6 +57,12 @@ inline c10::BFloat16 sin(c10::BFloat16 a) {
 inline c10::BFloat16 tan(c10::BFloat16 a) {
   return std::tan(float(a));
 }
+inline c10::BFloat16 sinh(c10::BFloat16 a) {
+  return std::sinh(float(a));
+}
+inline c10::BFloat16 cosh(c10::BFloat16 a) {
+  return std::cosh(float(a));
+}
 inline c10::BFloat16 tanh(c10::BFloat16 a) {
   return std::tanh(float(a));
 }
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 617b102642d05..63af3965a2e7d 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -6028,6 +6028,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
     UnaryUfuncInfo('cosh',
                    ref=np_unary_ufunc_integer_promotion_wrapper(np.cosh),
                    dtypes=all_types_and_complex_and(torch.bool),
+                   dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16),
                    dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
                    safe_casts_outputs=True,
                    assert_autodiffed=True,
@@ -6413,6 +6414,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                    op=torch.frexp,
                    ref=np.frexp,
                    dtypes=floating_types_and(torch.half),
+                   dtypesIfCPU=floating_types_and(torch.half, torch.bfloat16),
                    # skip testing torch.frexp as it is not supported by ROCm platform yet
                    decorators=[skipCUDAIfRocm],
                    supports_out=False,
@@ -7432,6 +7434,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
     UnaryUfuncInfo('sinh',
                    ref=np_unary_ufunc_integer_promotion_wrapper(np.sinh),
                    dtypes=all_types_and_complex_and(torch.bool),
+                   dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16),
                    dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
                    safe_casts_outputs=True,
                    assert_autodiffed=True,
@@ -7753,6 +7756,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
     UnaryUfuncInfo('nan_to_num',
                    ref=np.nan_to_num,
                    dtypes=all_types_and(torch.half, torch.bool),
+                   dtypesIfCPU=all_types_and(torch.half, torch.bool, torch.bfloat16),
                    dtypesIfCUDA=all_types_and(torch.half, torch.bool, torch.bfloat16),
                    supports_forward_ad=True,
                    # Passing numpy_kwargs via sample_kwargs, as numpy does comparison

From bcf8e2f57eb8fdcf294276f3f5763b3e42fccec8 Mon Sep 17 00:00:00 2001
From: driazati <driazati@users.noreply.github.com>
Date: Fri, 20 Aug 2021 15:45:10 -0700
Subject: [PATCH 116/530] Remove breakpad from docker image (#63598)

Summary:
As of https://github.com/pytorch/pytorch/issues/63186 we're doing this properly via a third_party cmake build, so we don't need it here anymore.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63598

Reviewed By: walterddr, malfet

Differential Revision: D30432250

Pulled By: driazati

fbshipit-source-id: d0d5db14355cf574e42c0d0ed786bb26230180bd
---
 .circleci/docker/README.md                  |  2 +-
 .circleci/docker/build.sh                   | 20 -----------------
 .circleci/docker/common/install_breakpad.sh | 25 ---------------------
 .circleci/docker/ubuntu-cuda/Dockerfile     |  8 +++----
 .circleci/docker/ubuntu/Dockerfile          | 15 ++++---------
 5 files changed, 9 insertions(+), 61 deletions(-)
 delete mode 100644 .circleci/docker/common/install_breakpad.sh

diff --git a/.circleci/docker/README.md b/.circleci/docker/README.md
index a87522f622ccd..cc4f97cfae748 100644
--- a/.circleci/docker/README.md
+++ b/.circleci/docker/README.md
@@ -27,5 +27,5 @@ Docker builds are now defined with `.circleci/cimodel/data/simple/docker_definit
 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
 
 # Set flags (see build.sh) and build image
-sudo bash -c 'BREAKPAD=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
+sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
 ```
diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh
index 7c8477349981a..2b916a19ae117 100755
--- a/.circleci/docker/build.sh
+++ b/.circleci/docker/build.sh
@@ -88,7 +88,6 @@ case "$image" in
     DB=yes
     VISION=yes
     KATEX=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-xenial-py3.6-gcc7.2)
     ANACONDA_PYTHON_VERSION=3.6
@@ -101,7 +100,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7)
     CUDA_VERSION=10.2
@@ -112,7 +110,6 @@ case "$image" in
     DB=yes
     VISION=yes
     KATEX=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7)
     CUDA_VERSION=11.1
@@ -123,7 +120,6 @@ case "$image" in
     DB=yes
     VISION=yes
     KATEX=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7)
     CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names
@@ -134,7 +130,6 @@ case "$image" in
     DB=yes
     VISION=yes
     KATEX=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-xenial-py3-clang5-asan)
     ANACONDA_PYTHON_VERSION=3.6
@@ -142,7 +137,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-xenial-py3-clang7-asan)
     ANACONDA_PYTHON_VERSION=3.6
@@ -150,7 +144,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-xenial-py3-clang7-onnx)
     ANACONDA_PYTHON_VERSION=3.6
@@ -158,7 +151,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-xenial-py3-clang5-android-ndk-r19c)
     ANACONDA_PYTHON_VERSION=3.6
@@ -177,7 +169,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-bionic-py3.6-clang9)
     ANACONDA_PYTHON_VERSION=3.6
@@ -185,7 +176,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     VULKAN_SDK_VERSION=1.2.162.1
     SWIFTSHADER=yes
     ;;
@@ -195,8 +185,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9)
     CUDA_VERSION=10.2
@@ -206,7 +194,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7)
     CUDA_VERSION=10.2
@@ -216,7 +203,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ;;
   pytorch-linux-bionic-cuda11.0-cudnn8-py3.6-gcc9)
     CUDA_VERSION=11.0
@@ -226,7 +212,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ROCM_VERSION=3.9
     ;;
   pytorch-linux-bionic-rocm4.0.1-py3.6)
@@ -235,7 +220,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ROCM_VERSION=4.0.1
     ;;
   pytorch-linux-bionic-rocm4.1-py3.6)
@@ -244,7 +228,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ROCM_VERSION=4.1
     ;;
   pytorch-linux-bionic-rocm4.2-py3.6)
@@ -253,7 +236,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     ROCM_VERSION=4.2
     ;;
   *)
@@ -261,7 +243,6 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    BREAKPAD=yes
     echo "image '$image' did not match an existing build configuration"
     if [[ "$image" == *py* ]]; then
       extract_version_from_image_name py ANACONDA_PYTHON_VERSION
@@ -325,7 +306,6 @@ docker build \
        --build-arg "GCC_VERSION=${GCC_VERSION}" \
        --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
        --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
-       --build-arg "BREAKPAD=${BREAKPAD}" \
        --build-arg "ANDROID=${ANDROID}" \
        --build-arg "ANDROID_NDK=${ANDROID_NDK_VERSION}" \
        --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
diff --git a/.circleci/docker/common/install_breakpad.sh b/.circleci/docker/common/install_breakpad.sh
deleted file mode 100644
index f49f1fb325e2a..0000000000000
--- a/.circleci/docker/common/install_breakpad.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-git clone https://github.com/driazati/breakpad.git
-pushd breakpad
-
-# breakpad has no actual releases, so this is pinned to the top commit from
-# main when this was forked (including the one patch commit). This uses a fork
-# of the breakpad mainline that automatically daisy-chains out to any previously
-# installed signal handlers (instead of overwriting them).
-git checkout 5485e473ed46d065e05489e50dfc59d90dfd7e22
-
-git clone https://chromium.googlesource.com/linux-syscall-support src/third_party/lss
-pushd src/third_party/lss
-# same as with breakpad, there are no real releases for this repo so use a
-# commit as the pin
-git checkout e1e7b0ad8ee99a875b272c8e33e308472e897660
-popd
-
-./configure
-make
-make install
-popd
-rm -rf breakpad
diff --git a/.circleci/docker/ubuntu-cuda/Dockerfile b/.circleci/docker/ubuntu-cuda/Dockerfile
index e0e7dc9b6e5bf..003538f576bd5 100644
--- a/.circleci/docker/ubuntu-cuda/Dockerfile
+++ b/.circleci/docker/ubuntu-cuda/Dockerfile
@@ -61,6 +61,10 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
 RUN rm install_vision.sh
 ENV INSTALLED_VISION ${VISION}
 
+ADD ./common/install_openssl.sh install_openssl.sh
+ENV OPENSSL_ROOT_DIR /opt/openssl
+RUN bash ./install_openssl.sh
+
 # Install ccache/sccache (do this last, so we get priority in PATH)
 ADD ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
@@ -88,9 +92,5 @@ ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
 # Install LLVM dev version (Defined in the pytorch/builder github repository)
 COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
 
-ADD ./common/install_openssl.sh install_openssl.sh
-ENV OPENSSL_ROOT_DIR /opt/openssl
-RUN bash ./install_openssl.sh
-
 USER jenkins
 CMD ["bash"]
diff --git a/.circleci/docker/ubuntu/Dockerfile b/.circleci/docker/ubuntu/Dockerfile
index ea00c083c3d02..76a64bc0ea10d 100644
--- a/.circleci/docker/ubuntu/Dockerfile
+++ b/.circleci/docker/ubuntu/Dockerfile
@@ -82,13 +82,6 @@ RUN rm AndroidManifest.xml
 RUN rm build.gradle
 ENV INSTALLED_ANDROID ${ANDROID}
 
-# (optional) Install breakpad
-ARG BREAKPAD
-ADD ./common/install_breakpad.sh install_breakpad.sh
-RUN if [ -n "${BREAKPAD}" ]; then bash ./install_breakpad.sh; fi
-RUN rm install_breakpad.sh
-ENV INSTALLED_BREAKPAD ${BREAKPAD}
-
 # (optional) Install Vulkan SDK
 ARG VULKAN_SDK_VERSION
 ADD ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh
@@ -113,6 +106,10 @@ ADD ./common/install_ninja.sh install_ninja.sh
 RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
 RUN rm install_ninja.sh
 
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh
+ENV OPENSSL_ROOT_DIR /opt/openssl
+
 # Install ccache/sccache (do this last, so we get priority in PATH)
 ADD ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
@@ -130,9 +127,5 @@ ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
 # Install LLVM dev version (Defined in the pytorch/builder github repository)
 COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
 
-ADD ./common/install_openssl.sh install_openssl.sh
-RUN bash ./install_openssl.sh
-ENV OPENSSL_ROOT_DIR /opt/openssl
-
 USER jenkins
 CMD ["bash"]

From efe01c59e3f64979bf054af8a70705f41b65db4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8C=97=E6=B5=B7=E8=8B=A5?= <flandre@scarletx.cn>
Date: Fri, 20 Aug 2021 15:45:12 -0700
Subject: [PATCH 117/530] [Doc] Deprecation notice for only_inputs argument
 (#63631)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63544.

Changed docstring accordingly. I'm new here, not sure if the style is okay. Please check.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63631

Reviewed By: ejguan

Differential Revision: D30459439

Pulled By: soulitzer

fbshipit-source-id: 8df3c509d1dd39764815b099ab47229550126cbe
---
 torch/autograd/__init__.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py
index 0d4f153d007c1..d11e261efcea1 100644
--- a/torch/autograd/__init__.py
+++ b/torch/autograd/__init__.py
@@ -173,17 +173,18 @@ def grad(
     gradients w.r.t. each of the outputs. If an output doesn't require_grad,
     then the gradient can be ``None``).
 
-    If ``only_inputs`` is ``True``, the function will only return a list of gradients
-    w.r.t the specified inputs. If it's ``False``, then gradient w.r.t. all remaining
-    leaves will still be computed, and will be accumulated into their ``.grad``
-    attribute.
-
     .. note::
 
         If you run any forward ops, create ``grad_outputs``, and/or call ``grad``
         in a user-specified CUDA stream context, see
         :ref:`Stream semantics of backward passes<bwd-cuda-stream-semantics>`.
 
+    .. note::
+
+        ``only_inputs`` argument is deprecated and is ignored now (defaults to ``True``).
+        To accumulate gradient for other parts of the graph, please use
+        ``torch.autograd.backward``.
+
     Args:
         outputs (sequence of Tensor): outputs of the differentiated function.
         inputs (sequence of Tensor): Inputs w.r.t. which the gradient will be

From e0fe5699c4b7dabd132834b690d6cc2513e0e978 Mon Sep 17 00:00:00 2001
From: Yi Zhang <zhanyi@microsoft.com>
Date: Fri, 20 Aug 2021 16:28:39 -0700
Subject: [PATCH 118/530] enable increment build for build_libtorch (#63074)

Summary:
Since issue https://github.com/pytorch/pytorch/issues/59859 is resolved.

rerun_cmake in build_libtorch should not be hardcoded.
build_libtorch is necessary to generate debug version libtorch.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63074

Reviewed By: VitalyFedyunin, seemethere

Differential Revision: D30306705

Pulled By: malfet

fbshipit-source-id: f2077d334191f4973da0681560937bc8bab730c1
---
 tools/build_libtorch.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py
index 800d8eb278481..c263e5084f783 100644
--- a/tools/build_libtorch.py
+++ b/tools/build_libtorch.py
@@ -14,7 +14,10 @@
 if __name__ == '__main__':
     # Placeholder for future interface. For now just gives a nice -h.
     parser = argparse.ArgumentParser(description='Build libtorch')
+    parser.add_argument('--rerun-cmake', action="store_true", help='rerun cmake')
+    parser.add_argument('--cmake-only', action="store_true",
+                        help='Stop once cmake terminates. Leave users a chance to adjust build options')
     options = parser.parse_args()
 
     build_caffe2(version=None, cmake_python_library=None, build_python=False,
-                 rerun_cmake=True, cmake_only=False, cmake=CMake())
+                 rerun_cmake=options.rerun_cmake, cmake_only=options.cmake_only, cmake=CMake())

From 7c0f5b9aa4dbdfefce02b10a07c1928d4ec1a66b Mon Sep 17 00:00:00 2001
From: driazati <driazati@users.noreply.github.com>
Date: Fri, 20 Aug 2021 16:38:42 -0700
Subject: [PATCH 119/530] [clang-tidy] Enable more folders (#63380)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63380

Crosses off some more of #62011, see the test in the stacked PR #63381

Test Plan: Imported from OSS

Reviewed By: malfet, seemethere

Differential Revision: D30455843

Pulled By: driazati

fbshipit-source-id: d473545d05ffa0b2476968f0b1c55f3a16a2c755
---
 .github/workflows/lint.yml                           | 7 ++++++-
 tools/linter/clang_tidy/__main__.py                  | 1 +
 torch/csrc/api/include/torch/nn/functional/pooling.h | 2 +-
 torch/csrc/deploy/example/benchmark.cpp              | 1 +
 torch/csrc/deploy/test_deploy.cpp                    | 3 ++-
 torch/csrc/deploy/test_deploy_python_ext.cpp         | 2 +-
 6 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 720e76c4e6a5f..f036bc17d2a2d 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -367,7 +367,12 @@ jobs:
           cd "${GITHUB_WORKSPACE}"
 
           python3 -m tools.linter.clang_tidy \
-            --paths torch/csrc/fx \
+            --paths \
+              torch/csrc/fx \
+              torch/csrc/utils \
+              torch/csrc/generic \
+              torch/csrc/deploy \
+              torch/csrc/tensor \
             --clang-tidy-exe "$(which clang-tidy)" \
             --disable-progress-bar 2>&1 | tee "${GITHUB_WORKSPACE}"/clang-tidy-output.txt
 
diff --git a/tools/linter/clang_tidy/__main__.py b/tools/linter/clang_tidy/__main__.py
index fc9f2ab4e6687..b99c1f5366848 100644
--- a/tools/linter/clang_tidy/__main__.py
+++ b/tools/linter/clang_tidy/__main__.py
@@ -74,6 +74,7 @@ def clang_search_dirs() -> List[str]:
         "-torch/csrc/deploy/interpreter/interpreter.h",
         "-torch/csrc/deploy/interpreter/interpreter_impl.h",
         "-torch/csrc/deploy/interpreter/test_main.cpp",
+        "-torch/csrc/deploy/test_deploy_python_ext.cpp",
     ],
     "paths": ["torch/csrc/"],
     "include-dir": ["/usr/lib/llvm-11/include/openmp"] + clang_search_dirs(),
diff --git a/torch/csrc/api/include/torch/nn/functional/pooling.h b/torch/csrc/api/include/torch/nn/functional/pooling.h
index c8538858e8a74..f06b68ba2870d 100644
--- a/torch/csrc/api/include/torch/nn/functional/pooling.h
+++ b/torch/csrc/api/include/torch/nn/functional/pooling.h
@@ -776,7 +776,7 @@ inline std::tuple<Tensor, Tensor> fractional_max_pool2d_with_indices(
 
   Tensor _random_samples_ = _random_samples;
   if (!_random_samples_.defined()) {
-    auto n_batch = 1 ? input.dim() == 3 : input.size(0);
+    auto n_batch = input.dim() == 3;
     _random_samples_ = torch::rand({n_batch, input.size(-1), 2}, torch::TensorOptions().dtype(input.dtype()).device(input.device()));
   }
   return torch::fractional_max_pool2d(input, kernel_size, *output_size_, _random_samples_);
diff --git a/torch/csrc/deploy/example/benchmark.cpp b/torch/csrc/deploy/example/benchmark.cpp
index 348d84fec02b4..d2f1142965d40 100644
--- a/torch/csrc/deploy/example/benchmark.cpp
+++ b/torch/csrc/deploy/example/benchmark.cpp
@@ -295,6 +295,7 @@ struct Benchmark {
   std::function<void(int)> run_one_work_item;
 };
 
+// NOLINTNEXTLINE(bugprone-exception-escape)
 int main(int argc, char* argv[]) {
   int max_thread = atoi(argv[1]);
   cuda = std::string(argv[2]) == "cuda";
diff --git a/torch/csrc/deploy/test_deploy.cpp b/torch/csrc/deploy/test_deploy.cpp
index f88a23c43bde0..a004db1e0d232 100644
--- a/torch/csrc/deploy/test_deploy.cpp
+++ b/torch/csrc/deploy/test_deploy.cpp
@@ -63,7 +63,7 @@ TEST(TorchpyTest, InitTwice) {
 TEST(TorchpyTest, DifferentInterps) {
   torch::deploy::InterpreterManager m(2);
   m.register_module_source("check_none", "check = id(None)\n");
-  int64_t id0, id1;
+  int64_t id0 = 0, id1 = 0;
   {
     auto I = m.all_instances()[0].acquire_session();
     id0 = I.global("check_none", "check").toIValue().toInt();
@@ -312,6 +312,7 @@ TEST(TorchpyTest, SharedLibraryLoad) {
       I.global("sys", "path").attr("append")({"torch/csrc/deploy"});
       I.global("test_deploy_python", "setup")({getenv("PATH")});
     } else {
+      // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
       char buf[PATH_MAX];
       strncpy(buf, test_lib_path, PATH_MAX);
       dirname(buf);
diff --git a/torch/csrc/deploy/test_deploy_python_ext.cpp b/torch/csrc/deploy/test_deploy_python_ext.cpp
index 42700ead6678b..59a04f5e84853 100644
--- a/torch/csrc/deploy/test_deploy_python_ext.cpp
+++ b/torch/csrc/deploy/test_deploy_python_ext.cpp
@@ -7,7 +7,7 @@
 bool run() {
   torch::deploy::InterpreterManager m(2);
   m.register_module_source("check_none", "check = id(None)\n");
-  int64_t id0, id1;
+  int64_t id0 = 0, id1 = 0;
   {
     auto I = m.all_instances()[0].acquire_session();
     id0 = I.global("check_none", "check").toIValue().toInt();

From 3ee1f81dce748aac73848d0ad45c1eb84fcffd74 Mon Sep 17 00:00:00 2001
From: Bo Wang <bowangbj@fb.com>
Date: Fri, 20 Aug 2021 17:09:35 -0700
Subject: [PATCH 120/530] Extend _sharded_tensor constructor to support other
 ops like torch.ones (#63378)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63378

a) Introduce InitCommonParams to wrap tensor creation params
b) Factor local tensor initiation into common_params so that tensor value is not hard specified in ShardedTensor constructor
c) Add _sharded_tensor.ones(...) to exemplify - Note memory_format arg is not provided to be consistent as torch.ones
d) Follow up: more ops like torch.full, torch.zero, torch.rand,

Test:
$ python test/distributed/_sharded_tensor/test_sharded_tensor.py TestCreateTensorFromParams --v
$ python test/distributed/_sharded_tensor/test_sharded_tensor.py TestShardedTensorChunked.test_create_sharded_tensor_with_ones --v
$ python test/distributed/_sharded_tensor/test_sharded_tensor.py TestShardedTensorEnumerable.test_create_sharded_tensor_with_ones --v

Test Plan: Imported from OSS

Reviewed By: pritamdamania87, wanchaol

Differential Revision: D30359245

Pulled By: bowangbj

fbshipit-source-id: 85768fcb36e9d9d40213036884b1266930a91701
---
 .../_sharded_tensor/test_sharded_tensor.py    | 109 +++++++++++-
 torch/distributed/_sharded_tensor/__init__.py |  61 ++++++-
 torch/distributed/_sharded_tensor/api.py      | 162 ++++++++----------
 3 files changed, 237 insertions(+), 95 deletions(-)

diff --git a/test/distributed/_sharded_tensor/test_sharded_tensor.py b/test/distributed/_sharded_tensor/test_sharded_tensor.py
index 829855f6be2c5..5067f301b5595 100644
--- a/test/distributed/_sharded_tensor/test_sharded_tensor.py
+++ b/test/distributed/_sharded_tensor/test_sharded_tensor.py
@@ -1,4 +1,5 @@
 from functools import wraps
+import math
 import io
 import sys
 import torch
@@ -15,6 +16,11 @@
     EnumerableShardingSpec,
     ShardMetadata
 )
+from torch.distributed._sharded_tensor.api import (
+    CreateOp,
+    TensorInitParams,
+    _create_tensor_from_params,
+)
 from torch.testing._internal.common_distributed import (
     MultiProcessTestCase,
     requires_nccl,
@@ -22,10 +28,11 @@
     TEST_SKIPS,
 )
 from torch.testing._internal.common_utils import (
+    TestCase,
     TEST_WITH_DEV_DBG_ASAN,
     run_tests,
+    sandcastle_skip_if,
 )
-
 if TEST_WITH_DEV_DBG_ASAN:
     print("Skip dev-asan as torch + multiprocessing spawn have known issues", file=sys.stderr)
     sys.exit(0)
@@ -115,6 +122,38 @@ def wrapper(self):
         self.destroy_comms()
     return wrapper
 
+class TestCreateTensorFromParams(TestCase):
+    @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
+    def test_empty(self):
+        tensor_init_params = TensorInitParams(
+            create_op=CreateOp.EMPTY,
+            dtype=torch.double,
+            layout=torch.strided,
+            requires_grad=False,
+            pin_memory=False,
+            memory_format=torch.contiguous_format, )
+        local_device = torch.device('cuda:0')
+        local_tensor = _create_tensor_from_params(
+            5, 10, local_device=local_device, tensor_init_params=tensor_init_params)
+        self.assertEqual(local_device, local_tensor.device)
+        self.assertEqual(torch.double, local_tensor.dtype)
+        self.assertEqual(torch.strided, local_tensor.layout)
+        self.assertEqual(False, local_tensor.requires_grad)
+
+    @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
+    def test_ones(self):
+        tensor_init_params = TensorInitParams(
+            create_op=CreateOp.ONES,
+            dtype=torch.double,
+            layout=torch.strided,
+            requires_grad=False,
+            pin_memory=False,
+            memory_format=torch.contiguous_format, )
+        local_device = torch.device('cuda:0')
+        local_tensor = _create_tensor_from_params(
+            5, 10, local_device=local_device, tensor_init_params=tensor_init_params)
+        expected_tensor = torch.ones(5, 10, device=local_device, dtype=torch.double)
+        self.assertEqual(expected_tensor, local_tensor)
 
 class TestShardedTensorChunked(ShardedTensorTestBase, MultiProcessTestCase):
 
@@ -219,6 +258,35 @@ def test_complete_world_size(self):
                     else:
                         self.assertEqual((3, 20), shard.tensor.size())
 
+
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    @requires_nccl()
+    def test_create_sharded_tensor_with_ones(self):
+        """ Test _sharded_tensor.ones(...) """
+
+        spec = ChunkShardingSpec(
+            dim=0,
+            placements=[
+                "rank:0/cuda:0",
+                "rank:1/cuda:1",
+                "rank:2/cuda:2",
+                "rank:3/cuda:3",
+            ],
+        )
+        h, w = 10, 20
+        sharded_tensor = _sharded_tensor.ones(spec, h, w)
+
+        # Validate local shard is initialized with torch.ones
+        local_shards = sharded_tensor.local_shards()
+        self.assertEqual(1, len(local_shards))
+        local_shard = local_shards[0].tensor
+        self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.device)
+        # The split: for rank!=3 ceil(h/4)=3  for rank=3 1
+        expected_h = 1 if self.rank == 3 else math.ceil(h / 4)
+        self.assertEqual((expected_h, w), local_shard.size())
+        self.assertEqual(local_shard, torch.ones(expected_h, w))
+
     @with_comms
     @skip_if_lt_x_gpu(4)
     @requires_nccl()
@@ -818,6 +886,45 @@ def test_grid_sharding(self):
                 shard = remote_shard.to_here()
                 self.assertEqual((5, 5), shard.tensor.size())
 
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    @requires_nccl()
+    def test_create_sharded_tensor_with_ones(self):
+        """ Test _sharded_tensor.ones(...) """
+
+        spec = EnumerableShardingSpec([
+            ShardMetadata(
+                shard_offsets=[0, 0],
+                shard_lengths=[5, 5],
+                placement="rank:0/cuda:0",
+            ),
+            ShardMetadata(
+                shard_offsets=[0, 5],
+                shard_lengths=[5, 5],
+                placement="rank:1/cuda:1",
+            ),
+            ShardMetadata(
+                shard_offsets=[5, 0],
+                shard_lengths=[5, 5],
+                placement="rank:2/cuda:2",
+            ),
+            ShardMetadata(
+                shard_offsets=[5, 5],
+                shard_lengths=[5, 5],
+                placement="rank:3/cuda:3",
+            )
+        ])
+
+        sharded_tensor = _sharded_tensor.ones(spec, 10, 10, init_rrefs=True)
+        self.assertEqual((10, 10), sharded_tensor.size())
+        self.assertEqual(1, len(sharded_tensor.local_shards()))
+
+        # Verify local shard is initialized with torch.ones
+        local_shard = sharded_tensor.local_shards()[0]
+        self.assertEqual(torch.device(f'cuda:{self.rank}'), local_shard.tensor.device)
+        self.assertEqual((5, 5), local_shard.tensor.size())
+        self.assertEqual(local_shard.tensor, torch.ones(5, 5))
+
     @skip_if_lt_x_gpu(4)
     @requires_nccl()
     def test_uneven_shards(self):
diff --git a/torch/distributed/_sharded_tensor/__init__.py b/torch/distributed/_sharded_tensor/__init__.py
index d9833159dc9de..ecb7ea1fed8c6 100644
--- a/torch/distributed/_sharded_tensor/__init__.py
+++ b/torch/distributed/_sharded_tensor/__init__.py
@@ -3,12 +3,15 @@
 import torch
 from torch.distributed._sharding_spec import ShardingSpec
 from .api import (
+    CreateOp,
     Shard,
     ShardedTensor,
     ShardedTensorMetadata,
+    TensorInitParams,
     load_with_process_group,
 )
 
+
 def empty(
         sharding_spec: ShardingSpec,
         *size,
@@ -49,14 +52,62 @@ def empty(
     Returns:
         A :class:`ShardedTensor` object on each rank
     """
+    tensor_init_params = TensorInitParams(create_op=CreateOp.EMPTY, dtype=dtype, layout=layout,
+                                          requires_grad=requires_grad,
+                                          pin_memory=pin_memory, memory_format=memory_format)
+    return ShardedTensor(
+        sharding_spec,
+        *size,
+        tensor_init_params=tensor_init_params,
+        process_group=process_group,
+        init_rrefs=init_rrefs,
+    )
+
+def ones(
+        sharding_spec: ShardingSpec,
+        *size,
+        dtype=None,
+        layout=torch.strided,
+        requires_grad=False,
+        pin_memory=False,
+        memory_format=torch.contiguous_format,
+        process_group=None,
+        init_rrefs=False):
+    """
+    Creates a ones :class:`ShardedTensor`. Needs to be called on all ranks in an SPMD fashion.
+
+    Args:
+        sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...): a sequence of integers defining the shape of the output
+            tensor. Can be a variable number of arguments or a collection like a list or tuple.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    Returns:
+        A :class:`ShardedTensor` object on each rank
+    """
+    tensor_init_params = TensorInitParams(create_op=CreateOp.ONES, dtype=dtype, layout=layout,
+                                          requires_grad=requires_grad,
+                                          pin_memory=pin_memory, memory_format=memory_format)
     return ShardedTensor(
         sharding_spec,
         *size,
-        dtype=dtype,
-        layout=layout,
-        requires_grad=requires_grad,
-        pin_memory=pin_memory,
-        memory_format=memory_format,
+        tensor_init_params=tensor_init_params,
         process_group=process_group,
         init_rrefs=init_rrefs,
     )
diff --git a/torch/distributed/_sharded_tensor/api.py b/torch/distributed/_sharded_tensor/api.py
index ca9a05abffa06..2b6720b059a85 100644
--- a/torch/distributed/_sharded_tensor/api.py
+++ b/torch/distributed/_sharded_tensor/api.py
@@ -1,6 +1,7 @@
 import collections
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from enum import Enum
 from typing import (
     Dict,
     List
@@ -22,6 +23,7 @@
     validate_non_overlapping_shards_metadata
 )
 
+
 # Tracking for sharded tensor objects.
 _sharded_tensor_lock = threading.Lock()
 _sharded_tensor_current_id = 0
@@ -123,6 +125,26 @@ def _register_remote_shards(sharded_tensor_id: int, rrefs: List[rpc.RRef[Shard]]
         _sharded_tensor_map[sharded_tensor_id]._register_remote_shards(rrefs, rpc_rank)
 
 
+class CreateOp(Enum):
+    EMPTY = 0
+    ONES = 1
+
+
+@dataclass
+class TensorInitParams(object):
+    """ Container for list of common params to create new local tensor. """
+
+    __slots__ = ['create_op', 'dtype', 'layout', 'requires_grad', 'pin_memory',
+                 'memory_format']
+
+    create_op: CreateOp
+    dtype: torch.dtype
+    layout: torch.layout
+    requires_grad: bool
+    pin_memory: bool
+    memory_format: torch.memory_format
+
+
 class ShardedTensor(object):
     """
     ShardedTensor is an abstraction to represent Tensors that are sharded
@@ -136,8 +158,9 @@ class ShardedTensor(object):
     ShardedTensor doesn't provide any Tensor like operations but is a wrapper
     providing the Tensor representing the local shard and the global metadata.
     Using these, users can build their custom distributed sharded computations
-    on top of this primitive. The local shards are all initialized using
-    :meth:`torch.empty`.
+    on top of this primitive. The local shards are all initialized using the
+    create_op specified by tensor_init_params.create_op, e.g., torch.ones, or
+    torch.empty
 
     Args:
         sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification
@@ -146,20 +169,7 @@ class ShardedTensor(object):
             tensor. Can be a variable number of arguments or a collection like a list or tuple.
 
     Keyword args:
-        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
-            Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`).
-        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
-            Default: ``torch.strided``.
-        requires_grad (bool, optional): If autograd should record operations on the
-            returned tensor. Default: ``False``.
-        pin_memory (bool, optional): If set, returned tensor would be allocated in
-            the pinned memory. Works only for CPU tensors. Default: ``False``.
-        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
-            returned Tensor. Default: ``torch.contiguous_format``.
-        process_group (ProcessGroup, optional): The process group to work on. If None,
-            the default process group will be used. If specified the ShardedTensor is only
-            built on ranks that are part of this process group and the provided ``sharding_spec``
-            is applied in the context of this process group.
+        tensor_init_params (:class: `TensorInitParams`): common params to create tensor.
         init_rrefs (bool, optional): Whether or not to initialize
             :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
             Need to initialize the RPC Framework if specified as ``True``.
@@ -170,11 +180,7 @@ def __init__(
         self,
         sharding_spec: ShardingSpec,
         *size,
-        dtype=None,
-        layout=torch.strided,
-        requires_grad=False,
-        pin_memory=False,
-        memory_format=torch.contiguous_format,
+        tensor_init_params: TensorInitParams,
         process_group=None,
         init_rrefs=False,
     ):
@@ -182,13 +188,13 @@ def __init__(
         # _process_group, _local_shards, etc.
         self._prepare_init(process_group=process_group, init_rrefs=init_rrefs)
 
-        if dtype is None:
-            dtype = torch.get_default_dtype()
+        if tensor_init_params.dtype is None:
+            tensor_init_params.dtype = torch.get_default_dtype()
 
-        if layout != torch.strided:
+        if tensor_init_params.layout != torch.strided:
             raise ValueError('Only torch.strided layout is currently supported')
 
-        if memory_format != torch.contiguous_format:
+        if tensor_init_params.memory_format != torch.contiguous_format:
             raise ValueError('Only torch.contiguous_format memory_format is currently supported')
 
         if len(size) == 1 and isinstance(size[0], collections.Sequence):
@@ -203,23 +209,9 @@ def __init__(
         self._sharding_spec = sharding_spec
 
         if isinstance(self._sharding_spec, ChunkShardingSpec):
-            self._init_chunked(
-                dims,
-                dtype,
-                layout,
-                requires_grad,
-                pin_memory,
-                memory_format,
-            )
+            self._init_chunked(dims, tensor_init_params)
         elif isinstance(self._sharding_spec, EnumerableShardingSpec):
-            self._init_enumerable(
-                dims,
-                dtype,
-                layout,
-                requires_grad,
-                pin_memory,
-                memory_format,
-            )
+            self._init_enumerable(dims, tensor_init_params)
         else:
             raise ValueError(f'Unsupported sharding_spec: {self._sharding_spec}')
 
@@ -420,15 +412,7 @@ def _init_from_local_shards(
         sharded_tensor._post_init()
         return sharded_tensor
 
-    def _init_chunked(
-        self,
-        dims,
-        dtype,
-        layout,
-        requires_grad,
-        pin_memory,
-        memory_format,
-    ):
+    def _init_chunked(self, dims, tensor_init_params: TensorInitParams, ):
         current_rank = dist.get_rank(self._process_group)
         sharding_dim = self._sharding_spec.dim  # type: ignore[attr-defined]
 
@@ -469,38 +453,22 @@ def _init_chunked(
                 # Build the local shard for the current rank if it is involved in the sharding spec.
                 if current_rank == rank:
                     # Initialize the local shard.
-                    local_shard = torch.empty(
-                        *rank_dims,
-                        dtype=dtype,
-                        layout=layout,
-                        device=local_device,
-                        requires_grad=requires_grad,
-                        memory_format=memory_format,
-                        pin_memory=pin_memory,
-                    )
-
+                    local_shard = _create_tensor_from_params(
+                        *rank_dims, local_device=local_device, tensor_init_params=tensor_init_params)
                     self._local_shards.append(Shard(local_shard, shard_metadata))
 
         # Build overall metadata
         self._metadata = ShardedTensorMetadata(
             shards_metadata,
             dims,
-            dtype,
-            layout,
-            requires_grad,
-            memory_format,
-            pin_memory,
+            tensor_init_params.dtype,
+            tensor_init_params.layout,
+            tensor_init_params.requires_grad,
+            tensor_init_params.memory_format,
+            tensor_init_params.pin_memory,
         )
 
-    def _init_enumerable(
-        self,
-        dims,
-        dtype,
-        layout,
-        requires_grad,
-        pin_memory,
-        memory_format,
-    ):
+    def _init_enumerable(self, dims, tensor_init_params: TensorInitParams):
         # Validate the sharding spec is compatible with the tensor.
         check_tensor(self._sharding_spec.shards, dims)  # type: ignore[attr-defined]
 
@@ -513,27 +481,20 @@ def _init_enumerable(
 
             if current_rank == rank:
                 # Initialize the local shard.
-                local_shard = torch.empty(
-                    *shard_metadata.shard_lengths,
-                    dtype=dtype,
-                    layout=layout,
-                    device=local_device,
-                    requires_grad=requires_grad,
-                    memory_format=memory_format,
-                    pin_memory=pin_memory,
-                )
-
+                local_shard = _create_tensor_from_params(
+                    *shard_metadata.shard_lengths, local_device=local_device,
+                    tensor_init_params=tensor_init_params)
                 self._local_shards.append(Shard(local_shard, shard_metadata))
 
         # Build overall metadata
         self._metadata = ShardedTensorMetadata(
             shards_metadata,
             dims,
-            dtype,
-            layout,
-            requires_grad,
-            memory_format,
-            pin_memory,
+            tensor_init_params.dtype,
+            tensor_init_params.layout,
+            tensor_init_params.requires_grad,
+            tensor_init_params.memory_format,
+            tensor_init_params.pin_memory,
         )
 
     def _parse_and_validate_remote_device(self, remote_device: torch.distributed._remote_device):
@@ -672,3 +633,26 @@ def __setstate__(self, state):
                 f'but at load time was {global_world_size}')
 
         self._post_init()
+
+
+def _create_tensor_from_params(*size, local_device, tensor_init_params: TensorInitParams):
+    """ Helper to construct tensor from size, device and common params. """
+
+    if tensor_init_params.create_op == CreateOp.ONES:
+        return torch.ones(*size,
+                          dtype=tensor_init_params.dtype,
+                          layout=tensor_init_params.layout,
+                          device=local_device,
+                          pin_memory=tensor_init_params.pin_memory,
+                          requires_grad=tensor_init_params.requires_grad,)
+    elif tensor_init_params.create_op == CreateOp.EMPTY:
+        return torch.empty(*size,
+                           dtype=tensor_init_params.dtype,
+                           layout=tensor_init_params.layout,
+                           device=local_device,
+                           requires_grad=tensor_init_params.requires_grad,
+                           # Note memory_format param is not accepted by torch.ones
+                           memory_format=tensor_init_params.memory_format,
+                           pin_memory=tensor_init_params.pin_memory,)
+    else:
+        raise ValueError(f'Unsupported create_op: {tensor_init_params.create_op}')

From b4f5809db8511d9517b043b14f48814a9199dea3 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Fri, 20 Aug 2021 18:27:33 -0700
Subject: [PATCH 121/530] Migrate thnn_conv2d from THC to ATen (#63428)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63428

Closes gh-24644, closes gh-24645

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D30441307

Pulled By: ngimel

fbshipit-source-id: 9c3dec469c0525831ae398df261cf41b7df7e373
---
 BUILD.bazel                                   |   1 -
 aten/src/ATen/LegacyTHFunctionsCUDA.h         |   4 -
 aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp  | 266 ---------
 aten/src/ATen/native/cuda/ConvolutionMM2d.cu  | 519 +++++++++++++++++-
 aten/src/ATen/native/native_functions.yaml    |   4 +-
 aten/src/THCUNN/CMakeLists.txt                |   1 -
 aten/src/THCUNN/SpatialConvolutionMM.cu       |  13 -
 .../THCUNN/generic/SpatialConvolutionMM.cu    | 499 -----------------
 aten/src/THCUNN/generic/THCUNN.h              |  46 --
 9 files changed, 509 insertions(+), 844 deletions(-)
 delete mode 100644 aten/src/THCUNN/SpatialConvolutionMM.cu
 delete mode 100644 aten/src/THCUNN/generic/SpatialConvolutionMM.cu

diff --git a/BUILD.bazel b/BUILD.bazel
index ca8874d64e857..dab227590072d 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -420,7 +420,6 @@ filegroup(
         "aten/src/THCUNN/SoftMarginCriterion.cu.cc",
         "aten/src/THCUNN/SoftPlus.cu.cc",
         "aten/src/THCUNN/SoftShrink.cu.cc",
-        "aten/src/THCUNN/SpatialConvolutionMM.cu.cc",
         "aten/src/THCUNN/Tanh.cu.cc",
     ],
 )
diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h
index 5670f31a089d9..1a20e0bb8fa0b 100644
--- a/aten/src/ATen/LegacyTHFunctionsCUDA.h
+++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h
@@ -24,10 +24,6 @@ Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper);
 Tensor _th_potri(const Tensor & self, bool upper);
 Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src);
 Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training);
-std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_forward_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output, Tensor & columns, Tensor & ones);
-std::tuple<Tensor,Tensor,Tensor> _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const optional<Tensor> & bias, IntArrayRef stride, IntArrayRef padding);
-std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones);
-std::tuple<Tensor,Tensor,Tensor> _thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones, std::array<bool,3> output_mask);
 
 } // namespace th
 } // namespace legacy
diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
index 30c61a3e8b355..4ead51e6bd26e 100644
--- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
+++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
@@ -175,272 +175,6 @@ Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src) {
     }
     return self;
 }
-std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_forward_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output, Tensor & columns, Tensor & ones) {
-  // See [Note: hacky wrapper removal for optional tensor]
-  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
-  const Tensor& bias = *bias_maybe_owned;
-
-    const OptionalDeviceGuard device_guard(device_of(self));
-    auto dispatch_scalar_type = infer_scalar_type(self);
-
-    switch (dispatch_scalar_type) {
-        case ScalarType::Double: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3);
-            auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            THNN_CudaDoubleSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            break;
-        }
-        case ScalarType::Float: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3);
-            auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            THNN_CudaSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            break;
-        }
-        case ScalarType::Half: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3);
-            auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            THNN_CudaHalfSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            break;
-        }
-        case ScalarType::BFloat16: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3);
-            auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto output_ = checked_dense_tensor_unwrap(output, "output", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 6, "_thnn_conv2d_forward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            THNN_CudaBFloat16SpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            break;
-        }
-        default:
-            AT_ERROR("_thnn_conv2d_forward_out not supported on CUDAType for ", dispatch_scalar_type);
-    }
-    return std::tuple<Tensor &, Tensor &, Tensor &>(output, columns, ones);
-}
-std::tuple<Tensor,Tensor,Tensor> _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding) {
-  // See [Note: hacky wrapper removal for optional tensor]
-  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
-  const Tensor& bias = *bias_maybe_owned;
-
-    const OptionalDeviceGuard device_guard(device_of(self));
-    auto dispatch_scalar_type = infer_scalar_type(self);
-    auto output_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
-    auto output = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(output_));
-    auto columns_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
-    auto columns = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(columns_));
-    auto ones_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
-    auto ones = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(ones_));
-    switch (dispatch_scalar_type) {
-        case ScalarType::Double: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3);
-            auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            THNN_CudaDoubleSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            break;
-        }
-        case ScalarType::Float: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3);
-            auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            THNN_CudaSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            break;
-        }
-        case ScalarType::Half: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3);
-            auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            THNN_CudaHalfSpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            break;
-        }
-        case ScalarType::BFloat16: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 2, "_thnn_conv2d_forward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 3);
-            auto bias_ = checked_dense_tensor_unwrap(bias, "bias", 4, "_thnn_conv2d_forward", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            THNN_CudaBFloat16SpatialConvolutionMM_updateOutput(globalContext().getTHCState(), self_, output_, weight_, bias_ ? bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            break;
-        }
-        default:
-            AT_ERROR("_thnn_conv2d_forward not supported on CUDAType for ", dispatch_scalar_type);
-    }
-    return std::tuple<Tensor, Tensor, Tensor>(output, columns, ones);
-}
-std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones) {
-    const OptionalDeviceGuard device_guard(device_of(self));
-    auto dispatch_scalar_type = infer_scalar_type(self);
-
-    switch (dispatch_scalar_type) {
-        case ScalarType::Double: {
-            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            if (grad_input_) THNN_CudaDoubleSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            if (grad_weight_ || grad_bias_) THNN_CudaDoubleSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1);
-            break;
-        }
-        case ScalarType::Float: {
-            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            if (grad_input_) THNN_CudaSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            if (grad_weight_ || grad_bias_) THNN_CudaSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1);
-            break;
-        }
-        case ScalarType::Half: {
-            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            if (grad_input_) THNN_CudaHalfSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            if (grad_weight_ || grad_bias_) THNN_CudaHalfSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1);
-            break;
-        }
-        case ScalarType::BFloat16: {
-            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_input_ = checked_dense_tensor_unwrap(grad_input, "grad_input", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_weight_ = checked_dense_tensor_unwrap(grad_weight, "grad_weight", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            auto grad_bias_ = checked_dense_tensor_unwrap(grad_bias, "grad_bias", 8, "_thnn_conv2d_backward_out", true, DeviceType::CUDA, dispatch_scalar_type);
-            if (grad_input_) THNN_CudaBFloat16SpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            if (grad_weight_ || grad_bias_) THNN_CudaBFloat16SpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1);
-            break;
-        }
-        default:
-            AT_ERROR("_thnn_conv2d_backward_out not supported on CUDAType for ", dispatch_scalar_type);
-    }
-    return std::tuple<Tensor &, Tensor &, Tensor &>(grad_input, grad_weight, grad_bias);
-}
-std::tuple<Tensor,Tensor,Tensor> _thnn_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones, std::array<bool,3> output_mask) {
-    const OptionalDeviceGuard device_guard(device_of(self));
-    auto dispatch_scalar_type = infer_scalar_type(self);
-    auto grad_input_ = output_mask[0] ? c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr;
-    auto grad_input = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(grad_input_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_input_));
-    auto grad_weight_ = output_mask[1] ? c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr;
-    auto grad_weight = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(grad_weight_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_weight_));
-    auto grad_bias_ = output_mask[2] ? c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release() : nullptr;
-    auto grad_bias = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(grad_bias_ == nullptr ? (TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*)grad_bias_));
-    switch (dispatch_scalar_type) {
-        case ScalarType::Double: {
-            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            if (grad_input_) THNN_CudaDoubleSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            if (grad_weight_ || grad_bias_) THNN_CudaDoubleSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1);
-            break;
-        }
-        case ScalarType::Float: {
-            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            if (grad_input_) THNN_CudaSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            if (grad_weight_ || grad_bias_) THNN_CudaSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1);
-            break;
-        }
-        case ScalarType::Half: {
-            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            if (grad_input_) THNN_CudaHalfSpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            if (grad_weight_ || grad_bias_) THNN_CudaHalfSpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1);
-            break;
-        }
-        case ScalarType::BFloat16: {
-            auto grad_output_ = checked_dense_tensor_unwrap(grad_output, "grad_output", 1, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 2, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto weight_ = checked_dense_tensor_unwrap(weight, "weight", 3, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto kernel_size_ = check_intlist<2>(kernel_size, "kernel_size", 4);
-            auto stride_ = check_intlist<2>(stride, "stride", 5);
-            auto padding_ = check_intlist<2>(padding, "padding", 6);
-            auto columns_ = checked_dense_tensor_unwrap(columns, "columns", 7, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto ones_ = checked_dense_tensor_unwrap(ones, "ones", 8, "_thnn_conv2d_backward", false, DeviceType::CUDA, dispatch_scalar_type);
-            if (grad_input_) THNN_CudaBFloat16SpatialConvolutionMM_updateGradInput(globalContext().getTHCState(), self_, grad_output_, grad_input_ ? grad_input_ : NULL, weight_, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0]);
-            if (grad_weight_ || grad_bias_) THNN_CudaBFloat16SpatialConvolutionMM_accGradParameters(globalContext().getTHCState(), self_, grad_output_, grad_weight_ ? grad_weight_ : NULL, grad_bias_ ? grad_bias_ : NULL, columns_, ones_, kernel_size_[1], kernel_size_[0], stride_[1], stride_[0], padding_[1], padding_[0], 1);
-            break;
-        }
-        default:
-            AT_ERROR("_thnn_conv2d_backward not supported on CUDAType for ", dispatch_scalar_type);
-    }
-    return std::tuple<Tensor, Tensor, Tensor>(grad_input, grad_weight, grad_bias);
-}
 
 } // namespace th
 } // namespace legacy
diff --git a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu
index ede7e1fb39b29..bf3f8ac0a6eff 100644
--- a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu
+++ b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu
@@ -1,12 +1,482 @@
 #include <ATen/ATen.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
-namespace at {
-namespace native {
+#include <ATen/div_rtn.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDABlas.h>
+#include <ATen/native/ConvUtils.h>
+#include <ATen/native/Resize.h>
+#include <ATen/native/cuda/im2col.cuh>
 
-std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_backward_out_cuda(const Tensor& grad_output,
-    const Tensor& self,
-    const Tensor& weight,
+namespace at { namespace native {
+namespace {
+
+void slow_conv2d_shape_check(
+    const Tensor& input, const Tensor& grad_output,
+    const Tensor& weight, const Tensor& bias,
+    int64_t kH, int64_t kW,
+    int64_t dH, int64_t dW,
+    int64_t padH, int64_t padW,
+    bool weight_nullable) {
+  TORCH_CHECK(kW > 0 && kH > 0,
+              "kernel size should be greater than zero, but got kH: ", kH, " kW: ", kW);
+  TORCH_CHECK(dW > 0 && dH > 0,
+              "stride should be greater than zero, but got dH: ", dH, " dW: ", dW);
+
+  TORCH_CHECK(weight_nullable || weight.defined(),
+              "weight tensor is expected to be non-nullable");
+  TORCH_CHECK(!weight.defined() ||
+              ((weight.numel() > 0) && (weight.dim() == 2)),
+              "non-empty 2D weight tensor expected, but got: ", weight.sizes());
+  TORCH_CHECK(!bias.defined() || (bias.dim() == 1 && bias.sizes()[0] == weight.sizes()[0]),
+              "Expected bias to have shape [", weight.sizes()[0], "] but got ", bias.sizes());
+
+  const auto in_sizes = input.sizes();
+  constexpr int ndim = 4;
+  constexpr int dimf = 1;
+  constexpr int dimh = 2;
+  constexpr int dimw = 3;
+  TORCH_CHECK(in_sizes.size() == ndim, "Expected 4D input tensor, but got ", in_sizes);
+
+  // Allow for empty batch size but not other dimensions
+  const bool valid_empty = c10::multiply_integers(in_sizes.slice(1)) != 0;
+  TORCH_CHECK(valid_empty, "non-empty input tensor expected but got: ", in_sizes);
+
+  int64_t inputHeight = in_sizes[dimh];
+  int64_t inputWidth = in_sizes[dimw];
+
+  int64_t exactInputHeight = inputHeight + 2 * padH;
+  int64_t exactInputWidth = inputWidth + 2 * padW;
+
+  TORCH_CHECK(exactInputHeight >= kH && exactInputWidth >= kW,
+              "Calculated padded input size per channel: ",
+              IntArrayRef{exactInputHeight, exactInputWidth},
+              ". Kernel size: ", IntArrayRef{kH, kW},
+              ". Kernel size can't be greater than actual input size");
+
+  // NOTE: can't use conv_output_size if the weight isn't defined
+  auto outputHeight = div_rtn<int64_t>(exactInputHeight - kH, dH) + 1;
+  auto outputWidth = div_rtn<int64_t>(exactInputWidth - kW, dW) + 1;
+
+  TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
+              "Given input size per channel: ",
+              IntArrayRef{inputHeight, inputWidth},
+              ". Calculated output size per channel: ",
+              IntArrayRef{outputHeight, outputWidth},
+              ". Output size is too small");
+
+  if (weight.defined()) {
+    const auto w_sizes = weight.sizes();
+    int64_t nInputPlane = w_sizes[1];
+    if (w_sizes.size() == 2) {
+      nInputPlane /= (kH * kW);
+    }
+    TORCH_CHECK(in_sizes[dimf] == nInputPlane,
+                "Expected input dim ", dimf, " to have size ", nInputPlane,
+                " but got ", in_sizes[dimf]);
+  }
+
+  if (grad_output.defined()) {
+    const auto gO_sizes = grad_output.sizes();
+    TORCH_CHECK(gO_sizes.size() == ndim,
+                "Expected grad_output to have ", ndim,
+                " dimensions but got shape", gO_sizes);
+
+    if (weight.defined()) {
+      const auto w_sizes = weight.sizes();
+      TORCH_CHECK(gO_sizes[dimf] == w_sizes[0],
+                  "Expected  dim ", dimf, " to have size ", w_sizes[0],
+                  " but got ", gO_sizes[dimf]);
+    } else if (bias.defined()) {
+      const auto b_sizes = bias.sizes();
+      int64_t nOutputPlane = b_sizes.size() == 0 ? 1 : b_sizes[0];
+      TORCH_CHECK(gO_sizes[dimf] == nOutputPlane,
+                  "Expected grad_output dim ", dimf, " to have size ",
+                  nOutputPlane, " but got ", gO_sizes[dimf]);
+    }
+    TORCH_CHECK(gO_sizes[dimh] == outputHeight,
+                "Expected grad_output dim ", dimh, " to have size ",
+                outputHeight, " but got ", gO_sizes[dimh]);
+    TORCH_CHECK(gO_sizes[dimw] == outputWidth,
+                "Expected grad_output dim ", dimw, " to have size ",
+                outputWidth, " but got ", gO_sizes[dimw]);
+  }
+}
+
+Tensor new_view_weight_MM2d(const Tensor& weight_) {
+  auto weight = weight_.expect_contiguous();
+  const auto w_sizes = weight->sizes();
+  TORCH_CHECK(w_sizes.size() == 4);
+  int64_t s1 = w_sizes[0];
+  int64_t s2 = c10::multiply_integers(w_sizes.slice(1));
+  return weight->view({s1, s2});
+}
+
+void slow_conv2d_forward(
+           const Tensor &input,
+           const Tensor &output,
+           const Tensor &weight_,
+           const Tensor &bias,
+           const Tensor &columns,
+           const Tensor &ones_,
+           int64_t kH, int64_t kW,
+           int64_t dH, int64_t dW,
+           int64_t padH, int64_t padW) {
+  auto weight = new_view_weight_MM2d(weight_);
+  slow_conv2d_shape_check(
+      input, {}, weight, bias, kH, kW, dH, dW, padH, padW, /*weight_nullable*/false);
+
+  TORCH_CHECK(!bias.defined() || bias.is_contiguous(),
+              "bias tensor has to be contiguous");
+
+  constexpr int ndim = 4;
+  constexpr int dimf = 1;
+  constexpr int dimh = 2;
+  constexpr int dimw = 3;
+
+  auto in_sizes = input.sizes();
+  int64_t batchSize = in_sizes[0];
+  int64_t nInputPlane  = in_sizes[dimf];
+  int64_t inputHeight  = in_sizes[dimh];
+  int64_t inputWidth   = in_sizes[dimw];
+  int64_t nOutputPlane = weight.sizes()[0];
+  int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+  int64_t outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+
+  // Resize output
+  resize_output(output, {batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  // Resize temporary columns
+  resize_output(columns, {nInputPlane * kW * kH, outputHeight * outputWidth});
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones.
+  Tensor ones;
+  if (bias.defined()) {
+    ones = at::ones({outputHeight, outputWidth}, input.options());
+  }
+  const bool requires_columns = (
+      kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0);
+
+  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
+                                  "slow_conv2d_cuda", [&] {
+    // For each elt in batch, do:
+    for (int elt = 0; elt < batchSize; elt ++) {
+      // Matrix mulitply per output:
+      auto input_n = input.select(0, elt);
+      auto output_n = output.select(0, elt);
+
+      // Do Bias first:
+      // M,N,K are dims of matrix A and B
+      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+      int64_t m_ = nOutputPlane;
+      int64_t n_ = outputHeight * outputWidth;
+      int64_t k_ = 1;
+
+      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+      if (bias.defined()) {
+        at::cuda::blas::gemm(
+            't', 'n',
+            n_, m_, k_,
+            scalar_t(1),
+            ones.data_ptr<scalar_t>(), k_,
+            bias.data_ptr<scalar_t>(), k_,
+            scalar_t(0),
+            output_n.data_ptr<scalar_t>(), n_
+        );
+      } else {
+        output_n.zero_();
+      }
+
+      if (requires_columns) {
+        // Extract columns:
+        at::native::im2col(
+          c10::cuda::getCurrentCUDAStream(),
+          input_n.data_ptr<scalar_t>(),
+          nInputPlane, inputHeight, inputWidth,
+          outputHeight, outputWidth,
+          kH, kW, padH, padW, dH, dW,
+          1, 1,
+          columns.data_ptr<scalar_t>()
+        );
+      }
+
+      // M,N,K are dims of matrix A and B
+      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+      int64_t m = nOutputPlane;
+      int64_t n = columns.size(1);
+      int64_t k = nInputPlane*kH*kW;
+
+      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+      auto gemm_in_ptr = requires_columns ?
+          columns.data_ptr<scalar_t>() :
+          input_n.data_ptr<scalar_t>();
+      at::cuda::blas::gemm(
+          'n', 'n',
+          n, m, k,
+          scalar_t(1),
+          gemm_in_ptr, n,
+          weight.data_ptr<scalar_t>(), k,
+          scalar_t(1),
+          output_n.data_ptr<scalar_t>(), n
+      );
+    }
+  });
+}
+
+void slow_conv2d_backward(
+    const Tensor &input,
+    const Tensor &grad_output,
+    const Tensor &grad_input,
+    const Tensor &weight_,
+    const Tensor &grad_columns,
+    const Tensor &ones,
+    int kH, int kW,
+    int dH, int dW,
+    int padH, int padW) {
+  Tensor weight = new_view_weight_MM2d(weight_);
+  slow_conv2d_shape_check(input, grad_output, weight, {},
+                          kH, kW, dH, dW, padH, padW, /*weight_nullable=*/false);
+
+  // Params
+  auto weight_sizes = weight.sizes();
+  int nInputPlane = weight_sizes[1]/(kW*kH);
+  int nOutputPlane = weight_sizes[0];
+
+  TORCH_INTERNAL_ASSERT(grad_output.is_contiguous());
+
+  auto input_sizes = input.sizes();
+  int64_t inputWidth   = input_sizes[3];
+  int64_t inputHeight  = input_sizes[2];
+  auto output_sizes = grad_output.sizes();
+  int64_t outputWidth  = output_sizes[3];
+  int64_t outputHeight = output_sizes[2];
+
+  // Batch size + input planes
+  int64_t batchSize = input_sizes[0];
+
+  // Resize output
+  resize_output(grad_input, input_sizes);
+  TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
+
+  // Resize temporary columns
+  resize_output(grad_columns, {nInputPlane*kW*kH, outputHeight*outputWidth});
+  TORCH_CHECK(grad_columns.is_contiguous(), "grad_columns must be contiguous");
+
+  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
+                                  "slow_conv2d_backward_cuda", [&] {
+    // For each elt in batch, do:
+    for (int elt = 0; elt < batchSize; elt ++) {
+      // Matrix mulitply per sample:
+      auto grad_input_n = grad_input.select(0, elt);
+      auto grad_output_n = grad_output.select(0, elt);
+
+      // M,N,K are dims of matrix A and B
+      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+      int64_t m = nInputPlane*kW*kH;
+      int64_t n = grad_columns.sizes()[1];
+      int64_t k = nOutputPlane;
+
+      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+      at::cuda::blas::gemm<scalar_t>(
+          'n', 't',
+          n, m, k,
+          scalar_t(1),
+          grad_output_n.data_ptr<scalar_t>(), n,
+          weight.data_ptr<scalar_t>(), m,
+          scalar_t(0),
+          grad_columns.data_ptr<scalar_t>(), n
+      );
+
+      // Unpack columns back into input:
+      using acc_t = at::acc_type<scalar_t, true>;
+      at::native::col2im<scalar_t, acc_t>(
+        c10::cuda::getCurrentCUDAStream(),
+        grad_columns.data_ptr<scalar_t>(),
+        nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+        1, 1, grad_input_n.data_ptr<scalar_t>()
+      );
+    }
+  });
+}
+
+void slow_conv2d_grad_weight_bias(
+           const Tensor &input,
+           const Tensor &grad_output,
+           const Tensor &grad_weight_,
+           const Tensor &grad_bias,
+           const Tensor &columns,
+           const Tensor &ones,
+           int64_t kH, int64_t kW,
+           int64_t dH, int64_t dW,
+           int64_t padH, int64_t padW) {
+  if (grad_weight_.defined()) {
+    TORCH_CHECK(grad_weight_.is_contiguous(), "grad_weight needs to be contiguous");
+  }
+  if (grad_bias.defined()) {
+    TORCH_CHECK(grad_bias.is_contiguous(), "grad_bias needs to be contiguous");
+    TORCH_CHECK(ones.is_contiguous(), "ones needs to be contiguous");
+  }
+
+  auto grad_weight = new_view_weight_MM2d(grad_weight_);
+  slow_conv2d_shape_check(input, grad_output, grad_weight, grad_bias,
+                          kH, kW, dH, dW, padH, padW, /*weight_nullable=*/true);
+
+  // Params
+  TORCH_INTERNAL_ASSERT(input.is_contiguous());
+  TORCH_INTERNAL_ASSERT(grad_output.is_contiguous());
+
+  auto input_sizes = input.sizes();
+  int64_t nInputPlane = input_sizes[1];
+  int64_t nOutputPlane = grad_output.sizes()[1];
+
+  int64_t inputWidth   = input_sizes[3];
+  int64_t inputHeight  = input_sizes[2];
+  int64_t outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+  int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+
+  // Batch size + input planes
+  int64_t batchSize = input_sizes[0];
+
+  // Define a buffer of ones, for bias accumulation
+  if (ones.defined() && ones.numel() < outputHeight * outputWidth) {
+    ones.resize_({outputHeight, outputWidth});
+    ones.fill_(1);
+  }
+
+  // Resize temporary columns
+  resize_output(columns, {nInputPlane * kH * kW, outputHeight * outputWidth});
+
+  const bool requires_columns = (
+      kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0);
+
+  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
+                                  "slow_conv2d_grad_weight_bias_cuda", [&] {
+    // For each elt in batch, do:
+    for (int elt = 0; elt < batchSize; elt ++) {
+      // Matrix mulitply per output:
+      auto grad_output_n = grad_output.select(0, elt);
+
+      // Do Weight:
+      if (grad_weight.defined()) {
+        // Matrix mulitply per output:
+        auto input_n = input.select(0, elt);
+
+        if (requires_columns) {
+          // Extract columns:
+          at::native::im2col<scalar_t>(
+            c10::cuda::getCurrentCUDAStream(),
+            input_n.data_ptr<scalar_t>(),
+            nInputPlane, inputHeight, inputWidth,
+            outputHeight, outputWidth,
+            kH, kW, padH, padW, dH, dW,
+            1, 1,
+            columns.data_ptr<scalar_t>()
+          );
+        }
+
+        // M,N,K are dims of matrix A and B
+        // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+        int64_t m = nOutputPlane;
+        int64_t n = nInputPlane*kW*kH;
+        int64_t k = columns.sizes()[1];
+
+        // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+        auto gemm_in_ptr = requires_columns ?
+            columns.data_ptr<scalar_t>() :
+            input_n.data_ptr<scalar_t>();
+        at::cuda::blas::gemm(
+            't', 'n',
+            n, m, k,
+            scalar_t(1),
+            gemm_in_ptr, k,
+            grad_output_n.data_ptr<scalar_t>(), k,
+            scalar_t(1),
+            grad_weight.data_ptr<scalar_t>(), n
+        );
+      }
+
+      // Do Bias:
+      if (grad_bias.defined()) {
+        // M,N,K are dims of matrix A and B
+        // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+        int64_t m_ = nOutputPlane;
+        int64_t k_ = outputHeight * outputWidth;
+
+        // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+        at::cuda::blas::gemv(
+            't',
+            k_, m_,
+            scalar_t(1),
+            grad_output_n.data_ptr<scalar_t>(), k_,
+            ones.data_ptr<scalar_t>(), 1,
+            scalar_t(1),
+            grad_bias.data_ptr<scalar_t>(), 1
+        );
+      }
+    }
+  });
+}
+
+}  // namespace (anonymous)
+
+
+std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_forward_out_cuda(
+    const Tensor &self_,
+    const Tensor &weight_,
+    IntArrayRef kernel_size,
+    const c10::optional<Tensor> &bias_,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    Tensor &output,
+    Tensor &finput,
+    Tensor &fgrad_input) {
+  TORCH_CHECK(kernel_size.size() == 2);
+  TORCH_CHECK(stride.size() == 2);
+  TORCH_CHECK(padding.size() == 2);
+
+  auto self = self_.expect_contiguous();
+  auto weight = weight_.expect_contiguous();
+  auto bias = [&] {
+    if (bias_.has_value() && bias_->defined()) {
+      return bias_->expect_contiguous();
+    }
+    return MaybeOwned<Tensor>::owned(c10::in_place);
+  }();
+
+  slow_conv2d_forward(
+      *self,
+      output,
+      *weight,
+      *bias,
+      finput,
+      fgrad_input,
+      kernel_size[0], kernel_size[1],
+      stride[0], stride[1],
+      padding[0], padding[1]
+    );
+  return std::tuple<Tensor&, Tensor&, Tensor&>{
+      output, finput, fgrad_input};
+}
+
+std::tuple<Tensor, Tensor, Tensor> slow_conv2d_forward_cuda(
+    const Tensor &self,
+    const Tensor &weight,
+    IntArrayRef kernel_size,
+    const c10::optional<Tensor> &bias,
+    IntArrayRef stride,
+    IntArrayRef padding) {
+  auto output = at::empty({0}, self.options());
+  auto finput = at::empty({0}, self.options());
+  auto fgrad_input = at::empty({0}, self.options());
+  return slow_conv2d_forward_out_cuda(
+      self, weight, kernel_size, bias, stride, padding, output, finput, fgrad_input);
+}
+
+std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_backward_out_cuda(
+    const Tensor& grad_output_,
+    const Tensor& self_,
+    const Tensor& weight_,
     IntArrayRef kernel_size,
     IntArrayRef stride,
     IntArrayRef padding,
@@ -16,17 +486,42 @@ std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_backward_out_cuda(const Tensor
     Tensor& grad_weight,
     Tensor& grad_bias) {
   if (grad_weight.defined()) {
-    grad_weight.resize_(weight.sizes());
+    resize_output(grad_weight, weight_.sizes());
     grad_weight.zero_();
   }
   if (grad_bias.defined()) {
-    grad_bias.resize_({ weight.size(0) });
+    resize_output(grad_bias, {weight_.sizes()[0]});
     grad_bias.zero_();
   }
-  return legacy::cuda::_thnn_conv2d_backward_out(grad_input, grad_weight, grad_bias,
-                                                 grad_output, self, weight,
-                                                 kernel_size, stride, padding,
-                                                 finput, fgrad_input);
+  auto grad_output = grad_output_.expect_contiguous();
+  if (grad_input.defined()) {
+    resize_output(grad_input, self_.sizes());
+    auto weight = weight_.expect_contiguous();
+
+    slow_conv2d_backward(
+        self_, *grad_output,
+        grad_input, *weight,
+        finput, fgrad_input,
+        kernel_size[0], kernel_size[1],
+        stride[0], stride[1],
+        padding[0], padding[1]);
+  }
+  if (grad_weight.defined() || grad_bias.defined()) {
+    auto self = self_.expect_contiguous();
+    slow_conv2d_grad_weight_bias(
+        *self,
+        *grad_output,
+        grad_weight,
+        grad_bias,
+        finput,
+        fgrad_input,
+        kernel_size[0], kernel_size[1],
+        stride[0], stride[1],
+        padding[0], padding[1]
+      );
+  }
+  return std::tuple<Tensor&, Tensor&, Tensor&>{
+      grad_input, grad_weight, grad_bias};
 }
 
 std::tuple<Tensor, Tensor, Tensor> slow_conv2d_backward_cuda(
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 40245cc7607af..9bce764b1ee1a 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -9498,13 +9498,13 @@
   python_module: nn
   dispatch:
     CPU: slow_conv2d_forward_out_cpu
-    CUDA: legacy::cuda::_thnn_conv2d_forward_out
+    CUDA: slow_conv2d_forward_out_cuda
 
 - func: thnn_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> (Tensor output, Tensor finput, Tensor fgrad_input)
   python_module: nn
   dispatch:
     CPU: slow_conv2d_forward_cpu
-    CUDA: legacy::cuda::_thnn_conv2d_forward
+    CUDA: slow_conv2d_forward_cuda
 
 - func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
   python_module: nn
diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt
index 55197277b3779..f84005e7e92f6 100644
--- a/aten/src/THCUNN/CMakeLists.txt
+++ b/aten/src/THCUNN/CMakeLists.txt
@@ -1,5 +1,4 @@
 set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS}
-${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionMM.cu
 PARENT_SCOPE)
 
 set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE}
diff --git a/aten/src/THCUNN/SpatialConvolutionMM.cu b/aten/src/THCUNN/SpatialConvolutionMM.cu
deleted file mode 100644
index 020bfa1ebf8ce..0000000000000
--- a/aten/src/THCUNN/SpatialConvolutionMM.cu
+++ /dev/null
@@ -1,13 +0,0 @@
-#include <THCUNN/THCUNN.h>
-#include <THC/THCTensor.hpp>
-#include <THCUNN/common.h>
-#include <ATen/native/cuda/im2col.cuh>
-
-#include <TH/THHalf.h>
-#include <THC/THCNumerics.cuh>
-
-#include <THCUNN/generic/SpatialConvolutionMM.cu>
-#include <THC/THCGenerateFloatTypes.h>
-
-#include <THCUNN/generic/SpatialConvolutionMM.cu>
-#include <THC/THCGenerateBFloat16Type.h>
diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu
deleted file mode 100644
index af492b3e7da02..0000000000000
--- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu
+++ /dev/null
@@ -1,499 +0,0 @@
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "THCUNN/generic/SpatialConvolutionMM.cu"
-#else
-
-#include <ATen/div_rtn.h>
-#include <ATen/cuda/CUDABlas.h>
-
-static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
-                         THCState *state,
-                         THCTensor *input, THCTensor *gradOutput,
-                         THCTensor *weight, THCTensor *bias,
-                         int kH, int kW, int dH, int dW, int padH, int padW,
-                         int weight_nullable) {
-  THArgCheck(kW > 0 && kH > 0, 9,
-             "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
-  THArgCheck(dW > 0 && dH > 0, 11,
-             "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
-
-  if (weight != NULL) {
-    THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight,
-                    "non-empty 2D or 4D weight tensor expected, but got: %s");
-    if (bias != NULL) {
-      THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0));
-    }
-  } else if (!weight_nullable) {
-    THError("weight tensor is expected to be non-nullable");
-  }
-
-  int ndim = input->dim();
-  int dimf = 0;
-  int dimh = 1;
-  int dimw = 2;
-
-  if (ndim == 4) {
-    dimf++;
-    dimh++;
-    dimw++;
-  }
-
-  // Allow for empty batch size but not other dimensions
-  bool valid_empty = false;
-  if (ndim == 3) {
-    valid_empty = input->size(0) == 0 && input->size(1) != 0 && input->size(2) != 0;
-  } else if (ndim == 4) {
-    valid_empty = input->size(0) == 0 && input->size(1) != 0 && input->size(2) != 0 && input->size(3) != 0;
-  }
-
-
-  THCUNN_argCheck(state, (!input->is_empty() || valid_empty) && (ndim == 3 || ndim == 4), 2, input,
-                  "non-empty 3D or 4D input tensor expected but got: %s");
-
-  int64_t inputHeight  = input->size(dimh);
-  int64_t inputWidth   = input->size(dimw);
-
-  int64_t exactInputHeight = inputHeight + 2 * padH;
-  int64_t exactInputWidth = inputWidth + 2 * padW;
-
-  if (exactInputHeight < kH || exactInputWidth < kW) {
-    THError("Calculated padded input size per channel: (%ld x %ld). "
-      "Kernel size: (%d x %d). Kernel size can't be greater than actual input size",
-      exactInputHeight, exactInputWidth, kH, kW);
-  }
-
-  int64_t outputHeight = div_rtn<int64_t>(exactInputHeight - kH, dH) + 1;
-  int64_t outputWidth  = div_rtn<int64_t>(exactInputWidth - kW, dW) + 1;
-
-  if (outputWidth < 1 || outputHeight < 1) {
-    THError("Given input size per channel: (%ld x %ld). "
-      "Calculated output size per channel: (%ld x %ld). Output size is too small",
-      inputHeight, inputWidth, outputHeight, outputWidth);
-  }
-
-  if (weight != NULL) {
-    int64_t nInputPlane = weight->size(1);
-    if (weight->dim() == 2) {
-      nInputPlane /= (kH * kW);
-    }
-    THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane);
-  }
-
-  if (gradOutput != NULL) {
-    if (weight != NULL) {
-      int64_t nOutputPlane = weight->size(0);
-      THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane);
-    } else if (bias != NULL) {
-      int64_t nOutputPlane = bias->dim() == 0 ? 1 : bias->size(0);
-      THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane);
-    }
-    THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight);
-    THCUNN_check_dim_size(state, gradOutput, ndim, dimw, outputWidth);
-  }
-}
-
-static THCTensor* THNN_(newViewWeightMM2d)(THCState *state, THCTensor *weight) {
-  weight = THCTensor_(newContiguous)(state, weight);
-  if (weight->dim() == 4) {
-    int64_t s1 = weight->size(0);
-    int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3);
-    THCTensor *old_weight = weight;
-    weight = THTensor_wrap(weight).view({s1, s2}).unsafeReleaseTensorImpl();
-    THCTensor_(free)(state, old_weight);
-  }
-  return weight;
-}
-
-void THNN_(SpatialConvolutionMM_updateOutput)(
-           THCState *state,
-           THCTensor *input,
-           THCTensor *output,
-           THCTensor *weight,
-           THCTensor *bias,
-           THCTensor *columns,
-           THCTensor *ones,
-           int kW, int kH,
-           int dW, int dH,
-           int padW, int padH) {
-  THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones);
-  if (bias) {
-    THCUNN_assertSameGPU(state, 2, weight, bias);
-  }
-  weight = THNN_(newViewWeightMM2d)(state, weight);
-  THNN_(SpatialConvolutionMM_shapeCheck)
-       (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, 0);
-  THArgCheck(!bias || THCTensor_(isContiguous)(state, bias), 5,
-             "bias tensor has to be contiguous");
-
-  int ndim = input->dim();
-  int dimf = 0;
-  int dimh = 1;
-  int dimw = 2;
-
-  if (ndim == 4) {
-    dimf++;
-    dimh++;
-    dimw++;
-  }
-
-  int64_t nInputPlane = input->size(dimf);
-  int64_t inputHeight  = input->size(dimh);
-  int64_t inputWidth   = input->size(dimw);
-  int64_t nOutputPlane = weight->size(0);
-  int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-  int64_t outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
-
-
-  input = THCTensor_(newContiguous)(state, input);
-  int is_batch = 1;
-  if (input->dim() == 3) {
-    // Force batch
-    is_batch = 0;
-    THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2));
-  }
-
-  // Batch size + input planes
-  int64_t batchSize = input->size(0);
-
-  // Resize output
-  THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
-
-  // Resize temporary columns
-  THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Define a buffer of ones, for bias accumulation
-  // Note: this buffer can be shared with other modules, it only ever gets increased,
-  // and always contains ones.
-  if (bias) {
-    if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) {
-      // Resize plane and fill with ones...
-      THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
-      THCTensor_(fill)(state, ones, ScalarConvert<int, scalar_t>::to(1));
-    }
-  }
-
-  // Helpers
-  THCTensor *input_n = THCTensor_(new)(state);
-  THCTensor *output_n = THCTensor_(new)(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCTensor_(select)(state, input_n, input, 0, elt);
-    THCTensor_(select)(state, output_n, output, 0, elt);
-
-    // Do Bias first:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    int64_t m_ = nOutputPlane;
-    int64_t n_ = outputHeight * outputWidth;
-    int64_t k_ = 1;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    if (bias) {
-      at::cuda::blas::gemm<scalar_t>(
-          't', 'n',
-          n_, m_, k_,
-          ScalarConvert<int, scalar_t>::to(1),
-          THCTensor_(data)(state, ones), k_,
-          THCTensor_(data)(state, bias), k_,
-          ScalarConvert<int, scalar_t>::to(0),
-          THCTensor_(data)(state, output_n), n_
-      );
-    } else {
-      THCTensor_(zero)(state, output_n);
-    }
-
-    if (kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0) {
-      // Extract columns:
-      at::native::im2col<scalar_t>(
-        c10::cuda::getCurrentCUDAStream(),
-        THCTensor_(data)(state, input_n),
-        nInputPlane, inputHeight, inputWidth,
-        outputHeight, outputWidth,
-        kH, kW, padH, padW, dH, dW,
-        1, 1,
-        columns->data<scalar_t>()
-      );
-    }
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    int64_t m = nOutputPlane;
-    int64_t n = columns->size(1);
-    int64_t k = nInputPlane*kH*kW;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    auto gemm_in_ptr =
-        (kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0)
-        ? THCTensor_(data)(state, columns)
-        : THCTensor_(data)(state, input_n);
-    at::cuda::blas::gemm<scalar_t>(
-        'n', 'n',
-        n, m, k,
-        ScalarConvert<int, scalar_t>::to(1),
-        gemm_in_ptr, n,
-        THCTensor_(data)(state, weight), k,
-        ScalarConvert<int, scalar_t>::to(1),
-        THCTensor_(data)(state, output_n), n
-    );
-  }
-
-  // Free
-  THCTensor_(free)(state, input_n);
-  THCTensor_(free)(state, output_n);
-
-  // Resize output
-  if (is_batch == 0) {
-    THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
-    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
-  }
-
-  THCTensor_(free)(state, input);
-  THCTensor_(free)(state, weight);
-}
-
-void THNN_(SpatialConvolutionMM_updateGradInput)(
-           THCState *state,
-           THCTensor *input,
-           THCTensor *gradOutput,
-           THCTensor *gradInput,
-           THCTensor *weight,
-           THCTensor *gradColumns,
-           THCTensor *ones,
-           int kW, int kH,
-           int dW, int dH,
-           int padW, int padH) {
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
-                       gradColumns, gradInput);
-  weight = THNN_(newViewWeightMM2d)(state, weight);
-
-  THNN_(SpatialConvolutionMM_shapeCheck)
-       (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, 0);
-
-  // Params
-  int nInputPlane = weight->dim() == 2 ? weight->size(1)/(kW*kH) : weight->size(1);
-  int nOutputPlane = weight->size(0);
-
-  input = THCTensor_(newContiguous)(state, input);
-  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
-
-  int is_batch = 1;
-  if (input->dim() == 3) {
-    // Force batch
-    is_batch = 0;
-    THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2));
-    THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2));
-  }
-
-  int64_t inputWidth   = input->size(3);
-  int64_t inputHeight  = input->size(2);
-  int64_t outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
-  int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
-  // Batch size + input planes
-  int64_t batchSize = input->size(0);
-
-  // Resize output
-  THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
-
-  // Resize temporary columns
-  THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Helpers
-  THCTensor *gradInput_n = THCTensor_(new)(state);
-  THCTensor *gradOutput_n = THCTensor_(new)(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per sample:
-    THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
-    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    int64_t m = nInputPlane*kW*kH;
-    int64_t n = gradColumns->size(1);
-    int64_t k = nOutputPlane;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    at::cuda::blas::gemm<scalar_t>(
-        'n', 't',
-        n, m, k,
-        ScalarConvert<int, scalar_t>::to(1),
-        THCTensor_(data)(state, gradOutput_n), n,
-        THCTensor_(data)(state, weight), m,
-        ScalarConvert<int, scalar_t>::to(0),
-        THCTensor_(data)(state, gradColumns), n
-    );
-
-    // Unpack columns back into input:
-    at::native::col2im<scalar_t, accreal>(
-      c10::cuda::getCurrentCUDAStream(),
-      THCTensor_(data)(state, gradColumns),
-      nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
-      1, 1, THCTensor_(data)(state, gradInput_n)
-    );
-  }
-
-  // Free
-  THCTensor_(free)(state, gradInput_n);
-  THCTensor_(free)(state, gradOutput_n);
-  THCTensor_(free)(state, weight);
-
-  // Resize output
-  if (is_batch == 0) {
-    THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
-    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
-    THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth);
-  }
-
-  THCTensor_(free)(state, input);
-  THCTensor_(free)(state, gradOutput);
-}
-
-void THNN_(SpatialConvolutionMM_accGradParameters)(
-           THCState *state,
-           THCTensor *input,
-           THCTensor *gradOutput,
-           THCTensor *gradWeight,
-           THCTensor *gradBias,
-           THCTensor *columns,
-           THCTensor *ones,
-           int kW, int kH,
-           int dW, int dH,
-           int padW, int padH,
-           accreal scale_) {
-  scalar_t scale = ScalarConvert<accreal, scalar_t>::to(scale_);
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, gradBias, columns, ones);
-  if (gradWeight) {
-    THArgCheck(THCTensor_(isContiguous)(state, gradWeight), 4, "gradWeight needs to be contiguous");
-    gradWeight = THNN_(newViewWeightMM2d)(state, gradWeight);
-  }
-  if (gradBias) {
-    THArgCheck(THCTensor_(isContiguous)(state, gradBias), 5, "gradBias needs to be contiguous");
-    THArgCheck(THCTensor_(isContiguous)(state, ones), 7, "ones needs to be contiguous");
-  }
-
-  THNN_(SpatialConvolutionMM_shapeCheck)
-       (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, 1);
-
-  // Params
-  input = THCTensor_(newContiguous)(state, input);
-  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
-
-  int is_batch = 1;
-  if (input->dim() == 3) {
-    // Force batch
-    is_batch = 0;
-    THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2));
-    THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2));
-  }
-
-  int64_t nInputPlane = input->size(1);
-  int64_t nOutputPlane = gradOutput->size(1);
-
-  int64_t inputWidth   = input->size(3);
-  int64_t inputHeight  = input->size(2);
-  int64_t outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
-  int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
-  // Batch size + input planes
-  int64_t batchSize = input->size(0);
-
-  // Define a buffer of ones, for bias accumulation
-  if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
-    THCTensor_(fill)(state, ones, ScalarConvert<int, scalar_t>::to(1));
-  }
-
-  // Resize temporary columns
-  THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Helpers
-  THCTensor *input_n = THCTensor_(new)(state);
-  THCTensor *gradOutput_n = THCTensor_(new)(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
-
-    // Do Weight:
-    if (gradWeight) {
-      // Matrix mulitply per output:
-      THCTensor_(select)(state, input_n, input, 0, elt);
-
-      if (kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0) {
-        // Extract columns:
-        at::native::im2col<scalar_t>(
-          c10::cuda::getCurrentCUDAStream(),
-          THCTensor_(data)(state, input_n),
-          nInputPlane, inputHeight, inputWidth,
-          outputHeight, outputWidth,
-          kH, kW, padH, padW, dH, dW,
-          1, 1,
-          columns->data<scalar_t>()
-        );
-      }
-
-      // M,N,K are dims of matrix A and B
-      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-      int64_t m = nOutputPlane;
-      int64_t n = nInputPlane*kW*kH;
-      int64_t k = columns->size(1);
-
-      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-      auto gemm_in_ptr =
-          (kW != 1 || kH != 1 || dW != 1 || dH != 1 || padH != 0 || padW != 0)
-          ? THCTensor_(data)(state, columns)
-          : THCTensor_(data)(state, input_n);
-      at::cuda::blas::gemm<scalar_t>(
-          't', 'n',
-          n, m, k,
-          scale,
-          gemm_in_ptr, k,
-          THCTensor_(data)(state, gradOutput_n), k,
-          ScalarConvert<int, scalar_t>::to(1),
-          THCTensor_(data)(state, gradWeight), n
-      );
-    }
-
-    // Do Bias:
-    if (gradBias) {
-      // M,N,K are dims of matrix A and B
-      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-      int64_t m_ = nOutputPlane;
-      int64_t k_ = outputHeight * outputWidth;
-
-      // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
-      //#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_BFLOAT16)
-      at::cuda::blas::gemv<scalar_t>(
-          't',
-          k_, m_,
-          scale,
-          THCTensor_(data)(state, gradOutput_n), k_,
-          THCTensor_(data)(state, ones), 1,
-          ScalarConvert<int, scalar_t>::to(1),
-          THCTensor_(data)(state, gradBias), 1
-      );
-    }
-  }
-
-  // Free
-  THCTensor_(free)(state, input_n);
-  THCTensor_(free)(state, gradOutput_n);
-  if (gradWeight)
-    THCTensor_(free)(state, gradWeight);
-
-  // Resize
-  if (is_batch == 0) {
-    THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
-    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
-  }
-
-  THCTensor_(free)(state, input);
-  THCTensor_(free)(state, gradOutput);
-}
-
-#endif
diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h
index 87a6105293057..d624fdd090177 100644
--- a/aten/src/THCUNN/generic/THCUNN.h
+++ b/aten/src/THCUNN/generic/THCUNN.h
@@ -26,50 +26,4 @@ TORCH_CUDA_CU_API void THNN_(MultiMarginCriterion_updateGradInput)(
     THCTensor* weights, // [OPTIONAL]
     accreal margin);
 
-TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_updateOutput)(
-    THCState* state,
-    THCTensor* input,
-    THCTensor* output,
-    THCTensor* weight,
-    THCTensor* bias, // [OPTIONAL]
-    THCTensor* columns,
-    THCTensor* ones,
-    int kW,
-    int kH,
-    int dW,
-    int dH,
-    int padW,
-    int padH);
-
-TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_updateGradInput)(
-    THCState* state,
-    THCTensor* input,
-    THCTensor* gradOutput,
-    THCTensor* gradInput,
-    THCTensor* weight,
-    THCTensor* columns,
-    THCTensor* ones,
-    int kW,
-    int kH,
-    int dW,
-    int dH,
-    int padW,
-    int padH);
-
-TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_accGradParameters)(
-    THCState* state,
-    THCTensor* input,
-    THCTensor* gradOutput,
-    THCTensor* gradWeight,
-    THCTensor* gradBias, // [OPTIONAL]
-    THCTensor* columns,
-    THCTensor* ones,
-    int kW,
-    int kH,
-    int dW,
-    int dH,
-    int padW,
-    int padH,
-    accreal scale);
-
 #endif

From 2d58f3f56d7be4eced403454d561179c0d5527c5 Mon Sep 17 00:00:00 2001
From: Akshit Khurana <axit@fb.com>
Date: Fri, 20 Aug 2021 21:08:59 -0700
Subject: [PATCH 122/530] NNAPI: Support const values in binary ops

Summary:
NNAPI converter failed with 1 const value and one tensor earlier
Code suggestions from dreiss

Test Plan:
pytest test/test_nnapi.py::TestNNAPI::test_pointwise_binary

Imported from OSS

Reviewed By: anshuljain1

Differential Revision: D28893881

fbshipit-source-id: 59240373fb03c6fdafa4cb2fa4d8408dd20092f6
---
 test/test_nnapi.py                  | 25 +++++++++++++++++++++++++
 torch/backends/_nnapi/serializer.py | 25 ++++++++++++++++++-------
 2 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/test/test_nnapi.py b/test/test_nnapi.py
index 19efa7f0ae738..d70bebf547a1e 100644
--- a/test/test_nnapi.py
+++ b/test/test_nnapi.py
@@ -49,6 +49,7 @@ def check(
         convert_args=None,
         atol_rtol=None,
         limit=None,
+        expected_memory_format=None
     ):
         with torch.no_grad():
             if isinstance(arg_or_args, torch.Tensor):
@@ -76,6 +77,8 @@ def check(
                     # Too many mismatches.  Re-run the check with no tolerance
                     # to get a nice message.
                     self.assertEqual(eager_output, nnapi_output, atol=0, rtol=0)
+            if expected_memory_format:
+                self.assertTrue(nnapi_out.is_contiguous(memory_format=expected_memory_format))
 
     def float_and_quant_and_nhwc(self, inp_float, scale, zero_point):
         torch.manual_seed(29)
@@ -319,6 +322,28 @@ def forward(self, lhs, rhs):
                             torch.tensor([[3.0, 4.0], [5.0, 6.0]]),
                         ])
 
+    def test_pointwise_binary_const(self):
+        const = torch.randn(1, 4, 6, 6)
+
+        class ArgPlusConst(torch.nn.Module):
+            def forward(self, arg):
+                return arg + const
+
+        class ConstPlusArg(torch.nn.Module):
+            def forward(self, arg):
+                return const + arg
+
+        arg_contig = torch.randn(2, 4, 6, 6)
+        arg_nhwc = nhwc(torch.randn(2, 4, 6, 6))
+
+        for mod_class in [ArgPlusConst, ConstPlusArg]:
+            for use_nhwc in [False, True]:
+                with self.subTest(mod_class=mod_class.__name__, use_nhwc=use_nhwc):
+                    arg = arg_nhwc if use_nhwc else arg_contig
+                    memory_format = torch.channels_last if use_nhwc else torch.contiguous_format
+                    self.check(mod_class(), arg,
+                               expected_memory_format=memory_format)
+
     def test_hardtanh(self):
         inp = torch.tensor([-2.0, -0.5, 0.5, 2.0, 7.0])
         self.check(torch.nn.Hardtanh(), inp)
diff --git a/torch/backends/_nnapi/serializer.py b/torch/backends/_nnapi/serializer.py
index f85d51a040995..a2530df478833 100644
--- a/torch/backends/_nnapi/serializer.py
+++ b/torch/backends/_nnapi/serializer.py
@@ -404,8 +404,8 @@ def add_tensor_operand_for_input(self, arg_idx, jitval, tensor):
                 self.compute_operand_shape(operand_id, dim, f"args[{arg_idx}].shape[{dim}]")
         return operand_id
 
-    def add_tensor_operand_for_weight(self, tensor):
-        toper = self.torch_tensor_to_operand(tensor, DimOrder.UNKNOWN_CONSTANT)
+    def add_tensor_operand_for_weight(self, tensor, dim_order=DimOrder.UNKNOWN_CONSTANT):
+        toper = self.torch_tensor_to_operand(tensor, dim_order)
         operand_id = len(self.operands)
         self.operands.append(toper)
         tsize = tensor_size(toper.op_type, toper.shape)
@@ -418,6 +418,9 @@ def add_tensor_operand_for_weight(self, tensor):
             buf_num,
             offset,
             tsize))
+        # For NHWC NNAPI op, lay out data in the same dim order by permuting torch tensor
+        if dim_order == DimOrder.CHANNELS_LAST:
+            tensor = tensor.permute(0, 2, 3, 1)
         self.used_weights.append(tensor)
         return operand_id
 
@@ -456,6 +459,9 @@ def add_immediate_int_vector(self, value):
             array.array("i", value).tobytes(),
             (len(value),))
 
+    def has_operand_for_jitval(self, jitval):
+        return jitval in self.jitval_operand_map
+
     def get_tensor_operand_by_jitval(self, jitval):
         operand_id = self.jitval_operand_map[jitval]
         return (operand_id, self.operands[operand_id])
@@ -469,11 +475,11 @@ def get_tensor_operand_by_jitval_fixed_size(self, jitval):
                 raise Exception("Flexible size is not supported for this operand.")
         return op_id, oper
 
-    def get_tensor_operand_or_constant(self, jitval):
+    def get_tensor_operand_or_constant(self, jitval, dim_order=DimOrder.PRESUMED_CONTIGUOUS):
         operand_id = self.jitval_operand_map.get(jitval)
         if operand_id is None:
             _, value = self.get_constant_value(jitval, "TensorType")
-            operand_id = self.add_tensor_operand_for_weight(value)
+            operand_id = self.add_tensor_operand_for_weight(value, dim_order)
         return (operand_id, self.operands[operand_id])
 
     def get_tensor_operand_for_weight(self, jitval):
@@ -1233,9 +1239,14 @@ def _do_add_binary(self, node, opcode, fuse_code, *, qparams=None):
         assert node.inputsAt(0).type().kind() == "TensorType"
         assert node.inputsAt(1).type().kind() == "TensorType"
 
-        # TODO: Should support constant as either operand.
-        in0_id, in0_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0))
-        in1_id, in1_oper = self.get_tensor_operand_by_jitval(node.inputsAt(1))
+        if self.has_operand_for_jitval(node.inputsAt(0)):
+            in0_id, in0_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0))
+            in1_id, in1_oper = self.get_tensor_operand_or_constant(node.inputsAt(1), in0_oper.dim_order)
+        elif self.has_operand_for_jitval(node.inputsAt(1)):
+            in1_id, in1_oper = self.get_tensor_operand_by_jitval(node.inputsAt(1))
+            in0_id, in0_oper = self.get_tensor_operand_or_constant(node.inputsAt(0), in1_oper.dim_order)
+        else:
+            raise Exception(f"Can't do a NNAPI binary op: {opcode} on two constants")
 
         assert in0_oper.op_type == in1_oper.op_type
         in0_id, in0_oper, in1_id, in1_oper = self.transpose_for_broadcast(

From b2a601ffe54294100d9967b98cc6576675dcdfcf Mon Sep 17 00:00:00 2001
From: Hao Lu <hlu@fb.com>
Date: Fri, 20 Aug 2021 21:41:19 -0700
Subject: [PATCH 123/530] [Static Runtime] Implement out variant for
 fb::quantized_linear (#63635)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63635

Reviewed By: ajyu

Differential Revision: D30446234

fbshipit-source-id: 1ef014186ff725930a97d0159626f9233ee74030
---
 benchmarks/static_runtime/test_scripts.h      | 11 +++++
 .../static_runtime/test_static_runtime.cc     | 14 ++++++
 torch/csrc/jit/runtime/static/ops.cpp         | 47 +++++++++++++++++++
 3 files changed, 72 insertions(+)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 9946c7af02e5a..73380129731ed 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -719,3 +719,14 @@ const auto append_tensor_script = R"JIT(
       lst.append(a)
       return lst
 )JIT";
+
+const std::string quantize_script = R"IR(
+  graph(%input: Tensor, %weights: Tensor):
+      %scale: float = prim::Constant[value=1.]()
+      %zero_point: int = prim::Constant[value=1]()
+      %bias: None = prim::Constant()
+      %packed_params = quantized::linear_prepack(%weights, %bias)
+      %1254 = quantized::linear(%input, %packed_params, %scale, %zero_point)
+      %1249: Tensor = aten::dequantize(%1254)
+      return (%1249)
+)IR";
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index ec703ef8a2ec1..dfe2c14e9489b 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1172,3 +1172,17 @@ TEST(StaticRuntime, IndividualOps_Append) {
   testStaticRuntime(append_tensor_script, args_tensor);
   testStaticRuntime(append_tensor_script, args_tensor, args_tensor_large);
 }
+
+TEST(StaticRuntime, QuantizedLinear) {
+  at::Tensor weight =
+      at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQInt8);
+  at::Tensor input =
+      at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQUInt8);
+
+  at::Tensor weight_2 =
+      at::quantize_per_tensor(torch::randn({4, 3}), 2, 3, torch::kQInt8);
+  at::Tensor input_2 =
+      at::quantize_per_tensor(torch::randn({4, 3}), 2, 3, torch::kQUInt8);
+
+  testStaticRuntime(quantize_script, {input, weight}, {input_2, weight_2});
+}
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index eef5595cee7b2..2543182db138c 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1528,6 +1528,53 @@ REGISTER_OPERATOR_FUNCTOR(quantized::linear, quantized_linear, [](Node* n) -> SR
   };
 });
 
+REGISTER_OPERATOR_FUNCTOR(
+    fb::quantized_linear,
+    fb_quantized_linear,
+    [](Node* n) -> SROperator {
+      if (!n->matches(torch::schema(
+              "fb::quantized_linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase w_prepack, Tensor Y_scale_i, Tensor Y_zero_point_i) -> Tensor"))) {
+        LogAndDumpSchema(n);
+        return nullptr;
+      }
+      const auto w = toIValue(n->inputs()[1]);
+      c10::intrusive_ptr<LinearPackedParamsBase> packed_weight;
+      if (w) {
+        packed_weight = w->toCustomClass<LinearPackedParamsBase>();
+      }
+      return [packed_weight](ProcessedNode* p_node) {
+        const auto& input = p_node->Input(0).toTensor();
+        const auto output_scale = p_node->Input(2).toTensor().item().toFloat();
+        const auto output_zero_point =
+            p_node->Input(3).toTensor().item().toLong();
+
+        if (p_node->Output(0).isNone()) {
+          p_node->Output(0) = at::native::empty_affine_quantized(
+              {0},
+              c10::kQUInt8,
+              c10::nullopt,
+              c10::kCPU,
+              false,
+              output_scale,
+              output_zero_point,
+              c10::nullopt);
+        }
+        auto& out_t = p_node->Output(0).toTensor();
+        fastResizeToZero(out_t);
+
+        if (packed_weight) {
+          packed_weight->apply_out(
+              input, output_scale, output_zero_point, out_t);
+        } else {
+          // Weights could be quantized on the fly
+          auto packed_weight_tmp =
+              p_node->Input(1).toCustomClass<LinearPackedParamsBase>();
+          packed_weight_tmp->apply_out(
+              input, output_scale, output_zero_point, out_t);
+        }
+      };
+    });
+
 REGISTER_OPERATOR_FUNCTOR(aten::full, aten_full, [](Node* n) -> SROperator {
   if (!n->matches(torch::schema(
           "aten::full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"))) {

From 8871ff29b743948d1225389d5b7068f37b22750b Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@fb.com>
Date: Fri, 20 Aug 2021 22:15:55 -0700
Subject: [PATCH 124/530] [sharded_tensor] add readonly tensor properties
 (#63679)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63679

This PR add read only tensor properties to sharded tensor, to match the torch.Tensor behaviors.

Test Plan: test_sharded_tensor_metadata

Reviewed By: pritamdamania87

Differential Revision: D30459343

fbshipit-source-id: 9aec8ecfe76479eed25f3b843495e5719ed2956d
---
 .../_sharded_tensor/test_sharded_tensor.py    | 43 +++++++++----------
 torch/distributed/_sharded_tensor/api.py      | 29 +++++++++++++
 2 files changed, 50 insertions(+), 22 deletions(-)

diff --git a/test/distributed/_sharded_tensor/test_sharded_tensor.py b/test/distributed/_sharded_tensor/test_sharded_tensor.py
index 5067f301b5595..26a176b1455c1 100644
--- a/test/distributed/_sharded_tensor/test_sharded_tensor.py
+++ b/test/distributed/_sharded_tensor/test_sharded_tensor.py
@@ -174,19 +174,17 @@ def test_sharded_tensor_metadata(self):
         sharded_tensor = _sharded_tensor.empty(spec, 10, 20, init_rrefs=True)
         sharded_tensor_metadata = sharded_tensor.metadata()
         self.assertEqual(torch.Size([10, 20]), sharded_tensor_metadata.size)
-        self.assertEqual(torch.float, sharded_tensor_metadata.dtype)
-        self.assertEqual(torch.strided, sharded_tensor_metadata.layout)
-        self.assertEqual(False, sharded_tensor_metadata.requires_grad)
-        self.assertEqual(torch.contiguous_format, sharded_tensor_metadata.memory_format)
-        self.assertEqual(False, sharded_tensor_metadata.pin_memory)
+        self.assertEqual(torch.float, sharded_tensor.dtype)
+        self.assertEqual(torch.strided, sharded_tensor.layout)
+        self.assertEqual(False, sharded_tensor.requires_grad)
+        self.assertTrue(sharded_tensor.is_contiguous())
+        self.assertFalse(sharded_tensor.is_pinned())
 
         sharded_tensor = _sharded_tensor.empty(spec, 10, 20, requires_grad=True, init_rrefs=True)
-        sharded_tensor_metadata = sharded_tensor.metadata()
-        self.assertEqual(True, sharded_tensor_metadata.requires_grad)
+        self.assertEqual(True, sharded_tensor.requires_grad)
 
         sharded_tensor = _sharded_tensor.empty(spec, 10, 20, dtype=torch.double, init_rrefs=True)
-        sharded_tensor_metadata = sharded_tensor.metadata()
-        self.assertEqual(torch.double, sharded_tensor_metadata.dtype)
+        self.assertEqual(torch.double, sharded_tensor.dtype)
 
         # Need CPU for pin_memory
         spec = ChunkShardingSpec(
@@ -200,8 +198,12 @@ def test_sharded_tensor_metadata(self):
         )
 
         sharded_tensor = _sharded_tensor.empty(spec, 10, 20, pin_memory=True, init_rrefs=True)
-        sharded_tensor_metadata = sharded_tensor.metadata()
-        self.assertEqual(True, sharded_tensor_metadata.pin_memory)
+        self.assertEqual(True, sharded_tensor.is_pinned())
+
+        # test read only properties, they're read only as we can't simply change
+        # the global metadata without changing the underlying shard's properties
+        with self.assertRaisesRegex(AttributeError, "can't set attribute"):
+            sharded_tensor.requires_grad = True
 
     @with_comms
     @skip_if_lt_x_gpu(4)
@@ -782,19 +784,17 @@ def test_sharded_tensor_metadata(self):
         sharded_tensor = _sharded_tensor.empty(spec, 10, 10, init_rrefs=True)
         sharded_tensor_metadata = sharded_tensor.metadata()
         self.assertEqual(torch.Size([10, 10]), sharded_tensor_metadata.size)
-        self.assertEqual(torch.float, sharded_tensor_metadata.dtype)
-        self.assertEqual(torch.strided, sharded_tensor_metadata.layout)
-        self.assertEqual(False, sharded_tensor_metadata.requires_grad)
-        self.assertEqual(torch.contiguous_format, sharded_tensor_metadata.memory_format)
-        self.assertEqual(False, sharded_tensor_metadata.pin_memory)
+        self.assertEqual(torch.float, sharded_tensor.dtype)
+        self.assertEqual(torch.strided, sharded_tensor.layout)
+        self.assertEqual(False, sharded_tensor.requires_grad)
+        self.assertTrue(sharded_tensor.is_contiguous())
+        self.assertFalse(sharded_tensor.is_pinned())
 
         sharded_tensor = _sharded_tensor.empty(spec, 10, 10, requires_grad=True, init_rrefs=True)
-        sharded_tensor_metadata = sharded_tensor.metadata()
-        self.assertEqual(True, sharded_tensor_metadata.requires_grad)
+        self.assertEqual(True, sharded_tensor.requires_grad)
 
         sharded_tensor = _sharded_tensor.empty(spec, 10, 10, dtype=torch.double, init_rrefs=True)
-        sharded_tensor_metadata = sharded_tensor.metadata()
-        self.assertEqual(torch.double, sharded_tensor_metadata.dtype)
+        self.assertEqual(torch.double, sharded_tensor.dtype)
 
         # Need CPU for pin_memory
         spec = EnumerableShardingSpec([
@@ -821,8 +821,7 @@ def test_sharded_tensor_metadata(self):
         ])
 
         sharded_tensor = _sharded_tensor.empty(spec, 10, 10, pin_memory=True, init_rrefs=True)
-        sharded_tensor_metadata = sharded_tensor.metadata()
-        self.assertEqual(True, sharded_tensor_metadata.pin_memory)
+        self.assertTrue(sharded_tensor.is_pinned())
 
     @with_comms
     @skip_if_lt_x_gpu(4)
diff --git a/torch/distributed/_sharded_tensor/api.py b/torch/distributed/_sharded_tensor/api.py
index 2b6720b059a85..5f501b7689e4e 100644
--- a/torch/distributed/_sharded_tensor/api.py
+++ b/torch/distributed/_sharded_tensor/api.py
@@ -551,6 +551,35 @@ def size(self) -> torch.Size:
         """
         return self._metadata.size
 
+    def is_pinned(self) -> bool:
+        """
+        Returns True if the sharded tensor (each local shard) resides in pinned memory.
+        """
+        return self._metadata.pin_memory
+
+    def is_contiguous(self) -> bool:
+        """
+        Returns True if the sharded tensor (each local shard) is contiguous in memory
+        in the order specified by memory format.
+        """
+        return self._metadata.memory_format == torch.contiguous_format
+
+    @property
+    def shape(self):
+        return self._metadata.size
+
+    @property
+    def requires_grad(self):
+        return self._metadata.requires_grad
+
+    @property
+    def dtype(self):
+        return self._metadata.dtype
+
+    @property
+    def layout(self):
+        return self._metadata.layout
+
     def _register_remote_shards(self, remote_shards: List[rpc.RRef[Shard]], rpc_rank: int):
         self._remote_shards[rpc_rank] = remote_shards
 

From 76da46ccdccd2fa06fd17b58edbaca98100be5ed Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Sat, 21 Aug 2021 03:36:09 -0700
Subject: [PATCH 125/530] Revert D30417127: Remove flag to toggle CPU fusion in
 the presence of parallelism

Test Plan: revert-hammer

Differential Revision:
D30417127 (https://github.com/pytorch/pytorch/commit/6600bc96517269c608ea47b76b6bda9476c7bcef)

Original commit changeset: b77d7c68364f

fbshipit-source-id: 6b52fb83a84fe241945e3cb3eeb71050d1d9c8f1
---
 test/cpp/tensorexpr/test_te_fuser_pass.cpp |  6 +++++-
 test/jit/test_profiler.py                  |  3 +++
 test/test_jit_fuser_te.py                  |  5 +++++
 test/test_tensorexpr.py                    |  4 ++++
 torch/csrc/jit/passes/tensorexpr_fuser.cpp | 19 ++++++++++++++++++-
 torch/csrc/jit/passes/tensorexpr_fuser.h   |  2 ++
 torch/csrc/jit/python/init.cpp             |  2 ++
 7 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp
index 91fb4c2b7582c..8dd616453362b 100644
--- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp
+++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp
@@ -15,15 +15,19 @@ namespace jit {
 using namespace torch::jit::tensorexpr;
 
 struct WithCPUFuser {
-  WithCPUFuser(bool val = true) : cpuFuserEnabled(canFuseOnCPU()) {
+  WithCPUFuser(bool val = true)
+      : cpuFuserEnabled(canFuseOnCPU()), parallel(texprParallelCPUEnabled()) {
     overrideCanFuseOnCPU(val);
+    setTexprParallelCPUEnabled(true);
   }
 
   ~WithCPUFuser() {
     overrideCanFuseOnCPU(cpuFuserEnabled);
+    setTexprParallelCPUEnabled(parallel);
   }
 
   bool cpuFuserEnabled;
+  bool parallel;
 };
 
 TEST(TEFuserPass, FuserPass_1) {
diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py
index b9ed9d0b78eb5..aa8be0518385f 100644
--- a/test/jit/test_profiler.py
+++ b/test/jit/test_profiler.py
@@ -29,6 +29,8 @@ def setUp(self):
         torch._C._debug_set_fusion_group_inlining(False)
         self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu()
         torch._C._jit_set_te_must_use_llvm_cpu(False)
+        self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled()
+        torch._C._jit_set_texpr_parallel_cpu_enabled(True)
 
     def tearDown(self):
         torch._C._jit_set_profiling_executor(self.prev_exec)
@@ -40,6 +42,7 @@ def tearDown(self):
         torch._C._jit_set_texpr_reductions_enabled(self.old_reduction_enabled)
         torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining)
         torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu)
+        torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel)
 
     def test_tensor_type_not_determined_by_inputs(self):
         @torch.jit.script
diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index b89caca44a1b2..5e8204a4c7b14 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -85,6 +85,10 @@ def setUp(self):
         self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu()
         torch._C._jit_set_te_must_use_llvm_cpu(False)
 
+        # TODO: CPU fuser currently is disabled when multithreading.
+        self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled()
+        torch._C._jit_set_texpr_parallel_cpu_enabled(True)
+
         self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
         self.int_dtypes = [
             torch.int8,
@@ -112,6 +116,7 @@ def tearDown(self):
 
         torch._C._jit_set_texpr_fuser_enabled(self.texpr_fuser_state)
         torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu)
+        torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel)
 
     def assertLastGraphAllFused(self):
         self.assertAllFused(torch.jit.last_executed_optimized_graph())
diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py
index 47c7e689aa6a4..6353113a1ec4c 100644
--- a/test/test_tensorexpr.py
+++ b/test/test_tensorexpr.py
@@ -24,6 +24,9 @@ def setUp(self):
         torch._C._debug_set_fusion_group_inlining(False)
         self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu()
         torch._C._jit_set_te_must_use_llvm_cpu(False)
+        # TODO: CPU fuser currently is disabled when multithreading.
+        self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled()
+        torch._C._jit_set_texpr_parallel_cpu_enabled(True)
 
         self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
 
@@ -36,6 +39,7 @@ def tearDown(self):
         torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuser_state)
         torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining)
         torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu)
+        torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel)
 
     def assertLastGraphAllFused(self):
         self.assertAllFused(torch.jit.last_executed_optimized_graph())
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index 52bf4539479df..d4add03506c4f 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -1,5 +1,6 @@
 #include <torch/csrc/jit/passes/tensorexpr_fuser.h>
 
+#include <ATen/Parallel.h>
 #include <ATen/core/interned_strings.h>
 #include <ATen/record_function.h>
 #include <c10/util/FunctionRef.h>
@@ -249,6 +250,15 @@ bool isSupported(Node* node) {
 } // namespace tensorexpr
 
 static bool texpr_fuser_enabled_ = true;
+static bool texpr_parallel_cpu_enabled = false;
+
+bool texprParallelCPUEnabled() {
+  return texpr_parallel_cpu_enabled;
+}
+
+void setTexprParallelCPUEnabled(bool val) {
+  texpr_parallel_cpu_enabled = val;
+}
 
 void setTensorExprFuserEnabled(bool val) {
   texpr_fuser_enabled_ = val;
@@ -888,7 +898,14 @@ class TensorExprFuser {
       return false;
     }
     if (device->is_cpu()) {
-      return canFuseOnCPU();
+      // CPU fusion is only supported for single-thread.
+      if (!canFuseOnCPU()) {
+        return false;
+      }
+      if (at::get_num_threads() == 1 || texprParallelCPUEnabled()) {
+        return true;
+      }
+      return false;
     } else if (device->is_cuda()) {
       return canFuseOnGPU();
     } else if (device->is_xpu()) {
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.h b/torch/csrc/jit/passes/tensorexpr_fuser.h
index 254aebd91d12f..3f6538b7e587a 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.h
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.h
@@ -24,6 +24,8 @@ TORCH_API void setTensorExprFuserEnabled(bool val);
 TORCH_API bool tensorExprFuserEnabled();
 TORCH_API bool setTexprReductionsEnabled(bool value);
 TORCH_API bool texprReductionsEnabled();
+TORCH_API bool texprParallelCPUEnabled();
+TORCH_API void setTexprParallelCPUEnabled(bool val);
 
 TORCH_API void RemoveProfileNodesAndSpecializeTypes(
     std::shared_ptr<Graph>& graph);
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 86b64b8342a7d..c92ab1b46e41c 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -712,6 +712,8 @@ void initJITBindings(PyObject* module) {
       .def("_jit_texpr_set_fallback_allowed", &tensorexpr::setFallbackAllowed)
       .def("_jit_set_texpr_reductions_enabled", &setTexprReductionsEnabled)
       .def("_jit_texpr_reductions_enabled", &texprReductionsEnabled)
+      .def("_jit_set_texpr_parallel_cpu_enabled", &setTexprParallelCPUEnabled)
+      .def("_jit_texpr_parallel_cpu_enabled", &texprParallelCPUEnabled)
       .def(
           "_jit_set_te_generate_block_code",
           [](bool gen_block_code) {

From 37d60c08e547e63cef8a80a9e187c4d5e3b9b418 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Sat, 21 Aug 2021 03:45:21 -0700
Subject: [PATCH 126/530] Revert D30360382: [nnc] Support thread level
 parallelism in fused kernels

Test Plan: revert-hammer

Differential Revision:
D30360382 (https://github.com/pytorch/pytorch/commit/d6d86efb1c839ddafd1398d6dab9caa4f31a9f0b)

Original commit changeset: 29acf4e932c6

fbshipit-source-id: e0531113135d30eabb172dc1537d5dd6d65dc438
---
 test/cpp/tensorexpr/test_kernel.cpp        | 30 --------
 torch/csrc/jit/tensorexpr/kernel.cpp       | 87 ----------------------
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 24 ++----
 torch/csrc/jit/tensorexpr/llvm_jit.h       |  8 +-
 torch/csrc/jit/tensorexpr/loopnest.cpp     |  7 --
 5 files changed, 8 insertions(+), 148 deletions(-)

diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp
index 8d4e48c4a0bff..8f36f54395f49 100644
--- a/test/cpp/tensorexpr/test_kernel.cpp
+++ b/test/cpp/tensorexpr/test_kernel.cpp
@@ -206,36 +206,6 @@ TEST_F(Kernel, _3) {
   }
 }
 
-TEST_F(Kernel, ParallelStrided) {
-  KernelScope kernel_scope;
-
-  const auto graph_string = R"IR(
-      graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu),
-            %1 : Float(5, 3, 40005, strides=[960120, 160020, 2], device=cpu)):
-        %2 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %1)
-        %3 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %2)
-        return (%3))IR";
-  auto graph = std::make_shared<Graph>();
-  parseIR(graph_string, &*graph);
-
-  auto a = at::rand({5, 3, 40005}, TensorOptions(kCPU).dtype(at::kFloat));
-  auto b = at::rand({10, 6, 80010}, TensorOptions(kCPU).dtype(at::kFloat))
-               .index(
-                   {Slice(None, None, 2),
-                    Slice(None, None, 2),
-                    Slice(None, None, 2)});
-  auto ref = a * (a * b);
-  auto o = at::zeros_like(ref);
-  TensorExprKernel k(graph);
-  std::vector<at::Tensor> inputs = {a, b};
-  std::vector<IValue> stack = fmap<IValue>(inputs);
-  k.run(stack);
-  o = stack[0].toTensor();
-  for (size_t i = 0; i < 5 * 3; i++) {
-    CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]);
-  }
-}
-
 TEST_F(Kernel, DISABLED_Shape_Inference) {
   // disabled: doesn't do stride propagation, and isn't being used currently
 
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index c5333b2010610..faacd022e7e0b 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -2,7 +2,6 @@
 #include <torch/csrc/jit/tensorexpr/kernel.h>
 
 #include <ATen/ExpandUtils.h>
-#include <ATen/Parallel.h>
 #include <ATen/TensorGeometry.h>
 #include <c10/util/irange.h>
 #include <c10/util/string_utils.h>
@@ -2488,86 +2487,6 @@ void fuseAllLoops(StmtPtr st) {
   }
 }
 
-// Compute the trip count of a loop if it is a constant.
-c10::optional<int64_t> tripCount(ForPtr loop) {
-  auto tc = IRSimplifier::simplify(
-      cast<int64_t>(ExprHandle(loop->stop()) - ExprHandle(loop->start())));
-  if (auto val = to<LongImm>(tc.node())) {
-    return val->value();
-  }
-  return c10::nullopt;
-}
-
-// Prune innermost loops until iterations satisfies a minimum grain size.
-static void pruneByGrainSize(std::vector<ForPtr>& loops) {
-  constexpr int64_t minGrainSize = 32768;
-  int64_t grainSize = 1;
-  for (int64_t i = loops.size(); i > 0; i--) {
-    auto tc = tripCount(loops[i - 1]);
-    if (!tc) {
-      break;
-    }
-    grainSize *= *tc;
-    if (grainSize < minGrainSize) {
-      loops.pop_back();
-    }
-  }
-}
-
-// Retain enough outermost loops to fill the number of threads.
-static void pruneByThreadCount(std::vector<ForPtr>& loops) {
-  int64_t trips = 1;
-  auto threads = at::get_num_threads();
-  auto it = loops.begin();
-  for (; it != loops.end(); it++) {
-    if (trips >= threads) {
-      break;
-    }
-    auto tc = tripCount(*it);
-    if (!tc) {
-      break;
-    }
-    trips *= *tc;
-  }
-  loops.erase(it, loops.end());
-}
-
-// Flatten and parallelize outer loops, subject to a minimum number of elements
-// in the inner loop, and a maximum level of thread-level parallelism in the
-// outer loops.
-template <typename Bufs>
-static void parallelizeOuterLoops(LoopNest& l, Bufs&& bufs) {
-  for (auto const& buf : bufs) {
-    auto loops = l.getLoopStmtsFor(buf);
-    pruneByGrainSize(loops);
-    pruneByThreadCount(loops);
-
-    // There are no loops to parallelize; give up.
-    if (loops.size() == 0) {
-      continue;
-    }
-    // The loop nest contains a reduction; give up.
-    auto reductions = NodeFinder<ReduceOp>::find(loops[0]);
-    if (reductions.size() > 0) {
-      continue;
-    }
-    // The loop nest has loop carried dependences; give up.
-    if (LoopNest::hasLoopCarriedDependence(loops[0])) {
-      continue;
-    }
-    // Try to flatten the outer loops and parallelize them if successful.
-    ForPtr flattened = nullptr;
-    if (loops.size() == 1) {
-      flattened = loops[0];
-    } else {
-      LoopNest::flatten(loops, &flattened);
-    }
-    if (flattened) {
-      flattened->set_parallel();
-    }
-  }
-}
-
 StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
   torch::jit::tensorexpr::LoopNest l(st, bufOutputs_);
   GRAPH_DEBUG("Original Stmt:\n", std::to_string(l.root_stmt()), "\n");
@@ -2609,8 +2528,6 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
   if (backendType == kLLVMCodeGen) {
     fuseAllLoops(l.root_stmt());
     GRAPH_DEBUG("after fuse", *l.root_stmt());
-    parallelizeOuterLoops(l, bufOutputs_);
-    GRAPH_DEBUG("after parallelize", *l.root_stmt());
   }
 
   if (backendType == kCudaCodeGen) {
@@ -2685,13 +2602,9 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
   }
 
   l.prepareForCodegen();
-  GRAPH_DEBUG("after prepareForCodegen", *l.root_stmt());
-  l.simplify();
-  GRAPH_DEBUG("after simplification", *l.root_stmt());
 
   if (backendType == kLLVMCodeGen && !hasReduction) {
     l.vectorizeInnerLoops();
-    GRAPH_DEBUG("after vectorization", *l.root_stmt());
   }
 
   StmtPtr stmt = l.root_stmt();
diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index d5a95bc4cf886..eac1f82f25c4b 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -274,24 +274,15 @@ class LLVMCodeGenImpl : public IRVisitor {
   }
 };
 
-extern "C" {
 typedef void (*ParallelCallee)(int index, int8_t* packed_data);
-void DispatchParallel(
-    int8_t* func,
-    int start,
-    int stop,
-    int8_t* packed_data) noexcept {
+void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data) {
   // TODO: preserve the func type.
-  try {
-    ParallelCallee callee = reinterpret_cast<ParallelCallee>(func);
-    at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) {
-      for (int index = f_begin; index < f_end; index++) {
-        callee(index, packed_data);
-      }
-    });
-  } catch (...) {
-  }
-}
+  ParallelCallee callee = reinterpret_cast<ParallelCallee>(func);
+  at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) {
+    for (int index = f_begin; index < f_end; index++) {
+      callee(index, packed_data);
+    }
+  });
 }
 
 } // namespace tensorexpr
@@ -1296,7 +1287,6 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) {
       module_->getOrInsertFunction("DispatchParallel", dispatcher_fntype);
   llvm::Function* dispatcher =
       llvm::cast<llvm::Function>(dispatcher_callee.getCallee());
-  dispatcher->addFnAttr(llvm::Attribute::NoUnwind);
   irb_.CreateCall(
       dispatcher, {func_value, start, stop, packed_caller_args_ptr});
   value_ = llvm::ConstantInt::get(IntTy_, 0);
diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.h b/torch/csrc/jit/tensorexpr/llvm_jit.h
index 8585900abc8d6..30ad5317a1b3c 100644
--- a/torch/csrc/jit/tensorexpr/llvm_jit.h
+++ b/torch/csrc/jit/tensorexpr/llvm_jit.h
@@ -17,13 +17,7 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-extern "C" {
-void DispatchParallel(
-    int8_t* func,
-    int start,
-    int stop,
-    int8_t* packed_data) noexcept;
-}
+void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data);
 
 inline std::string formatError(llvm::Error&& err, const char* msg) {
   static constexpr char* defaultErrorMsg = "Unexpected failure in LLVM JIT";
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index 7bcdd1a666f7b..a296d8c7af79b 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -179,13 +179,6 @@ class Vectorizer : public IRMutator {
     });
   }
 
-  ExprPtr mutate(ModPtr v) override {
-    std::vector<ExprPtr> inputs = {v->lhs(), v->rhs()};
-    return try_vectorize(v, inputs, [&]() {
-      return ExprHandle(inputs[0]) % ExprHandle(inputs[1]);
-    });
-  }
-
   ExprPtr mutate(AndPtr v) override {
     std::vector<ExprPtr> inputs = {v->lhs(), v->rhs()};
     return try_vectorize(v, inputs, [&]() {

From e926f75b0bc66c789365cb1c48ba41e8447b97fb Mon Sep 17 00:00:00 2001
From: jiej <jiej@nvidia.com>
Date: Sat, 21 Aug 2021 09:05:04 -0700
Subject: [PATCH 127/530] BatchNorm autodiff re-enabled (#57321)

Summary:
Turns on BN in autodiff:

1. outputs an empty tensor for running stats to by pass autodiff issue on None;
2. fixing BN inference backward in cudnn & miopen, where backward falls back to native batchnorm kernel instead;

Pull Request resolved: https://github.com/pytorch/pytorch/pull/57321

Reviewed By: albanD, ngimel

Differential Revision: D30250419

Pulled By: jansel

fbshipit-source-id: a62553789c20fb50a820003a056f40d9d642dfaa
---
 aten/src/ATen/native/Normalization.cpp        | 54 +++++++++++++---
 aten/src/ATen/native/cuda/Normalization.cu    |  6 +-
 aten/src/ATen/native/cudnn/BatchNorm.cpp      |  3 +
 .../ATen/native/miopen/BatchNorm_miopen.cpp   |  2 +
 test/test_jit.py                              | 62 +++++++++++++++++++
 torch/csrc/jit/runtime/symbolic_script.cpp    |  2 +-
 .../_internal/jit_metaprogramming_utils.py    | 35 ++++++++++-
 7 files changed, 149 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp
index 40ee1d5d4a152..611faf010abaf 100644
--- a/aten/src/ATen/native/Normalization.cpp
+++ b/aten/src/ATen/native/Normalization.cpp
@@ -240,7 +240,7 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
     grad_weight = at::empty_like(weight, at::MemoryFormat::Contiguous);
   }
   if (grad_input_mask[2]) {
-    grad_bias = at::empty_like(weight, at::MemoryFormat::Contiguous);
+    grad_bias = at::empty({input.size(1)}, input.options());
   }
 
   // since we are directly manipulating pointers in contiguous path,
@@ -416,6 +416,22 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
   const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();});
 
   auto num_features = input.sizes()[1];
+
+  if (input.numel() == 0) {
+    Tensor reserve = at::empty({0}, input.options().dtype(kByte));
+    auto options = input.options().dtype(
+        at::toAccumulateType(input.scalar_type(), /*is_cuda=*/input.is_cuda()));
+    auto save_mean = at::empty({num_features}, options);
+    auto save_invstd = at::empty({num_features}, options);
+
+    // don't return view of input, don't return empty tensor because it will break gradient chain
+    auto out = input.clone();
+    if (weight.defined()) out = out * weight[0];
+    if (bias.defined()) out = out + bias[0];
+    return std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t>(
+        out, save_mean, save_invstd, reserve, 0);
+  }
+
   if (running_mean.defined()) {
     check_dims_match_num_input_features("running_mean", num_features, running_mean.numel());
   } else if (!training) {
@@ -508,7 +524,30 @@ std::tuple<Tensor, Tensor, Tensor> _batch_norm_impl_index_backward(
   const Tensor& save_mean = c10::value_or_else(save_mean_opt, [] {return Tensor();});
   const Tensor& save_var_transform = c10::value_or_else(save_var_transform_opt, [] {return Tensor();});
 
-  if (impl_index == 0) {
+  if (input.numel() == 0) {
+    std::vector<int64_t> dims(input.dim() - 1);
+    dims[0] = 0;
+    std::iota(dims.begin() + 1, dims.end(), 2);
+
+    // don't return empty tensor because it will break gradient chain
+    Tensor grad_input;
+    Tensor grad_weight;
+    Tensor grad_bias;
+    if (output_mask[2]) {
+      grad_bias = grad_output.sum(dims);
+    }
+    if (output_mask[1]) {
+      grad_weight = (grad_output * input).sum(dims);
+    }
+    if (output_mask[0] && weight.defined()) {
+      grad_input = grad_output * weight[0];
+    }
+    return std::make_tuple(grad_input, grad_weight, grad_bias);
+  }
+
+  // backward in inference mode is not supported in cudnn, fallback to native
+  // TODO: verify the same thing in miopen
+  if (impl_index == 0 || (!train)) {
     return at::native_batch_norm_backward(grad_output, input, weight, running_mean, running_var, save_mean, save_var_transform, train, epsilon, output_mask);
   } else if (impl_index == 1) {
     // TODO: _batch_norm_impl_index_backward is only used in JIT. cudnn NHWC
@@ -528,13 +567,6 @@ Tensor batch_norm(
   const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
   const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();});
   const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();});
-  if (input.numel()==0){
-    //don't return view of input, don't return empty tensor because it will break gradient chain
-    auto out = input.clone();
-    if (weight.defined()) out = out * weight[0];
-    if (bias.defined()) out = out + bias[0];
-    return out;
-  }
   return std::get<0>(at::_batch_norm_impl_index(input, weight, bias, running_mean, running_var,
                                                 training, momentum, eps, cudnn_enabled));
 }
@@ -602,7 +634,9 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const c10:
 
   return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm", [&] {
       if (!train) {
-        return batch_norm_cpu_transform_input_template<scalar_t>(self, weight, bias, {}, {}, running_mean, running_var, train, eps);
+        auto save_mean = at::empty({0}, self.options());
+        auto save_var = at::empty({0}, self.options());
+        return batch_norm_cpu_transform_input_template<scalar_t>(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps);
       } else {
         auto save_stats = batch_norm_cpu_update_stats_template<scalar_t, InvStd>(self, running_mean, running_var, momentum, eps);
         return batch_norm_cpu_transform_input_template<scalar_t>(self, weight, bias, std::get<0>(save_stats), std::get<1>(save_stats), running_mean, running_var, train, eps);
diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu
index dff3f69bcc43c..0238b1b682877 100644
--- a/aten/src/ATen/native/cuda/Normalization.cu
+++ b/aten/src/ATen/native/cuda/Normalization.cu
@@ -487,7 +487,8 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cuda(const Tensor& grad_o
   // save_mean and save_invstd, so it needs recalculated.
   const auto acc_type = at::toAccumulateType(input.scalar_type(), /*is_cuda=*/true);
   Tensor mean;
-  if (save_mean->defined()) {
+  TORCH_INTERNAL_ASSERT(save_mean->defined(), "save_mean should always be defined\n");
+  if (save_mean->numel() != 0) {
     mean = *save_mean;
   } else if (needs_reduction) {
     TORCH_CHECK(!train && running_mean->defined());
@@ -496,7 +497,8 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cuda(const Tensor& grad_o
   }
 
   Tensor invstd;
-  if (save_invstd->defined()) {
+  TORCH_INTERNAL_ASSERT(save_invstd->defined(), "save_invstd should always be defined\n");
+  if (save_invstd->numel() != 0) {
     invstd = *save_invstd;
   } else {
     TORCH_CHECK(!train && running_var->defined());
diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp
index 3a34e327e2697..1c70aa353b517 100644
--- a/aten/src/ATen/native/cudnn/BatchNorm.cpp
+++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp
@@ -212,6 +212,9 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> cudnn_batch_norm(
 #endif // CUDNN_VERSION >= 7400
   } else {
     reserve = at::empty({0}, input->options().dtype(kByte));
+    // This keeps a consistent output with native_batch_norm
+    save_mean = at::empty({0}, weight_t.options());
+    save_var = at::empty({0}, weight_t.options());
     AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
       handle, mode, &one, &zero,
       idesc.desc(), input->data_ptr(),
diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
index d78fe079ed442..28e20e90b2997 100644
--- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
+++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
@@ -120,6 +120,8 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
       save_mean.data_ptr(),
       save_var.data_ptr()));
   } else {
+    save_mean = at::empty({0}, weight_t.options());
+    save_var = at::empty({0}, weight_t.options());
     MIOPEN_CHECK(miopenBatchNormalizationForwardInference(
       handle, mode, &one, &zero,
       idesc.desc(), input->data_ptr(),
diff --git a/test/test_jit.py b/test/test_jit.py
index 2dd0d4764c46c..06afe656a8d3c 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -10774,6 +10774,68 @@ def addmm_grad_test(b, x, w):
         self.assertEqual(w.grad, w_ref.grad)
         self.assertEqual(b.grad, b_ref.grad)
 
+    @unittest.skipIf(not RUN_CUDA, "running tests on cuda to verify cudnn fix")
+    def test_batch_norm_inference_backward_cuda(self):
+        with enable_profiling_mode_for_profiling_tests():
+            class MyBatchNorm(torch.nn.Module):
+                def __init__(self, num_features, affine, track_running_stats):
+                    super(MyBatchNorm, self).__init__()
+                    self.bn = torch.nn.BatchNorm2d(
+                        num_features, 1e-5, affine=affine, track_running_stats=track_running_stats).float()
+
+                def forward(self, x: torch.Tensor):
+                    o = self.bn(x)
+                    o = torch.nn.functional.relu(o)
+                    return o
+
+            batch = 4
+            c = 2
+            hw = 3
+            # Initialize param and input values
+            x_init = torch.randn(batch, c, hw, hw, dtype=torch.float).cuda()
+            grad = torch.randn(batch, c, hw, hw, dtype=torch.float).cuda()
+
+            training = False
+            affine = True
+            track_running_stats = True
+
+            module = torch.jit.script(MyBatchNorm(c, affine, track_running_stats)).cuda()
+            ref_module = MyBatchNorm(c, affine, track_running_stats).cuda()
+            module.eval()
+            ref_module.eval()
+
+            jit_module = torch.jit.script(module)
+            ref_module.load_state_dict(module.state_dict())
+
+            x = x_init.detach().clone()
+            x.requires_grad_()
+            x_ref = x_init.detach().clone()
+            x_ref.requires_grad_()
+
+            # Test symbolic differentiation
+            # Run Forward and Backward thrice to trigger autodiff graph
+            for i in range(0, 3):
+                y = jit_module(x)
+                y.backward(grad)
+            x.grad.zero_()
+
+            module.bn.running_mean.zero_()
+            module.bn.running_var.fill_(1.0)
+            ref_module.bn.running_mean.zero_()
+            ref_module.bn.running_var.fill_(1.0)
+
+            # run jitted module
+            y = jit_module(x)
+            y.backward(grad)
+            # reference computation
+            y_ref = ref_module(x_ref)
+            y_ref.backward(grad)
+
+            self.assertEqual(y_ref, y)
+            self.assertEqual(x.grad, x_ref.grad)
+            self.assertEqual(module.bn.running_mean, ref_module.bn.running_mean)
+            self.assertEqual(module.bn.running_var, ref_module.bn.running_var)
+
     def test_zeros(self):
         class M(torch.jit.ScriptModule):
             __constants__ = ['d']
diff --git a/torch/csrc/jit/runtime/symbolic_script.cpp b/torch/csrc/jit/runtime/symbolic_script.cpp
index 453a83cd4507e..29ce74a7d3ef7 100644
--- a/torch/csrc/jit/runtime/symbolic_script.cpp
+++ b/torch/csrc/jit/runtime/symbolic_script.cpp
@@ -1117,7 +1117,7 @@ const std::vector<std::string> functions = {
             return result, backward
     )",
     R"(
-        def batch_norm_disabled(input : Tensor,
+        def batch_norm(input : Tensor,
                        weight : Optional[Tensor],
                        bias : Optional[Tensor],
                        running_mean : Optional[Tensor],
diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py
index a21717bc5f9a1..350866cdbf083 100644
--- a/torch/testing/_internal/jit_metaprogramming_utils.py
+++ b/torch/testing/_internal/jit_metaprogramming_utils.py
@@ -109,8 +109,39 @@
     ('bilinear', (S, S, S), ((S, S, M), torch.zeros(M, S, M),),),
     ('embedding', torch.tensor([[1, 2, 4, 5], [4, 3, 2, 5]]), (torch.rand(6, 3), ), '', (True,)),
     ('embedding_bag', torch.tensor([1, 2, 4, 2]), (torch.rand(5, 3), torch.tensor([0, 4]),),),
-    ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), ),
-        '', (False, 'aten::_batch_norm_impl_index')),
+    ('batch_norm', (S, S),
+        (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), None, None, True, ),
+        'training', (True, 'aten::_batch_norm_impl_index')),
+    ('batch_norm', (0, S, S, S),
+        (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)),
+         non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), True, ),
+        'size_zero', (True, 'aten::_batch_norm_impl_index')),
+    ('batch_norm', (0, S, S, S),
+        (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)),
+         non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), True, ),
+        'size_zero_inference', (True, 'aten::_batch_norm_impl_index')),
+    ('batch_norm', (S, S),
+        (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)),
+         non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), True, ),
+        'with_weight_and_bias_training', (True, 'aten::_batch_norm_impl_index')),
+    ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)),
+                            None, non_differentiable(torch.ones(S)), True, ),
+        'with_only_bias_training', (True, 'aten::_batch_norm_impl_index')),
+    ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)),
+                            non_differentiable(torch.randn(S)), None, True, ),
+        'with_only_weight_training', (True, 'aten::_batch_norm_impl_index')),
+    ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)),
+                            None, None, False, ),
+        'inference', (True, 'aten::_batch_norm_impl_index')),
+    ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)),
+                            non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)), False, ),
+        'with_weight_and_bias_inference', (True, 'aten::_batch_norm_impl_index')),
+    ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)),
+                            None, non_differentiable(torch.ones(S)), False, ),
+        'with_only_bias_inference', (True, 'aten::_batch_norm_impl_index')),
+    ('batch_norm', (S, S), (non_differentiable(torch.randn(S)), non_differentiable(torch.ones(S)),
+                            non_differentiable(torch.randn(S)), None, False, ),
+        'with_only_weight_inference', (True, 'aten::_batch_norm_impl_index')),
     ('instance_norm', (S, S, S), (non_differentiable(torch.zeros(S)), non_differentiable(torch.ones(S))),),
     ('layer_norm', (S, S, S, S), ([5],), '',
      (True, ['aten::native_layer_norm'])),

From 2289a12f21c54da93bf5d696e3f9aea83dd9c10d Mon Sep 17 00:00:00 2001
From: Horace He <horacehe2007@yahoo.com>
Date: Sat, 21 Aug 2021 17:13:27 -0700
Subject: [PATCH 128/530] Made FuncTorchBatched decompose
 CompositeImplicitAutograd (#63616)

Summary:
See https://github.com/facebookresearch/functorch/issues/56

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63616

Reviewed By: zou3519

Differential Revision: D30438316

Pulled By: Chillee

fbshipit-source-id: e84446d9f68b87daa0cfff75b3b8a972f36ec85a
---
 c10/core/DispatchKeySet.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp
index 404acc7cb1db3..21433d4ace8d7 100644
--- a/c10/core/DispatchKeySet.cpp
+++ b/c10/core/DispatchKeySet.cpp
@@ -32,8 +32,8 @@ bool isBackendDispatchKey(DispatchKey t) {
 // math_dispatch_keyset contains all keys in backend_dispatch_keyset and
 // autograd_dispatch_keyset Alias key DispatchKey::CompositeImplicitAutograd
 // maps to math_dispatch_keyset.
-constexpr DispatchKeySet math_dispatch_keyset =
-    backend_dispatch_keyset | autograd_dispatch_keyset;
+constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset |
+    autograd_dispatch_keyset | DispatchKeySet({DispatchKey::FuncTorchBatched});
 
 DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) {
   TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined);

From d6133b2fe6b863dd49fb21641bd04e24e19ac794 Mon Sep 17 00:00:00 2001
From: Pritam Damania <pritam.damania@fb.com>
Date: Sun, 22 Aug 2021 18:55:45 -0700
Subject: [PATCH 129/530] Remove `_fork_processes` from common_distributed.py
 (#63711)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63711

This removes `_fork_process` from common_distributed.py and fixes all
other callpoints to use `spawn_process` instead.
ghstack-source-id: 136395719

Test Plan: waitforbuildbot

Reviewed By: xush6528

Differential Revision: D30463834

fbshipit-source-id: 0c09e8a996d0e5b912c8cdd45488a39951bac4db
---
 torch/testing/_internal/common_distributed.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index fb505d105980c..01e167f528af2 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -9,7 +9,6 @@
 import traceback
 import types
 import unittest
-import warnings
 from contextlib import contextmanager
 from datetime import timedelta
 from enum import Enum
@@ -468,14 +467,6 @@ def _start_processes(self, proc) -> None:
             self.pid_to_pipe[process.pid] = parent_conn
             self.processes.append(process)
 
-    def _fork_processes(self) -> None:
-        warnings.warn(
-            "Fork based multiprocessing is dangerous and should not"
-            " be used, for tests with ASAN consider using opt-asan",
-            DeprecationWarning)
-        proc = torch.multiprocessing.get_context("fork").Process
-        self._start_processes(proc)
-
     def _spawn_processes(self) -> None:
         proc = torch.multiprocessing.get_context("spawn").Process
         self._start_processes(proc)

From 726fd26b3e5ecf205569e59cf7cd5a6cbf4387a0 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Sun, 22 Aug 2021 22:29:04 -0700
Subject: [PATCH 130/530] Update ROCm PyTorch persons of interest (#55206)

Summary:
cc jeffdaily sunway513

Pull Request resolved: https://github.com/pytorch/pytorch/pull/55206

Reviewed By: VitalyFedyunin

Differential Revision: D30296584

Pulled By: dzhulgakov

fbshipit-source-id: 6e5c610cc6b7c7fd58b80fa3f9de31f269341a88
---
 docs/source/community/persons_of_interest.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst
index 5c1fcbf1c7ecb..c220ae80806e8 100644
--- a/docs/source/community/persons_of_interest.rst
+++ b/docs/source/community/persons_of_interest.rst
@@ -97,8 +97,9 @@ MKLDNN
 AMD/ROCm/HIP
 ~~~~~~~~~~~~
 
--  Junjie Bai (`bddppq <https://github.com/bddppq>`__)
--  Johannes M. Dieterich (`iotamudelta <https://github.com/iotamudelta>`__)
+-  Peng Sun (`sunway513 <https://github.com/sunway513>`__)
+-  Jithun Nair (`jithunnair-amd <https://github.com/jithunnair-amd>`__)
+-  Jeff Daily (`jeffdaily <https://github.com/jeffdaily>`__)
 
 Build + CI
 ~~~~~~~~~~

From bafd875f743d93ccb3463676ea29101cae1760d7 Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Mon, 23 Aug 2021 07:05:51 -0700
Subject: [PATCH 131/530] Allow implementing either backward or vjp for
 Function (#63434)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63434

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30431968

Pulled By: albanD

fbshipit-source-id: 0bb88664283486a9fd3364e6c3d79442a44625c2
---
 test/test_autograd.py      | 18 +++++++++++++++++-
 torch/autograd/function.py | 21 +++++++++++++++++----
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 7200bd525acf2..8b7aeb4159f23 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -5477,13 +5477,29 @@ class BadBw(Function):
             def forward(ctx, foo):
                 return foo.clone()
 
+        class BadBw2(Function):
+            @staticmethod
+            def forward(ctx, foo):
+                return foo.clone()
+
+            @staticmethod
+            def backward(ctx, foo):
+                return foo
+
+            @staticmethod
+            def vjp(ctx, foo):
+                return foo
+
         inp = torch.rand(1, requires_grad=True)
         with self.assertRaisesRegex(NotImplementedError, "must implement the forward"):
             BadFw.apply(inp)
 
-        with self.assertRaisesRegex(RuntimeError, "must implement the backward"):
+        with self.assertRaisesRegex(RuntimeError, "must implement either the backward"):
             BadBw.apply(inp).sum().backward()
 
+        with self.assertRaisesRegex(RuntimeError, "Implementing both 'backward' and 'vjp'"):
+            BadBw2.apply(inp).sum().backward()
+
     def test_custom_function_local_inplace(self):
         class MyFn(torch.autograd.Function):
             @staticmethod
diff --git a/torch/autograd/function.py b/torch/autograd/function.py
index 4d6122924ec14..90aeea5f1dfea 100644
--- a/torch/autograd/function.py
+++ b/torch/autograd/function.py
@@ -188,7 +188,15 @@ def _register_hook(backward_hooks, hook):
 class BackwardCFunction(_C._FunctionBase, FunctionCtx, _HookMixin):
     def apply(self, *args):
         # _forward_cls is defined by derived class
-        return self._forward_cls.backward(self, *args)  # type: ignore[attr-defined]
+        # The user should define either backward or vjp but never both.
+        backward_fn = self._forward_cls.backward  # type: ignore[attr-defined]
+        vjp_fn = self._forward_cls.vjp  # type: ignore[attr-defined]
+        if backward_fn is not Function.backward and vjp_fn is not Function.vjp:
+            raise RuntimeError("Implementing both 'backward' and 'vjp' for a custom "
+                               "Function is not allowed. You should only implement one "
+                               "of them.")
+        user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
+        return user_fn(self, *args)
 
 
 class FunctionMeta(type):
@@ -271,7 +279,8 @@ def forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:
 
     @staticmethod
     def backward(ctx: Any, *grad_outputs: Any) -> Any:
-        r"""Defines a formula for differentiating the operation.
+        r"""Defines a formula for differentiating the operation with backward mode
+        automatic differentiation.
 
         This function is to be overridden by all subclasses.
 
@@ -291,8 +300,12 @@ def backward(ctx: Any, *grad_outputs: Any) -> Any:
         first input to :func:`forward` needs gradient computated w.r.t. the
         output.
         """
-        raise NotImplementedError("You must implement the backward function for custom"
-                                  " autograd.Function.")
+        raise NotImplementedError("You must implement either the backward or vjp method for "
+                                  "your custom autograd.Function to use it with backward "
+                                  "mode AD.")
+
+    # vjp and backward are alias of each other
+    vjp = backward
 
 
 def once_differentiable(fn):

From f1d865346fa6cd191dd0f3102a8f58ec04d6cda1 Mon Sep 17 00:00:00 2001
From: Gary Miguel <garymiguel@microsoft.com>
Date: Mon, 23 Aug 2021 07:41:33 -0700
Subject: [PATCH 132/530] [ONNX] add test images to repo (#63717)

Summary:
This is better than the status quo:
* Test doesn't download files from the internet -> faster and more
  reliable.
* Test doesn't leave the git working directory dirty.

Rather than using the original images, I've copied some images from
the pytorch/vision repo. This will keep the tests in the two repos
in sync, while avoiding adding new assets to the vision repo.

See https://github.com/pytorch/vision/pull/4176.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63717

Reviewed By: janeyx99

Differential Revision: D30466016

Pulled By: malfet

fbshipit-source-id: 2c56d4c11b5c74db1764576bf1c95ce4ae714574
---
 test/onnx/assets/grace_hopper_517x606.jpg  | Bin 0 -> 73746 bytes
 test/onnx/assets/rgb_pytorch.png           | Bin 0 -> 575 bytes
 test/onnx/test_pytorch_onnx_onnxruntime.py |  31 ++++++---------------
 3 files changed, 8 insertions(+), 23 deletions(-)
 create mode 100644 test/onnx/assets/grace_hopper_517x606.jpg
 create mode 100644 test/onnx/assets/rgb_pytorch.png

diff --git a/test/onnx/assets/grace_hopper_517x606.jpg b/test/onnx/assets/grace_hopper_517x606.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d2a427810f679db537236c5430873a81a62ef412
GIT binary patch
literal 73746
zcmcG#byQu=vM)MuclU+6ySux)yE_C3mOu#Z?(PuWEm(rPyE{qHK!CUM?S0O9`;Pm@
zz5l$LqkHx0s_v@l?$x8$Z&tspyzKz!3NrFC00bliU<Q7Gw@VxoH9J=?S5G@vHxMTa
z8^9+auMBmGsA28t>g4TZ@9F}83H`4u00Dmf{rOi48U<fQ4<zg4YGLjKQnI(T^Rjob
z1&O=ZI$48-0cda{{YQ=f^MB;ToxJS5ysiFI0_p}y&BNZs3nb%YZtDqdhw!(9zXb7*
zJ<y1#lCExk9)ETGBl9oTKRjA=Yz1>Sa~Eq*YtVmo0w98$|BE31@#+pw($&e;10-ke
zWaH)PVhz%?_WEn{UyX7AtBnm<-U*z;{?iEJ-!+AqS^r(b0p>se2H^a^WVrvSVg8?*
ze>6S*Ta%WDxr?Wbwa5QklN)%{|F;Bn0`BGi2i`w(01qqcZEt1$*H!Rr{`CTY6#G{K
z{x!y=DB2oIAXgu2508Hf{++>p@d#lxo$M|D)!GkS`7iJPvH?gFaDo9BkO5i1126|H
z0c!vR9#2=m4g6Sx>;BR9SN1>Xzvc#9`M3Swe*RDNk0%%y@{*DuZ5J<dTU%?Z|5^?Z
z|6VEBcuMAe-~pGH0?AstSbLazfhSzT-1F~(`|A(HKYbwMs95`gbnRWNTzx%3T6W&f
z7B1%YP5^+>@K*+$%)kjeQxG5k2m%)|QUQDb;_W|wL2_U(fwwCFRou$e!Wys&0SiLB
ztpi~IGCVvy0z5JT0x|{?A`%7;8Zt5(4n7tZ4i*+Z2J%1j?~i|d`CoGg6eJ`RR1|bn
zRCH`qR8;K087lU_nqd6jbnw;>V8DYNFoS}i03b0SpfDibHo*e``!)n23bqmk;vWJV
z{Fk7>8v-;8EF3%nA`<vV{+~<;021oInCJimGz1hRG$afhEIbSpGAEdc0R>Ht$qpl~
zZjMFa9?AiWomki)K}m(9F<`;T<?&%w5{^1a^T+N_OHXdxqQ;LjQps9Y(qD_cc<|=5
z_pE<4g$>So@88JKrj&#?rw%O~+?KWsFCO0M*!X-(D{CEDI=bhT)wT7FNH1?2T|R!m
z03ad3)<XYd6C5lI_dh0(vtvSoEpVs6g5d~F#Qtl+fX3_(PAZFCj}J+bnmCr8TtBH{
z{~7>C!~Jy*Y(z1hm9#dGm-XI%oACc_0qy^o@U{jZL;a%@0}usnMuq!Vp$FFBiP1^A
zOfMOYMFnZgk9j#U1~ReJpHwM)(z)il(Y73&?pe~2&XoOvHCfO3(U`uEG+yMkKf45G
z9CdESYfoB^+XeczECux3sh_)Ad_N`A5lGiPjhjBbyxTTWA8qX+9Qm?*{!+S%i`&qm
z+cw|y+FFO4K%?WevK^VU#-f93qJq<WXrX^<W!Wx}4I-%{7IYlUegmYvwoN);MfGh+
zQfHn*tHw{i^omzbHDNiY$SkeB0gbweXY6mluE<nJ)K8Z{eqOC@ofwS)=i1x?;gPC5
zh_#nAp@6`uZSB|ac`C<yIC$Y=N;(=V41PA!hD5T*jb*7sg1G(H)0N?uyr6bV%?0|-
zUHj8T-IKMokIvi{Ep=^yuHG@*Z6}5bOXrz-fvUAuWYv1Mytu9YnQ41-_pk^0)_E<f
zy25>>&NYtArHigFconq^Z$R}66isij3r?=*Ijl3z!F+^rQ2nZ%BZIRK|CcvF<ub-`
zJ7`<;g{)NfX7DLDHRZKy@$zh;Ge)S|G*_)>NNb$LN%*lN;1g~A{rjl%JNg&u8f)*T
zmImzz_RB#6T<UmY3fCUDap{{8Dl4>cddwr!=-h~o-;cbz^Tv6J-vbmYw%}cF(*lHf
zg`XX}en*SWmEpt=*)y~4Zjc?a)SvgNs-6iW3EetHH8mQaw!Q&PD=uH!7R%byjkVUT
zyF}-8n`YjCk9X+)x&}`z3r{+5s!xN;?fkE)dUxJPYli1~Yv(W2!qSBk+bxT|V;*ln
z@O)>BBi;}%0nBV2UCZ&<h55bEaTy$aD`T!K>}!6e56PaXWa||qR4BwmKyckCLuW0z
zI1n3GSQUU%h@1PtSL!~64~zb*Lh#$nE>UKHfbj~lAbsO*vX!0Deaphqoa_;ItMGWi
zX})pekKTIHL^BspQ+LaTU(gF|J)+8wmyPu*OCsyJPolS^Nz-S+I-F0za`n{)Pp<>1
z@iNjne3zd2?>6rR&P1MBinpH0{%jXMxju4d`|zlcJXjuToz;y7mwOV6M8}<=ipp-c
zc?X~3^s`Td7S6p^EC|CblJ?Da%E0M3gXebLuPo<%RzzhIQm*5RtID!JuI|Uv*6Uea
z<-Gcm27%-a0bhBaQ}@x1@NMMm4deQ*%?@>_O|$&0t_A$#+pU<9vt#W5*`b5AY8zrW
z;!s}t{1@6R#@&j9OS0#7{##QY?%FQG(OFN_BYneYVq~e73+4cw<e=6UD`z--Z>D^)
z*s?D#JfWvg0dn_?P3xzew~8fCJ$IZXoo!a$6n~OmQJa{|$a?R!pJanI{ZATRYhP5p
z*;}$zx!a|q6K{&Vj5RlXeKlb?n<h1|(oaYGbe+BPVtXFQUGJ(o(S*~Q<m7B<K&*P)
z?)+p>t)Fk~-)kaT*hyrG#4#M_eYa)a;N6}{vtdwYxq8ks;a~Mad(&IJ9B8{3Uxkly
z=kyieQt7=4c#d0?WG#A8JL78C)z;1<vLqdIwtp>q1D>kxX>w+0^;m?O`{#lqF5g{+
zW@o1_=3Uf8b1=20!}TtAj$)W>t8SY6&N!S(`#a#`!pU@%1zOh=_bt}wI<A_{drBI8
zCEX01^f-m$+I8~q=r(Zn=YKyT_g&V15w0u`;~smY#b(9yZT8szdf92KVeaRT8^>z)
zmW5GM>zZ}jHJG@-G<6&P>Urz-*EHt*Z+AG;^}%{>{)0bH2U;%hr(<%=W*7o{vmKv}
zecLt@Pq_QcUo3C)(r-jnhQApsFZWgx>RXbuj_^rl?f7)m**~#3X62v*kBsc25FnWx
zir2X!MVF<W*Ad4`XO&6s#FV{b{~9wMf0JC57t-~fe7n}Z!Y8t$7u{NyWMy2=l2tR7
z#+ZOOK_GXji~e@G>#4WwZfbvjJ2_#Dy~=xe&7yOKdgu+PpYyn#ik`7M)zfMNA;)a^
z8wl(C7<ArySeDDJyUl;a*`8v^S*;i{Df38w<xR~`E9Q`!D-t~(7<d-d9<5*QT<2u`
z!lxc@+^Vf<V&o{&#J;WEr+D(3{K{_U6_mZ`W^#VT<)6~BtjE|o8n1+Kds%t)t#~nb
z#*t_K`VCNMHjdVIeHL~OaKDe0<Lt<vGSq+P@&?QWEXL&3Cn33uOpVUUId^F9ESQ>8
z4_-`=AI84WOdH%Ut&^Q<Rb*!Bd0EC$e8w|O>eK%^HD1;#d_lb!u`KfyN^bqEafE%l
z{8`ZWmiP@&eMK=_oZ<A<SCMY#=kAGvSe(@lbasEz>a%t!I(toZ-F{-dS?MIMuWA3y
zP56WHdyq03VPpI4g`Mxo_1z1PQ$S$07QsUH0=r?N_GHHcmrytFJP6IV<cO=!GxPUz
zLWitH9lC54W&N*mEUZ$_vTl;1-vK|XTYWlWnVwY}6FYj-PhQsFfOChHI*A5nruy3t
z#hM#L#@X4Zp4Ax*+p9iH;C%oxg77`=@}N>2rj^Tq<M<mO<K9aF;*?>>Q6o0p64H1B
zisI)((iSv}Vj$ly9;J%D&3%2^Pn{R2HyEr&Txv-(ShCTCtmvHAtxow<@2&HcdRe^u
z2Eg10AHAGs^!<K@E3-%Oj3}nM_(+;}9FW^|awYz51`RVlKF&ojlA_`F=JYRv5}eOO
z0)Xgs`UuuIIsV8YZQUDy=Te}bs%I&qoQn6<dBq*!>`nO_#b^0BDC_JgxqoonF2Sj?
z#xeUFOMuMvMZM7JYTfDT-N6D^Nu=4;;ALR&5zBRjQ6NJIQD(B`dENp0geT=|($o>h
zpY$b<`X3FGb3zp(VmLKY)-M+}Lc#m|8G(+Kn@g+Fi;jUxw@CJm{mh<gzWUy{OG}O0
z#u3|B+ZVcP8!@~a6S5i(-cx~T+Z_j2D84gQJlruWli*<<s2SsM^L)yE*J9kh)3D0h
z+vk?fpTTXxd*izqWG&Ltx5M}bguObbo*zSwS51z*v&LvW>klyA_Stq^?~x?shyOMh
z_RSP6nDDiUN{PZ6gM}PRD!2Z@^{KA#%-3Ys(1VT45w4k|OP;o?O<-(ox+vmgZ+HLD
zs^57TujkU~f@S@-dh<2&>J1Rqs6W3#wH#I&^j_+dX)_BY%lQ>p)oZFTNkYq5bvQRS
zwyj@}kHunHYiVqmx9u$YLoitMwEVN&6ex}e<ha+WZ$VCvVn>!fSX{;2dGX%YO8@M@
z-)!v%kWT95+Vp2=CZOPU<Bv*t3<q0oPx0>&5;Falb;m#3=e)NL6D$$1?D5#<SY{~C
z+f}N<SvG#u^e(E;6UcO5$w?Wg97^@xfVcS_e`}oWB{p{tB5uP}ZC$ogP&fO9@73$s
zg-G|}+)rV_>_DZ7+R2jJr{Vy6<W0#J++bO2?@LEv)pEnt2J08w)9Vvc1NyVYriI}y
zkj<i~l-?+EOpI=@>&EsYqb{fS(t)v!^wqZawRc+gL%`HS*Pml@HV$-uwP*c?!#o5m
z66y-@G8Me}?iuZPAhr?Qp~_o(XLQd~lao%Ue@k!M*ihhl;}l=oc9V*kAZ8g-FE{T}
z^?l;#`^`k+iofsc`9g5?gdk^UT@VZ!_Gaq*r8Q4}(P-nw`>323w3CPVYr|YIUh*^p
zy))u{T#ZZOQ8?$;Y{KiJvl^rHMUk4T#JhMX-r#ag7iT}z%afHr#=*WXm*Y!8U43Ww
zNN3|`{K~^!w70GmI_JiAwzKV;@{VhNf^Xk|eA5%BHvpnh=y9S-{r87gLoRdo^l_G+
z=z)C;b4&K+aRW)}YdBN))i)r>P}JDbRsJbq!ZAxfebKdwL4ZGX{wc8D^$%GS_<R@E
z^&6qL%DcWYxb1Y5=9j$ds_QpP$C8cN*4m3ZmbQT4OTmj5vU=N@yP&DN*jKL3b^rP$
z%}B-ZhE7ctK9U{}UmO0u+_f6t;34X047e!6TZALd5itoeR*jzvhR(V?YbIy4=4pCg
z4R(T)jsntU=U+R#5y>+r2wEVBZmqX8t^9r70GGa}y<>yP_gZfNLmR@FAff#jd)n^R
z&)!xqr4RV$rcwK&YeX%5XgY0%roYb(#2t<1h(|+?ga{l%cPHHLJF3SO3ta-$&RB~p
zd10pAM#mhp)~081Lwn1)f@~fGgSNMI*l(Z{2~cjo3=4rEEqqSE>u;g{m0eL*wnp$Y
zurM^Aq@FeYqQP&Y?);CH;ZrY+X`gd_{8>Zn8!%fPHB@4nE{*DD>D&_I`F#!W{AhG6
zc;nN;i8eN#eO&i7k*bIA4X6m#W**1cIoNtaf{I>ESO?zcE<UBsdI!s%1n=bUERr_7
zvM?57j(6%vw{Gff-v<l|PePh3Z48*K6K?x<M9B!edj~7NbJEF@8?hAB9cWoslM^9w
zZUD1N-MD5~6XdMCHUi#bn*LB<-{A=%N0KZT1xLm8#m;XoxO23!=VeaZnZczk@obEW
z-mGi-=o<!e27dQab9@1B@wh9&x&qX>s`uG-E%9}oCxOa#4bwhzBOUb{%Pra9{i2qI
zd@5kW<d@L{<}v?>Jk@y4F<HqmOZgwOK<n{y62pu7mU1F2oq5eR_Ex@ttcO4%*24Uv
zwilA#9>+8qZW?lQE^$4VJQ4ex-5R$xx1cI-;uwDq5BxLxamUR_RGZ_d+|vUW%gxR{
z<tf|wCkNjbwQdXk$2uO9?RQn9Iy3f;&b%9BU8hS=XSU_9SPKd#CN0~=tPLVqtjZq;
z2ZqUft|NZeACvkG3-k7oQPf0D=qs*UqGiJeXxH>6F0ivRPWHrL8=6<fp*hCw@M1KS
z2WW%l1xlz;(1Qx&+)@x?24X%X#e}1SCw}qh4H$vdbUnjYt{u3IG1X{8MXPvt&VI>C
zXZ@+unLOSoMk9$&2p2*<9s1mRe5K8&Jo-!adv<0#s#7BDkd@^t!*1*m<!({7OV0-a
z=h%F>cF!9%1Qg8`Ev`ojZOgNFt!<Lolm}kq+uuD17fy_ch^lhZwsSV{H2Uo}Cim7!
zy5`GedFMBqP7CSbhccZ+s*ADcAxJiM#08^Ea#b_wR#tKMw<e<QpHka}!zWRJj~(Tf
zFBKx0jvLD(M@jRrQXdJipHw2(&_m>SxX2fLmsq}>yLK(xHPJ~=;pouD7j5QE7ypjp
z7u6FOQOr4Sgo@sBcE3HteBh}vx^}kwL&`0B#Pl0Ndipnr5}gi!T{K{Q@Nh@?IUb{}
z`1vO)Sz1Xpr69{wxUPP0&%n=?;ti;bWfw<->w-HISq2t&4`~pv0<?PO99Z)ce7mM6
zr4Np!eOvt(LPbufW9{JyhATmquFh`eUiKFDPWE1YAbS@Z*S`>lHaLMXA^<=G!~rWX
zIA!q<Or-|cfy-Xt<oQnt4EX?9{|5;Wzyq)VY=1lamu^^O1vP0|kh-_|UzOl`01kiz
zGr{`)x9AW&>|X-s@W}9R@W`l$@bHMJ=qO;o2L=8A2Hb%_0HaHY2uKJBNT^6CNT?Wp
z(V>6CLjU*Z5Tpqt%>RN8k-rTABVf0n!2rkq&|d%u78(W){%_%*G5l{H#6Qpw7!~>p
z$iP6u!N7n8p<o~|AOUhHXm)W-7<CF*b1*7|l~^bN*D#=A;X#T0WA+0l&hF2oBFV<D
znwC_!bDkf$_I@Q7!%I=q3~E`W;DI3?2uK(R7-(2HSlE9|fdHdS<lye0!4OX<1x#WC
z2Ug*LgoX#UMNuOqrzO|y?uR5Q*dJdtJ#nai&Pj6rCrEQ(rw$pKbi7ITp@A+I-UzZq
z6X9N}5Fj9BP#3ktza!q0VOV{z23a=o%K4^()VS74OXcAeC?F9gauu#HI;(q(wN_3q
zJe*sYS)rjO!rCGGqO-WT66-Y3D<w2cFj5|JQTaegs;!L8$!$@!B(ggB{)MqpkVZ2!
zz)QnC!A7SRvZqu|LR$wVWOQ^6kTsDp%>tWv2p}-jw?NCr{(xdPI%b;AD2`;1hu+81
zP?r}UO`u8)?j?8ErTY~Uy}ph4rK362D;-kD+U66j>z;+BK5g1~6tVdIk!AUx3>06D
zP5v>tf(<sr)6Yj-<(e7E_W^RcIO0{T+e{D9foacfz3p_fQsn$)<(6R*zg*+F8eCcJ
z1%ojwdw$O@)OiOeCw=U&5AyCG{OssH!jF~r(6LBdM~A1b`AOttdiz1(dH8N*-{{qP
ze0=6$I*fFeuz$BCEk**0Gd@h$VXRH=ytOv&U?Z1{);s}P-W`Kps325Gbs7Kmj($Xz
zIOHlK)uAjfBOIfV0Ebi1s<3tAN~_XEJC@NFqb4bOrQu{NyYY0KQNY%&RIVCWVyDlH
zXDg>?VOX|a6AZd=;mIrAYUO2L(p4Cslpjj_JUHGY7}wwzm~`$LHBgVW5L>QMD)vLt
z069U1<?bvh?ehIvA+p7-@?qdM_>W?jXs&+YlwCs1<<GTxyLo%)FUmbXj{;S=_BQGo
zqh2hFhhMn@upY3{vR)LuTQ0p?&kd|}IuE#Q2nmyO+_KCE)-=@_hZR(WxtDu{C4*L~
z41{a%Rwlg|py}>+`EE2Tms;y}*1pWD$9s|vK@g*QkTh)FzI5@_M{1NY-fik8Je+!S
zu)#~PUedX29rN68kXOuVcN&UVQKq;@Xc6kGcZ0E10x4VwFfdYdCad`3;Ex99kobqa
z^%Xl%lRjdoYq|SU*k#k0NKexzEsY&w7{_*ZoLD*x4jZ1GLXY(Evd39dS6<sQD~`#f
zH6|)1077r~z|%Y|aw0&=?G0Eze<9AlIld!$(R}>*2WiVzlU!_&iDRFDo6rPFj(o?S
znZKPFeFTSG`^u4?Yr!+6r3Pl$nhPq6WgTvOs&Xq!SWt9s4rE_#A{~tA5HEJM9(6Mo
z!B1PnVoLcjmSl2hViD_NPPl<}N+hjPJH<b0wLHOJH?$ZRldA{PlYQ3px9`k@F`1)D
z>ZZ<O;NYaR83+i0EsPFS;915tb@f)#Dsi_*MD`##CY7oi21AEjb6?bjikojgJ@+h?
zJ`pK&Vq<Afl7FT_EvH?JR=!#`b*y1#vDS{22xE%yskmNLu1&4pRLQp2DBB46%7=U0
zr?K_WGbj$5%ZsbY45G8wrVGkA`O%@v;z4YeV&teY&q_0=<Li)%ET&yi`h9?kN)S{4
zQGxwOE(~k+GU9{r_Gcq*D}|6~D4C)Mh&Yh-%g;sri%2Tyly6X*s}QwX4pkufA*rUh
z&%mR^#NYrkRbAOPC3IR!Iy141YOfDkNe~&ks1T2Q7D9C-)iC`ZltN+9<>4+zP!l9`
z*x}2YcR&DJp@3EyG`0v546y+eu0v)>M&38_$TCTF`1alYLsVu0DEi7hM1=c(3d~s$
zaVa(d4Q(&+4$;7x78E%@g4vf}OWk3KA{O%ck6(=qdYR@$>kvb6(;|t*GOtJyB;|)8
zhG7A$b+q}?!Obidh4#U$MThU!lzALeh#^Drw)bjE0R@Esi)1#_a=YZ8Qdx(ON&ON4
zWpmv-a{!rHd_p0QbOjo;7{W0iIHBa1oaM+lx{D9-p&y0(tF(k7<(g-RH)V4795qKG
z<w=AEdJ0mUSe@kvDIs1YtvY1}!aYm~H^Ux+M1}Ad#1h+-4tNkG`kp#;7+>TKj^yV-
zjxXwJIE$aJC#d*Z3IGXc7&rXgmzjP`IoGhy5B)4!w5nbyN&^x$g|f(Hx@RA3Bj_oy
zDC3;~NV%K$3`JHjy9id|!wSR_Zeb9K`0dInh_)p^L(9yy640Sv`u71e=$~S;GiJp`
zTrI_zar)z|d!N30GcaTl5PTa$lIn+%7p}Y&gZhX7D^)4K`)JOXKrYs9R^EvTjqqVF
z^Oz6ft8pH`7ywi94a8{&>h!C(!l53erw8PGRj{(=uy^uh9|ju^FOs9M0B8qyLZdi`
zPc3dBx_xM_4())6pA3qHFz%Uj?t73@vF0E+`ZUo7(TlGYfTm;UI|L?lAKSx7X&i~L
z9)775DEhiQc0ozwAK?}sBp}}tmLU!G!_UEqAY6vG+oSt4{kpNIRiV}-;VVNga#ha~
z!P);&Poh?^fi3zrb6_4JPd`R;m;lv0f+MC<sTc{LH%s8{bs`q$;TKA~j-QGczR&+W
zC$QWoR|y%S8&1q-dKtOpM_Ojt_aLRE(0zn0zv{~{6h}Gdju1065pwkbF>U+vmJK&A
z<f}FmefY!YX>$X!BX-dA7P8~%<bX{C6x|A>v0MZ~j6&$f2gbcWOdjkeI`-UDd6ca*
z<}fAqsrhTvs2E5C)ap9Qj03^%6~51l^FUEwMU!V^!H761l)>;>4#rq9W+>tpe#5CH
zhlj2!5fZ^4HJiH>Bi9D8hV6M%REtR@1993|j}4Kd<XTZ(S$c%mtiy>C?&|nE=>@}6
z5bx;H+>^CX0A+<UF>EY;s%-rJYI&;4oGAxf#*RO#(1d6fkP?7xgpK}>$P{#}OPKtK
z-xjb*ZaWg;_t-HX4q^I3=7?iq#iVD(horP2`SR&SEH~gtuUpWNN>mAOhM3sDL8fSC
zrhHD}LorbLowGE-Py{7Jfimw&{W4ne<o_D4d~3~kR)=g&D9{ndf%!95RQXyrdoeX|
z?MGp~`c=VB6@GWE3bWktruCvy*E-K-)c5Z)xopp|oY2jumhREM;;p$N8E9pf&OVJt
z`QgL!Kc<A*YSz~KHG;9PrCCePeh|L@)G40T8O-DARCmSfD;T_B(Arcv8{lU&gXjro
zi4)W)D6*@aeKc>x-^Nj^%9H#Px31P8Y_G3N_>R@3O1RU+p}X6vbo<c~8HIVSu+Li)
zhLV(hvu!lS8etHwMYov$frYn|Lr;4q2RNdNQ;ZzbWRG>gcd=_3YN-qG?t)x8gBX(P
zM^^|HjV2&{6x6rdbVe|*nAZ=Tz+8aHcmq%?I-eB~O?{YOki)yJ(sg={t<SF2bIJAN
zAg-{pka}m-5aOtIHsuO(K2hDv+MSpClQ7+D`jR6%8dZ<UL0e`U6B|^xS8?x-h2@C#
z$6L9Ju8+SUA3oGizQDc#+OGn}pq|4wpt$qeX@zh-;H5IvE0tFXX!mUnk6Zro{_AP%
zYK(Qyx_x(uFrEh=DZGYI^Bjh^!vtR#=a7t~JWL&Q1Y~EZ<jN;O%smO`@4}7sev!sS
z#oDrstDdg<p$pp;G}30mA#DqG2^dqasybb@^R?cKvWxBkABle%X;SGmdgB>5<#M~b
zN_y1MXO$A7_3~@PXATxSqJDkyy5<h%AnVpV&93TF(>*zY(Bgyxk@*N6HXFU>y_9OF
zQr+<-X?U`q#tzm#acApy_ij3`I_pO$3ya?|_-GVu`Mv?4gr>WVS2kSIbpw<IQk+TD
zdN*|EYpdQ5#2z|1?{5A!Y@_{H^#+`2`=hECvDm2-A{K!XMxQMN7Hcawwf$@C_&fS-
zAQ(ER)T@lO>Tp{e;!%?LH>$)3%2@{qh-{WST-s2YRxheeHbTz`buRA^+2Ja#8%NIU
zSbc{cH(AxA#z^Ox-g&<pUoo^dy=;-t>@1Kdkmc4Q#?D(Aw`T!b3ekp&<@0ezA~%ti
z1qifX1bqTE)a%L+bS4KCx@3s+%3@8S9T``RoPE8WSnUL+e#J98u=u1ydWwPLFWvxR
z8IfXLS)2$Nez)F>{77dvw}oXCP?8EK={JZ_fCYgGn+#_cM*$D#;-fTCXLHsd1B*{t
zJESUG@G!xG|EB^b?{y-LoPkOu+;#|2#_qvvx=!MAjv$BO`&88kcdL2c;R48FWC$2t
zf_LJ+>`8nCG?ei^=*%vc_4vllYFZAAnIFS?r(+ZpZR8j}<4|D;Q5qRIRAbvD+0Ryq
z%wZyEIXGw_jQIJE;nKm&lMi96qd}B5xk7LaVIfqhIz&>Xii;yldL6G+Rez3vJ@sJf
zpB_}g4N)ma$V|BoBVwjyD$(r)00GJ9kcAAN8c4)+gqSzLik(7!PMj@-__+|>>a;us
zdpG{F3?F$$Y6Ksozq^XY42px=<)rLJIE3dq;|oN(_!#xA4q$Zx*l8;gs1P(Fh<7OD
zCdVl=Bs?NMmZV@gRHwLyg&!nxahk5F!N(FR&>)p41pY{tlbYvZ!N-&;Jf|QEi;3^1
zf((I#TOBsu`hx604mv7HQhF8ywP3ds96;a)F>!n>MdwPA$fVnauOKc}C_=VgasLql
zTOd7wMuthwl1|3~2SJs__E~A+$ShVsA;$)CdIUdw47;R1EH(?9!TX&rygTuLhNcvv
z?KVV{*eUry;BUEsiXi|M4X~2od)E&KqSA_?iu{Vnk9`nOXi<DB85shJ!IkNdlsNP?
z1J)r5F$c0==3~EYgcJH+vJs8&7hp#*gxqNN5R;voqe8=$z{~<OD^dbIT*)n>We5+@
zWkp?4!bpM8`OzQu$NZ}q4kLxCm(-@*aay8s3FLu*xEpmre)#m<CDPH%NwR?wLCteY
z83v?zegz_0a}~LSE=9$F+qe=Pbnx5k0e2J|23G=ozp{iwj4Hdk2Q+aplN$uIm98?U
zM3OuHq>?42O>COAsF%0vdwCypeYB#Glrb{h-_Tu|3h2dbUPWRhi{?Hc450vzkH-C)
z&^Q?k?jPTP>49iX+(dz_WDcqim}WA8>}842Tn^A@`upto*r_@m2!mlZn(o-&7`b1+
z)kpt$jFX}QG^Aoe(}{}=9G*Ni@F4gi`|-#bE@i~gnqsWj>uwbrkUq;bZVwIpl2Yb`
z0qBRt$7sZQV(~31Bge?if$;GtAX930Xh92#=HrJX=)IMLu5XQkk!;Paauop*$M}<T
z{cb5A+)TcGuj-`Q!{WZy8+Hgcq1sQ6(e`|dZuiIJn^fT|cO`hK$SvupBK6t{VNs7k
z?@AN579sg5C=Z-s_6N;HJ4eO@naeL^T8K?#h<{g#{75m&<fg|hMx#vU246C$eKLp0
z+pB=ZiXZmFwcRq5qG(~bqHYh?eO-V`xQAV;Ba0YcN4`o!gnek0LN2?BlbnyrXX-Pw
zS1Fdd!yJkLUr4V+(0=iBaQhoj()I@Ui=2{(D!dB40hfLLd{?{WcfN#1q$s+@ab|o|
z<d(QD*;ehJ1An{sv=e%DExwzBOEprH6r+RBgfAIiC0X}lIlSV(nS6+KA6^N+QQaoU
z6&Z`$t`P>2s4b(Sh)M8Rv&YDWJJew+l@G2Gp=Ok8)T`+cvdn5O+9!q6X$3%QkJo?i
zSVibtq_%uadjkjsTqnMc+`iz}vsd(wd<ep!3;x8npD4el#mC~udKK4kqfiXHY%f2-
z5>WL84kigQ@Kx_XWxmQB72%WRNu2I;VDP#5x`*p2kAI#}TBkJD7ltI*q7Mqet>~Ud
zGo`Ra$xzKd!p9^bDcwk_Jnv=p%b#=u&!SqG?}-T0U2K_cW0?w7BP#U|Q`W8<Ck<W?
zd^S?F4U5<`dIvlIwAM@7oG1{*&JWTzrdK>*LL%$95BU#PIC`_(cy?;}-6BT&5OZ<;
zcvd}(``!AdqqF@qIw<!H=|QutY-X*Q3DE)4-`J#>+g=nmO6iZg%<qADWBS<_-nPpw
zl2}2*+L0dur5YqUEeg1|8xEXz+<^))q@@bPS?g8&OC`nhQ>n?WuGWeN^F^xNdrut$
zZV0Oh<Ek;vJR-SU0$DFzF5?c*+0DA`^37R|fesOA*|d+nBNgdoPyT%xSCi|RCR@X?
zmet1F#(aoe74tU>9vuuqX9q5|+#;94u5ss$cfn(Q>0Nqjt%kFKMu}&yv2pF~9o3)O
zwCtQ|GhYn@l|@nbVoKJ76`{Y>1;Tl4f9p+OzogGALfgC^m#zLCU+k*9Qi|vmOIsYY
zsp}b-=4WZx`=@5R)8fzdHWQ1n_?5l$$i~B$ZmxFW$zqKi{uX;tNpJP`${)e(uQKA@
z?+?#%doN7#GHPubB4FovZ>!ZSwXlxv6axiZnSXKSG5L@QJlXW5GER<6f!`XGYo#^T
zI{FHH%)!&N*R^OZbhHiA?PSHFI<J)}bhlJkmbCu0?s={}GVSK7zuLLgf_Uj%D?59O
zv|><iXLiFK_XcSB-MxI<-&C9>QSH3jx^`^4Y&l1KXkp+uo%D1*ZEUzBd`d?s;bPy?
z@%;7DE9;MDzisczRcW+tU^L&{UdCu>tZP;_RzoN5s2FSG;Ml^uST?Er$@ETSwq<q0
zh4;GB)!bQqfpJSDR$RH>-jRL2a*~_ml&6^R%%uGAY!S11HkxF5OsjP}X%ow_t#RG0
zwKCYR#J{b+Rwp}pp3v)5IHqjr4Jg}k?S2@htqM}SBGPnlA>lMBJGNU4R75NYs^Bi*
z?~PvL(v^J8FB{9ZT#VLL`*X2!$;=>F&Rl)9nA*JRX17pow-*1awN&=g!r}vwY8Gwr
z4O!L6x>nzYKh5t}pqfF0`R<mW%{V)<M)XL8;t>9rdLCC^;>c@S#&0t<U*$`^O!27^
zSI1PvXUR}X$+dQ}xa^|s=BxY#y`v?&Qg3Bt1xx5><!8!E9;QHb-NR8+j@Hx0?Iw#e
z2>H8e%|6@HlTA)JpR8+`1!2<$ja*&9S7wIp-hA%W2dFkvLzl17E<~<6xsnd|vD^~X
zwHwP&<LX)2<ErV8#ja$@f~r-E!RE^6h(~C!Ei4s?IQd^S)8UopqM*}RW=WiTFy@RE
zTMb>h*eh*SekC)iSBejPiGZ;SB+Ob_<{Ufnu=mn!LrYzstl;K3yRFX>foyh2Tq#}X
zoBFnm>YeG>);-7~wAYzpVL-qo^UVTp4TK5VGAu*S?`uhru7ej4XgX)25!hlbD%0eJ
zt$|bU`@?(X7_FTWcRu3M5pMdjszy5dav980;-0U7lp1k}l@?3{M4~hYP0%Ykg3&Vn
zH!>@Egh)3#1_e+OJ~R#=pruS@PDh0<wJ2snz(n8b7X~8U{cOewo#Fy==eG^Rsz@MJ
zr_x3Jv%}~2VVtQcB-Vp2Q;VEq?+daUhfABB)$Fs!R}DWoEzHsNtMJk~Oa{+>t14Vr
zyUbXDM6JkrFGy67B0}8i;VK0DBt#x1LJ4{Zh0=G953zX(@W6+!;wGYh*nnkjY>>UZ
z)a4ah4*O5Bs_H?x@Cgoy?cj$R^ipCfh@4U(5CpI%2Nf1&y^Vqloe(9KEn&S09cQdi
zsMM}VC-s52k@LvZf{G0?ot%FUj~xn+9;itvnYN<X{}?GCQM4mu_91kjgrI;v!YdwM
z8%VH$(D*DvCPht6Si%5pYUjmHEYDYv7=e!CPGgmPCyB3?ZZ0wi##_wi=?Vla#41%W
zkhSUfvk;VGM&Ug$Av|!a;M~&)Bu0v5B<oDhB$9r8`*hHRXd7)FdT^#*fze-}(II8U
zAsvD-0M8+ua7dXcff?S7V1&mCuRw2w0*<11R4Y%#VnHurrd4T9Dw1cD4oB7o$1?0M
zqY_HW^CvY*xJ#vo+O)iwoZ5^1Cex1fPyCn@De@TAfltRVdKQzL@ad}6o9M{e-zxcW
z1XjRT+jd7Bk2NC@-QQ}ayTviTZG($l^S7%whuhItI-Kt_><IvYk8FLLbJBRFfcvxe
zJ~vM-sR^hg4*&D7Pv<Qrm+Pg*X|<FZXVIIAbB>BRKM>h3U9&Sg-#6q9zua$7XK7JW
z$EsIaoZAn(YktU>a!DArNF?%qLF-#}PUVeWcXyS@SS&pxyNG3c9|jaZxJN%us?QT*
zB!PT6XVr4Vl^$8#GnSfX;Y)XtB0(9Gw)k?2M;#S_0?)d2?9!pT%G~$7Qd!8OFt~9F
zO(BbM55<6Y7&d^0nD&n9dj;efdZQ7(L5CUdp7G0buGUgoT>cW8q&dkC_{_5VY+m7X
zm`1YrIrh>@RUjP(u1tG%Asu$TJp>vvvl6)xZ4HDSv4NHQ5enGu$#+oPdUe-BkxXHW
z#+sZtH|$9h==X*EGW3j8m-`nnSppKR(gi7M5%9T{mg4dVlSyVW&_6%lQYIQwKvp3T
zn@^*l9>HfmO3;Z>y!#+>jUPKPfDk+JjiBE5DsFj3I-;}^QDGT<=f3Nb-t?zmHnDkF
zImnov$&HiqQA3bcytDqLNpV4$8BTH(P;hu2gdz=o`PMfp{s!>;bMi4gqkbgoA&zh}
zJ>wP2mDk^u@f;E}gT}3@VhF55f0?W$%H6k*Gc_suV5O=4IRvJIbXC@8u*ULHxBYn1
zuXJlb-C$D>T?>nyDPZ4nZjEe#$kS+VdJ?9@>h$z}q~(P6QVUVA*nKuMqLOJbYsJVL
ztr2VT4-NfMr~9?&;q<-y&*f|9SMF=pd~OoWC+m`sKTBuyD{em*i8E55lVTB%aUP!c
zEB~yxm)l!iig9TPgwj>ap>;C70rVm|^R1<>i@C}!Wd1MQtv$V8)TP51SscIAzbJMu
zWWE8X>(g&Q`>QPF%u(TKT3+q=eP;basvn1GQ7qi}b#hzRN@>@fCd2Bb3;FE@t;*hk
zMGFRrv-tKK(4hOOOS#{-qNnfd(|qA`?(fsu{3%O|u9-cSB-TMj?Q`{pTAO*LNwIxH
zLDECQ43L=_o6f?|OW%#xK!7>Ruv|zBzZ3i_vBh`Q1rrl(uhw2?xUPPid8-lEepT68
zt$kZlu~2(D@2n8-X);NENbuWVdGOBDlOru-H(0o?uKG@7X+6!}S-4la${w$?`bAZF
z2o8I0(x8}&pQoG~$z$WL(Dr){?kR(B&dhAt5qF)#?lAQ*`-lM9m?NV@`g6;0c12%B
zt#fku$;PRpsrp)vL-T^CPN{MM?*&hBFN*q9hDy?Xi&|tz6CWT%F)+I+^_qDm$o%vE
zs=ZvSCmnmxFBY*_T4Ff8V(F&o@};8n)~Hs2-KCitktWY;tb??uIe^!>ZGNlu*bs$~
z<>D$=r3+pZDQ8)$Tw7D8UdX|Jx~TdtXwkW8w##O-ZLVJa&W<M}sLJcJ%)FbYFdmn~
zd>QwBn;@BnSABVt+6MUiDdS*QGVfqQPvTGDu0L??=;g^@_XaF1<31D0x;7QtHXHJk
zma^@9eVS6M_&K$!y~y1$eY8lt#i4SwTE+MK%OKemtvg@UDD3*cA=x$6X6t(NIoaHb
zahFTos74@Clr!q=sc`-LDWk!?mZh=u#p1zxvrVsYmrj}eG8RU{tB%%cncDq<g|j<v
z;brP7zgS=YCvU%E_GQ5nd#3fvjUB3aubuJ_PpF?yH}A;1%T>x&UDjna7oPSmhbJn)
zvA8pY*$;QJlH==fFi76?+MoDuxdX}P;!jEQOB;=J%i9~`0~J~VG2}2Gva*`-v~{@>
zws}`$qN53Q$fwqv<Ii{zdrO^%rx(C~&K^W6S!Tci4=f*pSzknSWR{M$%JwZiE!#Qg
zN<Ect^UnS27VwWwskG+hZ?{E^*;RRQH>n$q47W8lA7YCatL1n~aW|IAW*+>*7R&V2
z7Zj%t7iy==_hW=A)GwopbM3a=>w<S*lBKJ@FZ1LN7Qfzvc`x>SIeWQEP3>qc882^E
zLy?KFN2&NRLCBzGkUxrZjBRYz@o4WxF4%kP{8U~=v=DH2|2XbChm_vTv#rq%tBHNb
z|4o#8D__WAa+8)s)uA~=F2koROt&8W<YfJk(r8YbF@5NHTINsuMt<O|X;}pz=)LJ^
zL%R}`-TDSZ+zE(&5D<@3d;|7h6llxZ44gp1U6G#I71)lo6|ngvEOD(xs8ivZkD1}^
zmjOZNlDF&OO)T_Qq}a_bWr7o<fk*g9J<`9nuD;iItSDa7Sv<=oo^v7m5<0~RM(X2z
z12_a4!ffd}7oXE|&6xy*qRL%3_3E)Er(mLp=SaWm?cBlCO@^0ZZ2J(-mP0Yk<Lr_&
zRSED61{XaoWUU*2LmTh16Z-x`$L<Ck`yH8o7{~8t>u&C@Gj*R>?C#>FZ7!C4NKRea
z|Er@Lylg22HiDE~%2k=tJ~6iDu)ixNxuKKe93mzAL>O9}CXIohrER2xlobk}2U8|(
z4!KG<xXM4pjU7u}Ag>8qr?`=TARTN$QQiuuxgBAVs|2xyYKRjQG4z8l3Dr(1*@rbn
zn@~aF>Sfj%W`&RDoe2pQa&$P@6403Hn+IYL)OeT$#ZsRbfm8v*^g)pA`aU#8&cLvz
zD!xO7eAOYnBv6#nk3sd+laOLnFiuZ{F6M(Noe1=22<WQLU!VgzQjuftrT!>E7M2bb
z@R16!hsDux%cT(P#$e7TBoz^djzPpQOO}MIhj5T}jo`7=ih)s1Yqo?-3hQ@dEbnQN
z6xGg@6+)?KZ~GzU<T*5uX98)^VQfAHS>8&P%yCeLKod;!q2SwoHU8{*5h4cvS%GOu
zlb#$tLH=734s<zF(+BXi7j^RO2XhMkf&}sY4}}=-`q?R{242cQE!uef3?Eat3P^ey
zNzsE&g&J9WT}&`Tpkr?r5zz}b3kuU?=;W~GApP3pe@ypcvx&aV@3*9C(*QcjbQ7yi
zuB@(=nhgcdQ4<xTxbx5js3Ynfr(21=XO3T%_|%&b&a{#eM5gWIrM==+o!4=)neWGW
zG!yO^_&l*P^;W+fR4A854^hG{dhxEiR`7ILYIi!c#4eDm(BeZ`tfC%j22|TBFHgDT
zYc)H_ZI)`%f?TaEm&@!mJwq0Xj1#9v+TSH>@#RnU{K}^9va_mQdX)V-MSou$3n;Tu
zeT;7sdK?}+@~(Y6i(1CfULXZZY{X~@hQ$jzjg}z=RWDabNFhtXZ--lDZf;n<0>sDm
z_!ek=d?%*#r;X)0b1e4$bADHdREg6cW$N6yC6G(ENoWtwiLYwQ)J3Vs_#h>ut~3vg
zuE>e0@|~4}vju@Qf%3bQ6+_2%t`4z;iunoQFaf$){bo9!T>4M+_m=Zy?48;#s?9oW
zZ3ux**9NPr>AbtIG}V^*xB1^|NA7C7&+uO3A_c;WX%3#Jqntc;iR6{{i4j!R>&P6j
z($t!|yWGnil^2a5Sdd}X+%m=STTT`VjTg>QSnWY=#6KwMZf!N&%zg*Yx4>>XE4Seg
zoz6f5$HHHw+2aL_wE_*(b!xP)-93+g3OM4*oVT}WL<$itXoO>q;uS(g(uN)6&T?b!
z{UAiF&yb?33h1+nW);_)+Z0HfoBM)Vw)3JWlpLntA?cMDRiaU?SY23H0na&PfvPtM
z8OkLkc1SKrjk}$ZE8>Il+JgIm`3k5N+u;)~Ud5q@3{{G={~XbyEaP}-fx@-sj|pNG
zg5b+(m$m%us@`{h6XMfrduIM<RI&Sd>pn6O&M^!@Y)g0k1~BI=JGnZ_WYu_n{M^4^
zGJPqF_A>eg+^nDaplx{xGd@Lmhs7E8aZhf!e*CDrT;UZaMGN)VJsUTNWVK~ZQKfYh
zIGehFjJ(AQe&2W8Cz*;cNN|YnG$=>92x}oo?Bu*2XinxHbv-IN^s?h`&piIgkti;q
zQYT$qPau-;t`OQI!9d*Mek7|92Kg&J$|x3*NtTRj8u>I6Yz4$V5JdK##h^BN{I}-=
z>IfbPe2<0pec@cIv3%#smW$S~AxZ7mojEt1#k%~YuZ>)5qCCYxA~J%BKj&61rXOUY
ztxD+i)Ep>tHeIQdjq<e|1fFY7Zd<pX{LvhJiLzX{a!z@|g-`~^`20HT*|#SuAM_q~
z+{`ZAE@cd4eSddv>o+BBUMVg3&kqpfsxRFe;`5}fx$x53D@ZE~800LS)!j3Is9&>#
z*|(45^cRfnCBkI|D_LR<$2`1~B^lc0E4;OC2@S#F<-dbcann8h?iMSFr~S^l#@AA%
zb7eKl)!LWaeQ(bBhth?whzpf;HyALDCw?7+MFlmRTuP37)6}gVA6RF%)4timgJa<^
zxcw)?TPJ*{!O>pfd*??!Pvfxx__)gDI+eu+1vuxh3x~))wMq(3^X2J3hE3hKe1(sW
z?*dv56+4H5IjU1WC0Cr-k)>5vD;g4h^~RLVgdeaaVXS%LEOinPbU53}Y_HW%*RC|&
zoUK-GF<9<RRfyx}3OVz(*S3k^ydIuuvlt9BN9oFQBy;4PIvj6F$+cH4#>#KG)CQ$I
zrm8W-ArP!vZ?^OtwhZ5W>1ixfPjz7?-ka90;k2!YP&BIi8HB&gD=AJDH@rLeQt{2w
z>Su@>h+d7qNz77mSbet;dQ(%FeL+~Yt?959sVmxdMPr+3xFxlx;!c<$9h>;sU+fEK
zI%1IA_HF;h*<9;por6Gl?O}LZWy`X;<%v;4`GOm*N?K2xqXYK@`1SzT3wf{C`IDrj
z&3>S*@~tx+ZsXyEP5NwkL7%e3HtU*`OFi@If|tt$I3#p^ah>&&L_EJF_?WT#PG5`v
zVs7?u?%RHK9i~Q<bGd>pRWqymz(_1Z>%nW$1QtHK{>p`KPiW10UES1m8#jTYr#gYJ
z{09RUq2@ZmA+nrpm9{@^BhD;kD=xS5>urL9)ls#cryAD7B4%Hqrs+MK+x8CQiaPM{
zSTc(_TjxvOFD?=>=DL*W%TJ<F_qsL~YtZ<=Z;)CQNbAYhyCPw4H`vH}&fPSu?d`DR
z*;~r(VP<vQcwbkk{HJDd;$*sb4*LXrVdBB~=+fZWtzvX8z+`8fZS12mu4d6TBmEx~
zZ-cCjV=dn!Q6s0gyA}iW64wVLfrnc?@rq^kuzY4o<*X{<zJM$?lkQ%hQ?sNpQ56!!
zT{6Zf74JcqlxCs*8<aHBhfk;AJ8-$x_S$pzY>>-*ArIQm%FAbd78Os}IR;0c9O&eT
zKS_Q`t0`K)VXj<y?x0P*!@ozNt|C#$XIp@r-iu1~&A_G>BWw>;Z;Ss!<Y-(i<b^XG
znI@8SZa5Y1yve92;ETH)p2|3P$Ix&t7o>kjkMV8Mz4Cs)&Z}+mrP#}^)Yt1=vG8}Q
z-Db$ngS_2ZZ(hg<hv}3}+j*Soj4;s~;OCOg<6Yf3wKf;pUhg*2oU1^foHN87=PvNO
z7ttlExRQ|DgfZks`Y6*M8?|h2E{>#o7Aehn%EP{WlIkpDUqIR=D#bCaywTY=OIw)7
zm`knZ$8kH?Q%cs1pH4euUU^ftk~0LX+2@~2qf3F3^K1I8TTiXEeicqE!8{+&(4!0r
zSO2ia)y@a=)K;bO^w_R$QVKg7CY?3ck@BMm@}h8E>R0>%@4P;sE7PVwa(`9=A8m2g
zG2Uk=&#kZ1rdU_5-$cnyu;d7)#@_|d)V_SyZMM|-oSntjo1d5ebL7GN%ux6~S8U*9
zYZmH`r{s*8_M+QE+jXm)z}ulGo{9ijh5;d#r-Ok7&sm_pT1(wy@WwE!_+(nRS`Oxu
z96bK+Av&hC+gD2wNYB_~9cL(;t#gjmNP9P!$qISycv@rD7lzPD*Ns_1DMBlpf(aP>
zoc+1=tQ<<dLu$=Ly7uqGry)ZQzr~VbMmve9M&-W?x~0ged)+JH+=okte72Gk3$@Do
zEfvOw^_g0islc8EiVd<#g^F05fo74$+ONHTxLYccad$)w=&&9={qZRQy;EB7%Y)=+
zI}pJ%MkaAUAwp`JSV=?-Bw=k%3JgQQJIPN;a#UduMsg|SoNV%lx($MCAyV^-QSM{!
z$zwptY}6|sp&3~o8RnF}*$-7=Y+({AC8UxgsE{P+m?BVN6QHh&fznYF<Y=VINo0{h
zSZh{!B5V<qamh*PFhf{F&|x+ZMxOL31^D9f`$O0SKiMP*AS0;dk-vPh5GQ7P7@R19
zI83nTL=q~1_$+~LBj}kTn9+=oBAtiu@?!>DsZO}*gSZ`{)Q96cF%bPttEL1NPKwa9
zW-<khI4OD*rq>>g1dz9na*UA37;*Ip>IQtcuOI3*7c*I{qrVCV$lewXntfx(T~l*6
z6(Gbt_4Xjcq0|QXS}udR{l^CK^3R5Spvf|DhS0h(O$V{FBRtVdnz2gDA*wGTUov=Z
zP_?zjH%ahZusY<KhCAhH{Kvte0BX8t!cl35UZULlnTR3ck_04@OGG-uo-$KUUG`Z!
zrwi(u_?3S$+fh!1RT-893a*aWzX`kn?|t5YMc$Udmh~C;QfapXLL=FLFv}Lpz*TW=
zsx7L6(^#bP3FmRUO`fq-=5x(T?SjrsXaT+(xfw~DIW5Jr&v~j<XHE&C2vn&2Bj9-P
zXMu?hYj#5!=Jz8E4dwA@X{9)^M0PO;CG+phGr<ShnC+)e)kOAAcC$3TIG+-LXG;95
zO_X0_@k&O#?v);n#;~XST*y`wg>&0Ht>|k$>pf6byJIFJwJa~<mNl0o(hV-nS(m+s
zsopRGh$Dh+tOAv_)lC$W4f5(&JUICJo@dS9$B>cMt8|);-QA8FVxT9+#4UU@oqdQ6
zU{Tr2w|x>jR&>|vj5L~vCza{c2PnBog$bx$DsJ*k?<70Z{VVk&UXwQM*F!&k=Je92
z!l;Euu}a~}G=DDk?7e-tR&nueU2hC#{H-cVB!UOAc@#?`ljJ8+GovDiAbT)qqo-D+
zy>Z4BZh=7!GYFkx(yOM9)7hKJd{i!fM9*cX)8UiFH7QFV#~^F`TXwFi=hF|~=(C_)
zy!pA29EA$K;zYiR79)d#UEDS6^kXuW*ikE&V^<}Cp6HJoSQ%eZ0%Rk&JYpjXcwhb?
zeVXUFkC!~xhF92$y^2?KQ7JuHEY~lbaLe#@D$4D>UV0I3T<k)Ds3EkQ#neoQNjNZy
z(kV!Gp7p_d-+WGQ700?L2)jM+O~1;svtf!}&lmcYm5do_vq{HVOqgB$HT$bsyWo#b
ze(k%<sY5@5r^~f!UB7@0a1a9BGy8my4qt5V1^3$a8J<8&iPwh$<UCwX|6A)fAUZ3#
zTH|BTyllCAkh3P@aB;3TrJ-u@hW%W!zWz!WW1etBxY9?F!-tXi8oi$j0g-M%RzcYF
zB)p}j+_w-l)>#r4;%b}uS0hnt{Q4^6_C5uZpVP_|mM~QJ2bQ#U9fkJoh8r7GTd^{o
zRWBJMbw5?Ik|3)XNQ^WmN7#&Z6q4$ogoMANCG*XW3V$!(V0_2*B#;+bc{5t<M9&l6
z!+8GGeA)~*U$o{G{ZR~s<^KVdKx)6OQI#5}mCp2|h;=7Y?j29KRldA=4!LsG4N#>t
zvAjJxO$}ok_G3))r&9)&zW@ozJK=!45s%PU7{jtDMyugi7JSF%rW>>C+)mqjcQNjD
z%LCZWOs4j^Uhv>OCh1nPTA%7WmOIzV?fTxoNm=dQm6c7;XSsjrtg4nWtChU9=<5Ff
zTCb~Cn`$>#*p2|MGbnom`zI)pl}5GgsKsS)Lz?#Txug{iX<k1RResOShTYCM)~J=c
zk&9JT!iS}IE{5pZwD(qS9XKqlE-E(Z`>1^yPNYj)>UwWgx2H~J^x3MHo-6$m(;X<y
z%XZoJUGVRlEowpAy`5`t>u}bS>1nubmYb0C`E@A$mw&0D%;Wdl_WiosvWD%s+~3g%
zDSFkO^iIM^ewe*&-!rZHd!=^$pQg5!Qm;kS-MCy<A?&Q?opoG5<}a*U%d7cq@^UsZ
zOO1Xu+*I7#lBS9@`^C|mGf667&ZE;GEIMPS9dsFO{b%)spjN)8PN1bqe3Z+Zr8K*r
zLwVvo@8y?7Zw_@Db}icP8g;FPrZEy;vl!n+jd#V$$Nl)^wZ3O{Csg$Y)-HE()3$A?
z=~Ss`T-tP-P@|t!S6rIb^Sjdcxg)CVy?OHYWvN$Sw%*+HNIJcJ#=G56R;<66w=U#;
zM>U+Sr^xMEY4b&QE#Hf)l$@Gv(?#A9r7E3T!5a<Jr#hjIvbX5>?fZ7uX+Y6+mF+K8
zh5GesqSmp`Xw&g}yx(Kvj&Id3`Lom2ZFeop)BQf$_tm$&u%k%ZHSTMunzfkLHlX~v
z+J8EmS@wCN$o~MIT`bGXrnhaouST0gZ0@AvS$yHt>JVr*?73YP-OJ~u`F6^b>^9qL
zYE=jn&2s9;Ov8iO%E#+(!xOV=>Y{ejYggOa8>f?!Noi7vf7Ltw>ho7|-L9^VtwEyT
z=xJO-DVEd8v8wecmKK_A(n1vzm`z%Y^2#nXK44g0urqM!9ge7$y0mB-H46MqsKVzl
z{{R$MHoL8=({ML;+i=`%*c@nDR;uRG*Z!ZX=JV8J7t8U+_@<)#l<ZoOlWqK8Nw(Ox
zzOcBot<t*%Ym2tsdS~?1maFL+Zx3xsFDL$+nY_=}raKQxY~M8K?iN=)L3?feG&$z>
z(J333H$fD~txkmh05j8b{{TOm{ISmVf13NA%<6RZy{Bxnv1*fPZAH{|E+^F0QN_pb
z#oeDhcB5eGl{Tw^exNp!3m-(!nclL8HLV}={Lz-}xi|bYv(s@V=SoGVT<Xo`*UMdp
zeCivnqUsM)=Do>d8(`XwZ5B4016m!W9^ylNM>k%GZvO!Ci==mJKTmG68#T)u*9*Hw
z1ubbuG1Szx`kV1MYSsKvlS}4jMDANl?;p6+qRNhK3YND=)2P?fI&`_H=js0dn$1jX
zQMc}>-ugeN_NH|H{Cc9o>Y1fSXx7^GLsS;)sYE-}d;b8^tbaF4J}*2`X<c4c-Whek
zO-fbV_O+6013K&F$mwmg?RzfyX<=V^tLa~E`-M+V;nv>`ovk1}KZ{%RzMJa5_%v^}
zUzU9(uwB|++wGSZdyd}1MeS#5rCzyBHC;{4a|79{@}9;uwE4f(Y-_*n-RGz+cRN0+
z=sViB2AT9NnMl02fH__2Xme_6cBWRP%+E79qmxkTub90K8@1P~cek*DTWfg2T1pjl
z7d#$GujM;TN-E{e=}9S9#z>pNMNitY@4)JRF>LO4Ev~Oj^!CqbeYkElP3_|?0|i<Y
znC7_L)OjsqYQ3#>4^iHK<@|9(XUT4&qh((6cfPZ-Z7W~Y8cuPj{{T}4+tdxx#(7-N
zTAS)O>@o4V=#QJUZ74Tu>c>>0qycZc+tX=&gUl&hGfu5M+?80^wA|fnoq1!q?R`CW
zM9_06(ZAj9YCKn>9$e;BZ8wHCrk_tp{MvqajAh8<$79vi$>Z2Pu$!vNI;~DU(w%R)
zbp6%4&1b%a5`8DU>Nfh%<~TG?ttx|-qkoQ{JbG(o-R*jpY^AwaxPD_nsYE%hbvF>Y
zNohEacE<+lFO^$2Pi<eTt<|>AzWm2iwRJGzU(M*Uiz&VJnXgZ#_nmC}+_7HSI?H<K
z>w1>^CF8wXz=KRM7O<bX;90Fl<!aRC#@NCQ@3yF&RF+BpluF#3qf3v?ABJA|%dqXg
zGA$Z*`y1z@O|@f8xS%>Jw7WhIr5xAwHle|yyEV9$v}y`Wib(zFpl(~{^}AMe*6n{*
zyWD@&d3qC!-Tv%gf72XX=ha^@Bm^O3sE&@OAD1gOqi<@GU3wTU@!@iATbgZ9>PSH|
zLzqu=GzSD$pj<e{x2N|X>RW9$)NplbN7Ab7Ceh0B2ikA>wMNfLSphV{N~u2R>qcsE
zQdu=qohZ1;#pu8g5g|DMGD5S20MbZQ#E`PW12m4}iq*|egb)EbP#Gu^_n2Z4C<rS&
zr<@@tD^Uh2WG9-z2nlfPSzvo$H&_imQSLa*`(cOzLBkUyL;+<8jDQHT{__PCa|J6c
zSwhlC2{SU{3Mi65Fe^yKLZK!}KFCCwYLG&E;DBZeq)-A>;8qAyWJ2N-y#xXo=5x@f
zB~)<A44M_hDUtA+N*q8gm~vvLzbqhzI3i;;K_r$V#sQ7eppw)WOa(&@Xv|5&1d^CU
zEam~sq|}K8he~5t5D*DLQ3FT=DInlgRvV*~4Ki>+;v!Y$R=6oBQ}&z!2rDw7Wh7bW
z0KkV5WingE6Bq<(9A%W{iugo;AP8om0-O(bbO9nr2vfd!;|xO)$e$N~j~)=9=?a=m
z%8uX@k8CIa|JCCa!32>YqJHjf5iPlq1O#aE&yN^}unM5bI|}<Edvp}@5>y!xAnC_a
zEBC%0;t%0nhg0c3rlqiaxxRB8Mbq6;v|A!|J;jXrW0(Tx9(6B5t{_ainiFm+nm?xX
z7K($Q5v4)Zln}!p0<3@-@L$~pn|oZJ-jzv7*8>8m{Nyt|oIE0CYAqn*Ow%*_VGz~?
z+>n6fsG#l6CS$o&6iJErdtwlNY4?$0Z7Ro2QpYx(T|$${A+FG#qeHVTE^e(c(LPc+
z&FYpk5?bm+=~4kCiKKISU~5l>NCcqAJY%JNpQh*aOKN;9q)U!yzu6GGPs`nJ{xKy7
zNh7%Rk5p6XU6!Tuiq-!BO^N|7I0&ZfKRLEt!$r349cNJdPYTuiLMdN2`jOkV`TZ_~
zkLj!cZDcZaXJS8vqB*(%;?ljHf^sokt{i1)In_6paW`^~i$-@pnHxsm)Z0kiwwkQ1
zb8HLPSV1ppO|ijg)Tc(fA6}oSO{ABw<B%i2?#9-`cHB0;Yg+Xf$0n+4$Xb}8$~9M&
zpL1HJcYxQ_D&R0CCmT|?2L|^>wU1Ora!!soGX1jkXHM&7H`@vo0g|^4;ybgB%hQ{-
z-*;{etyaA-(m^rh2L%%{nv}r=bzhRktM{9MsJeh+wwjU}PX*%`?^sbzad}fd(%9YI
zNVV)ce)CGLYd=+k3}c6U*Q<ODg(Lp}u$zj1#6hGJ9j~gok8J77I(7F7qMbUf44KWx
zgjRZ!WU%VJhO=#)+KsBuQiVnpEjWh=s@mChHz`%ACyX3&Jm8?XgMs6UD#uRwT~WWM
zyr-v2wXUzaS`km(aBujdncUI1w69jy;1<@PO*)M{T7I_2O8HRYP4yS2qzFP0#1hGm
zXZtXE^WEvU-(+9wB7U2PDjm<<%70m_dYb*jsV;6AD@3DC!WzO{902Ad<kP_;eb=-b
zN2%`^g<jfjS);jfip6KE0~DblQ-F<JJCxzspYowj{iQsG`}>RZ<?k6$=A-p5n*bF+
z@rHm3O+j5rBZQ$!YK}7H$ettH0HJp|GMDy6ZMN$3WGO+-(o>G06ikpuek%xt0ioQW
z7X=7};=FKy+85{$kERBYik+u~AP@z*kmbM`c0vSdM@eTX{SdT?km*UJ;Dz2_4p?rY
zxJ!WjrvMTc6U@?70)=>YLTyKYOI(d0>S~lEL}wIhq-NuS!Bi><Ig*Kt0bYjE{{VKr
z-rq_0_eSESy&-Q*)D0SGFsoXvYc+oqVyl{iN}xah!?U(ACpA>fB@znt7zhWI$bQt4
z5=mrBb#pR(SA}%J5(flaCQ5OD9OWRLL@Iv5AU6tmm;e(11jp%v>aoO?Eg1#v?1mLG
z8rLF%q)Jnm_`9IARK-99sX3U$N=`x?;7H@y+XWc9n)2KPjFbohrcl-b>PI)qxrO&b
zn{iYG62?U3D8oR~zN@=QKFL9CQpYH$NyLJa?}M-uyG+w|f_$qYK$$1#fGX291Hm-h
z=FFl2lDEq*EMvCjoXe?#P?er8m$qE6!E(?!8ix!qfuu`w%BN!JoBpNhotEvkTb)XK
z$4dHDw)Yy3EWEdVw|<iLnzU8Z=xAkcKjue8w%@u#sX9ns?$5P$-BFHy;kRB?Yn?{A
zpHYqV4z|6I)#&Nd{{SjHziqp2JA2<+!s@lHN{w|Yj-gP~bC^?89BItccL>AvUR(>1
zk%8bb#F)!2B9!+hM4D=vbtm(c5_QbB^-jsKZMSuuu<w*N3rE*?p=m%>d#TVo%ynAs
zV`(|f^61p))@=PlxOCmy?>0-9TGw+;I)$kAxv7Kcy=kAg;~FyOI4+}tq~J{ZIB<;$
z3uf8*lj=d-#<5g0F83?D%C4<ip?z9}m((+PD!s|glVS$eEN+${1oLqtf#DG@3yhN~
zLdr&b4oTW!tQ8WaIuLSlb!>5SiBeP$n9T5(WF|M^2v?Vfz5>W1rV#8fD3V;~4G6Pz
zWoI{VuWSiEr43DJ$;@(*g!x?;6(k|SWjXgn7o<w>P1HjAHFbLd#xKoiDupvZB#LJx
zTw}Asb+Ee|Us=!PFWz^m{X=yULP)3+5&*$jBZ`KU;EH=OSaW~~6cLP$TtuLOr8P_e
ziS6%=WdcA{xmw}@S^Z`Pkf7j61QV1!GZ+;!koUq-gFpnye8;;?AONQ)AzV+gQ=e=C
z2nnPrDw${B4t@ww1Vl;q6%Y=UBuPxb5`X}JIY7f9`}2T9&Lpk~pSKuzR}fK4M=%2~
zWCA9#UfhA>fQBKZpb)ddXSDwSWDo)u5V)L#&~X!jLu3TP1XUCIAqIe@$Z?Q9`9J_^
zDiNrHV1>sxa|)Gr`J_w)21Tk$cgh~p0SSgmgm%CKIgpeS$xupa4;VmWfLyWc&QM2~
z1OSQqf4&gG5kT$XIluyJyumvZ3^)%MDx^77gyE8zLew)ec1fCWgR2BmQ4h8N8l1kK
zp4DIMg%9|WUC1$o3=jX-;}tsyB!IYre(!JMi6{UVLx7u5%3&v;bT*?4iU6eG5Ry=K
zL8YWM$uT~|9Z2DEzQP2Q+;u6K%1EOInMC08O%!L292dGFr6wbtCkX`XAMC`SlrECu
znPK;0=n*yb&{qT!pp8Tc0q<O5Mgm(Sq_3B<W9o?Nm8?Soq`kP~4{DGa04I(Jy~`PG
z{8u~4JciS2ocs_j2OM*YXG!nO>Fb+|-CAn8k!>sxkgiGp01Q=F^QlNK(g;v6k;M2!
zl`1(EMhFsMAV{%AO5CF?rS&kYO)`4-HLk1IySa9%_39dx7c%UAm}&y{xu7_v2gN{G
zV?g3&W@ZZFxWnG3PZ>^q>lUn$7nqzA1ZzTopm6StI^<?mFd-*C$jc=NK35DDc(1ZC
zYZQ@?Kv6y7E4_q9h)jtG098BtW3v2<rpArN>DV>kdmDzR<0U~sB8X3XTe^Fys_Z>K
zePd}Wi?<Yb$Czj?{uahL!dDj9tXp34$TXc_Lud0#sz2@x{q$(m*J)(5>43Ui_2zW>
zOlf*^tvAk`dudwNHLaF*l}|75E)k=sK0YN2h(%o2;=(br#|0~z#nZJI!9l95MZLFE
zebk$+rul#UO4^a3Qn&u$qYYV@vV5(oTe)0-;@3HiC5a+F*Ww@VM?VKsh8YP?ItR+<
zmhN@Hq)j(5qts6;ry<@K{4ve-Wlv4yMJlbD_oH70*OzI%U)0PGnL3YAYH;Os5P(dL
zE-MtLb3D03hzAL%I023kborXy9YEq;sm6myAw-qIsT8MSqYwz@N#Vs}JU3ns6@M)<
zJiqF0{w5_P7a5_RLk<wI1rwW>RydGVAqgQ;2~}AEFYShqP?AnXy{14##RHweLWNw`
z6JGYssUSqaK)fJ0AT1;)CTh5FNCgfA%QBDvL!W?94mkouHb?@`8OjU8n&Yf#cbK;f
zdm#dbSgfG{0-RAWf=Iv;3CwpwnJP-IW&oM`ga9I%u`7i0geJTP2tXmnu%{8*;RO|~
zm%^$}AxV<JA*AtwJ8*)SW=<FxcpPDA5=rpEM}oruCjl&vWGqCd*$4@O29kk2utdb&
ziO3JG5D2X1OpNlz1|>&~A;CG33PKGyuWYzLFnGyA!Z^zeq4~F0loire8j_wuONI~%
zf*ej+!XW<WC<#(Hb}kSVW|Ew|VTA+&3BnS<LI7#)86+eEL1&k2AP^%1?u4>HFwedL
zGbx8;DLb$U!vWGvW$(st;9BC~(7nBo%n;-I;nM(>m)HG}z_4^|x8|oR(X51KG>E^I
z0Op}H9>D15Jt|YEY}ycm>ILld-hQa*KoX@0kajWE;5yl-Nk4Iml)V1{;rhlQR9+ZJ
zcS;tOA#<J_XT}~voC2?~eK126xM7+To>8)<Y7Gn|kv;^%1cD@rowE{}!^#i_oXU74
z;UNv0gc%M6bA|vD=0wGDnCymWbjnO3C;~<##5j<k{2<G+14bcIOWp0107g_2pcF+>
zhwg_A5FGLxPZ16*LI4ReNkuqAv^GaMd5I7K9OR&s>|V!x0s#&Hf|;sLAEgeI4VN$&
znf4ecgens-GtK}b0Fi=+U_=QJ0mgG*WH?B6C?t%hIYWv`prZ*@B47awR3ji21bL(f
z7$rfNB&l5RmS5(AjzHv8Y>DkA5`f?qW=3Zp2nHa46eMO$u@Fy~0p*gBGua51rlL@s
zW=x?IsT_c~PGFp601QLt%M|d14$$QxIAts$Lj(WU<5h^67*E{_o}j}iNDRdLM4;7f
zkXH$kqvH3%U{-2mkc9CO)%$(~31|&Tl+kfWaF`+9WRXcEMNFVWm-gclgC}YTfC*v2
zv1)Yo3Fd36DXI@|wsDZdCWfey01o(qs*2!=A}H;NSU^zbsWNcMN=8^=IrfryATmfs
zDn>b^O;8XEuux3BkqbaxP@qkGaY&i!Kw|S01G^Zu8Awcf09_%*NCNmQJ4%AHDV|->
z8>C`LDpZd11Q3fR;53Zfo^KerWQc?-0)kKw9Wgk;s_tpZK|EtE0Z?*N;}~_NiB9#3
zZ+s&}Xi3O-6r4{(0^x`uR(xUtfX;YIe_TyOs)C8{in#v(7}+Tts1j0Np>l{G0dk@)
z90wSk(g`r5GEcfO-qk{dP6v!zct}8!K$zf?i|5N4l?LxwjCPu3jWub4W`v{v0EK_e
z6cCC46CQ3SvM;|ZMKiqBBppJS(C)%{n~eC!C)Ha20LOA`{{T<xHSm9`wcqs+{L$0>
zZkjO<qbH^yI&J<4qdf}g5*fwY=8K;F`iowLDJW<J8Irx~-7!dT<?TEQ#T*L$PmZ3J
z(Jp887k?175TRs#zwUrjAu)jNgfWt?1I84}x@tt{oM|%)v`CRZ*&r&eqDcdkc!)<3
z43m#{WEqu7FJvL4-U4w~7ZRlNImbV{;Q^>fRYN&rIN0s`J?7z1GKDf%;z>>~J5S5H
zk8K*4I*O*IltVbGccBd3UqooP#kndxk<O{x)v7sKNdiNR@{E>kmv5h@;-twGc%*i#
zZjQ3F4W=ouFE*C}Fb*-Yqp;a%bF?+B6Kh%qAmtgKHP(q6Bf+;txwdeIY?rk(hSDLb
ziiHOr=#%;l_JH9`p~1?7IhgM>wrdlsLrcH}BO0WX!0?8*Vy5+VS^<m!S_&yo8N&S3
z%NGv9M+1J(bP#DL4#}P#*bmz7nBHdKQ1&P9j`4BRR&N2|G^v>YA|@UY6Z%4lV(8T=
zITr|jo0`(3e<x-393a@%006=Rvyhi>vKQ#<nHIRD!pktfM0Q8?qYLytkZ2?TM5WsZ
z^pzkKu!0Ibt&yJ<NIy9G68w#>ryu^CTm&r05JKZ9dp3)QE|!%TBF%A*z&cio0d|t{
zAd-+?*vO~TRKtNZ!$~7cWjRj>i{?uthxscjGm+EjtA~K?GT<;+$!|$qyaK^9@Z%kk
z-91W_+fAU1M+s*W4U&s#hrW|fl^stN9AQ31%?7yk63F7O=^Ma&r-A`L0T`9KX8x13
zmKDh`NbXOuw`0^0u24ao@y;W5*JkRvUeJ!7hYUN6MRKkun5D+F*TQ-JzTI;`A6lE6
zrxKF)#v__qC&b5~?)Dox?0YJ)L2ofYxaSnT^X2_FYJHdtehy-ItZ0WZIw`LcHO>m7
zcHe_JehVz+iv1!MV~S#9KXKY__S&YEc406;DZ>&3U>3Qns3G8q8q|};I+!O3Mbb)w
zIj%BZFy??kDuq4$kN_*g>VgBj&1}FbdpPz)h?^~f;LzVJD-=JR#uX?L=h7z;gIyg+
zv@~fxW|1PYob$#w{Q%)snFmXn;O3VS2q1$wc1LBXKR;4n<0IHm$pm!0J^5z3H2(n2
zfs>Z8i|U&85s;|T5VcT}Ke7rGQUZ8(lOYa5KtiC+c%d+eAWg{W$R;tk$^r{Uf>p~V
zth^y|k};6MGtM5?$xMWm+)7FlMlLF#Ak0d6cfcWp1j1*UQ%`5%f&oXO%oLcCP6=rW
z3%dbbIPl{D06}r)37jwxLaB*UpY1h-6;J>iP&si=-w%*bs3w4C!v_drhbYy|XFl*a
zc0d9kVigqwNZ|vC2eYtnUJ!r?7bA-B5N4`EIdcQu3<0M?kP*NsGZO89bJS4C1KKC;
z5bj?AFpKrVm>`oPaX6BhEC5DHUQ!5EK_%kj-vp^t&zgPnY!J}tgDj7XIjt%r95XH_
zB|?gLzyl2O1R@mre-t%5poC1)e$3)PptQTUn*lxmh-f5&;3lf#6F6WqS1N1G7qg1O
zha|Bmi>Lnpi6KA-|Ip?8Zq;+XUmHs{txY<mf)A!O^)2dKI;9Hj(^{D=sK-cxB{3JD
zlq_vdt1Ty&(InwH;fz?mZgfMeoZg|^sjV8M3)*9f^3{<402FO^>B(_<IQ=8rSXF0J
zlC`8DDAGH@hcptP$PlyOxXuH0)NrLtdbM3Pv7uZ_q=qE-MeAqzZMoF5e(Sv@ovk*S
zo|}x&4N7=^7||=Rhs5<EwY-jTP#;(>d8wBObutN85K7DvL`i-re!8Hw4m29$>5{5i
zJ}($)W5Co}bg!6AzR}+m#rdTBSJDh?2oI4`)$?S%kt@RXfK?7Fht&+G32^`wpD65t
zNOQnENK=u*Er-M6M?)a;fDRzxERsPnhJ^rgNk{L5;0Kf}=ip>SM%QXHm_!G{EUg4f
zB<05p5^8%)VbxWffGAfe%ckVV40FaYYlXxWCSg-4MK}HiG(?~yifTZau3f<dPfuDa
zlsJhOE*MKxV@(2sH9!pTjG9w27~zo2=M{1OC)_uYBZiiNgn<Sie3Xnh;W)`M4im$_
z4%mpd0ys^7ZrIatU_}r#j~Qnj1Lc0d>Ng;;xrL19&8`~1+KK{5$pMz5D8GEIC;F|x
zYyJkSKGRgpeiIyDSYP`7PvQFFZS?;D>aBOb*=7&S($Ulm((Px_YM#NXOj1f_6A5rg
zy!_B=?{o-i15C>Tt1<hcjfB(16h#~dULBA7Xne&^Mc>2&l=!AF`cz+1qiNG&eOHj$
zRM<I-o`H=6JzAldD~XWAkK4vB-9NWd+GslV<R$x9Tc!9=ZDGx!xVr1pfAaj)zAk9d
zDQ#6Mw;jyYH>NIWu+^^Swv#zuq|MJyD**E~xw<xc2TA_2;G2Mynm{<Js541QHm-e3
z`r@9SPLmFR&@e<&2Q-`h+*|gmQBI9EuyT;7J28xRZ%HikIyP)Vo0>K|M#*PrG%}EE
zftWZFe{^n{MU>l2DIf%HU9MP;i@^#4WSC(YOgi2hrDQ+@B4ae3y0Y}D`w@mMs#kHU
zTmdB-1}XJMOK5}|T2llw!-Qor+!u17Y9cOtVeMv@Bice0QG!GiaV&Gvxe9}t7(?2G
z&qGt+?Tm)eVQ4L;@jX&QED{^~t9Hm~5<r8Pz;(-mi$HT)nw2H(i!e(ZNI6yEg9S<r
zYq^V%9Yg?7Wez~byGTHBP=b7NhYFPrkP*!&2@_G<5{hlmT{SXrGakqVT9~i5rB#wi
z3B$ig(#=H+unR69z8k5H0a}6TbgGl~Q@$PRX{D?La2km0z~coGDm*AP*6n+cHcjFa
zw>U1h4XLH9FES`FUc(NfbWKSj09D0tNL4P{K?)p46NhvvMu7JwIkfeX#a&#)eNgAp
zE^A9eLZHH3v4c-^456(E(n^E=Q5ZEkkPR+-z;m!<k)Ux{b&V;j6(sUTZCf|fa>#Q`
zp$H7681*PLw6)>D$PB^pfX=0~9F<-Pdg*Zr7stLJx!jvse@^CAbb>GmcAZ{WgN$?<
zbE>K&Z8bVvNopb-08tpX6qT<o7|DBzh)uhy8P0*g08obo<0x}x_sy?$Nf6vY$dDpB
z_l(hQyH4M@(XC3YFy(Hhyy6y8?v81_bahK3>d@w(<EK_6D8R>CtxO7f37I7T@0g8s
z^^vV=@lOYoLQ$$GrO|z>vWurutG44L_alZ~+19kIDb;8J0clw{5tp(c0g}~*1q{&)
z=N$v6wt96b2Ueu!J&qKrGzD`Dj+Rvln0<lf)%aqAT9c?%bvmd^32Nk10zVXVt<6be
znajIxW5X0=TGJOdR`fKAVy#+q+K7XhqnR=DM_;Oi?{lQ2p40|8F2c=w*4hWfbwSP<
zAb4}V&5pOEkWMS_oOOLYc_#Y09PV4jE3jrL1h~m$nM1+PP-sa24DrlBLI?{i)B7Ne
zK$i?A00$s4jmA*Oa~urE*9a6@mvQmN4xY{f-I~HP;sD6vkO%}XX^{yiMM|JhIM2jF
z15gDNERH|3p~WYXc7lG00BDnxlv6BllnG2VNLdFJP)c?|nI%bK5+e@T!f<Jvh{9{x
zoB%b-iezS^3|A~+9VY>#<VSxPbEJa_;n)MX!kG(~062F;0lrlPlCnhlQ~l^9@RCfH
z@E?STIzX6kT+m{?VKt#O9#9Gv@ytK~B@E2wRGA@{vKEjKI22Uv_rVm@1PZ8HNW>&9
zB#;7WI5_tod;m0s1L0B!?2wVGBC}854|7noIz(lqa>j5P47APIwI;qDu+R+WT$dB<
z!W2oTl@MTvv&a3YPyzqZsC=ldX?2~pqB-e3vcY*{(fhvLM|Ik6cH4w3xM^=R@y;mS
z7xM3B>325jsW)4eF0AZ{#54>9U(20VlA#)6yVq3iIY*&%V@T1IzLoiBvUIIBSE%3C
zUpxI;fD_mnxJ7g3_ft0B&X-TxJhr9E8sVrCpZ1vQ7Zr%2`pg>Mz)`*t<({YAF859P
z`sV$noyV%`Vw=7OvO%+p_=(-Ux^BBwNY0Us7Pu9v-_0{<Q{_^=5!Sw5^sTQ@bcW&U
z(`l<&&uuz~7N>Uu9G19R?m}FIB)g>^h50B+>4!E%3a?OmT{xdAZANW6dC?n5ZpWr4
z3AnPju)Dajxd~p~?x#_%C>5$XRXpJ%94@7t_+uS|=LbidJH~Y#l{2E~idQUPRCX+X
zf5R2NmHAt}Z{?Nm7;J3`1zr5dwrT@Umr~;zA;`D#V>z_2$EgWH)`+ATw2w5D8B4Yy
z1ylnh9oS7{s{Fy)I)9_KsylAoN7dXe5PGDk3?CUjJh95@1yH6#9>q+tSh^Qg#GM>d
z!BkR)+$&Z+j?j3Jy{(8Vs96z&rxNoxl8Ow~6WfeKa0n{VoVa*Fci>{wgRuBUO%g~}
zi2@fgv;EPGQ$ACOILaqv*ho@^4mibDADktTGU;#xk^4>)i$NrW2aZTYu*w?ehZR&@
z)k2euq6d=WrLH892|#5fc03~Mml9ZsAdNblHTkpm#qZ^LXZ3o_s1d=Swc{1dyizD>
zjwnRRd*b`@xK;kCa^|LuEe;1AN=QW~??)rm-~PXl-`xKInHzmQf2+0L{l+)VCOmJ{
z+Fw6Na;^kBAL0fmjtF}(J{ZO8=4<IYs1}^dXf;n2anhdnqd0)BGXNbcf$#qSG*Q4*
z;qm_f?V#p=QFpdR!)-sNs>`!a)VUASXGDbpul!eoCwg9O2jh!Hm90wkXi<Ga_R?{v
zbuTl}lTh*IF46ud)^z=mx!oATH48dg+)_o)=IrL?!|uh=_SCHfFx2&mQl^Due9qE2
zm+V5laqdxCDwf%TQEj423tCR7(9r4m1o^)6l6qyz#{U2mSZRXi0u0<GrvP<j-|<G4
z-m$l+>P1^pPOk7#!LA-f$5WBKkz=!FwrU|Mv<A~|0C;_<$#XMpyb+s3ma+PpT<4NN
z9%*8I@hPct;m8!H^4gla)k9+rjngQx^_&Zzg*GsPs7z#2Hx&SOw^MPMYn%vZ?P$U)
zd7@ig7|M^_=!Q9{2}bKK0Cx>jj7s-+O$|2}Z4Sa!HGz-W)ePJ$ol<Rf0FYE8K*$5z
z8Yi;m-2)k|X7&W1aPat{`$@$cx?<|1OP_QeT_D-1?ja*S&b4}6jj64Qk87r!P{mS>
z&f!9#EoPl69Kgi@jKKmkDc;_-%0XyE8kS(^I4kTYvAl3EbRG*D(sPvX;TbMvR=btO
z7GPuB6`Kv%RHiAo0$fz8C`=MIwJs_OL^K+%dJ#-wPijEqkzU4`u@7V3#BY=alH~1&
zyF**~fdZqNlA7itU1W8?(r_fUYPcQ&ct*wGnYrquAVimpH7bp&U~9SUOWOT40<AZ8
z6Bt#lO+ZYV^5R@Z8bT*@Y%W0>ONgfSgNTuy1~fZ`$fV*Jh1dcx!AOWU>PdNsHwx}F
zPpDkRoh8Llr~<6xNn3KWeN!hqk&8e@PkcLYH&-;vhc1jwLm}bW6?@Ib(#7Pu@71gE
zdMqwPmzl^L7Dg^uC003Yit|ZCu{UyPs(nf>dU*qmIN}g-jsDqw(u`wV!z5{IRTsAy
z<-@6~5>2(YsP*&JCB&!QVC{^SZ@pUauHM@~XT;hLkP}s?V$YK-p4j)YnO?N=-20QE
z*J{+YsZH*yOg5*&?LSOF`$oFo*Q2G*1zga-?vbWBVe0O<rpeWKrHfCX%_MQvrXU@A
z)pY7Qp}J<hC_|{QD#*zM#5Kl~$;J4T{$kv7+`Hm+3UwP$J&kDrs90u4F)K1mLs-?O
z*B7Ux;8c7i4xaBUQ}&I^ukUD!-M+*16sng7)-*<!midJ;=QCcHeV;5xxpkLOSywx5
z@3ykmwKCCKo2i!6Zn>#xGF9UO<6Lg4m05(jHrl6CcXvvj<3_CtfV!PVmgt`;MOshu
zvHt*eIfm<NQPrxJtKCnX?Qt5~;?U=g6*2W0raA>0+NupQnpULXj%gz{&DxvxVxPMZ
z-O{B}OIcS*xNw;oT<UGF_&mCk?M90&r$(sy8Sf<Hfq~L=fBs9iw3RJ$+(;QVo82@n
zpZhV?K?&R!1s>Svy(MS=0P==9xGjAOb*fZraBG0msidA@?Q0z_<lB}~Z5m~TH8@mq
zy-$_gKQLt--V3FwzqKdsIJ~vzr%0HSAzXlAKXk&wkl;WW05XKo99#mZkyAfp6%G!_
zs54TMF|`>Onj0`CTlEZJV1!9~d8H0OF)J}8l;n#Ep$G>RCVkzozzs8)m@2*eV1yZG
zJ{%x50~Y}zN=%*@!CZ*R3Cp~9tRNK197bn|5FFJ2<N#6Q`=QgC1QH7g9fNy;6UK1B
z4RHe!4-ly($J+?Cq^1%?`k?6qocv(9z5|CCDg!hCM5mY;LyKfqYl4)*4w2<5R%8Lf
z2qXYO$fT4&14(FiD(OJDf(hXP2qY=x<(^9qnyRCutKo<+($kA1nB$)q0B(@esbnRP
zfu%H@PFO6K9n3W$#ZXL=34&UMRR%>#@PG)@liWMk6@?OWE~%~vq7*Pc|I)c_-!%F@
z_06S9sHaq!HlQGr!Y?*AV(qP~Axwn>h>sakf2VOt%?qXNEfT~hbb6lCai_QRqc=LQ
zLrUB|R3e7K$gr)>J93hE7e-r?u7#2G={lpILBNsbYR4b>wb5d$9n?Ez1gydT09M8X
z^%x53m&780L*_44+Fw?^w7hU-T8?<7P-<!}36?Q%+x~Mp=H1r2{g$SpdX#yZN8pPi
z$Gd7<<EA)!h}JI#&N$jNF0PqQn+-~pPMTq%POOA_N9A8m?mGs=>N|=urDqy}qW}tV
z&Nzc_wu@e{rMTD)HcqRoFw8Si9Anb^M&UzpwWn5%K<4QLI2>gZ@@_Wv<-5#E+;DF?
zU~R4`TijacxnQUm;94CppvI$j-DzzWHLvTXJ9;cGk_<D9`qZvHKUJLc^AXMdZrbj*
z8wEP{RSC4B;OnYj9Yug5OYE4U+TV6DlcKE&yc25{u{hUK?{{17=U;KUC2Kipw2}i@
z3DO(k8k$02gR8+n0K}IM<!KTGV~mO6ltoDMcHqBSM|z1#MP!kUy5rPvC3%F-22&9c
zZ>d(DHXw6|PV*?3pfmWez!dO^xrJ5{i(Cp4lU~@3>>~StW;=4*t~R@m)7%8*Yub2%
zBXQIHAH46+XrV;YpdHOxpeUIZh`R2yEO!b!ZF<y0p2o3UWU@pta&GfyuxxL<tQ$>~
z*nKuON@jw1ta72sm3F~aPY&Z$xan3>cpP`DHVbZ?t4hRIHSS1Pg-|feVHJk}szy1+
z!*|?k?$`Bq8-lei9jFX|m<=8guQMTgC+?1ePij(ibi~f+-U^5<&$y@_)Q*Ajs8ar;
zbmH`=8r>~9wCCXHpbXM$f#Q*T`C7)*-7B^2M`&rMQzQx=-q{%D`o2D&fAD|Ijeei~
zRlfK8EXVnd*A4xza4lo0fYii!gTYU{d19GwOV+H`5{)hZoB|A`-4}kpZC~raRMX8$
zN>wzHuG2Nt%`t<uTU%XRGMx)rb#>HZD(Y&vz_rgrUPW~_mcA<DN}LN0m8G@pEaY3=
z$+`HA-%Rb7AEEtgsj`(zl6^X05L&_lz3!f@G_}8KHusOEZ(X(NxhgO<mh5!UhJ{=j
zT+Urz4yS4RahXlkHv4AT=@hDvwE+q#k>w0bWIVqNRyQlUZn$V;2@eNKpz&|r&NFUu
zp5|y!af-lg*6_``nJzw~nWsf35vlGj{K|xYRsR5Y;cOQC!QR}@(^y#2LQvEVeMb_B
z=KEi2E8CR{msKffs9#ycG}3dLz3ljAv4s20%7qW<rsC#}Do?1E!R}yd-=(!bEqPr1
zE!C<uTeS&AQyF%d?p?VDn^EjW?&qyKii_VytHrb0Cdbnu!-SugY#qMiuKJRrsO?{_
zzF+1GW|pvU*HCF=e~HmH!M0s*!k*{9RjYSG(vxj$snjlVvWkjr{{YIQT<tB6k72xD
z?WUgDx4D+IDFIgQt9e`+e6)16TE{&<7M4YFxoQ*IqOi@ELe2Y--saMICQ$2qzw7#K
zWnIRb?b`QgTK7R!7X~=QfA%L+F`=TpGf~qZS0UhWoY2PWsJia@`m+bFSj$sUwA?Tu
z#AEnks5c$X{{V2V!+A>1riU8U?`ryC#+YA8zkSK2je*MKq}94pwG;U^t;IXjmgmv4
zY<gF(Y(aewSb$Z&m2E;{515!?-nzSJ(%r({)!5O@E@=Wh$OgJmgRu2aQBtht*jyUM
zohQ|)N~7Awxs=AI28xa$?)xoi^&J=er>d>D`fP9_-u0uJ-mVn_k;>vf`63gQ$oz1O
zmtvtSR^dAQo3dU10Nz*jzP8)NX8yjDti$mjIIxej(~g#y>H8J^h4nU6rCEiQ+sRN6
zNlbKon5f-4g)-+<wx?c*{%s=G`mOC^j&{aHwHrf5mCdkiOKP%XwCF8x#gAx2A~M{Z
zYNZ&{+-Z%u(w8MwqoyV5rHg4*ZB`Pbpekmw$}-&MG$Iv5P$lg#SY_2Jo|ULAq(G5o
zrIuq!j5~)h+Nr*|sif%v&T&rg<1AvA$&6b{ERwS+X8wbFx!l>)K8dQ`ZfGUC!j<eC
z5-A&o@Y+jG#?GbHXGkMZ1^A2VawHAfvsm`hVmUJnLnYz<^eS!^G^kZ&x2tzuo}*O&
zZ9JywG(IN_m1<N?y$0=3QZ3b*cXZ!aZ&clLd+F_4Xe#QCAe8q`6}L|Gj@eG7?&kAx
zLX(KqRL-NgHK6lG%XHrEZf(fEyA&ufNMkuz)BgZ3S3O#(%GY@r6b;7n_4|dT#pMUp
z+UdQm7Oj7$s*?yWrRwSAeN_HrYmB9FsKxDy&X^QjvkOskn%^e-%TqevXtt_Zx7kpo
zPvK!;1cAlFRBdSW?TvENQlFqo1*UmgvLu-0j@_s?dpCC5bs9EazPfu`wN#|qOUp&?
zCABIKw*FmFkxHko+R|$ut6MZk0HnEu65^j_VpHP9V^`u=XPBhvwJk6@rFu+lH(qTu
ziv!QAR)sP&yf5<@9Ib6nyYa=gk4yA7SnR@!TGMpKC+k0#a>tOm>og+sf9b~%*|*y4
zB95Y3Oo7O*9nrV9-0!WfU3a$Db**rtTTYT%M>8E2+J7!>NhY6%u)C3O=Rs|i8pVHk
zrs|!OcIn;A{@>ke)U8U<TVy>~ngWx~y;NzJa_ZEsXD>8GX|`SNOsM)HDN_Kw?kE8x
zmY8w)!$fX38auA#VOrwwI=x`#K9k(8a5}WO4#Lxi-Ha{Q^){5P>}}fM%0pE`*=wld
ziS|aH%q^)xo1CfOVf?S}9_IF_rs>p<(yeMbG&SD69&`)L*5-JmZ1+v7waq#kog^Zl
z535WLV@=|2;@w|mywRqzqq$qvWz#6ss>0&eF@$P%r3KAY<xX7=i?++RR&82!E|gT8
zSxA)(Yfw>SiEqOAFW-xBnpmYrO-E;p=_u~PwZ}tO(l(auy4))Q=Za}+97)n$OzT=@
z!5gSGjh3jjRjAh&)T+l7a-y`P@Bte6ea>IEsy5UqTGw%<S)<aEKn0~Ff$a^RF@d<-
zQ?j_JZq~m`6VrKkcD9t#<!c6*>iTCY-N^i;nv`=-UgMkUTuIrQxai|7pF;)gra9=l
zl$Fwa%^_te7JcYXYB(%|njCROJ^j%sI+`9iNO1wBN)!_Kp|5h1P#sIy4`;qQiL;c*
z(I%3?KDY!(gT{UyF+U@TEdV&2j$8)_XER)=gyeW-5~&&tITA=*7qc}B2($sD;J9SG
zcupK7vQ-FSXqf<biQ@nPpr|J>W#JA}s2L3?Q@AJB2r<B55EPWB-vk4uc!WB0fChO%
z1wj}9kV=e|6+kFVIm~b>bDE5S;{=Bhkp@JGcM`Ax8ac=SP^bs?!3_pu6Or$NCP2r7
ziQ~c$3P30VB+0}>7)f#=DXS&n_Mjb31Wc&~pbbgI2WmkkM^ZOG*<b_&5`h!~Q@4x&
zGzOjk%ijRBgQXXQJgJf?#(mx}5e|@4{?dR0s3NXD#So!tYNxss0zd!L;}c~}Mk_7=
zs;qVXuHQ|!LGAXGDy!~=I%HsUrl%}%_HwxLrwk?I8Mm8-YsOKhRqizH!c%D~0U&c9
zWNx`{e67j-iC4gJofMZ;LhVXGkP4W<dDBe>XuL334pKlf$LobsOj&B-Ba~?B%JneG
z9-tvPW;-Vxm*lPe&03dsb*j1a>4ug9r4~u=jx=WO6ZVMf{{SSk)2q1GaX{T5`z0LC
zO5JX6RVdK9%cQAX(bf0PyV?3`;@;13svGM`cqVa_>z${6v3a$vPQAT$5c*((po7D{
zJmc2;*x$O6?&g!V+fyc$0f<JFC&b39Zr*oImaA)R*H59QdRj}F;?;@Ag59+w=|7-<
zHY!-G434tcs`U2xUcm{X52<WjF`&&S_G3##1Q;tsum#gu$Z?I|Qguz&Q#OIz*wb@E
z%Tc4u6aDDYQv^2&B8g*`CN)`dbW_hJRXSJ8kV#3Cq^a=ih$;i9C_Z7I6B7~!Aw>Q0
z2_U_wo?(fZ!Y6y+Xo~obq~-=$SsILT7Bmtvitk=k+?wlZWkrYq!NF5fognwd)!kiT
zY;CNofcid|)^P$XV{h8Jmu9PJN~3BAG@2zrBQTr@jvbBGt|>Kv(7Wf0j9JAA)UJ_U
zJ}}S`p}>HNFMOjXaIvqqUDdO=YE+{1B=*P=*%;-h2$YT^Jfp7Naq3{DFbe_l@i^kK
zd-+=TvvjT+KPk;<cqcih(-bhFsEexty|H=uVM|{3TA+54)?jcTV!f@7cdJwK9E$zV
ztk=N)s@Hq>67~BiyQo;!f5F%4WY+UB`o9LPPWYP}u7um@v9-4<#Cn7ac2v*+EgzZc
z4>6aiqwPkI=2M<`ifrW!(oE#8CrN4l0BSZn72P{-k!5zBS{+iGriW2+YCG`=#c8$a
zmMHUzi|#kZ3S8O8ramUp{{VB_t6seyQmF^hC8LqGT1$&u6PjmFENHiFpY>&?lib?Q
zUq+)`Mw9X3@F$!2B6~Y=cV2~ULDXrmf;A}*)U@5DbptXrM#1{aii4<fhRl=&br3|g
zBUhR#irUo;7jmamR<C+>?55YyCCm*CE^$(=<(XdIPsI@LTSd*)fV`<hi~DBisz{w$
zO+&Bo({K4=>0?fnYo>aHR3!3(4v{rJP?*NocD2z#mD&q^Cx@Du={%2XzV4F}ccm7k
zs;N@78O6b@*V0m(Uh2xtmX`YIid5N6rLNM_;?|u<?FB9{t}jil)ayDW)gM*F)~r=u
zP@=fI_R|^Lt&#N&ePwVI!%1*J8m{b=Xx%{7WZXp1ImI)GCZy})YKo*%rj#{|w7hl0
z9`LMPWeSqyO<!c=7*`u@^-8WZX)7LeF?8oKCtxQQYa2S%TvnE~1=<{_8WL_k5pdZy
zJ+kOEmp1jOgQSAR3>hIV!;DzD4laomT;axX-8TFVX=BoRlKQn;$}A(zP~$w4FTED0
zbmbbhwCLMZH0~&!Pq<x9ad2<LAi&f=dM=keHMYA^rApx)El!fp;cj9ZO2&<|T~W+y
z9A&1V3@%CY2^fXWMC9o3MELxBR`H5$<%v<Yx8A7{ZFNlC(`ZpqsvLL?QDE7rRikfq
z15SrDI8vwpb6ysFCb49&Wg_M>t6Jx{mo}Ro9WDnS2+7{)+gVx~j7nmrD=8r}lu>f7
zE?qM%l<M|UjnJ>`c9i#PmQt%-&2120U5p?*?u8Q=#hSNDZ4LlFiFHC|@b_xMOBIfU
zqWV<s_Zpnvsc~ZgoUu@`f1v(Q;?)2w;I#{MZK}4#$d$-Wk(Cryok}JC)oHg}QL?zJ
z{KuuYs8qS64nXh485L}4(;~CSs93>HjUzBiW&Q@n@2NKCUu~|qr%XEH&>EE5M9yz$
zJ<$%=UY6lyPU(nSQGs2X2<0IU-Vt8c8SPm*)3|%wpr@*Im()=AEtK6|Rk+;<s@4@{
zHj)I>mAZ0_O1pgu%od-d4!5Lrun?%Yq+B-|ZlzGxxew674zEm>BH&!kF_UcD5*p@G
zeR`Dvb4w|Z(q}N|1op+TldcT1RFqPbQ^4g_dV6jaoLg;G$vrM{0WiuVr?b+c)U~2U
zqICkUL*gU13kzqlrmeB2rDkSlfYTUv+iu-WOWFw`TG7Q;YZg1VGH$dp#x_%`f=uN$
zn<MHFG)<ukogfG*d!uttX{ywu_B6Js<_9UvgM?V=+ptz^G`5&|aSSq0V_{!v4z(^t
z3ZS&js7fc=ilsSAz`3}_iz=Fy?bPX63ShlUjvyj5YL;$?4X)VOQZ*?B3~QY1%B#M(
z%zYO!lbNNi*PSIo*=m=<<nu;TjVg>DkmM~*IG`iT;R2DGYd+GUUe&bf3YMxCMWqjE
zn#Nly+}%1(BBPwO9ZfI|(AkCi&W2Q>QTGOuiF*`~4q9+Yi5Rh2?X0t7LqU1csE$oZ
z$L_i<l!&O)T+Ule?p-~&u^w4bl?f_crlO(9^LexK$3(fiH<WGeTw3Rrx}8PxkkS!<
z3|6|mq2k`FnhjRfE!1S-Oz`$b>fYX^wUrBchnc8M@YEF}Z>358cGqf}f8}h4ti?Z0
zl$ym#H1U!{pD-Cr9WC;%C2=R&4FEJ^0YD@;cfkOGh$LVZV0b|a&<lH(JyGMHVbVxL
zFeGz?nuA*MiX`DI@cc4|OB@GjGcW)MCxoZ_ga9;xyu&0We_SF8cpyEWu!jRPElwj8
zOp+EwCW@rs3cv$WN<b=o@JI~+0;h*?&)W`J5~_Q6!f}v7nRoGk0iz;jap8{`1Zik4
z2uK2BjJ~LP5>Ox*8DR}V42G$qCkS9O$u~%XCC3;cbgEB|3*54TiiamEqJ<x_Kp}Zb
zOp*%bU;vp6BqpxM!2)LuMOmoLb6(hRm^xIEiXISUsFWz3`A;YSVC5<lI&vHl5(J4t
zH4<hsm)S^rfC^GmTs#s6Ie?*XL+*wFLYg$;G?GuY6e-}1ToM!jfB)3ttyYj)jmr+q
z;DMzIpd-b5TM5%qA&ljh1@D8(3xG0EhCyW=5+mAG5y(kGCyY9zfC5}nYc4q?2F#sJ
z3z5oM%22B=Vo0Fj#w>pe7N!$`E*x-E7M+u*YHYhb{k6N2wbHe&90<}J3{{*;s8A$A
zV;Kwu$O25v%&cUDRNSL1s-LiuZaA)ltVJN;;;JW%;m((v%n6apu*^d#qPXb*nFE3z
zFrIZ10R9t>6OD@^BV4+MWV;SCR)OOVrzxzogvwYf;h^(iPLUzPSwxhYTy&rXCp1Ml
zcYIM8{6Y(uj*(XJ3P_Y=R3$Q!8F5&iqB+F2OE7T6=NRvpC<s}M{{U<vH?nAJF^_Ye
z1Bp}_&koqm+jo7-Y^chXRozVwm1KLwU=|%3^1(zUBN^sI=h+#OP0_3xCswgehSzr8
zUAw)N?ikkG$*y~Q^Nn#W9#8{@kcgn1))o>RBmp1}$-*Sy72F0TbBk$2k|Nt~tZEV@
z4yP4Cfdyjo^1`E;yVa%|3m(>m_?qFX{ivg(P=)1G?Fqs!KP(ga%|KnH=8!;=BT*+$
z-fVNdR-cjNeh;kI!2YV&d-o0bl3305lbr2uG>9AkCQkfN{ehzAqi?9Oqtu~w>K5RQ
z(#<6Oty)Yo=8#S9-vB)#gA=H@Dn`SjHa+UwX}-I-wm+e`YdJ_N6PMIavpC0R&FNOU
zsKu`aZiM#wn{OuQ{{YHM(YAD-by}@P2L}#j=~I#rk!ZTw+R*1w6+Im&h~88L7M(>?
zm~%!aNZ7vKrDhiCAsW{MkqeJ_iFf^U=s45V%A^$`#45^hjm__>I+KKDIvTC6wMX95
zX=!4(F`(mw=GjH6;q@C!(z$7M7yetc0^mW(&Bx0XeUYBEYBgR-Eae2v963vtQZc=@
zeeddA<96c50$j>`sidF^x`M-&o#hT2WY<Mf3RFdFchad*)akH>S93zS#7U%PH4n2I
zoBghh8iJJo>`@H#5?5Quz_;whqNuEDox+tcE$KC_s8vl+Pci%25;WW0JvA8e;4G?X
zEPG081tcDDiqkB%@u!G(x-Pe&`id0UbLfL2_JFI6OkL0L#`=d*(lipQnMJ^AA&h-M
zo&Yx})9sAeKxHa4jb}6m3Ob1)*{V<cQE8*F+Ef(XR@AxDaATZ-vs8AUh8o=65+`xN
zd};O?8;-6djWpX)r2bnAoX~i3g@@W&?%hREYk$()s&ck|*Tu%uX0o;#s8FX*QYtkF
zPrW$KG}_RND2kvC9noU5Dppu7ol0LOD+_U>>T2!w{kvwjZ7Vus)+w&0CC=@NuT<_8
ztI%bmT1{GxG=Lb)qQ&9#z-m0T0MtM$zo-kfx<COkO5g*0(M`3q8?NtC`$=`JxAHY&
zTkup*zMtWUUaCnfD(Fj(DDHEfJABoBbT?Zql|8#}qiWRD)Hr5?7~MIiNh*ZMnWSRU
zVJ&F`m`urJV`4Pu<(dN|#E7!obL>SIX56atZl*0NZl()JP-b1nu*FKmJ8a%7?DWB>
zO5~dMiRHspN^|2EH~9gY8St5nHv6`=wI<wC1Xj0jdtBc%it?Yd5*66sn$9@*6SHwv
zQJ34p#bjh{`kh15sx=@%tGwgk8rzPaxfLH*X}Wd0Y8_9uAUcshX{|m;%G+;+N>uB)
z;ohhks<+7J32I+}{n*>nW8My*E+DB-YzZkOgy$$pwc}iho9()lj`ZnVP*F0%_ftL$
zezyA~7Ok&Ry`a8@HxrA^eDJ}UMb74cs2VF%6&s2Gq%L5_qUy%c;D$BLa8gKd3O@`d
zZj1X#!u4}e@I_sgyRAp)7M7I|jcjwALM|GzGL6$Ihn8p$wyM&OqED$v5WKDb0Cp9X
zwYaD1G0#zhNm>1!B_eCPkj9!UL2~Azgd&?ynZZ6Y6|6HQa(kU3vGy2u6`e)Z8AXR%
z8?Q*g?NA;XoFW(3bw?CyTOb<tp4|PBA#fu@i;<EO3K~gY@kY6>cz%*jfKY+riw-I3
zR_RCFLAX|<WXktd0t$5Xl*FTgAmVb3p3&4)FPW-(fIgrDF*7f<8m^?<Zmu@ko`VY0
z41H7zG=Zr;h|_fWbsk&}1mTCp5|t@|b+|@$+jzBsv+6r@b-nGBk!ZNpu5t<k1E(1!
z&2a>ITny%zk(_5;)1_Eby8u1xpz*;AoX~&Sj9PR(LgLDdMhS=1WjPEq>vL6Oujp{y
za^=}Cf;(4y*~s<V)Z^K(vvO^&$ba(^7lI_1;yb$`($oo6Nv05{h>OJj>BEAC_IFM_
zA?6sB$_PO&-+T~AK%kMt7$g$u$mjJ#PMHi9d7uFT0)P);Uv6RyrloillF9&6EW6VP
zGZQM9N>Bh6X~j@|I6?rKVxWmkDu^;&IOhnW;Xm!l01_dbm)?7f4q{38n=B9>=y2r)
znG%F#apS@pFhNhmn4%0|0k|qkeeh;MLZj^e0CW)3sYW6~N#lg!5Ef+QaRoad0je6@
zWCW%Ud?h9GDcywO*cE95?Ir{f48f%(`*uSCHQ-Dj83UT*3EVtF@q_|_@e$vIBme?R
zt`oz=z%&Iou50x|f`&)2Q4TtA?}Y##|JCA#P9zzhtG*wOsep9Jd2pQJyt=W^4$}{6
z0^q*o6XP8nNRDcv2_6^{AXIrgpmRuq3Z7N{@ED~R13wr~b9B_yQvzC(cm)e^mMapI
z@(WWC%@%5L&N2ZYhUz#BWQK78A!Xi9UFIbm1k=4^HLC8Hc=R$T>Tzlg!zG&WhPf2X
zB=I=HrtA`FG*jGR#iWT4N#)Nm+Zxhs2y5mBP7(lthzSlr8dEchAu{mJ3!0@>Oosv|
z3CBq(F)VR`Dr3+FRXm`y91?s<#~8<{GF*s6gicJQ*rFk*pgUBh%Z&YzRqU7{uD}Fd
z#1I++3`q*P06ZY+L94-XhP0u=U$<zA)@YL@PzN;EA_{PV&<Jn^2qcMs<Kh1Rgg}y%
z80Fe?#K2S_o+?nLaXAqIM4A+ye@L8MK3vp$H{u&iX>%rl3sF%12d^|#t?lbwR;6!2
zrPXS<HJ}1gJBLa1J+{MVPWG=o^O{SS1R+aj9N$us{H6HoC^~o>I4P_4=c17P=cVm;
zTlzbN<IO=tI8&r%0XkMw`ywuzbvkX^>Cw;)Ak8jjL|{jYV@r7L6xh~;FC$R>(Z9B4
zn^qi<2}d*2q~99Ww28Ntsdjo+RV((^f2ZA?#tAS#PC(O5RD1MuxA<dAx>2orR5hV>
z0H|{@W36!7d%Y$C0B8-W<1nRV;L{M?TMKK^t*A*PE(B#LuH;tdN2$AycBwa>+I=P_
z-?Dy+%)NC~)B!ooxF2L`YSi>A66MgRyaySlvM!#Xb*nTUMvu-I6^gCxyJ^<a{{Zz4
zl7M7?@WOc!jGLR?)RxZTsL@)h-RTE8tYE{_0#4v(5xR@&(WywY)Sz7FdQAsg3X!g#
z{usoiS*@mN)Fl|-Of?c@%$fE^&u**fkmnhroOolY0`Z7;mOL>rwzq#ytSU0CH#B)%
z@R<Jq#A%wgFP3+<&!Wb*=9Aa~E^vEVNhdkQM*hu}Tq;syL!1<#2NT<bYKKu@?R5hk
zZ%M!tjWDG+#G!G;^{Cyac8xqy)GA+XtZM$QlZq+_2OKyxOhWzAl|r`EUs(i+ZiXn&
zQ-F?sI?H;w9-XDQq}P3x077_1mvG#7%c<Q}sOFI-DNRCtP>sN67E1h{N`xW7YP;l@
ziEQeAuD7<gsX)4&YL4?-(>j_;kLT1M2mGdgG6urS>NW<w=`Fasd93ysX$1pXN5jf6
zinm8?j@<VOo0i~?x3zbtYfOn6qGjLuV8!r}P1;%moYGw7#R6iyVz*Z6-sX#C7(zcT
zE47<@eO#pG00ZfahiVPf)FxohjA2_>(pp?W4l@dxWf^rU1!!+GPJR(|GI*I<b;y(r
zO${g;gHACOK-bcHUZuP+`XWEnHLg++0w+1kvLmf>+Jq7BGWW!!(pmNq+Uj$fw_Be`
zZZ)aY9=_9b@@PDTXatk#KkUDo%@#X8qp>JOTGqqO$aOIfVOY$&r)PL7(XJZ$PiC5g
z%qhc9Xo_`>jS5#4_S%k>HxW(8HYg#(iH!Ky3=PTKxgRpdIliUS{^YP~AL=p%IhfTS
zL_jxsJB>uOs4A{5lRzW7G)JSYsvlLgPo%0_#arW?ZY=hDUA~<&Nb@#sLNf_JWDYZ?
zE%hMbVJ|Cfn<iP+4AoY<X}1KR5UxYicrmNKrACu<(&TnH%%ZBqwQiomOP{65t4K1;
zPwI?2s;#X=lQKABF@llMu8m^eW)td{YFyM4f-FaCOVqB?sOLDNPD(+CH_244C7}3-
z$lK|;_W+y#n#mSs8Y3^L-q3X>z0RnQr&zk1+Uyhgf9^#xwT(Mlx~7pp*GV%&7Mj(K
zm9w1Lu$MSA;uPxo<4Ho<*L7WSUrjn|t;O<}XWoS;M3T(6jJBysx+*2!$w>yy?r@n5
z1BljMa9+sAV^!Ko=6bXMgeoY@vM0GA>}^orO|{Rc#+i`7G%<aSbYp@V36MAelBO}~
zy%yfiChccAKWCMGvOL$V-QUdHq!M1=Gxv?tff$|{57}ZtH9)3agWmyE<R>8PQ6&fj
zQfG!8k<~dO0HFXVDVh3UubW6AWiHi}E>km!OumT-BuZliD*y~~w32?w$KCY9(Q+~{
z)d3JoBZ)|QVj8%W{__A4fB^)}L5I7s7E(m|VHE^`2<<7912WJWd@=EW1u#(HCOlS9
zh$ENYa7jV{V!5Y;CrFYV{m{Tkj*_4ehrSxB2ybdTJYmiOWHg+xCV0Rvkq$k$B;XSk
zktZ*D{SeXt1fuZq2MBw$3CuWlAY4xf024DNnFA$3o-hpvq97s7o|RGrH%V?%%`-@n
zV5kHxD=<gZ5y+iMJWS;b0jm<f5Gt8YaG}%Enz>VnNKgU)*5b)B;NarijEbK4D~hU-
zfR`Q-x9Sao=co54+FUWln}X4K_?l*&;x!EbIzuHXGuW}d5>kqi$>NB#)1%OH1Ehj-
zOn4qK9Z;wm>}2nzBx6L3nigHT$pVaFwVa9_fh7R#;IS!x4T3R8Gr%MOpcM*}i9u13
zU+nBL3B^Rl1B_$`6#=A_l5qt)VvAYGY?u({t_g#2mlcJp$_T-ESHNchhC~S&;4$|?
zegp?%NY#-7c*eDn5fi3ZC=$hI#XBLT*d7N3!e{rxQ<$_&WHUlZhqAlr`i-Xl07{)|
zew9!r9?ea8W0N9S*MAH{a1A=XDUh;Rk7PvY$b>SUd?D_b6PODz+s8RWS__;4na*Vf
zIiAR_<Ek3X2#!r)%0Og%A&n%0O7>?CB0<DRgag7F9f+D{aGGN{i^Fb!KTuJTMPH%;
zSEE`r6_67v6I$BVyr|j~7;4ljrGOPvnU0^*y#-FwMfU17*TuBV)50Tm<s!*OHE4_F
z=ScpX{{U9pY5r*d(PM@RaY*V_Tm`sk$w*10V%sqrrL92F5F$jy&MZMJRB`N%KPp?T
zU90;NUqR5b6r}{Rs%E<SZ?+5ZNcQlHD^Zjis|Jl-=oH0j?WtbWC&mf(NchEqyMVNO
zkmyfrqYtIO^s($seI}l2{pWd7b;&!u2R>q}-X1>~Px9)G+bMpzhK+YE%_HgRE2#WH
zpyDG-PTaoX^^~0rCB5VqDEuHs#^_h8PLAPaSktcRVXo2_QJ0yg%qI^u-s)Rv=$#r>
zG8<d;I_>U&)a(mSm|$8#NyT?@s2p5F^)`{^ClWp$Fzd~)dEH*tUe#?7)k>6{Rb`7_
zBPO21TFR4{`ljKIn_hh1hfZ*oI2TDlB`wu&G>vc!Cx)hpWSnCzt4mr#W@*Ub!E=me
zHF-;B(pp5+CKAdvY)h&_vs<O*8sMi8p;94}B<Vyx+Thg_<t<Q-Od@J_R&|x{N4e8-
zj3*ehsQOn0u9+&N$Q}`rYEq`+2pXCbPi`>v8htP`S#P&7zn0dZ0SZn;oT8D{JI%c&
zRc#{D_UaAMh>f=Exw4JBiEv#cIgL;!oT97Kdvmvo^SWG4(u0d=w5e;K_J}_V1X~qV
z*s&~xcGgmD_nRHFdj65J+ETh}sZ@JiP_8FYB}_WoudFNA6uow?4hNM~WyUOfj@4zj
z+kS@Wy6HOyAPyW0d*Z170G8WZoE+MZ(iD|Jh5{Pd_UgNcN=>X1@hq0zM{fNOc`1mV
zTw`Y4{*&pox`-lU7@^&CrQX#<J8!G%hJ|G+vJ%5gS8Hz9cEqvd({$VE6_Fk1wi<>H
zY_%fXX^N@VYi~^&ovEc#QUK!`OWvszDZibJYY#D80pk?SxB9N_SJPwv04V@3AfhTo
zh5a9GU6k2zQT|nONlLFMg|?5*#;McVVAJbryM^m4>}kHUi;z7NP%=<B2-h9{=9f@h
zexN#=wY8p^ku6{iSh($*Cf&4aZHp44z$UA_$v!{s#doZ>QFTZ9m9tXjiWiZKninGz
zj>hPf6Tx?lIjnGW&YGh8v~KOWpx;kRPyps1rZcPFfE*kHo=RwFj#1M)>wsu;@son)
zi5E&&z_qIIRH1O~9Ader7fRHY$ht~7JWa)24rnMek(~Qt0ZhHcx<Cue1{i%&j@rC|
z98ti9ayb*TD9NB)DywAE+OsPVsR^e-8^*8zS<4`!xDa9rxAgYwF<!gC)ZsF6lcn~s
zjJu|n01*P|_L$45v|C;P+lDr+CNzcz337&${{V&sS`@abw-`2cEw_d;;<dnSI%Ra=
z91aUO$=qE*)Ei@Ni&gDnwItb-;}|dL>pD+xq+H5QaMewU00_UnHddAvv?#Mk(i-PG
zGIZc3V0A5Wjt(u+WQ}$CXt*o^0~*#f5EN)1MB^J!(i#~fHx(*dz^|4aS-{k*rvl<y
zv&_UsX*6-|AF@ZX;oJFqYf$M0Uiao_jpf|=oQt#j(tgE)9z4e&XZtJy6?`T}3I70!
z2?QZ6T}4wIF-QoQf`)3bprCeAMD}M0=0K7!j4fXgDwuxcgcZjDJXQe-%>aN$09}M4
z1_+2yx)h{`mml8%6<>0k;ebFQWa3jHI3=oQ<{WY<^}!(GNk>n|*#X1=f?x$g5S+W<
z0RU1#1_=#wS{7xgf&y|sz7w1rpkNdLW|K&1CSWAbb`SvZfpQ69JdX@k0d_!y%Yt_Z
zZUDFtyh4u*Axr=!1HXT0?S=tCi0m#vd$_`x2p1Iz?3@K6Mnq4x4xpUh4hM=r0gx(^
zL{(fN5?3VvEH+q~E*?GN99=X{3I`GW@C-A;Toc9=E^Sy1OOL%FKnMTV-}R45+w~^S
z<Qi`%v_6CL4gq6;QnI<JdTbd<-oCQO(iu2$U)!2JQ$}|5^?3^dwBQsW`XaMmruJ(q
z%KN1nK;2h5oa~ld)AGi9JjZI>qSa`;z2>Re-zBG0lJ0#oZ_`^#Zf_sz-&_zIOkfmt
zFP14)i7v-dfKn~}d%4uT-ra6qw5>u0s6gY{{4q~-+*1s|IUG<&N)BHlGMn>D7vg8R
zP5V;8U__ufT`mSPB%!Cv#2KG%5snjD<s^-rrqj4zE9g-rf@YxyND&)SOLKT)Mvb~6
z2Bwm^E+^Rm<p~5y1~U4hN|fD3qo}zxF+l*P5-Aw&z@~VN1{voX>*Z9`g(-lZjd9X&
zGCt18LTWI=eh?Hiqh*&6O8cP$mOM{{K`;SCRFLEQX9F-WId}2yhbCi&a39)xhiKyv
zn9v^V8sVA4wG|^LZQAblYsTAIFua!#T#^Ae!}h(l<94pvR9mD1uXqw1XC04W*sOND
zI$x(?HIMTj@fD2nxSXk6u`X7nw|iJ*Wpu|!?fVU#Hx`PwP8`p4(5GxzToJleJF|x}
zRD|I@p#fN~Z#s=CItqYAOku7(uGPmL5j6k`$|HD`v0OOEI{yIhr}$;=Hd^?j-i$k*
zXWF=sQQ$QFMmKi=y3Ob{I!MwpSkc@x13-j4!9<S~A~(TbZsx9)Y7!A;7=17LT6L+X
zoBUMD*5OcFTRx#G;^F}^<!KE;6Lj@qTWobVd=7S;X3|TW-5P$Os_IoJQhST26$c$a
z<M7W*jB2($;-!tnN4>6fH#o8E&q$D|NrP6s8R`2`bv&ZB@%&gF5K|8=ss|}W@65(U
zR)*0f{!r2r>-R>whUsfrtSx&>9!p6QoR%~ldDd2&MT@N{Rcr35qZ>_*1;IYl`CIs*
zD7j#kIu|S+I7Mdr2R}f%c9%-2)YQxwMs2UA3u$T&Ad&)Q;T4Xa-XFWzP`i5tI_(-9
z1i;gQQ=8T-?H?4qMA8i9B?rbUnrclv%M_DUSgkvVt{B$UW0ZEda5MU&7V7^1(GO9M
zMmARzK`nNW_P7mvaOIbTRJw}k^xIM3NzFzPIsX9SlqlzpMb@$KG&_yeUEb=B-)!Q2
zN9l9dP-{u%G0-;aDtmpc3KSXX)27l004Ypx%*8*Yw(Tu`?Nyqp+&KDR;?^XQKGa?<
z9CWmXBec#_i)PZnX^AOI3X`^&bA?1ge5B$Rk7NYJ*c8&BVK~g9_IhPeTaRD|6B2qt
zwWV)Yk=jWJqqZWkYqX)txcC^1qAF_E0TTgRse7YachmM&OI=ccs2zw5B~P=KEH<=s
z6mdNAY}AEJBPrIji_lF;Bmz|cMG4;$N#=ufFwm^g+3DDk`f5U*DHDW}Xj8r+cyM!r
zRMa2@7$7`i(EDbz>89rf&Ck^|Ps9vh*SDkUHk$&}abEM;5u~*wrkgbmqzFfHI-yiU
zAOteCgPzSNHjqIQOe1SjfNNScb0I5;9AX=i8m(;0X6G3vwhGe`sbZ<A$9HYIuEn~%
zv8VLa9jcqqfDCT-&B5=|(n>I^+A%-kJ+W-vw(D)m)z+634Mw;fE&!`Zn_L~7#8S4~
zC(ymCWm%+FrBEyIR3tS=Xp0=w<Bn*u-BW8$g~rsrwK`=~a{;L`o>8>7dztBIVIje1
zG$S9TFn0SY*6T^gO1Q8L5a$~Puu}m{flx3w9zS|1)P>4Z0xI>M>P>Sd32~lN#w-?0
zcj;NF({pud6Pd;<*EK^ZJ<@Af!le%k{gE!o)O&U9sADb|>Xi8i3nC)Orq^+vYO4cc
zTdL`e<s@o(W;#%5Z?-Y6?mZeyPZZNR4LQU&8;hTRQr^mwtJW>0sjTvbl6>zxw8Tes
zNYZ_3b)G6#4RCgq(9!@_Gwo>2(LOz|nSR`5lblok02T#&i>lq3$vjjWOS>AH>969S
za>Kb!GbH<ubO7@+sY%1U_d?Wsy_xsnIQE^#kR(zR10@9|+lj^iM3~1C5I{n@fRHE(
zV*^nO?{bI)IE5tf>`);EX;zXB3TNyPyvm2TaP~q>%eW~Dl3ZpI7zmSs<L`iQR;ZdG
z%=p1+D$pcB9LwMF7!z5JGk^oyaL9I~4MeI85`27N9Y7E~aQfkD_sKi+gHRetAzW0M
z&+1MPijYVuLQX^Jh0JCp`%VY}0N|vRl4Sq@slX(Vp8ViQhJ??VC&2ws;v@-F%Z?))
zU=marl#r5%!vPSJo@F?!DS<3_@rKG^Hxee?q+!i-flxTiPd)+w6M~^~_9280BM{^y
zW;@^q5)%o}iS|MTR02_CAQ*El%|{i-2v8)3Q;}cWC{O|a(CLDmRyD!k6$m3ix%$mz
zW`LC%(pU}#GP-K|rscO=-&S^~J)y*K?~Kcf&AW1aN;Ims^%SJw2q`QKbKEQDO0F+1
z<9c|}>a#q#eEROaxqZkYzo`EJP#lV%sxX{RqmfV|`ST}Z{++sw!HhnQ)SsYBSOpJ?
z#RQUEc#a9iJ4)i6n6=npoLpwp>TNn|(%$!Wr$*!xMr3x87tW>K=xqB-Y&Q0_m8!Q$
zBOshiQo2Hu>u*%_gUS@SxM7?wqN`q}@o?J8O)?t6@P?VGjFE=^MH}+LZl`Tal$0#)
zS+_%4S4t76;-x_Zl4tm$B9rbyaKs$p<<d+-_h7%iBPK&gB7CT16O43al@XK*s;)wc
zvzhRMfD)>4?8*iuaRlQBNK}#nCWGoR1Otg>GN1>5WvE18pCW{D3Nf71>4M@M;xP98
zaH#|8HL0i<1sN2GUHD&dI4a63t4rF;F2{5IKIMnV$CY{liiw1D?WW?B+ELor;UG^Q
z_~gz~*@PJ+UHUTj*Q4mUx~kIh&S}}kIVx{<>gsw+{CMn9ipd=<g=sXlUdi^#1JoxD
z*kze@N^NO5M6YOQ7?QE6y2Z1I<F;)*2V1JoC}n2`l0D-neZ1U6qAH~Ni--kC?BN(M
zKiyC8%iL<VywUGQCBZH<NtZa&^~Uy@N<)Hv%NiS?7-*KwTnX{PBXQ~hZl@Gf?u<T^
z{{Yjc+Gysl-Xf*dR`l*z@kA@>AIt;IrNI<<#V(C&sx?FXC)FT&Ra{;q1_TvBsKt)@
z_MzOWxusa?IkUlXOvF{UN=20(<4vTzkQ@$;V3h&QIfU~@gzDOQw?Zy1E3mcglqz1(
za1~i@_KhI5;qK-?5Ma{dFN!GLQ?pgFxphjFMP~x8cnacI98s~Ze(?Ksb-1TWtvTo_
zq=gEg@P|Kp-8Tw)9^+JL0l=A$13aTdElMp<Diuoh4h_{bZu%PNdX*?@3f^Y}Em(Xb
zF&!?GmXIhzm>Rg^@9d62)3vHv?(EbH1Dru5Nz;z6X!g3B$#FVENC2XYCmH!Fmn+p8
zxbJno)NO?^S?y?PkU)DzabD^Tr`L5WDh+XcL(fp3D?T?ceW!UR78@$X&kk&ZK`7bc
zgSBHKr4^><2M5-!ONj#j!OK4R#&y)b5^-fx#4?4wCejT7jMM3dDQIZYhaM4{F6mp_
zjZFdUam5Bnl18f7Rs!ywrjqvQey5!}b+-pQwany=ZPv%VZv?XT{9Cot45ZaXSvk%i
z<r0k>^Gb!%k4#L<o~&gGg|vBId0dlG5mM@_w(8LE+D1=lGb|XlQH)D&fu%@?*1W4w
zDwec1WMHIa8P@$9vs8JfuAod>^5CRn7o`NezG206Nf9%v?x<H*2KsVVHQc}|O}=H$
zEKTboZT|o+k*!#XwT@^({?<{i{{XS=7~sk^{Ual&IY?hJuBD7I(6xnUHM^-}33R;~
z$nho}I#iqaPc*P5y6cV7ql;Z{rjc1lk?o9g)|Wv&8ZOod1<pQX9?ASVV>Z*GEa}z;
z>a|MD7&xe(;cRXl)4OA-mRi~BRB5SEp%zo-gwoXsyK6<@R-M-u1eU&&TF?;zQ;uXr
zlseyUuHeef(xmD$AaIS*>1~SLo~2qWd)lrcpbrcojiqaB4LdtVvBMo85{I=Sxk?K{
zKf8D(^#YdRX?o96)~jAnOqm6jhA|bs$^Ay|w&_RnA55dDiE3*}!|&?mhW`LgX0vwM
z29;(>wHgQ{kN{EfV;k*)n_Eh?%_pNvh-mmrd3pR?9S}`vlOZZaHy4Q`ZP;zwZ~<^A
z8j>VA8jt=c#JP9@G?GDGBa3-RF|g1m(x%X$SEw8VrKIBFKIp=z=K78-a4jko%Yv!G
z9;Hp<Y4^twXIzwO(*Y4@BNQF7!KJ`8r(H)44Kh5&E`3W^MU0#k<*jHX1iNw6V@0*o
zcD7e`q5$Ou0vr)I{6EDS659%OW@$<8t!YB@vpe(_niRgdW~M5Ve*PZ|k0XBHFTog&
zd*0f$h#|71$C3)U+}yL8M#9R0eFFN9omUq)t_^3DHhy3JXvMv#!v6qHkO7pH8Yed#
zf5HTB^!r=+7S@n%Z^JLvIlSA-Hm3ELm3Zc1A;r=v0V{|&BpE6WD10EG9jI_vQq?4x
znVh5293i~3M`@pS9ne53fT=hy_Q3*Yg!2RFf=q<7nlA_d1x8a6m`Fh}C|i&`e})R7
zQ81=Zh-h?*CSm{=Fq+LfaDam>LP{I~!vmSbj@SqSkUPLRzybkWCaZ|b5di{ZCB_R#
zkkzJRRK1X<L7#u@kO0sDP^xfIKWrwka#4}&5E@XZ=1fw3oseV}NH86+07+;Bk<2@1
z3?Vs*lCG``4wQ*s++koGbdtCc_2B>wCjdJUxvB_AgaEyeWEN91X~d`k5)@O1+AEh}
zh67NdU{ZM|Fp1?DCLoo_Ld9@QbIZOKl_U=glz6}~x~8ZwUl>s2Zlu&veykD{0Du3{
z*z~^W&A+05S5eZZNS$AVEOBq^Ta_)wrs-{P`gb*6YProMHfn&kx;Ix{N?X!Ywkia+
ztB_R^9HL|rQ_U4OWYR>!u4$GWBHlfvADOv)Qhv&o+_?O^UMW<)SL@1q?%3_Gs4c;H
zX$`I<6k(5&C=VfYu&7iZW->m=cqB}rUhz0%oaZ@k1T-vw4Ff<_my}{+ZV|Sch6&?M
z2syb?y$kQkhFVtKuISe~YZ_qE1dig7f!FpX%Y6zH)nYFa)Bga)6>Yxu=We^FduH;=
zv|ONxFBYE2xb+VEXVlvr8>)^J_bXBh*@1RW_eBtlqk1<pjId>uQncZdSfZRt0&vIx
zn5c-=8ZyL{PCbz;+o9TCp$a$<0SCq#_DKd=3<&UzuTwbHE5J<-M2K++fU1$~NMzI!
zIi`s<QII{5_hRK3`LG69WMV5O0~DcLGnNo(IV!eZQV@Rl4x42*fO8rBF(th>Rj$wg
zD2j7PuIuu?=38YqauHd%DjRdBNGK#q0o|N!I%fUbe&tQgc7{|sNhoG}<5=KBLQgZ<
z3X<z;6e^(Li5!g&T;W+Ly;TnTU|pa;P%U^?nFyMrXT~diBe*jE0BfdeR66MdEg!dJ
zSj|dqU@gy#Uv71d(}m2jvTeN>XcTjnAgD>GNhv&{R+KW&IEC(+lIHH?!ZP3ZC-`OV
z3riUHqZaMO%%pe_b!1a0*j&7RjLv#WCJ2$OtXNr0su19#8UE<l*FW^(CQE*(!|9Hk
zZx(6he-(HXH_}_fs}k%E1!FqC-M4OUb(Qrp8KO>NHRFSu-P;h<LwM&m0CD=GD&PMA
zdr`oymP~z7Pcl-D*9Ox`w9&7R2+A^2RZSv?yL1-iY~xbH{<_)ArPPtH0As3|!}vVW
zT6np!d#OE!k*6U|;$y(aWusUI(+(&cRZ|I`F%I>&ZM&FjHwq1`3hAmO@rRZ8#g;u<
zacVc?xn;R$>?ytJv2}LaPop_qp5o@!q~vLs0j>##Ofy*OTRqz<&|NF3jUfzzo<HJ_
zdB1ecEvm46rE^k?l!m%QPXh0XEFB?pTBU_SRB3gy+Q;$qTsc$lIDeWq-Icr6)!S~m
zc9hz;cFeY2i&a=X3+<&(PNguUgUHahoJ4KuxzBT&LsG9JbeV>qFkGU%+T%prLbPcu
zI1pnh+WdeiS4wVhks?}t64!BzPuObPRb5Ov2Aw_AbEMj1L3O%2iZp?wMqNu`P%9kf
zw+Dq&oP(nmZ>lag9mb_5fYu5ua|y-ZJx^lBQ|mg>$z7(NyjW(TIZ?t-SWOl-_joUH
zjp_;n7PIo0fB>q$*%Pwv$@;xl3Ux4S#ynz5ev9cir~u$*7BDY%`+BsU`j5>5YSX08
zfr-YQMPCDNPjRuM{K|pgk_k}025}W`rL?ByqULZ{VzE}K(~hhb-s-a^NeijDNvIAH
z7yZ9h+4`F2)~G@!F+%Jlk#$uw7kOi;)V8%%xR6ZnPWX`Z6P(7<pa5||t@>6fRW`e=
zVEQ!x0}{uPsX3NWJsJ<SXsT({MbhUQZVYKZc@iSpT9ZjUFgNsT)q1?UT|z<wik|q{
zThI()t8_>pg5Kb9_r|8u<kO~D>=*;JB;rS1)x9hfR5w<V;qfHUFDz8ZwEK;PTb9z(
zRgQDCIcgRKYZ{QYX1MKE{;Bk6wYrtwXSgAE;S+^QtElO9K59-0slWo#N`7GNbYju9
zb8afqZAY3Ct|<ncW7H%N98tP8tRyO`IQKgBGVU)OQngy6mV*KGgF*!$OvWo~Ej3gl
z9EAJgJNoXRY3-Cb!w7Xp%&UmNyR!(xy?>`{^3?Sa#YvJZqR3UWwPkXmfurgQ->SK7
zvkIXHFpe^4@l3g*eWHn7b%69~1*XMKL^KzF2-6rgHlM7j`-!V@0L@7XiyJ}ehta0m
zdJB-eyvRIx0n1Elirnzg7%odge%r;SvjZ17098??4jM~i8(yO4JxLRU0cbSKiD^QJ
zr81lXIZ*e4G2tGIfyVh(_dkksZ}6NRW$&3fETe~rlmHYe=lh{AXp))A4AiMYay^jj
zxrBm7=43=^$mI%TK@-LhRY|Xmu<VH>NhMBQorsXs%mNS{N4gLJ9W*#hcH;<=8j3Sg
zC;KE71O&w8KYS8hGUQgGrU?uN5*2Y=CY_L^LY#6i?o^PF#%R7BkN}uOK!lSiN)Q0x
z2r|S70n(e0yN9|Q8g(vcqmQxykO26R*$iV<%B1rjZ?M7>B%wa+R}tX{5UYg5ppYa8
z!vzz<0RRwGEXpB)FNS0U08hRU6D>n7%q5H>IE0d()WQG=L=u(?6UAYvq!F1x!6d(2
zDLiT+GC5_!9ORm2O8X&z(JTt^Su7OPjUEsbfC#{%D3S5;fJ<7fmY|q$&H<81QBRxS
z3K^haha3`q$WQ_Q(&BD&iBe05LLKAV1S-g;DZ(0MF)(vZFj__@A|=E|(jmw=`Ii+E
z_QVcTfXFBk$|oeC04d-H5=wW(F8~xon3=?Jlvj5lp{*swF+2#)CIsm^N`r_Q<q)vX
zgpycbB4&bxz)ci@lJ>-H<PR|yoMt$Zaqn0q&cb0Or1ryJlaQR0OU4&51x%(D6e#`C
zZs(AMOw)kFwh@v@_jbaObn=v-wYrL8N*)m!`CoYG-U1q&bhFN6i5qK6e?g$SZsP*R
z-$5^Mn8=mz5tj0=#YP?7k<aAvH$9NaW_mA7^zD&X?)*<B_g7LkA^|mDbP7rnB5{QC
zC{)y*&!omvq@SF#bf)NRcP7!OKPt8$@$rtOPNg@w7@@Kt$q?|4Ax*`kRRU~6vyl<i
zwoTQyyLBg0ZmNt3@&;YK(BmpUq|eW@x+-+f23V8=gN6}~6sf?HQ1<q~ruRLHK?4re
zNW!zZsdv0O#zBn=Q^qs%Z!30ZH>#(!=2*7nB)Vezkv(=vs1T^19x<14;M;!;u{<HW
z+doNYUeKnQ1~5NfE7&xWjxCtM4`g8UtEyI|4u+moT%{STo(6=CS5In^RWc*{B1S82
zD-uvOsVEd%>RcWRiv2L)8C2E^b+;t%i1Tz^{_|HC?RO!dJkYm795@_iOg^I@QyCGg
zrx}hh4KiCsn~KPQdjiffs+Q>iteMv{F^KStQdE-fW}>as%vw}krx%d8<boa2&opkG
zZ(44+q+D5v0-2T_@t@Q*oHq8>rjK<uNf7Cfz@GT8bhUfC?xm$`X3tfYl<HQzsU>9!
zZATL3$u-n$b~tfz<6)-aj$P<0NlHpBNhF?by$h?_=K8c*%;C*XRzu>_yc-#`u0XbZ
zm!vDT?gHJFJf+#4QY;^9Ok-}8%@$I-Z#ZETs(v3V;M>=852)7yDaoz>0L;vO7*DA}
zdUR&SjXPbMlWpk`UL0R==HpPAl1JHR!S^C}RHF8WT~k&O$ih$t{icsLuZlH{xD<n{
zr0Qw}<BuW}85Xr4So(TFPEz8S9t1>3Olw~gC6-jG>8oj2y^5L+on$6il!0$&X8Pf^
zgY{Aak-^AH%Urv8pt6%0RipwOFmN-H46s-%xVfou=xQ|CG8p5`;5^@ZR2Wyhc;iQ*
zV5LHh7Kbr`jxr(5!Ab-z62mx`U3D!C0SX6V7wvNY0J9pu>P4g$lJLf(sXLAhF`6#j
zML{hLa+WkM1kAX_l}`nQ3S3`*c<PrGDZPQz{a3u{X=sv<F>{U*hE!{7FRIp>MR^F+
zNH}WLWf*_2sZzTsNCr5of}qr)UECuookmlq$^)2CYDFZT!vNZ%SQvG7dm6+zQ*AUT
z*EP-pUSI$zID0Lq(?{Dg(|!)n(WEqE%6=%xt#MCDYYfcEA5w9Rb<NnM(M-B-n`S{n
zFC1bm2b-yjb6JfwskF7Nc@9vDiW~IvMh(s27PKq4H`Jp407jyXMI1T(;f`!zU+yK1
zWkT@a3s@RL;*0*wVV%LWZ;3{}l~O7-#RvyaF#Ep@3M`sm+H9#hi<?!=5@84wp`It(
z5$?3jU9zcYS2(eh+6M<}Uf9Kb>s{&F(gE(OP!X9VoU&|W@4JmXk7~2HuGP2wMK)bL
zP0XcGIC_j97MHX-G;v~`>uPvbNfdJA)v37VlV-1MZRM>}!%vyTWI$;`RDLdBEN+be
z>WwwL;i*hwqC|JVsZNW4btP~hE0|SVh;t12v}u3;05m?Enp-swToGJO9x>?b@NGhE
zwT_3Jy~>=esyx;`N9ma&ZUhsNhX9}e2~jkfp@$$RmjNQ~9uR;DsH$Q|9AzC7olpRi
za56<nXA2b6nMh)ItN_s<Nbi#62$Ud)A>sl+1F|E%2Lla~sVkaz;R!}0_Y?I)m#{eT
zU)$dR0EzG`k70ldw5;3`a)-MB+=dH^_(N2Qx_y~_aKH%Wf^s;a<D4K#Ly4G45S@t#
z<_PZ2Tp@6oUNX$%jG+LG#Fkl)Y~ea%QIYt5xJG1zn32p0$jF#VOu$`9o_)drgA@S^
zir{{j0I><j6miN6j}ao1;7RU<rcF625}Z{yLjkBU6XA&`vIzofDPkoA0#K@+DK60n
zg#<K`lPLg(sSAweXTVAu%%WYhpVKIlx5`c>a)x7o3xeS+j4&8T3Z_|!;vq_CF_Nk7
z;SbUTvdHl;6fCh%Wd8uN0gKa~oh{?1KKx-r(pqPNrg-BD06+iJ;(A)(T&jcuy}7|+
zW)2v(7!aZCN%2T~W&wBZ!Zx7@f*OM*C}lK=95YFF<AhA+oN||J;v@z@C^DA;72X6Y
zVr5W-&Lw7<1qftE1&FDXCnJs?*pZGpNN>d=dm=WnsMwrFCzKpO1!N8$-e^M**Gv*a
zr=CUt1rf)?7(fz6nHiu!B5t+s5acCD2T~}=9Fq>%*xK5)qtqcnD^(MOD@!J^p0!2R
ze4`2Eq{e+z1-ei*ys|4XJiZ=i!+(*3ggjZ?^#1@(r7zx(;fo6?xUtg{$qD}e3^l<N
zV7NotIb^ClCyX?q1!Qta#CMXwhM)ipM|gosW=LGP4iM)mVT7TrAyA1Be0#iM=uuxf
zGIAiMYrzgNV(87u*4NInF3`%pRlsq_j8@4Y5TSb#U`j_Z53#Q6YpQkG@D(QkJW>~F
zWyQU0?D|gEMSUW!Q>tgaFgB&9)eZqB!oJ9o!seaDy(*1Us4V!5;w`b_w^O{bewz-i
z_{Hit@u%Q&?8cX7zeYyesoeJ3RV1h&0wSqV#+9Q33L{FcYZl(6plVsvEyN7snOdt~
zX5A%pgqA(A&-A9LKBco?y0_jtZhxs%ml@$X;DR!3*jsg3s-Vot6TUEZqU~Ue=BkqI
zjhzPuSE;AXq@6e!@b*Q#>efGmvyQ$m10L3x6r5ZE8jO(S?h%=A$2qjsp$dQkIZj=X
zgKMbLsz52N#F58Wx7iwgv+A0AJp;GeJUeu%(kdnv?Qr<p{{R&{t^O#Y-HVs5t5()6
z`xjPQe)KIx+?&B-YdXKH8AhA7UB5}L9G5pko6=Nqq|dbG`D%TbzI>?ZsylZ>-*0yS
zHtT<7Mp7n4melprq#C3BTAJ#)-y1zIvm6%I*0$ksLeAfs;^w0*=>vg%R|Zq&{7#Qg
zb-nGcN_5?>*=iD%DonJrV37iwG_;;FZl_14g@|$ND{#?b`m@eT^!Yc`pI<8IjRi)F
ziZ5kvElP7w?(-~DYu{GdZfaUwDJ&~Z+**zDreAt4t$1|`Z&2M#X+Rgu(CE?cdu<)w
z)vc`5cm=epEZXH09sd9nb74f@>`~TkS*+4DRXUNrhKBqH&Y6>eK3utDWof&T=F|kC
z#RAP2>xu>4h0U$S#j$Z{rl7RVQgZvhdL3(Zdv8!R`ipYJkdlorj~1&JnNQ{<oh2U=
zud?o)T4Juga1ekHNKJcUHr=<>1<zLr(*zPi#RAi*T|pRJSyLAW{pmz^-B(yNJ+4=2
zK;eiW#XpywXpwT{TcgKAq13%$C9Z#@jbB#~3rySmQ32kYR5g!r4`C^!;(I(Jn7gR%
z>Oa+DjbJt4#PoWCgHD^4x%xzsO5=%(KP@$&N0n*eQ>EJ9(_?=&u?`_v%$iS_)H%Nh
z$f>+jq7l_yRSzr1cufRzN}W#DwXL@Z6zg%ta0kLLKkkdA=XiOh*|~JOXHS<jSZj?Z
zjR?L?rJFjXUEn#W(p*Xs<0Ke~eG#m_zF{7=zGfW9!z98e)H;UZveKg0J<cG3Hzb06
z5tdzZUYQPY4kT#JNyHz!5&0Bra|M1ad2uo~J>9-Z9(``oe5ZlMBU0_&=IW?ryGah@
zMHrLIr9LF(fm?6YRkV@MK4;_L7dW?cw!q6eY}~He4Z#P|W06=s(tpH@8@C0OSzT^P
z*CcGZb7-$<#@4+)rFVem(=%-)2lD>&;|u0*rrw%osJg#bL;wanl}|U^i%qLey+zBV
z36=@)n5TTxE;}8#B^&zIhZ*8&Rd%1@AMM0A^_BQxs}~`t*_*9=rE8_S)NQm|w(f!N
zvS~^e44dgU>^ZB?NFOi9PntRvt;My~5p``*wP`r=RHi}1=fXH!NGuF2nvxxy$7zkm
z&D6KM6>(CF&1y9zacDqoI*jE75ld=ImHz<kw~{>#u0u^J;<RU<owU=YeGbkQKvF`;
zFCH*RnLvjQF^9Eoc00YDJNp`%={)Ik6l<pWxzy@1ZvOzs8HRWXlYztChuw~nX{6Jw
zObuet6H13Ca)K5BP`Xxl1wgZh83ZJ88A%TqASfjL7_0&UX#q+QsoO5dNOcDVP(dI`
zKpn8+)`HN8GYP;?2nA#ehiSlezyQE%Cj)_mCI&!>rag-V0w0BQ#%Hn+9NL6{;rp==
zzz8)0dwg(&btf~BG-oUl4yLJ;g!}%OKn0e59@K;aP*5!K_T>PBgF*~*k%SrvA-IWv
zMDZLJ2{_AuJPe=#gn*)FKV&AN0^^jusSZaKlJ-MPu}pX@F@-(36-+xI5(`kwrZa@N
znH~}l;xGv`P%u%P05r-+Ffd#Ph=$1Lkx!~d97sVmGepDpz~v~T9uoG$5WrkP3V0F6
z+YSV|RPu~Jbc6sGNL-h<!U_cDNn&_H7&PR(!yF+&o`592WJ#<j0RPnDj;Q*DrQ|rV
zp`t)Zp#q{#5`ef%0OgtC0G5z(5=kLgDhV<Gk$^ezlx@`rZBKfVqlB=CiFXwxVpjqX
z;-N=$MCpAIRnefdGz2mkfUEC`8<+uRnBkAp5wQ|xLq0u`W9hz?+`5LVJr+5($~9MX
zgs}q{Ak<=;3O4ACCWqBJTHHzPjfEQ#Pf2-Fai8ywu~VV;Yb~wxDUCIl(&s)(T#?SI
zG5({N;U+k=FMrn<xjdP-NZ_b<eJ6!tYEqY);&?|+@*+{K0auB_7^IyNDxz}6BzsWd
zF^^<K1B;7W=E`U4jEL<`Qx1fn0NKbQOce*d8sMT>VuK`!19?;Zt|A!WsG%vBwjx>f
zd<<ukfboS%#8qi(k_bQw73_gflAz=JVFH>Wk--5~iL(=abWU1FCJqK*p!S?0?Ho$w
z<Jl1c6D2|060c<VD*GZYBTPOjokOKA-Ct>ANt-!+oMR`W&#Tlor13uFitj?zf9KLJ
zK+<%B+9P|?WYFqsw$w(Ks%02GMSrD#@u`mFHr~HYV^!i?_kiC*<*9%%%w15^vU>b3
zAbU)7t>UXa8?XwU1w!^mBHVX<vf9?AI@e6~HIgZ@Dm;)*AS_|@n{slKif_2CQjBbQ
zq@zZ&?l)TvPf)lCkP0W&4gF{7cPHFm*KM_xAa%Xu090Yc*h#F4WgfPDhM~4wHSa=|
zE-(b}ro$EO%XZv1dy{jx*Y&DTPE2Uv#y6F*+n=X9tvS7`Yfq_(`jjnh*tfTrLeabR
z-B4>=>`O|OM?3zp!73J?_-w+T<MYKa?k+BS-dy(;d2u9?Ov!Qccf(0D#|S_n@0_I@
ztnuwwj@wa7tJU|jnSYaZ``vO)_AHTU`EzL7uDt2WPe-@)Y^sw#%GFU1_#?D@=%n}U
zmbIECMykBzH3HG^G08qsush}^%uo7$yTF!IA06-JU7Iep3haIwQByLY95RK-_#5ti
z%t`TGF2}aM>J<%+yKlBVzTvI}nhk5nTv)9C0Cpg&I!2Hx81tld8B4x3-!88Weww7(
zpc6v1dQX(cH>)rHtZgd>wyT2bYk<?1B0%ttRHL?Xk7?I~tW>(H`%1?5XbBf+sx;LH
z5)mYyW-ysx#|I-@s5#AHRgnq~XPP(njSdf>M=Qd(e%xbUPfJHK2NvoI5dvv$IN=jg
zeMh#s8Y|7Qm8qj?^E87jK~J;IE?BNzSk|P}VNhmt%N}X=eVFJ~nM~8DLsyCbvc)33
zaYxiHXaImv%?b%%G37|U)yAl2yJb4)Q_i4qI&wn*>jW;TnJD4?@wl(Iw^>ojWYi0h
zI7S<-i#fERGvHMMs8br$(^WCb%#+oaY-``j3Fa~!Pkd$D*q78oe4|s9N}PM5N;a<6
z2^dW<65|_ND|(s=v~v*I1WilEAv?LzmfGXh)uTUV+nU;JC<BQkkPw1{FBsg_qSiH~
zpa!&Zuf$hBY(B=9r%k$)mV?0oMTqOw^=cX;29P@yBNU$I+h+L1MI@C0VPopuJ;4h>
zXi{Xn5XG*WOIjEPqQxbkVIpT5-M)iqfcdlnbhR-6CaBp_rsC(+P<4>17EQ=Sb71o`
z8(H*dk4P6i?$xXzGxC53naY|Q_M*A-n%&!9PW1NQRHenm&BevW1s>XLZlB<7O%c$#
zUu$j6-}MTk@@<?80TOjUpiAu2iWmO?kPkkePLg?@R{6?t^4m3IZwl2Dy7j>v-fh~o
zZDg^1=M)E56*Bfr!XXM`2t$rt(1}@MCI@j<Ft|G~g+KrzB;%lSjQy{5+;&TWe_>bB
zs8BhlnUz{pX{gWTxw81V{={`&o9hj~s9GqUHk*Jz>qodF^ocs%&P{*$es=!=nmF4=
zkg^~ENC<}*Qe%UgfSX-4<0CI|jTSjFw}x@){e^ceMr)($dKfjps-a&ok)MJ<(sbko
z1QMw{y^+QC{;}#i4K%1;I^vKsqjHs+W`GAgdW`-Twl3Q*%#F76dOO84VOZf^r_`Wa
zPIC1r<MC^au;kjF3DNeMWnyv1GUQN_g)xyuRLDRx0er=?XvBWY0-ah{R9kUvQI+X9
zD{9hkdkG7Jo64~A+;ts5kQ(JvhtU_6_lyA=g->YqKtgyFMV|u>GQ<E!bi5!Uku`T_
z2?#=DoEp=^R4f9>12d5;m%0-7c9FshM=}Yo)c^o2vjY@NGJ=?d#!^G$C_!@?cu8Tw
zbk10X!26&9hD46k)QQ7{Fs?-t+Y=Q@;zY4RMB$2}VFFLS02+iLbi#!$9r(Zia=8^r
z!b8FyMi`l5m_jN!0P(^a85$@ANcKT$q>1gz2yy^M8d7+w_(Jnc87G7QFsK*+MtP$4
zcfu|pCMBY3PaY8Tkr)G>Tp_uP+PJ53m;e(@LISJa6&J!3E@joIs$!WUVG000|JCAh
z1m?3$1Qz0l85QiF0K#xBBse0-NerZ60iH5cIGN=eH4z(#R5Qp=F8M<od`5m4V-gal
z1p_4j{?mpKm@h6p(N)bsYFN5Arz-Uh;2Poo082oAW3X9+9qVngwz0Hr3u<AEV?YVS
zM>Y9K#`+h|nwKFY%*-N3Qfp3!c!<S0%XKtHt8q#x>dijt=RE1il$26>pJsitMeVzJ
zNFJhL-xsU8dWbSXBgQ7HYTDKnD3*tMjX<$YH&tqF;&NXe>@FvTabKcwI?O>bs+e#w
zihSR^u5n@BHj3M2!<lf>#yQJGxD_Dq{4t?AiIxFz3V~v!n_Uk_b;3$FAnK)xBpz9b
zk&hS;z?^%ai6L3`LtY0BAv42{FjEp$q<joVl+Oho$W-w_gy2L)UcDBpUr+>@ZX_6E
zZ*9Swi@0cPTd}o*XD)Z_tDb4aQ?$>*QcOG{;LHrLNY=Y`qR_3T=Wx`5G3)Mk6;v@P
zvx9LW1!i*6gP*E#ZKp|ZdL&+YbEr0rimhs!rDDpC<z*^uFL{>%{`6qn{KT(EaI1Z^
z+O?(182heb)slu&EojH>J}E2l0!x(4l1V>QG?G*jTu8*lzm>G(wrhIUxg35+souG9
z&NPKpFwEU<-B;9G<B8m_>QruYm_-_C!$UnbnA1q0A$Y<Vf~u%xI7c#7SGq2QAqee+
z6jwy3VwC46SxL*igdlJtL1cI&A|_hd_veooL8OE5ge6>k!VxG)$dF||F(Lv5x>F3Y
z7^k-gL4=mbCM))mX*g3Mz>;%8grNW>fbB9^0K0ssbDP)uQ6L0m&u97-40peo)x{2x
zyu2fplUnC-O+ZWA9(?&<aLazX+1*g1YCf$HNuH{dYEx+OE{{xZdp)01^!3)wxG$r3
zOw*|5d6+>3l^OS`y^VWfHzJ%GdY0OgjH4y?S;GEUUV58#{jq3PTpNYVdV!5Y;K;dd
z6dC9>pkk^(!&e_?EOXt8*88J6iuYmM8Uxu=PpT9(>_5w_&$&<yUvtUl)g2zaL0aEH
z2;z0HlZqx1j!z{ec27q`v&XBm2iRyUHTbFkKyINqa_|v~>P7G2ns#t8vT+VCXbRv1
zi*WNF_FRzzDN_Wv6U@LQ6odr(u!_^Cfpr<IsC_pf#Hex{3xwlP8m_INlF&jliIJA1
ziv`UdT~BG#A;NignAMw58o*o3;9T+$5S(JzHHitW4K-?doabm(fV1KfK&%^kPh5vN
zp$Qzakc`hox}E_*(4>;M{oY;?u(zPvjL!khJxBqZ{ur^!iIGrE<ZB&h_B9ZlNDyFz
zwB_9ymiEkXV@O0Z0&)n4>5SLW1Qf(hSeZ-cgGHm&E@@S1MHqrp>WZBHg6O1-gfy+S
zj9V_l;~3V~<<$e~OcDt#0F_mY`mEB{Q<})TJR(XOeMY+`gUqXsV2G(v3Fc){L7)Sq
zlO};APweK6t%bd6^%+K(BDH5FBa=oBsGo8*v?{ineM-qqwAyMt&IpZfHmJMwZr@wb
z{-t`&73&6yG>HCH5&X9PH#eT7J67GvsQ9vuMa6CoU2sK?(%O}slP#$YEnrFFWM+Hg
zhJX11asL4ByB(WP@~vyvBetzj&;J0W9r5H6MnPxU*&b*4)qPI$uI^}we^0fyww`V+
z_1vHRc<e=ONi7aEn|qto=a_(2V-jQnzz39sNg*)jkN~wnO20`YX;37GfFAUK?0~?G
zB1K|S6q!4S`ZCHBNeDqi%oFX#0*UGhvrh;i$2ssnvcMs6Y3>9S{p%8kDku#pGzw%p
z@rE@>`a9vkOeEGskOna&q60F*`u_lHtSjyH-)&Rk3?yn#w8E^P;ft=*^Bw-6qrKY)
zw4eRWss|6pN$A%n+Gd}@6clMVWJ;nLKqbJS@~J{1=wG=Eo9<*~ZWCT)cWt+I+qZgt
zuGMi);Xp2HPdhR5vD#kG;%sEKbcvi56(q62Sy9xg3bY*RH5eCNGFsu5pytmgvu%BS
zz1aZE*Nx7IDW<VX!M}F6U-RmXaOK6iO>KWco_uD$2UDu*)Kk0wW^sa~5`=b7`y)xS
z^<K@req9>Br@KG?nz;?Ae*=}xsGoK>j{&a@a3G}kVTh11r8O*5uW_BltQw%x%!FXL
z%f1Q(S0OnPA!D8<L%2dsQ+H`-?0`aZa3o8FFrG!mQ^?9BL=uv(5O9VNso5ZWOfiiK
zjuKSk*#t}~6CkI?1^~GbCof_Bj3No1C=i}00Ec3@gNP%FK+hteE?E-@rkO}iYC94W
zou$ZC5Iau@1^`v6<s$$_04XpG*Hp=p{ZfS|OoWo4%6)`EaY;KerNGMt5CGH(hcaN9
z!hnO0OWVVQC;<P~;*uc_B<j>p695y0`w(`+>(PC^M^LH=bs|{caZwByCyM79)-<3=
zQPGTWAk5Ah3Ms^A#vJD2oMv+DhAfZlB^6xM5!Jp?A-29lEplI~I+ZZ3El$|wpD2&y
z-yry~$M(iLr71U3gTQ{>@rGX$UOz#q9}IhpN78eVW&xCJZKyTCY6+(f+#&}xbbu6N
zkM4<0{e_(xm9H-7+tQ-a`o;Ac-7Rn03#<L;Eyo<QNOH=Ci?24CdzRhE615d`NE417
zcXL96v{WEDvU3^~sKsL4*B>-0?yr8=(=L4*BCV}{Mw+#kA5FjL()-`cd~-jET%|G3
z-xHOD<mBBdXt(NfvtinTQgy0t?3)U^y+VS8Fw9iAsqE##F&}YSwCg>^WPqTMIYc0z
z96RAE6(Fb*5oB$~IU_CJUn1|vakbGpA-GSD0OJn$R|F{|#2h`4IkN6SI7^5HQz((0
z<|+Gvp!R(b2_%Iim~s%}Fh^mC{@72@2@LX*idJ3%0S!VHML|M%9uNs0$bPLV(>C@l
zJZ6xBmnN{#B&Acu761lnvIERW`XH5A%sT`7p(}uDk^vH2yPyOvDxn4-`@CQncr+;B
zc)*@vIG^r>WLHj<<B2d^rOFJug7?566(VzQ7;=yfla4_^>?L42Wlkd`A5<cVh?J35
zN5U8xJ9WidJ<87Vk<(U<dR189X032^dlzV|r3mGC4;kY>t~_mS<~Fk&ic}s5ovYg(
zzuRBxZQFIz0_&knuX{u%)3T+J%f^K_=I2em;?|!30BcQdtO;|2LTjL)y^eR?O>K(K
z#k)6fwO|B#jQ|h|t4h!4j^%T0M{%@$y>@G-Ri%$<NwkD;pJaKq<8ItKFR5#6_krGk
z3u|2wwJph7RRq2koi%>VNWsaon^>yU>F{qRzErbW7J9+7-|8tXLehiG(u8{=DkLv6
zrixRD#~fuD<?Xnn2mtB<t&&J!f;GYBgUKYx7rAka5jQ%&0xRdE9OcrbazyUS7}mYb
zZzGDPlf#I2d~NRO8qL%|7!n<V1X$D7r={#u=T@m8DF-3p7DR){jm<2jQRD%sf%+0^
z5_9j2`$|5Y7B;Iu9+*!RD6y}<swt98%|@DSXjLW30Kk3N*i(1`%mPeSG)RtKQ4^AR
zjuazvSRU}YTP!eCz?hcOnRCLRG6aMQz7Y|u)mG^Qog|qIxXV1FHpGWH#E_kZRAq>e
zis)GN1@HkIgT^T&0COxl(@83_(g_)XgoI&VR(tCfhY;6GQfdSVpvohmb9GwchR?E(
zQ*pae*SHcXu$k6+;pw-_4{LZNSCXrAMWbo4{=K|9(XCCc$?bL9o8_rkX>~hx{n})D
zwZFj~UX3AdNvr_?0zx$XE7R879@|FFg1E3N>O0K$nZBdX+IzX9d|M@&3^R_08qUbq
zR&qI4zGSd?EQp-uwWakO8B7A_xX1>8u0P>e@xM^+j^8)E%XWO<eREd6rNNGI^xB6e
zGnPF=^B;6OH$`suZFprz(Khr7q1RaYVLybvJeZO|B#4qJ+Zy^Ao?auI<sdAO3^)jC
z;z`0?zQ|qD2{D4CE?*KboS+~l8Owm5gdj>D8Oj2xyN~xl0btYza5#hEKiI^KCSgzm
zG@by&bC5Gxj{gAciCXPV4Kou($#Y)RhJ+BLfE1G{W;hrmAV*Db!6%7?(h2~lPi(PC
zLDGpy0<{o>J}9vvfPzR>RGiQ7_`#74l<*)(4&RH-05mk|(ynah<y67YTnW@mLuRrV
zCQ0J}14~h%&0Nj|_<JLDw)Iy30JR3vw7heuHBya#&0=uo=Wp=3G{A<aS&j;rK{x_2
zDDDVU<EbSQjOQA-wuIuIYNhte=3~HXH!ZoY1wS^{p=yrGbvb|gkzlgxZkoK2yZtkX
z^4&1gnfu{s&+*425|*4-6v}x)<!ERqU5c!+i{Z<-t5l`lS1vubr5WAr?k(>bs?N14
z?L}HIYsZd~=yS_}R6&<v&NwI{nQcl=^xoesMiSQorb-4~k-f0%UZLCcgLV44O_!x~
z1J%8?Wn2FMxfjEi2ZW<e-3n#JYZ%I>UJcX&1P6>Y6%|2LBw?IXHqV%wRnvt%!rrBY
zM`KY<YF~)BX{gcsF>R$<g~J(M-lJ<$a0QiVxIKg%!=|yKGmMpD(<ziE4~Y*#y`TU%
z<G@5_M*@4~Nm!9k6EdX<k__aA8stPWYK1GpLL@|PpgBjp%?X5(%}l_8LYd7+nSXo*
zPywDecu4yo5ZPj*R&b&Ehm~c5WV|6jGym7;oq4v6yQnMaXbh@7MgcOpxgxED^oM3g
zX-Mk-05$fvnshfhO+17#=qIGdC_J<Fcg5BxSSQm0%hB^THcGrFsu<&_nI<J6-PjW-
zVJdJF#tWGhqp19#8inXGLIU<6j<-p!a0A6vJNu)L{Goe&d*(`L<UPapMW^P6QM+a7
z_U-nyG+yqd)SISGZM%P*@N#pxXphHJ?8X+Waidanie)>meqvm1mlbxMhT+t9x@yYT
zG|rb?5K!oF=4}tM-}NJ%kxU#VfLW9K7$HDt!fP+v?1)Iy<uX!1IGzGgr6jT_l$&*^
z51T0&=Do7Y37mjg%OzgOA*f0@3J^^2{fr>iqB>aVAg(}80w6q~s<lW##T8(X=Qy<q
zR6&Wt0yU!yB&YX7iFs)S`Gs$H-NFG3%}ya<OEEAIE;ECqA{1wNB~N6GCn%DrD4Zn#
z0ZNs}*#P2DD3l*4<2iwn!WwlPQ@p?cT=T>r_QFg6IN{?99w?s3KmwXf)B(n800SVC
zNp?er04K*ZFaae31zbwZLL?T25&}cU7zpW5M9L{lCj>O9Ddh@yfN_B`CS(GDC*cVd
zMtL4700e?^Bo8kSY#ksVZ*zySQWBEV0+BZ%C@{hckm@9XIW*(j-2e?z1d&D#%me+=
z>)wvIrMUEt&vi?gwJn$zfoRg^_z@mN02zRd$vgi5ba%gz-A3BmJ=acLJUZgV=DoNj
zNugCW0mIXKUH&*JkAW!VPTHN8+mr<N9ucVO{*c?cvu!q)jnQ6`zS8M|YEh|^n7Fs?
z)AGJHmbC+nuvpj$g@{Tisyv3I;Uy@#G<oWc4Z~r)sj_b@OPUDgRBw{C9IDcB#DAHe
z9A~=Sm07BmEefGYny2u`Q0qREx9WYr(;t(4L0@Nd1laGgf8{^Q_WuAE!yKb&yL~$K
zEH@?^H@2>ID)b(z*9mDFky=33-?bc`MTc%xSvGd~+?9G2%2D```m8Q%gU*r{ws;{?
z;}!b$&o^}GG@zu0q6r1^{{Rz`E>-9`>gsJ*AZkeo$5*h%j<u8MRDF7ynnQ_hrRH)Y
zNgrmOuZkd_xV5BJw?R^!dQBcpM6ujck-z3h%%<w5ljcOr$N-cMQ~vBhxllD9O0lS1
zD!HPYNuQ?FCCVi?9!)WwQkfK4rLEOb&H(~>8i7g6Pc&NHQX?AF!~H|TV&V!7G9Uno
zpY2BO#jdAn9OPPnAXFp(ow2F2vUSB_Oag$bj%Ip8M}IVKH+$+kRkWIF*LpIIDH2>r
z8d}jkPIe85$*QEYWb~uZ)NZ#Xhox^$k*|AM-krU>0n}XeYeEl-I-kq#ggXtPyT1*q
zy4;4E^+dw}2--CGF@yZk2g}cwcNQ)^L)0w~-Y!8cr0M>dkDsP8^?uvqsIrS#14aV~
z+hWBMaygv7Rci=jW44e000<l*wO9a54t?3iVF)D{H&11~H{4YrwCT0adyWCE1;^bQ
z(So%&AJ1N=HLk6CR^7=?-2SHdGpLD5pYSxf#yH9%6rO$2xaytI?|b&;y1NUi?{gPA
zYFg4eB^rVdb_7%qY7OboNasnQl8S(2ASe@p=AbA@%iEMVjV&M^Qj6G&f$fGwVG>mZ
za!%gZA&LnLEYeVn$dMk?;RsDI37Iee=Crw}0+kB5=6QXP_8m1i5J(Cpkku(LsWa^`
zSOFvuhGuIhVjk27OQxT{5d?ssJC(&j#3Ij8hO{E0X~dbLL!8&Oqy<hOnWAK85?KIs
zxSUBC6eeL3961)Ap{3Gs0Yr*~SYwHEi$dT?n&mW<AlwYd3RfJ+%nlF?c~=f%NCcxv
zkSBt`>|FlPm>=H=s<<q{C9XMv!|H@gz$yk}e_R6+4q)t&b3O?#@k7Za#z2E0m`l1)
z7=Y>|DOn~MLu5vh;0~2n84{T!03u{c^Kk9)f{jNhDi%OXEZ`ClrW3_h5klp`q|B-I
z$^ZdJ5v!CckWiw=R^7R7J4I1-w%oqm!y(Ma%@Oq7%I1FzV}#VN#ZZ%vcNhgUtB7l`
z08K$SONVSY$;H+#f@-RbSG?*SKi4*!rB>N)jcuLr0gQ6~O+%1tk273P`fo97b4wRc
zMRseCjBqz2!;li`Oq9>Fwl-T|Q1073QCiaB?rA*qD?QGn@tAW?A9eAv-!n^##eHUb
zoMiEXq1lQa#l-yy2+dQ50Im^*vu>8#-rYxWwgIiEODfV(O*)MT>TXQ_Kf>tDnkHd=
z$s3kcSk%bVPLDy_MjEC2N)$B?DWC1x3IQMg(6e<0*GG5TZY^s}y`?6IW6t;16>P35
z+0|~B(k^J!Q<NT49+-1=t53Ay9J}UcOjHHC)s;O>VE+K9PqT&*Pa5M=*z$QBF2i(t
zQOh)t;z`k^Q9`LnCx6<A$Q(vEh;zV_dwz%xB?9EAzfZn3&sv!BXF~Z!P>s&&h%<9q
znavPJlja8UH@>I2sYtE0nf)EO#A&2?-aj2u_>-ZxF|wW4bzl9xql*Hf)gZEqe~O`O
zRZgW+;^#VUc~vBpbj(nfo>K1j7fI6GS6chVi3CZRf;*sqKqxp*C~^|jAxOaHB!yuR
zF_FkD@DX892qd*g=FCjv+X_I0nvC-U#K1V5vw~FPBZEpRX8-{~Z-3Nr5baaYrf3Nm
z6f;;DOt(qJFbR;FK$-<qIFpLP0B{~Dl0YG$Pz*w15*8@E$GAcxHB^`o2uKkKU)v}J
zHA#DnIGGS}l6wz?0_7omMi|Zj2M~%<P^PNcAe=1IBbWBVCyqzm3<HaR7-27_5{w2g
z;g2Ga0G^R4!%|3Rk#-193&IcvQb`kl$`+9|mwf%H4M`!C;Dt$~im04042UQZyhaNI
zA#guwPi`l%VY-wm>fjTPd=N@$R0=`uNC0>yK#o;11cFxsoLPD+cdfDYF5kH80-KyD
z*JvOu7s<-j@QTdTU=RYWm_!jZ6Q-pnNx_Z^d||www3Kjby!LIQ!$?01fGsGJI7a4~
z1)?e48MuwtrndW~{J~69c;M!875@MXTJ35pG`kguyoW5>G7M0<`=Xt?fxm3b{{Z)k
zW~E9X04}LF{UgG&f6{;cm&eO}ghGg}<Rdt^%TTp;cBZEc*VEW=?W*~;*dE}O6tb%v
z3tddN%*TU)Yqet=3xd$iDKn_5D&hXmnk-#g(RUSZ4*S#8nr`Y4FKpB_R+<XTE=CH>
zYn|$9{%t?`jpuD1i?^uSf7EYWYEvM--qQ>av{iW3p#By|He73}dt$1Vx*c`Qu_&p@
z(z&IYWfHB83fGkD7D<3Lnq)o}upGHYs28@>*hNRZ#+?cdauFk+`J5cs=?A3tCI;!X
zy?JRRuW53w93u+zVcD(A-Tf1DyVMcdUhIUC9N3%U-t5D)7dF*?)2CrXw`IQHPMB>o
zQNo)Nh+%WxTCvpDv<**txp`R@{)p&HqqzS7-91l3P1_J{b`K%mgH!qSQCs?htCgSV
z{{ZGYmZ7+O)9LP<ym8q&ZNqe7(N?AHWGn-f?Ez}NwF^d{gHPU$uhG46u=TBruJ)SR
zYtm2E*wy8&MaS(o;$iqIv+~{en&vk999^$;+HXW=<ntvcxIt}CHvS^aGZ0`5Oc>U4
zfb1ig{O0OE=shP^(vj15y1VqOIkuVRs^R=1I#oh}ct@8$a`hnIx}@3LygJ)!9+2Q_
zxeMLlgh>r`Ilut`L{2<1fus=QIimZ2vL2rZknPWj@gfEU6D9iA0SyQoN})r|`@}GU
zLLk9Onna`m(8ZBVWKZm{(M>$UX@KpV0vZIu8BFl{VZbAZYl~Zi^ULdkjv-Kx0+bR1
zwilZ&Dq(;FR{`Zk6EPwIG&qo_!99@OC=_!ikJMq@LCR)**h4@i^A?~<j0(&Q5J4#C
z7a6(S#O6uKA(Jwh0|PVk!5Ga70)KQc9MaVcQiTGNKpABOHcCKIGFC<#s5^)m4h!K6
zkOHP-A;Z2H1yQ<$r8F1?ILZM!Tnj|80!34b%lxpI2iivuEYKB_Y7>Nor3*rkfk^}^
z;;8_~0H9(B1UcA%g@4|FDgp5pgC{2u#~v^yRjNu%34&2O)&W3BP~v;YGX_wAb4UQs
zQb=({$Q6d5Gss92gcg>P_qc%~iZOS>6D9j!3@{-mNGCqn0Flllf>vidVKfG~ggWMl
zJTs8yh}8MB%f<?h^WKGL1|u9uB_`v=577;JhPR0m!b^@2UM53I0F4NRb1FuN4suOV
zKnvUB3$1&E%YVz=&DQ&+U9zO8-_u%3x=nf^WS?rEsjv8BrKm1KgnT2}94gB4rro$(
zU(-=i+N&yr&q-G_{wDIrb!*vCtgT4uWk$91+$IZL1DZm8*y(WYIa_gg!PCLyW8_AS
zOYeKfC{;p>f&_&DMw0C($|njn;q?6mbN|u2G@_`NY+-J|T-nsMcAYf=3R40jEl`8m
z7`I1JYes0XfjVS#sr~1k`iEtH$+$k<%IYsGN%<jCk)~@)jQo5fy>-sPLvq+(p@7{q
zYHP>}pcvt0j$_Ll5zXU|Z_yiLb95=HGS1plPe~FwtEt*LgJdg~mw}k3Nl$K=!~?WS
zKxYzED(nd@8R6d%%wkMRDv4K87iGFlERx!XhP6mStB~UjlS&gXWKU)=i-f=`MsQj{
zQ<6z03gV}{YKaw%C8|auXuMByLzY@f<0;Do!VwNHM@~fI5p!R>u+@`7;8Y~ID#8G$
zToe$P=Y$0X5<~<u0v0HWxRvjOlKNl}xRuBN4}gFG)D9{Xpb%Yv1SBAIpa@cMz#YBh
ze{3aChYzg5c0tlnkOLXz2$E2Y3KSiXfNEfan2Cca!I}gJAV?FLICsG*EQTI106_q$
zn&T`W%Sg@Lrdam~X)REqLlJ~25<m)Yk@{hPK_N)QzyPj2qby|$0wj2*FiI&B@a=#E
z<Sr->d?3OF4w<fg!vb7nAc8djxrHZ$03j@jjxi$)(3xQ5Ob#g#fL!1SlH<Wl7A6Fw
zu?PqwJYk`u(Ek7>cT-<!sbNvdM>_x@&&upSg#Q2(b-E)>MD`f-XUW~*T5kr?#55Ax
zjSe`6r8=qJ=2J20Rx}VC)8!#Mk`h=`p(*J4nP86?P9O#PAOf?F8}n!8>+Q{cm+32l
zE?P}J$8(~8my2I-Z1MiKRV$C={2!UkO(F={{K@&nOJl2BVd+;>x!stw?ynuqM%m&<
zn*_yshWt(bT^DjcTlLe9e^cjIUe&jv%2!s_=9^K~1qQkGB!JWv4MwVugG_Tx8d}-4
zyfK(KanJ2TfmWa?*eClWGFywS=gmI6x_k9+)tVfID6*9#2t2wN*90pXI{jDFS1fyO
zH?6|6YN)K*msN{g4`#hT!=^N$x}*U!z%PN6DV!#`Hhs`kH!?=mK#ZmVzz{h1d{LLQ
z@4K$SxX|79Ylm9iQ6vhD!R?VW>aeS7QgU-K(XSe1sicg6GhY~5kqS++2!VUxsH%F`
z<<HDryRNqKfwsH7vq$T!RG!aXfpp(N=lJPA^6GpKsdgqjOSEq{`v&E0yX`gq08f2v
z?Nr^1Nch=M4>dQAN3-n5ta>}FtUAwRx7@7JwcYu5wyLh7Sb3Q5yYbR@#>gp9?Ir7O
zqNUSaIeog+c9+wpTR<oT)ORZYUeg{ttx~Oel`Gadt5tidRC3Q-in|!=zdbscm%F9E
zO<U9srD6HkfJ_>$AC$)@-PHd8@JApnSqb}K$R0^}ienibAixcC)i7kqOt`rr5`Y7l
z@stMv*tm`d#t?^cxa1V%5&{t}$??ZHAyP#`Obol=Na2EDPEf!_ppdAf1@SOLLK0LU
z`VM+wV~oOJGJsIXU=E;rPZ$t~1Hhh9j3$Y<Rv_Xga-0?i5V_z^B~M_2qXGz)I4&6k
zlt^Q0HNn%xLI6IPFIkD9ky9nlC@xkICW1lI11$R?Iqq{Ct;SL^Qc8D-0AUz7p$NPZ
zX_|YX#5q_GD04{zh@TaP>Hxa}F;t#C@`N113a%0YX*sDq@WdbhXe0ulY6gAKTpZw2
z%1Kt?fX!fs06`#1YEVxI1xQd;#z%y~LktF%5RNP%rcEXbsp%zzNKY+8_(r1umWSvP
zG7k2h!Nv=VfW)Ct3UHPw!U2s9Dw0%EQ6-PI8ovlAsWM85!_fJND5S3(KB$ev5Q{!P
zx)2Fak-|hb4$NV|@%DE^B&4}~IOPUK4le!BfO$gjgb+9c!WyKgmZU^8&I$&|DI@~n
zB7~efA--T*GawYkMgS2COG*@);VkivztNY!uIZ~X;tg9u`)48l0ET%<KlyHCaqyoB
zoWS?R@8z!G8FzWKJj&&Q;@la2Wk2Qy_Pvh(0PV)E9O-v!%1WLnr;!@N@VcD~n@%AU
zjG;r2CWVq?nx`J<QPHdV{7h6o|I)gVi5?;`>z;73!ZI`OGn`{r2V8&$zB$#v>C-b8
zb8^rvOp#VNXH(dl4c4L38p?|1wNT=5j^B6#o1{%-2k(wg^AWmkEyREz&Mmax%%@dB
zNiUZs^H`#hsLpblrvnZyEkHpEIe|cZ^N7$Y83je*2~hUsHI1h@c`AZr4=9|V0IEvQ
z6c83bkfRW?oB|}L%DBJ-oEqo5OxmYOB@`(|fV{eKm?@D+B}%ASNFS>R9%lfBa>poQ
z2_yhQ5+<ma@q{#@6`Y0wdBT>EHAwJ3OeqpU1f;vjgaRa~AOXj|17;5aG#SB|LWjTW
zfK;Ara0B)bz%F{ZKmiK(LuOBQaDf9kgEJ0sGI0WCM1vvU+Xx1YL<JnDhtoJ*vIsCh
z4n6aPa1Ou;70rAhERavP_do#|h2;W4QfD{@Yk?r9kgM&4gsKn-z?I-kk^vwY%{Z)<
z01yPI#W74$2NlN%AWVRC_b(i{V-70d3aW*0RYFVv01JW51^_H28csdxqY(oc5TIl;
zf<}0g!WavHca>Gi4{XT81Rqp4H8_l+;3qVtB|hi?+H5<8O|N&fyrf7icw@BY*ANJg
zVC)y|+&1fbru=P3ejria#|KA?0u)xLB^M7F$4jB}Qu}4-iaQS3Ro~Sv^#|QFkZq`v
zX}Gk1{l#5Bm;V3{`M>bt)7(q!)3*ZU^K<62tc`w?>3ZpRI$N&cPZ|T_x1jKU_XcgN
zQ~eJmKhFMF0rIw+K+*z8su|+})6_{-R~I-ET%~h{NC1jP0OiI440ougS`|QE!`VK_
zCZS$o%&2n3WME>2<PfNtP7>&LrRGv(0Sd4H3xEL-AjM{)Bxw#UL!{3fKV*nXfu>2p
zaVHp&q<~2%CJ%%v08c3kJTSvKtRe{jPDrHT2ox#i0z<kEsdgws1qLR9UBpj`UdctD
zp}Nxl0HU{Am-cNHs9e)DuI-#i4*)GUgp+E~cbD$#XT=qyQdUKzYWBhf1A)Q82>=;L
z<CYN8VN)XV?&kjLz4hDuTZ^i$uUY>9Y0o9jGyecAL5P^_0uz<4EqFpPBq8I36)5JA
z2q0o7;ldNl29jC}ijK*BaE#Caq=7Ze!*t?X>^v7Zb`2nfU)Vy_dI$nsi^WD5K`L-g
z9n#|g%IT5`<AwqgNe4(JbC+Z>1fe`KfRa*3amhWaFp!F-Dg>sBKIn1~q~<0efST<D
z97ZHyXWAgBBTUeV0*4jBApkUhxs;rM1GuWJCWFdE%E$~!J{)179N_0WRw1g0F+GtP
zKM)DVb3O1!JEWK<q1nLU_@SkxvidtAh#)kAt}+K9+1s2BWN0VM6O4)R)Bf}cl5~tf
zBlkk`$WODjc)$U3ng~LPo*X*_G=N?9hXPhd#6m6_iEzW=!Bno9P6%;!4o79AWI{_%
z89*`SbkL0XQByuDt&|565yT*#kf4F#%>bw*ra6hUfKwT$hJ`%(KnezO;6HQ%6x0(u
z&ji92j#5)jC|pRx5IwM(a#<7^9N|eqg#;aBV5*BK0BYcfu9V3${>Vka$i*e20p74?
z0CFPWqD2;MhLT8YNyD%n9xx0_SkMl51Cc$R5Ef;p&48Liil8gYFw%h!aY>{!fv1<7
z>4>lr>1r8C2<_ryFK)c6w%*(B7W~yL-pb*KiW+&^<Mv$MSi}o7IHI=|?eK-=nv&5e
zc7w;d9Awi?XzGEgs%Urb)mwg-`iH6i07>b7q}Bfblc&|xpZR|bDCZy7+<shsoH@qf
zwLhjg@{?Te)HgpXs5R8Th7@)Fd~?%h%=*3NFO~aSd=LNAx)OC9rznP~=0CS&WSI#J
zbBtQ1XS!<~<aEp{vUD7ylYGV{w|0#fknYf!M^~%NNGpeT&?Axj$f>J$wxy1mCF;Lv
z$@DfYvcGtU-E^@(4|z*jnX0GN00qO@;|PGNt0^bG68p0ewC9A0DVKh5nhetch$j`{
z2!IhMAWwRc1K~sqDV3HyAOMLJKnWiaL<1;OP>K?BJQ5qkr+#qeg$Is4$OHrfB^a(5
zK?I)O-v|f^LBV`r;L-q`iogPEON&|r@BwAou(HI0PBWKu8OWYEm;`x{U~+~6P+YQ-
z8L0&ha)mNUArz?qOBe(RsQ%~xa`(*lLIKD^oEHHINlpbqcV`Hc3*m$;Fq({D02zUp
z7%U*ICDR{z2~6aGAutO{g1kbR<Jk-ZRF$Y^W=?QOAZkoxxfFzmp}x_dYzE3)QBF{R
zPE=BgCOmtgO%#wnZrE^8(x53aOi~2Z6%`eGVSqS~X@|7p21<vqctdo}M=&|}&Js%>
zssJZqG=N+ha0<A00Ae?_IfoJfGQ*7FFrfgu&ES$*lL%@cl+*?zx3}92LPxhrXau5x
zj}Z-NL;$D(W};oh7?TD#xn&_CF`UT;T)0Eff*SSZ^E8<?Qv}rw0K(@u$fC%#01NJ%
zU3xd5D|Ic0>#96ZrBbV@+j5fUHH94fxpngY0E#O6R@Za9Zg$&Uu;;mcg7DSS2Q^3k
z0O?Z>cPx6Fq_#_YN;IfjQ}nH^SSe9&4Kne<Au8ou%OI9Aegz84<OQu(hu7R56bUte
zO*~_0&yZUsHmB<9xaJTcj=<285a^8sy_{;V$i1bLWz9mFI5*g!2*bYlF|*XQt44)a
zQM!A7D&+GuuO3%9zh!*w`w`q67Z9<jzH1AJ4e;?0)K_TKo5KUfdyR$e&wpQQ+$fqi
zR*y96QgVbUt<qhK9vs)P-y2;R{IbA+d4^utzkJ~|o!1vlVIHsPtu;KLLFqJsJYN(C
zvCU9WoQNRJcuFQgsyYd!vINM;E@QNqM1$p+IC$j^5;HP5k~7K(2=*40G6*Lyct9dX
zm1?TzJRs(Mq(noP8Os=ugD`XK5DW+jDJW}{6D0(qrykgJ!2qC_3Hzb0nFGz*1R)F5
zM(GmF33o$-#-NJetB+}jiwKyIDwxSTVFv0zst{s)3PVf@wMd}~4COd!a5w}>f~nvj
z0D%YuOk$>8#Bqn|#lR5N7@o{hBm=ayzEyAyYPgKt&`W=X`r6lgP~lFZ@`6ez(g`SY
zFwuiZmk^SMqA>)B3WWw9FboeN*q|%J4&{YW0~&`SMiW%Q0EokyU!<vMDobRcLioW_
znUGZJI7$Hs$Sbl_DdLctQc8hPfJ&f-xU}FFc;|+5f*d3X6Ol8L;|o!exvdV60%VMb
zV@e*zz<!pzfz83hG$<p7K0k^Y57{Gx=Z15Fx>cdd+}98A!w`d16(|P*JW0TRnb5!l
zoJi6}7?~V2j6aGHG749)9XQJf1ty3{QfI`#0wiD%cLbI{??MBaT2e-07BUFX;*L29
zm>jF9fB`B%FaQX|j7}hZa88oYK$pcZg9IRB-wEZ~kc4(6L6G8*9DtA#QU-JO!s;-C
zGB6I51v*g4aY#g<b5AgCCPsab*@6gF?CjwP42Ip+ZM9oKnkv+%RQ;V)n^KrP`fEBA
zLV-X*r&pWeg%kO%l>HsQt+-opuWf#fr~lNuyMdo%V%45NWf=}CQz*r(Ovk!8?uS!T
z4zL)4aX`J8#}ezh;<r;>H~tH0Q)uDH2_NQ;-1(FMW01x;A5zsf9Yea*U0?N8Sg9fy
zbW7P6an>#LMO>Flbn7Eh2n3=I&e#Z3hq@QLx^PEtHh$RFVH1fiDa6hisO3x{-aXJ8
zjEVaF@BooO?K2-lDbIr81=DeHIQ>uoAj)Sr072NHAa-F0q>e`!WDFfBhY-Fp?D2!C
zLJr0Ki~s{XCOycoP&heE=aPZ}5Rp`wWD4+!01azi;?fE7h182YaDWI%E>sB!&PkA8
zY(A!)ZZwp!PK|h>8T(;^U9*&+5HyYi5rjAu8d6^p98ef)xFI2UE^wq(6SxQfAWBLk
zp>aTv@#e%GMMIBfFcn0Iz-mRyz5yhbtiT9^AgZ4DV*n@@0oe{m0Iml}kUjW7AvBSU
z=aP0n01OH)W^f23s*#8j?0`zSBxVSobQDlQSx*cBg6A5OaT<$2kN`;%02`;93E}LA
zO;LC%${GPujnowY#1|7J?T5@LCQ}8x6+dGD1RxC{qt;`%X0Y~&A_Ay~3>G76RWrl3
zC2JfSq)Px77?Q|BIpRhEeWDy()KF0X0%$vDlqUcYc|s|gcTC4Y=njLcx>l#W?sSN<
zU=FjldJ|mY+K*}SP-mH+@g4Ud$Z9R%ZT?yGrF%d(d%7;AcXr0px1{0B$|Vl6J{MKx
z{4|>1w;qDqEn97D#v%|0(Hbp}WHj8`^@^2|F=_)yL6le9)SQ~77J(3<LV$rCXNEAi
z<rVGMakbm+Zg(5%o}I0&H&?iS22%y@BkxN~KGXSQVY%J4iA)fxp!=hW{N?IzPi?p9
zyJ!Ao`4<=ZNmkINR*g4*=T#cN^kPVoR8=`1`@CH5`@P=vc%HTGQ&&pKjVLKn5)z|G
zHIc0W1e%E=fk`TO#H|QOP~-MSYc(h#a-x)Lh8STyVpJmCaTwiGQb96NAwDp4s0e1?
zaO{U75(z>A5!=EEL@HEB4$;_5CL%Vd24E_K;vxE8iKMAXI3_R%MgjurRPG)E08LX;
z0X?`v7)T+(c;^T;QG8My8%&TMO)!c|YAP@QOU(e69x7CU!T|$G)gy$_s|`ab{@yV5
zsuI+kfX}rcl|hdXKW^w~2DD<hcEhBS7-QT0y^uqi6rLFY^}r`c1x&cfaWaVvVRBD2
z8b(uyU(9<ru4C$lmyk&bhlDTyW+pgcoD4&nrcP5XIhZd9LByh!2iXp2DKIpo$pzVy
z0ioFh(xqjD3m_O(%|Xl)GO(M6Xqi2b@+RRqkF<mMfW{gcCJrczj$M$35yWAb;|RG*
zN<|1sg@n-H*B~S`1#k+1Qh#P{fKMtD%&J|$;Dtm28jd3ZsUazh@rP<4X=)MW3ot&Y
z3IG@xkfReQ2LdHINeF-ggo5i{V11K@s<Z$Up~inmgEu;9h?pI`Lcl!43>2Uzi8EXv
zU?P~}rz~V7urQIw6$ke~QcrFv2e|e`MiA2EmZCX}g!?AAVF0e427dSw=248&8~~U&
z<B(9ju)tjMo+OMVE^J{yL5GGJqC$WwfB)6FzD+6N7>_!N=NS)npeC7*2*s)9X#pf>
z+GCqjszgwoMDpgm265**+U>7*ad6OrwP(C02T><H<IuO&g}s%{a)45!R;D;<YmDR0
zZ1RFakQ5H_ipH)o_?erO@QUqVB0%B$!G~(aGel1O;5g1kMr5AI5&+FfS@4Y}K$&sH
zX0SdkDI<x5pa}^t_GJs0;#DQa0146>i9A4AK)?WI1pHwHRU{tug476Bc<KboXWIY@
zNScDSXj2auahj)WfXZAWu=H--V0JB<jrFamWp?YWTISA$!Dy{n=~MG*RdlI2pA>ZA
z@~@)rXttwls8(e_qM)|7q|yk)ex9AbE502toSM<T;#6H|c|uc?4iG?Klazso7?fE@
zr1$>-l-&=tZC87x&8@X7drD@j%j#TFYB%BaKT<$*{EX%Ex+&E<2cupBSzOcBC+Xd|
z006{_w4Vt&=)#>OaiEjch{*&XtAYbOBd`Ae{{S!PN{;-yoyL(UK=mj+qFuC_a)=-D
zcEX`cUGB9!^whRxL;fb#r4sOAwVazyNQ;Ew5Im?CDaTN)@>=}~v)-+0XZh5sQ}+61
z;x*O2QFUd%^6OaF2M<oIK)?N3EJ#)F7S>QgC`9oK3Cf62fh7kM+zYtHy5r@SS<qm2
z=$TasKb2mIk_ma%E4!Z6x$PC(ZMs`N(y?fBSmc-sHkPYj{4nYcEofp%0!)b}UJw%*
zrfZOdGjT;;-v}~7asB7A7=lcLil~`5;okuy2`Wic3ompROk$*g^iS!4nScq+WdMMM
zt_2rKUgO3d90SY94p2x!iT96p*@*D)ocmyzp(G+G!T=vlLxYIr7@pih5fTRgRHTY3
zeQ^Lu2S|dY5fM8QCT5gVAZPm|G{kH+iuRVft(9x)Os{Clto=ndf?J@H3zP0@*ZeW*
zov%Y40)jA6f^i*@<%MYkVxbC^MV4{hK3??;w_T3dReIf6f6>-ilb{1h=ke1206QN9
zx|Tzs+H6o#9kYzc)9p7+eK9dU5w}o^u@{acTB8(TU+xs`X%Y;==L-99WeQ~vd5&Av
zJx=PcbqjYa%Sv};160hEBUWc`9Zy1#t#@$nC{PY{LbDBQ);#ySZnd|~%I|Z!DN^bB
zgK;gU(WaF#_I);KY;XF*b*x!V!&&t!2Gp#{NucsHr`uKj7^yo*ASNDZtotHSPL&Id
zEII&^R2g6=l3|P>aVsMw>>O4IC0tQtpG2j`7*x0`vE^ty5K1IgJ&qxwXycGTxRoVm
zj5LtCPb#AEKs;d=@|Kg4&p1GpIAP2%)r7wECNYOk#f$|2qDzA3+Y$oQLJ4pQB7i$o
z&TxxKBg#Qggz&_b!|udD0b809W&&ja6yj8z0OJX*ILIoQl*1@+DC`hQSnv=4N;t@H
zn9U%4l{5C|0$sob9v|HZkQ(DO01)m6v?nw=fVosV%s7cdOwSC-%_D7FruW-+&^H!t
zq%SBk>f2MB&qksAr}>8~lL#lJ_qPpBqQp3dR;CF505@bCv@eenku_Lbl$MPnNUd{;
zJ>(!0@Pav7b9pi)WCscM#^?Uj-iRmArh7eFYxKb#F}{(@L4~1z)u$yNi?Rx&OY$7A
z`V;mVazJQI4{XWpoFk>~#7xYedPd{^*WW^cL8^#?tv~!wuR2?P+jh2ZH@1PBZCb|F
zwf#modfH307O~pfCcm5R!>JpbTlV9+<+(*=ilL@Yr@9g+Y4%IL2?24L$jLkVVW+-*
zivgT=w(3&FRFRmLIB|yHjV?l+zq$mM4CZl!xSY@-62Q$?6BwpLK=_7To)C!T)QOg?
zrTy@wC7P9y?J|@#+C~f9umF71X`UIZhXDv26PPJf&ph};Nl>VXiAW_}RD@MBTucHG
zbee@4Jfl2e4O(>oNF1t?1gXXXoO4_<5`xqKkw3fOivS1=Qyw2o5Ds1`IBu_m<g)LF
zB8h;FKw7h(e{?hgL8Kj1QdSf@Ow+MeN#Bep0RPsvlBk6eQH=7W6cg=9#O}yoGKdRC
zsh;TO(CZpLt++P5HM-Z&^zNn)ZA7P^;*JHSC76`svwY0C^m<|Rft}68Dp_FFs_cIR
zakN51kd!J4h}mIMHR{ZKr&%S@SK0(Z1_}zOPo@OUQ~Dsee}^QW2q#JwfC5ctKY5GH
z!$u}(N9{O9W@@<MnS$U1h%@(&$QaFYknDs4oSaCWP>g|8C>QI5MDR}k0NoNck*@Yp
zWZzZ7n3isG2w_Z4@zYi{>FpG+E<HL{+Z~;?O~YHrR@0;%qL^sseNRQr)AysYS#I_h
zj&&=2Z8WJ;uHgDlV_QzMb<;|YZ=_}BYahw+$2sXe?$c<gb78u?Q?{=5xL2y*%Jo<@
zKh{6#Q*u*#YMx3ypEJ3l=W9;(g*N{HPuwZqTHZPJX*%)Y)hT+5QNoQ&ai;ErHF8nZ
z%AfxLo1>M9>rC*^Die~()b))))mvST*|0sAzP-f@>eNAKZCguAhWeb(Mv?2bm;Ppm
zrZT>!hd7NTNO78iQBMIEJ=3UmzKpiGtADxw09NWM!S1uNd0?*R)dSPEp>0Tdv@JDK
zXY+OXUlmxPS<|?sZ$^#T6>b{VitlI*YpNgSpaViHoed?rso_Ddgx^uxw*l#~l$ht#
z1sGlif=d>iRX>$i^QmZKx!vnFCZ|T_h+l5v3mH`aRi^j)Y(1LZG_{_1kZzPsT9s<q
zFh01aQs9S?>Q<!YRBPN`*;4$iK8rpV@;)<ZZ>?h}(|rT2uBGZ#)UJDIywwssytOJ-
zdw&x?%ne8go$E2rr(Tg|M#$2ba3*0#;Ye%iF%f6L5tM=ObUxnVlP+y)w^Qj_vYk3=
z&`ZTIPNxQxX1ASMb#gb8;EA7l!)Er=f1vS5)AU%<SmNpen?^s&qj2igy*E4K<mYrO
zP*B#Fu#H5AH`rA&F{kQnp2w)R*V$e;0MbnxhK_nw2yhN+PfY&+GM~p5TfM@aO10{m
zYJ|3taZ=_v^=iFKn(xSI^jqjRL-cYor<x*LL3ZC&?G!1toho_-8lfEKijFN{)2B@@
zEl<s->g$KO^a>tl-gbNKj{c_Ke@qID=3~GiwJGIMg!!2-#rER{K?w=t{gK}KkEg79
ziplo3Kr*OT+uSuC>MtfheX4GgDmA>Z!mlkWZTFWJR|slb)O8Alg<GkFw4v~ZL8=ib
zNxFU|SAqxyXTo8l$NOM}X>O$<0pkf~a2kjpp8TQEh_vc6AQLc7ND>GW`$ZIX&nP9K
zc}#)=gqirjac~1J-qgYX6HpQpH3z;VXe9fDu~=&ZR~aPFfQJrUE<8-QWep(`NQoTf
zn*GvZ+4Adl8*fkxSD*TQ$-aSEB~X7M$G6PS<BEtFXQ(0*+nh+p)T2?4wyCDIHwV+G
zCC=AL!^N$J)}2X3(bV;h(Q@WFfD)LIf{5PSUudLxviAETjndnzH+`<lcA_-4>X>tz
z_FmKJ)o1Y5Yug!{=IQ8Aa1wDq5aOdKg2=VTtkv~a{`7Ud71ZfUbITa2(%T-}di$>A
zO*?CLy1m6r5sVt8+wjKL%HMDOK(D-BQl(Yy<)>A{!B@)~Z=2ioosQjM(-s<Q8|@dD
z{+p%xcAxrsUZX$aRBNic{{ShFm1T(K<%!8R3%Z+y<!j2s2i&{V0ZK3*B_L1yHW-D(
z`Opb9LuLb715e`B=7`jvGOj6>82&G{3sIPwC<O@$J7U8)?M^ye5gUZ(huMVCM8P~z
z1tf@Ci6FT`A1`lw0vdoQkdZ(zIPlHTPy_)XK~zk0C>C)16~M_R=s*DNSa2!Wd<-Wf
zkV;9~bASZlq$_5U0OL8pa3Fx}Wq|`2L>B_=mOzi}gyPXG6ETEEi~y?Sc(05uAfr<f
zeYwB{%@;{bW-*DVTh_R!C{ST9S)7FY)9-?*Rk6Y{oRnglZz>U$R1M6MOr<1<>Jy;L
zBINMSI+x4umwQ(K0J;8?cNJ?;7VoY@DjHz9ucpt3n>e=hx5~ba*tWZB`*n>^S@p{*
z7hKkIv{9<`=X;nY!SKXXtCGyr_*Yf<aivJkbi?s=H62?`)Y;Q?stpcq#bU>?!3#@4
z<H9RNt$SLJ4Adtgeu%PJ?UXu;sW$ps#ohJO3#nVJOOrZ!OU*u3vHEqn%=2pd@rl#D
zJ+$syuU1)JiuyL8fBG@{ih<O{wO2Kq?Z-=WdHqN)Se?1WN}hC@JAR$&)?vqXwn@G`
zWaRX8mY?P|4Wt5>K11xT5&*#srqQUAB4jwm%VpShi{7BvHtN5rsY7wB@m<cksIFru
zRj737)2E%3pQ}_YgJ;<{9m{IBxMg<>y3}b^Z|dreN}8Rd5JLTyx-6Ft@|%VYs5_bH
zizEKy<9xSuTAeqjX`Gar{euKF5N1k^M3Izj`l^GS@_yays>*Dqy!w?I_XfG`TJ*g-
zVHNMx=D$mS(q%{dF_W}(&e3)A3wGFUDOG;mYiV~{jlb$EH7Zf5KvQ<Myi__{Rr1vu
z{l<->`f=N3W>u?YP0YTbMy16^`gOH2uPk!3`EDzDq8xiOySj0v6i%<S%j-U)ZtP>k
zQIeCCCZ3d(YQJpcvr-jP2MjUB0n`Z@<_<|6&vE&A(Ui8^%9h(>g<3Xr?&((b>zC>+
zI<cr(rOEbUt-JKcPTO73ZM~4EO1+Jp)`r!n;jXn7x%x}o_ols0iXqFlJ94`e=?k*7
z{{R#g=le%HiRqWQH@-|I6(wn1$D=u%v-0CEFo_OBfj(*F+ZV&JY@ZFNlYl2%igt;%
zHoJdH^u>L?*6QnT)Lpz%u&f(evsQyYN*{{Xd5%@VbKHzc5wtF1f{r<v${o-K!d
z<e=jkR*$L6fFx-EY73O%j*{WuDD7QQ(H$eS*qgg;w@aztGKpIE6QuGvPv-XWx;ZZY
z0B1*U-uF8jX1TO2Z!Rt9wZ~CwDO4>1&L~eVo@gvzh5Jyw1k`7`mCwlboXfUlZMeM}
zUfAw6;1Va`IKtD)PL=K8xG91dxhWGtj(He5S`p2j1qh3ZhLAoIqYQ=^!+=0Z!wE?C
zLy~$)nIUjV&nOs{jZy)7l)xcK1mleV0CXgB7bJpl@PHHv<$~rR!6ZP=MGh(74FD8j
z@q3dPP~~nqJSUDEVL%7}*S3Q>;SklvS$M;!{{SqyN4g=b?qcGLkoHG2L#?UJersHE
zZB}58XH~4>+#0}r?v81w1ed>$7`uGVxb=7HHo%8|p51{)MecjgwWYC0YKa0uOR_o^
z)gY12zm3Mbgmi*Iko&!pNGFs4*8)mRDg~aNXbm8V5)cv;=6iF1a$j=9%7+JOz@Xs(
zf&|Kp7qS+l2nqrM_(07vPumz_pmBDUT}3g8DAqRh=~igf={>IvCQ8=_36H}O%?T<3
zg#&~oK_vshFn|^X>Ay1W3yZ34;$)W=RhpGgGa_1lFs++Py|pSgn{_Lq(|Fq&%%|%H
z&qK{WOG{scM^keu=>Gsy&lDnnqve<HfU_LRnGbY2k9#^s+t%GJWZgEFPqS3k=FklU
z;mr0_uTj|iakFP!bkPX=fE$4Ym4~#NOlXcsN5bKbEh2DL4ka;>kne#$!R}{mx9dKe
zkvoMZGz3ECk8pOcbZl+9pJTZzM{=VglXz%O3gnIeOpNj4KV$+MjGVonz8jEk>^&<_
za<O?zS2XId;3>E``-z9vy0=s=3EJk2<%y3o0MU>fe`Sn{7k8G(ZSHE(B@xuBxu5J9
zTZg%HJswM51DZ7hoiYfNHNnjqwCPZ1ikS1`ckSh;Q@0D3rW)!Rp6a@zDV<x?Y$!Jh
zjRz)%31#JLRvl1oM|LRU+2Ek6jBt-W@3$<w-fym0)D>$v<QzkgYX1N>sM|BII+~FU
zZR1_|*r(|+T)3?Jiq$SEr>RlSd(a@%=NCH<ykR!96tWu904p+yA^YHo5Q732wi-(3
z;R<pj9>_!oD}sc_7X5HrG8NMk5CB<l2xp8rsDxl8jM5qb6=^Lq0d&YGz6)Ghs?}32
z^9VRLM*)e#UJ#VIOBAHYhfxj=E%RV__vHjUla7;-1qT`XA~aMkL{O>15Oe<ki3IQ=
zI71rhb)S=)>p;Hg%Yp%&?TYpzZfTDXPOtv}RyKWY)IN<$w5<<425JvEIGsP`Y;p~=
za=O}gD{a!z-=A#pQHKpc$?SjJdkdQAvew+2eRERhQS_@FO$KDN_v01abW6O^W{(2T
z)7KkKU54uKcDii`T575s<#X#ZLDfthT&Gr<X?D2fmlsdExv6sXE_pgbvICW^koX%L
zUFiOnlPsvY+G~DoMgj^q;9eS!!x}_$No$1j`chC^R893EepP!qVm-%DnjA)@;4(rX
zhn`Et6w9*!VFHbil)wjx?SkUG&i08SaKU0_BZGh~C=oCK2@dEoE*NRd5AGyGP&ZbR
zNWdI8VumCrNF~IDktE9uB1<3&l^g-mqX~a}Epw43YKky~P>|6z1p>&EC{789O(cR9
zWWAm77ns0wtkzSQ3O%r!BdS>`NrIvVpoIBR4Etvr3vIRagKAfmJQ&0vm02DdqB5F-
ztLakf9DA4UD|VvW`Vot5L{_NQz1moO&H>ITFo?mPn6+)Rw)=L5fbi1kkxHp2gu@Oo
zbLndqHipoQ(g>#TCa7P#BTw+s5p>*3g^Sn;O>`PDm^G=4Ba_bcKb6R}$57q#HoDjE
zJ*RVl#$0(-ZIt)8)vT#4m=(T(>mHrjH%gT^3x`|LHsy_|`c+X&+ThTPCkWkj{{UU|
zw(quETkW@wx~kgc?qxbvWYb2gH7l40nej^M%FnC0Qg9|#md$V^nR<Gg{?ut$_bCcc
zj6%sLtKAzX$&_a(QDEGCm>o+VmmW;)dW`Nk+>~KGsd;`BSUPWVG`Ib?aI0l|N`q=r
zeQNHst);9E6rA@R4n}&P6Ju@DRByKJ^Q<>5!)?8{w!9mottwQtula(tQ0gI^isJT;
zR>pqSbzZK`X;toaw{7*xsh)>f+${d}M@Apa$~QGP*Hmp*vfkdx-lg3)nvV`;O4Vsn
zf2PkbrBTCMsrjkv^*Ljq%$%h=E{eFN{sYNzp4WXmu16p7GXrY7yW97^pSj#Ny~^5!
zwbQFP)hsI@XX&d`k!v}*>NAzR&4=vOuJ)bFXt>=r&Bn^o)Z4P9#akN9roz`ZUZbIB
z%<fH5V6eKUPor!tF6uU=TPwQOZm7q-`s&dSZAz_kz|z;eYxLXsY}cZf<-<O4i=D#9
zZQE%%f~8{Xnr^6H$}O75Rd~`{PgaVlrTp*mP;afK9QksjF73wRou<1JDTSp;@&3$Q
zZdY6su>E6Swcq+*G4~Fj{T<~^vbDAC3O5w(Yt#>{MH+&RhSSs@=23gv+Kq~q;keXg
zA#ZiH-%@?uHq*0xTY8Pw@)%i9NgT!cnWQ!?A1#H&I(IIp9?OHe`gPjp685-TRbXva
zyf;hfvAo}oHGN@HxAkrno%=u2iyy~7yI`BeY~<2t(=gZOR<5<pb<)g`OA;c{ZHz8Q
zPQzRi!`$W8ut^<7xulv)<Pvh};!-ZD2<f6aN$0!ofAIO`^Z2|!ulMWyeDXHn<juX6
z3srB52D+O%9+xy9UpN{wupv0ySMYT4aEWJlr+ynM!@ad_`9YIUfO|`IRDbW;J;~Ek
zo^s|zb>-C+&HR+OmK$%5?)mzQCYVh6_I>Jr-`pT>&t2*5wsmhM3*%&o`={ws?2UNb
zj<(sPInSOv!IF`*pTB>sY_82hlFU8Ky?kY({*S+zp1-nAiobn4ckPy^mf`4a74sTu
zTo~{0hw+>z&D_l}Vt@60lc|WUz1MvvYVZ3V9jloP$)#Vj8mxEf8mKFM_$_b@UT-Ec
zh)!MTk)52MtE`JL7XyN;z`!fa-I&SN9k+)J&6@Upz3^=DWx2*mamI5)qol_A=Bd-Y
z!H;e<ySLW&pFcEl&Hdg6#79FR!m=wnU}k5gr80A7WZEh#X4+8H+Vqdnjpw!W`cf$;
zWxPxYvmmP1IdJURmj4&VUx$qHcRs!RDf6Iz_Mu7#+%4hQ()We4={I}z&uc0DF^4yH
z|20t^R-kVj<C(Hze`=)0!>f<c6+fzEES6+U+G#ylc>^BD{*UlwRH6W526l$#u{d`M
z5uqkg`%o@I%oClkkUiwNVj2<*(>EiW8<YcMpI!!6p*<Qcf*yN&G}y!6$ndLjm`CnN
zCLJ+nKSCSR$u8ckq)<5@3Zg6RxtISNX=<&?+om(<q}pRkC1-m~p}lv<b{(hi)>BVb
z+d@l2c3rcRxHRdi&-0%4dah+jyVmLMx$xKNlOB`1sG}jE?x7P}MZS(*u@`PUJmW=M
z+#8ctI_BH;O6^}ICL?fP>IWyS>CDhx-@fk+s1~ko9nN+nA?OpS;Y4}7P7fFk{|Bzq
zrV0pM`<-1BuW)!WJ<@qtHr=!zB-Ux{0kHAa@={b-iW$cxnd4NLgug&n(@MqZp>?1r
zvh+#Y16?3M>VTbpVUX5(ud3M4zXP6i`4sZsY~{j?QCXIMgZet)LP*0Ae-x5k(P_Q=
zCmDc^^&q+sxW*Gx{JFZN!<T@u!lP50L6S2919K|KTX8TG45(26bHtwyv`lnYn}RP|
z@kggcI73=?s{vS118tkfDk|8#!wb+HAuX>QhT{`3!nqBxl!X+37-j}oI?Kj?Da9-3
zzQ=i}usu~#N8-;MNoI}0U;x1gV@U_3iL^w`*r@Y&z}W+Z1($Gvr~-o?()uC$sxD(G
z9g^0RR3tPI5y>ZE+lAoy3EX^8P&VO-l1ja<^AG15TV5`idiDIhh)Cx^VBT1Z{iAc$
zzO?q0JP-^pa*be26k2B<(YgSDgjF;`#?+2%df~}bHN}smQ~cqx20?sjpNi51%0*cG
zafq`?PQ7-nCpKh$Io_{vEyD93#*NpPTE4t}HCK%b9;M;yggG8-9MpG3Ej0FUKio9G
z{FWN^V!R?GG2dP;024{dv42aTo7&j&{KtejiRsW3<K&Q_?A3k{C`KysoiCTBYm6I5
zB@h7Em1I}~DgrzYm$9t)oFu9<$HEs1xMsk(<j-4?C_e^)R4=*{<YXZXw3n&2zo_7n
zp`PhtX+vs8&Q&&Ta=ND#oZNm~#wghIQN90OwbNeinQ_a}J*~`8n))4zJYI~08JD+d
z&Z!=%Yg04$^66FQL@%N}Ty(PO-;@Ww9?hXUVx467+-({t@)U+GU-8m-wey8n#*M7v
zIGS%#o?(3=L^Qq_>fm=a^pI%d?_F-iy6MnwTrZGE8wDSsbtFOEK5QD~p)@`BdCg_7
zkzYown)|mYo@BMqcjL10MBC5i){>IfqM5UL=`WB~FFsVeayq4pT@{Pcy6?6>%iR0%
zi$c=}5!ur1EJkgfF$iwqS=6fdrAFwY!AG)BHlFV5lT3XpclL>fxW`~ei!+!sohr|)
zmfKSs?>!H{d^RN-7Z91(nG;FhijP)_YTkdnyyS6qE;(lBk(m9=L&`g@MnpfUUJ7qM
zo}4dp60|bDUh()+OM(aB2}V>=By92}_4IT<&8V2_=$kgKX?Ks@%IhfUj|&_Bc9ZD4
zXZ7J#L~>({$H~_}TFfPt6?T;Q?fuM{7akji2(#vNzQ&}?mo2rwi^3_Ieso<)j-c6Y
zOA-9mK7i){oA|aXX^RB>gw2?W+Cj(N#CxN^eQt|rdkHfk!Z(o*r>x$HzSkLE4+_ZJ
zbLaj)aI@kcR*f&0jo5cD53pMgs~pZ^Zb&Z$^gmdA5_6=rY3jINPqW+g>Usin;`hCO
zuOFHZ6Khf!Hf!TKL~c^y5IXwmc0`ukZ%SL!>yt-Vd%V>h9?cxO8RPf%;oXQ#k=HLj
z5W6pO5t4r<L()m&E}7RZQb*ECgZNgod2<@<hqxQ;Z4lXs|8jF{JlubcNz?yI3r#E9
zIvO1`u8-FPQ4*-u&p&W~Ekmd9sRhqe4v5TY(A@8v`m*##Np9VATFO27yt8YSe_xiA
zN(0_Ey|fnoxE;n0Jc}JTDejpbA!jKL-3#%cU3Ip~0<)Yi@=SU$T#3G3xgTBzR!dJ8
zcqv7=N4;G;5Z7nfH|Kpt!6u|oCDK*Rl2iQt4<X3}fF{3l^dQZLXty~RD0GBwQv%y3
zTruMh5`c(t7~xBz@lQVT?-fmOE@^vUDA+-GLwL;QepMi@Y-#U3x%&8gDdU_Da&bID
zp!33v>+koxRiJt-USfgr-e<HxkP4!@FRoBm3jV(vc-D01-Rry3)3-Y&IVMlao=i-8
zMb`C{8?#LFEgTLD_I{tTcs0WwJI`;6JafzaQ#<?6wLQv!b_E3c_G3!;hgO^O#}_=i
zEE<Q;-!k8QrcSkWBEsUm$=4p^k@pLnt~TuZ0?Q2)TUp5lPAs2K#=bzlYX$M6^AEIq
z{j~Qr{0YHmdcp%{1R)GygLWf9eQZu@cCE8NbgKb-VX+IwW_e|sIxvdFtfMBmMkF9L
z0Bt{<VZodCpVuioG*`M8I<C~R-qqZLto^L{1H`ui;QlV9@!9%sV8?nN-D7ytv{{lh
z8pesJLHs)vaI^;tLiY`?a4RCQ+W~el<y5+8tZNFIpzyf*iq_T$Ku1JcT#n>Kl65#L
zlyS?umoyKQ?r}<Z3$7bb0bsNA`o=_)ahS*J2Qwo_(!*6|r5PBrf>jHE_ow{XF(0hH
z`OpT|UshHj{_3`E<C9m<-(0p1C|Ito%orQqL1VByNc-<?7a7?pEtJ9!8h2y3_|)a%
zEU-?7MNX#VJm5)}H156RX~4nPHKbLzZrvB;PXW+guGJ;zVwN1q#0J`@atp<AkWG;9
zw63U$AQjs6Ha-MPQE;*AcAY5NK*?=}yaE8TcFE9L;ldoAdO^E2SM8Uz<aBq}DVShm
zb1h=Pj^KH<=Q)d%4Zt*ot8?!_DOqIl%H;x&Qf%M$Zz_vcYSEof#T2%`&lr|Uj*@hD
z7Ybg;u@jrPQi$c!pLUWjok~`{eZVnU1!oi7F?lk#s;PeK+HEwLO=JzX!C6e$uC=!~
zs<s@@Hv=>$SBQIOJ&*p}Ex7GgxBaff-&A-C?r*<=y?B2Xg=n!Pz+4acOB<`Qh@{71
zQ*sd)EnrX>aDoWJcf3qkPrP%?!q0C++jSjJz?hSwfZWYS64s+(cqxHnS&ayRj)>M#
zE=!ot7JOa67OrKQug#+Ag`|lM@`5-8+k69&IAxHHCs}USnxHfD%_VDyh)(oq5`%SG
z+;G>5I)>;0AV-vWpnX>AJW|=Fbxh*A8;?};Q&Y3%aDq#UlVB<#%8ePS%W@~mj=oxn
zuZsX{yrlOjOMzKPX9~8Rs&jU7NLg?DKzh(8t_E}pXWGLmofjo2Q8tf3R7}RQ)l&11
z#u70&IPR&ocRH1sxbsdBq`xNQOOg~=GLGS8=`b=gDbhDo)=gh>8IXP@O@iHPRk~z@
z%QZ>oJjp-;nnPX?z6d%X#n_MHgl05)7_gH5A#8T(&XrnN+^R_$E2+8CyC#O7Raqaz
zlscI%N?B`q=bz}K5RRp!sH9ra(ib*WsOnD_y~-dU_=8Hvit9-Hjo0vQfl@>o#Pa)}
z4^Z_$EG5!G#-IS^a?|Q(=<b<qauPC+6b16=F|~_-QmT`@tflP|w}2MHY79#_gNYkm
zu6-;jjlg#>EuTpg%_giMM|iafU2ee~ZElzp`Lb}!Z$sV3=yEvTjy--obJu+s!&Cmh
z(f7r#M>V>t`zt$*Hup28CSKh-5nvf{Cy#c2y|AhsUWXj(RAqaPRr)mWnx>Io^uP7)
zF`v=+7F!mw%i4pzsVB-ibWsglx6R)guUVHPj29v7=tbK5e!4hovzd{l(Zg~IIw(iy
z$X}JZhS1Vf=wQDBJ(%R>u(hrL;<w1ZP6ok2;<qBr2RwPq1|7|1hGpG>WxEqdKms_L
z17bW_=V=zTD5SiAQ`3aRhU)#{JpfEjU(3F*6Rogsrzg0^C+m!aNOizI-&YAAij>l%
zb<0nwIP^n@gY&-IUE#)yoej(t6H`Fk_COG-WzdKu>--Mv1evigAyjEc@m0euEMu*a
zfG&Q#v2d*0<s|T=FAB2kW8sRIQmlQp4f|LQA-v-Z7<p(7eqZmvTPMsGE+UxjMo*Y{
z%36LL59x$G(9(v6%1(2Xx;r&P1l>aykrhSGN_y1Hmt;~!zdd9MD(JA(rH?kGKRMo_
z19M5))vffa0+rChB^2?w3<K_%>Q30f;w{K#5R?f?&RfyxRDp7i;9@~5$-i~nUAfg$
z-zbRm*knfjfn>dB7G&sZIUK;7KM~qC_`CM!w1~U*%|`yhb5>n$@f$qiF*yQVt_~(`
z0dl+F_Tx7y(+OqGodvE(Ysl`AND_^B@nI`qc%eRj(0F@b&_c!>fFShA5~K3mGs9)U
zN+O=FWoe)Y2T4Yx#V2!!V6b<A9JJ#kjX16q#o?CuN9m)@8B%Hj)74~LLV7Ly&uDUv
zurR~3OOX0dV*PcjHdKKDStZT{eVT5M1{q%4bNMAMoag^l1$Lg4gY<;tGXTF)3b~z5
z#D#SSK?n0dj<!7RJ1}c*8xmhvft0qrL3BnbG3lx*fHMvupLNNiiGvP9hA;s+-i~w&
zxVd;ZV>Z7j;!Fh@2FOukwK~;iE8^<K{b-bZjfwsS8AM!oO<f=)^Buk(^!`jCj%5#K
zuXh-{dq>tz>ZSVz0X>QsLum)o&ncP-7LrU?pJXA{0amRNzTycvtRG$Abz*Yd2o5`9
zTu`!4&zJ;TAs0KT0Kfzy5rem|uenaz9`*)ENV=(|2$gyqx-bRr1#ghKmMm0a8Z%W9
z41VghU5&-FQG`n5ut0KaFP#@6sNMMrpVWEGl1Q*CJ1ShV>Ax7?t5nE$;^-UmpGPVe
zN;}uyb&575T)#F@M=nBchrW2=xyV)J|DQK)S>X&|D-ZeLzEfa_f8;Z>G5!~*k6m)w
zWuhn`Mj6B6I7RIGIT;MEE!20*XD8>`dqI$M)`}**8b3{;|3pH#05pftkR;2P$3Xq_
z{-2zWy_eHwdTiZJYPI+~`+Yie=(s7I-bW|x_13zB(VScJ=?Fg*Dao*eZfCb|{yF^X
zpsso~i1z2_30G|BD|yGnnvi)wW*=!hSD;8}G*xjTe-5@CvtQ>r544xr;KOBcY)TJ^
zM`q*YL-6tYlzLUjaQc}((}Yz*D`Aud#<<DDf)$7gh&f~Mg}L1kR02-!v9^6`k3ZhI
z#wm4`1HM$(f|Y-Wa$mFv<4WnJbY<vfNw~L>?4~5KkXa-y1TtI~Eh+M`&QuSMxKLLJ
zO2z2T<2gzcDXq`isQ7aN-Z`*;tPSqzv~sc(SUhwh*{6Q<!Z84b);4A$zEcT-o*)w>
zB%TSHwXU=_&rvO1=RmQF7_B*DL`cR@6@p#_Qx1v@;}f!V6FNag^+WoCb8-3*mjsgO
z3JCx9Ak)Q>(Tj77CZXg?xL1zffQbNAo(NVq%&p=1o86~%6TP?qIm*6h2yUWc(0#k$
zM;F<2I73x22VO@8Z`QWEC?H7M{>sMi#(YLPDzh9Uc{AYMrR<$EGJ_W?n59rngb}(s
zXzhxV-Kp)H&sX%$a6P6@eQR4B&h-zN?J@3MiC`ab{~MMxRX43=dCA!8lg^o`RxQ}O
zSN}{}9T2s}C*K=rnttsr<94e*gz0`I&Rfg&=y}h;qo-ptCwwN?&gGr@c$ZS^S5oMr
zXcYsM;1~X?4ZlfW8%=sR#ilk0-_T3^WJ?YXhF_@q<(IYe`eW3L+18$=XT$3zW?M(Y
znqQYw?;37D`<8u1?Vyb3#qeW~%$5|3M;33K93Ec&H{9B3?|*>a#E#~~?<KoBnE6k~
zFD%TAe<d2yq)wC$4&qQr;NW8tPSB&%Mw89T?$k-8?bT9V+m7xwu`c-ZHGE91VRR9v
zh;_Og5@Xm$*K>bN^HRA*>7=7ODw3OSgGxE8y&uw|IgCsO^QbFHvEMf#J#-Fjw(Ix}
zgMBSUAnB(UiZJOg3L>Pn199Ajam)s1kr!35KU%m>@&waZPO5`-k$*ne$CJ3IE3!-_
z@hwt*fJ)g<z&<~N(3)nTn*<G~a#eSR7!S#k8oLPgzh)EUBZemrp?Z;urR<b#njXYl
z^q?I>vy3P?3ZiK#%#4u$Z`HwBI>|rPd;jj$Iaq6p5`rs9ukFaV%0a#fqoD^mDZ=O5
z1YRRl>bWaM&O!Xje6iw!klZ%-xdIGH+SIp;tf?IX>YyI@zR5VscrX-4D`;tbTw%6+
zPKR#?m%*E+ACq-2g3?`DkcSvRn!FxD)csVOs(~8!ve%4jC|xp0TjikoVlh9c7!*%O
zO1+Ed059bACJ4Y^q;qjfP#KgPiF#aNyBdIU9SLb2%(NsIzSHVq?vkQB#$%r9)<9r#
zl&Mrxu6Q?uoIPQXpmQPG3|K;500iqi6n4>_WB;83#fHRFBxGtbx^fJ!wK$u;uMlLc
zf-jX}JqBcVbrNdbZwmeW^uV&wWxl*3GS?itBR<Cx?yFdAN5$Y8dDx&ecG*o9l_TY4
z@7bwVCT4Br>;SI%BE87j`?S@RorzQ>=s^3q4xBU2lgicnsZ6n=z%5-}-Fg6U-iA5W
z^X~zKVlJ9j0r_wIJrwB&M+f`|ymTx-FRUq%L9g!nF!g9)s>W{L8<o+u$8SU@Sh^MA
zI*LAzkCz*JSAMh&KFC@ll-x*<OK&ay4bIKP27=SFB0nX>`>VNK!pSJCl%BRJf!+^M
zjPE6S<OnTlSeYjJn+JoGMj;7Fl2+d_coZAyDEBMK1~scKBrKsCeIOd*8<V)5?X7wX
zuK?&U($rE+rOsQwmoeeVe3G2;SEm)K2yUZNg76_qNDU2?{8?2on3(w5sSWv+=y-&u
zWD+0?oA>x$3ZNMZ8G_mwIlxs<^U2({v-08mok`VS5e&tZ@hiVcVzj48!6V6(^NNqv
zfz2up@IG@R^GPSfxH*Dl&8`G0R0a$GF$8@`SoxG__Wp=$CPRElY{g%Ov3G3-W#SR4
zeX!v-M8kkxPtDzKv7|9y7Q1F(!0a^Wlj_gTzp8lYch<S=qvueLhT~y5!DZMHpm@fK
zpEjU+tOvqZj7lt5O8E*#>Ax4GS$h2Ti4)SFmyYTic!Q(?b9fZGVgT+M$+bJ2z;*AE
z$81xJD*x24-Y;?X{p12>|6UjhHhiFUX)~^w-7J{RGb<K1J?JTJE)3t5clG4fLI1Ln
zD&Zpi<;znsMoR&|^7&Vz$CjUkQ|E-*DKf^B8`{_8CVs0`WqNwn_nU+ps2{Gn86~Ps
z*(>`whwfXa<!yKCWsLU61|R0@)l;X-@BNyextFK+XgqYwF}&JqaeA8nyyUKWK*<BE
zyog_Z3p#N(#~(N?zqU$#b8xsucck(9dB>YBw}QO)yc`qemG`f?ZSDE`cyHW|d-pFq
zI`4DX=Im*a!;7oU^}4TY+kBqUYnMdR4+^ZJ$LqgKP<-Bf9>dwU_nxh!FWwJ5^6*^}
z^Zn_e=@VmH2X($mkJkSB_553$`ssH!sur$EELgjrZ|IJ_?caV+ug}YB$?0WgRqlDc
z9OjhU=y#PjBx2jcvF5e&a@*SdqL-((OHT-*)h-v`VsE+Js`+t$e9R+vU;z7j&xzUN
zsdutHEE;2IZ%WRdb~=CB`>&Y&Ci8F0{i@HCbm}IHPbPcR{w@zFQUCq&W~+BWMp|aA
z+ztZ=UESa%Coj8#r}-bhbku42%}!zL$E5UJ-&%9gx%f!luenBPwq;jqYpj<s|2i%2
zin}RB2RfwAG1ObkO#}Mu(y`Zwl#nt{l{w@9+~B7X3rXOzfW(5kK?-ShHtWe2t*rb5
zQUh8CZF$}1-J#|Pr&E>yLkFib0~Hh~Axxarf3i4W7y{zjqH32Z67=hAV+hLcz#O6%
zUlxi1CtZ71IO2*4tNR?qLWx?Z#zi`6%kqK7Gvv?7d6e#*r|_Jn!B*#m&59*e#Q;<8
zBE6e0W3ai-0aYBtKtHV}yj#$Til4T|A+?f^jQF=Ew3fXDg3*HD_DE-t9GI<z+&>S&
z^r#{G<@AB%ybjx<&ZxRPyT{5lcy%lGrf8$fW%mTo*{;ZGb(;a1YM@dsLM(QvO2!9z
zC^$*dzotB?AeJFoVd!D;=cIwMeV;CY<+RK=tfy|L2*W9=gLITb$W|GtG|@%f&QczW
z1T=zE1#^V8o%Uv*X|wp{Q9lI>^rI7<EEO<vP`sK_fy~#&w!7Y5KsaMm*L}k2Po%Ye
Q?Z2mJbu#7uB;!~A0}s41j{pDw

literal 0
HcmV?d00001

diff --git a/test/onnx/assets/rgb_pytorch.png b/test/onnx/assets/rgb_pytorch.png
new file mode 100644
index 0000000000000000000000000000000000000000..c9d08e6c7da91991a780ded69d966fbd0c18eb5a
GIT binary patch
literal 575
zcmV-F0>J%=P)<h;3K|Lk000e1NJLTq003kF003kN0ssI2|9}Dm00068Nkl<Zc%1E=
z!F9th3`5zD&^bCv$L%Pcqcim8gHtPF1b~zu{xL5x<!>mlEGZGj7-Nhv#u#IaF~&Ub
z`}JAfg#UT3ZorwgrlOmy&ZeT3tmdYokF5TtqKT|6OhpG-yO@e{SsR&(T3LIUib7f2
znTje|JDQ3TSx-%UeE*C;Ugi2tTyNdf{F~uOlZ7=kb3iOQS&ODRAd+<hQ=yS{4^ttK
zMKyJ+FL<n})0_2qB;GpQruQANl65~*L6TK#ig$~TmsM_RvKcR+9vf;*kpRmYH&<*5
z?hYxdm8tRTuOaG&JO1`P0+Thkh5}QA62NfLlO@ZPpb$pZ)zuUCzN>PmaMjNnZUjJ9
zlUR+5Lc--_C1A-ayjS3rUX``c4B&cG-3=31RsxEw^1%(M0Zvu|%SvEb2`nptWhJnz
z1eTScX5Ns^HuHuz#<CKu$|@h(%(LFqvV|c$c%|J9698nf=TuTqCj^|V1jWlF_bP{x
zrI|e+BUnEGOIEEf%}NV@XC(lyR@N`kQdaD>(W(4^W5BYWUP<8T=~wzvHVw=eSveE-
z@vmU*u$W9x_LNA6ouqO*$_do4iT5CcyX{v$JfuD{;{F;oStf}>w4t0GG0jzx?!IRz
zQ-)W1Qwl#ZaW_~0ufxGg%Bmjxm&rPqx3<ybh%v?(V~jDz7-Nhf@dqxz$wu47lIQ>c
N002ovPDHLkV1i6h2Xz1d

literal 0
HcmV?d00001

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 865b3656dbbdf..9d56c1169dec2 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -496,35 +496,20 @@ def run_word_language_model(self, model_name):
         # Only support CPU version, since tracer is not working in GPU RNN.
         self.run_test(model, (x, model.hidden))
 
-    def get_image_from_url(self, url, size=(300, 200)):
+    def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor:
         import os
-        from urllib.parse import urlsplit
-        from urllib import request
         from PIL import Image
         from torchvision import transforms
-        from torch._utils_internal import get_writable_path
 
-        filename = os.path.basename(urlsplit(url)[2])
-        data_dir = get_writable_path(os.path.join(os.path.dirname(__file__)))
-        path = os.path.join(data_dir, filename)
-        data = request.urlopen(url, timeout=15).read()
-        with open(path, "wb") as f:
-            f.write(data)
-        image = Image.open(path).convert("RGB")
+        data_dir = os.path.join(os.path.dirname(__file__), "assets")
+        path = os.path.join(data_dir, *rel_path.split("/"))
+        image = Image.open(path).convert("RGB").resize(size, Image.BILINEAR)
 
-        image = image.resize(size, Image.BILINEAR)
+        return transforms.ToTensor()(image)
 
-        to_tensor = transforms.ToTensor()
-        return to_tensor(image)
-
-    def get_test_images(self):
-        image_url = "http://farm3.staticflickr.com/2469/3915380994_2e611b1779_z.jpg"
-        image = self.get_image_from_url(url=image_url, size=(100, 320))
-
-        image_url2 = "https://pytorch.org/tutorials/_static/img/tv_tutorial/tv_image05.png"
-        image2 = self.get_image_from_url(url=image_url2, size=(250, 380))
-
-        return [image], [image2]
+    def get_test_images(self) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+        return ([self.get_image("grace_hopper_517x606.jpg", (100, 320))],
+                [self.get_image("rgb_pytorch.png", (250, 380))])
 
     @skipIfUnsupportedMinOpsetVersion(11)
     @disableScriptTest()  # Faster RCNN model is not scriptable

From 98449f5bbaed2ae8b2ac5f3e4d1cbe6445011961 Mon Sep 17 00:00:00 2001
From: "Rong Rong (AI Infra)" <rongr@fb.com>
Date: Mon, 23 Aug 2021 09:28:21 -0700
Subject: [PATCH 133/530] hotfix clone issue (#63770)

Summary:
This was discovered during https://github.com/pytorch/pytorch/issues/63408. For some reason only this checkout action is not correctly set fetch-depth

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63770

Reviewed By: malfet, janeyx99

Differential Revision: D30486110

Pulled By: walterddr

fbshipit-source-id: a67395cca2487407ed0d49c8c89587935ca5f212
---
 .github/templates/linux_ci_workflow.yml.j2                       | 1 +
 .github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 1 +
 .github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml | 1 +
 .github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 1 +
 .github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml | 1 +
 .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml        | 1 +
 .../generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml      | 1 +
 7 files changed, 7 insertions(+)

diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index bceeba51f20bc..9b4ba87194065 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -316,6 +316,7 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
       - name: Pull docker image
         run: |
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 1e1aec057c7d4..7e6006ad300de 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -284,6 +284,7 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
       - name: Pull docker image
         run: |
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 28180e3e98727..2cd316b2b8674 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -284,6 +284,7 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
       - name: Pull docker image
         run: |
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index ddb1522962dff..a88191469ebde 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -284,6 +284,7 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
       - name: Pull docker image
         run: |
diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
index fb6d83a0f2432..264553f1877ee 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -284,6 +284,7 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
       - name: Pull docker image
         run: |
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index eabc42408fa91..7b100e6d1294d 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -284,6 +284,7 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
       - name: Pull docker image
         run: |
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 47ac9f73d422f..9d7da989e8b9a 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -282,6 +282,7 @@ jobs:
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
       - name: Pull docker image
         run: |

From db1b27fa8db52402d23ff2f357856b1a5d08d258 Mon Sep 17 00:00:00 2001
From: "Rong Rong (AI Infra)" <rongr@fb.com>
Date: Mon, 23 Aug 2021 09:44:09 -0700
Subject: [PATCH 134/530] fix mpi ssh runtime error (#63580)

Summary:
should fix https://github.com/pytorch/pytorch/issues/60756.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63580

Test Plan:
- this CI.
- validated by running on the bionic_cuda container: https://app.circleci.com/pipelines/github/pytorch/pytorch/366632/workflows/478602fb-698f-4210-ac09-d9c61af5c62b/jobs/15472104

Reviewed By: malfet

Differential Revision: D30486472

Pulled By: walterddr

fbshipit-source-id: d83ab88d163d4a468f03961a13d891b658668a7f
---
 .circleci/docker/common/install_openmpi.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.circleci/docker/common/install_openmpi.sh b/.circleci/docker/common/install_openmpi.sh
index 7bd32c71f16fb..8c45279b8b464 100644
--- a/.circleci/docker/common/install_openmpi.sh
+++ b/.circleci/docker/common/install_openmpi.sh
@@ -1,4 +1,10 @@
 #!/bin/bash
 
 sudo apt-get update
+# also install ssh to avoid error of:
+# --------------------------------------------------------------------------
+# The value of the MCA parameter "plm_rsh_agent" was set to a path
+# that could not be found:
+#   plm_rsh_agent: ssh : rsh
+sudo apt-get install -y ssh
 sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev

From 560cd881956bbf425251d63f0ff0f9085a759447 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 23 Aug 2021 12:05:51 -0700
Subject: [PATCH 135/530] Kill THCUNN (#63429)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63429

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D30441308

Pulled By: ngimel

fbshipit-source-id: 3ae342a2f8d5c7f8827b637c4055c5d1b0a1be26
---
 BUILD.bazel                                  | 18 -----
 CONTRIBUTING.md                              |  3 +-
 README.md                                    |  2 +-
 aten/CMakeLists.txt                          |  7 --
 aten/src/ATen/TensorUtils.cpp                |  1 -
 aten/src/ATen/TensorUtils.h                  |  1 -
 aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp |  1 -
 aten/src/ATen/cuda/detail/KernelUtils.h      |  3 -
 aten/src/README.md                           |  1 -
 aten/src/THCUNN/CMakeLists.txt               | 10 ---
 aten/src/THCUNN/README.md                    | 26 ------
 aten/src/THCUNN/SharedMem.cuh                | 43 ----------
 aten/src/THCUNN/THCHalfAutoNumerics.cuh      | 38 ---------
 aten/src/THCUNN/THCUNN.h                     | 13 ---
 aten/src/THCUNN/common.h                     | 83 --------------------
 aten/src/THCUNN/doc/api_reference.md         | 26 ------
 aten/src/THCUNN/doc/style_guidelines.md      | 64 ---------------
 aten/src/THCUNN/generic/THCUNN.h             | 29 -------
 setup.py                                     |  2 -
 tools/README.md                              |  4 -
 tools/amd_build/build_amd.py                 |  2 -
 torch/utils/hipify/hipify_python.py          |  1 -
 22 files changed, 2 insertions(+), 376 deletions(-)
 delete mode 100644 aten/src/THCUNN/CMakeLists.txt
 delete mode 100644 aten/src/THCUNN/README.md
 delete mode 100644 aten/src/THCUNN/SharedMem.cuh
 delete mode 100644 aten/src/THCUNN/THCHalfAutoNumerics.cuh
 delete mode 100644 aten/src/THCUNN/THCUNN.h
 delete mode 100644 aten/src/THCUNN/common.h
 delete mode 100644 aten/src/THCUNN/doc/api_reference.md
 delete mode 100644 aten/src/THCUNN/doc/style_guidelines.md
 delete mode 100644 aten/src/THCUNN/generic/THCUNN.h

diff --git a/BUILD.bazel b/BUILD.bazel
index dab227590072d..5acbe4082d38e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -409,21 +409,6 @@ filegroup(
     ],
 )
 
-filegroup(
-    name = "thcunn_srcs_cu",
-    srcs = [
-        "aten/src/THCUNN/BCECriterion.cu.cc",
-        "aten/src/THCUNN/ELU.cu.cc",
-        "aten/src/THCUNN/HardTanh.cu.cc",
-        "aten/src/THCUNN/LeakyReLU.cu.cc",
-        "aten/src/THCUNN/MultiMarginCriterion.cu.cc",
-        "aten/src/THCUNN/SoftMarginCriterion.cu.cc",
-        "aten/src/THCUNN/SoftPlus.cu.cc",
-        "aten/src/THCUNN/SoftShrink.cu.cc",
-        "aten/src/THCUNN/Tanh.cu.cc",
-    ],
-)
-
 filegroup(
     name = "aten_srcs_cu",
     srcs = [
@@ -573,8 +558,6 @@ cc_library(
         "aten/src/THC/**/*.cpp",
         "aten/src/THC/*.cuh",
         "aten/src/THC/generic/*.cu.cc",
-        "aten/src/THCUNN/*.cuh",
-        "aten/src/THCUNN/generic/*.cu.cc",
     ],
     exclude = [
         "aten/src/ATen/Config.h",
@@ -716,7 +699,6 @@ cu_library(
     srcs = [
         ":aten_srcs_cu",
         ":thc_srcs_cu",
-        ":thcunn_srcs_cu",
     ],
     copts = ATEN_COPTS + torch_cuda_half_options,
     visibility = ["//visibility:public"],
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7d8659a8babff..e1a049cf9a979 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -242,8 +242,7 @@ into the repo directory.
 * [aten](aten) - C++ tensor library for PyTorch (no autograd support)
   * [src](aten/src) - [README](aten/src/README.md)
     * [TH](aten/src/TH)
-      [THC](aten/src/THC)
-      [THCUNN](aten/src/THCUNN) - Legacy library code from the original
+      [THC](aten/src/THC) - Legacy library code from the original
       Torch. Try not to add things here; we're slowly porting these to
       [native](aten/src/ATen/native).
       * generic - Contains actual implementations of operators,
diff --git a/README.md b/README.md
index 53ebfb1a4bec6..9b2a854ef3557 100644
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ We hope you never spend hours debugging your code because of bad stack traces or
 PyTorch has minimal framework overhead. We integrate acceleration libraries
 such as [Intel MKL](https://software.intel.com/mkl) and NVIDIA ([cuDNN](https://developer.nvidia.com/cudnn), [NCCL](https://developer.nvidia.com/nccl)) to maximize speed.
 At the core, its CPU and GPU Tensor and neural network backends
-(TH, THC, THNN, THCUNN) are mature and have been tested for years.
+are mature and have been tested for years.
 
 Hence, PyTorch is quite fast – whether you run small or large neural networks.
 
diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt
index 400b00f8e858a..7ba92a6decee7 100644
--- a/aten/CMakeLists.txt
+++ b/aten/CMakeLists.txt
@@ -80,21 +80,14 @@ if(USE_ROCM)
   # ATen proper)
   set(AT_CUDA_ENABLED 1)
   add_subdirectory(src/THH)
-  add_subdirectory(src/THHUNN)
   message("ROCm is enabled.")
 elseif(USE_CUDA)
   set(AT_CUDA_ENABLED 1)
   add_subdirectory(src/THC)
-  add_subdirectory(src/THCUNN)
 else()
   message("disabling CUDA because USE_CUDA is set false")
   set(AT_CUDA_ENABLED 0)
 endif()
-if(NOT USE_CUDA)
-  # we still parse THCUNN even if cuda is disabled to make sure to
-  # install it
-  install(FILES src/THCUNN/generic/THCUNN.h DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THCUNN/generic")
-endif()
 
 if(NOT USE_NNPACK)
   set(AT_NNPACK_ENABLED 0)
diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp
index af9a8a1b22153..1ec9f9c291c0a 100644
--- a/aten/src/ATen/TensorUtils.cpp
+++ b/aten/src/ATen/TensorUtils.cpp
@@ -282,7 +282,6 @@ bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) {
   return contig_if_nonempty;
 }
 
-// Correspond to THCUNN_check_dim_size/THNN_check_dim_size
 void check_dim_size(
     const Tensor& tensor,
     int64_t dim,
diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h
index 8e84ecaa4a3a2..1417174a1f6d3 100644
--- a/aten/src/ATen/TensorUtils.h
+++ b/aten/src/ATen/TensorUtils.h
@@ -144,7 +144,6 @@ TORCH_API void* maybe_data_ptr(const TensorArg& tensor);
 // on whether a subgeometry is contiguous.
 TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides);
 
-// Correspond to THCUNN_check_dim_size/THNN_check_dim_size
 TORCH_API void check_dim_size(
     const Tensor& tensor,
     int64_t dim,
diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
index 4ead51e6bd26e..0ad6dc8256ff0 100644
--- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
+++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
@@ -7,7 +7,6 @@
 #include <ATen/ExpandUtils.h>
 #include <THC/THC.h>
 #include <THC/THCTensor.hpp>
-#include <THCUNN/THCUNN.h>
 #undef THNN_
 #undef THCIndexTensor_
 #include <ATen/DeviceGuard.h>
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 836504a729fea..91a61b04b8590 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -2,9 +2,6 @@
 
 #include <ATen/ATen.h>
 
-// Contents of this file are copied from THCUNN/common.h for the ease of porting
-// THCUNN functions into ATen.
-
 namespace at { namespace cuda { namespace detail {
 
 // CUDA: grid stride looping
diff --git a/aten/src/README.md b/aten/src/README.md
index e3e01515afb0f..183ec09a97efd 100644
--- a/aten/src/README.md
+++ b/aten/src/README.md
@@ -7,7 +7,6 @@ multiple variants of the library, summarized here:
 * TH = TorcH
 * THC = TorcH Cuda
 * THCS = TorcH Cuda Sparse (now defunct)
-* THCUNN = TorcH CUda Neural Network (see cunn)
 * THNN = TorcH Neural Network (now defunct)
 * THS = TorcH Sparse (now defunct)
 
diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt
deleted file mode 100644
index f84005e7e92f6..0000000000000
--- a/aten/src/THCUNN/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS}
-PARENT_SCOPE)
-
-set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE}
-  "${CMAKE_CURRENT_SOURCE_DIR}"
-PARENT_SCOPE)
-
-install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-  DESTINATION ${ATEN_INSTALL_INCLUDE_SUBDIR}
-  FILES_MATCHING PATTERN "*.h" PATTERN "*.cuh")
diff --git a/aten/src/THCUNN/README.md b/aten/src/THCUNN/README.md
deleted file mode 100644
index 5c4662322cbb5..0000000000000
--- a/aten/src/THCUNN/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# THCUNN
-
-THCUNN is a library that gathers nn's C implementations of neural network modules. It's entirely free of Lua dependency and therefore can be used in any application that has a C FFI. Please note that it only contains quite low level functions; most users will want to use ATen, which provides a C++ wrapper around these functions.
-
-
-Looking to add an implementation?  Consider writing an ATen native function
-instead!  See [../ATen/native](../ATen/native).
-
-## Links
-
-* [API reference](doc/api_reference.md)
-* [Style guidelines](doc/style_guidelines.md)
-
-## API
-
-THCUNN is a purely functional library. It provides 2-3 functions for each module, that perform the most important operations:
-
-* **updateOutput** - applies the module to an input
-* **updateGradInput** - accepts gradient w.r.t. output and previous module input, and computes a gradient w.r.t. that input
-* **accGradParameters** - *(optional, only modules with parameters)* accepts gradient w.r.t. output and previous module input, and computes gradient w.r.t. the parameters
-
-For information on argument types please check the [API reference](doc/api_reference.md).
-
-## Developer docs
-
-* [Style guidelines](doc/style_guidelines.md)
diff --git a/aten/src/THCUNN/SharedMem.cuh b/aten/src/THCUNN/SharedMem.cuh
deleted file mode 100644
index 8d83d9f9a9c58..0000000000000
--- a/aten/src/THCUNN/SharedMem.cuh
+++ /dev/null
@@ -1,43 +0,0 @@
-// Based on the simpleTempltes CUDA example
-
-#ifndef THCUNN_SHAREDMEM_H
-#define THCUNN_SHAREDMEM_H
-
-template <typename T>
-struct SharedMem {
-  __device__ T *getPointer()
-  {
-    extern __device__ void error(void);
-    error();
-    return NULL;
-  }
-};
-
-template <>
-struct SharedMem<half>
-{
-  __device__ half *getPointer() {
-    extern __shared__ half s_half[];
-    return s_half;
-  }
-};
-
-template <>
-struct SharedMem<float>
-{
-  __device__ float *getPointer() {
-    extern __shared__ float s_float[];
-    return s_float;
-  }
-};
-
-template <>
-struct SharedMem<double>
-{
-  __device__ double *getPointer() {
-    extern __shared__ double s_double[];
-    return s_double;
-  }
-};
-
-#endif
diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh
deleted file mode 100644
index 62691b9df7c21..0000000000000
--- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef THC_HALF_AUTO_NUMERICS_INC
-#define THC_HALF_AUTO_NUMERICS_INC
-
-#include <TH/THHalf.h>
-#include <THC/THCNumerics.cuh>
-
-// WARNING: THCNumerics is being deprecated. Read the comments and function usage
-//          in THCNumerics to learn about the deprecation
-//
-// Half numerics functions defined as free functions, so cunn code can be
-// written generically, i.e. without excessive calling of THCNumerics<THHalf> functions.
-
-// these functions should move to THCNumerics
-
-inline __host__ __device__ THHalf fmaxType(THHalf x, THHalf y) {
-  return THCNumerics<THHalf>::ge(x, y) ? x : y;
-}
-
-inline __host__ __device__ float fmaxType(float x, THHalf y) {
-  return fmaxf(x, ScalarConvert<THHalf, float>::to(y));
-}
-
-inline __host__ __device__ float fmaxType(float x, float y) {
-  return fmaxf(x, y);
-}
-
-inline __host__ __device__ double fmaxType(double x, double y) {
-  return fmax(x, y);
-}
-
-
-// arithmetic functions
-
-inline __host__ __device__ THHalf pow(THHalf a, THHalf b) {
-  return THCNumerics<THHalf>::pow(a, b);
-}
-
-#endif
diff --git a/aten/src/THCUNN/THCUNN.h b/aten/src/THCUNN/THCUNN.h
deleted file mode 100644
index a4392ddaba166..0000000000000
--- a/aten/src/THCUNN/THCUNN.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#include <THC/THC.h>
-
-#define THCIndexTensor THCudaLongTensor
-#define THCIndexTensor_(NAME) THCudaLongTensor_ ## NAME
-typedef int64_t THCIndex_t;
-
-#define THNN_(NAME) TH_CONCAT_3(THNN_, CReal, NAME)
-
-#include <THCUNN/generic/THCUNN.h>
-#include <THC/THCGenerateFloatTypes.h>
-
-#include <THCUNN/generic/THCUNN.h>
-#include <THC/THCGenerateBFloat16Type.h>
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
deleted file mode 100644
index 69b7f3a4d3fa8..0000000000000
--- a/aten/src/THCUNN/common.h
+++ /dev/null
@@ -1,83 +0,0 @@
-#ifndef THCUNN_COMMON_H
-#define THCUNN_COMMON_H
-
-#define THCUNN_assertSameGPU(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
-  "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
-
-// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
-
-// CUDA: number of blocks for threads.
-inline int GET_BLOCKS(const int64_t N)
-{
-  // Round up division for positive number
-  auto block_num = N / CUDA_NUM_THREADS + (N % CUDA_NUM_THREADS == 0 ? 0 : 1);
-
-  constexpr int64_t max_int = std::numeric_limits<int>::max();
-  THAssertMsg(block_num <= max_int, "Can't schedule too many blocks on CUDA device");
-
-  return static_cast<int>(block_num);
-}
-
-#define THCUNN_resizeAs_indices(STATE, I1, I2)              \
-  if (!I1->sizes().equals(I2->sizes()))                     \
-  { \
-    THCudaLongTensor_resizeAs(STATE, I1, I2);               \
-  }
-
-#define THCUNN_check_shape(STATE, I1, I2)                 \
-  if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2))        \
-  { \
-       THCDescBuff s1 = THCTensor_(sizeDesc)(STATE, I1);  \
-       THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2);  \
-       THError(#I1 " and " #I2 " shapes do not match: "   \
-               #I1 " %s, " #I2 " %s", s1.str, s2.str);    \
-  }
-
-
-#define THCUNN_check_shape_indices(STATE, I1, I2)              \
-  if (!I1->sizes().equals(I2->sizes()))                        \
-  { \
-       THCDescBuff s1 = THCIndexTensor_(sizeDesc)(STATE, I1);  \
-       THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2);       \
-       THError(#I1 " and " #I2 " shapes do not match: "        \
-               #I1 " %s, " #I2 " %s", s1.str, s2.str);         \
-  }
-
-#define THCUNN_check_nElement(STATE, I1, I2)                \
-  if (I1 != NULL && I2 != NULL ) {                          \
-    ptrdiff_t n1 = THCTensor_(nElement)(STATE, I1);              \
-    ptrdiff_t n2 = THCTensor_(nElement)(STATE, I2);              \
-    if (n1 != n2)                                           \
-    {        \
-      THCDescBuff s1 = THCTensor_(sizeDesc)(state, I1);     \
-      THCDescBuff s2 = THCTensor_(sizeDesc)(state, I2);     \
-      THError(#I1 " and " #I2 " have different number of elements: "        \
-              #I1 "%s has %ld elements, while "             \
-              #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
-    }        \
-  }
-
-#define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \
-  if (THCTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM ||             \
-      THCTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) {        \
-      THCDescBuff s1 = THCTensor_(sizeDesc)(state, T);       \
-      THError("Need " #T " of dimension %d and " #T ".size[%d] == %d"        \
-              " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
-  }
-
-#define THCUNN_check_dim_size_indices(STATE, T, DIM, DIM_SIZE, SIZE)  \
-  if (THCIndexTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM ||                 \
-      THCIndexTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) {            \
-      THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, T);           \
-      THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
-              " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
-  }
-
-#define THCUNN_argCheck(STATE, COND, ARG, T, FORMAT) \
-  if (!(COND)) { \
-    THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \
-    THArgCheck(COND, ARG, FORMAT, s1.str);           \
-  }
-
-#endif
diff --git a/aten/src/THCUNN/doc/api_reference.md b/aten/src/THCUNN/doc/api_reference.md
deleted file mode 100644
index 3f49b9b6d1ce6..0000000000000
--- a/aten/src/THCUNN/doc/api_reference.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# API docs
-
-This document describes the conventions behind the THCUNN API.
-
-### The API
-
-All functions provided by THCUNN are stored in `aten/src/THCUNN/generic/THCUNN.h`.
-Look at this file.
-
-### Note on function names
-
-Please remember, that because C doesn't support function overloading, functions taking different tensor types have different names. So e.g. for an Abs module, there are actually two updateOutput functions:
-
-* `void THNN_FloatAbs_updateOutput(...)`
-* `void THNN_DoubleAbs_updateOutput(...)`
-
-In these docs such function will be referred to as `void THCUNN_Abs_updateOutput(...)`, and it's up to developer to add a type prefix. `real` is an alias for that type.
-
-### Argument types
-
-Some arguments have additional tags placed in square brackets in their header declarations:
-
-* **[OUT]** - This is the output argument. It will be reshaped if needed.
-* **[OPTIONAL]** - This argument is optional and can be safely set to NULL
-* **[BUFFER]** - A buffer. `updateGradInput` and `accGradParameters` should get the same buffers that were used in `updateOutput` call.
-* **[MODIFIED]** - Some functions accept an `inplace` flag. If set to true, this argument might be modified (in addition to the output).
diff --git a/aten/src/THCUNN/doc/style_guidelines.md b/aten/src/THCUNN/doc/style_guidelines.md
deleted file mode 100644
index 086db8bcbe28a..0000000000000
--- a/aten/src/THCUNN/doc/style_guidelines.md
+++ /dev/null
@@ -1,64 +0,0 @@
-## API design guidelines
-
-Functions should return `void`.
-
-All functions should accept arguments in the following order. `...` represent any module-specific parameters or buffers, disregarding whether they are used for writing or reading. Arguments in `...` below should be ordered like this:
-```
-[weight], [bias], [any buffers], [additional arguments], [optional arguments]
-```
-
-### Modules
-```
-updateOutput: state, input, output, ...
-updateGradInput: state, input, gradOutput, gradInput, ...
-accGradParameters: state, input, gradOutput, [gradWeight], [gradBias], ...
-```
-
-e.g.
-```C
-void THNN_(ClassNLLCriterion_updateGradInput)(
-           THCState *state,
-           THCTensor *input,
-           THCIndexTensor *target,
-           THCTensor *gradOutput,
-           THCTensor *gradInput,
-           int64_t reduction,
-           THCTensor *weights,
-           THCTensor *total_weight,
-           int64_t ignore_index)
-```
-
-### Criterions
-```
-updateOutput: state, input, target, output, ...
-updateGradInput: state, input, target, gradInput, ...
-```
-
-e.g.
-
-```C
-void THNN_(ClassNLLCriterion_updateOutput)(
-           THCState *state,
-           THCTensor *input,
-           THCIndexTensor *target,
-           THCTensor *output,
-           int64_t reduction,
-           THCTensor *weights,
-           THCTensor *total_weight,
-           int64_t ignore_index)
-```
-
-## Code style guide
-
-```C
-void THNN_(GatedLinear_updateOutput)(
-           THCState *state,
-           THCTensor *input,
-           THCTensor *output,
-           int dim)
-//<- 10 ->
-```
-
-All arguments should start on a new line after function name, and they should be indented using 10 spaces.
-
-Use 2 spaces for block indentation.
diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h
deleted file mode 100644
index d624fdd090177..0000000000000
--- a/aten/src/THCUNN/generic/THCUNN.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "THCUNN/generic/THCUNN.h"
-#else
-
-#include <ATen/core/Reduction.h>
-#include <ATen/Generator.h>
-
-TORCH_CUDA_CU_API void THNN_(MultiMarginCriterion_updateOutput)(
-    THCState* state,
-    THCTensor* input,
-    THCIndexTensor* target,
-    THCTensor* output,
-    int64_t reduction,
-    int p,
-    THCTensor* weights, // [OPTIONAL]
-    accreal margin);
-
-TORCH_CUDA_CU_API void THNN_(MultiMarginCriterion_updateGradInput)(
-    THCState* state,
-    THCTensor* input,
-    THCIndexTensor* target,
-    THCTensor* gradOutput,
-    THCTensor* gradInput,
-    int64_t reduction,
-    int p,
-    THCTensor* weights, // [OPTIONAL]
-    accreal margin);
-
-#endif
diff --git a/setup.py b/setup.py
index 8135e1e4c2f7c..a20098232af3c 100644
--- a/setup.py
+++ b/setup.py
@@ -1028,8 +1028,6 @@ def print_box(msg):
                 'include/THC/*.cuh',
                 'include/THC/*.h*',
                 'include/THC/generic/*.h',
-                'include/THCUNN/*.cuh',
-                'include/THCUNN/generic/*.h',
                 'include/THH/*.cuh',
                 'include/THH/*.h*',
                 'include/THH/generic/*.h',
diff --git a/tools/README.md b/tools/README.md
index a28affa5f30aa..e4aba38afd851 100644
--- a/tools/README.md
+++ b/tools/README.md
@@ -15,10 +15,6 @@ Modern infrastructure:
     to import arbitrary Python files in a script, without having to add
     them to the PYTHONPATH first.
 
-Legacy infrastructure (we should kill this):
-* [cwrap](cwrap) - Implementation of legacy code generation for THNN/THCUNN.
-  This is used by nnwrap.
-
 Build system pieces:
 
 * [setup_helpers](setup_helpers) - Helper code for searching for
diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py
index 8cfecda82e328..70f7e7a83e1ec 100755
--- a/tools/amd_build/build_amd.py
+++ b/tools/amd_build/build_amd.py
@@ -81,12 +81,10 @@
     "aten/src/ATen/native/sparse/cuda/*",
     "aten/src/ATen/native/quantized/cuda/*",
     "aten/src/THC/*",
-    "aten/src/THCUNN/*",
     "aten/src/ATen/test/*",
     # CMakeLists.txt isn't processed by default, but there are a few
     # we do want to handle, so explicitly specify them
     "aten/src/THC/CMakeLists.txt",
-    "aten/src/THCUNN/CMakeLists.txt",
     "torch/*",
     "tools/autograd/templates/python_variable_methods.cpp",
 ]
diff --git a/torch/utils/hipify/hipify_python.py b/torch/utils/hipify/hipify_python.py
index 6697f1e014cf7..ad2903f7ad655 100644
--- a/torch/utils/hipify/hipify_python.py
+++ b/torch/utils/hipify/hipify_python.py
@@ -750,7 +750,6 @@ def repl(m):
                 or f.startswith("ATen/native/quantized/cuda")
                 or f.startswith("ATen/native/sparse/cuda")
                 or f.startswith("THC/")
-                or f.startswith("THCUNN/")
                 or (f.startswith("THC") and not f.startswith("THCP"))
             ):
                 return templ.format(get_hip_file_path(m.group(1), is_pytorch_extension))

From a709ab34a8d847cae506e221ad8c8efa1eadc828 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Mon, 23 Aug 2021 12:41:32 -0700
Subject: [PATCH 136/530] [nnc] Re-enable CPU fusion" (#63665)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63665

This reverts commit 125e2d02e575612eb427104e7c67f1c28f090db8.

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30471646

Pulled By: bertmaher

fbshipit-source-id: 4189869566f03b5f9ada78d78830f6a34946eed6
---
 torch/_C/__init__.pyi.in                   |  2 ++
 torch/csrc/jit/codegen/fuser/executor.cpp  |  5 +++--
 torch/csrc/jit/codegen/fuser/interface.cpp |  8 ++------
 torch/csrc/jit/passes/graph_fuser.cpp      | 12 +++++++++++-
 torch/csrc/jit/passes/graph_fuser.h        |  3 +++
 torch/csrc/jit/python/init.cpp             |  2 ++
 torch/testing/_internal/jit_utils.py       |  2 ++
 7 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 4d0245c7786af..0b6bb6b64e0a4 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -208,6 +208,7 @@ def _jit_get_schemas_for_operator(name :str) -> List[FunctionSchema]: ...
 def _jit_check_alias_annotation(g: Graph, args: Tuple[Any, ...], unqualified_op_name: str): ...
 def _jit_can_fuse_on_cpu() -> _bool: ...
 def _jit_can_fuse_on_gpu() -> _bool: ...
+def _jit_can_fuse_on_cpu_legacy() -> _bool: ...
 def _debug_get_fusion_group_inlining() -> _bool: ...
 def _debug_set_fusion_group_inlining(enable: _bool): ...
 def _jit_texpr_fuser_enabled() -> _bool: ...
@@ -215,6 +216,7 @@ def _jit_nvfuser_enabled() -> _bool: ...
 def _llvm_enabled() -> _bool: ...
 def _jit_override_can_fuse_on_cpu(override: _bool): ...
 def _jit_override_can_fuse_on_gpu(override: _bool): ...
+def _jit_override_can_fuse_on_cpu_legacy(override: _bool): ...
 def _jit_set_symbolic_shapes_test_mode(override: _bool): ...
 def _jit_symbolic_shapes_test_mode_enabled() -> _bool: ...
 def _jit_set_texpr_fuser_enabled(enable: _bool): ...
diff --git a/torch/csrc/jit/codegen/fuser/executor.cpp b/torch/csrc/jit/codegen/fuser/executor.cpp
index b260e48b16c3f..46f2f41d07e36 100644
--- a/torch/csrc/jit/codegen/fuser/executor.cpp
+++ b/torch/csrc/jit/codegen/fuser/executor.cpp
@@ -11,6 +11,7 @@
 #include <torch/csrc/jit/codegen/fuser/kernel_cache.h>
 #include <torch/csrc/jit/codegen/fuser/kernel_spec.h>
 #include <torch/csrc/jit/codegen/fuser/tensor_info.h>
+#include <torch/csrc/jit/passes/graph_fuser.h>
 
 #include <algorithm>
 #include <iostream> // TODO: remove, debugging only
@@ -327,7 +328,7 @@ void launchFusion(
 
 bool runFusion(const int64_t key, Stack& stack, std::string* code_out) {
   // Short-circuits if fusion isn't enabled
-  if (!canFuseOnCPU() && !canFuseOnGPU())
+  if (!canFuseOnCPULegacy() && !canFuseOnGPU())
     return false;
 
   // Acquires the FusionSpec
@@ -362,7 +363,7 @@ bool runFusion(const int64_t key, Stack& stack, std::string* code_out) {
   // Attempts to run fallback if device fusion is disabled
   if (device.is_cuda() && !canFuseOnGPU())
     return false;
-  if (device.is_cpu() && !canFuseOnCPU())
+  if (device.is_cpu() && !canFuseOnCPULegacy())
     return false;
   if (device.is_xpu())
     return false;
diff --git a/torch/csrc/jit/codegen/fuser/interface.cpp b/torch/csrc/jit/codegen/fuser/interface.cpp
index ec67c4bd83773..ef7e9e0b629d5 100644
--- a/torch/csrc/jit/codegen/fuser/interface.cpp
+++ b/torch/csrc/jit/codegen/fuser/interface.cpp
@@ -8,15 +8,12 @@
 #include <c10/util/Flags.h>
 #include <stdexcept>
 
-C10_DEFINE_bool(torch_jit_enable_cpu_fusion, false, "enable cpu fusion");
-
 namespace torch {
 namespace jit {
 
 namespace detail {
 
-// Note: CPU fusion is currently disabled due to test flakiness
-#if defined(FBCODE_CAFFE2)
+#ifdef TORCH_ENABLE_LLVM
 bool cpu_fuser_enabled = true;
 #else
 bool cpu_fuser_enabled = false;
@@ -37,8 +34,7 @@ void runFusion(const int64_t key, Stack& stack) {
 }
 
 bool canFuseOnCPU() {
-  return fuser::hasFusionBackend(DeviceType::CPU) &&
-      (detail::cpu_fuser_enabled || FLAGS_torch_jit_enable_cpu_fusion);
+  return fuser::hasFusionBackend(DeviceType::CPU) && detail::cpu_fuser_enabled;
 }
 
 bool canFuseOnGPU() {
diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp
index f7dd466de4ff4..653f9fec08b32 100644
--- a/torch/csrc/jit/passes/graph_fuser.cpp
+++ b/torch/csrc/jit/passes/graph_fuser.cpp
@@ -183,7 +183,7 @@ struct GraphFuser {
       return !strict_fuser_check;
     }
     if ((*device).is_cpu()) {
-      return canFuseOnCPU();
+      return canFuseOnCPULegacy();
     } else if ((*device).is_cuda()) {
       return canFuseOnGPU();
     } else if ((*device).is_xpu()) {
@@ -1244,6 +1244,16 @@ void PeepholeOptimizeShapeExpressions(Block* block, AliasDb* db) {
 
 } // anonymous namespace
 
+static bool cpu_fuser_enabled_legacy = false;
+
+bool canFuseOnCPULegacy() {
+  return cpu_fuser_enabled_legacy;
+}
+
+void overrideCanFuseOnCPULegacy(bool value) {
+  cpu_fuser_enabled_legacy = value;
+}
+
 void FuseGraph(std::shared_ptr<Graph>& graph, bool strict_fuser_check) {
   AliasDb db(graph);
   GraphFuser(&db, graph->block(), strict_fuser_check).run();
diff --git a/torch/csrc/jit/passes/graph_fuser.h b/torch/csrc/jit/passes/graph_fuser.h
index 0cdcc2e20f469..aafb442eafb6f 100644
--- a/torch/csrc/jit/passes/graph_fuser.h
+++ b/torch/csrc/jit/passes/graph_fuser.h
@@ -5,6 +5,9 @@
 namespace torch {
 namespace jit {
 
+TORCH_API bool canFuseOnCPULegacy();
+TORCH_API void overrideCanFuseOnCPULegacy(bool value);
+
 // NB: Be sure to run DCE before fusion, because dead instructions
 // can prevent fusion opportunities from being exploited.
 // On Windows will noop, NYI
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index c92ab1b46e41c..baea47d63ed18 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -590,6 +590,8 @@ void initJITBindings(PyObject* module) {
       .def("_jit_override_can_fuse_on_gpu", &overrideCanFuseOnGPU)
       .def("_jit_can_fuse_on_cpu", &canFuseOnCPU)
       .def("_jit_can_fuse_on_gpu", &canFuseOnGPU)
+      .def("_jit_can_fuse_on_cpu_legacy", &canFuseOnCPULegacy)
+      .def("_jit_override_can_fuse_on_cpu_legacy", &overrideCanFuseOnCPULegacy)
       .def(
           "_jit_differentiate",
           [](Graph& g) {
diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py
index 6086572039033..7f9fb976934d3 100644
--- a/torch/testing/_internal/jit_utils.py
+++ b/torch/testing/_internal/jit_utils.py
@@ -668,11 +668,13 @@ def wrapper(func):
 
 def enable_cpu_fuser(fn):
     def wrapper(*args, **kwargs):
+        torch._C._jit_override_can_fuse_on_cpu_legacy(True)
         torch._C._jit_override_can_fuse_on_cpu(True)
         torch._C._jit_set_te_must_use_llvm_cpu(False)
         try:
             fn(*args, **kwargs)
         finally:
+            torch._C._jit_override_can_fuse_on_cpu_legacy(False)
             torch._C._jit_override_can_fuse_on_cpu(False)
             torch._C._jit_set_te_must_use_llvm_cpu(True)
     return wrapper

From 0bc7fef406447cbafe00b0337527ae37e315a064 Mon Sep 17 00:00:00 2001
From: Hao Lu <hlu@fb.com>
Date: Mon, 23 Aug 2021 12:53:42 -0700
Subject: [PATCH 137/530] [Static Runtime] Remove unused fusion patterns
 (#63636)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63636

Reviewed By: d1jang

Differential Revision: D30446573

fbshipit-source-id: 3abb7f697380f3b4e865b98c594de359b5e26b96
---
 torch/csrc/jit/runtime/static/passes.cpp | 115 +++++------------------
 1 file changed, 23 insertions(+), 92 deletions(-)

diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp
index bbd7dd17f2feb..2e9eb5746d276 100644
--- a/torch/csrc/jit/runtime/static/passes.cpp
+++ b/torch/csrc/jit/runtime/static/passes.cpp
@@ -165,125 +165,57 @@ C10_UNUSED void ClipRangesGather(std::shared_ptr<torch::jit::Graph>& graph) {
   fuse.runOnGraph(graph);
 }
 
+C10_UNUSED void PrecomputeMultiplierShiftForSigridHash(
+    std::shared_ptr<torch::jit::Graph>& graph) {
+  std::string pattern = R"IR(
+    graph(%a, %b, %c, %d):
+        %y0 : Tensor = fb::sigrid_hash(%a, %b, %c, %d)
+        return (%y0)
+  )IR";
+  std::string split_pattern = R"IR(
+    graph(%a, %b, %c, %d):
+        %y0 : Tensor = fb::sigrid_hash_compute_multipler_shift(%c)
+        %y2 : Tensor = fb::sigrid_hash_precompute(%a, %b, %c, %y0, %d)
+        return (%y2)
+  )IR";
+  SubgraphRewriter fuse;
+  fuse.RegisterRewritePattern(pattern, split_pattern);
+  fuse.runOnGraph(graph);
+}
+
 C10_UNUSED
 void ClipRangesGatherSigridHash(std::shared_ptr<torch::jit::Graph>& graph) {
   // TODO:: check restrictions for inputs; outputs not used elsewhere
-  std::string pattern_1 = R"IR(
-    graph(%a, %b, %c, %d, %e, %f, %g):
-        %y0 : Tensor, %y1 : Tensor = fb::clip_ranges_gather_lengths_to_offsets(%a, %b, %c, %d)
-        %y2 : Tensor = fb::sigrid_hash(%y0, %e, %f, %g)
-        return (%y2, %y1))IR";
-  std::string fused_pattern_1 = R"IR(
-    graph(%a, %b, %c, %d, %e, %f, %g):
-        %off : Tensor, %out : Tensor = fb::clip_ranges_gather_sigrid_hash_offsets(%b, %a, %c, %e, %f, %g, %d)
-        return (%out, %off))IR";
-
-  std::string pattern_2 = R"IR(
+  std::string pattern = R"IR(
     graph(%a, %b, %c, %d, %e, %f, %g, %h):
         %y0 : Tensor, %y1 : Tensor = fb::clip_ranges_gather_lengths_to_offsets(%a, %b, %c, %d)
         %y2 : Tensor = fb::sigrid_hash_precompute(%y0, %e, %f, %g, %h)
         return (%y2, %y1))IR";
-  std::string fused_pattern_2 = R"IR(
+  std::string fused_pattern = R"IR(
     graph(%a, %b, %c, %d, %e, %f, %g, %h):
         %off : Tensor, %out : Tensor = fb::clip_ranges_gather_sigrid_hash_precompute_offsets(%b, %a, %c, %e, %f, %g, %h, %d)
         return (%out, %off))IR";
   SubgraphRewriter fuse;
-  fuse.RegisterRewritePattern(pattern_1, fused_pattern_1);
-  fuse.runOnGraph(graph);
-
-  fuse.RegisterRewritePattern(pattern_2, fused_pattern_2);
+  fuse.RegisterRewritePattern(pattern, fused_pattern);
   fuse.runOnGraph(graph);
 }
 
 C10_UNUSED void ClipRangesGatherRangesSigridHash(
     std::shared_ptr<torch::jit::Graph>& graph) {
-  std::string pattern_1 = R"IR(
-    graph(%a, %b, %c, %d, %e, %f):
-        %y0 : Tensor = fb::clip_ranges(%b, %c)
-        %y1 : Tensor, %y2 : Tensor = fb::gather_ranges(%a, %y0)
-        %y3 : Tensor = fb::sigrid_hash(%y1, %d, %e, %f)
-        return (%y3, %y2))IR";
-  std::string fused_pattern_1 = R"IR(
-    graph(%a, %b, %c, %d, %e, %f):
-        %off : Tensor, %out : Tensor = fb::clip_ranges_gather_sigrid_hash_v3(%b, %a, %c, %d, %e, %f)
-        return (%out, %off))IR";
-
-  std::string pattern_2 = R"IR(
+  std::string pattern = R"IR(
     graph(%a, %b, %c, %d, %e, %f, %g):
         %y0 : Tensor = fb::clip_ranges(%b, %c)
         %y1 : Tensor, %y2 : Tensor = fb::gather_ranges(%a, %y0)
         %y3 : Tensor = fb::sigrid_hash_precompute(%y1, %d, %e, %f, %g)
         return (%y3, %y2))IR";
-  std::string fused_pattern_2 = R"IR(
+  std::string fused_pattern = R"IR(
     graph(%a, %b, %c, %d, %e, %f, %g):
         %off : Tensor, %out : Tensor = fb::clip_ranges_gather_sigrid_hash_precompute_v3(%b, %a, %c, %d, %e, %f, %g)
         return (%out, %off))IR";
 
-  SubgraphRewriter fuse;
-  fuse.RegisterRewritePattern(pattern_1, fused_pattern_1);
-  fuse.runOnGraph(graph);
-
-  fuse.RegisterRewritePattern(pattern_2, fused_pattern_2);
-  fuse.runOnGraph(graph);
-}
-
-C10_UNUSED void PrecomputeMultiplierShiftForSigridHash(
-    std::shared_ptr<torch::jit::Graph>& graph) {
-  std::string pattern = R"IR(
-    graph(%a, %b, %c, %d):
-        %y0 : Tensor = fb::sigrid_hash(%a, %b, %c, %d)
-        return (%y0)
-  )IR";
-  std::string split_pattern = R"IR(
-    graph(%a, %b, %c, %d):
-        %y0 : Tensor = fb::sigrid_hash_compute_multipler_shift(%c)
-        %y2 : Tensor = fb::sigrid_hash_precompute(%a, %b, %c, %y0, %d)
-        return (%y2)
-  )IR";
-  SubgraphRewriter fuse;
-  fuse.RegisterRewritePattern(pattern, split_pattern);
-  fuse.runOnGraph(graph);
-}
-
-C10_UNUSED void ClipRangesGatherRangesX2SigridHash(
-    std::shared_ptr<torch::jit::Graph>& graph) {
-  // Placeholder is a dummy op used to capture the first subgraph
-  std::string pattern = R"IR(
-    graph(%ranges, %values, %max_length, %salt, %max_value, %hash_into_int32):
-        %clipped : Tensor = fb::clip_ranges(%ranges, %max_length)
-        %output : Tensor, %unused : Tensor = fb::gather_ranges(%values, %clipped)
-        %sigrid_hash_out : Tensor = fb::sigrid_hash(%output, %salt, %max_value, %hash_into_int32)
-        return (%sigrid_hash_out, %clipped))IR";
-  std::string fused_pattern = R"IR(
-    graph(%ranges, %values, %max_length, %salt, %max_value, %hash_into_int32):
-        %sigrid_hash_out : Tensor, %clipped : Tensor = fb::placeholder(%ranges, %values, %max_length, %salt, %max_value, %hash_into_int32)
-        return (%sigrid_hash_out, %clipped))IR";
-
-  // the second gather_ranges can be eliminated because the `lengths` is
-  // produces is identical to the lengths produced by
-  // clip_ranges_gather_sigrid_hash_v3 (caveat, the fused ops makes some
-  // simplifying assumptions about the ranges input)
-  std::string pattern2 = R"IR(
-    graph(%gather2_values, %ranges, %values, %max_length, %salt, %max_value, %hash_into_int32):
-        %sigrid_hash_out : Tensor, %clipped : Tensor = fb::placeholder(%ranges, %values, %max_length, %salt, %max_value, %hash_into_int32)
-        %unused : Tensor, %lengths : Tensor = fb::gather_ranges(%gather2_values, %clipped)
-        return (%lengths, %sigrid_hash_out))IR";
-
-  std::string fused_pattern2 = R"IR(
-    graph(%gather2_values, %ranges, %values, %max_length, %salt, %max_value, %hash_into_int32):
-        %lengths : Tensor, %sigrid_hash_out : Tensor = fb::clip_ranges_gather_sigrid_hash_v3(%ranges, %values, %max_length, %salt, %max_value, %hash_into_int32)
-        return (%lengths, %sigrid_hash_out))IR";
-
   SubgraphRewriter fuse;
   fuse.RegisterRewritePattern(pattern, fused_pattern);
   fuse.runOnGraph(graph);
-
-  fuse.RegisterRewritePattern(pattern2, fused_pattern2);
-  fuse.runOnGraph(graph);
-
-  // reverse the ops that got fused in step 1 but not in step2
-  fuse.RegisterRewritePattern(fused_pattern, pattern);
-  fuse.runOnGraph(graph);
 }
 
 C10_UNUSED void ClipRangesGatherRangesX2SigridHashPrecompute(
@@ -349,7 +281,6 @@ void FuseInferenceOpsForSparseNN(std::shared_ptr<torch::jit::Graph>& graph) {
   ClipRangesGatherSigridHash(graph);
   ClipRangesGatherRangesSigridHash(graph);
 
-  ClipRangesGatherRangesX2SigridHash(graph);
   ClipRangesGatherRangesX2SigridHashPrecompute(graph);
 
   // prioritize clip_ranges+gather_ranges+sigrid_hash fusion over

From a781340bf7f610c14c42bf2e0a5f06d9b7e67193 Mon Sep 17 00:00:00 2001
From: Zeina Migeed <migeedz@fb.com>
Date: Mon, 23 Aug 2021 14:09:10 -0700
Subject: [PATCH 138/530] Add equality constraints for some acc opeartions for
 symbolic inference (#63689)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63689

Test Plan:
buck run mode/opt-clang caffe2/torch/fb/model_transform/experimental:fx_ir_lower_inline_cvr -- \
    --action=lower_and_run \
    --filename=inline_cvr_7x_dec_2020.model \
    --print_glow_glog=True

Reviewed By: jamesr66a

Differential Revision: D30462113

fbshipit-source-id: 0b2a1ce9770561248527d47c07b80112491dc949
---
 .../experimental/graph_gradual_typechecker.py   | 12 +++++++++---
 torch/fx/experimental/unify_refinements.py      | 17 +++++++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py
index 5ce53a7ff6896..6e05f918e810e 100644
--- a/torch/fx/experimental/graph_gradual_typechecker.py
+++ b/torch/fx/experimental/graph_gradual_typechecker.py
@@ -436,7 +436,7 @@ def adaptiveavgpool2d_inference_rule(n: Node, module_instance):
 def flatten_check(tensor_type, start_dim, end_dim):
     l = len(tensor_type.__args__)
 
-    start_dim = l if start_dim == -1 else start_dim
+    start_dim = l if start_dim == -1 else abs(start_dim)
     end_dim = l + end_dim + 1 if end_dim < 0 else end_dim + 1
 
     if 0 <= start_dim <= (l - 1) and 0 <= end_dim <= l and start_dim < end_dim:
@@ -668,6 +668,10 @@ def replace_dyn_with_fresh_var(self, typ):
         elif isinstance(typ, TensorType):
             new_args = [self.replace_dyn_with_fresh_var(a) for a in typ.__args__]
             return TensorType(tuple(new_args))
+        elif isinstance(typ, list):
+            return [self.replace_dyn_with_fresh_var(t) for t in typ]
+        elif isinstance(typ, tuple):
+            return (self.replace_dyn_with_fresh_var(t) for t in typ)
         else:
             return typ
 
@@ -698,8 +702,10 @@ def refine_node(self, n: Node):
                 pass
 
         if n.op == 'output':
-            assert isinstance(n.args[0], Node)
-            n.type = n.args[0].type
+            def get_node_type(a):
+                return a.type
+            n.type = torch.fx.node.map_arg(n.args[0], get_node_type)
+            return n.type
 
         else:
             pass
diff --git a/torch/fx/experimental/unify_refinements.py b/torch/fx/experimental/unify_refinements.py
index c8561041472ae..5074377ebf2dc 100644
--- a/torch/fx/experimental/unify_refinements.py
+++ b/torch/fx/experimental/unify_refinements.py
@@ -52,6 +52,8 @@ def substitute_solution_one_type(mapping, t):
     if isinstance(t, Var):
         if t in mapping.keys():
             return mapping[t]
+        else:
+            return t
 
     elif isinstance(t, TensorType):
         new_type = []
@@ -62,6 +64,21 @@ def substitute_solution_one_type(mapping, t):
                 new_type.append(typ)
         return TensorType(tuple(new_type))
 
+    elif isinstance(t, list):
+        new_type = []
+        for typ in t:
+            new_type.append(substitute_solution_one_type(mapping, typ))
+        return new_type
+
+    elif isinstance(t, tuple):
+        new_type = []
+        for typ in t:
+            new_type.append(substitute_solution_one_type(mapping, typ))
+        return tuple(new_type)
+
+    else:
+        return t
+
 
 def substitute_all_types(graph, mapping):
     """

From 7946f8a9f6a020a89f534f4a2b921357935ee975 Mon Sep 17 00:00:00 2001
From: Erjia Guan <erjia@fb.com>
Date: Mon, 23 Aug 2021 14:32:56 -0700
Subject: [PATCH 139/530] Rename DataPipe to Op-er (#63325)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63325

Rename each DataPipe to an operation name ending with er. Functional API should remain `verb` such as `read_from_tar` , `shuffle`, ... (Discussed in [here](https://github.com/facebookexternal/torchdata/pull/97#discussion_r688553905))
- Batch -> Batcher
- Collate -> Collator
- Concat -> Concater
- GroupByKey - > ByKeyGrouper ?
- ListDirFiles -> FileLister
- LoadFilesFromDisk -> FileLoader
- Map -> Mapper
- ReadFilesFromTar -> TarArchiveReader
- ReadFilesFromZip -> ZipArchiveReader
- ReadLinesFromFile -> LineReader
- Shuffle -> Shuffler
- ToBytes -> StreamReader
- Transforms -> Transformer
- Zip -> Zipper

Let me know if you have better name for each DataPipe

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D30466950

Pulled By: ejguan

fbshipit-source-id: 72909dca7b3964ab83b965891f96cc1ecf62d049
---
 test/test_datapipe.py                         | 127 ++++++------------
 torch/utils/data/datapipes/iter/__init__.py   |  74 +++++-----
 torch/utils/data/datapipes/iter/callable.py   |  41 +-----
 .../data/datapipes/iter/combinatorics.py      |   4 +-
 torch/utils/data/datapipes/iter/combining.py  |   8 +-
 .../iter/{listdirfiles.py => filelister.py}   |   4 +-
 .../{loadfilesfromdisk.py => fileloader.py}   |   4 +-
 torch/utils/data/datapipes/iter/grouping.py   |  12 +-
 torch/utils/data/datapipes/iter/linereader.py |  18 +++
 .../data/datapipes/iter/readlinesfromfile.py  |  19 ---
 torch/utils/data/datapipes/iter/selecting.py  |   4 +-
 .../iter/{tobytes.py => streamreader.py}      |   4 +-
 ...eadfilesfromtar.py => tararchivereader.py} |   4 +-
 ...eadfilesfromzip.py => ziparchivereader.py} |   4 +-
 torch/utils/data/datapipes/map/__init__.py    |   7 +-
 torch/utils/data/datapipes/map/callable.py    |   4 +-
 torch/utils/data/datapipes/map/combining.py   |   4 +-
 17 files changed, 128 insertions(+), 214 deletions(-)
 rename torch/utils/data/datapipes/iter/{listdirfiles.py => filelister.py} (93%)
 rename torch/utils/data/datapipes/iter/{loadfilesfromdisk.py => fileloader.py} (93%)
 create mode 100644 torch/utils/data/datapipes/iter/linereader.py
 delete mode 100644 torch/utils/data/datapipes/iter/readlinesfromfile.py
 rename torch/utils/data/datapipes/iter/{tobytes.py => streamreader.py} (85%)
 rename torch/utils/data/datapipes/iter/{readfilesfromtar.py => tararchivereader.py} (96%)
 rename torch/utils/data/datapipes/iter/{readfilesfromzip.py => ziparchivereader.py} (96%)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index 9c2380112705d..86e53fa699142 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -34,7 +34,6 @@
 import numpy as np
 
 import torch
-import torch.nn as nn
 import torch.utils.data.backward_compatibility
 import torch.utils.data.datapipes as dp
 import torch.utils.data.graph
@@ -54,13 +53,6 @@
     basichandlers as decoder_basichandlers,
 )
 
-try:
-    import torchvision.transforms
-    HAS_TORCHVISION = True
-except ImportError:
-    HAS_TORCHVISION = False
-skipIfNoTorchVision = skipIf(not HAS_TORCHVISION, "no torchvision")
-
 try:
     import dill
     # XXX: By default, dill writes the Pickler dispatch table to inject its
@@ -177,7 +169,7 @@ def tearDown(self):
 
     def test_listdirfiles_iterable_datapipe(self):
         temp_dir = self.temp_dir.name
-        datapipe = dp.iter.ListDirFiles(temp_dir, '')
+        datapipe = dp.iter.FileLister(temp_dir, '')
 
         count = 0
         for pathname in datapipe:
@@ -186,7 +178,7 @@ def test_listdirfiles_iterable_datapipe(self):
         self.assertEqual(count, len(self.temp_files))
 
         count = 0
-        datapipe = dp.iter.ListDirFiles(temp_dir, '', recursive=True)
+        datapipe = dp.iter.FileLister(temp_dir, '', recursive=True)
         for pathname in datapipe:
             count = count + 1
             self.assertTrue((pathname in self.temp_files) or (pathname in self.temp_sub_files))
@@ -195,13 +187,13 @@ def test_listdirfiles_iterable_datapipe(self):
     def test_loadfilesfromdisk_iterable_datapipe(self):
         # test import datapipe class directly
         from torch.utils.data.datapipes.iter import (
-            ListDirFiles,
-            LoadFilesFromDisk,
+            FileLister,
+            FileLoader,
         )
 
         temp_dir = self.temp_dir.name
-        datapipe1 = ListDirFiles(temp_dir, '')
-        datapipe2 = LoadFilesFromDisk(datapipe1)
+        datapipe1 = FileLister(temp_dir, '')
+        datapipe2 = FileLoader(datapipe1)
 
         count = 0
         for rec in datapipe2:
@@ -220,9 +212,9 @@ def test_readfilesfromtar_iterable_datapipe(self):
             tar.add(self.temp_files[0])
             tar.add(self.temp_files[1])
             tar.add(self.temp_files[2])
-        datapipe1 = dp.iter.ListDirFiles(temp_dir, '*.tar')
-        datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1)
-        datapipe3 = dp.iter.ReadFilesFromTar(datapipe2)
+        datapipe1 = dp.iter.FileLister(temp_dir, '*.tar')
+        datapipe2 = dp.iter.FileLoader(datapipe1)
+        datapipe3 = dp.iter.TarArchiveReader(datapipe2)
         # read extracted files before reaching the end of the tarfile
         for rec, temp_file in itertools.zip_longest(datapipe3, self.temp_files):
             self.assertTrue(rec is not None and temp_file is not None)
@@ -247,9 +239,9 @@ def test_readfilesfromzip_iterable_datapipe(self):
             myzip.write(self.temp_files[0])
             myzip.write(self.temp_files[1])
             myzip.write(self.temp_files[2])
-        datapipe1 = dp.iter.ListDirFiles(temp_dir, '*.zip')
-        datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1)
-        datapipe3 = dp.iter.ReadFilesFromZip(datapipe2)
+        datapipe1 = dp.iter.FileLister(temp_dir, '*.zip')
+        datapipe2 = dp.iter.FileLoader(datapipe1)
+        datapipe3 = dp.iter.ZipArchiveReader(datapipe2)
         # read extracted files before reaching the end of the zipfile
         for rec, temp_file in itertools.zip_longest(datapipe3, self.temp_files):
             self.assertTrue(rec is not None and temp_file is not None)
@@ -271,8 +263,8 @@ def test_routeddecoder_iterable_datapipe(self):
         temp_pngfile_pathname = os.path.join(temp_dir, "test_png.png")
         png_data = np.array([[[1., 0., 0.], [1., 0., 0.]], [[1., 0., 0.], [1., 0., 0.]]], dtype=np.single)
         np.save(temp_pngfile_pathname, png_data)
-        datapipe1 = dp.iter.ListDirFiles(temp_dir, ['*.png', '*.txt'])
-        datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1)
+        datapipe1 = dp.iter.FileLister(temp_dir, ['*.png', '*.txt'])
+        datapipe2 = dp.iter.FileLoader(datapipe1)
 
         def _png_decoder(extension, data):
             if extension != 'png':
@@ -321,10 +313,10 @@ def test_groupbykey_iterable_datapipe(self):
                     f.write('12345abcde')
                 tar.add(file_pathname)
 
-        datapipe1 = dp.iter.ListDirFiles(temp_dir, '*.tar')
-        datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1)
-        datapipe3 = dp.iter.ReadFilesFromTar(datapipe2)
-        datapipe4 = dp.iter.GroupByKey(datapipe3, group_size=2)
+        datapipe1 = dp.iter.FileLister(temp_dir, '*.tar')
+        datapipe2 = dp.iter.FileLoader(datapipe1)
+        datapipe3 = dp.iter.TarArchiveReader(datapipe2)
+        datapipe4 = dp.iter.ByKeyGrouper(datapipe3, group_size=2)
 
         expected_result = [("a.png", "a.json"), ("c.png", "c.json"), ("b.png", "b.json"), ("d.png", "d.json"), (
             "f.png", "f.json"), ("g.png", "g.json"), ("e.png", "e.json"), ("h.json", "h.txt")]
@@ -447,13 +439,14 @@ def _get_data_from_tuple_fn(data, *args, **kwargs):
             create_temp_files_for_serving(tmpdir, test_file_count,
                                           test_file_size, file_url_template)
 
-            datapipe_dir_f = dp.iter.ListDirFiles(tmpdir, '*_list')
-            datapipe_f_lines = dp.iter.ReadLinesFromFile(datapipe_dir_f)
+            datapipe_dir_f = dp.iter.FileLister(tmpdir, '*_list')
+            datapipe_stream = dp.iter.FileLoader(datapipe_dir_f)
+            datapipe_f_lines = dp.iter.LineReader(datapipe_stream)
             datapipe_line_url: IterDataPipe[str] = \
-                dp.iter.Map(datapipe_f_lines, _get_data_from_tuple_fn, (1,))
+                dp.iter.Mapper(datapipe_f_lines, _get_data_from_tuple_fn, (1,))
             datapipe_http = dp.iter.HttpReader(datapipe_line_url,
                                                timeout=timeout)
-            datapipe_tob = dp.iter.ToBytes(datapipe_http, chunk=chunk)
+            datapipe_tob = dp.iter.StreamReader(datapipe_http, chunk=chunk)
 
             for (url, data) in datapipe_tob:
                 self.assertGreater(len(url), 0)
@@ -539,18 +532,18 @@ class TestFunctionalIterDataPipe(TestCase):
     def _test_picklable(self):
         arr = range(10)
         picklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [
-            (dp.iter.Map, IDP(arr), (), {}),
-            (dp.iter.Map, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}),
-            (dp.iter.Collate, IDP(arr), (), {}),
-            (dp.iter.Collate, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}),
+            (dp.iter.Mapper, IDP(arr), (), {}),
+            (dp.iter.Mapper, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}),
+            (dp.iter.Collator, IDP(arr), (), {}),
+            (dp.iter.Collator, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}),
             (dp.iter.Filter, IDP(arr), (_fake_filter_fn, (0, ), {'test': True}), {}),
         ]
         for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes:
             p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs))  # type: ignore[call-arg]
 
         unpicklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [
-            (dp.iter.Map, IDP(arr), (lambda x: x, ), {}),
-            (dp.iter.Collate, IDP(arr), (lambda x: x, ), {}),
+            (dp.iter.Mapper, IDP(arr), (lambda x: x, ), {}),
+            (dp.iter.Collator, IDP(arr), (lambda x: x, ), {}),
             (dp.iter.Filter, IDP(arr), (lambda x: x >= 5, ), {}),
         ]
         for dpipe, input_dp, dp_args, dp_kwargs in unpicklable_datapipes:
@@ -566,10 +559,10 @@ def test_concat_datapipe(self):
         input_dp2 = IDP(range(5))
 
         with self.assertRaisesRegex(ValueError, r"Expected at least one DataPipe"):
-            dp.iter.Concat()
+            dp.iter.Concater()
 
         with self.assertRaisesRegex(TypeError, r"Expected all inputs to be `IterDataPipe`"):
-            dp.iter.Concat(input_dp1, ())  # type: ignore[arg-type]
+            dp.iter.Concater(input_dp1, ())  # type: ignore[arg-type]
 
         concat_dp = input_dp1.concat(input_dp2)
         self.assertEqual(len(concat_dp), 15)
@@ -913,59 +906,17 @@ def test_shuffle_datapipe(self):
         with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"):
             len(shuffle_dp_nl)
 
-    @skipIfNoTorchVision
-    def test_transforms_datapipe(self):
-        torch.set_default_dtype(torch.float)
-        # A sequence of numpy random numbers representing 3-channel images
-        w = h = 32
-        inputs = [np.random.randint(0, 255, (h, w, 3), dtype=np.uint8) for i in range(10)]
-        tensor_inputs = [torch.tensor(x, dtype=torch.float).permute(2, 0, 1) / 255. for x in inputs]
-
-        input_dp = IDP(inputs)
-        # Raise TypeError for python function
-        with self.assertRaisesRegex(TypeError, r"`transforms` are required to be"):
-            input_dp.legacy_transforms(_fake_fn)
-
-        # transforms.Compose of several transforms
-        transforms = torchvision.transforms.Compose([
-            torchvision.transforms.ToTensor(),
-            torchvision.transforms.Pad(1, fill=1, padding_mode='constant'),
-        ])
-        tsfm_dp = input_dp.legacy_transforms(transforms)
-        self.assertEqual(len(tsfm_dp), len(input_dp))
-        for tsfm_data, input_data in zip(tsfm_dp, tensor_inputs):
-            self.assertEqual(tsfm_data[:, 1:(h + 1), 1:(w + 1)], input_data)
-
-        # nn.Sequential of several transforms (required to be instances of nn.Module)
-        input_dp = IDP(tensor_inputs)
-        transforms = nn.Sequential(
-            torchvision.transforms.Pad(1, fill=1, padding_mode='constant'),
-        )
-        tsfm_dp = input_dp.legacy_transforms(transforms)
-        self.assertEqual(len(tsfm_dp), len(input_dp))
-        for tsfm_data, input_data in zip(tsfm_dp, tensor_inputs):
-            self.assertEqual(tsfm_data[:, 1:(h + 1), 1:(w + 1)], input_data)
-
-        # Single transform
-        input_dp = IDP_NoLen(inputs)  # type: ignore[assignment]
-        transform = torchvision.transforms.ToTensor()
-        tsfm_dp = input_dp.legacy_transforms(transform)
-        with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"):
-            len(tsfm_dp)
-        for tsfm_data, input_data in zip(tsfm_dp, tensor_inputs):
-            self.assertEqual(tsfm_data, input_data)
-
     def test_zip_datapipe(self):
         with self.assertRaises(TypeError):
-            dp.iter.Zip(IDP(range(10)), list(range(10)))  # type: ignore[arg-type]
+            dp.iter.Zipper(IDP(range(10)), list(range(10)))  # type: ignore[arg-type]
 
-        zipped_dp = dp.iter.Zip(IDP(range(10)), IDP_NoLen(range(5)))  # type: ignore[var-annotated]
+        zipped_dp = dp.iter.Zipper(IDP(range(10)), IDP_NoLen(range(5)))  # type: ignore[var-annotated]
         with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"):
             len(zipped_dp)
         exp = list((i, i) for i in range(5))
         self.assertEqual(list(zipped_dp), exp)
 
-        zipped_dp = dp.iter.Zip(IDP(range(10)), IDP(range(5)))
+        zipped_dp = dp.iter.Zipper(IDP(range(10)), IDP(range(5)))
         self.assertEqual(len(zipped_dp), 5)
         self.assertEqual(list(zipped_dp), exp)
         # Reset
@@ -979,8 +930,8 @@ def _test_picklable(self):
         picklable_datapipes: List[
             Tuple[Type[MapDataPipe], MapDataPipe, Tuple, Dict[str, Any]]
         ] = [
-            (dp.map.Map, MDP(arr), (), {}),
-            (dp.map.Map, MDP(arr), (_fake_fn, (0,), {'test': True}), {}),
+            (dp.map.Mapper, MDP(arr), (), {}),
+            (dp.map.Mapper, MDP(arr), (_fake_fn, (0,), {'test': True}), {}),
         ]
         for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes:
             p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs))  # type: ignore[call-arg]
@@ -988,7 +939,7 @@ def _test_picklable(self):
         unpicklable_datapipes: List[
             Tuple[Type[MapDataPipe], MapDataPipe, Tuple, Dict[str, Any]]
         ] = [
-            (dp.map.Map, MDP(arr), (lambda x: x,), {}),
+            (dp.map.Mapper, MDP(arr), (lambda x: x,), {}),
         ]
         for dpipe, input_dp, dp_args, dp_kwargs in unpicklable_datapipes:
             with warnings.catch_warnings(record=True) as wa:
@@ -1005,10 +956,10 @@ def test_concat_datapipe(self):
         input_dp2 = MDP(range(5))
 
         with self.assertRaisesRegex(ValueError, r"Expected at least one DataPipe"):
-            dp.map.Concat()
+            dp.map.Concater()
 
         with self.assertRaisesRegex(TypeError, r"Expected all inputs to be `MapDataPipe`"):
-            dp.map.Concat(input_dp1, ())  # type: ignore[arg-type]
+            dp.map.Concater(input_dp1, ())  # type: ignore[arg-type]
 
         concat_dp = input_dp1.concat(input_dp2)
         self.assertEqual(len(concat_dp), 15)
diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index bdaef95e9fa56..5af2ab661da40 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -1,38 +1,31 @@
 from torch.utils.data.datapipes.iter.callable import (
-    CollateIterDataPipe as Collate,
-    MapIterDataPipe as Map,
-    TransformsIterDataPipe as Transforms,
+    CollatorIterDataPipe as Collator,
+    MapperIterDataPipe as Mapper,
 )
 from torch.utils.data.datapipes.iter.combinatorics import (
     SamplerIterDataPipe as Sampler,
-    ShuffleIterDataPipe as Shuffle,
+    ShufflerIterDataPipe as Shuffler,
 )
 from torch.utils.data.datapipes.iter.combining import (
-    ConcatIterDataPipe as Concat,
-    ZipIterDataPipe as Zip,
+    ConcaterIterDataPipe as Concater,
+    ZipperIterDataPipe as Zipper,
+)
+from torch.utils.data.datapipes.iter.filelister import (
+    FileListerIterDataPipe as FileLister,
+)
+from torch.utils.data.datapipes.iter.fileloader import (
+    FileLoaderIterDataPipe as FileLoader,
 )
 from torch.utils.data.datapipes.iter.grouping import (
-    BatchIterDataPipe as Batch,
+    BatcherIterDataPipe as Batcher,
     BucketBatcherIterDataPipe as BucketBatcher,
-    GroupByKeyIterDataPipe as GroupByKey,
+    ByKeyGrouperIterDataPipe as ByKeyGrouper,
 )
 from torch.utils.data.datapipes.iter.httpreader import (
     HTTPReaderIterDataPipe as HttpReader,
 )
-from torch.utils.data.datapipes.iter.listdirfiles import (
-    ListDirFilesIterDataPipe as ListDirFiles,
-)
-from torch.utils.data.datapipes.iter.loadfilesfromdisk import (
-    LoadFilesFromDiskIterDataPipe as LoadFilesFromDisk,
-)
-from torch.utils.data.datapipes.iter.readfilesfromtar import (
-    ReadFilesFromTarIterDataPipe as ReadFilesFromTar,
-)
-from torch.utils.data.datapipes.iter.readfilesfromzip import (
-    ReadFilesFromZipIterDataPipe as ReadFilesFromZip,
-)
-from torch.utils.data.datapipes.iter.readlinesfromfile import (
-    ReadLinesFromFileIterDataPipe as ReadLinesFromFile,
+from torch.utils.data.datapipes.iter.linereader import (
+    LineReaderIterDataPipe as LineReader,
 )
 from torch.utils.data.datapipes.iter.routeddecoder import (
     RoutedDecoderIterDataPipe as RoutedDecoder,
@@ -40,33 +33,38 @@
 from torch.utils.data.datapipes.iter.selecting import (
     FilterIterDataPipe as Filter,
 )
-from torch.utils.data.datapipes.iter.tobytes import (
-    ToBytesIterDataPipe as ToBytes,
+from torch.utils.data.datapipes.iter.streamreader import (
+    StreamReaderIterDataPipe as StreamReader,
+)
+from torch.utils.data.datapipes.iter.tararchivereader import (
+    TarArchiveReaderIterDataPipe as TarArchiveReader,
+)
+from torch.utils.data.datapipes.iter.ziparchivereader import (
+    ZipArchiveReaderIterDataPipe as ZipArchiveReader,
 )
 from torch.utils.data.datapipes.iter.utils import (
     IterableAsDataPipeIterDataPipe as IterableAsDataPipe,
 )
 
-__all__ = ['Batch',
+__all__ = ['Batcher',
            'BucketBatcher',
-           'Collate',
-           'Concat',
+           'ByKeyGrouper',
+           'Collator',
+           'Concater',
+           'FileLister',
+           'FileLoader',
            'Filter',
-           'GroupByKey',
            'HttpReader',
            'IterableAsDataPipe',
-           'ListDirFiles',
-           'LoadFilesFromDisk',
-           'Map',
-           'ReadFilesFromTar',
-           'ReadFilesFromZip',
-           'ReadLinesFromFile',
+           'LineReader',
+           'Mapper',
            'RoutedDecoder',
            'Sampler',
-           'Shuffle',
-           'ToBytes',
-           'Transforms',
-           'Zip']
+           'Shuffler',
+           'StreamReader',
+           'TarArchiveReader',
+           'ZipArchiveReader',
+           'Zipper']
 
 # Please keep this list sorted
 assert __all__ == sorted(__all__)
diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py
index cc0f9e13b3adf..18f6f17fff156 100644
--- a/torch/utils/data/datapipes/iter/callable.py
+++ b/torch/utils/data/datapipes/iter/callable.py
@@ -1,5 +1,4 @@
 import warnings
-import torch.nn as nn
 from torch.utils.data import IterDataPipe, _utils, functional_datapipe, DataChunk
 from typing import Callable, Dict, Iterator, Optional, Sized, Tuple, TypeVar
 
@@ -26,8 +25,8 @@ def default_fn(data):
 
 
 @functional_datapipe('map')
-class MapIterDataPipe(IterDataPipe[T_co]):
-    r""" :class:`MapIterDataPipe`.
+class MapperIterDataPipe(IterDataPipe[T_co]):
+    r""" :class:`MapperIterDataPipe`.
 
     Iterable DataPipe to run a function over each item from the source DataPipe.
     The function can be any regular python function or partial object. Lambda
@@ -108,8 +107,8 @@ def __setstate__(self, state):
 
 
 @functional_datapipe('collate')
-class CollateIterDataPipe(MapIterDataPipe):
-    r""" :class:`CollateIterDataPipe`.
+class CollatorIterDataPipe(MapperIterDataPipe):
+    r""" :class:`CollatorIterDataPipe`.
 
     Iterable DataPipe to collate samples from datapipe to Tensor(s) by `util_.collate.default_collate`,
     or customized Data Structure by collate_fn.
@@ -153,35 +152,3 @@ def __init__(self,
                  fn_kwargs: Optional[Dict] = None,
                  ) -> None:
         super().__init__(datapipe, fn=collate_fn, fn_args=fn_args, fn_kwargs=fn_kwargs)
-
-
-@functional_datapipe('legacy_transforms')
-class TransformsIterDataPipe(MapIterDataPipe):
-    r""" :class:`TransformsIterDataPipe`.
-
-    Iterable DataPipe to use transform(s) from torchvision or torchaudio to transform
-    data from datapipe.
-    args:
-        datapipe: Iterable DataPipe being transformed
-        transforms: A transform or a sequence of transforms from torchvision or torchaudio.
-    """
-
-    def __init__(self,
-                 datapipe: IterDataPipe,
-                 transforms: Callable,
-                 ) -> None:
-        # Type checking for transforms
-        transforms_types: Tuple = (nn.Module, )
-        try:
-            # Specific types of transforms other than `nn.Module` from torchvision
-            import torchvision.transforms as tsfm
-            transforms_types += (tsfm.Compose, tsfm.RandomChoice, tsfm.RandomOrder,
-                                 tsfm.ToPILImage, tsfm.ToTensor, tsfm.Lambda)
-        except ImportError:
-            pass
-
-        if not isinstance(transforms, transforms_types):
-            raise TypeError("`transforms` are required to be a callable from "
-                            "torchvision.transforms or torchaudio.transforms")
-
-        super().__init__(datapipe, fn=transforms)
diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py
index a8b1e3d9737fa..d1a7dd0368221 100644
--- a/torch/utils/data/datapipes/iter/combinatorics.py
+++ b/torch/utils/data/datapipes/iter/combinatorics.py
@@ -44,8 +44,8 @@ def __len__(self) -> int:
 
 
 @functional_datapipe('shuffle')
-class ShuffleIterDataPipe(IterDataPipe[T_co]):
-    r""" :class:`ShuffleIterDataPipe`
+class ShufflerIterDataPipe(IterDataPipe[T_co]):
+    r""" :class:`ShufflerIterDataPipe`
 
     Iterable DataPipe to shuffle the input DataPipe with a buffer. The buffer
     with `buffer_size` is filled with elements from the datapipe first. Then,
diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index 0693b1f0ad6de..4b28e0926c42b 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -7,8 +7,8 @@
 
 
 @functional_datapipe('concat')
-class ConcatIterDataPipe(IterDataPipe):
-    r""" :class:`ConcatIterDataPipe`.
+class ConcaterIterDataPipe(IterDataPipe):
+    r""" :class:`ConcaterIterDataPipe`.
 
     Iterable DataPipe to concatenate multiple Iterable DataPipes.
     args:
@@ -54,7 +54,7 @@ def __iter__(self):
 
 
 @functional_datapipe('fork')
-class ForkIterDataPipe(IterDataPipe):
+class ForkerIterDataPipe(IterDataPipe):
 
     def __new__(cls, datapipe, instances):
         result = []
@@ -96,7 +96,7 @@ def __iter__(self):
 
 
 @functional_datapipe('zip')
-class ZipIterDataPipe(IterDataPipe[Tuple[T_co]]):
+class ZipperIterDataPipe(IterDataPipe[Tuple[T_co]]):
     r""" :class:`ZipIterDataPipe`.
 
     Iterable DataPipe aggregates elements into a tuple from each of
diff --git a/torch/utils/data/datapipes/iter/listdirfiles.py b/torch/utils/data/datapipes/iter/filelister.py
similarity index 93%
rename from torch/utils/data/datapipes/iter/listdirfiles.py
rename to torch/utils/data/datapipes/iter/filelister.py
index 91ef8a3b080a4..48fdce9f52ef5 100644
--- a/torch/utils/data/datapipes/iter/listdirfiles.py
+++ b/torch/utils/data/datapipes/iter/filelister.py
@@ -2,8 +2,8 @@
 from torch.utils.data.datapipes.utils.common import get_file_pathnames_from_root
 from typing import List, Union, Iterator
 
-class ListDirFilesIterDataPipe(IterDataPipe[str]):
-    r""" :class:`ListDirFilesIterDataPipe`
+class FileListerIterDataPipe(IterDataPipe[str]):
+    r""" :class:`FileListerIterDataPipe`
 
     Iterable DataPipe to load file pathname(s) (path + filename), yield pathname from given disk root dir.
     args:
diff --git a/torch/utils/data/datapipes/iter/loadfilesfromdisk.py b/torch/utils/data/datapipes/iter/fileloader.py
similarity index 93%
rename from torch/utils/data/datapipes/iter/loadfilesfromdisk.py
rename to torch/utils/data/datapipes/iter/fileloader.py
index c9dd5daf9a17a..2b73e4e156b70 100644
--- a/torch/utils/data/datapipes/iter/loadfilesfromdisk.py
+++ b/torch/utils/data/datapipes/iter/fileloader.py
@@ -5,8 +5,8 @@
 from torch.utils.data.datapipes.utils.common import get_file_binaries_from_pathnames
 
 
-class LoadFilesFromDiskIterDataPipe(IterDataPipe[Tuple[str, IOBase]]):
-    r""" :class:`LoadFilesFromDiskIterDataPipe`.
+class FileLoaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]):
+    r""" :class:`FileLoaderIterDataPipe`.
 
     Iterable Datapipe to load file streams from given pathnames,
     yield pathname and file stream in a tuple.
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index e6304c2de8217..5f449489ac756 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -32,8 +32,8 @@ def __iter__(self):
 
 
 @functional_datapipe('batch')
-class BatchIterDataPipe(IterDataPipe[DataChunk[T_co]]):
-    r""" :class:`BatchIterDataPipe`.
+class BatcherIterDataPipe(IterDataPipe[DataChunk[T_co]]):
+    r""" :class:`BatcherIterDataPipe`.
 
     Iterable DataPipe to create mini-batches of data. An outer dimension will be added as
     `batch_size` if `drop_last` is set to `True`, or `length % batch_size` for the
@@ -93,8 +93,8 @@ def __len__(self) -> int:
 
 
 @functional_datapipe('unbatch')
-class UnBatchIterDataPipe(IterDataPipe):
-    r""" :class:`UnBatchIterDataPipe`.
+class UnBatcherIterDataPipe(IterDataPipe):
+    r""" :class:`UnBatcherIterDataPipe`.
 
     Iterable DataPipe to undo batching of data. In other words, it flattens the data up to the specified level
     within a batched DataPipe.
@@ -255,7 +255,7 @@ def cmp_fn(a: Tuple[str, Any], b: Tuple[str, Any]):
 
 
 @functional_datapipe('groupby')
-class GroupByIterDataPipe(IterDataPipe):
+class GrouperIterDataPipe(IterDataPipe):
     # TODO(VtalyFedyunin): Add inline docs and tests (they are partially available in notebooks)
     def __init__(self,
                  datapipe: IterDataPipe[T_co],
@@ -329,7 +329,7 @@ def __iter__(self):
 
 
 @functional_datapipe('group_by_key')
-class GroupByKeyIterDataPipe(IterDataPipe[list]):
+class ByKeyGrouperIterDataPipe(IterDataPipe[list]):
     r""" :class:`GroupByKeyIterDataPipe`.
 
     Iterable datapipe to group data from input iterable by keys which are generated from `group_key_fn`,
diff --git a/torch/utils/data/datapipes/iter/linereader.py b/torch/utils/data/datapipes/iter/linereader.py
new file mode 100644
index 0000000000000..2b15b93c9c60a
--- /dev/null
+++ b/torch/utils/data/datapipes/iter/linereader.py
@@ -0,0 +1,18 @@
+from typing import Tuple
+from torch.utils.data import IterDataPipe
+
+
+class LineReaderIterDataPipe(IterDataPipe[Tuple[str, str]]):
+    r""" :class:`LineReaderIterDataPipe`
+
+    Iterable DataPipe to load file name and stream as source IterDataPipe
+    and yield filename and line(s).
+    """
+
+    def __init__(self, source_datapipe):
+        self.source_datapipe = source_datapipe
+
+    def __iter__(self):
+        for file_name, stream in self.source_datapipe:
+            for line in stream:
+                yield file_name, line
diff --git a/torch/utils/data/datapipes/iter/readlinesfromfile.py b/torch/utils/data/datapipes/iter/readlinesfromfile.py
deleted file mode 100644
index c8366af3b475f..0000000000000
--- a/torch/utils/data/datapipes/iter/readlinesfromfile.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from typing import Tuple
-from torch.utils.data import IterDataPipe
-
-
-class ReadLinesFromFileIterDataPipe(IterDataPipe[Tuple[str, str]]):
-    r""" :class:`ReadLinesFromFileDataPipe`
-
-    Iterable DataPipe to load file names as source iter data pipe
-    and yield filename and line(s).
-    """
-
-    def __init__(self, source_datapipe):
-        self.source_datapipe = source_datapipe
-
-    def __iter__(self):
-        for file_name in self.source_datapipe:
-            with open(file_name) as file:
-                for line in file:
-                    yield (file_name, line)
diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py
index 46a613a7d91a4..83872cebdb53d 100644
--- a/torch/utils/data/datapipes/iter/selecting.py
+++ b/torch/utils/data/datapipes/iter/selecting.py
@@ -1,13 +1,13 @@
 from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk
 from typing import Callable, TypeVar, Iterator, Optional, Tuple, Dict
 
-from .callable import MapIterDataPipe
+from .callable import MapperIterDataPipe
 
 T_co = TypeVar('T_co', covariant=True)
 
 
 @functional_datapipe('filter')
-class FilterIterDataPipe(MapIterDataPipe):
+class FilterIterDataPipe(MapperIterDataPipe):
     r""" :class:`FilterIterDataPipe`.
 
     Iterable DataPipe to filter elements from datapipe according to filter_fn.
diff --git a/torch/utils/data/datapipes/iter/tobytes.py b/torch/utils/data/datapipes/iter/streamreader.py
similarity index 85%
rename from torch/utils/data/datapipes/iter/tobytes.py
rename to torch/utils/data/datapipes/iter/streamreader.py
index 21fd82d381bcb..f74efe746a759 100644
--- a/torch/utils/data/datapipes/iter/tobytes.py
+++ b/torch/utils/data/datapipes/iter/streamreader.py
@@ -2,8 +2,8 @@
 from torch.utils.data import IterDataPipe
 
 
-class ToBytesIterDataPipe(IterDataPipe[Tuple[str, bytes]]):
-    r""" :class:`ToBytesIterDataPipe`
+class StreamReaderIterDataPipe(IterDataPipe[Tuple[str, bytes]]):
+    r""" :class:`StreamReaderIterDataPipe`
 
     Iterable DataPipe to load IO stream with label name,
     and to yield bytes with label name in a tuple
diff --git a/torch/utils/data/datapipes/iter/readfilesfromtar.py b/torch/utils/data/datapipes/iter/tararchivereader.py
similarity index 96%
rename from torch/utils/data/datapipes/iter/readfilesfromtar.py
rename to torch/utils/data/datapipes/iter/tararchivereader.py
index f4566021fcc7f..9145f5f1dbc11 100644
--- a/torch/utils/data/datapipes/iter/readfilesfromtar.py
+++ b/torch/utils/data/datapipes/iter/tararchivereader.py
@@ -7,8 +7,8 @@
 import tarfile
 import warnings
 
-class ReadFilesFromTarIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
-    r""":class:`ReadFilesFromTarIterDataPipe`.
+class TarArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
+    r""" :class:`TarArchiveReaderIterDataPipe`.
 
     Iterable datapipe to extract tar binary streams from input iterable which contains tuples of
     pathname and tar binary stream, yields pathname and extracted binary stream in a tuple.
diff --git a/torch/utils/data/datapipes/iter/readfilesfromzip.py b/torch/utils/data/datapipes/iter/ziparchivereader.py
similarity index 96%
rename from torch/utils/data/datapipes/iter/readfilesfromzip.py
rename to torch/utils/data/datapipes/iter/ziparchivereader.py
index edb8320aece9f..e98bd179760c8 100644
--- a/torch/utils/data/datapipes/iter/readfilesfromzip.py
+++ b/torch/utils/data/datapipes/iter/ziparchivereader.py
@@ -8,8 +8,8 @@
 import zipfile
 import warnings
 
-class ReadFilesFromZipIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
-    r""" :class:`ReadFilesFromZipIterDataPipe`.
+class ZipArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
+    r""" :class:`ZipArchiveReaderIterDataPipe`.
 
     Iterable data pipe to extract zip binary streams from input iterable which contains tuples of
     pathname and zip binary stream, yields pathname and extracted binary stream in a tuple.
diff --git a/torch/utils/data/datapipes/map/__init__.py b/torch/utils/data/datapipes/map/__init__.py
index b7609957baaa8..5879165aff2eb 100644
--- a/torch/utils/data/datapipes/map/__init__.py
+++ b/torch/utils/data/datapipes/map/__init__.py
@@ -1,7 +1,6 @@
 # Functional DataPipe
-from torch.utils.data.datapipes.map.callable import MapMapDataPipe as Map
-from torch.utils.data.datapipes.map.combining import \
-    (ConcatMapDataPipe as Concat)
+from torch.utils.data.datapipes.map.callable import MapperMapDataPipe as Mapper
+from torch.utils.data.datapipes.map.combining import ConcaterMapDataPipe as Concater
 
 
-__all__ = ['Map', 'Concat']
+__all__ = ['Concater', 'Mapper']
diff --git a/torch/utils/data/datapipes/map/callable.py b/torch/utils/data/datapipes/map/callable.py
index 00457299316ae..8dbad957e069d 100644
--- a/torch/utils/data/datapipes/map/callable.py
+++ b/torch/utils/data/datapipes/map/callable.py
@@ -26,8 +26,8 @@ def default_fn(data):
 
 
 @functional_datapipe('map')
-class MapMapDataPipe(MapDataPipe[T_co]):
-    r""":class:`MapMapDataPipe`.
+class MapperMapDataPipe(MapDataPipe[T_co]):
+    r""":class:`MapperMapDataPipe`.
 
     Map DataPipe to run a function over each item from the source DataPipe.
     The function can be any regular python function or partial object. Lambda
diff --git a/torch/utils/data/datapipes/map/combining.py b/torch/utils/data/datapipes/map/combining.py
index 234d45382efe6..4743c3726b356 100644
--- a/torch/utils/data/datapipes/map/combining.py
+++ b/torch/utils/data/datapipes/map/combining.py
@@ -5,8 +5,8 @@
 
 
 @functional_datapipe('concat')
-class ConcatMapDataPipe(MapDataPipe):
-    r""" :class:`ConcatMapDataPipe`.
+class ConcaterMapDataPipe(MapDataPipe):
+    r""" :class:`ConcaterMapDataPipe`.
 
     Map DataPipe to concatenate multiple Map DataPipes.
     The actual index of is the cumulative sum of source datapipes.

From 49be16d50ab7d7fe38b241347ff72e381feba071 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Mon, 23 Aug 2021 15:02:10 -0700
Subject: [PATCH 140/530] .github: Add ec2 information as a step (#63784)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63784

Also creates the common.yml.j2 file as a place to store common code
amongst the templates

Should look like:
![image](https://user-images.githubusercontent.com/1700823/130495226-f18b8c0f-1ea7-4097-8bbb-e998fabb71f2.png)

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

Test Plan: Imported from OSS

Reviewed By: malfet, driazati

Differential Revision: D30490682

Pulled By: seemethere

fbshipit-source-id: 18028b4acff938ef54cd6e4877561b2d830a11cf
---
 .github/scripts/display_ec2_information.sh    | 14 +++++++++++++
 .github/templates/common.yml.j2               |  6 ++++++
 .github/templates/linux_ci_workflow.yml.j2    |  6 ++++++
 .github/templates/windows_ci_workflow.yml.j2  |  4 ++++
 ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml |  9 ++++++++
 ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml |  9 ++++++++
 ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 17 +++++++++++++++
 ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 17 +++++++++++++++
 ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 17 +++++++++++++++
 ...rated-linux-xenial-cuda11.1-py3.6-gcc7.yml | 17 +++++++++++++++
 .../generated-linux-xenial-py3.6-gcc5.4.yml   | 21 +++++++++++++++++++
 ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml |  5 +++++
 ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml |  9 ++++++++
 ...iodic-linux-xenial-cuda11.3-py3.6-gcc7.yml | 17 +++++++++++++++
 ...rated-periodic-win-vs2019-cuda11.3-py3.yml | 13 ++++++++++++
 .../generated-win-vs2019-cpu-py3.yml          | 13 ++++++++++++
 .../generated-win-vs2019-cuda10.1-py3.yml     | 13 ++++++++++++
 .../generated-win-vs2019-cuda11.1-py3.yml     | 13 ++++++++++++
 18 files changed, 220 insertions(+)
 create mode 100755 .github/scripts/display_ec2_information.sh
 create mode 100644 .github/templates/common.yml.j2

diff --git a/.github/scripts/display_ec2_information.sh b/.github/scripts/display_ec2_information.sh
new file mode 100755
index 0000000000000..be47418966025
--- /dev/null
+++ b/.github/scripts/display_ec2_information.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+function get_ec2_metadata() {
+    # Pulled from instance metadata endpoint for EC2
+    # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+    category=$1
+    curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+}
+
+echo "ami-id: $(get_ec2_metadata ami-id)"
+echo "instance-id: $(get_ec2_metadata instance-id)"
+echo "instance-type: $(get_ec2_metadata instance-type)"
diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2
new file mode 100644
index 0000000000000..12108f1f95f46
--- /dev/null
+++ b/.github/templates/common.yml.j2
@@ -0,0 +1,6 @@
+{%- macro display_ec2_information() -%}
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
+{%- endmacro -%}
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 9b4ba87194065..25099b46cbc8e 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -1,3 +1,4 @@
+{% import 'common.yml.j2' as common %}
 {# squid_proxy is an private ELB that only available for GHA custom runners #}
 {%- set squid_proxy    = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%}
 {# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #}
@@ -89,6 +90,7 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      !{{ common.display_ec2_information() }}
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -165,6 +167,7 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      !{{ common.display_ec2_information() }}
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -318,6 +321,7 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      !{{ common.display_ec2_information() }}
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -476,6 +480,7 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      !{{ common.display_ec2_information() }}
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
@@ -541,6 +546,7 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      !{{ common.display_ec2_information() }}
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2
index c1160fe32de60..f00f4b19a903d 100644
--- a/.github/templates/windows_ci_workflow.yml.j2
+++ b/.github/templates/windows_ci_workflow.yml.j2
@@ -1,3 +1,4 @@
+{% import 'common.yml.j2' as common %}
 {# squid_proxy is an private ELB that only available for GHA custom runners #}
 {%- set squid_proxy    = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%}
 {# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #}
@@ -82,6 +83,7 @@ jobs:
           path: pytorch-${{ github.run_id }}
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      !{{ common.display_ec2_information() }}
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -190,6 +192,7 @@ jobs:
           path: pytorch-${{ github.run_id }}
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      !{{ common.display_ec2_information() }}
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -290,6 +293,7 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      !{{ common.display_ec2_information() }}
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 780de8e1919e9..f45ed052e3838 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: libtorch-linux-xenial-cuda10.2-py3.6-gcc7
@@ -63,6 +64,10 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -139,6 +144,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 3d586ae322e8e..ba59027969b7a 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: libtorch-linux-xenial-cuda11.1-py3.6-gcc7
@@ -63,6 +64,10 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -139,6 +144,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 7e6006ad300de..67bb2064863bc 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-bionic-cuda10.2-py3.9-gcc7
@@ -63,6 +64,10 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -139,6 +144,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -286,6 +295,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -436,6 +449,10 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 2cd316b2b8674..59061662341a3 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-bionic-py3.8-gcc9-coverage
@@ -63,6 +64,10 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -139,6 +144,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -286,6 +295,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -440,6 +453,10 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index a88191469ebde..d897e28ace007 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-xenial-cuda10.2-py3.6-gcc7
@@ -63,6 +64,10 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -139,6 +144,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -286,6 +295,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -436,6 +449,10 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 264553f1877ee..30514cab07ea4 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-xenial-cuda11.1-py3.6-gcc7
@@ -63,6 +64,10 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -139,6 +144,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -286,6 +295,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -436,6 +449,10 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 7b100e6d1294d..bd4d65027c066 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-xenial-py3.6-gcc5.4
@@ -63,6 +64,10 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -139,6 +144,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -286,6 +295,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -436,6 +449,10 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
@@ -498,6 +515,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index dbfba5f1fa74d..1827249beae99 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/bazel_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-xenial-py3.6-gcc7-bazel-test
@@ -63,6 +64,10 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 8352b229f4fae..1f4c6d270ec54 100644
--- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7
@@ -61,6 +62,10 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -137,6 +142,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 9d7da989e8b9a..5fd0e99f002f7 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: periodic-linux-xenial-cuda11.3-py3.6-gcc7
@@ -61,6 +62,10 @@ jobs:
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -137,6 +142,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -284,6 +293,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -434,6 +447,10 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
index 9487ea5a8fdb3..407aace6e4aee 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: periodic-win-vs2019-cuda11.3-py3
@@ -57,6 +58,10 @@ jobs:
           path: pytorch-${{ github.run_id }}
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -154,6 +159,10 @@ jobs:
           path: pytorch-${{ github.run_id }}
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -248,6 +257,10 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index fb2a097c64452..35f9feccaf26c 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: win-vs2019-cpu-py3
@@ -57,6 +58,10 @@ jobs:
           path: pytorch-${{ github.run_id }}
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -146,6 +151,10 @@ jobs:
           path: pytorch-${{ github.run_id }}
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -232,6 +241,10 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index 2fbc8650f43e1..26b703500b0d1 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: win-vs2019-cuda10.1-py3
@@ -59,6 +60,10 @@ jobs:
           path: pytorch-${{ github.run_id }}
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -156,6 +161,10 @@ jobs:
           path: pytorch-${{ github.run_id }}
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -250,6 +259,10 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:
diff --git a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml
index 8b52a07055608..d4175aca5f02d 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml
@@ -1,4 +1,5 @@
 # @generated DO NOT EDIT MANUALLY
+
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: win-vs2019-cuda11.1-py3
@@ -59,6 +60,10 @@ jobs:
           path: pytorch-${{ github.run_id }}
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -156,6 +161,10 @@ jobs:
           path: pytorch-${{ github.run_id }}
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -250,6 +259,10 @@ jobs:
         with:
           # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
           fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - uses: actions/download-artifact@v2
         name: Download PyTorch Test Reports
         with:

From 55f8f95ad4b36d201b63bd4b53984d378cf3d672 Mon Sep 17 00:00:00 2001
From: "Rong Rong (AI Infra)" <rongr@fb.com>
Date: Mon, 23 Aug 2021 15:36:59 -0700
Subject: [PATCH 141/530] fix git diff issue (#63408)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/60111, ideally we should merge this before https://github.com/pytorch/pytorch/issues/63360 but we can also test this with https://github.com/pytorch/pytorch/issues/63360 easily.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63408

Test Plan:
- This is conform working with local test.sh run by setting PR_NUMBER
- should be validated by GHA CI as well

Concern:
- currently GHA CI is running into proxy 403 rate-limit exceeded issue consistently. However the worst case is not generating any git diff files, which is going to be exactly the same as current behavior.
- depends on https://github.com/pytorch/pytorch/issues/63770.

Reviewed By: driazati, janeyx99

Differential Revision: D30489355

Pulled By: walterddr

fbshipit-source-id: a638b7ae5820f29a7aca6cc40ff390ab253cb174
---
 .jenkins/pytorch/common_utils.sh | 11 +++++++++++
 .jenkins/pytorch/test.sh         | 12 +++++++-----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh
index 49db051a0f484..cb7ef207af47c 100644
--- a/.jenkins/pytorch/common_utils.sh
+++ b/.jenkins/pytorch/common_utils.sh
@@ -49,6 +49,17 @@ function get_exit_code() {
   return $retcode
 }
 
+function get_pr_change_files() {
+  # The fetch may fail on Docker hosts, this fetch is necessary for GHA
+  # accepts PR_NUMBER and extract filename as arguments
+  set +e
+  tmp_file=$(mktemp)
+  wget -O "$tmp_file" "https://api.github.com/repos/pytorch/pytorch/pulls/$1/files"
+  # this regex extracts the filename list according to the GITHUB REST API result.
+  sed -n "s/.*\"filename\": \"\(.*\)\",/\1/p" "$tmp_file" | tee "$2"
+  set -e
+}
+
 function file_diff_from_base() {
   # The fetch may fail on Docker hosts, this fetch is necessary for GHA
   set +e
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 9f3e378a45fb8..4fce9ab00dcc9 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -142,12 +142,14 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX512-* || $TEST_CONFIG == 'nogpu_NO_AVX
   export ATEN_CPU_CAPABILITY=avx2
 fi
 
-# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs
-#       see https://github.com/pytorch/pytorch/issues/60111
-#       change it back to PR_NUMBER when issue is fixed.
-if [ -n "$CIRCLE_PR_NUMBER" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
+if [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
+  # if PR_NUMBER exist, use it to grab PR contents.
   DETERMINE_FROM=$(mktemp)
-  file_diff_from_base "$DETERMINE_FROM"
+  if [ -n "$PR_NUMBER" ]; then
+    get_pr_change_files "$PR_NUMBER" "$DETERMINE_FROM"
+  else
+    file_diff_from_base "$DETERMINE_FROM"
+  fi
 fi
 
 test_python_legacy_jit() {

From 84890aae352ea6bf8dd2c501683a0da240ea7dec Mon Sep 17 00:00:00 2001
From: Don Jang <djang@fb.com>
Date: Mon, 23 Aug 2021 16:20:27 -0700
Subject: [PATCH 142/530] [Static Runtime] Add an out variant op for aten::abs
 (#63675)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63675

This change adds an out variant implementation for `aten::abs`.

Test Plan:
- Observed `V0820 14:14:08.880342 101788 impl.cpp:1394] Switch to out variant for node: %3 : Tensor = aten::abs(%a.1)`

- Perf impact: TBD

Reviewed By: hlu1

Differential Revision: D30461317

fbshipit-source-id: 0c0230bd40afe463ae1ccb222c2a1207ebcf4191
---
 benchmarks/static_runtime/test_scripts.h        |  5 +++++
 .../static_runtime/test_static_runtime.cc       |  9 +++++++++
 torch/csrc/jit/runtime/static/ops.cpp           | 17 +++++++++++++++++
 3 files changed, 31 insertions(+)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 73380129731ed..9e01d3b8d0b87 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -26,6 +26,11 @@
  alias of the model output.
 
 */
+const auto abs_script = R"JIT(
+  def forward(self, a):
+    return a.abs().clone()
+)JIT";
+
 const auto list_construct_script = R"JIT(
   def forward(self, a, b):
     return [a, b]
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index dfe2c14e9489b..71102215b4e2e 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -256,6 +256,15 @@ TEST(StaticRuntime, Addmm) {
   testStaticRuntime(addmm_script, args, args1);
 }
 
+TEST(StaticRuntime, IndividualOps_Abs) {
+  auto a = at::randn({2, 3});
+  auto b = at::randn({4, 2, 3});
+  std::vector<IValue> args{a};
+  std::vector<IValue> args2{b};
+  testStaticRuntime(abs_script, args);
+  testStaticRuntime(abs_script, args, args2);
+}
+
 TEST(StaticRuntime, IndividualOps_Binary) {
   auto a = at::randn({2, 3});
   auto b = at::ones({2, 3});
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 2543182db138c..27f6e545ec7bc 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -301,6 +301,23 @@ REGISTER_OPERATOR_FUNCTOR(
       };
     });
 
+REGISTER_OPERATOR_FUNCTOR(aten::abs, aten_abs, [](Node* n) -> SROperator {
+  if (!n->matches(torch::schema("aten::abs(Tensor self) -> Tensor"))) {
+    LogAndDumpSchema(n);
+    return nullptr;
+  }
+  return [](ProcessedNode* p_node) {
+    const auto& in0_t = p_node->Input(0).toTensor();
+    if (p_node->Output(0).isNone()) {
+      p_node->Output(0) = at::native::abs(in0_t);
+    } else {
+      auto& out_t = p_node->Output(0).toTensor();
+      fastResizeToZero(out_t);
+      at::native::abs_out(in0_t, out_t);
+    }
+  };
+});
+
 REGISTER_OPERATOR_FUNCTOR(aten::mul, aten_mul, [](Node* n) -> SROperator {
   if (!n->matches(torch::schema(
           "aten::mul.Tensor(Tensor self, Tensor other) -> Tensor"))) {

From 130549d61ba34f3d70167be3dc88631385112625 Mon Sep 17 00:00:00 2001
From: Akshit Khurana <axit@fb.com>
Date: Mon, 23 Aug 2021 16:33:07 -0700
Subject: [PATCH 143/530] Fix typo in NNAPI tests (#63797)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63797

nnapi memory format test has a typo

Test Plan:
pytest test/test_nnapi.py::TestNNAPI

Imported from OSS

Reviewed By: Amyh11325

Differential Revision: D30495473

fbshipit-source-id: 8edad7c01a080847a64a2797e077ec4d6077552a
---
 test/test_nnapi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_nnapi.py b/test/test_nnapi.py
index d70bebf547a1e..f8db7e1a3df90 100644
--- a/test/test_nnapi.py
+++ b/test/test_nnapi.py
@@ -78,7 +78,7 @@ def check(
                     # to get a nice message.
                     self.assertEqual(eager_output, nnapi_output, atol=0, rtol=0)
             if expected_memory_format:
-                self.assertTrue(nnapi_out.is_contiguous(memory_format=expected_memory_format))
+                self.assertTrue(nnapi_output.is_contiguous(memory_format=expected_memory_format))
 
     def float_and_quant_and_nhwc(self, inp_float, scale, zero_point):
         torch.manual_seed(29)

From fc6dd0bc008d1a1872626567506be6e9e5dcbae1 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Mon, 23 Aug 2021 17:26:27 -0700
Subject: [PATCH 144/530] [JIT] Move UseVariadicCat internals (#63577)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63577

Since other variadic ops will have an almost identical implementation, we can generalize the `UseVariadicCat` implementation and put it in a common folder.

Also moved some test utilities that other variadic op tests will likely need.

Test Plan: `buck test caffe2/test/cpp/jit:jit -- ConcatOptTest`

Reviewed By: navahgar

Differential Revision: D30409937

fbshipit-source-id: 925c11c27b58ce98cb8368d2a205e26ba66d3db9
---
 test/cpp/jit/test_concat_opt.cpp       |  64 ++++---------
 test/cpp/jit/test_utils.cpp            |  32 +++++++
 test/cpp/jit/test_utils.h              |   7 ++
 tools/build_variables.bzl              |   1 +
 torch/csrc/jit/passes/concat_opt.cpp   |  90 ------------------
 torch/csrc/jit/passes/concat_opt.h     |   7 --
 torch/csrc/jit/passes/variadic_ops.cpp | 126 +++++++++++++++++++++++++
 torch/csrc/jit/passes/variadic_ops.h   |  16 ++++
 torch/csrc/jit/runtime/static/impl.cpp |   2 +-
 9 files changed, 200 insertions(+), 145 deletions(-)
 create mode 100644 torch/csrc/jit/passes/variadic_ops.cpp
 create mode 100644 torch/csrc/jit/passes/variadic_ops.h

diff --git a/test/cpp/jit/test_concat_opt.cpp b/test/cpp/jit/test_concat_opt.cpp
index 03c0ce6a58dae..5cb73d234927e 100644
--- a/test/cpp/jit/test_concat_opt.cpp
+++ b/test/cpp/jit/test_concat_opt.cpp
@@ -1,45 +1,15 @@
 #include <gtest/gtest.h>
 
+#include <test/cpp/jit/test_utils.h>
 #include <torch/csrc/jit/ir/irparser.h>
 #include <torch/csrc/jit/passes/concat_opt.h>
+#include <torch/csrc/jit/passes/variadic_ops.h>
 #include <torch/csrc/jit/runtime/interpreter.h>
 #include <torch/csrc/jit/testing/file_check.h>
 
 namespace torch {
 namespace jit {
 
-namespace {
-
-void checkOutputs(
-    const std::vector<at::Tensor>& out1,
-    const std::vector<at::Tensor>& out2) {
-  ASSERT_EQ(out1.size(), out2.size());
-  for (size_t i = 0; i < out1.size(); ++i) {
-    ASSERT_EQ(out1[i].sizes(), out2[i].sizes());
-    float max_diff = (out1[i] - out2[i]).abs().max().item<double>();
-    ASSERT_EQ(max_diff, 0);
-  }
-}
-
-std::vector<at::Tensor> runGraph(
-    std::shared_ptr<Graph> graph,
-    const std::vector<at::Tensor> inputs) {
-  std::vector<IValue> stack = fmap<IValue>(inputs);
-  Code code(graph, "test");
-  InterpreterState(code).run(stack);
-  TORCH_INTERNAL_ASSERT(!stack.empty());
-  // Graph outputs that are handled below:
-  //   * A list of Tensors.
-  //   * 1 Tensor.
-  if (stack.front().isTensorList()) {
-    return stack.front().toTensorVector();
-  }
-  TORCH_INTERNAL_ASSERT(stack.front().isTensor());
-  return {stack.front().toTensor()};
-}
-
-} // namespace
-
 TEST(ConcatOptTest, SimpleCommonInputsEliminationPrefix) {
   auto graph = std::make_shared<Graph>();
 
@@ -64,7 +34,7 @@ TEST(ConcatOptTest, SimpleCommonInputsEliminationPrefix) {
   ASSERT_TRUE(EliminateConcatCommonInputs(graph));
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // Graph after EliminateConcatCommonInputs:
   //  graph(%0 : ...,
@@ -109,7 +79,7 @@ TEST(ConcatOptTest, SimpleCommonInputsEliminationSuffix) {
   ASSERT_TRUE(EliminateConcatCommonInputs(graph));
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // Graph after EliminateConcatCommonInputs:
   //  graph(%0 : ...,
@@ -161,7 +131,7 @@ TEST(ConcatOptTest, CommonInputsEliminationWithDifferentOrderInputs) {
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
 
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // No optimizations should have happened in this case since the inputs
   // to the `cat` are in different order.
@@ -198,7 +168,7 @@ TEST(ConcatOptTest, MoreCommonInputsElimination) {
   ASSERT_TRUE(EliminateConcatCommonInputs(graph));
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   testing::FileCheck()
       .check_count("= prim::VarConcat(%0, %1, %5)", 1, /*exactly*/ true)
@@ -233,7 +203,7 @@ TEST(ConcatOptTest, ExpandConcat) {
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
 
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // After full concat optimization we should have the following graph:
   //
@@ -289,7 +259,7 @@ TEST(ConcatOptTest, ConcatWithoutResultShape) {
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
 
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // No optimizations should have happened in this case since the output
   // shape of `aten::cat` is not known.
@@ -324,7 +294,7 @@ TEST(ConcatOptTest, ConcatWithoutInputShape) {
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
 
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // No optimizations should have happened in this case since the shape of %5,
   // which is an input to `aten::cat`, is not known.
@@ -361,7 +331,7 @@ TEST(ConcatOptTest, UseVariadicCat) {
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
 
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // After replacing `aten::cat` with `prim::VarConcat` we should have the
   // following graph:
@@ -406,7 +376,7 @@ TEST(OptimizeConcatTest, UseVariadicCatReplaceMultiple) {
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
 
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // After full concat optimization we should have the following graph:
   //
@@ -446,7 +416,7 @@ TEST(ConcatOptTest, UseVariadicCatWithMultipleListUses) {
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
 
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // After replacing `aten::cat` with `prim::VarConcat` we should have the
   // following graph:
@@ -488,7 +458,7 @@ TEST(ConcatOptTest, UseVariadicCatWithListMutationAfterCat) {
   ASSERT_TRUE(UseVariadicCat(graph));
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // The input list to `aten::cat` is mutated only after `aten::cat` op. So,
   // it should have been replaced with `prim::VarConcat`. The transformed graph
@@ -534,7 +504,7 @@ TEST(ConcatOptTest, UseVariadicCatWithListMutationBeforeCat) {
     ASSERT_FALSE(UseVariadicCat(graph));
     graph->lint();
     auto opt_outputs = runGraph(graph, inputs);
-    checkOutputs(orig_outputs, opt_outputs);
+    ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
     // No transformation should have happened since the `prim::ListConstruct` is
     // mutated before `aten::cat`.
@@ -549,7 +519,7 @@ TEST(ConcatOptTest, UseVariadicCatWithListMutationBeforeCat) {
     ASSERT_TRUE(RemoveListMutationAndUseVariadicCat(graph));
     graph->lint();
     auto opt_outputs = runGraph(graph, inputs);
-    checkOutputs(orig_outputs, opt_outputs);
+    ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
     // The mutation of the list must be removed and the `aten::cat` op must
     // be replaced with the `prim::VarConcat` op in the graph. The transformed
@@ -602,7 +572,7 @@ TEST(ConcatOptTest, UseVariadicCatWithMultipleListMutations) {
   ASSERT_TRUE(RemoveListMutationAndUseVariadicCat(graph));
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // All the mutations of the list must be removed and the `aten::cat` ops must
   // be replaced with `prim::VarConcat` ops in the graph. The transformed graph
@@ -659,7 +629,7 @@ TEST(
   ASSERT_TRUE(EliminateConcatCommonInputs(graph));
   graph->lint();
   auto opt_outputs = runGraph(graph, inputs);
-  checkOutputs(orig_outputs, opt_outputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
 
   // After performing:
   //     * Remove list mutation
diff --git a/test/cpp/jit/test_utils.cpp b/test/cpp/jit/test_utils.cpp
index 7750ba8f10fee..27667f068588b 100644
--- a/test/cpp/jit/test_utils.cpp
+++ b/test/cpp/jit/test_utils.cpp
@@ -198,6 +198,7 @@ bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor> inputs) {
   }
   return diff.abs().max().item<float>() < 2e-6 * maxValue;
 }
+
 bool almostEqual(const at::Tensor& a, const at::Tensor& b) {
   return checkRtol(a - b, {a, b});
 }
@@ -206,6 +207,20 @@ bool exactlyEqual(const at::Tensor& a, const at::Tensor& b) {
   return (a - b).abs().max().item<float>() == 0.f;
 }
 
+bool exactlyEqual(
+    const std::vector<at::Tensor>& a,
+    const std::vector<at::Tensor>& b) {
+  if (a.size() != b.size()) {
+    return false;
+  }
+  for (size_t i = 0; i < a.size(); ++i) {
+    if (!exactlyEqual(a[i], b[i])) {
+      return false;
+    }
+  }
+  return true;
+}
+
 std::pair<at::Tensor, at::Tensor> lstm(
     at::Tensor input,
     at::Tensor hx,
@@ -248,5 +263,22 @@ RegisterOperators reg({
 });
 } // namespace
 
+std::vector<at::Tensor> runGraph(
+    std::shared_ptr<Graph> graph,
+    const std::vector<at::Tensor>& inputs) {
+  std::vector<IValue> stack = fmap<IValue>(inputs);
+  Code code(graph, "test");
+  InterpreterState(code).run(stack);
+  TORCH_INTERNAL_ASSERT(!stack.empty());
+  // Graph outputs that are handled below:
+  //   * A list of Tensors.
+  //   * 1 Tensor.
+  if (stack.front().isTensorList()) {
+    return stack.front().toTensorVector();
+  }
+  TORCH_INTERNAL_ASSERT(stack.front().isTensor());
+  return {stack.front().toTensor()};
+}
+
 } // namespace jit
 } // namespace torch
diff --git a/test/cpp/jit/test_utils.h b/test/cpp/jit/test_utils.h
index 676759dca480f..5e640aed0114a 100644
--- a/test/cpp/jit/test_utils.h
+++ b/test/cpp/jit/test_utils.h
@@ -88,6 +88,13 @@ bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor> inputs);
 bool almostEqual(const at::Tensor& a, const at::Tensor& b);
 
 bool exactlyEqual(const at::Tensor& a, const at::Tensor& b);
+bool exactlyEqual(
+    const std::vector<at::Tensor>& a,
+    const std::vector<at::Tensor>& b);
+
+std::vector<at::Tensor> runGraph(
+    std::shared_ptr<Graph> graph,
+    const std::vector<at::Tensor>& inputs);
 
 std::pair<at::Tensor, at::Tensor> lstm(
     at::Tensor input,
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index e20d97333c83e..2eabbd0a8b230 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -244,6 +244,7 @@ core_sources_full_mobile_no_backend_interface = [
     "torch/csrc/jit/passes/symbolic_shape_analysis.cpp",
     "torch/csrc/jit/passes/specialize_autogradzero.cpp",
     "torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp",
+    "torch/csrc/jit/passes/variadic_ops.cpp",
     "torch/csrc/jit/passes/subgraph_rewrite.cpp",
     "torch/csrc/jit/passes/tensorexpr_fuser.cpp",
     "torch/csrc/jit/passes/utils/memory_dag.cpp",
diff --git a/torch/csrc/jit/passes/concat_opt.cpp b/torch/csrc/jit/passes/concat_opt.cpp
index aa2573ebb42f2..81c8a6745007a 100644
--- a/torch/csrc/jit/passes/concat_opt.cpp
+++ b/torch/csrc/jit/passes/concat_opt.cpp
@@ -497,95 +497,5 @@ void ExpandConcatAndEliminateRedundancy(const std::shared_ptr<Graph>& graph) {
   GRAPH_DUMP("After expanding Concat and eliminating redundancy", graph);
 }
 
-namespace {
-
-class VariadicCatUpdater {
- public:
-  explicit VariadicCatUpdater(std::shared_ptr<Graph> graph)
-      : graph_(std::move(graph)) {}
-
-  bool run() {
-    collectCatNodes(graph_->block());
-    bool changed = false;
-    for (auto c : cat_nodes_) {
-      changed = replaceWithVariadicCat(c) || changed;
-    }
-    return changed;
-  }
-
- private:
-  void collectCatNodes(Block* block) {
-    for (auto node : block->nodes()) {
-      if (node->kind() == aten::cat) {
-        cat_nodes_.push_back(node);
-      }
-      for (Block* b : node->blocks()) {
-        collectCatNodes(b);
-      }
-    }
-  }
-
-  bool replaceWithVariadicCat(Node* cat) {
-    if (cat->input(0)->node()->kind() != prim::ListConstruct) {
-      return false;
-    }
-    auto list = cat->input(0)->node();
-    // We do not transform cat ops whose list input can not be moved to the
-    // position before cat. This in turn implies that there is some mutation
-    // of the input list before cat.
-    if (!getOrCreateAliasDb()->couldMoveBeforeTopologically(list, cat)) {
-      return false;
-    }
-    std::vector<Value*> inputs = list->inputs().vec();
-    inputs.push_back(cat->input(1));
-    auto var_cat = cat->owningGraph()->create(prim::VarConcat, inputs);
-    GRAPH_UPDATE("Adding\n", *var_cat);
-    var_cat->insertBefore(cat);
-    GRAPH_UPDATE("Replacing\n", *cat, "with\n", *var_cat);
-    cat->output()->replaceAllUsesWith(var_cat->output());
-    GRAPH_UPDATE("Deleting\n", *cat);
-    cat->destroy();
-    if (!list->hasUses()) {
-      GRAPH_UPDATE("Deleting\n", *list);
-      list->destroy();
-    }
-    return true;
-  }
-
-  AliasDb* getOrCreateAliasDb() {
-    if (!aliasDb_) {
-      aliasDb_ = std::make_unique<AliasDb>(graph_);
-    }
-    return aliasDb_.get();
-  }
-
-  std::shared_ptr<Graph> graph_;
-  std::unique_ptr<AliasDb> aliasDb_ = nullptr;
-
-  std::vector<Node*> cat_nodes_;
-};
-
-} // namespace
-
-bool UseVariadicCat(const std::shared_ptr<Graph>& graph) {
-  GRAPH_DUMP("Before VariadicCat", graph);
-  bool changed = VariadicCatUpdater(graph).run();
-  if (changed) {
-    GRAPH_DUMP("After VariadicCat", graph);
-  }
-  return changed;
-}
-
-bool RemoveListMutationAndUseVariadicCat(const std::shared_ptr<Graph>& graph) {
-  bool changed_in_last_iter = true;
-  bool changed = false;
-  while (changed_in_last_iter) {
-    changed_in_last_iter = RemoveListMutation(graph);
-    changed_in_last_iter = changed_in_last_iter || UseVariadicCat(graph);
-    changed = changed || changed_in_last_iter;
-  }
-  return changed;
-}
-
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/passes/concat_opt.h b/torch/csrc/jit/passes/concat_opt.h
index b82dc25e612a4..ef4d9432438e6 100644
--- a/torch/csrc/jit/passes/concat_opt.h
+++ b/torch/csrc/jit/passes/concat_opt.h
@@ -13,12 +13,5 @@ TORCH_API bool EliminateConcatCommonInputs(const std::shared_ptr<Graph>& graph);
 TORCH_API void ExpandConcatAndEliminateRedundancy(
     const std::shared_ptr<Graph>& graph);
 
-// Replaces the `aten::cat` ops in the given graph with variadic cat ops.
-// Returns true if the graph is modified.
-TORCH_API bool UseVariadicCat(const std::shared_ptr<Graph>& graph);
-
-TORCH_API bool RemoveListMutationAndUseVariadicCat(
-    const std::shared_ptr<Graph>& graph);
-
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/passes/variadic_ops.cpp b/torch/csrc/jit/passes/variadic_ops.cpp
new file mode 100644
index 0000000000000..aeb70747b3a57
--- /dev/null
+++ b/torch/csrc/jit/passes/variadic_ops.cpp
@@ -0,0 +1,126 @@
+#include <torch/csrc/jit/passes/variadic_ops.h>
+
+#include <torch/csrc/jit/ir/alias_analysis.h>
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/remove_mutation.h>
+
+namespace torch {
+namespace jit {
+
+namespace {
+
+class VariadicUpdater {
+ public:
+  explicit VariadicUpdater(
+      std::shared_ptr<Graph> graph,
+      NodeKind op,
+      NodeKind variadic_op)
+      : graph_(std::move(graph)), op_(op), variadic_op_(variadic_op) {}
+
+  bool run() {
+    collectOpNodes(graph_->block());
+    bool changed = false;
+    for (auto n : op_nodes_) {
+      changed |= replaceWithVariadicOp(n);
+    }
+    return changed;
+  }
+
+ private:
+  void collectOpNodes(Block* block) {
+    for (auto node : block->nodes()) {
+      if (node->kind() == op_) {
+        op_nodes_.push_back(node);
+      }
+      for (Block* b : node->blocks()) {
+        collectOpNodes(b);
+      }
+    }
+  }
+
+  bool replaceWithVariadicOp(Node* op_node) {
+    if (op_node->input(0)->node()->kind() != prim::ListConstruct) {
+      return false;
+    }
+    auto list = op_node->input(0)->node();
+    // We do not transform ops whose list input can not be moved to the
+    // position before op. This in turn implies that there is some mutation
+    // of the input list before op.
+    if (!getOrCreateAliasDb()->couldMoveBeforeTopologically(list, op_node)) {
+      return false;
+    }
+    std::vector<Value*> inputs = list->inputs().vec();
+    // Add non-list inputs
+    for (size_t i = 1; i < op_node->inputs().size(); ++i) {
+      inputs.push_back(op_node->input(i));
+    }
+    auto var_op_node = op_node->owningGraph()->create(variadic_op_, inputs);
+    GRAPH_UPDATE("Adding\n", *var_op_node);
+    var_op_node->insertBefore(op_node);
+    GRAPH_UPDATE("Replacing\n", *op_node, "with\n", *var_op_node);
+    op_node->output()->replaceAllUsesWith(var_op_node->output());
+    GRAPH_UPDATE("Deleting\n", *op_node);
+    op_node->destroy();
+    if (!list->hasUses()) {
+      GRAPH_UPDATE("Deleting\n", *list);
+      list->destroy();
+    }
+    return true;
+  }
+
+  AliasDb* getOrCreateAliasDb() {
+    if (!aliasDb_) {
+      aliasDb_ = std::make_unique<AliasDb>(graph_);
+    }
+    return aliasDb_.get();
+  }
+
+  std::shared_ptr<Graph> graph_;
+  std::unique_ptr<AliasDb> aliasDb_ = nullptr;
+
+  std::vector<Node*> op_nodes_;
+
+  NodeKind op_;
+  NodeKind variadic_op_;
+};
+
+} // namespace
+
+bool UseVariadicOp(
+    const std::shared_ptr<Graph>& graph,
+    NodeKind op,
+    NodeKind variadic_op) {
+  const std::string pass_name = std::string("variadic ") + op.toQualString();
+  GRAPH_DUMP("Before " + pass_name, graph);
+  bool changed = VariadicUpdater(graph, op, variadic_op).run();
+  if (changed) {
+    GRAPH_DUMP("After " + pass_name, graph);
+  }
+  return changed;
+}
+
+bool RemoveListMutationAndUseVariadicOp(
+    const std::shared_ptr<Graph>& graph,
+    NodeKind op,
+    NodeKind variadic_op) {
+  bool changed_in_last_iter = true;
+  bool changed = false;
+  while (changed_in_last_iter) {
+    changed_in_last_iter = RemoveListMutation(graph);
+    changed_in_last_iter =
+        UseVariadicOp(graph, op, variadic_op) || changed_in_last_iter;
+    changed = changed || changed_in_last_iter;
+  }
+  return changed;
+}
+
+bool UseVariadicCat(const std::shared_ptr<Graph>& graph) {
+  return UseVariadicOp(graph, aten::cat, prim::VarConcat);
+}
+
+bool RemoveListMutationAndUseVariadicCat(const std::shared_ptr<Graph>& graph) {
+  return RemoveListMutationAndUseVariadicOp(graph, aten::cat, prim::VarConcat);
+}
+
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/passes/variadic_ops.h b/torch/csrc/jit/passes/variadic_ops.h
new file mode 100644
index 0000000000000..1c52e9513ae2b
--- /dev/null
+++ b/torch/csrc/jit/passes/variadic_ops.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <torch/csrc/jit/ir/ir.h>
+
+namespace torch {
+namespace jit {
+
+// Replaces the `aten::cat` ops in the given graph with variadic cat ops.
+// Returns true if the graph is modified.
+TORCH_API bool UseVariadicCat(const std::shared_ptr<Graph>& graph);
+
+TORCH_API bool RemoveListMutationAndUseVariadicCat(
+    const std::shared_ptr<Graph>& graph);
+
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index 1ee69a642384f..4219be5040ba7 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -9,11 +9,11 @@
 #include <caffe2/core/timer.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/passes/canonicalize.h>
-#include <torch/csrc/jit/passes/concat_opt.h>
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
 #include <torch/csrc/jit/passes/freeze_module.h>
 #include <torch/csrc/jit/passes/remove_mutation.h>
 #include <torch/csrc/jit/passes/subgraph_rewrite.h>
+#include <torch/csrc/jit/passes/variadic_ops.h>
 #include <torch/csrc/jit/runtime/static/ops.h>
 #include <torch/csrc/jit/runtime/static/passes.h>
 #include <torch/csrc/jit/runtime/vararg_functions.h>

From 172e5c76ab05f1a137eb065b7f221a20eaef514a Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Mon, 23 Aug 2021 17:28:33 -0700
Subject: [PATCH 145/530] Fix some memory bugs in onnx passes (#63754)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63754

Running onnx tests with ASAN uncovers several memory errors.  These two are caused by: (1) iterating the uses list of a node after mutation, and (2) accessing the `blocks` attribute of a possibly deleted node.

To reproduce (this is on a CentOS 7 box):
```
DEBUG=1 CFLAGS="-fsanitize=address" CXXFLAGS="-fsanitize=address" USE_LLVM=$(realpath ../llvm-project/install) CMAKE_PREFIX_PATH=$CONDA_PREFIX python setup.py install
LD_PRELOAD=$(realpath /lib64/libasan.so.5) numactl -C3 pytest -v --cov --cov-report xml:test/coverage.xml --cov-append onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset11 -s
```

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30493939

Pulled By: bertmaher

fbshipit-source-id: e16e19dc9b4c9896e102ca8bf04c8bedfdde87af
---
 .../jit/passes/onnx/list_model_parameters.cpp |  6 ++-
 .../passes/onnx/pattern_conversion/common.cpp |  4 +-
 .../onnx/remove_inplace_ops_for_onnx.cpp      | 45 +++++++++++--------
 3 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/torch/csrc/jit/passes/onnx/list_model_parameters.cpp b/torch/csrc/jit/passes/onnx/list_model_parameters.cpp
index ccadf53713466..9c751bbae9e12 100644
--- a/torch/csrc/jit/passes/onnx/list_model_parameters.cpp
+++ b/torch/csrc/jit/passes/onnx/list_model_parameters.cpp
@@ -76,6 +76,7 @@ std::vector<IValue> getParamAttributes(
   WithInsertPoint guard(m);
 
   std::vector<IValue> parameterIValues = {};
+  std::unordered_set<Node*> nodesToDestroy;
   for (auto it = block->nodes().begin(); it != block->nodes().end();) {
     Node* n = *it;
     it++; // node n can be destroyed
@@ -142,7 +143,7 @@ std::vector<IValue> getParamAttributes(
           // This attr is constant for ONNX.
           auto attrVal = tryInsertConstant(*graph, attr);
           n->output()->replaceAllUsesWith(*attrVal);
-          n->destroy();
+          nodesToDestroy.emplace(n);
         }
       }
     }
@@ -156,6 +157,9 @@ std::vector<IValue> getParamAttributes(
           std::end(nextParameterIValues));
     }
   }
+  for (auto n : nodesToDestroy) {
+    n->destroy();
+  }
   return parameterIValues;
 }
 
diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp
index 2854c3ab2fe2e..bc646308424b0 100644
--- a/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp
+++ b/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp
@@ -4,8 +4,8 @@ namespace torch {
 namespace jit {
 
 bool IndexingPatternFinder::IsSameSource(const Node* n, const Node* m) {
-  const auto& source_n = n->sourceRange().source();
-  const auto& source_m = m->sourceRange().source();
+  const auto source_n = n->sourceRange().source();
+  const auto source_m = m->sourceRange().source();
   return (
       (source_n->text() == source_m->text()) &&
       (source_n->starting_line_no() == source_m->starting_line_no()));
diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp
index 913f4dc2b6edb..2cef76a7391ae 100644
--- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp
+++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp
@@ -317,26 +317,33 @@ static void PrepareForRemoveMutations(MutationRemover& mr, Block* b) {
   }
 
   for (auto input : b->inputs()) {
-    for (auto use : input->uses()) {
-      Node* node = use.user;
-      if (!mr.inplaceOpVariant(node)) {
-        continue;
-      }
-      auto it = std::find(node->inputs().begin(), node->inputs().end(), input);
-      if (it != node->inputs().end()) {
-        int index = std::distance(node->inputs().begin(), it);
-        std::cerr << "Warning: ONNX Preprocess - Removing mutation from node "
-                  << node->kind().toQualString() << " on block input: '"
-                  << (*it)->debugName() << "'. This changes graph semantics."
-                  << std::endl;
-
-        Node* newNode =
-            addDummyClone(b->owningGraph(), input, false, b->return_node());
-        TORCH_INTERNAL_ASSERT(nullptr != newNode);
-        node->replaceInput(index, newNode->output());
-        input->replaceAllUsesAfterNodeWith(node, newNode->output());
+    bool needsRestart = false;
+    do {
+      needsRestart = false;
+      for (auto use : input->uses()) {
+        Node* node = use.user;
+        if (!mr.inplaceOpVariant(node)) {
+          continue;
+        }
+        auto it =
+            std::find(node->inputs().begin(), node->inputs().end(), input);
+        if (it != node->inputs().end()) {
+          int index = std::distance(node->inputs().begin(), it);
+          std::cerr << "Warning: ONNX Preprocess - Removing mutation from node "
+                    << node->kind().toQualString() << " on block input: '"
+                    << (*it)->debugName() << "'. This changes graph semantics."
+                    << std::endl;
+
+          Node* newNode =
+              addDummyClone(b->owningGraph(), input, false, b->return_node());
+          TORCH_INTERNAL_ASSERT(nullptr != newNode);
+          node->replaceInput(index, newNode->output());
+          input->replaceAllUsesAfterNodeWith(node, newNode->output());
+          needsRestart = true;
+          break;
+        }
       }
-    }
+    } while (needsRestart);
   }
 }
 

From d9231dc3dfb9c218bbd62355f9a0349be2ceca75 Mon Sep 17 00:00:00 2001
From: driazati <driazati@users.noreply.github.com>
Date: Mon, 23 Aug 2021 17:30:51 -0700
Subject: [PATCH 146/530] Skip archiving useless build artifacts (#63785)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63785

We currently zip up everything in `build/` which includes a lot of cruft (`.o` files, random things copied in from dependencies, etc). This makes the artifact bigger (slower upload/download times, and takes about 1.5 minutes to archive). This change makes archiving instead take ~15 seconds and removes the 50 second upload to GitHub step that isn't as useful now that we have the HUD PR page that lists out all artifacts.

Test Plan: Imported from OSS

Reviewed By: seemethere, janeyx99

Differential Revision: D30494444

Pulled By: driazati

fbshipit-source-id: 93202dba7387daeb4859a938110b02ff2dc2ccc4
---
 .github/templates/linux_ci_workflow.yml.j2          | 13 +------------
 .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml  | 13 +------------
 .../generated-linux-bionic-py3.8-gcc9-coverage.yml  | 13 +------------
 .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml  | 13 +------------
 .../generated-linux-xenial-cuda11.1-py3.6-gcc7.yml  | 13 +------------
 .../generated-linux-xenial-py3.6-gcc5.4.yml         | 13 +------------
 ...ed-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml | 13 +------------
 7 files changed, 7 insertions(+), 84 deletions(-)

diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 25099b46cbc8e..767760bf24d25 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -220,18 +220,7 @@ jobs:
       {%- if not is_libtorch %}
       - name: Archive artifacts into zip
         run: |
-          zip -r artifacts.zip dist/ build/ .pytorch-test-times.json
-      # Upload to github so that people can click and download artifacts
-      - uses: actions/upload-artifact@v2
-        # Don't fail on upload to GH since it's only for user convenience
-        continue-on-error: true
-        name: Store PyTorch Build Artifacts on Github
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            artifacts.zip
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
       - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
         name: Store PyTorch Build Artifacts on S3
         with:
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 67bb2064863bc..61a817ea64bc1 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -199,18 +199,7 @@ jobs:
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Archive artifacts into zip
         run: |
-          zip -r artifacts.zip dist/ build/ .pytorch-test-times.json
-      # Upload to github so that people can click and download artifacts
-      - uses: actions/upload-artifact@v2
-        # Don't fail on upload to GH since it's only for user convenience
-        continue-on-error: true
-        name: Store PyTorch Build Artifacts on Github
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            artifacts.zip
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
       - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
         name: Store PyTorch Build Artifacts on S3
         with:
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 59061662341a3..f07b8712b6ea1 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -199,18 +199,7 @@ jobs:
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Archive artifacts into zip
         run: |
-          zip -r artifacts.zip dist/ build/ .pytorch-test-times.json
-      # Upload to github so that people can click and download artifacts
-      - uses: actions/upload-artifact@v2
-        # Don't fail on upload to GH since it's only for user convenience
-        continue-on-error: true
-        name: Store PyTorch Build Artifacts on Github
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            artifacts.zip
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
       - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
         name: Store PyTorch Build Artifacts on S3
         with:
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index d897e28ace007..cb8c6b55b1789 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -199,18 +199,7 @@ jobs:
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Archive artifacts into zip
         run: |
-          zip -r artifacts.zip dist/ build/ .pytorch-test-times.json
-      # Upload to github so that people can click and download artifacts
-      - uses: actions/upload-artifact@v2
-        # Don't fail on upload to GH since it's only for user convenience
-        continue-on-error: true
-        name: Store PyTorch Build Artifacts on Github
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            artifacts.zip
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
       - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
         name: Store PyTorch Build Artifacts on S3
         with:
diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 30514cab07ea4..4275cc31ebddd 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -199,18 +199,7 @@ jobs:
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Archive artifacts into zip
         run: |
-          zip -r artifacts.zip dist/ build/ .pytorch-test-times.json
-      # Upload to github so that people can click and download artifacts
-      - uses: actions/upload-artifact@v2
-        # Don't fail on upload to GH since it's only for user convenience
-        continue-on-error: true
-        name: Store PyTorch Build Artifacts on Github
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            artifacts.zip
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
       - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
         name: Store PyTorch Build Artifacts on S3
         with:
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index bd4d65027c066..e3be43370a777 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -199,18 +199,7 @@ jobs:
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Archive artifacts into zip
         run: |
-          zip -r artifacts.zip dist/ build/ .pytorch-test-times.json
-      # Upload to github so that people can click and download artifacts
-      - uses: actions/upload-artifact@v2
-        # Don't fail on upload to GH since it's only for user convenience
-        continue-on-error: true
-        name: Store PyTorch Build Artifacts on Github
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            artifacts.zip
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
       - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
         name: Store PyTorch Build Artifacts on S3
         with:
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 5fd0e99f002f7..0282b206a117c 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -197,18 +197,7 @@ jobs:
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Archive artifacts into zip
         run: |
-          zip -r artifacts.zip dist/ build/ .pytorch-test-times.json
-      # Upload to github so that people can click and download artifacts
-      - uses: actions/upload-artifact@v2
-        # Don't fail on upload to GH since it's only for user convenience
-        continue-on-error: true
-        name: Store PyTorch Build Artifacts on Github
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            artifacts.zip
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
       - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
         name: Store PyTorch Build Artifacts on S3
         with:

From fc474979055c99d9aa48e1ee8d0c7a33a2a67daf Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 23 Aug 2021 17:39:45 -0700
Subject: [PATCH 147/530] Simplify ccache instructions in CONTRIBUTING.md
 (#62549)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62549

When building CUDA files with native CMake support, it will respect the
`CMAKE_CUDA_COMPILER_LAUNCHER` setting. So, there's no need for symlinks.

Test Plan: Imported from OSS

Reviewed By: bdhirsh

Differential Revision: D30498488

Pulled By: malfet

fbshipit-source-id: 71c2ae9d4570cfac2a64d777bc95cda3764332a0
---
 CONTRIBUTING.md | 112 +++++++++++-------------------------------------
 1 file changed, 24 insertions(+), 88 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e1a049cf9a979..baafcefdc59fe 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -734,111 +734,47 @@ succeed.
 
 #### Use CCache
 
-Even when dependencies are tracked with file modification,
-there are many situations where files get rebuilt when a previous
-compilation was exactly the same.
-
-Using ccache in a situation like this is a real time-saver. The ccache manual
-describes [two ways to use ccache](https://ccache.samba.org/manual/latest.html#_run_modes).
-In the PyTorch project, currently only the latter method of masquerading as
-the compiler via symlinks works for CUDA compilation.
-
-Here are the instructions for installing ccache from source (tested at commit
-`3c302a7` of the `ccache` repo):
+Even when dependencies are tracked with file modification, there are many
+situations where files get rebuilt when a previous compilation was exactly the
+same. Using ccache in a situation like this is a real time-saver.
 
+Before building pytorch, install ccache from your package manager of choice:
 ```bash
-#!/bin/bash
-
-if ! ls ~/ccache/bin/ccache
-then
-    set -ex
-    sudo apt-get update
-    sudo apt-get install -y cmake
-    mkdir -p ~/ccache
-    pushd ~/ccache
-    rm -rf ccache
-    git clone https://github.com/ccache/ccache.git
-    mkdir -p ccache/build
-    pushd ccache/build
-    cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ccache -DENABLE_TESTING=OFF -DZSTD_FROM_INTERNET=ON ..
-    make -j$(nproc) install
-    popd
-    popd
-
-    mkdir -p ~/ccache/lib
-    mkdir -p ~/ccache/cuda
-    ln -s ~/ccache/bin/ccache ~/ccache/lib/cc
-    ln -s ~/ccache/bin/ccache ~/ccache/lib/c++
-    ln -s ~/ccache/bin/ccache ~/ccache/lib/gcc
-    ln -s ~/ccache/bin/ccache ~/ccache/lib/g++
-    ln -s ~/ccache/bin/ccache ~/ccache/cuda/nvcc
-
-    ~/ccache/bin/ccache -M 25Gi
-fi
-
-export PATH=~/ccache/lib:$PATH
-export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc
+conda install ccache -f conda-forge
+sudo apt install ccache
+sudo yum install ccache
+brew install ccache
 ```
 
-Alternatively, `ccache` provided by newer Linux distributions (e.g. Debian/sid)
-also works, but the `nvcc` symlink to `ccache` as described above is still required.
-
-Note that the original `nvcc` binary (typically at `/usr/local/cuda/bin`) must
-be on your `PATH`, otherwise `ccache` will emit the following error:
-
-    ccache: error: Could not find compiler "nvcc" in PATH
-
-For example, here is how to install/configure `ccache` on Ubuntu:
+You may also find the default cache size in ccache is too small to be useful.
+The cache sizes can be increased from the command line:
 
 ```bash
-# install ccache
-sudo apt install ccache
-
-# update symlinks and create/re-create nvcc link
-sudo /usr/sbin/update-ccache-symlinks
-sudo ln -s /usr/bin/ccache /usr/lib/ccache/nvcc
-
 # config: cache dir is ~/.ccache, conf file ~/.ccache/ccache.conf
 # max size of cache
 ccache -M 25Gi  # -M 0 for unlimited
 # unlimited number of files
 ccache -F 0
-
-# deploy (and add to ~/.bashrc for later)
-export PATH="/usr/lib/ccache:$PATH"
 ```
 
-It is also possible to install `ccache` via `conda` by installing it from the
-community-maintained `conda-forge` channel. Here is how to set up `ccache` this
-way:
+To check this is working, do two clean builds of pytorch in a row. The second
+build should be substantially and noticeably faster than the first build. If
+this doesn't seem to be the case, check the `CMAKE_<LANG>_COMPILER_LAUNCHER`
+rules in `build/CMakeCache.txt`, where `<LANG>` is `C`, `CXX` and `CUDA`.
+Each of these 3 variables should contain ccache, e.g.
+```
+//CXX compiler launcher
+CMAKE_CXX_COMPILER_LAUNCHER:STRING=/usr/bin/ccache
+```
 
+If not, you can define these variables on the command line before invoking `setup.py`.
 ```bash
-# install ccache
-conda install -c conda-forge ccache
-
-# set up ccache compiler symlinks
-mkdir ~/ccache
-mkdir ~/ccache/lib
-mkdir ~/ccache/cuda
-ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/cc
-ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/c++
-ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/gcc
-ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/g++
-ln -s $CONDA_PREFIX/bin/ccache ~/ccache/cuda/nvcc
-
-# update PATH to reflect symlink locations, consider
-# adding this to your .bashrc
-export PATH=~/ccache/lib:$PATH
-export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc
-
-# increase ccache cache size to 25 GiB
-ccache -M 25Gi
+export CMAKE_C_COMPILER_LAUNCHER=ccache
+export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
+python setup.py develop
 ```
 
-To check this is working, do two clean builds of pytorch in a row. The second
-build should be substantially and noticeably faster than the first build. If this doesn't seem to be the case, check that each of the symlinks above actually link to your installation of `ccache`. For example, if you followed the first option and installed `ccache` from source on a Linux machine, running `readlink -e $(which g++)` should return `~/ccache/bin/ccache`.
-
-
 #### Use a faster linker
 If you are editing a single file and rebuilding in a tight loop, the time spent
 linking will dominate. The system linker available in most Linux distributions

From e4f44bec27bc458c5dc8021ed87f0e6fae904ef4 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 23 Aug 2021 17:39:50 -0700
Subject: [PATCH 148/530] Fix pocketfft include path in mobile build (#63714)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63714

PocketFFT was disabled for CMake < 3.9 but CMake 3.11 is the first version to support `INCLUDE_DIRECTORIES` as a target property. So updating to CMake 3.10 causes the mobile builds to fail. Instead of limiting the CMake support, this just adds the include directory to the entire target,

Test Plan: Imported from OSS

Reviewed By: bdhirsh

Differential Revision: D30498369

Pulled By: malfet

fbshipit-source-id: 83372e29c477c97e7015763b7c29d6d7e456bcef
---
 caffe2/CMakeLists.txt    | 16 +++++++++++-----
 cmake/Dependencies.cmake |  7 ++++---
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 523fea8181cf8..67ab08f9b0fc5 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -529,11 +529,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/tensorexpr/llvm_codegen.cpp PROPERTIES COMPILE_FLAGS -Wno-init-list-lifetime)
   endif()
 
-  # Pass path to PocketFFT
-  if(AT_POCKETFFT_ENABLED)
-    set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/mkl/SpectralOps.cpp PROPERTIES INCLUDE_DIRECTORIES "${POCKETFFT_INCLUDE_DIR}")
-  endif()
-
   if(NOT INTERN_DISABLE_MOBILE_INTERP)
     set(MOBILE_SRCS
        ${TORCH_SRC_DIR}/csrc/jit/mobile/function.cpp
@@ -795,6 +790,17 @@ if(USE_PRECOMPILED_HEADERS)
       PROPERTIES SKIP_PRECOMPILE_HEADERS ON)
 endif()
 
+# Pass path to PocketFFT
+if(AT_POCKETFFT_ENABLED)
+  if(CMAKE_VERSION VERSION_LESS "3.11")
+    target_include_directories(torch_cpu PRIVATE "${POCKETFFT_INCLUDE_DIR}")
+  else()
+    set_source_files_properties(
+        "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/mkl/SpectralOps.cpp"
+        PROPERTIES INCLUDE_DIRECTORIES "${POCKETFFT_INCLUDE_DIR}")
+  endif()
+endif()
+
 if(CMAKE_COMPILER_IS_GNUCXX AND BUILD_LIBTORCH_CPU_WITH_DEBUG)
   # To enable debug fission we need to build libtorch_cpu with debug info on,
   # but this increases link time and peak memory usage if we use the
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 3e37c3538f6fd..b3cc23ccac8f4 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -242,14 +242,15 @@ endif()
 
 # --- [ PocketFFT
 set(AT_POCKETFFT_ENABLED 0)
-if(NOT MKL_FOUND)
+if(NOT AT_MKL_ENABLED)
   find_path(POCKETFFT_INCLUDE_DIR NAMES pocketfft_hdronly.h PATHS
             /usr/local/include
-            "$ENV{POCKETFFT_HOME}"
+            ENV POCKETFFT_HOME
             "${PROJECT_SOURCE_DIR}/third_party/pocketfft"
            )
-  if(POCKETFFT_INCLUDE_DIR AND CMAKE_VERSION VERSION_GREATER "3.9")
+  if(POCKETFFT_INCLUDE_DIR)
     set(AT_POCKETFFT_ENABLED 1)
+    message(STATUS "Using pocketfft in directory: ${POCKETFFT_INCLUDE_DIR}")
   endif()
 endif()
 

From 630ec2e190fe866e8b5c87844d6dd09bf134aac5 Mon Sep 17 00:00:00 2001
From: Shiyan Deng <dsy842974287@fb.com>
Date: Mon, 23 Aug 2021 17:41:38 -0700
Subject: [PATCH 149/530] [fx_acc] Add mapper for torch.log1p (#63792)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63792

Map `torch.log1p` to `acc_ops.add` + `acc_ops.log`.

Test Plan: buck test mode/opt glow/fb/fx/oss_acc_tracer:test_acc_tracer -- test_log1p

Reviewed By: wushirong

Differential Revision: D30491706

fbshipit-source-id: bcbeddf06131113185d2019cfd7cf5e9193a8a78
---
 torch/fx/experimental/fx_acc/acc_ops.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py
index 7c9520660ef77..0c0965a430afd 100644
--- a/torch/fx/experimental/fx_acc/acc_ops.py
+++ b/torch/fx/experimental/fx_acc/acc_ops.py
@@ -509,6 +509,21 @@ def div(*, input, other):
 def relu(*, input, inplace=False):
     return nn.functional.relu(**locals())
 
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.log1p),
+    arg_replacement_tuples=[
+        ("input", "input"),
+    ],
+)
+def torch_log1p_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node:
+    with node.graph.inserting_before(node):
+        add_kwargs = {"input": node.kwargs["input"], "other": 1}
+        add_node = node.graph.call_function(add, kwargs=add_kwargs)
+        add_node.meta = node.meta.copy()
+        log_kwargs = {"input": add_node}
+        log_node = node.graph.call_function(log, kwargs=log_kwargs)
+        log_node.meta = node.meta.copy()
+        return log_node
 
 @register_custom_acc_mapper_fn(
     op_and_target=("call_method", "sum"),

From 16a44344229e89ed8c275503580c7a6a4997b9bd Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Mon, 23 Aug 2021 17:45:39 -0700
Subject: [PATCH 150/530] [BE] Enable functional optim tests for windows
 (#63462)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63462

Now that `torch.distributed.optim` gates DistributedOptimizer on RPC availability, these tests can be run on windows.
ghstack-source-id: 136437635

Test Plan: CI

Reviewed By: SciPioneer

Differential Revision: D30358923

fbshipit-source-id: 36739bdfe7214789f17de652d30c62c2bc124c73
---
 test/distributed/test_c10d_nccl.py            |  8 +++-----
 test/test_functional_optim.py                 | 16 +---------------
 .../_internal/distributed/distributed_test.py | 19 ++++---------------
 3 files changed, 8 insertions(+), 35 deletions(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index e42c5c6be1759..1378aa07f0903 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -39,7 +39,6 @@
     with_nccl_blocking_wait,
 )
 from torch.testing._internal.common_utils import (
-    IS_WINDOWS,
     TestCase,
     run_tests,
     retry_on_connect_failures,
@@ -51,10 +50,9 @@
 from torch.utils.checkpoint import checkpoint
 from torch.distributed.optim import functional_optim_map
 
-if not IS_WINDOWS:
-    from torch.distributed.optim.functional_sgd import _FunctionalSGD
-    from torch.distributed.optim.functional_adam import _FunctionalAdam
-    from torch.distributed.optim.functional_adamw import _FunctionalAdamW
+from torch.distributed.optim.functional_sgd import _FunctionalSGD
+from torch.distributed.optim.functional_adam import _FunctionalAdam
+from torch.distributed.optim.functional_adamw import _FunctionalAdamW
 
 if TEST_WITH_DEV_DBG_ASAN:
     print(
diff --git a/test/test_functional_optim.py b/test/test_functional_optim.py
index 98a3f06805dba..accc72058578d 100644
--- a/test/test_functional_optim.py
+++ b/test/test_functional_optim.py
@@ -1,10 +1,8 @@
-import unittest
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.optim import SGD, Adam, AdamW
-from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS
+from torch.testing._internal.common_utils import TestCase, run_tests
 from torch.distributed.optim import functional_optim_map
 
 class MyModule(torch.nn.Module):
@@ -80,24 +78,12 @@ def _test_functional_optim_parity(self, optim_cls, *args, **kwargs):
                 self.assertNotEqual(old_module_optim_params[i], optim_param)
                 self.assertNotEqual(old_module_functional_params[i], functional_param)
 
-    @unittest.skipIf(
-        IS_WINDOWS,
-        "Functional optimizer not support on windows, see https://github.com/pytorch/pytorch/issues/62137",
-    )
     def test_functional_optim_parity_sgd(self):
         self._test_functional_optim_parity(SGD, 1e-2, momentum=0.9, weight_decay=0.01)
 
-    @unittest.skipIf(
-        IS_WINDOWS,
-        "Functional optimizer not support on windows, see https://github.com/pytorch/pytorch/issues/62137",
-    )
     def test_functional_optim_parity_adam(self):
         self._test_functional_optim_parity(Adam, 1e-2, betas=(0.9, 0.999), eps=1e-6)
 
-    @unittest.skipIf(
-        IS_WINDOWS,
-        "Functional optimizer not support on windows, see https://github.com/pytorch/pytorch/issues/62137",
-    )
     def test_functional_optim_parity_adam_w(self):
         self._test_functional_optim_parity(AdamW, 1e-2, betas=(0.9, 0.999), eps=1e-6)
 
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 096b7182851c3..aa8841d3266bc 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -68,11 +68,12 @@
 
 from torch.distributed.optim import functional_optim_map
 
+from torch.distributed.optim.functional_sgd import _FunctionalSGD
+from torch.distributed.optim.functional_adam import _FunctionalAdam
+from torch.distributed.optim.functional_adamw import _FunctionalAdamW
+
 if not IS_WINDOWS:
     import torch.distributed.optim.post_localSGD_optimizer as post_localSGD_optimizer
-    from torch.distributed.optim.functional_sgd import _FunctionalSGD
-    from torch.distributed.optim.functional_adam import _FunctionalAdam
-    from torch.distributed.optim.functional_adamw import _FunctionalAdamW
 
 from torch.utils.data.distributed import DistributedSampler
 
@@ -4003,10 +4004,6 @@ def _test_ddp_hook_with_optimizer_parity(
             BACKEND != "nccl" and BACKEND != "gloo",
             "Only Nccl & Gloo backend support DistributedDataParallel",
         )
-        @sandcastle_skip_if(
-            IS_WINDOWS,
-            "FunctionalAdam not yet supported with Windows, see https://github.com/pytorch/pytorch/issues/62137"
-        )
         @skip_if_lt_x_gpu(2)
         @skip_if_rocm
         def test_ddp_hook_with_optimizer_parity_adamw(self):
@@ -4029,10 +4026,6 @@ def test_ddp_hook_with_optimizer_parity_adamw(self):
             BACKEND != "nccl" and BACKEND != "gloo",
             "Only Nccl & Gloo backend support DistributedDataParallel",
         )
-        @sandcastle_skip_if(
-            IS_WINDOWS,
-            "FunctionalAdam not yet supported with Windows, see https://github.com/pytorch/pytorch/issues/62137"
-        )
         @skip_if_lt_x_gpu(2)
         @skip_if_rocm
         def test_ddp_hook_with_optimizer_parity_adam(self):
@@ -4055,10 +4048,6 @@ def test_ddp_hook_with_optimizer_parity_adam(self):
             BACKEND != "nccl" and BACKEND != "gloo",
             "Only Nccl & Gloo backend support DistributedDataParallel",
         )
-        @sandcastle_skip_if(
-            IS_WINDOWS,
-            "FunctionalSGD not yet supported with Windows, see https://github.com/pytorch/pytorch/issues/62137"
-        )
         @skip_if_lt_x_gpu(2)
         @skip_if_rocm
         def test_ddp_hook_with_optimizer_parity_sgd(self):

From fc07489ec52e628c10500f0ffeba0cda1cea1b49 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Mon, 23 Aug 2021 17:45:39 -0700
Subject: [PATCH 151/530] [BE] Enable PostLocalSGD tests on windows (#63463)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63463

Now that `torch.distributed.optim` gates DistributedOptimizer on RPC availability, local sgd optimizer can be used on windows.
ghstack-source-id: 136437632

Test Plan: Ci

Reviewed By: SciPioneer

Differential Revision: D30358922

fbshipit-source-id: 9b56aebf1075f026637296d338805ad8851c9d40
---
 torch/testing/_internal/distributed/distributed_test.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index aa8841d3266bc..1631983d32ec7 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -72,8 +72,7 @@
 from torch.distributed.optim.functional_adam import _FunctionalAdam
 from torch.distributed.optim.functional_adamw import _FunctionalAdamW
 
-if not IS_WINDOWS:
-    import torch.distributed.optim.post_localSGD_optimizer as post_localSGD_optimizer
+import torch.distributed.optim.post_localSGD_optimizer as post_localSGD_optimizer
 
 from torch.utils.data.distributed import DistributedSampler
 
@@ -4610,9 +4609,6 @@ def _test_DistributedDataParallel_SyncBatchNorm(
             BACKEND != "nccl" and BACKEND != "gloo",
             "Only NCCL and GLOO backend support DistributedDataParallel",
         )
-        @sandcastle_skip_if(
-            IS_WINDOWS, "PostLocalSGDOptimizer not yet supported with Windows."
-        )
         def test_post_localSGD_optimizer_parity(self, grad_is_view=False):
             learning_rate = 0.03
             period = 4

From e1bdebf6858e16127673d1a362144089f99e6ec9 Mon Sep 17 00:00:00 2001
From: Vitaly Fedyunin <vitaly.fedyunin@gmail.com>
Date: Mon, 23 Aug 2021 18:07:37 -0700
Subject: [PATCH 152/530] Adding DataLoader2 class as future replacement of
 DataLoader (#63742)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63742

Supports sharding and batching on loader level**

Supports sharding and batching on loader level

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30494506

Pulled By: VitalyFedyunin

fbshipit-source-id: 6648e09d955055ac38e3a4e3973f701acefca762
---
 test/test_dataloader.py                     | 37 ++++++++-
 torch/utils/data/__init__.py                |  6 +-
 torch/utils/data/dataloader_experimental.py | 89 +++++++++++++++++++++
 3 files changed, 129 insertions(+), 3 deletions(-)
 create mode 100644 torch/utils/data/dataloader_experimental.py

diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index c68d7e2e14b33..01136b9e4bb07 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -13,9 +13,20 @@
 import warnings
 import tempfile
 from torch import multiprocessing as mp
-from torch.utils.data import _utils, Dataset, IterableDataset, TensorDataset, DataLoader, ConcatDataset, ChainDataset, Subset
+from torch.utils.data import (
+    ChainDataset,
+    ConcatDataset,
+    DataLoader,
+    DataLoader2,
+    Dataset,
+    IterableDataset,
+    Subset,
+    TensorDataset,
+    _utils
+)
 from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL
 from torch.utils.data.dataset import random_split
+from torch.utils.data.datapipes.iter import IterableAsDataPipe
 from torch._utils import ExceptionWrapper
 from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS,
                                                   IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest,
@@ -33,6 +44,17 @@
     else:
         warnings.warn(err_msg)
 
+try:
+    import dill
+    # XXX: By default, dill writes the Pickler dispatch table to inject its
+    # own logic there. This globally affects the behavior of the standard library
+    # pickler for any user who transitively depends on this module!
+    # Undo this extension to avoid altering the behavior of the pickler globally.
+    dill.extend(use_dill=False)
+    HAS_DILL = True
+except ImportError:
+    HAS_DILL = False
+skipIfNoDill = unittest.skipIf(not HAS_DILL, "no dill")
 
 # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for
 # sharding on sandcastle. This line silences flake warnings
@@ -1934,6 +1956,19 @@ def test_excessive_thread_creation_warning(self):
             dataloader = DataLoader(self.dataset, batch_size=2, num_workers=1000)
 
 
+@unittest.skipIf(
+    TEST_WITH_TSAN,
+    "Fails with TSAN with the following error: starting new threads after multi-threaded "
+    "fork is not supported. Dying (set die_after_fork=0 to override)")
+class TestDataLoader2(TestCase):
+    @skipIfNoDill
+    def test_basics(self):
+        dp = IterableAsDataPipe(list(range(10)))
+        dl = DataLoader(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2)
+        dl2 = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2)
+        self.assertEquals(list(dl), list(dl2))
+
+
 class StringDataset(Dataset):
     def __init__(self):
         self.s = '12345'
diff --git a/torch/utils/data/__init__.py b/torch/utils/data/__init__.py
index 1d18b7b030894..0af9e6193af3d 100644
--- a/torch/utils/data/__init__.py
+++ b/torch/utils/data/__init__.py
@@ -11,9 +11,9 @@
 from torch.utils.data.dataset import (
     ChainDataset,
     ConcatDataset,
+    DataChunk,
     Dataset,
     Dataset as MapDataPipe,
-    DataChunk,
     IterableDataset,
     IterableDataset as IterDataPipe,
     Subset,
@@ -34,11 +34,14 @@
     runtime_validation,
     runtime_validation_disabled,
 )
+from torch.utils.data.dataloader_experimental import DataLoader2
+
 
 __all__ = ['BatchSampler',
            'ChainDataset',
            'ConcatDataset',
            'DataLoader',
+           'DataLoader2',
            'Dataset',
            'DistributedSampler',
            'IterDataPipe',
@@ -68,4 +71,3 @@
 ################################################################################
 # import subpackage
 ################################################################################
-from torch.utils.data import datapipes
diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py
new file mode 100644
index 0000000000000..85028afd22124
--- /dev/null
+++ b/torch/utils/data/dataloader_experimental.py
@@ -0,0 +1,89 @@
+
+import functools
+
+import torch.utils.data.backward_compatibility
+from torch.utils.data import DataLoader, IterDataPipe
+from torch.utils.data.datapipes.iter import IterableAsDataPipe
+
+class DataLoader2:
+    def __new__(cls,
+                dataset,
+                batch_size=1,
+                shuffle=False,
+                sampler=None,
+                batch_sampler=None,
+                num_workers=0,
+                collate_fn=None,
+                pin_memory=False,
+                drop_last=False,
+                timeout=0,
+                worker_init_fn=None,
+                *,
+                prefetch_factor=2,
+                persistent_workers=False,
+                batch_outside_worker=False):
+        if isinstance(dataset, IterDataPipe):
+            datapipe = dataset
+            if batch_sampler is not None:
+                raise Exception(
+                    'batch_sampler is not yet supported for DataPipes')
+            if sampler is not None:
+                raise Exception(
+                    'sampler is not yet supported for DataPipes')
+            if shuffle:
+                datapipe = datapipe.shuffle()
+            if batch_outside_worker and pin_memory:
+                raise Exception(
+                    'pin_memory is not yet compatible with batch_outside_worker')
+            if not batch_outside_worker:
+                if batch_size is not None:
+                    datapipe = datapipe.batch(batch_size, drop_last=drop_last)
+                    if collate_fn is None:
+                        collate_fn = torch.utils.data._utils.collate.default_collate
+
+            def sharding_worker_init_fn(worker_init_fn, worker_id):
+                if worker_init_fn is not None:
+                    worker_init_fn(worker_id)
+                torch.utils.data.backward_compatibility.worker_init_fn(
+                    worker_id)
+
+            my_worker_init_fn = functools.partial(
+                sharding_worker_init_fn, worker_init_fn)
+
+            data_loader = DataLoader(datapipe,
+                                     batch_size=None,  # Replaced by .batch DataPipe
+                                     shuffle=False,  # Replaced by .shuffle DataPipe
+                                     sampler=None,
+                                     batch_sampler=None,
+                                     num_workers=num_workers,
+                                     collate_fn=collate_fn,
+                                     pin_memory=pin_memory,
+                                     drop_last=False,  # Replaced by .batch DataPipe
+                                     timeout=timeout,
+                                     worker_init_fn=my_worker_init_fn,
+                                     prefetch_factor=prefetch_factor,
+                                     persistent_workers=persistent_workers)
+
+            if not batch_outside_worker:
+                return data_loader
+            else:
+                if collate_fn is None:
+                    collate_fn = torch.utils.data._utils.collate.default_collate
+                datapipe = IterableAsDataPipe(data_loader).batch(
+                    batch_size, drop_last=drop_last).map(collate_fn)
+                return datapipe
+
+        else:
+            return DataLoader(dataset,
+                              batch_size=batch_size,
+                              shuffle=shuffle,
+                              sampler=sampler,
+                              batch_sampler=batch_sampler,
+                              num_workers=num_workers,
+                              collate_fn=collate_fn,
+                              pin_memory=pin_memory,
+                              drop_last=drop_last,
+                              timeout=timeout,
+                              worker_init_fn=worker_init_fn,
+                              prefetch_factor=prefetch_factor,
+                              persistent_workers=persistent_workers)

From 195c60d84460d16311ad606e504eda17b795a820 Mon Sep 17 00:00:00 2001
From: Shiyan Deng <dsy842974287@fb.com>
Date: Mon, 23 Aug 2021 18:17:20 -0700
Subject: [PATCH 153/530] [fx2trt] Add acc op and converter for torch.pow
 (#63795)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63795

att

Test Plan: buck run mode/opt caffe2/torch/fb/fx2trt:test_binary_ops

Reviewed By: jackm321, wushirong

Differential Revision: D30492488

fbshipit-source-id: 6d615770567b13720316f06fd2f866ea2fdc2995
---
 .../fx/experimental/fx2trt/converters/acc_ops_converters.py | 5 +++++
 torch/fx/experimental/fx_acc/acc_ops.py                     | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
index eddb079afcac5..566359bf2af0d 100644
--- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
+++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
@@ -764,6 +764,11 @@ def acc_ops_mul(network, target, args, kwargs, name):
         network, kwargs["input"], kwargs["other"], trt.ElementWiseOperation.PROD, name
     )
 
+@tensorrt_converter(acc_ops.pow)
+def acc_ops_pow(network, target, args, kwargs, name):
+    return add_binary_elementwise_layer(
+        network, kwargs["input"], kwargs["exponent"], trt.ElementWiseOperation.POW, name
+    )
 
 @tensorrt_converter(acc_ops.min_two_tensors_input)
 def acc_ops_min_two_tensors_input(network, target, args, kwargs, name):
diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py
index 0c0965a430afd..95fffaa479c9e 100644
--- a/torch/fx/experimental/fx_acc/acc_ops.py
+++ b/torch/fx/experimental/fx_acc/acc_ops.py
@@ -496,6 +496,12 @@ def div(*, input, other):
     return input / other
 
 
+@register_acc_op_mapping(op_and_target=("call_function", torch.pow))
+@register_acc_op
+def pow(*, input, exponent):
+    return torch.pow(input, exponent)
+
+
 @register_acc_op_mapping(op_and_target=("call_function", nn.functional.relu))
 @register_acc_op_mapping(
     op_and_target=("call_function", torch.relu),

From d96ef8c1b1860185f0bd91699f71a087cf9e9efe Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Mon, 23 Aug 2021 18:43:17 -0700
Subject: [PATCH 154/530] [Static Runtime] SR clones graph input (#63704)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63704

Previously SR did not clone the graph. This was leading to subtle bugs in `testStaticRuntime`; static runtime would modify its graph, and the graph used by the JIT interpreter would change as well. The JIT interpreter would then crash if SR-only ops were added!

Cloning the graph is more consistent with the behavior of the `Module` ctor.

Test Plan: `buck test caffe2/benchmarks/static_runtime/...`

Reviewed By: hlu1

Differential Revision: D30463294

fbshipit-source-id: b771551a1f55f95fde79373b23babcf3e5ddf726
---
 torch/csrc/jit/runtime/static/impl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index 4219be5040ba7..1b5ee724b45a4 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -548,7 +548,7 @@ PrepareForStaticModule(
 StaticModule::StaticModule(
     std::shared_ptr<torch::jit::Graph> g,
     const StaticModuleOptions& opts)
-    : StaticModule(PrepareForStaticModule(g, opts), opts) {}
+    : StaticModule(PrepareForStaticModule(g->copy(), opts), opts) {}
 
 StaticModule::StaticModule(
     const torch::jit::Module& m,

From f5d585391d13287250e85cbd55a17c5e0b8ac2a8 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@fb.com>
Date: Mon, 23 Aug 2021 18:44:46 -0700
Subject: [PATCH 155/530] Add ROCm as a platform for which tests can be
 disabled (#63813)

Summary:
Realized we were missing ROCm as a platform on which one could disable a flaky test. (like how this issue specifies windows https://github.com/pytorch/pytorch/issues/61655)

cc jeffdaily sunway513 jithunnair-amd ROCmSupport

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63813

Reviewed By: seemethere

Differential Revision: D30498478

Pulled By: janeyx99

fbshipit-source-id: f1abe8677e1ddd01de3291e1618272ad8e287dc4
---
 torch/testing/_internal/common_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index a16056cd55cf7..b8e5b097bd6c0 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -906,8 +906,10 @@ def check_if_enable(test: unittest.TestCase):
             platform_to_conditional: Dict = {
                 "mac": IS_MACOS,
                 "macos": IS_MACOS,
+                "win": IS_WINDOWS,
                 "windows": IS_WINDOWS,
-                "linux": IS_LINUX
+                "linux": IS_LINUX,
+                "rocm": TEST_WITH_ROCM
             }
             if platforms == [] or any([platform_to_conditional[platform] for platform in platforms]):
                 raise unittest.SkipTest(

From 5b7cdc5a3ddb9a1a3d46d05b2925b5b4713b0025 Mon Sep 17 00:00:00 2001
From: mingfeima <mingfei.ma@intel.com>
Date: Mon, 23 Aug 2021 22:53:35 -0700
Subject: [PATCH 156/530] add channels last for GroupNorm (#49821)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49821

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D26007053

Pulled By: VitalyFedyunin

fbshipit-source-id: 34a48d5d3b66a159febf3c3d96748fbaba1b9e31
---
 .../src/ATen/native/cpu/group_norm_kernel.cpp | 162 ++++++++++++++++--
 aten/src/ATen/native/group_norm.cpp           |  17 +-
 test/test_nn.py                               |  34 ++++
 3 files changed, 193 insertions(+), 20 deletions(-)

diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp
index 290a6315da445..fb8db7e61800f 100644
--- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp
+++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp
@@ -74,6 +74,136 @@ void GroupNormKernelImplInternal(
   });
 }
 
+template <typename T>
+void GroupNormKernelImplChannelsLastInternal(
+    const Tensor& X,
+    const Tensor& gamma,
+    const Tensor& beta,
+    int64_t N,
+    int64_t C,
+    int64_t HxW,
+    int64_t group,
+    T eps,
+    Tensor& Y,
+    Tensor& mean,
+    Tensor& rstd) {
+  TORCH_CHECK(X.numel() == N * C * HxW);
+  TORCH_CHECK(!gamma.defined() || gamma.numel() == C);
+  TORCH_CHECK(!beta.defined() || beta.numel() == C);
+  const int64_t G = group;
+  const int64_t D = C / G;
+  const T* X_data = X.data_ptr<T>();
+  const T* gamma_data = gamma.defined() ? gamma.data_ptr<T>() : nullptr;
+  const T* beta_data = beta.defined() ? beta.data_ptr<T>() : nullptr;
+  T* Y_data = Y.data_ptr<T>();
+  T* mean_data = mean.data_ptr<T>();
+  T* rstd_data = rstd.data_ptr<T>();
+  const T s = T(1) / static_cast<T>(D * HxW);
+  const bool gamma_null = (gamma_data == nullptr);
+  const bool beta_null = beta_data == nullptr;
+
+  // temp buffer holding x and x2
+  Tensor buffer = at::empty({N, 2 * C}, X.options()).zero_();
+  T* buffer_data = buffer.data_ptr<T>();
+
+  using Vec = vec::Vectorized<T>;
+  at::parallel_for(0, N, 1, [&](int64_t start, int64_t end) {
+    constexpr int64_t K = Vec::size();
+    const int64_t inner_size = C / K * K;
+    for (int64_t n = start; n < end; ++n) {
+      T* mean_ptr = buffer_data + n * 2 * C;
+      T* rstd_ptr = mean_ptr + C;
+      for (int64_t i = 0; i < HxW; ++i) {
+        const T* X_ptr = X_data + n * HxW * C + i * C;
+        for (int64_t j = 0; j < inner_size; j += K) {
+          const Vec x_vec = Vec::loadu(X_ptr + j);
+          Vec mean_vec = Vec::loadu(mean_ptr + j) + x_vec;
+          Vec rstd_vec = Vec::loadu(rstd_ptr + j) + x_vec * x_vec;
+          mean_vec.store(mean_ptr + j);
+          rstd_vec.store(rstd_ptr + j);
+        }
+        for (int64_t j = inner_size; j < C; ++j) {
+          mean_ptr[j] += X_ptr[j];
+          rstd_ptr[j] += X_ptr[j] * X_ptr[j];
+        }
+      }
+
+      for (int64_t g = 0; g < G; ++g) {
+        T mean_val = T(0);
+        T rstd_val = T(0);
+        for (int64_t d = 0; d < D; ++d) {
+          mean_val += mean_ptr[g * D + d];
+          rstd_val += rstd_ptr[g * D + d];
+        }
+        mean_val *= s;
+        rstd_val = std::max(rstd_val * s - mean_val * mean_val, T(0));
+        rstd_val = T(1) / std::sqrt(rstd_val + eps);
+
+        // continue to use the temp buffer for mean and rstd value,
+        // so that we can vectorize the following math on entire C dimension.
+        for (int64_t d = 0; d < D; ++d) {
+          mean_ptr[g * D + d] = mean_val;
+          rstd_ptr[g * D + d] = rstd_val;
+        }
+
+        mean_data[n * G + g] = mean_val;
+        rstd_data[n * G + g] = rstd_val;
+      }
+
+      // expand gamma_null and beta_null to reduce if-else on critial path.
+      if (!gamma_null && !beta_null) {
+        for (int64_t i = 0; i < HxW; ++i) {
+          const T* X_ptr = X_data + n * HxW * C + i * C;
+          T* Y_ptr = Y_data + n * HxW * C + i * C;
+          for (int64_t j = 0; j < inner_size; j += K) {
+            Vec scale_vec = Vec::loadu(rstd_ptr + j) * Vec::loadu(gamma_data + j);
+            Vec bias_vec = Vec::loadu(beta_data + j) - scale_vec * Vec::loadu(mean_ptr + j);
+            Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) + bias_vec;
+            y_vec.store(Y_ptr + j);
+          }
+          for (int64_t j = inner_size; j < C; ++j) {
+            T scale = rstd_ptr[j] * gamma_data[j];
+            T bias = -scale * mean_ptr[j] + beta_data[j];
+            Y_ptr[j] = scale * X_ptr[j] + bias;
+          }
+        }
+      } else if (gamma_null && beta_null) {
+        for (int64_t i = 0; i < HxW; ++i) {
+          const T* X_ptr = X_data + n * HxW * C + i * C;
+          T* Y_ptr = Y_data + n * HxW * C + i * C;
+          for (int64_t j = 0; j < inner_size; j += K) {
+            Vec scale_vec = Vec::loadu(rstd_ptr + j);
+            Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) - scale_vec * Vec::loadu(mean_ptr + j);
+            y_vec.store(Y_ptr + j);
+          }
+          for (int64_t j = inner_size; j < C; ++j) {
+            T scale = rstd_ptr[j];
+            Y_ptr[j] = scale * X_ptr[j] -scale * mean_ptr[j];
+          }
+        }
+      } else {
+        for (int64_t i = 0; i < HxW; ++i) {
+          const T* X_ptr = X_data + n * HxW * C + i * C;
+          T* Y_ptr = Y_data + n * HxW * C + i * C;
+          for (int64_t j = 0; j < inner_size; j += K) {
+            Vec gamma_vec = gamma_null ? Vec(1) : Vec::loadu(gamma_data + j);
+            Vec beta_vec = beta_null ? Vec(0) : Vec::loadu(beta_data + j);
+            Vec scale_vec = Vec::loadu(rstd_ptr + j) * gamma_vec;
+            Vec bias_vec = beta_vec - scale_vec * Vec::loadu(mean_ptr + j);
+            Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) + bias_vec;
+            y_vec.store(Y_ptr + j);
+          }
+          for (int64_t j = inner_size; j < C; ++j) {
+            T scale = rstd_ptr[j] * (gamma_null ? T(1) : gamma_data[j]);
+            T bias = -scale * mean_ptr[j] + (beta_null ? T(0) : beta_data[j]);
+            Y_ptr[j] = scale * X_ptr[j] + bias;
+          }
+        }
+      }
+    }
+  });
+}
+
 void GroupNormKernelImpl(
     const Tensor& X,
     const Tensor& gamma,
@@ -86,20 +216,24 @@ void GroupNormKernelImpl(
     Tensor& Y,
     Tensor& mean,
     Tensor& rstd) {
-  AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GroupNormKernelImpl", [&]() {
-    GroupNormKernelImplInternal<scalar_t>(
-        X,
-        gamma,
-        beta,
-        N,
-        C,
-        HxW,
-        group,
-        static_cast<scalar_t>(eps),
-        Y,
-        mean,
-        rstd);
-  });
+  switch (X.suggest_memory_format()) {
+    case at::MemoryFormat::Contiguous: {
+      AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GroupNormKernelImpl", [&]() {
+        GroupNormKernelImplInternal<scalar_t>(
+            X, gamma, beta, N, C, HxW, group, static_cast<scalar_t>(eps), Y, mean, rstd);
+      });
+      break;
+    }
+    case at::MemoryFormat::ChannelsLast: {
+      AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GroupNormKernelImpl", [&]() {
+        GroupNormKernelImplChannelsLastInternal<scalar_t>(
+            X, gamma, beta, N, C, HxW, group, static_cast<scalar_t>(eps), Y, mean, rstd);
+      });
+      break;
+    }
+    default:
+      TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
+  }
 }
 
 template <typename T>
diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp
index 3a60d19959f83..5533780a4547e 100644
--- a/aten/src/ATen/native/group_norm.cpp
+++ b/aten/src/ATen/native/group_norm.cpp
@@ -31,7 +31,10 @@ std::tuple<Tensor, Tensor, Tensor> native_group_norm(
   const Tensor& gamma = *gamma_maybe_owned;
   const Tensor& beta = c10::value_or_else(beta_opt, [] { return Tensor(); });
 
-  TORCH_CHECK(X.is_contiguous());
+  auto memory_format = X.device().is_cpu() ?
+      X.suggest_memory_format() : at::MemoryFormat::Contiguous;
+
+  TORCH_CHECK(X.is_contiguous(memory_format));
 
   Tensor Y = at::native::empty_like(
       X,
@@ -39,7 +42,7 @@ std::tuple<Tensor, Tensor, Tensor> native_group_norm(
       c10::nullopt /* layout */,
       c10::nullopt /* device */,
       c10::nullopt /* pin_memory */,
-      LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+      memory_format);
   Tensor mean = at::empty({N, group}, X.options());
   Tensor rstd = at::empty({N, group}, X.options());
   GroupNormKernel(
@@ -73,7 +76,7 @@ std::tuple<Tensor, Tensor, Tensor> native_group_norm_backward(
         c10::nullopt /* layout */,
         c10::nullopt /* device */,
         c10::nullopt /* pin_memory */,
-        LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+        at::MemoryFormat::Contiguous);
   }
   if (grad_input_mask[1]) {
     dgamma = at::native::empty_like(
@@ -82,7 +85,7 @@ std::tuple<Tensor, Tensor, Tensor> native_group_norm_backward(
         c10::nullopt /* layout */,
         c10::nullopt /* device */,
         c10::nullopt /* pin_memory */,
-        LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+        at::MemoryFormat::Contiguous);
   }
   if (grad_input_mask[2]) {
     dbeta = at::native::empty_like(
@@ -91,7 +94,7 @@ std::tuple<Tensor, Tensor, Tensor> native_group_norm_backward(
         c10::nullopt /* layout */,
         c10::nullopt /* device */,
         c10::nullopt /* pin_memory */,
-        LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+        at::MemoryFormat::Contiguous);
   }
   GroupNormBackwardKernel(
       X.device().type(),
@@ -153,7 +156,9 @@ Tensor group_norm(
       c10::multiply_integers(input_shape.cbegin() + 2, input_shape.cend());
 
   const Tensor kEmpty;
-  const auto& X = input.is_contiguous() ? input : input.contiguous();
+  auto memory_format = input.suggest_memory_format();
+  const auto& X = input.device().is_cpu() ?
+      input.contiguous(memory_format) : input.contiguous();
   const auto& gamma = weight.defined() ? weight.contiguous() : kEmpty;
   const auto& beta = bias.defined() ? bias.contiguous() : kEmpty;
   TORCH_CHECK(!gamma.defined() || gamma.numel() == C);
diff --git a/test/test_nn.py b/test/test_nn.py
index bb109cf20e459..f4691e6a5fa03 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -13026,6 +13026,40 @@ def test_GroupNorm_empty(self, device):
             with torch.backends.cudnn.flags(enabled=False):
                 self._test_module_empty_input(mod, inp)
 
+    @onlyCPU
+    @dtypes(torch.float, torch.double)
+    def test_groupnorm_nhwc(self, device, dtype):
+        def helper(self, size, groups):
+            channels = size[1]
+            input = torch.randn(size, dtype=dtype, device=device, requires_grad=True)
+            input = input.contiguous(memory_format=torch.channels_last)
+            input.retain_grad()
+            grad = torch.randn(size, dtype=dtype, device=device)
+            grad = grad.contiguous(memory_format=torch.channels_last)
+            gn = nn.GroupNorm(groups, channels).to(device).to(dtype)
+            gn.weight.data.uniform_()
+            gn.bias.data.uniform_()
+
+            ref_input = input.detach().clone().contiguous().requires_grad_(True)
+            ref_grad = grad.detach().clone().contiguous()
+            ref_gn = nn.GroupNorm(groups, channels).to(device).to(dtype)
+            ref_gn.load_state_dict(gn.state_dict())
+
+            out = gn(input)
+            out.backward(grad)
+            ref_out = ref_gn(ref_input)
+            ref_out.backward(ref_grad)
+
+            self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
+            self.assertTrue(ref_out.is_contiguous())
+            self.assertEqual(out, ref_out)
+            self.assertEqual(gn.weight.grad, ref_gn.weight.grad)
+            self.assertEqual(gn.bias.grad, ref_gn.bias.grad)
+            self.assertEqual(input.grad, ref_input.grad)
+
+        helper(self, (4, 8, 10, 10), 4)
+        helper(self, (2, 30, 9, 9), 3)
+
     @onlyOnCPUAndCUDA
     def test_GroupNorm_numeric(self, device):
         def group_norm_ref(X, gamma, beta, groups, channels, eps):

From dd96c26066fd8e31dc768002e207477c38f86b7a Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Tue, 24 Aug 2021 00:29:22 -0700
Subject: [PATCH 157/530] [TensorExpr] More NFC changes like Expr* -> ExprPtr.
 (#63778)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63778

This is a preparation for a switch from raw pointers to shared pointers
as a memory model for TE expressions and statements.

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30487425

Pulled By: ZolotukhinM

fbshipit-source-id: 9cbe817b7d4e5fc2f150b29bb9b3bf578868f20c
---
 benchmarks/cpp/tensorexpr/bench_approx.cpp    | 26 +++---
 benchmarks/cpp/tensorexpr/bench_batchnorm.cpp |  4 +-
 benchmarks/cpp/tensorexpr/bench_compile.cpp   |  4 +-
 benchmarks/cpp/tensorexpr/bench_concat.cpp    | 42 +++++-----
 benchmarks/cpp/tensorexpr/bench_gemm.cpp      | 80 +++++++++----------
 benchmarks/cpp/tensorexpr/bench_parallel.cpp  |  4 +-
 benchmarks/cpp/tensorexpr/bench_reduce.cpp    | 32 ++++----
 test/cpp/tensorexpr/test_llvm.cpp             |  2 +-
 test/cpp/tensorexpr/test_loopnest.cpp         | 70 ++++++++--------
 test/cpp/tensorexpr/test_memdependency.cpp    |  6 +-
 test/cpp/tensorexpr/test_reductions.cpp       | 24 +++---
 test/cpp/tensorexpr/tutorial.cpp              |  4 +-
 torch/csrc/jit/tensorexpr/cuda_codegen.cpp    | 29 +++----
 torch/csrc/jit/tensorexpr/half_support.h      |  3 +-
 .../jit/tensorexpr/mem_dependency_checker.h   |  4 +-
 .../csrc/jit/tensorexpr/operators/reduction.h |  6 +-
 16 files changed, 172 insertions(+), 168 deletions(-)

diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp
index 1f09b1dbac5c1..6e31697d586dd 100644
--- a/benchmarks/cpp/tensorexpr/bench_approx.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_approx.cpp
@@ -12,19 +12,19 @@ using namespace torch::jit::tensorexpr;
 
 void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target, int width) {
   auto loops = ln->getLoopStmtsFor(target);
-  For *inner, *tail;
+  ForPtr inner, tail;
   ln->splitWithTail(loops[0], width, &inner, &tail);
   ln->vectorize(inner);
 }
 
 void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target) {
-  std::vector<For*> loops = ln->getLoopStmtsFor(target);
-  For *inner, *tail;
+  std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
+  ForPtr inner, tail;
   ln->splitWithTail(loops[0], 16 * 8, &inner, &tail);
-  For* outer = loops[0];
+  ForPtr outer = loops[0];
   ln->vectorize(inner);
   ln->splitWithTail(outer, 8, &inner, &tail);
-  Stmt* unrolled;
+  StmtPtr unrolled;
   LoopNest::unroll(inner, &unrolled);
 }
 
@@ -44,7 +44,7 @@ static void relu_nnc(benchmark::State& state) {
   LoopNest ln({B});
   optimizePointwise(&ln, B);
   ln.prepareForCodegen();
-  Stmt* s = ln.root_stmt();
+  StmtPtr s = ln.root_stmt();
   s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
   std::vector<CodeGen::BufferArg> args;
   args.emplace_back(B);
@@ -74,7 +74,7 @@ static void log_nnc_sleef(benchmark::State& state) {
   LoopNest ln({B});
   ln.prepareForCodegen();
   vectorize(&ln, B, 8);
-  Stmt* s = ln.root_stmt();
+  StmtPtr s = ln.root_stmt();
   s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
   std::vector<CodeGen::BufferArg> args;
   args.emplace_back(B);
@@ -104,7 +104,7 @@ static void log_nnc_fast(benchmark::State& state) {
   LoopNest ln({B});
   optimizePointwise(&ln, B);
   ln.prepareForCodegen();
-  Stmt* s = ln.root_stmt();
+  StmtPtr s = ln.root_stmt();
   s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
   std::vector<CodeGen::BufferArg> args;
   args.emplace_back(B);
@@ -134,7 +134,7 @@ static void log_nnc_vml(benchmark::State& state) {
   LoopNest ln({B});
   vectorize(&ln, B, 8);
   ln.prepareForCodegen();
-  Stmt* s = ln.root_stmt();
+  StmtPtr s = ln.root_stmt();
   s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
   std::vector<CodeGen::BufferArg> args;
   args.emplace_back(B);
@@ -181,7 +181,7 @@ static void logit_nnc_sleef(benchmark::State& state) {
   LoopNest ln({B});
   ln.prepareForCodegen();
   optimizePointwise(&ln, B);
-  Stmt* s = ln.root_stmt();
+  StmtPtr s = ln.root_stmt();
   s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
   std::vector<CodeGen::BufferArg> args;
   args.emplace_back(B);
@@ -218,7 +218,7 @@ static void logit_nnc_fast(benchmark::State& state) {
   LoopNest ln({B});
   ln.prepareForCodegen();
   optimizePointwise(&ln, B);
-  Stmt* s = ln.root_stmt();
+  StmtPtr s = ln.root_stmt();
   s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
   std::vector<CodeGen::BufferArg> args;
   args.emplace_back(B);
@@ -255,7 +255,7 @@ static void logit_nnc_vml(benchmark::State& state) {
   LoopNest ln({B});
   ln.prepareForCodegen();
   vectorize(&ln, B, 16);
-  Stmt* s = ln.root_stmt();
+  StmtPtr s = ln.root_stmt();
   s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
   std::vector<CodeGen::BufferArg> args;
   args.emplace_back(B);
@@ -326,7 +326,7 @@ static void tanh_nnc_fast(benchmark::State& state) {
   LoopNest ln({B});
   optimizePointwise(&ln, B);
   ln.prepareForCodegen();
-  Stmt* s = ln.root_stmt();
+  StmtPtr s = ln.root_stmt();
   s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
   std::vector<CodeGen::BufferArg> args;
   args.emplace_back(B);
diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
index 434cd6bfdbb8e..872594ec286b7 100644
--- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
@@ -105,7 +105,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) {
   loops = nest.getLoopStmtsFor(output);
   loops[0]->set_parallel();
   nest.prepareForCodegen();
-  Stmt* s = IRSimplifier::simplify(nest.root_stmt());
+  StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
   LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});
 
   std::vector<CodeGen::CallArg> args;
@@ -163,7 +163,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) {
       });
   LoopNest nest({output});
   nest.prepareForCodegen();
-  Stmt* s = IRSimplifier::simplify(nest.root_stmt());
+  StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
   LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});
 
   std::vector<CodeGen::CallArg> args;
diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp
index cc84e65a545b2..245d5d8b203c5 100644
--- a/benchmarks/cpp/tensorexpr/bench_compile.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp
@@ -33,7 +33,7 @@ static void BM_CompileSwish(benchmark::State& state) {
       nest.computeInline(tensor->buf());
     }
     nest.prepareForCodegen();
-    te::Stmt* s = te::IRSimplifier::simplify(nest.root_stmt());
+    te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
     te::LLVMCodeGen cg(s, {A, sixth, n});
   }
 }
@@ -63,7 +63,7 @@ static void BM_CompileSwishLLVMOnly(benchmark::State& state) {
     nest.computeInline(tensor->buf());
   }
   nest.prepareForCodegen();
-  te::Stmt* s = te::IRSimplifier::simplify(nest.root_stmt());
+  te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
   for (auto _ : state) {
     te::LLVMCodeGen cg(s, {A, sixth, n});
   }
diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp
index a437967a09497..cb9aa84150e88 100644
--- a/benchmarks/cpp/tensorexpr/bench_concat.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp
@@ -83,7 +83,7 @@ class ConcatBench : public benchmark::Fixture {
         });
     LoopNest nest({output});
     nest.prepareForCodegen();
-    Stmt* s = IRSimplifier::simplify(nest.root_stmt());
+    StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
     std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
     buf_args.push_back(output);
     LLVMCodeGen cg(s, buf_args);
@@ -108,47 +108,51 @@ class ConcatBench : public benchmark::Fixture {
 
     TORCH_INTERNAL_ASSERT(concat_dim_ == 1);
 
-    auto output_buf = new Buf(
-        new Var("aten_cat", kHandle),
-        {new IntImm(output_size_[0]), new IntImm(output_size_[1])},
+    auto output_buf = alloc<Buf>(
+        alloc<Var>("aten_cat", kHandle),
+        std::vector<ExprPtr>(
+            {alloc<IntImm>(output_size_[0]), alloc<IntImm>(output_size_[1])}),
         kFloat);
 
     std::vector<Placeholder> inputs;
-    std::vector<Stmt*> for_stmts(num_inputs);
+    std::vector<StmtPtr> for_stmts(num_inputs);
     int cumulative_input_sizes = 0;
     for (size_t i = 0; i < num_inputs; ++i) {
       inputs.emplace_back(Placeholder(
           "input" + std::to_string(i),
           kFloat,
           {input_sizes_[i][0], input_sizes_[i][1]}));
-      std::vector<Var*> for_vars(num_inputs);
+      std::vector<VarPtr> for_vars(num_inputs);
       for (size_t d = 0; d < num_dims; ++d) {
         for_vars[d] =
-            new Var("i" + std::to_string(i) + "_" + std::to_string(d), kInt);
+            alloc<Var>("i" + std::to_string(i) + "_" + std::to_string(d), kInt);
       }
-      auto store = new Store(
+      auto store = alloc<Store>(
           output_buf,
-          {for_vars[0],
-           new Add(for_vars[1], new IntImm(cumulative_input_sizes))},
-          new Load(inputs[i].data(), {for_vars[0], for_vars[1]}));
-      auto for_st = new For(
+          std::vector<ExprPtr>(
+              {for_vars[0],
+               alloc<Add>(for_vars[1], alloc<IntImm>(cumulative_input_sizes))}),
+          alloc<Load>(
+              inputs[i].data(),
+              std::vector<ExprPtr>({for_vars[0], for_vars[1]})));
+      auto for_st = alloc<For>(
           for_vars[0],
-          new IntImm(0),
-          new IntImm(input_sizes_[i][0]),
-          new For(
+          alloc<IntImm>(0),
+          alloc<IntImm>(input_sizes_[i][0]),
+          alloc<For>(
               for_vars[1],
-              new IntImm(0),
-              new IntImm(input_sizes_[i][1]),
+              alloc<IntImm>(0),
+              alloc<IntImm>(input_sizes_[i][1]),
               store));
       for_stmts[i] = for_st;
       cumulative_input_sizes += input_sizes_[i][1];
     }
-    auto output = new Tensor(output_buf, new Block(for_stmts));
+    auto output = new Tensor(output_buf, alloc<Block>(for_stmts));
 
     LoopNest nest({output});
     nest.prepareForCodegen();
     nest.vectorizeInnerLoops();
-    Stmt* s = IRSimplifier::simplify(nest.root_stmt());
+    StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
     std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
     buf_args.push_back(output);
     LLVMCodeGen cg(s, buf_args);
diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp
index 792d457c2f23a..7ebaa87781514 100644
--- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp
@@ -54,7 +54,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
       {{K, "K"}});
   te::LoopNest loop({CT});
   loop.prepareForCodegen();
-  te::Stmt* s = loop.root_stmt();
+  te::StmtPtr s = loop.root_stmt();
   s = te::IRSimplifier::simplify(s);
   auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
 
@@ -80,41 +80,41 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
 
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* m = loops[0];
+    te::ForPtr m = loops[0];
     loop.splitWithMask(m, 32);
   }
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* n = loops[2];
+    te::ForPtr n = loops[2];
     loop.splitWithMask(n, 32);
   }
   // mo, mi, no, ni, k ->
   // mo, no, mi, ni, k
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* mi = loops[1];
-    te::For* no = loops[2];
+    te::ForPtr mi = loops[1];
+    te::ForPtr no = loops[2];
     loop.reorderAxis(mi, no);
   }
   // mo, no, mi, ni, k ->
   // mo, no, mi, k, ni
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* ni = loops[3];
-    te::For* k = loops[4];
+    te::ForPtr ni = loops[3];
+    te::ForPtr k = loops[4];
     loop.reorderAxis(ni, k);
   }
   // mo, no, mi, k, ni ->
   // mo, no, k, mi, ni
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* mi = loops[2];
-    te::For* k = loops[3];
+    te::ForPtr mi = loops[2];
+    te::ForPtr k = loops[3];
     loop.reorderAxis(mi, k);
   }
 
   loop.prepareForCodegen();
-  te::Stmt* s = loop.root_stmt();
+  te::StmtPtr s = loop.root_stmt();
   s = te::IRSimplifier::simplify(s);
   auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
 
@@ -140,41 +140,41 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
 
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* m = loops[0];
+    te::ForPtr m = loops[0];
     loop.splitWithMask(m, 4);
   }
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* n = loops[2];
+    te::ForPtr n = loops[2];
     loop.splitWithMask(n, 16);
   }
   // mo, mi, no, ni, k ->
   // mo, no, mi, ni, k
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* mi = loops[1];
-    te::For* no = loops[2];
+    te::ForPtr mi = loops[1];
+    te::ForPtr no = loops[2];
     loop.reorderAxis(mi, no);
   }
   // mo, no, mi, ni, k ->
   // mo, no, mi, k, ni
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* ni = loops[3];
-    te::For* k = loops[4];
+    te::ForPtr ni = loops[3];
+    te::ForPtr k = loops[4];
     loop.reorderAxis(ni, k);
   }
   // mo, no, mi, k, ni ->
   // mo, no, k, mi, ni
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* mi = loops[2];
-    te::For* k = loops[3];
+    te::ForPtr mi = loops[2];
+    te::ForPtr k = loops[3];
     loop.reorderAxis(mi, k);
   }
 
   loop.prepareForCodegen();
-  te::Stmt* s = loop.root_stmt();
+  te::StmtPtr s = loop.root_stmt();
   s = te::IRSimplifier::simplify(s);
   auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
 
@@ -200,49 +200,49 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
 
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* m = loops[0];
+    te::ForPtr m = loops[0];
     loop.splitWithMask(m, 4);
   }
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* n = loops[2];
+    te::ForPtr n = loops[2];
     loop.splitWithMask(n, 16);
   }
   // mo, mi, no, ni, k ->
   // mo, no, mi, ni, k
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* mi = loops[1];
-    te::For* no = loops[2];
+    te::ForPtr mi = loops[1];
+    te::ForPtr no = loops[2];
     loop.reorderAxis(mi, no);
   }
   // mo, no, mi, ni, k ->
   // mo, no, mi, k, ni
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* ni = loops[3];
-    te::For* k = loops[4];
+    te::ForPtr ni = loops[3];
+    te::ForPtr k = loops[4];
     loop.reorderAxis(ni, k);
   }
   // mo, no, mi, k, ni ->
   // mo, no, k, mi, ni
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* mi = loops[2];
-    te::For* k = loops[3];
+    te::ForPtr mi = loops[2];
+    te::ForPtr k = loops[3];
     loop.reorderAxis(mi, k);
   }
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* mi = loops[3];
-    te::For* ni = loops[4];
-    te::Stmt* unrolled;
+    te::ForPtr mi = loops[3];
+    te::ForPtr ni = loops[4];
+    te::StmtPtr unrolled;
     loop.vectorize(ni);
     loop.unroll(mi, &unrolled);
   }
 
   loop.prepareForCodegen();
-  te::Stmt* s = loop.root_stmt();
+  te::StmtPtr s = loop.root_stmt();
   s = te::IRSimplifier::simplify(s);
   auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
 
@@ -268,36 +268,36 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
 
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* m = loops[0];
+    te::ForPtr m = loops[0];
     loop.splitWithMask(m, 4);
   }
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* n = loops[2];
+    te::ForPtr n = loops[2];
     loop.splitWithMask(n, 16);
   }
   // mo, mi, no, ni, k ->
   // mo, no, mi, ni, k
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* mi = loops[1];
-    te::For* no = loops[2];
+    te::ForPtr mi = loops[1];
+    te::ForPtr no = loops[2];
     loop.reorderAxis(mi, no);
   }
   // mo, no, mi, ni, k ->
   // mo, no, mi, k, ni
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* ni = loops[3];
-    te::For* k = loops[4];
+    te::ForPtr ni = loops[3];
+    te::ForPtr k = loops[4];
     loop.reorderAxis(ni, k);
   }
   // mo, no, mi, k, ni ->
   // mo, no, k, mi, ni
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    te::For* mi = loops[2];
-    te::For* k = loops[3];
+    te::ForPtr mi = loops[2];
+    te::ForPtr k = loops[3];
     loop.reorderAxis(mi, k);
   }
   {
@@ -306,7 +306,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
   }
 
   loop.prepareForCodegen();
-  te::Stmt* s = loop.root_stmt();
+  te::StmtPtr s = loop.root_stmt();
   s = te::IRSimplifier::simplify(s);
   auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
 
diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp
index fee326cdd4bd4..966c9e2a6853d 100644
--- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp
@@ -44,10 +44,10 @@ BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
       });
   LoopNest loop_nest({c_tensor});
   auto const& loops = loop_nest.getLoopStmtsFor(c_tensor);
-  For* m = loops[0];
+  ForPtr m = loops[0];
   m->set_parallel();
   loop_nest.prepareForCodegen();
-  Stmt* stmt = loop_nest.root_stmt();
+  StmtPtr stmt = loop_nest.root_stmt();
   LLVMCodeGen cg(stmt, {c_tensor, a_buf, b_buf});
 
   float* a_ptr = A.data_ptr<float>();
diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
index acd46ac1de410..be5dcc815bc68 100644
--- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
@@ -233,7 +233,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
 
   te::LoopNest loop({BT});
   loop.prepareForCodegen();
-  te::Stmt* s = loop.root_stmt();
+  te::StmtPtr s = loop.root_stmt();
   s = te::IRSimplifier::simplify(s);
   auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
 
@@ -269,12 +269,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
 
   {
     auto const& loops = loop.getLoopStmtsFor(BT);
-    te::For* m = loops[1];
+    te::ForPtr m = loops[1];
     loop.splitWithTail(m, kChunkSize);
   }
 
   loop.prepareForCodegen();
-  te::Stmt* s = loop.root_stmt();
+  te::StmtPtr s = loop.root_stmt();
   s = te::IRSimplifier::simplify(s);
   auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
 
@@ -310,12 +310,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
 
   {
     auto const& loops = loop.getLoopStmtsFor(BT);
-    te::For* m = loops[1];
+    te::ForPtr m = loops[1];
     loop.splitWithMask(m, kChunkSize);
   }
 
   loop.prepareForCodegen();
-  te::Stmt* s = loop.root_stmt();
+  te::StmtPtr s = loop.root_stmt();
   s = te::IRSimplifier::simplify(s);
   auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
 
@@ -349,17 +349,17 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
       {{M, "M"}});
 
   te::LoopNest loop({BT});
-  te::Buf* rfac_buf;
+  te::BufPtr rfac_buf;
 
   auto loops = loop.getLoopStmtsFor(BT);
   TORCH_CHECK(loops.size() == 1);
-  te::For* mi;
+  te::ForPtr mi;
   loop.splitWithMask(loops.at(0), kChunkSize, &mi);
-  te::For* mo = loops.at(0);
+  te::ForPtr mo = loops.at(0);
 
   loop.reorderAxis(mo, mi);
   loops = loop.getLoopStmtsFor(BT);
-  auto bt_body = const_cast<te::Stmt*>(loop.getAllWritesToBuf(BT->buf())[1]);
+  auto bt_body = loop.getAllWritesToBuf(BT->buf())[1];
   TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf));
   loop.reorderAxis(loops.at(0), loops.at(1));
 
@@ -368,7 +368,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
   loop.vectorize(loops.at(1));
 
   loop.prepareForCodegen();
-  te::Stmt* s = loop.root_stmt();
+  te::StmtPtr s = loop.root_stmt();
   s = te::IRSimplifier::simplify(s);
   auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
 
@@ -394,8 +394,8 @@ BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) {
   te::LoopNest nest({b});
 
   auto loops = nest.getLoopStmtsFor(b);
-  te::For *mi, *mo;
-  te::Buf *rf;
+  te::ForPtr mi, mo;
+  te::BufPtr rf;
   nest.splitWithMask(loops[0], kChunkSize, &mi);
   loops = nest.reorder({loops[0], mi}, {1, 0});
   nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf);
@@ -566,8 +566,8 @@ BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) {
   auto sch = state.range(2);
   if (sch == 1) {
     auto loops = nest.getLoopStmtsFor(b);
-    te::For *mi, *mo;
-    te::Buf *rf;
+    te::ForPtr mi, mo;
+    te::BufPtr rf;
     nest.splitWithMask(loops[1], kChunkSize, &mi);
     loops = nest.reorder({loops[1], mi}, {1, 0});
     TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf));
@@ -583,8 +583,8 @@ BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) {
     nest.reorderAxis(loops[1], loops[2]);
   } else if (sch == 3) {
     auto loops = nest.getLoopStmtsFor(b);
-    te::For *mi, *mo;
-    te::Buf *rf;
+    te::ForPtr mi, mo;
+    te::BufPtr rf;
     nest.splitWithMask(loops[1], kChunkSize, &mi);
     loops = nest.reorder({loops[1], mi}, {1, 0});
     TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf));
diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp
index 3776329a86a51..75e6a064d1ac5 100644
--- a/test/cpp/tensorexpr/test_llvm.cpp
+++ b/test/cpp/tensorexpr/test_llvm.cpp
@@ -1642,7 +1642,7 @@ TEST(LLVM, CompositeParallel) {
         [=](const VarHandle& m, const VarHandle& n) {
           return t3->load(m, n) + m + n;
         });
-    LoopNest loop_nest({t4}, {t1, t2, t3, t4});
+    LoopNest loop_nest(std::vector<Tensor*>({t4}), {t1, t2, t3, t4});
     std::vector<ForPtr> loop_list;
     {
       auto const& loops = loop_nest.getLoopStmtsFor(t1);
diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index 898ee5293edab..c80dd5f492d95 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -1011,7 +1011,7 @@ TEST(LoopNest, ScheduleFunctionCall01) {
         return c->load(m, n, k) + 1;
       });
 
-  LoopNest l({d}, {c, d});
+  LoopNest l(std::vector<Tensor*>({d}), {c, d});
   l.prepareForCodegen();
   StmtPtr stmt = l.root_stmt();
   std::ostringstream oss;
@@ -1071,7 +1071,7 @@ TEST(LoopNest, ScheduleInlineSimple) {
         return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k);
       });
 
-  LoopNest l1({y}, {x, y});
+  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
   LoopNest l2(l1);
   l2.computeInline(x->buf());
 
@@ -1158,7 +1158,7 @@ void InlineFunc01Helper(const std::vector<std::string>& inline_order) {
         return x->load(m, n, k) + y->load(m, n, k);
       });
 
-  LoopNest l({z}, {x, y, z});
+  LoopNest l(std::vector<Tensor*>({z}), {x, y, z});
   for (const std::string& order : inline_order) {
     if (order == "x") {
       l.computeInline(x->buf());
@@ -1267,7 +1267,7 @@ TEST(LoopNest, ScheduleInlineRandom) {
         return x->load(m, n, k) + x->load(m, n, k);
       });
 
-  LoopNest l1({y}, {x, y});
+  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
   l1.computeInline(x->buf());
 
   // would normally compare results but Rand isn't implemented in the
@@ -1304,7 +1304,7 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) {
             Intrinsics::make(kRand, kInt);
       });
 
-  LoopNest l1({y}, {x, y});
+  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
   l1.computeInline(x->buf());
 
   // would normally compare results but Rand isn't implemented in the
@@ -1337,7 +1337,7 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) {
         return x->load(m) + x->load(m);
       });
 
-  LoopNest l1({y}, {x, y});
+  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
   l1.computeInline(x->buf());
 
   // would normally compare results but Rand isn't implemented in the
@@ -1389,7 +1389,7 @@ TEST(LoopNest, ScheduleInlineIntrinsics) {
     }
   }
 
-  LoopNest l1({y}, {x, y});
+  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
   LoopNest l2(l1);
   l2.computeInline(x->buf());
 
@@ -1434,7 +1434,7 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) {
         return Intrinsics::make(kSqrt, x->load(m, n, k));
       });
 
-  LoopNest l1({y}, {x, y});
+  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
   l1.computeInline(x->buf());
 
   StmtPtr stmt1 = IRSimplifier::simplify(l1.root_stmt());
@@ -1457,7 +1457,7 @@ TEST(LoopNest, ScheduleSplitAThenInline) {
     return a->load(j + ExprHandle(8));
   });
 
-  LoopNest l({b}, {a, b});
+  LoopNest l(std::vector<Tensor*>({b}), {a, b});
   std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
   LoopNest::splitWithMask(loops[0], 4);
   ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
@@ -1472,7 +1472,7 @@ TEST(LoopNest, ScheduleSplitBThenInline) {
     return a->load(j + ExprHandle(8));
   });
 
-  LoopNest l({b}, {a, b});
+  LoopNest l(std::vector<Tensor*>({b}), {a, b});
   std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0);
   LoopNest::splitWithMask(loops[0], 3);
   l.computeInline(a->buf());
@@ -1499,7 +1499,7 @@ TEST(LoopNest, ScheduleSplitTwiceThenInline) {
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr i_inner;
 
-  LoopNest l({b}, {a, b});
+  LoopNest l(std::vector<Tensor*>({b}), {a, b});
   std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
   LoopNest::splitWithMask(loops[0], 4, &i_inner);
   LoopNest::splitWithMask(i_inner, 2);
@@ -1515,7 +1515,7 @@ TEST(LoopNest, ScheduleInlineThenSplit) {
     return a->load(j + ExprHandle(8));
   });
 
-  LoopNest l({b}, {a, b});
+  LoopNest l(std::vector<Tensor*>({b}), {a, b});
   l.computeInline(a->buf());
 
   std::vector<ForPtr> loops = NodeFinder<For>::find(l.root_stmt());
@@ -1540,7 +1540,7 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) {
     return a->load(j + ExprHandle(8));
   });
 
-  LoopNest l({b}, {a, b});
+  LoopNest l(std::vector<Tensor*>({b}), {a, b});
   auto loops = NodeFinder<For>::find(l.root_stmt());
   LoopNest::splitWithMask(loops.back(), 2);
   l.computeInline(a->buf());
@@ -1568,7 +1568,7 @@ TEST(LoopNest, ScheduleSplitInlineSimplify) {
     return a->load(j) - ExprHandle(1);
   });
 
-  LoopNest l({b}, {a, b});
+  LoopNest l(std::vector<Tensor*>({b}), {a, b});
   std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
   LoopNest::splitWithMask(loops[0], 4);
   ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
@@ -1587,7 +1587,7 @@ TEST(LoopNest, ScheduleInlineThreeMixedOnce) {
         return a->load(k) * b->load(l);
       });
 
-  LoopNest l({c}, {a, b, c});
+  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
   std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
   l.computeInline(a->buf());
   l.prepareForCodegen();
@@ -1617,7 +1617,7 @@ TEST(LoopNest, ScheduleInlineThreeMixedTwice) {
         return a->load(k) * b->load(l);
       });
 
-  LoopNest l({c}, {a, b, c});
+  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
   std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
   l.computeInline(a->buf());
   l.computeInline(b->buf());
@@ -1648,7 +1648,7 @@ TEST(LoopNest, ScheduleInlineThreeMixedInner) {
         return a->load(k) * b->load(l);
       });
 
-  LoopNest l({c}, {a, b, c});
+  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
   std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
   l.computeInline(b->buf());
   l.prepareForCodegen();
@@ -1678,7 +1678,7 @@ TEST(LoopNest, ScheduleInlineThreeMixedSplit) {
         return a->load(k) * b->load(l);
       });
 
-  LoopNest l({c}, {a, b, c});
+  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
   std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
   LoopNest::splitWithMask(loops[0], 4);
   loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0);
@@ -1782,7 +1782,7 @@ TEST(LoopNest, ScheduleFuserThreeArg) {
     return f->load(i) + d.load(i);
   });
 
-  LoopNest l({g}, {e, f, g});
+  LoopNest l(std::vector<Tensor*>({g}), {e, f, g});
   l.computeInline(l.getLoopBodyFor(e));
   l.computeInline(l.getLoopBodyFor(f));
   l.prepareForCodegen();
@@ -1846,7 +1846,7 @@ TEST(LoopNest, LoopNestComputeAt_1) {
       "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; });
   Tensor* B = Compute(
       "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A->load(i_b); });
-  LoopNest l({B}, {A, B});
+  LoopNest l(std::vector<Tensor*>({B}), {A, B});
   std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0);
   LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]);
   l.prepareForCodegen();
@@ -1909,7 +1909,7 @@ TEST(LoopNest, LoopNestComputeAt_2) {
       c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1);
     }
   }
-  LoopNest orig_loopnest({c}, {p, c});
+  LoopNest orig_loopnest(std::vector<Tensor*>({c}), {p, c});
 
   {
     // First let's try to compute P at axis cy (the outer loop)
@@ -2009,7 +2009,7 @@ TEST(LoopNest, LoopNestComputeAt_3) {
     }
   }
 
-  LoopNest orig_loopnest({D}, {A, B, C, D});
+  LoopNest orig_loopnest(std::vector<Tensor*>({D}), {A, B, C, D});
   {
     // First let's try to compute A at axis dy (the outer loop)
     LoopNest l(orig_loopnest);
@@ -2100,7 +2100,7 @@ TEST(LoopNest, Reduce2dComputeAt) {
       c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1);
     }
   }
-  LoopNest orig_loopnest({c}, {p, c});
+  LoopNest orig_loopnest(std::vector<Tensor*>({c}), {p, c});
   checkIR(orig_loopnest.root_stmt(), R"IR(
 # CHECK: for (int py = 0; py < H + 1; py++) {
 # CHECK:   for (int px = 0; px < W + 1; px++) {
@@ -2771,7 +2771,7 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) {
         return x->load(m, n, k) + y->load(m, n, k);
       });
 
-  LoopNest l({z}, {x, y, z});
+  LoopNest l(std::vector<Tensor*>({z}), {x, y, z});
   ForPtr a = nullptr;
   ForPtr b = nullptr;
   auto fors = NodeFinder<For>::find(l.root_stmt());
@@ -2983,7 +2983,7 @@ TEST(LoopNest, UnrollMultipleStatements) {
       Block::make(
           {Store::make(a_buf, {x}, x * 2),
            Store::make(b_buf, {x}, Load::make(a_buf, {x}))}));
-  Block::make({f});
+  auto parent_block = Block::make({f});
   StmtPtr unrolled = nullptr;
   LoopNest::unroll(f, &unrolled);
   checkIR(unrolled, R"IR(
@@ -3069,7 +3069,7 @@ TEST(LoopNest, UnrollWithLet) {
           {Let::make(e, 7),
            Store::make(a_buf, {x}, e),
            Store::make(b_buf, {x}, e + 1)}));
-  Block::make({f});
+  auto parent_block = Block::make({f});
   StmtPtr unrolled = nullptr;
   LoopNest::unroll(f, &unrolled);
   std::ostringstream oss;
@@ -3680,7 +3680,7 @@ TEST(LoopNest, DetectInlineRankMismatch) {
       "reshape",
       {{kTotalSize / 2, "i"}, {2, "j"}},
       [&](const VarHandle& i, const VarHandle& j) { return a->load(i, j); });
-  LoopNest l({reshape}, {a, reshape});
+  LoopNest l(std::vector<Tensor*>({reshape}), {a, reshape});
   ASSERT_THROWS_WITH(
       l.computeInline(l.getLoopBodyFor(a)),
       "Placeholder indexed access is inconsistent with its rank");
@@ -3702,7 +3702,7 @@ TEST(LoopNest, CacheReadsSimple) {
         return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
       });
 
-  LoopNest l({B, C}, {A, B, C});
+  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
   StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1];
   LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
 
@@ -3770,7 +3770,7 @@ TEST(LoopNest, CacheReadsOuter) {
         return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
       });
 
-  LoopNest l({B, C}, {A, B, C});
+  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
   StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][0];
   LoopNest::cacheAccesses(A->buf(), "A_local", i_loop);
 
@@ -3818,7 +3818,7 @@ TEST(LoopNest, CacheReadsInternal) {
         return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
       });
 
-  LoopNest l({B, C}, {A, B, C});
+  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
   StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1];
   LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
   l.prepareForCodegen();
@@ -3866,7 +3866,7 @@ TEST(LoopNest, CacheReadsInner) {
         return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
       });
 
-  LoopNest l({B, C}, {A, B, C});
+  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
   StmtPtr body = l.getLoopBodyFor(B);
   LoopNest::cacheAccesses(A->buf(), "A_local", body);
   l.prepareForCodegen();
@@ -3913,7 +3913,7 @@ TEST(LoopNest, CacheWritesSimple) {
         return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
       });
 
-  LoopNest l({B, C}, {A, B, C});
+  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
   StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A->buf())[0][1];
   LoopNest::cacheAccesses(A->buf(), "A_local", a_loop);
 
@@ -4093,7 +4093,7 @@ TEST(LoopNest, InlineConstantIndex) {
         return y->load(m, n, o);
       });
 
-  LoopNest l({z}, {y, z});
+  LoopNest l(std::vector<Tensor*>({z}), {y, z});
   l.simplify();
   ASSERT_TRUE(l.computeInline(y->buf()));
 }
@@ -4121,7 +4121,7 @@ TEST(LoopNest, CompoundTensorUsed) {
         return A->load(i, j + 1) + A->load(i, j + 2);
       });
 
-  LoopNest l({B}, {A, B});
+  LoopNest l(std::vector<Tensor*>({B}), {A, B});
   ASSERT_FALSE(l.computeInline(A->buf()));
   l.prepareForCodegen();
 
@@ -4897,7 +4897,7 @@ TEST(LoopNest, VectorizeUse) {
       "b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + 1.0f; });
   Tensor* c = Compute(
       "c", {{N, "n"}}, [&](const VarHandle& n) { return b->load(n) + 2.0f; });
-  LoopNest nest({c}, {b, c});
+  LoopNest nest(std::vector<Tensor*>({c}), {b, c});
   auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0];
   ASSERT_TRUE(LoopNest::vectorize(loops[0]));
   loops = nest.getAllLoopNestsWritingToBuf(c->buf())[0];
diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp
index 7f844c5ba4cf4..9503f9d57b726 100644
--- a/test/cpp/tensorexpr/test_memdependency.cpp
+++ b/test/cpp/tensorexpr/test_memdependency.cpp
@@ -2739,7 +2739,7 @@ TEST(MemDependency, MemDependencyCheckerComputeAPI) {
         return c->load(m, n, k) + 1;
       });
 
-  LoopNest l({d}, {c, d});
+  LoopNest l(std::vector<Tensor*>({d}), {c, d});
 
   MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()});
 
@@ -2786,7 +2786,7 @@ TEST(MemDependency, MemDependencyCheckerComputeInline) {
         return c->load(m, n, k) + 1;
       });
 
-  LoopNest l({d}, {c, d});
+  LoopNest l(std::vector<Tensor*>({d}), {c, d});
   l.computeInline(c->buf());
 
   MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()});
@@ -2935,7 +2935,7 @@ TEST(MemDependency, MemDependencyCheckerComputeReduce) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
   Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}});
-  LoopNest l({d}, {c, d});
+  LoopNest l(std::vector<Tensor*>({d}), {c, d});
 
   MemDependencyChecker analyzer({a.data(), b.data()}, {d->buf()});
 
diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp
index 0d033e0bd8a1f..449edac19823f 100644
--- a/test/cpp/tensorexpr/test_reductions.cpp
+++ b/test/cpp/tensorexpr/test_reductions.cpp
@@ -533,7 +533,7 @@ TEST(Reductions, ReduceAsProducer) {
       [&](const VarHandle& l, const VarHandle& n) {
         return c->load(l, n) * a.load(l, n);
       });
-  LoopNest loop({d}, {c, d});
+  LoopNest loop(std::vector<Tensor*>({d}), {c, d});
   loop.prepareForCodegen();
   StmtPtr s = loop.root_stmt();
   s = IRSimplifier::simplify(s);
@@ -578,7 +578,7 @@ TEST(Reductions, ReduceAsConsumer) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
   Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}});
-  LoopNest loop({d}, {c, d});
+  LoopNest loop(std::vector<Tensor*>({d}), {c, d});
   loop.prepareForCodegen();
   StmtPtr s = loop.root_stmt();
   s = IRSimplifier::simplify(s);
@@ -1201,7 +1201,7 @@ TEST(Reductions, ReduceInlineReduction) {
     }
   }
 
-  LoopNest l1({y}, {x, y});
+  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
   // Cannot inline a reduction computation
   ASSERT_FALSE(l1.computeInline(x->buf()));
 }
@@ -1235,7 +1235,7 @@ TEST(Reductions, ReduceInlineConsumer) {
     }
   }
 
-  LoopNest l1({y}, {x, y});
+  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
   LoopNest l2(l1);
   l2.computeInline(x->buf());
 
@@ -1293,7 +1293,7 @@ TEST(Reductions, ReduceInlineReducerInternal) {
     }
   }
 
-  LoopNest l1({y}, {x, y});
+  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
   LoopNest l2(l1);
   l2.computeInline(x->buf());
 
@@ -1340,7 +1340,7 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) {
     return b.load(0, 0, l) * d->load(l);
   });
 
-  LoopNest l({e}, {c, d, e});
+  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
   LoopNest l_before(l);
   l_before.prepareForCodegen();
   SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
@@ -1417,7 +1417,7 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) {
     return b.load(0, 0, l) * d->load(l);
   });
 
-  LoopNest l({e}, {c, d, e});
+  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
   LoopNest l_before(l);
   l_before.prepareForCodegen();
   SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
@@ -1492,7 +1492,7 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) {
     return b.load(0, 0, l) * d->load(l);
   });
 
-  LoopNest l({e}, {c, d, e});
+  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
   LoopNest l_before(l);
   l_before.prepareForCodegen();
   SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
@@ -1563,7 +1563,7 @@ TEST(Reductions, ReductionCacheBodyAccess) {
     return b.load(0, 0, l) * d->load(l);
   });
 
-  LoopNest l({e}, {c, d, e});
+  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
 
   StmtPtr d_loop = l.getLoopStmtsFor(d)[1];
   l.cacheAccesses(c->buf(), "scale_local", d_loop);
@@ -1604,7 +1604,7 @@ TEST(Reductions, ReductionCacheConsumerAccess) {
     return b.load(0, 0, l) * d->load(l);
   });
 
-  LoopNest l({e}, {c, d, e});
+  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
 
   LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4);
 
@@ -1645,7 +1645,7 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) {
     return b.load(0, 0, l) * d->load(l);
   });
 
-  LoopNest l({e}, {c, d, e});
+  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr inner;
@@ -1693,7 +1693,7 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) {
     return b.load(0, 0, l) * d->load(l);
   });
 
-  LoopNest l({e}, {c, d, e});
+  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr inner;
diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp
index 9320f47bfb3d8..5a6f257d6a79b 100644
--- a/test/cpp/tensorexpr/tutorial.cpp
+++ b/test/cpp/tensorexpr/tutorial.cpp
@@ -256,7 +256,9 @@ int main(int argc, char* argv[]) {
     // Creating a loop nest is as quite simple, we just need to specify a list
     // of all and a list of output tensors:
     // NOLINTNEXTLINE(bugprone-argument-comment)
-    LoopNest loopnest(/*outputs=*/{Y}, /*all=*/{X, Y});
+    std::vector<Tensor*> outputs = {Y};
+    std::vector<Tensor*> all = {X, Y};
+    LoopNest loopnest(outputs, all);
 
     // An IR used in LoopNest is based on tensor statements, represented by
     // `Stmt` class. Statements are used to specify the loop nest structure, and
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index 2d00b1e4ab481..b342f1464b0c2 100644
--- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -389,34 +389,33 @@ class AtomicAddFuser : public IRMutator {
 
   StmtPtr mutate(StorePtr v) override {
     BufPtr buf = v->buf();
-    StorePtr orig = const_cast<StorePtr>(v); // NOLINT
 
     // Thread locals never need to be atomic.
     if (thread_local_bufs_.count(buf->base_handle()) != 0) {
-      return orig;
+      return v;
     }
 
     ScalarType dtype = v->value()->dtype().scalar_type();
     if (dtype != ScalarType::Float && dtype != ScalarType::Double) {
-      return orig;
+      return v;
     }
     AddPtr add_v = to<Add>(v->value());
     if (!add_v) {
-      return orig;
+      return v;
     }
     LoadPtr load_v = to<Load>(add_v->lhs());
     if (!load_v) {
-      return orig;
+      return v;
     }
     if (v->base_handle() != load_v->base_handle()) {
-      return orig;
+      return v;
     }
     if (v->indices().empty() && load_v->indices().empty()) {
-      return orig;
+      return v;
     }
     bool index_equal = CheckEqual(v->flat_index(), load_v->flat_index());
     if (!index_equal) {
-      return orig;
+      return v;
     }
 
     // TODO: this checks that the metavars occur directly as an index, but this
@@ -431,7 +430,7 @@ class AtomicAddFuser : public IRMutator {
 
     if (vars_to_find.empty()) {
       // All metavars accounted for.
-      return orig;
+      return v;
     }
 
     return alloc<AtomicAdd>(buf, v->indices(), add_v->rhs());
@@ -609,23 +608,21 @@ class PrioritizeLoad : public IRMutator {
   }
 
   StmtPtr mutate(BlockPtr v) override {
-    BlockPtr v1 = const_cast<BlockPtr>(v); // NOLINT
-    assert(v1);
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    std::list<StmtPtr> stmts = v1->stmts();
+    std::list<StmtPtr> stmts = v->stmts();
     for (StmtPtr stmt : stmts) {
       PushList();
       StmtPtr stmt_new = stmt->accept_mutator(this);
 
-      AddMemLoadsFromList(v1, stmt);
+      AddMemLoadsFromList(v, stmt);
       PopList();
 
       if (stmt_new == stmt) {
         continue;
       }
-      v1->replace_stmt(stmt, stmt_new);
+      v->replace_stmt(stmt, stmt_new);
     }
-    return v1;
+    return v;
   }
 
   ExprPtr mutate(IfThenElsePtr v) override {
@@ -821,7 +818,7 @@ StmtPtr GPUMetaVarRewriter::mutate(BlockPtr v) {
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     std::vector<StmtPtr> stmts;
     for (auto& v : innerSegments) {
-      for (auto* s : v.stmts()) {
+      for (auto s : v.stmts()) {
         stmts.push_back(s);
       }
     }
diff --git a/torch/csrc/jit/tensorexpr/half_support.h b/torch/csrc/jit/tensorexpr/half_support.h
index 15d48cd8952e0..eaf74d3c79d82 100644
--- a/torch/csrc/jit/tensorexpr/half_support.h
+++ b/torch/csrc/jit/tensorexpr/half_support.h
@@ -72,7 +72,8 @@ class HalfRewriter : public IRMutator {
       inserted_half_casts_.insert(new_val);
     }
 
-    return alloc<Store>(v->buf(), v->indices(), new_val);
+    v->set_value(new_val);
+    return v;
   }
 
   ExprPtr mutate(HalfImmPtr v) override {
diff --git a/torch/csrc/jit/tensorexpr/mem_dependency_checker.h b/torch/csrc/jit/tensorexpr/mem_dependency_checker.h
index 5363d2fc5ae93..1965b05009125 100644
--- a/torch/csrc/jit/tensorexpr/mem_dependency_checker.h
+++ b/torch/csrc/jit/tensorexpr/mem_dependency_checker.h
@@ -299,7 +299,7 @@ class TORCH_API MemDependencyChecker : public IRVisitor {
   DependencySet getAllReadsWithin(StmtOrExprPtr v) {
     DependencySet reads;
     auto insertAllReads = [&](const auto& nodes) {
-      for (auto* l : nodes) {
+      for (auto l : nodes) {
         auto bound = exprToAccess_.equal_range(l);
         for (auto it = bound.first; it != bound.second; ++it) {
           if (it->second->isRead()) {
@@ -324,7 +324,7 @@ class TORCH_API MemDependencyChecker : public IRVisitor {
 
     // writes just Store currently.
     auto stores = NodeFinder<Store>::find(v);
-    for (auto* s : stores) {
+    for (auto s : stores) {
       auto bound = stmtToAccess_.equal_range(s);
       for (auto it = bound.first; it != bound.second; ++it) {
         if (it->second->isWrite()) {
diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.h b/torch/csrc/jit/tensorexpr/operators/reduction.h
index 29f051f323b28..4335d7b3bd7f7 100644
--- a/torch/csrc/jit/tensorexpr/operators/reduction.h
+++ b/torch/csrc/jit/tensorexpr/operators/reduction.h
@@ -6,14 +6,14 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-Tensor* computeSum(
+TORCH_API Tensor* computeSum(
     const std::vector<ArgValue>& inputs,
     const c10::optional<ScalarType>& outputType);
-Tensor* computeMean(
+TORCH_API Tensor* computeMean(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType);
-Tensor* computeAdaptiveAvgPool2d(
+TORCH_API Tensor* computeAdaptiveAvgPool2d(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType);

From 4e15a6f495ac7f42927a175261238b91632e8494 Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Tue, 24 Aug 2021 00:29:22 -0700
Subject: [PATCH 158/530] [TensorExpr] Switch Exprs and Stmt from kernel-arena
 to shared_ptr. (#63216)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63216

Currently there are three classes managed by KernelArena: Expr, Stmt,
and Tensor (and derived classes). KernelArena has been a long standing
painpoint for NNC devs and we're moving away from that memory management
model to ref-count based memory model (using shared_ptr). This commit
switches Expr and Stmt to shared_ptr and is the biggest change in this
transition. Later commits will detach Tensor from KernelArena and kill
the arena + scope altogether.

Differential Revision:
D30353195
D30353195

Test Plan: Imported from OSS

Reviewed By: navahgar

Pulled By: ZolotukhinM

fbshipit-source-id: 9575225ada3d0fb65087ae40435f3dfea4792cae
---
 test/test_tensorexpr_pybind.py                |  3 -
 torch/csrc/jit/tensorexpr/eval.cpp            |  8 +-
 torch/csrc/jit/tensorexpr/expr.h              | 12 ++-
 torch/csrc/jit/tensorexpr/fwd_decls.h         |  9 +-
 torch/csrc/jit/tensorexpr/hash_provider.cpp   |  7 ++
 torch/csrc/jit/tensorexpr/hash_provider.h     | 23 ++++--
 torch/csrc/jit/tensorexpr/ir.h                |  6 ++
 torch/csrc/jit/tensorexpr/ir_cloner.cpp       |  8 +-
 torch/csrc/jit/tensorexpr/ir_mutator.cpp      |  8 +-
 torch/csrc/jit/tensorexpr/ir_printer.cpp      |  8 +-
 torch/csrc/jit/tensorexpr/ir_simplifier.cpp   | 81 ++++++++++++++++++
 torch/csrc/jit/tensorexpr/ir_simplifier.h     | 82 ++-----------------
 torch/csrc/jit/tensorexpr/ir_verifier.cpp     | 15 +++-
 torch/csrc/jit/tensorexpr/ir_visitor.cpp      |  8 +-
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp    |  5 +-
 torch/csrc/jit/tensorexpr/loopnest.cpp        |  2 +-
 torch/csrc/jit/tensorexpr/stmt.h              | 20 +++--
 torch/csrc/jit/tensorexpr/tensorexpr_init.cpp | 55 +++++--------
 18 files changed, 213 insertions(+), 147 deletions(-)

diff --git a/test/test_tensorexpr_pybind.py b/test/test_tensorexpr_pybind.py
index d838892975c0c..0ae59e1c56484 100644
--- a/test/test_tensorexpr_pybind.py
+++ b/test/test_tensorexpr_pybind.py
@@ -394,9 +394,6 @@ def f(a):
         np.testing.assert_allclose(res1.numpy(), correct.numpy(), atol=2e-3)
         np.testing.assert_allclose(res2.numpy(), correct.numpy(), atol=2e-3)
 
-    def test_forgot_kernel_arena(self):
-        self.assertRaises(RuntimeError, lambda: torch._C._te.VarHandle("n", torch._C._te.Dtype.Int))
-
     @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled")
     def test_alloc_in_loop(self):
         with kernel_arena_scope():
diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp
index c7a28bdbb23ac..05c3ff8245221 100644
--- a/torch/csrc/jit/tensorexpr/eval.cpp
+++ b/torch/csrc/jit/tensorexpr/eval.cpp
@@ -281,8 +281,12 @@ class SimpleIREvaluatorImpl : public IRVisitor {
     return Value(result_v);
   }
 
-  template <typename Op>
-  void visit_binary_op(BinaryOpNode<Op>* v, bool option = false) {
+  template <
+      typename D,
+      typename std::enable_if<std::is_same<
+          decltype(detail::bin_op_deducer(std::declval<D>())),
+          void>::value>::type* = nullptr>
+  void visit_binary_op(NodePtr<D> v, bool option = false) {
     v->lhs()->accept(this);
     Value lhs_v = value_;
     v->rhs()->accept(this);
diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h
index fae24ec34be28..1b942eaf353fc 100644
--- a/torch/csrc/jit/tensorexpr/expr.h
+++ b/torch/csrc/jit/tensorexpr/expr.h
@@ -36,10 +36,11 @@ enum IRNodeType {
 };
 
 // The common base between all expression node.
-class TORCH_API Expr : public KernelScopedObject {
+class TORCH_API Expr : public std::enable_shared_from_this<Expr> {
  public:
   explicit Expr(Dtype dtype, IRNodeType expr_type = kOther)
       : dtype_(dtype), expr_type_(expr_type) {}
+  virtual ~Expr() = default;
   Dtype dtype() const {
     return dtype_;
   }
@@ -66,6 +67,11 @@ class TORCH_API Expr : public KernelScopedObject {
    */
   static ExprPtr clone(ExprPtr s);
 
+ protected:
+  std::shared_ptr<Expr> getptr() {
+    return shared_from_this();
+  }
+
  private:
   Dtype dtype_;
   IRNodeType expr_type_;
@@ -78,7 +84,7 @@ class ExprNode : public Base {
  public:
   using ExprNodeBase = ExprNode<Op>;
   void accept(IRVisitor* visitor) override {
-    visitor->visit(static_to<Op>(this));
+    visitor->visit(static_to<Op>(Base::getptr()));
   }
   ExprPtr accept_mutator(IRMutator* mutator) override;
   // pass the constructor to the base class
@@ -335,7 +341,7 @@ class TORCH_API VarHandle : public ExprHandle {
 
 template <class Op, class Base>
 ExprPtr ExprNode<Op, Base>::accept_mutator(IRMutator* mutator) {
-  return mutator->mutate(static_to<Op>(this));
+  return mutator->mutate(static_to<Op>(Base::getptr()));
 }
 
 inline bool same_node(const ExprHandle& expr1, const ExprHandle& expr2) {
diff --git a/torch/csrc/jit/tensorexpr/fwd_decls.h b/torch/csrc/jit/tensorexpr/fwd_decls.h
index 01a767067f620..1b3dde560b427 100644
--- a/torch/csrc/jit/tensorexpr/fwd_decls.h
+++ b/torch/csrc/jit/tensorexpr/fwd_decls.h
@@ -1,26 +1,27 @@
 #pragma once
 #include <c10/core/ScalarType.h>
+#include <memory>
 
 namespace torch {
 namespace jit {
 namespace tensorexpr {
 
 template <typename Node>
-using NodePtr = Node*;
+using NodePtr = std::shared_ptr<Node>;
 
 template <typename To, typename From>
 NodePtr<To> to(NodePtr<From> x) {
-  return dynamic_cast<NodePtr<To>>(x);
+  return std::dynamic_pointer_cast<To>(x);
 }
 
 template <typename To, typename From>
 NodePtr<To> static_to(NodePtr<From> x) {
-  return static_cast<NodePtr<To>>(x);
+  return std::static_pointer_cast<To>(x);
 }
 
 template <typename Node, typename... Args>
 NodePtr<Node> alloc(Args&&... args) {
-  return new Node(std::forward<Args>(args)...);
+  return std::make_shared<Node>(std::forward<Args>(args)...);
 }
 
 class Buf;
diff --git a/torch/csrc/jit/tensorexpr/hash_provider.cpp b/torch/csrc/jit/tensorexpr/hash_provider.cpp
index fbc257d1988df..dce25669bf323 100644
--- a/torch/csrc/jit/tensorexpr/hash_provider.cpp
+++ b/torch/csrc/jit/tensorexpr/hash_provider.cpp
@@ -63,6 +63,13 @@ void HashProvider::visit(ModPtr v) {
   putHash(v, hash_combine(hashOf(v->lhs()), "%", hashOf(v->rhs())));
 }
 
+void HashProvider::visit(RoundOffPtr v) {
+  CACHE_GUARD();
+  v->lhs()->accept(this);
+  v->rhs()->accept(this);
+  putHash(v, hash_combine(hashOf(v->lhs()), "rof", hashOf(v->rhs())));
+}
+
 void HashProvider::visit(MaxPtr v) {
   CACHE_GUARD();
   v->lhs()->accept(this);
diff --git a/torch/csrc/jit/tensorexpr/hash_provider.h b/torch/csrc/jit/tensorexpr/hash_provider.h
index 5a33f048fec84..91ce269edeb5c 100644
--- a/torch/csrc/jit/tensorexpr/hash_provider.h
+++ b/torch/csrc/jit/tensorexpr/hash_provider.h
@@ -59,12 +59,16 @@ class TORCH_API HashProvider : public IRVisitor {
     return hashOf(e);
   }
 
-  bool cachedHash(const KernelScopedObject* e) {
+  bool cachedHash(ExprPtr e) {
     return exprToHash_.find(e) != exprToHash_.end();
   }
+  bool cachedHash(StmtPtr s) {
+    return stmtToHash_.find(s) != stmtToHash_.end();
+  }
 
   void clearCache() {
     exprToHash_.clear();
+    stmtToHash_.clear();
   }
 
   void visit(AddPtr v) override;
@@ -72,6 +76,7 @@ class TORCH_API HashProvider : public IRVisitor {
   void visit(MulPtr v) override;
   void visit(DivPtr v) override;
   void visit(ModPtr v) override;
+  void visit(RoundOffPtr v) override;
   void visit(MaxPtr v) override;
   void visit(MinPtr v) override;
   void visit(AndPtr v) override;
@@ -133,8 +138,8 @@ class TORCH_API HashProvider : public IRVisitor {
   }
 
   SimplifierHashType hashOf(StmtPtr s) {
-    auto it = exprToHash_.find(s);
-    if (it != exprToHash_.end()) {
+    auto it = stmtToHash_.find(s);
+    if (it != stmtToHash_.end()) {
       return it->second;
     }
 
@@ -182,15 +187,23 @@ class TORCH_API HashProvider : public IRVisitor {
     _hash_combine(seed, args...);
   }
 
-  void putHash(const KernelScopedObject* e, SimplifierHashType h) {
+  void putHash(ExprPtr e, SimplifierHashType h) {
     auto res = exprToHash_.emplace(e, h);
     if (res.second == false) {
       // This is always a logic bug since we should check the cache first.
       throw std::runtime_error("hash collision");
     }
   }
+  void putHash(StmtPtr s, SimplifierHashType h) {
+    auto res = stmtToHash_.emplace(s, h);
+    if (res.second == false) {
+      // This is always a logic bug since we should check the cache first.
+      throw std::runtime_error("hash collision");
+    }
+  }
 
-  std::unordered_map<const KernelScopedObject*, SimplifierHashType> exprToHash_;
+  std::unordered_map<ExprPtr, SimplifierHashType> exprToHash_;
+  std::unordered_map<StmtPtr, SimplifierHashType> stmtToHash_;
   UniqueNameManager name_manager_;
 
   size_t te_hash(SimplifierHashType val) {
diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h
index 761b233fe8375..f9fc7dcfc4246 100644
--- a/torch/csrc/jit/tensorexpr/ir.h
+++ b/torch/csrc/jit/tensorexpr/ir.h
@@ -178,6 +178,12 @@ class BinaryOpNode : public ExprNode<Op> {
   ExprPtr rhs_;
 };
 
+namespace detail {
+template <typename T>
+void bin_op_deducer(BinaryOpNode<T>);
+bool bin_op_deducer(...);
+} // namespace detail
+
 class TORCH_API Add : public BinaryOpNode<Add> {
  public:
   Add(ExprPtr lhs, ExprPtr rhs) : BinaryOpNode(lhs, rhs, IRNodeType::kAdd) {}
diff --git a/torch/csrc/jit/tensorexpr/ir_cloner.cpp b/torch/csrc/jit/tensorexpr/ir_cloner.cpp
index f724f2cbeb16f..e225826df66e2 100644
--- a/torch/csrc/jit/tensorexpr/ir_cloner.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_cloner.cpp
@@ -10,9 +10,13 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-template <typename Op>
+template <
+    typename Op,
+    typename std::enable_if<std::is_same<
+        decltype(detail::bin_op_deducer(std::declval<Op>())),
+        void>::value>::type* = nullptr>
 static ExprPtr mutate_binary_op(
-    NodePtr<BinaryOpNode<Op>> v,
+    NodePtr<Op> v,
     IRCloner* cloner,
     bool option = false) {
   ExprPtr lhs_new = v->lhs()->accept_mutator(cloner);
diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp
index 96635acab8c90..45121581eebf0 100644
--- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp
@@ -11,9 +11,13 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-template <typename Op>
+template <
+    typename Op,
+    typename std::enable_if<std::is_same<
+        decltype(detail::bin_op_deducer(std::declval<Op>())),
+        void>::value>::type* = nullptr>
 static ExprPtr mutate_binary_op(
-    BinaryOpNode<Op>* v,
+    NodePtr<Op> v,
     IRMutator* mutator,
     bool option = false) {
   ExprPtr lhs = v->lhs();
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp
index 23466f39160c8..f885246e24d2b 100644
--- a/torch/csrc/jit/tensorexpr/ir_printer.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp
@@ -28,9 +28,13 @@ void IRPrinter::print(Stmt& stmt) {
 
 // TODO: change whether to include the parenthesis to the parent expression,
 // we need to look at the operator precedence to make the output simpler.
-template <typename Op>
+template <
+    typename Op,
+    typename std::enable_if<std::is_same<
+        decltype(detail::bin_op_deducer(std::declval<Op>())),
+        void>::value>::type* = nullptr>
 void visitBinaryOp(
-    BinaryOpNode<Op>* v,
+    NodePtr<Op> v,
     const std::string& op_str,
     IRPrinter* printer,
     bool parens = true) {
diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
index cb731d2525e71..23216dd4002f7 100644
--- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
@@ -6,6 +6,70 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
+// Creates a new Expr of the given type with the provided lhs and rhs.
+inline ExprPtr newBinaryOpOfType(
+    IRNodeType expr_type,
+    ExprPtr lhs,
+    ExprPtr rhs,
+    bool option) {
+  switch (expr_type) {
+    // NOLINTNEXTLINE(bugprone-branch-clone)
+    case IRNodeType::kAdd:
+      return alloc<Add>(lhs, rhs);
+    case IRNodeType::kSub:
+      return alloc<Sub>(lhs, rhs);
+    case IRNodeType::kMul:
+      return alloc<Mul>(lhs, rhs);
+    case IRNodeType::kDiv:
+      return alloc<Div>(lhs, rhs);
+    case IRNodeType::kMod:
+      return alloc<Mod>(lhs, rhs);
+    case IRNodeType::kMax:
+      return alloc<Max>(lhs, rhs, option);
+    case IRNodeType::kMin:
+      return alloc<Min>(lhs, rhs, option);
+    case IRNodeType::kAnd:
+      return alloc<And>(lhs, rhs);
+    case IRNodeType::kXor:
+      return alloc<Xor>(lhs, rhs);
+    case IRNodeType::kLshift:
+      return alloc<Lshift>(lhs, rhs);
+    case IRNodeType::kRshift:
+      return alloc<Rshift>(lhs, rhs);
+    default:
+      LOG(FATAL) << "unsupported expr_type: " << static_cast<int>(expr_type);
+      return nullptr;
+  }
+}
+
+template <
+    typename Op,
+    typename std::enable_if<std::is_same<
+        decltype(detail::bin_op_deducer(std::declval<Op>())),
+        void>::value>::type* = nullptr>
+static ExprPtr mutateBinaryOp(
+    NodePtr<Op> v,
+    IRMutator* mutator,
+    bool option = false) {
+  ExprPtr lhs = v->lhs();
+  ExprPtr rhs = v->rhs();
+  ExprPtr lhs_new = lhs->accept_mutator(mutator);
+  ExprPtr rhs_new = rhs->accept_mutator(mutator);
+
+  ExprPtr node = v;
+
+  if (lhs != lhs_new || rhs != rhs_new) {
+    node = newBinaryOpOfType(v->expr_type(), lhs_new, rhs_new, option);
+  }
+
+  // Can only fold if both sides are constant.
+  if (!lhs_new->isConstant() || !rhs_new->isConstant()) {
+    return node;
+  }
+
+  return evaluateOp(node);
+}
+
 // Simple recursive GCD.
 template <typename T>
 T gcd(T a, T b) {
@@ -1499,6 +1563,22 @@ ExprPtr PolynomialTransformer::mutate(IfThenElsePtr v) {
   return alloc<IfThenElse>(condition_new, true_value_new, false_value_new);
 }
 
+ExprPtr PolynomialTransformer::mutate(AndPtr v) {
+  return mutateBinaryOp(v, this);
+}
+
+ExprPtr PolynomialTransformer::mutate(XorPtr v) {
+  return mutateBinaryOp(v, this);
+}
+
+ExprPtr PolynomialTransformer::mutate(LshiftPtr v) {
+  return mutateBinaryOp(v, this);
+}
+
+ExprPtr PolynomialTransformer::mutate(RshiftPtr v) {
+  return mutateBinaryOp(v, this);
+}
+
 StmtPtr PolynomialBase::mutate(CondPtr v) {
   ExprPtr cond_old = v->condition();
   StmtPtr true_old = v->true_stmt();
@@ -1904,6 +1984,7 @@ c10::optional<class ModRound*> isModRound(TermPtr e) {
     scalar = getImmediateByType(multiplier->dtype(), 1);
   }
 
+  // TODO: this leaks memory!
   return new ModRound(scalar, denom, divisor, mod_divisor);
 }
 
diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.h b/torch/csrc/jit/tensorexpr/ir_simplifier.h
index 87c476242e8de..1df8b5d8f3501 100644
--- a/torch/csrc/jit/tensorexpr/ir_simplifier.h
+++ b/torch/csrc/jit/tensorexpr/ir_simplifier.h
@@ -55,7 +55,7 @@ Dtype promoteTypesVec(std::vector<ExprType>& v) {
 template <class ExprType>
 Dtype promoteTypesMap(
     ExprPtr s,
-    std::unordered_map<SimplifierHashType, ExprType*>& m) {
+    std::unordered_map<SimplifierHashType, ExprType>& m) {
   Dtype t = s->dtype();
   bool first = true;
   for (auto& e : m) {
@@ -69,12 +69,12 @@ Dtype promoteTypesMap(
 }
 
 template <class ExprType>
-Dtype promoteTypesVar(ExprType* e) {
+Dtype promoteTypesVar(ExprType e) {
   return e->dtype();
 }
 
 template <class ExprType, class... Args>
-Dtype promoteTypesVar(ExprType* e, Args... es) {
+Dtype promoteTypesVar(ExprType e, Args... es) {
   Dtype lhs = e->dtype();
   Dtype rhs = promoteTypesVar(es...);
   if (e->isConstant()) {
@@ -84,42 +84,6 @@ Dtype promoteTypesVar(ExprType* e, Args... es) {
   return promoteTypes(lhs, rhs);
 }
 
-// Creates a new Expr of the given type with the provided lhs and rhs.
-inline ExprPtr newBinaryOpOfType(
-    IRNodeType expr_type,
-    ExprPtr lhs,
-    ExprPtr rhs,
-    bool option) {
-  switch (expr_type) {
-    // NOLINTNEXTLINE(bugprone-branch-clone)
-    case IRNodeType::kAdd:
-      return alloc<Add>(lhs, rhs);
-    case IRNodeType::kSub:
-      return alloc<Sub>(lhs, rhs);
-    case IRNodeType::kMul:
-      return alloc<Mul>(lhs, rhs);
-    case IRNodeType::kDiv:
-      return alloc<Div>(lhs, rhs);
-    case IRNodeType::kMod:
-      return alloc<Mod>(lhs, rhs);
-    case IRNodeType::kMax:
-      return alloc<Max>(lhs, rhs, option);
-    case IRNodeType::kMin:
-      return alloc<Min>(lhs, rhs, option);
-    case IRNodeType::kAnd:
-      return alloc<And>(lhs, rhs);
-    case IRNodeType::kXor:
-      return alloc<Xor>(lhs, rhs);
-    case IRNodeType::kLshift:
-      return alloc<Lshift>(lhs, rhs);
-    case IRNodeType::kRshift:
-      return alloc<Rshift>(lhs, rhs);
-    default:
-      LOG(FATAL) << "unsupported expr_type: " << static_cast<int>(expr_type);
-      return nullptr;
-  }
-}
-
 // Uses the evaluator to fold an Expression with constant terms.
 // E.g. evaluateOp(Add(3, 4)) => 7.
 // Expr v must not have any unbound Vars.
@@ -498,21 +462,13 @@ class TORCH_API PolynomialTransformer : public PolynomialBase {
 
   ExprPtr mutate(ModPtr v) override;
 
-  ExprPtr mutate(AndPtr v) override {
-    return mutateBinaryOp(v, this);
-  }
+  ExprPtr mutate(AndPtr v) override;
 
-  ExprPtr mutate(XorPtr v) override {
-    return mutateBinaryOp(v, this);
-  }
+  ExprPtr mutate(XorPtr v) override;
 
-  ExprPtr mutate(LshiftPtr v) override {
-    return mutateBinaryOp(v, this);
-  }
+  ExprPtr mutate(LshiftPtr v) override;
 
-  ExprPtr mutate(RshiftPtr v) override {
-    return mutateBinaryOp(v, this);
-  }
+  ExprPtr mutate(RshiftPtr v) override;
 
   ExprPtr mutate(MaxPtr v) override;
 
@@ -526,30 +482,6 @@ class TORCH_API PolynomialTransformer : public PolynomialBase {
 
   ExprPtr mutate(IfThenElsePtr v) override;
 
-  template <typename Op>
-  static ExprPtr mutateBinaryOp(
-      BinaryOpNode<Op>* v,
-      IRMutator* mutator,
-      bool option = false) {
-    ExprPtr lhs = v->lhs();
-    ExprPtr rhs = v->rhs();
-    ExprPtr lhs_new = lhs->accept_mutator(mutator);
-    ExprPtr rhs_new = rhs->accept_mutator(mutator);
-
-    ExprPtr node = v;
-
-    if (lhs != lhs_new || rhs != rhs_new) {
-      node = newBinaryOpOfType(v->expr_type(), lhs_new, rhs_new, option);
-    }
-
-    // Can only fold if both sides are constant.
-    if (!lhs_new->isConstant() || !rhs_new->isConstant()) {
-      return node;
-    }
-
-    return evaluateOp(node);
-  }
-
   static ExprPtr simplify(ExprPtr e);
   static ExprHandle simplify(const ExprHandle& e);
   static StmtPtr simplify(StmtPtr e);
diff --git a/torch/csrc/jit/tensorexpr/ir_verifier.cpp b/torch/csrc/jit/tensorexpr/ir_verifier.cpp
index c88e92c9a7a82..f7adbdee93992 100644
--- a/torch/csrc/jit/tensorexpr/ir_verifier.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_verifier.cpp
@@ -9,8 +9,19 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-template <typename Op>
-void verifyBitwiseOp(const BitwiseOpNode<Op>* v, IRVerifier* verifier) {
+namespace detail {
+template <typename T>
+void deducer(BinaryOpNode<T>);
+
+bool deducer(...);
+} // namespace detail
+
+template <
+    typename D,
+    typename std::enable_if<std::is_same<
+        decltype(detail::deducer(std::declval<D>())),
+        void>::value>::type* = nullptr>
+void verifyBitwiseOp(NodePtr<D> v, IRVerifier* verifier) {
   if (!v->lhs()->dtype().is_integral()) {
     throw unsupported_dtype();
   }
diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.cpp b/torch/csrc/jit/tensorexpr/ir_visitor.cpp
index 9066544bd2291..eb2a4280c4f88 100644
--- a/torch/csrc/jit/tensorexpr/ir_visitor.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_visitor.cpp
@@ -11,8 +11,12 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-template <typename Op>
-static void visit_binary_op(BinaryOpNode<Op>* v, IRVisitor* visitor) {
+template <
+    typename Op,
+    typename std::enable_if<std::is_same<
+        decltype(detail::bin_op_deducer(std::declval<Op>())),
+        void>::value>::type* = nullptr>
+static void visit_binary_op(NodePtr<Op> v, IRVisitor* visitor) {
   v->lhs()->accept(visitor);
   v->rhs()->accept(visitor);
 }
diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index eac1f82f25c4b..4ab2d53cc4942 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -488,12 +488,13 @@ class LLVMIntrinsicsExpander : public GenericIntrinsicsExpander {
     if (v->op_type() == kTanh) {
       ScalarType stype = v->dtype().scalar_type();
       if (stype == ScalarType::Float) {
-        return fast_tanh(v->param(0)->accept_mutator(this)).node();
+        return fast_tanh(ExprHandle(v->param(0)->accept_mutator(this))).node();
       }
     } else if (v->op_type() == kSigmoid) {
       ScalarType stype = v->dtype().scalar_type();
       if (stype == ScalarType::Float) {
-        return fast_sigmoid(v->param(0)->accept_mutator(this)).node();
+        return fast_sigmoid(ExprHandle(v->param(0)->accept_mutator(this)))
+            .node();
       }
     }
     // TODO: fast exp
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index a296d8c7af79b..d9d20736057fb 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -2380,7 +2380,7 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) {
 
 void LoopNest::compressAllBuffers(StmtPtr stmt) {
   for (auto buf : BufFinder::find(stmt)) {
-    compressBuffer(const_cast<BufPtr>(buf), stmt);
+    compressBuffer(buf, stmt);
   }
 }
 
diff --git a/torch/csrc/jit/tensorexpr/stmt.h b/torch/csrc/jit/tensorexpr/stmt.h
index 0b4a2e4c5361c..7e4914fbc4aa7 100644
--- a/torch/csrc/jit/tensorexpr/stmt.h
+++ b/torch/csrc/jit/tensorexpr/stmt.h
@@ -14,14 +14,15 @@ namespace tensorexpr {
 class Placeholder;
 
 // The common base between all statement node.
-class TORCH_API Stmt : public KernelScopedObject {
+class TORCH_API Stmt : public std::enable_shared_from_this<Stmt> {
  public:
   Stmt() = default;
+  virtual ~Stmt() = default;
   virtual void accept(IRVisitor* visitor) = 0;
   virtual StmtPtr accept_mutator(IRMutator* mutator) = 0;
 
   StmtPtr get_parent() const {
-    return parent_;
+    return parent_ ? parent_->getptr() : nullptr;
   }
 
   /*
@@ -34,12 +35,15 @@ class TORCH_API Stmt : public KernelScopedObject {
   static StmtPtr clone(StmtPtr s);
 
  protected:
-  static void set_parent(StmtPtr s, StmtPtr new_parent) {
+  static void set_parent(StmtPtr s, Stmt* new_parent) {
     s->parent_ = new_parent;
   }
+  std::shared_ptr<Stmt> getptr() {
+    return shared_from_this();
+  }
 
  private:
-  StmtPtr parent_ = nullptr;
+  Stmt* parent_ = nullptr;
 };
 
 template <class Op>
@@ -47,7 +51,7 @@ class StmtNode : public Stmt {
  public:
   using StmtNodeBase = StmtNode<Op>;
   void accept(IRVisitor* visitor) override {
-    visitor->visit(static_to<Op>(this));
+    visitor->visit(static_to<Op>(getptr()));
   }
   StmtPtr accept_mutator(IRMutator* mutator) override;
   StmtNode() = default;
@@ -55,7 +59,7 @@ class StmtNode : public Stmt {
 
 template <class Op>
 StmtPtr StmtNode<Op>::accept_mutator(IRMutator* mutator) {
-  return mutator->mutate(static_to<Op>(this));
+  return mutator->mutate(static_to<Op>(getptr()));
 }
 
 // Concrete Stmt classes
@@ -193,7 +197,7 @@ class TORCH_API Block : public StmtNode<Block> {
   }
 
   void clear() {
-    for (auto* s : stmts_) {
+    for (auto s : stmts_) {
       set_parent(s, nullptr);
     }
     stmts_.clear();
@@ -281,7 +285,7 @@ class TORCH_API Block : public StmtNode<Block> {
 
   // returns the immediate child containing statement s.
   StmtPtr getEnclosedRoot(StmtPtr s) const {
-    while (s && s->get_parent() != this) {
+    while (s && s->get_parent().get() != this) {
       s = s->get_parent();
     }
     return s;
diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
index 304a317076c05..4e1618a8745d7 100644
--- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
+++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
@@ -184,10 +184,7 @@ void initTensorExprBindings(PyObject* module) {
           [](Placeholder& self,
              const std::vector<ExprHandle>& args,
              const ExprHandle& val) { return self.store(args, val); })
-      .def(
-          "data",
-          [](Placeholder& self) { return BufHandle(self.data()); },
-          py::return_value_policy::reference);
+      .def("data", [](Placeholder& self) { return BufHandle(self.data()); });
   py::class_<Tensor, std::unique_ptr<Tensor, py::nodelete>>(te, "Tensor")
       .def(py::init(
           [](BufHandle& b, StmtPtr s) { return new Tensor(b.node(), s); }))
@@ -197,8 +194,9 @@ void initTensorExprBindings(PyObject* module) {
             return self.load(v);
           })
       .def("buf", [](Tensor& self) { return BufHandle(self.buf()); })
-      .def("stmt", &Tensor::stmt, py::return_value_policy::reference);
-  py::class_<Cast>(te, "Cast").def_static("make", &Cast::make);
+      .def("stmt", &Tensor::stmt);
+  py::class_<Cast, std::shared_ptr<Cast>>(te, "Cast")
+      .def_static("make", &Cast::make);
 
   py::class_<DimArg>(te, "DimArg")
       .def(py::init<const ExprHandle&>())
@@ -321,7 +319,7 @@ void initTensorExprBindings(PyObject* module) {
       },
       py::return_value_policy::reference);
 
-  py::class_<Stmt, std::unique_ptr<Stmt, py::nodelete>>(te, "Stmt")
+  py::class_<Stmt, std::shared_ptr<Stmt>>(te, "Stmt")
       .def(py::init([](const std::vector<StmtPtr>& stmts) {
         return tensorexpr::Block::make(stmts);
       }))
@@ -330,22 +328,18 @@ void initTensorExprBindings(PyObject* module) {
         ss << self;
         return ss.str();
       });
-  py::class_<Store, Stmt, std::unique_ptr<Store, py::nodelete>>(te, "Store")
+  py::class_<Store, Stmt, std::shared_ptr<Store>>(te, "Store")
       .def_static(
           "make",
           [](const BufHandle& buf,
              std::vector<ExprHandle>& indices,
              const ExprHandle& value) {
             return Store::make(buf, indices, value);
-          },
-          py::return_value_policy::reference);
+          });
 
-  py::class_<For, Stmt, std::unique_ptr<For, py::nodelete>>(te, "For")
-      .def(
-          "index_var",
-          [](For& self) { return VarHandle(self.var()); },
-          py::return_value_policy::reference)
-      .def("body", &For::body, py::return_value_policy::reference)
+  py::class_<For, Stmt, std::shared_ptr<For>>(te, "For")
+      .def("index_var", [](For& self) { return VarHandle(self.var()); })
+      .def("body", &For::body)
       .def("set_parallel", &For::set_parallel)
       .def(
           "set_gpu_block_index",
@@ -362,35 +356,28 @@ void initTensorExprBindings(PyObject* module) {
           [](const VarHandle& var,
              const ExprHandle& start,
              const ExprHandle& stop,
-             StmtPtr body) { return For::make(var, start, stop, body); },
-          py::return_value_policy::reference);
+             StmtPtr body) { return For::make(var, start, stop, body); });
 
-  py::class_<Cond, Stmt, std::unique_ptr<Cond, py::nodelete>>(te, "Cond")
+  py::class_<Cond, Stmt, std::shared_ptr<Cond>>(te, "Cond")
       .def_static(
           "make",
           [](const ExprHandle& condition,
              StmtPtr true_stmt,
              StmtPtr false_stmt) {
-            return alloc<Cond>(condition.node(), true_stmt, false_stmt);
-          },
-          py::return_value_policy::reference)
-      .def("true_stmt", &Cond::true_stmt, py::return_value_policy::reference)
-      .def("false_stmt", &Cond::false_stmt, py::return_value_policy::reference);
+            return Cond::make(condition, true_stmt, false_stmt);
+          })
+      .def("true_stmt", &Cond::true_stmt)
+      .def("false_stmt", &Cond::false_stmt);
 
-  py::class_<
-      tensorexpr::Block,
-      Stmt,
-      std::unique_ptr<tensorexpr::Block, py::nodelete>>(te, "Block")
+  py::class_<tensorexpr::Block, Stmt, std::shared_ptr<tensorexpr::Block>>(
+      te, "Block")
       .def(py::init([](const std::vector<StmtPtr>& stmts) {
         return tensorexpr::Block::make(stmts);
       }))
-      .def(
-          "stmts",
-          &tensorexpr::Block::stmts,
-          py::return_value_policy::reference);
-  py::class_<ExternalCall, Stmt, std::unique_ptr<ExternalCall, py::nodelete>>(
+      .def("stmts", &tensorexpr::Block::stmts);
+  py::class_<ExternalCall, Stmt, std::shared_ptr<ExternalCall>>(
       te, "ExternalCall")
-      .def(py::init(&ExternalCall::make), py::return_value_policy::reference);
+      .def(py::init(&ExternalCall::make));
 
   py::class_<LoopNest>(te, "LoopNest")
       .def(py::init<const std::vector<Tensor*>&>())

From 62d02f2b577c223f94d0b190df3f158cd985c221 Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Tue, 24 Aug 2021 00:29:22 -0700
Subject: [PATCH 159/530] [TensorExpr] Make 'Tensor' a value type. (#63586)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63586

This is another commit in transition from KernelArena memory management.
Tensor is essentially just a pair of <BufPtr, StmtPtr> and we don't need
to dynamically allocate it at all - it's cheap to pass it by value, and
that's what we're switching to in this commit.

After this change nothing uses KernelScope/KernelArena and they can be
safely removed.

Differential Revision:
D30429114
D30429114

Test Plan: Imported from OSS

Reviewed By: navahgar

Pulled By: ZolotukhinM

fbshipit-source-id: f90b859cfe863692b7beffbe9bd0e4143df1e819
---
 benchmarks/cpp/tensorexpr/bench_approx.cpp    |  20 +-
 benchmarks/cpp/tensorexpr/bench_batchnorm.cpp |   4 +-
 benchmarks/cpp/tensorexpr/bench_compile.cpp   |  40 +-
 benchmarks/cpp/tensorexpr/bench_concat.cpp    |   4 +-
 benchmarks/cpp/tensorexpr/bench_gemm.cpp      |  12 +-
 benchmarks/cpp/tensorexpr/bench_parallel.cpp  |   2 +-
 benchmarks/cpp/tensorexpr/bench_reduce.cpp    |  16 +-
 test/cpp/tensorexpr/test_approx.cpp           |   4 +-
 test/cpp/tensorexpr/test_boundsinference.cpp  | 190 +++---
 test/cpp/tensorexpr/test_conv.cpp             |   8 +-
 test/cpp/tensorexpr/test_cuda.cpp             |  68 +-
 test/cpp/tensorexpr/test_external_calls.cpp   |  48 +-
 test/cpp/tensorexpr/test_ir_printer.cpp       |  14 +-
 test/cpp/tensorexpr/test_kernel.cpp           |   2 +-
 test/cpp/tensorexpr/test_llvm.cpp             |  64 +-
 test/cpp/tensorexpr/test_loopnest.cpp         | 645 +++++++++---------
 test/cpp/tensorexpr/test_memdependency.cpp    |  68 +-
 test/cpp/tensorexpr/test_ops.cpp              |   4 +-
 test/cpp/tensorexpr/test_reductions.cpp       | 230 +++----
 test/cpp/tensorexpr/test_simplify.cpp         |   2 +-
 test/cpp/tensorexpr/tutorial.cpp              |  30 +-
 torch/csrc/jit/runtime/static/ops.cpp         |  14 +-
 torch/csrc/jit/tensorexpr/codegen.h           |   2 +-
 torch/csrc/jit/tensorexpr/ir_printer.cpp      |  17 +-
 torch/csrc/jit/tensorexpr/ir_printer.h        |   4 +-
 torch/csrc/jit/tensorexpr/kernel.cpp          |  62 +-
 torch/csrc/jit/tensorexpr/kernel.h            |  10 +-
 torch/csrc/jit/tensorexpr/loopnest.cpp        |  22 +-
 torch/csrc/jit/tensorexpr/loopnest.h          |  14 +-
 .../csrc/jit/tensorexpr/operators/conv2d.cpp  |  18 +-
 torch/csrc/jit/tensorexpr/operators/conv2d.h  |   8 +-
 .../csrc/jit/tensorexpr/operators/matmul.cpp  |   8 +-
 torch/csrc/jit/tensorexpr/operators/matmul.h  |   4 +-
 torch/csrc/jit/tensorexpr/operators/norm.cpp  |   2 +-
 torch/csrc/jit/tensorexpr/operators/norm.h    |   2 +-
 .../jit/tensorexpr/operators/reduction.cpp    |  10 +-
 .../csrc/jit/tensorexpr/operators/reduction.h |   6 +-
 .../csrc/jit/tensorexpr/operators/softmax.cpp |  31 +-
 torch/csrc/jit/tensorexpr/operators/softmax.h |   2 +-
 torch/csrc/jit/tensorexpr/tensor.cpp          |  30 +-
 torch/csrc/jit/tensorexpr/tensor.h            |  39 +-
 torch/csrc/jit/tensorexpr/tensorexpr_init.cpp |  20 +-
 42 files changed, 881 insertions(+), 919 deletions(-)

diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp
index 6e31697d586dd..92c26401f1617 100644
--- a/benchmarks/cpp/tensorexpr/bench_approx.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_approx.cpp
@@ -10,14 +10,14 @@
 using namespace torch::jit;
 using namespace torch::jit::tensorexpr;
 
-void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target, int width) {
+void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor target, int width) {
   auto loops = ln->getLoopStmtsFor(target);
   ForPtr inner, tail;
   ln->splitWithTail(loops[0], width, &inner, &tail);
   ln->vectorize(inner);
 }
 
-void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target) {
+void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) {
   std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
   ForPtr inner, tail;
   ln->splitWithTail(loops[0], 16 * 8, &inner, &tail);
@@ -33,7 +33,7 @@ static void relu_nnc(benchmark::State& state) {
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   auto clamp = 0;
-  torch::jit::tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i){
+  torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i){
     auto A_elem = [&]() {
       auto elem = A.load(i);
       auto min = FloatImm::make(clamp);
@@ -67,7 +67,7 @@ static void log_nnc_sleef(benchmark::State& state) {
   KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
+  torch::jit::tensorexpr::Tensor B =
       Compute("B", {N}, [&](const VarHandle& i) {
         return log(A.load(i));
       });
@@ -97,7 +97,7 @@ static void log_nnc_fast(benchmark::State& state) {
   KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
+  torch::jit::tensorexpr::Tensor B =
       Compute("B", {N}, [&](const VarHandle& i) {
         return fast_log(A.load(i));
       });
@@ -127,7 +127,7 @@ static void log_nnc_vml(benchmark::State& state) {
   KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
+  torch::jit::tensorexpr::Tensor B =
       Compute("B", {N}, [&](const VarHandle& i) {
         return log_vml(A.load(i));
       });
@@ -168,7 +168,7 @@ static void logit_nnc_sleef(benchmark::State& state) {
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   auto clamp = 1e-6f;
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
     auto A_elem = [&]() {
       auto elem = A.load(i);
       auto min = FloatImm::make(clamp);
@@ -205,7 +205,7 @@ static void logit_nnc_fast(benchmark::State& state) {
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   auto clamp = 1e-6f;
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
     auto A_elem = [&]() {
       auto elem = A.load(i);
       auto min = FloatImm::make(clamp);
@@ -242,7 +242,7 @@ static void logit_nnc_vml(benchmark::State& state) {
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   auto clamp = 1e-6f;
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
     auto A_elem = [&]() {
       auto elem = A.load(i);
       auto min = FloatImm::make(clamp);
@@ -319,7 +319,7 @@ static void tanh_nnc_fast(benchmark::State& state) {
   KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
+  torch::jit::tensorexpr::Tensor B =
       Compute("B", {N}, [&](const VarHandle& i) {
         return fast_tanh(A.load(i));
       });
diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
index 872594ec286b7..85bf9d326ffac 100644
--- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
@@ -84,7 +84,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) {
   VarHandle eps("eps", kFloat);
 
   using axis = const VarHandle&;
-  Tensor* output = Compute(
+  Tensor output = Compute(
       "output",
       {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}},
       [&](axis n, axis c, axis h, axis w) {
@@ -147,7 +147,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) {
   VarHandle eps("eps", kFloat);
 
   using axis = const VarHandle&;
-  Tensor* output = Compute(
+  Tensor output = Compute(
       "output",
       {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}},
       [&](axis n, axis c, axis h, axis w) {
diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp
index 245d5d8b203c5..50d54e57f889a 100644
--- a/benchmarks/cpp/tensorexpr/bench_compile.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp
@@ -13,24 +13,24 @@ static void BM_CompileSwish(benchmark::State& state) {
     te::KernelScope ks;
     te::VarHandle n("n", te::kInt);
     te::Placeholder A(te::BufHandle("A", {N}, te::kFloat));
-    te::Tensor* relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
+    te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
       return te::Max::make(A.load(i), 0.f, false);
     });
-    te::Tensor* min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
-      return te::Min::make(relu->load(i), 6.f, false);
+    te::Tensor min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
+      return te::Min::make(relu.load(i), 6.f, false);
     });
-    te::Tensor* plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
-      return min6->load(i) + 3.f;
+    te::Tensor plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
+      return min6.load(i) + 3.f;
     });
-    te::Tensor* times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
-      return A.load(i) * plus3->load(i);
+    te::Tensor times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
+      return A.load(i) * plus3.load(i);
     });
-    te::Tensor* sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
-      return times->load(i) * 1.f / 6.f;
+    te::Tensor sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
+      return times.load(i) * 1.f / 6.f;
     });
     te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
     for (auto tensor : {relu, min6, plus3, times}) {
-      nest.computeInline(tensor->buf());
+      nest.computeInline(tensor.buf());
     }
     nest.prepareForCodegen();
     te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
@@ -43,24 +43,24 @@ static void BM_CompileSwishLLVMOnly(benchmark::State& state) {
   te::KernelScope ks;
   te::VarHandle n("n", te::kInt);
   te::Placeholder A(te::BufHandle("A", {N}, te::kFloat));
-  te::Tensor* relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
+  te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
     return te::Max::make(A.load(i), 0.f, false);
   });
-  te::Tensor* min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
-    return te::Min::make(relu->load(i), 6.f, false);
+  te::Tensor min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
+    return te::Min::make(relu.load(i), 6.f, false);
   });
-  te::Tensor* plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
-    return min6->load(i) + 3.f;
+  te::Tensor plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
+    return min6.load(i) + 3.f;
   });
-  te::Tensor* times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
-    return A.load(i) * plus3->load(i);
+  te::Tensor times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
+    return A.load(i) * plus3.load(i);
   });
-  te::Tensor* sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
-    return times->load(i) * 1.f / 6.f;
+  te::Tensor sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
+    return times.load(i) * 1.f / 6.f;
   });
   te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
   for (auto tensor : {relu, min6, plus3, times}) {
-    nest.computeInline(tensor->buf());
+    nest.computeInline(tensor.buf());
   }
   nest.prepareForCodegen();
   te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp
index cb9aa84150e88..856065d6e789f 100644
--- a/benchmarks/cpp/tensorexpr/bench_concat.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp
@@ -60,7 +60,7 @@ class ConcatBench : public benchmark::Fixture {
           {input_sizes_[i][0], input_sizes_[i][1]}));
     }
 
-    Tensor* output = Compute(
+    Tensor output = Compute(
         "aten_cat",
         {{output_size_[0], "M"}, {output_size_[1], "N"}},
         [&](const VarHandle& m, const VarHandle& n) {
@@ -147,7 +147,7 @@ class ConcatBench : public benchmark::Fixture {
       for_stmts[i] = for_st;
       cumulative_input_sizes += input_sizes_[i][1];
     }
-    auto output = new Tensor(output_buf, alloc<Block>(for_stmts));
+    auto output = Tensor(output_buf, alloc<Block>(for_stmts));
 
     LoopNest nest({output});
     nest.prepareForCodegen();
diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp
index 7ebaa87781514..8646e97b756c2 100644
--- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp
@@ -44,7 +44,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
 
   te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
   te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
-  te::Tensor* CT = te::Reduce(
+  te::Tensor CT = te::Reduce(
       "gemm",
       {{M, "M"}, {N, "N"}},
       te::Sum(),
@@ -68,7 +68,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
 
   te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
   te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
-  te::Tensor* CT = te::Reduce(
+  te::Tensor CT = te::Reduce(
       "gemm",
       {{M, "M"}, {N, "N"}},
       te::Sum(),
@@ -128,7 +128,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
 
   te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
   te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
-  te::Tensor* CT = te::Reduce(
+  te::Tensor CT = te::Reduce(
       "gemm",
       {{M, "M"}, {N, "N"}},
       te::Sum(),
@@ -188,7 +188,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
 
   te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
   te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
-  te::Tensor* CT = te::Reduce(
+  te::Tensor CT = te::Reduce(
       "gemm",
       {{M, "M"}, {N, "N"}},
       te::Sum(),
@@ -256,7 +256,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
 
   te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
   te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
-  te::Tensor* CT = te::Reduce(
+  te::Tensor CT = te::Reduce(
       "gemm",
       {{M, "M"}, {N, "N"}},
       te::Sum(),
@@ -302,7 +302,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
   }
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    loop.cacheAccesses(CT->buf(), "C_regs", loops[2]);
+    loop.cacheAccesses(CT.buf(), "C_regs", loops[2]);
   }
 
   loop.prepareForCodegen();
diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp
index 966c9e2a6853d..847b66d9ee58b 100644
--- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp
@@ -38,7 +38,7 @@ BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
   KernelScope kernel_scope;
   Placeholder a_buf("a", kFloat, {M});
   Placeholder b_buf("b", kFloat, {M});
-  Tensor* c_tensor = Compute(
+  Tensor c_tensor = Compute(
       "c", {{M, "m"}}, [&](const VarHandle& m) {
         return a_buf.load(m) + b_buf.load(m);
       });
diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
index be5dcc815bc68..9d3570197414d 100644
--- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
@@ -222,7 +222,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
   int M = A.numel();
 
   te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
-  te::Tensor* BT = te::Reduce(
+  te::Tensor BT = te::Reduce(
       "reduce_full",
       {{1, "N"}},
       te::Sum(),
@@ -255,7 +255,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
   int M = A.numel();
 
   te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
-  te::Tensor* BT = te::Reduce(
+  te::Tensor BT = te::Reduce(
       "reduce_full",
       {{1, "N"}},
       te::Sum(),
@@ -296,7 +296,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
   int M = A.numel();
 
   te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
-  te::Tensor* BT = te::Reduce(
+  te::Tensor BT = te::Reduce(
       "reduce_full",
       {{1, "N"}},
       te::Sum(),
@@ -339,7 +339,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
   TORCH_CHECK(M % kChunkSize == 0);
 
   te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
-  te::Tensor* BT = te::Reduce(
+  te::Tensor BT = te::Reduce(
       "reduce_full",
       {},
       te::Sum(),
@@ -359,7 +359,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
 
   loop.reorderAxis(mo, mi);
   loops = loop.getLoopStmtsFor(BT);
-  auto bt_body = loop.getAllWritesToBuf(BT->buf())[1];
+  auto bt_body = loop.getAllWritesToBuf(BT.buf())[1];
   TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf));
   loop.reorderAxis(loops.at(0), loops.at(1));
 
@@ -390,7 +390,7 @@ BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) {
   const int kChunkSize = 8;
 
   te::Placeholder a("A", te::kFloat, {M});
-  te::Tensor* b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
+  te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
   te::LoopNest nest({b});
 
   auto loops = nest.getLoopStmtsFor(b);
@@ -453,7 +453,7 @@ BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) {
   te::KernelScope ks;
   constexpr int kCacheSize = 1 << 12;
   te::Placeholder a("A", te::kFloat, {M, N});
-  te::Tensor* b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
+  te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
   te::LoopNest nest({b});
 
   auto sch = state.range(2);
@@ -560,7 +560,7 @@ BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) {
   te::KernelScope ks;
   constexpr int kChunkSize = 8;
   te::Placeholder a("A", te::kFloat, {M, N});
-  te::Tensor* b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat);
+  te::Tensor b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat);
   te::LoopNest nest({b});
 
   auto sch = state.range(2);
diff --git a/test/cpp/tensorexpr/test_approx.cpp b/test/cpp/tensorexpr/test_approx.cpp
index d761645b25b3f..2a4ce9485acde 100644
--- a/test/cpp/tensorexpr/test_approx.cpp
+++ b/test/cpp/tensorexpr/test_approx.cpp
@@ -11,7 +11,7 @@
 using namespace torch::indexing;
 namespace te = torch::jit::tensorexpr;
 
-static void vectorize(te::LoopNest* ln, te::Tensor* target, int width) {
+static void vectorize(te::LoopNest* ln, te::Tensor target, int width) {
   auto loops = ln->getLoopStmtsFor(target);
   te::ForPtr inner, tail;
   ln->splitWithTail(loops[0], width, &inner, &tail);
@@ -33,7 +33,7 @@ TEST(Approx, log_vml) {
   te::KernelScope ks;
   te::VarHandle N("N", te::kInt);
   te::Placeholder A("A", te::kFloat, {N});
-  te::Tensor* B = te::Compute(
+  te::Tensor B = te::Compute(
       "B", {N}, [&](const te::VarHandle& i) { return log_vml(A.load(i)); });
 
   te::LoopNest ln({B});
diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp
index fcfa8cec4bc49..d038665fad75b 100644
--- a/test/cpp/tensorexpr/test_boundsinference.cpp
+++ b/test/cpp/tensorexpr/test_boundsinference.cpp
@@ -49,7 +49,7 @@ TEST(BoundsInference, _1) {
   KernelScope kernel_scope;
   ExprHandle n(100);
   Placeholder a(BufHandle("a", {n}, kFloat));
-  Tensor* b =
+  Tensor b =
       Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
   LoopNest l({b});
   auto bounds_info = inferBounds(l.root_stmt());
@@ -60,9 +60,9 @@ TEST(BoundsInference, _1) {
   ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
   verifyConstBounds(bounds_info.at(a.data())[0], {{0, 99}});
 
-  ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-  ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-  verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 99}});
+  ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+  ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+  verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 99}});
 }
 
 TEST(BoundsInference, _2) {
@@ -74,7 +74,7 @@ TEST(BoundsInference, _2) {
   KernelScope kernel_scope;
   VarHandle n("n", kInt);
   Placeholder a(BufHandle("a", {n}, kFloat));
-  Tensor* b =
+  Tensor b =
       Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
   LoopNest l({b});
   auto bounds_info = inferBounds(l.root_stmt());
@@ -85,9 +85,9 @@ TEST(BoundsInference, _2) {
   ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
   verifyConstBounds(bounds_info.at(a.data())[0], {{0, -1}});
 
-  ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-  ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-  verifyConstBounds(bounds_info.at(b->buf())[0], {{0, -1}});
+  ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+  ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+  verifyConstBounds(bounds_info.at(b.buf())[0], {{0, -1}});
 }
 
 TEST(BoundsInference, _3) {
@@ -99,7 +99,7 @@ TEST(BoundsInference, _3) {
   KernelScope kernel_scope;
   ExprHandle n(100);
   Placeholder a(BufHandle("a", {n + 10}, kFloat));
-  Tensor* b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) {
+  Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) {
     return a.load(i) * a.load(i + 10);
   });
   LoopNest l({b});
@@ -111,9 +111,9 @@ TEST(BoundsInference, _3) {
   ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
   verifyConstBounds(bounds_info.at(a.data())[0], {{0, 109}});
 
-  ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-  ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-  verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 99}});
+  ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+  ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+  verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 99}});
 }
 
 TEST(BoundsInference, _4) {
@@ -129,13 +129,13 @@ TEST(BoundsInference, _4) {
   ExprHandle W(320);
   ExprHandle H(200);
   Placeholder a(BufHandle("a", {H, W}, kFloat));
-  Tensor* b = Compute(
+  Tensor b = Compute(
       "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
         return x * y;
       });
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
-        return a.load(y, x) * b->load(y, x);
+        return a.load(y, x) * b.load(y, x);
       });
   LoopNest l({c});
   std::vector<ForPtr> loops = l.getLoopStmtsFor(c);
@@ -149,13 +149,13 @@ TEST(BoundsInference, _4) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{0, 199}, {0, 319}});
 
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 199}, {0, 319}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 199}, {0, 319}});
 
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 199}, {0, 319}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 199}, {0, 319}});
   }
   {
     // Infer bounds on the inner loop scope
@@ -166,13 +166,13 @@ TEST(BoundsInference, _4) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {0, 319}});
 
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {0, 319}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {0, 319}});
 
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {0, 319}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {0, 319}});
   }
   {
     // Infer bounds on the inner loop body's scope
@@ -183,13 +183,13 @@ TEST(BoundsInference, _4) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {-1, -1}});
 
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {-1, -1}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {-1, -1}});
 
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {-1, -1}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {-1, -1}});
   }
 }
 
@@ -208,7 +208,7 @@ TEST(BoundsInference, _5) {
   KernelScope kernel_scope;
   ExprHandle n(100);
   Placeholder a(BufHandle("a", {n}, kFloat));
-  Tensor* b =
+  Tensor b =
       Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
   LoopNest l({b});
 
@@ -229,9 +229,9 @@ TEST(BoundsInference, _5) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{0, 95}});
 
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 95}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 95}});
   }
   {
     // Verify inferred bounds for the tail loop
@@ -242,9 +242,9 @@ TEST(BoundsInference, _5) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{96, 99}});
 
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{96, 99}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{96, 99}});
   }
 }
 
@@ -263,13 +263,13 @@ TEST(BoundsInference, _6) {
   ExprHandle CW(32);
   ExprHandle CH(20);
   Placeholder a(BufHandle("a", {H, W}, kFloat));
-  Tensor* b = Compute(
+  Tensor b = Compute(
       "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
         return x * y;
       });
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c", {{CH, "y"}, {CW, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
-        return a.load(y + 100, x + 100) * b->load(y * 2, x * 5);
+        return a.load(y + 100, x + 100) * b.load(y * 2, x * 5);
       });
   LoopNest l({c});
   std::vector<ForPtr> loops = l.getLoopStmtsFor(c);
@@ -283,13 +283,13 @@ TEST(BoundsInference, _6) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{100, 119}, {100, 131}});
 
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 38}, {0, 155}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 38}, {0, 155}});
 
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 19}, {0, 31}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 19}, {0, 31}});
   }
   {
     // Infer bounds on the inner loop scope
@@ -300,13 +300,13 @@ TEST(BoundsInference, _6) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {100, 131}});
 
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {0, 155}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {0, 155}});
 
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {0, 31}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {0, 31}});
   }
   {
     // Infer bounds on the inner loop body's scope
@@ -317,13 +317,13 @@ TEST(BoundsInference, _6) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {-1, -1}});
 
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {-1, -1}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {-1, -1}});
 
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {-1, -1}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {-1, -1}});
   }
 }
 
@@ -331,9 +331,9 @@ TEST(BoundsInference, Adjacent) {
   KernelScope kernel_scope;
   ExprHandle H(6);
   Placeholder a(BufHandle("a", {20}, kFloat));
-  Tensor* b =
+  Tensor b =
       Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); });
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H); });
   LoopNest l({b, c});
   std::vector<ForPtr> loops = NodeFinder<For>::find(l.root_stmt());
@@ -348,9 +348,9 @@ TEST(BoundsInference, Adjacent) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{0, 5}});
 
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 5}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 5}});
   }
   {
     // Infer bounds on the inner loop scope
@@ -362,9 +362,9 @@ TEST(BoundsInference, Adjacent) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{6, 11}});
 
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 5}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 5}});
   }
   {
     // Infer bounds on the high level program.
@@ -377,24 +377,24 @@ TEST(BoundsInference, Adjacent) {
     ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
     verifyConstBounds(bounds_info.at(a.data())[0], {{0, 11}});
 
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 5}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 5}});
 
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 5}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 5}});
   }
 }
 
 TEST(BoundsInference, MultipleTopLoopLoad) {
   KernelScope kernel_scope;
   Placeholder a(BufHandle("a", {100}, kFloat));
-  Tensor* b =
+  Tensor b =
       Compute("b", {{64, "x"}}, [&](const VarHandle& x) { return a.load(x); });
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c", {{32, "x"}}, [&](const VarHandle& x) { return a.load(x + 10); });
-  Tensor* d = Compute(
+  Tensor d = Compute(
       "d", {{96, "x"}}, [&](const VarHandle& x) { return a.load(x + 2); });
   LoopNest l({b, c, d});
 
@@ -418,7 +418,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) {
 
   // b, c, d only written.
   {
-    auto bounds = bounds_info[b->buf()];
+    auto bounds = bounds_info[b.buf()];
     ASSERT_EQ(bounds.size(), 1);
     auto bound = bounds[0];
     ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
@@ -426,7 +426,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) {
     verifyConstBounds(bound, {{0, 63}});
   }
   {
-    auto bounds = bounds_info[c->buf()];
+    auto bounds = bounds_info[c.buf()];
     ASSERT_EQ(bounds.size(), 1);
     auto bound = bounds[0];
     ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
@@ -434,7 +434,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) {
     verifyConstBounds(bound, {{0, 31}});
   }
   {
-    auto bounds = bounds_info[d->buf()];
+    auto bounds = bounds_info[d.buf()];
     ASSERT_EQ(bounds.size(), 1);
     auto bound = bounds[0];
     ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
@@ -506,24 +506,24 @@ TEST(BoundsInference, MultipleTopLoopStore) {
 TEST(BoundsInference, CacheReads) {
   KernelScope kernel_scope;
 
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
       });
-  Tensor* B = Compute(
+  Tensor B = Compute(
       "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 30, j + 3);
+        return A.load(i + 30, j + 3);
       });
-  Tensor* C = Compute(
+  Tensor C = Compute(
       "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
       });
 
   LoopNest l({B, C});
   auto bounds_info_before = inferBounds(l.root_stmt());
 
   StmtPtr j_loop = l.getLoopStmtsFor(B)[1];
-  LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
+  LoopNest::cacheAccesses(A.buf(), "A_local", j_loop);
 
   auto bounds_info_after = inferBounds(l.root_stmt());
 
@@ -571,7 +571,7 @@ TEST(BoundsInference, CacheReads) {
 
 TEST(BoundsInference, Flattened) {
   KernelScope kernel_scope;
-  Tensor* b = Compute(
+  Tensor b = Compute(
       "b",
       {{3, "z"}, {4, "y"}, {5, "x"}},
       [&](const VarHandle& z, const VarHandle& y, const VarHandle& x) {
@@ -585,7 +585,7 @@ TEST(BoundsInference, Flattened) {
 
   // There's only one buffer.
   ASSERT_EQ(bounds_info.size(), 1);
-  auto& TABI = bounds_info[b->buf()][0];
+  auto& TABI = bounds_info[b.buf()][0];
   ASSERT_EQ(TABI.kind, TensorAccessKind::kStore);
   // Flattened bounds should have a single dimension.
   ASSERT_EQ(TABI.start.size(), 1);
@@ -651,11 +651,11 @@ TEST(BoundsInference, GetPotentialHazards) {
 TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) {
   KernelScope kernel_scope;
 
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
       });
-  Tensor* B = Compute(
+  Tensor B = Compute(
       "B", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return (i + 1) * (j + 1);
       });
@@ -679,13 +679,13 @@ TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) {
 TEST(BoundsInference, GetPotentialHazardsLoopCall) {
   KernelScope kernel_scope;
 
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
       });
-  Tensor* B = Compute(
+  Tensor B = Compute(
       "B", {{64, "i"}, {64, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i, j) + 5;
+        return A.load(i, j) + 5;
       });
 
   LoopNest l({A, B});
@@ -706,7 +706,7 @@ TEST(BoundsInference, GetPotentialHazardsLoopCall) {
 TEST(BoundsInference, GetPotentialHazardsLoopSplit) {
   KernelScope kernel_scope;
 
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
       });
diff --git a/test/cpp/tensorexpr/test_conv.cpp b/test/cpp/tensorexpr/test_conv.cpp
index 63881d0d33cae..293fbe248f176 100644
--- a/test/cpp/tensorexpr/test_conv.cpp
+++ b/test/cpp/tensorexpr/test_conv.cpp
@@ -30,7 +30,7 @@ TEST(Conv, DepthwiseConv2D) {
   te::Placeholder input("input", te::kFloat, {N, C, H, W});
   te::Placeholder weight("weight", te::kFloat, {K, CperG, R, S});
   te::Placeholder bias("bias", te::kFloat, {K});
-  te::Tensor* output = te::conv2d_depthwise(
+  te::Tensor output = te::conv2d_depthwise(
       input.handle(), weight.handle(), bias.handle(), kStride, kPad, kGroups);
 
   te::LoopNest loop({output});
@@ -61,7 +61,7 @@ TEST(Conv, DepthwiseConv2DNoBias) {
 
   te::Placeholder input("input", te::kFloat, {N, C, H, W});
   te::Placeholder weight("weight", te::kFloat, {K, CperG, R, S});
-  te::Tensor* output = te::conv2d_depthwise(
+  te::Tensor output = te::conv2d_depthwise(
       input.handle(), weight.handle(), kStride, kPad, kGroups);
 
   te::LoopNest loop({output});
@@ -96,7 +96,7 @@ TEST(Conv, DepthwiseConv2DDynamicShapes) {
   te::Placeholder input("input", te::kFloat, {N_var, C_var, H_var, W_var});
   te::Placeholder weight(
       "weight", te::kFloat, {K_var, CperG_var, R_var, S_var});
-  te::Tensor* output = te::conv2d_depthwise(
+  te::Tensor output = te::conv2d_depthwise(
       input.handle(),
       weight.handle(),
       N_var,
@@ -195,7 +195,7 @@ TEST(Conv, Conv2D) {
   te::Placeholder inputB(te::BufHandle("input", {N, C, H, W}, te::kFloat));
   te::Placeholder filterB(te::BufHandle("filter", {K, C, R, S}, te::kFloat));
 
-  te::Tensor* conv = te::Reduce(
+  te::Tensor conv = te::Reduce(
       "conv",
       {{N, "n"}, {K, "k"}, {OH, "oh"}, {OW, "ow"}},
       te::Sum(),
diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp
index e36e17ad432f9..ed5c070ea8689 100644
--- a/test/cpp/tensorexpr/test_cuda.cpp
+++ b/test/cpp/tensorexpr/test_cuda.cpp
@@ -34,7 +34,7 @@ static void testCudaTestVectorAdd01_impl() {
   Dtype dtype = ToDtype<ctype>();
   Placeholder a_buf("a", dtype, {num_iter, block_count, block_size});
   Placeholder b_buf("b", dtype, {num_iter, block_count, block_size});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c",
       {
           {num_iter, "n"},
@@ -99,7 +99,7 @@ TEST(Cuda, Sigmoid_CUDA) {
   const int block_size = 128;
   Dtype dtype = ToDtype<float>();
   Placeholder a_buf("a", dtype, {num_iter, block_count, block_size});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c",
       {
           {num_iter, "n"},
@@ -165,7 +165,7 @@ static void testCudaTestVectorAdd02_impl(int N, int block_size) {
   KernelScope kernel_scope;
   Placeholder a_buf("a", kFloat, {N});
   Placeholder b_buf("b", kFloat, {N});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c",
       {
           {N, "N"},
@@ -225,7 +225,7 @@ TEST(Cuda, HalfCast_CUDA) {
   KernelScope ks;
   auto half = ToDtype<at::Half>();
   Placeholder a("a", half, {4});
-  Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) {
+  Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) {
     return Cast::make(kFloat, a.load(i));
   });
 
@@ -267,7 +267,7 @@ TEST(Cuda, DynamicShape2D_CUDA) {
     VarHandle n("n", kInt);
     Placeholder a(BufHandle("a", {m, n}, kFloat));
     Placeholder b(BufHandle("b", {m, n}, kFloat));
-    Tensor* c = Compute(
+    Tensor c = Compute(
         "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) {
           return a.load(i, j) + b.load(i, j);
         });
@@ -328,7 +328,7 @@ TEST(Cuda, TestRand01_CUDA) {
   const int num_iter = 3;
   const int block_count = 16;
   const int block_size = 128;
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c",
       {
           {num_iter, "n"},
@@ -387,7 +387,7 @@ TEST(Cuda, DynamicShapeSplit_CUDA) {
   constexpr int N = 4096;
   VarHandle n("n", kInt);
   Placeholder a(BufHandle("a", {n}, kFloat));
-  Tensor* b = Compute(
+  Tensor b = Compute(
       "b", {{n, "n"}}, [&](const VarHandle& i) { return a.load(i) * 2.0f; });
   LoopNest l({b});
   ForPtr inner;
@@ -928,16 +928,16 @@ TEST(Cuda, HalfSupport_CUDA) {
   KernelScope ks;
   auto half = ToDtype<at::Half>();
   Placeholder a("a", half, {4});
-  Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) {
+  Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) {
     return Cast::make(half, ExprHandle(2.0f) * a.load(i));
   });
 
-  Tensor* c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) {
-    return Cast::make(kFloat, Cast::make(half, ExprHandle(42)) + b->load(i));
+  Tensor c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) {
+    return Cast::make(kFloat, Cast::make(half, ExprHandle(42)) + b.load(i));
   });
 
-  Tensor* d = Compute("d", {{4, "n"}}, [&](const VarHandle& i) {
-    return Cast::make(half, c->load(i));
+  Tensor d = Compute("d", {{4, "n"}}, [&](const VarHandle& i) {
+    return Cast::make(half, c.load(i));
   });
 
   LoopNest l({b, c, d});
@@ -986,7 +986,7 @@ TEST(Cuda, HalfPropagation_CUDA) {
   KernelScope kernel_scope;
   auto half = ToDtype<at::Half>();
   Placeholder a("a", half, {4});
-  Tensor* relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) {
+  Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) {
     return Max::make(a.load(i), ExprHandle(alloc<HalfImm>(0)), true);
   });
 
@@ -1036,7 +1036,7 @@ TEST(Cuda, UnusedHalfArgument_CUDA) {
   Placeholder a("a", kFloat, {4});
   auto half = ToDtype<at::Half>();
   Placeholder b("b", half, {4});
-  Tensor* relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) {
+  Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) {
     return Max::make(a.load(i), ExprHandle(alloc<FloatImm>(0)), true);
   });
 
@@ -1168,10 +1168,10 @@ TEST(Cuda, MaskBlockDim_CUDA) {
   int B_SIZE = 50;
   Placeholder a_buf("a", kFloat, {A_SIZE});
   Placeholder b_buf("b", kFloat, {B_SIZE});
-  Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
     return a_buf.load(i) + 10;
   });
-  Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
     return a_buf.load(i) + b_buf.load(i);
   });
 
@@ -1261,10 +1261,10 @@ TEST(Cuda, MaskThreadDim_CUDA) {
   int B_SIZE = 100;
   Placeholder a_buf("a", kFloat, {A_SIZE});
   Placeholder b_buf("b", kFloat, {B_SIZE});
-  Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
     return a_buf.load(i) + 10;
   });
-  Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
     return a_buf.load(i / 2) + b_buf.load(i);
   });
 
@@ -1356,10 +1356,10 @@ TEST(Cuda, MaskMultiBlockDim_CUDA) {
   int B_SIZE = 50;
   Placeholder a_buf("a", kFloat, {A_SIZE});
   Placeholder b_buf("b", kFloat, {B_SIZE});
-  Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
     return a_buf.load(i) + 10;
   });
-  Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
     return a_buf.load(i) + b_buf.load(i);
   });
 
@@ -1450,10 +1450,10 @@ TEST(Cuda, MaskBlockAndThreadDim_CUDA) {
   int B_SIZE = 50;
   Placeholder a_buf("a", kFloat, {A_SIZE});
   Placeholder b_buf("b", kFloat, {B_SIZE});
-  Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
     return a_buf.load(i) + 10;
   });
-  Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
     return a_buf.load(i) + b_buf.load(i);
   });
 
@@ -1543,17 +1543,17 @@ TEST(Cuda, MaskMultiDim_CUDA) {
   int B_SIZE = 50;
   Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE});
   Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "C",
       {{OUTER_SIZE, "i"}, {A_SIZE, "j"}},
       [&](const VarHandle& i, const VarHandle& j) {
         return ExprHandle(2) * a_buf.load(i, j);
       });
-  Tensor* d = Compute(
+  Tensor d = Compute(
       "D",
       {{OUTER_SIZE, "i"}, {B_SIZE, "j"}},
       [&](const VarHandle& i, const VarHandle& j) {
-        return c->load(i, j * 2) + b_buf.load(i, j);
+        return c.load(i, j * 2) + b_buf.load(i, j);
       });
 
   LoopNest l({c, d});
@@ -1673,17 +1673,17 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) {
   VarHandle B_SIZE("B_SIZE", kInt);
   Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE});
   Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "C",
       {{OUTER_SIZE, "i"}, {A_SIZE, "j"}},
       [&](const VarHandle& i, const VarHandle& j) {
         return ExprHandle(2) * a_buf.load(i, j);
       });
-  Tensor* d = Compute(
+  Tensor d = Compute(
       "D",
       {{OUTER_SIZE, "i"}, {B_SIZE, "j"}},
       [&](const VarHandle& i, const VarHandle& j) {
-        return c->load(i, j * 2) + b_buf.load(i, j);
+        return c.load(i, j * 2) + b_buf.load(i, j);
       });
 
   LoopNest l({c, d});
@@ -2087,17 +2087,17 @@ TEST(Cuda, MaskMultiDimMultiAxis_CUDA) {
   int B_SIZE = 15;
   Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE});
   Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "C",
       {{OUTER_SIZE, "i"}, {A_SIZE, "j"}},
       [&](const VarHandle& i, const VarHandle& j) {
         return ExprHandle(2) * a_buf.load(i, j);
       });
-  Tensor* d = Compute(
+  Tensor d = Compute(
       "D",
       {{OUTER_SIZE, "i"}, {B_SIZE, "j"}},
       [&](const VarHandle& i, const VarHandle& j) {
-        return c->load(i, j * 2) + b_buf.load(i, j);
+        return c.load(i, j * 2) + b_buf.load(i, j);
       });
 
   LoopNest l({c, d});
@@ -2218,17 +2218,17 @@ TEST(Cuda, MaskMultiDimMultiLevel_CUDA) {
   int B_SIZE = 15;
   Placeholder a_buf("a", kFloat, {OUTER_A_SIZE, A_SIZE});
   Placeholder b_buf("b", kFloat, {OUTER_B_SIZE, B_SIZE});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "C",
       {{OUTER_A_SIZE, "i"}, {A_SIZE, "j"}},
       [&](const VarHandle& i, const VarHandle& j) {
         return ExprHandle(2) * a_buf.load(i, j);
       });
-  Tensor* d = Compute(
+  Tensor d = Compute(
       "D",
       {{OUTER_B_SIZE, "i"}, {B_SIZE, "j"}},
       [&](const VarHandle& i, const VarHandle& j) {
-        return c->load(i, j * 2) + b_buf.load(i, j);
+        return c.load(i, j * 2) + b_buf.load(i, j);
       });
 
   LoopNest l({c, d});
diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp
index 24ddfbf095ab3..a170e530fa98f 100644
--- a/test/cpp/tensorexpr/test_external_calls.cpp
+++ b/test/cpp/tensorexpr/test_external_calls.cpp
@@ -31,7 +31,7 @@ TEST(ExternalCall, Conv2d_float) {
   int64_t dilation = 1;
   int64_t groups = 1;
 
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
       ResultBuf.node(),
       ExternalCall::make(
           ResultBuf,
@@ -95,7 +95,7 @@ TEST(ExternalCall, Conv2d_int) {
   int64_t dilation = 1;
   int64_t groups = 1;
 
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
       ResultBuf.node(),
       ExternalCall::make(
           ResultBuf,
@@ -153,7 +153,7 @@ TEST(ExternalCall, Conv2d_nobias_noargs) {
   Placeholder Weight("Weight", kFloat, {16, 16, 1, 1});
   BufHandle ResultBuf("Result", {1, 16, 112, 112}, kFloat);
 
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
       ResultBuf.node(),
       ExternalCall::make(
           ResultBuf,
@@ -203,7 +203,7 @@ TEST(ExternalCall, Addmm_float) {
   int64_t beta = 2;
   int64_t alpha = 2;
 
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
       ResultBuf.node(),
       ExternalCall::make(
           ResultBuf,
@@ -283,7 +283,7 @@ TEST(ExternalCall, Prepacked_Linear_float) {
       weight, bias, c10::optional<at::Scalar>(), c10::optional<at::Scalar>());
 
   Placeholder DummyPrepacked("DummyPrepacked", kFloat, {1});
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
       ResultBuf.node(),
       ExternalCall::make(
           ResultBuf,
@@ -370,7 +370,7 @@ TEST(ExternalCall, Prepacked_Conv2d_float) {
       c10::optional<at::Scalar>());
 
   Placeholder DummyPrepacked("DummyPrepacked", kFloat, {1});
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
       ResultBuf.node(),
       ExternalCall::make(
           ResultBuf,
@@ -431,7 +431,7 @@ TEST(ExternalCall, BinaryFloat) {
     Placeholder B("", kFloat, toExprHandleVec(bShape));
     BufHandle ResultBuf("Result", toExprHandleVec(resShape), kFloat);
 
-    Tensor* Result = new Tensor(
+    Tensor Result = Tensor(
         ResultBuf.node(),
         ExternalCall::make(
             ResultBuf,
@@ -516,7 +516,7 @@ TEST(ExternalCall, UnaryFloat) {
     Placeholder A("A", kFloat, toExprHandleVec(aShape));
     BufHandle ResultBuf("Result", toExprHandleVec(resShape), kFloat);
 
-    Tensor* Result = new Tensor(
+    Tensor Result = Tensor(
         ResultBuf.node(),
         ExternalCall::make(
             ResultBuf, externCallName, {BufHandle(A.data())}, externCallArgs));
@@ -566,14 +566,14 @@ TEST(ExternalCall, ComputeInterop) {
   BufHandle ConvResultBuf("ConvResult", {1, 16, 32, 32}, kFloat);
   BufHandle MatmulResultBuf("MatmulResult", {1, 16, 32, 32}, kFloat);
 
-  Tensor* Input = Compute(
+  Tensor Input = Compute(
       "Input",
       {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}},
       [&](const VarHandle& n,
           const VarHandle& c,
           const VarHandle& h,
           const VarHandle& w) { return FloatImm::make(5.0f); });
-  Tensor* Weight = Compute(
+  Tensor Weight = Compute(
       "Weight",
       {{16, "n"}, {16, "c"}, {1, "kh"}, {1, "kw"}},
       [&](const VarHandle& n,
@@ -581,28 +581,28 @@ TEST(ExternalCall, ComputeInterop) {
           const VarHandle& h,
           const VarHandle& w) { return FloatImm::make(6.0f); });
 
-  Tensor* ConvResult = new Tensor(
+  Tensor ConvResult = Tensor(
       ConvResultBuf.node(),
       ExternalCall::make(
           ConvResultBuf,
           "nnc_aten_conv2d",
-          {BufHandle(Input->buf()), BufHandle(Weight->buf())},
+          {BufHandle(Input.buf()), BufHandle(Weight.buf())},
           {}));
-  Tensor* MatmulResult = new Tensor(
+  Tensor MatmulResult = Tensor(
       MatmulResultBuf.node(),
       ExternalCall::make(
           MatmulResultBuf,
           "nnc_aten_matmul",
-          {BufHandle(ConvResult->buf()), BufHandle(ConvResult->buf())},
+          {BufHandle(ConvResult.buf()), BufHandle(ConvResult.buf())},
           {}));
-  Tensor* Result = Compute(
+  Tensor Result = Compute(
       "Result",
       {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}},
       [&](const VarHandle& n,
           const VarHandle& c,
           const VarHandle& h,
           const VarHandle& w) {
-        return ConvResult->load(n, c, h, w) + MatmulResult->load(n, c, h, w);
+        return ConvResult.load(n, c, h, w) + MatmulResult.load(n, c, h, w);
       });
 
   LoopNest l({Input, Weight, ConvResult, MatmulResult, Result});
@@ -658,31 +658,31 @@ TEST(ExternalCall, Inlining) {
 
   BufHandle MatmulResultBuf("MatmulResult", {8, 8}, kFloat);
 
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
         return FloatImm::make(5.0f);
       });
-  Tensor* B = Compute(
+  Tensor B = Compute(
       "B", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
         return FloatImm::make(4.0f);
       });
-  Tensor* MatmulResult = new Tensor(
+  Tensor MatmulResult = Tensor(
       MatmulResultBuf.node(),
       ExternalCall::make(
           MatmulResultBuf,
           "nnc_aten_matmul",
-          {BufHandle(A->buf()), BufHandle(B->buf())},
+          {BufHandle(A.buf()), BufHandle(B.buf())},
           {}));
-  Tensor* Result = Compute(
+  Tensor Result = Compute(
       "Result",
       {{8, "i"}, {8, "j"}},
       [&](const VarHandle& i, const VarHandle& j) {
-        return MatmulResult->load(i, j) + FloatImm::make(3.0f);
+        return MatmulResult.load(i, j) + FloatImm::make(3.0f);
       });
 
   StmtPtr root_stmt = alloc<Block>(std::vector<StmtPtr>(
-      {A->stmt(), B->stmt(), MatmulResult->stmt(), Result->stmt()}));
-  LoopNest l(root_stmt, {Result->buf()});
+      {A.stmt(), B.stmt(), MatmulResult.stmt(), Result.stmt()}));
+  LoopNest l(root_stmt, {Result.buf()});
 
   // Inlining should not inline anything here since all Bufs are either
   // defined or used in ExternalCalls
diff --git a/test/cpp/tensorexpr/test_ir_printer.cpp b/test/cpp/tensorexpr/test_ir_printer.cpp
index 76d9247579d7c..e11ba06740181 100644
--- a/test/cpp/tensorexpr/test_ir_printer.cpp
+++ b/test/cpp/tensorexpr/test_ir_printer.cpp
@@ -56,30 +56,30 @@ TEST(IRPrinter, FunctionName) {
   int M = 4;
   int N = 20;
 
-  Tensor* producer = Compute(
+  Tensor producer = Compute(
       "producer",
       {{M, "m"}, {N, "n"}},
       [&](const ExprHandle& m, const ExprHandle& n) { return m * n; });
 
-  Tensor* chunk_0 = Compute(
+  Tensor chunk_0 = Compute(
       "chunk",
       {{M, "m"}, {N / 2, "n"}},
       [&](const ExprHandle& m, const ExprHandle& n) {
-        return producer->load(m, n);
+        return producer.load(m, n);
       });
 
-  Tensor* chunk_1 = Compute(
+  Tensor chunk_1 = Compute(
       "chunk",
       {{M, "m"}, {N / 2, "n"}},
       [&](const ExprHandle& m, const ExprHandle& n) {
-        return producer->load(m, n + ExprHandle(N / 2));
+        return producer.load(m, n + ExprHandle(N / 2));
       });
 
-  Tensor* consumer = Compute(
+  Tensor consumer = Compute(
       "consumer",
       {{M, "i"}, {N / 2, "j"}},
       [&](const ExprHandle& i, const ExprHandle& j) {
-        return i * chunk_1->load(i, j);
+        return i * chunk_1.load(i, j);
       });
 
   LoopNest l({chunk_0, chunk_1, consumer});
diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp
index 8f36f54395f49..765522ecf6cd4 100644
--- a/test/cpp/tensorexpr/test_kernel.cpp
+++ b/test/cpp/tensorexpr/test_kernel.cpp
@@ -1329,7 +1329,7 @@ TEST_F(Kernel, CodegenInspection) {
 #endif
 }
 
-Tensor* lowerNanToNum(
+Tensor lowerNanToNum(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType,
diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp
index 75e6a064d1ac5..6081403c25650 100644
--- a/test/cpp/tensorexpr/test_llvm.cpp
+++ b/test/cpp/tensorexpr/test_llvm.cpp
@@ -593,10 +593,10 @@ TEST(LLVM, VectorizerLoadStoreTest) {
   KernelScope kernel_scope;
   Placeholder a(BufHandle("A", {1}, kInt));
 
-  Tensor* c =
+  Tensor c =
       Compute("c", {{4, "i"}}, [&](const VarHandle& i) { return a.load(i); });
 
-  Placeholder c_buf(BufHandle(c->buf()));
+  Placeholder c_buf(BufHandle(c.buf()));
   LoopNest l({c});
   StmtPtr s = l.root_stmt();
   ASSERT_TRUE(LoopNest::vectorize(to<For>(to<Block>(s)->front())));
@@ -616,11 +616,11 @@ TEST(LLVM, VectorizeBitCast) {
   KernelScope kernel_scope;
   Placeholder a(BufHandle("A", {128}, kInt));
 
-  Tensor* c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) {
     return bitcast<float>(a.load(i));
   });
 
-  Placeholder c_buf(BufHandle(c->buf()));
+  Placeholder c_buf(BufHandle(c.buf()));
   LoopNest l({c});
   StmtPtr s = l.root_stmt();
   ASSERT_TRUE(LoopNest::vectorize(to<For>(to<Block>(s)->front())));
@@ -1217,12 +1217,12 @@ TEST(LLVM, StoreFloat) {
 TEST(LLVM, SimpleMath01) {
   KernelScope kernel_scope;
   const int N = 1024;
-  Tensor* tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) {
+  Tensor tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) {
     return cast<float>(i * i + 1);
   });
   LoopNest l({tensor});
   StmtPtr stmt = l.root_stmt();
-  Placeholder f_buf(BufHandle(tensor->buf()));
+  Placeholder f_buf(BufHandle(tensor.buf()));
   LLVMCodeGen cg(stmt, {f_buf});
 
   PaddedBuffer<float> f_v(N, "f_v");
@@ -1241,11 +1241,11 @@ TEST(LLVM, ComputeMul) {
   const int N = 1024;
   Placeholder a(BufHandle("a", {N}, kFloat));
   Placeholder b(BufHandle("b", {N}, kFloat));
-  Tensor* c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) {
     return a.load(i) * b.load(i);
   });
 
-  Placeholder c_buf(BufHandle(c->buf()));
+  Placeholder c_buf(BufHandle(c.buf()));
   LoopNest l({c});
   StmtPtr s = l.root_stmt();
 
@@ -1265,12 +1265,12 @@ TEST(LLVM, BroadcastAdd) {
   const int N = 1024;
   Placeholder a(BufHandle("a", {M, N}, kFloat));
   Placeholder b(BufHandle("b", {N}, kFloat));
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
         return a.load(i, j) + b.load(j);
       });
 
-  Placeholder c_buf(BufHandle(c->buf()));
+  Placeholder c_buf(BufHandle(c.buf()));
   LoopNest l({c});
   l.prepareForCodegen();
   StmtPtr s = l.root_stmt();
@@ -1372,7 +1372,7 @@ TEST(LLVM, TensorDynamicShapeAdd) {
     VarHandle n("n", kInt);
     Placeholder a(BufHandle("a", {n}, kFloat));
     Placeholder b(BufHandle("b", {n}, kFloat));
-    Tensor* c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) {
+    Tensor c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) {
       return a.load(i) + b.load(i);
     });
     LoopNest l({c});
@@ -1396,7 +1396,7 @@ TEST(LLVM, DynamicShape2D) {
     VarHandle n("n", kInt);
     Placeholder a(BufHandle("a", {m, n}, kFloat));
     Placeholder b(BufHandle("b", {m, n}, kFloat));
-    Tensor* c = Compute(
+    Tensor c = Compute(
         "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) {
           return a.load(i, j) + b.load(i, j);
         });
@@ -1428,7 +1428,7 @@ TEST(LLVM, EliminatedStmt) {
   KernelScope kernel_scope;
   Placeholder a(BufHandle("a", {1}, kFloat));
 
-  Tensor* c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; });
+  Tensor c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; });
 
   LoopNest l({c});
   l.prepareForCodegen();
@@ -1452,7 +1452,7 @@ TEST(LLVM, SimpleReduction) {
   // TODO: why doesn't implicit vector<DimArg> work?
   std::vector<DimArg> axis = {DimArg(1)};
   std::vector<DimArg> reduce_axis = {DimArg(M), DimArg(N)};
-  Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis);
+  Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis);
   LoopNest loop({b});
 
   loop.prepareForCodegen();
@@ -1491,7 +1491,7 @@ TEST(LLVM, RFactorReduction) {
   // TODO: why doesn't implicit vector<DimArg> work?
   std::vector<DimArg> axis = {DimArg(1)};
   std::vector<DimArg> reduce_axis = {DimArg(M), DimArg(N)};
-  Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis);
+  Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis);
   LoopNest loop({b});
 
   std::vector<ForPtr> loops = loop.getLoopStmtsFor(b);
@@ -1502,7 +1502,7 @@ TEST(LLVM, RFactorReduction) {
   loops = loop.getLoopStmtsFor(b);
   loop_m = loops.at(2);
   loop_n = loops.at(1);
-  auto b_body = loop.getAllWritesToBuf(b->buf())[1];
+  auto b_body = loop.getAllWritesToBuf(b.buf())[1];
   ASSERT_TRUE(loop.rfactor(b_body, loop_n));
 
   loop.prepareForCodegen();
@@ -1538,13 +1538,13 @@ TEST(LLVM, RFactorVectorizedReduction) {
 
   Placeholder a("a", kFloat, {1, M, N});
 
-  Tensor* b = Reduce("sum", {{1, "K"}}, Sum(), a, {{M, "M"}, {N, "N"}});
+  Tensor b = Reduce("sum", {{1, "K"}}, Sum(), a, {{M, "M"}, {N, "N"}});
   LoopNest loopnest({b});
   std::vector<ForPtr> loops = loopnest.getLoopStmtsFor(b);
   // Reorder n and m loops
   loopnest.reorderAxis(loops.at(1), loops.at(2));
-  auto b_body = loopnest.getAllWritesToBuf(b->buf()).at(1);
-  auto all_loops = loopnest.getAllLoopNestsWritingToBuf(b->buf());
+  auto b_body = loopnest.getAllWritesToBuf(b.buf()).at(1);
+  auto all_loops = loopnest.getAllLoopNestsWritingToBuf(b.buf());
   ASSERT_TRUE(all_loops.size() == 2 && all_loops[1].size() == 3);
   ASSERT_TRUE(loopnest.rfactor(b_body, all_loops[1][1]));
   auto distributed_loops = loopnest.distributeLoop(all_loops[1][1]);
@@ -1585,7 +1585,7 @@ TEST(LLVM, SimpleParallel) {
     KernelScope kernel_scope;
     const int M = 4;
     const int N = 6;
-    Tensor* f = Compute(
+    Tensor f = Compute(
         "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) {
           return cast<float>(m + n);
         });
@@ -1626,23 +1626,23 @@ TEST(LLVM, CompositeParallel) {
     KernelScope kernel_scope;
     int M = 5;
     int N = 7;
-    Tensor* t1 =
+    Tensor t1 =
         Compute("t1", {{M, "M"}}, [](const VarHandle& m) { return m + 1.f; });
-    Tensor* t2 =
+    Tensor t2 =
         Compute("t2", {{N, "N"}}, [](const VarHandle& n) { return n + 2.f; });
-    Tensor* t3 = Compute(
+    Tensor t3 = Compute(
         "t3",
         {{M, "M"}, {N, "N"}},
         [=](const VarHandle& m, const VarHandle& n) {
-          return t1->load(m) * t2->load(n);
+          return t1.load(m) * t2.load(n);
         });
-    Tensor* t4 = Compute(
+    Tensor t4 = Compute(
         "t4",
         {{M, "M"}, {N, "N"}},
         [=](const VarHandle& m, const VarHandle& n) {
-          return t3->load(m, n) + m + n;
+          return t3.load(m, n) + m + n;
         });
-    LoopNest loop_nest(std::vector<Tensor*>({t4}), {t1, t2, t3, t4});
+    LoopNest loop_nest({t4}, {t1, t2, t3, t4});
     std::vector<ForPtr> loop_list;
     {
       auto const& loops = loop_nest.getLoopStmtsFor(t1);
@@ -1695,7 +1695,7 @@ TEST(LLVM, VectorizedGEMM) {
 
   Placeholder AP(BufHandle("A", {M, K}, kFloat));
   Placeholder BP(BufHandle("B", {K, N}, kFloat));
-  Tensor* CT = Reduce(
+  Tensor CT = Reduce(
       "gemm",
       {{M, "M"}, {N, "N"}},
       Sum(),
@@ -1776,7 +1776,7 @@ TEST(LLVM, CallRaw) {
   VarHandle N("N", kInt);
   Placeholder a(BufHandle("a", {M, N}, kFloat));
   Placeholder b(BufHandle("b", {N}, kFloat));
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
         return a.load(i, j) + b.load(j);
       });
@@ -1793,7 +1793,7 @@ TEST(LLVM, CallRaw) {
   std::vector<float> cv(M * N_value, 0);
   std::vector<void*> args({av.data(), bv.data(), cv.data(), &N_value});
 
-  LLVMCodeGen cg(s, {a, b, BufHandle(c->buf()), N});
+  LLVMCodeGen cg(s, {a, b, BufHandle(c.buf()), N});
   cg.call_raw(args);
 
   for (int i = 0; i < M; i++) {
@@ -1802,7 +1802,7 @@ TEST(LLVM, CallRaw) {
     }
   }
 
-  SimpleIREvaluator eval(s, {a, b, BufHandle(c->buf()), N});
+  SimpleIREvaluator eval(s, {a, b, BufHandle(c.buf()), N});
   eval.call_raw(args);
 
   for (int i = 0; i < M; i++) {
@@ -1818,7 +1818,7 @@ TEST(LLVM, CustomTarget) {
   Placeholder a("a", kFloat, {M});
   Placeholder b("b", kFloat, {M});
   Placeholder c("c", kFloat, {M});
-  Tensor* d = Compute("d", {{M, "m"}}, [&](const VarHandle& m) {
+  Tensor d = Compute("d", {{M, "m"}}, [&](const VarHandle& m) {
     return a.load(m) * b.load(m) + c.load(m);
   });
   LoopNest nest({d});
diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index c80dd5f492d95..7c3eefaab3b1c 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -42,13 +42,12 @@ void checkExprIR(const ExprHandle& e, const std::string& pattern) {
 
 TEST(LoopNest, ExprSimple01) {
   KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f", {{16, "X"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) {
         return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
       });
   LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
 
   LoopNest::splitWithTail(loops[0], 2);
   LoopNest::splitWithTail(loops[0], 2);
@@ -56,7 +55,7 @@ TEST(LoopNest, ExprSimple01) {
 
 TEST(LoopNest, ExprLower01) {
   KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f", {{16, "x"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) {
         return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
       });
@@ -73,10 +72,9 @@ TEST(LoopNest, ExprSimple02) {
   auto func = [](const ExprHandle& x, const ExprHandle& y) {
     return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
   };
-  Tensor* tensor = Compute("f", {{26, "x"}, {5, "y"}}, func);
+  Tensor tensor = Compute("f", {{26, "x"}, {5, "y"}}, func);
   LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
 
   LoopNest::splitWithTail(loops[0], 4);
 
@@ -168,14 +166,13 @@ TEST(LoopNest, ExprSliceHeadWithLoopOptions) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
   LoopNest l({tensor});
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr head;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   loops[0]->set_gpu_block_index(LoopOptions::IDX_Y);
   LoopNest::sliceHead(loops[0], 2, &head, &tail);
 
@@ -193,14 +190,13 @@ TEST(LoopNest, ExprSliceTailWithLoopOptions) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
   LoopNest l({tensor});
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr head;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::sliceTail(loops[0], 4, &head, &tail);
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -227,14 +223,13 @@ TEST(LoopNest, ExprSliceHeadWhenFactorEqualsSize) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
   LoopNest l({tensor});
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr head;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::sliceHead(loops[0], 10, &head, &tail);
 
   ASSERT_EQ(head, loops[0]);
@@ -249,14 +244,13 @@ TEST(LoopNest, ExprSliceHeadWhenFactorLargerThanSize) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
   LoopNest l({tensor});
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr head;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::sliceHead(loops[0], 100, &head, &tail);
 
   ASSERT_EQ(head, loops[0]);
@@ -271,14 +265,13 @@ TEST(LoopNest, ExprSliceHead) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
   LoopNest l({tensor});
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr head;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::sliceHead(loops[0], 4, &head, &tail);
 
   ASSERT_NE(head, nullptr);
@@ -295,10 +288,9 @@ TEST(LoopNest, ExprSliceHeadWithNonZeroStart) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
   LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr head;
@@ -323,14 +315,13 @@ TEST(LoopNest, ExprSliceTailWhenFactorEqualsSize) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
   LoopNest l({tensor});
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr head;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::sliceTail(loops[0], 10, &head, &tail);
 
   ASSERT_EQ(head, nullptr);
@@ -347,14 +338,13 @@ TEST(LoopNest, ExprSliceTailWhenFactorLargerThanSize) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
   LoopNest l({tensor});
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr head;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::sliceTail(loops[0], 100, &head, &tail);
 
   ASSERT_EQ(head, nullptr);
@@ -369,14 +359,13 @@ TEST(LoopNest, ExprSliceTail) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
   LoopNest l({tensor});
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr head;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::sliceTail(loops[0], 4, &head, &tail);
 
   ASSERT_NE(head, nullptr);
@@ -396,15 +385,14 @@ TEST(LoopNest, ExprSplitAndSlice) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{100, "x"}}, func);
+  Tensor tensor = Compute("f", {{100, "x"}}, func);
   LoopNest l({tensor});
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr inner;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   // outer: [0, 4)
   // inner: [0, 21)
   // tail:  [84, 100)
@@ -450,10 +438,9 @@ TEST(LoopNest, ExprSliceAndNormalize) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
   LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr head;
@@ -482,11 +469,11 @@ TEST(LoopNest, ExprSliceWithVariableDimension) {
          const std::vector<std::pair<int, int>>& expected_for_ranges) {
         KernelScope kernel_scope;
         VarHandle dim("dim", kInt);
-        Tensor* tensor =
+        Tensor tensor =
             Compute("f", {{dim, "x"}}, [](const ExprHandle& x) { return x; });
         LoopNest l({tensor});
         std::vector<ForPtr> loops =
-            l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+            l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
 
         // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
         ForPtr head;
@@ -521,10 +508,9 @@ TEST(LoopNest, ExprSplitWithTail) {
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
-  Tensor* tensor = Compute("f", {{199, "x"}}, func);
+  Tensor tensor = Compute("f", {{199, "x"}}, func);
   LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   LoopNest::splitWithTail(loops[0], 17);
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
@@ -552,10 +538,9 @@ TEST(LoopNest, ExprSplitWithTailNone) {
   auto func = [](const ExprHandle& x, const ExprHandle& y) {
     return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
   };
-  Tensor* tensor = Compute("f", {{24, "x"}, {5, "y"}}, func);
+  Tensor tensor = Compute("f", {{24, "x"}, {5, "y"}}, func);
   LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::splitWithTail(loops[0], 4);
 
   StmtPtr stmt = l.root_stmt();
@@ -612,14 +597,13 @@ TEST(LoopNest, ExprSplitWithMask01) {
   const int N = 5;
   Placeholder a_buf("a", kFloat, {M, N});
   Placeholder b_buf("b", kFloat, {M, N});
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) {
         return a_buf.load(m, n) + b_buf.load(m, n) + 1.0f;
       });
 
   LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::splitWithMask(loops[1], 4);
 
   StmtPtr stmt = l.root_stmt();
@@ -648,13 +632,12 @@ TEST(LoopNest, ExprSplitWithMaskRepeatedNoMask) {
   const int M = 64;
   Placeholder a_buf("a", kFloat, {M});
   Placeholder b_buf("b", kFloat, {M});
-  Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
+  Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
     return a_buf.load(m) + b_buf.load(m) + 1.0f;
   });
 
   LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::splitWithMask(loops[0], 4);
   LoopNest::splitWithMask(loops[0], 4);
 
@@ -736,14 +719,13 @@ TEST(LoopNest, TileSimple) {
   const int M = 64, N = 64;
   Placeholder a_buf("a", kFloat, {M, N});
   Placeholder b_buf("b", kFloat, {M, N});
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) {
         return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f;
       });
 
   LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   l.tile(loops[0], loops[1], 4, 8);
 
@@ -783,14 +765,13 @@ TEST(LoopNest, TileWithTails) {
   const int M = 64, N = 64;
   Placeholder a_buf("a", kFloat, {M, N});
   Placeholder b_buf("b", kFloat, {M, N});
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) {
         return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f;
       });
 
   LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   l.tile(loops[0], loops[1], 5, 9);
 
@@ -831,7 +812,7 @@ TEST(LoopNest, TileInMiddle) {
   const int M = 8, N = 8, L = 8, K = 8;
   Placeholder a_buf("a", kFloat, {M, N, L, K});
   Placeholder b_buf("b", kFloat, {M, N, L, K});
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f",
       {{M, "m"}, {N, "n"}, {L, "l"}, {K, "k"}},
       [&](const ExprHandle& m,
@@ -843,7 +824,7 @@ TEST(LoopNest, TileInMiddle) {
 
   LoopNest nest({tensor});
   std::vector<ForPtr> loops =
-      nest.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+      nest.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   nest.tile(loops[1], loops[2], 3, 3);
 
@@ -891,7 +872,7 @@ TEST(LoopNest, SplitWithTailWithLoopOptions) {
   const int M = 21;
   Placeholder a_buf("a", kFloat, {M});
   Placeholder b_buf("b", kFloat, {M});
-  Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
+  Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
     return a_buf.load(m) + b_buf.load(m) + 1.0f;
   });
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -922,7 +903,7 @@ TEST(LoopNest, SplitWithMaskWithLoopOptions) {
   const int M = 21;
   Placeholder a_buf("a", kFloat, {M});
   Placeholder b_buf("b", kFloat, {M});
-  Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
+  Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
     return a_buf.load(m) + b_buf.load(m) + 1.0f;
   });
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -949,7 +930,7 @@ TEST(LoopNest, ScheduleBroadcastAddBuffer) {
   const int K = 6;
   Placeholder a_buf("a", kFloat, {M, N});
   Placeholder b_buf("b", kFloat, {N, K});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "broadcast_add",
       {{M, "m"}, {N, "n"}, {K, "k"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
@@ -998,20 +979,20 @@ TEST(LoopNest, ScheduleFunctionCall01) {
   const int K = 6;
   Placeholder a_buf("a", kFloat, {M, N});
   Placeholder b_buf("b", kFloat, {N, K});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "broadcast_add",
       {{M, "m"}, {N, "n"}, {K, "k"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return a_buf.load(m, n) + b_buf.load(n, k);
       });
-  Tensor* d = Compute(
+  Tensor d = Compute(
       "d",
       {{M, "m"}, {N, "n"}, {K, "k"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c->load(m, n, k) + 1;
+        return c.load(m, n, k) + 1;
       });
 
-  LoopNest l(std::vector<Tensor*>({d}), {c, d});
+  LoopNest l({d}, {c, d});
   l.prepareForCodegen();
   StmtPtr stmt = l.root_stmt();
   std::ostringstream oss;
@@ -1058,22 +1039,22 @@ TEST(LoopNest, ScheduleInlineSimple) {
   Placeholder c_buf("c", kFloat, {M, N});
   Placeholder d_buf("d", kFloat, {M, K});
 
-  Tensor* x = Compute(
+  Tensor x = Compute(
       "x",
       {{M, "m1"}, {N, "n1"}, {K, "k1"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return a_buf.load(m, n) * b_buf.load(n, k);
       });
-  Tensor* y = Compute(
+  Tensor y = Compute(
       "y",
       {{M, "m2"}, {N, "n2"}, {K, "k2"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k);
+        return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k);
       });
 
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
+  LoopNest l1({y}, {x, y});
   LoopNest l2(l1);
-  l2.computeInline(x->buf());
+  l2.computeInline(x.buf());
 
   l1.prepareForCodegen();
   l2.prepareForCodegen();
@@ -1139,31 +1120,31 @@ void InlineFunc01Helper(const std::vector<std::string>& inline_order) {
   Placeholder c_buf("c", kFloat, {M, N});
   Placeholder d_buf("d", kFloat, {M, K});
 
-  Tensor* x = Compute(
+  Tensor x = Compute(
       "x",
       {{M, "m1"}, {N, "n1"}, {K, "k1"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return a_buf.load(m, n) * b_buf.load(n, k);
       });
-  Tensor* y = Compute(
+  Tensor y = Compute(
       "y",
       {{M, "m2"}, {N, "n2"}, {K, "k2"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k);
+        return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k);
       });
-  Tensor* z = Compute(
+  Tensor z = Compute(
       "z",
       {{M, "m3"}, {N, "n3"}, {K, "k3"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m, n, k) + y->load(m, n, k);
+        return x.load(m, n, k) + y.load(m, n, k);
       });
 
-  LoopNest l(std::vector<Tensor*>({z}), {x, y, z});
+  LoopNest l({z}, {x, y, z});
   for (const std::string& order : inline_order) {
     if (order == "x") {
-      l.computeInline(x->buf());
+      l.computeInline(x.buf());
     } else if (order == "y") {
-      l.computeInline(y->buf());
+      l.computeInline(y.buf());
     } else {
       throw std::runtime_error("Invalid order: " + order);
     }
@@ -1218,7 +1199,7 @@ void InlineFunc01Helper(const std::vector<std::string>& inline_order) {
   }
 
   if (inline_order.size() == 2) {
-    Tensor* z2 = Compute(
+    Tensor z2 = Compute(
         "z",
         {{M, "m3"}, {N, "n3"}, {K, "k3"}},
         [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
@@ -1254,21 +1235,21 @@ TEST(LoopNest, ScheduleInlineRandom) {
   const int N = 5;
   const int K = 6;
 
-  Tensor* x = Compute(
+  Tensor x = Compute(
       "x",
       {{M, "m1"}, {N, "n1"}, {K, "k1"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return Mod::make(Intrinsics::make(kRand, kInt), 5);
       });
-  Tensor* y = Compute(
+  Tensor y = Compute(
       "y",
       {{M, "m2"}, {N, "n2"}, {K, "k2"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m, n, k) + x->load(m, n, k);
+        return x.load(m, n, k) + x.load(m, n, k);
       });
 
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
-  l1.computeInline(x->buf());
+  LoopNest l1({y}, {x, y});
+  l1.computeInline(x.buf());
 
   // would normally compare results but Rand isn't implemented in the
   // SimpleIREvaluator, even if we could seed it.
@@ -1290,22 +1271,22 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) {
   const int N = 5;
   const int K = 6;
 
-  Tensor* x = Compute(
+  Tensor x = Compute(
       "x",
       {{M, "m1"}, {N, "n1"}, {K, "k1"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return m * n * k;
       });
-  Tensor* y = Compute(
+  Tensor y = Compute(
       "y",
       {{M, "m2"}, {N, "n2"}, {K, "k2"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m, n, k) + Intrinsics::make(kRand, kInt) +
+        return x.load(m, n, k) + Intrinsics::make(kRand, kInt) +
             Intrinsics::make(kRand, kInt);
       });
 
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
-  l1.computeInline(x->buf());
+  LoopNest l1({y}, {x, y});
+  l1.computeInline(x.buf());
 
   // would normally compare results but Rand isn't implemented in the
   // SimpleIREvaluator, even if we could seed it.
@@ -1327,18 +1308,18 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) {
   const int N = 5;
   const int K = 6;
 
-  Tensor* x = Compute("x", {{M, "m1"}}, [&](const VarHandle& m) {
+  Tensor x = Compute("x", {{M, "m1"}}, [&](const VarHandle& m) {
     return Mod::make(Intrinsics::make(kRand, kInt), 5);
   });
-  Tensor* y = Compute(
+  Tensor y = Compute(
       "y",
       {{M, "m2"}, {N, "n2"}, {K, "k2"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m) + x->load(m);
+        return x.load(m) + x.load(m);
       });
 
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
-  l1.computeInline(x->buf());
+  LoopNest l1({y}, {x, y});
+  l1.computeInline(x.buf());
 
   // would normally compare results but Rand isn't implemented in the
   // SimpleIREvaluator, even if we could seed it.
@@ -1362,17 +1343,17 @@ TEST(LoopNest, ScheduleInlineIntrinsics) {
   Placeholder a_buf("a", kFloat, {M, N});
   Placeholder b_buf("b", kFloat, {N, K});
 
-  Tensor* x = Compute(
+  Tensor x = Compute(
       "x",
       {{M, "m1"}, {N, "n1"}, {K, "k1"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return a_buf.load(m, n) * b_buf.load(n, k);
       });
-  Tensor* y = Compute(
+  Tensor y = Compute(
       "y",
       {{M, "m2"}, {N, "n2"}, {K, "k2"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return Intrinsics::make(kSqrt, x->load(m, n, k));
+        return Intrinsics::make(kSqrt, x.load(m, n, k));
       });
 
   PaddedBuffer<float> a_v(M, N);
@@ -1389,9 +1370,9 @@ TEST(LoopNest, ScheduleInlineIntrinsics) {
     }
   }
 
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
+  LoopNest l1({y}, {x, y});
   LoopNest l2(l1);
-  l2.computeInline(x->buf());
+  l2.computeInline(x.buf());
 
   l1.prepareForCodegen();
   l2.prepareForCodegen();
@@ -1421,21 +1402,21 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) {
   const int N = 5;
   const int K = 6;
 
-  Tensor* x = Compute(
+  Tensor x = Compute(
       "x",
       {{M, "m1"}, {N, "n1"}, {K, "k1"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return Intrinsics::make(kRand, kFloat);
       });
-  Tensor* y = Compute(
+  Tensor y = Compute(
       "y",
       {{M, "m2"}, {N, "n2"}, {K, "k2"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return Intrinsics::make(kSqrt, x->load(m, n, k));
+        return Intrinsics::make(kSqrt, x.load(m, n, k));
       });
 
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
-  l1.computeInline(x->buf());
+  LoopNest l1({y}, {x, y});
+  l1.computeInline(x.buf());
 
   StmtPtr stmt1 = IRSimplifier::simplify(l1.root_stmt());
 
@@ -1451,31 +1432,31 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) {
 // Split a Compute then inline it into another compute.
 TEST(LoopNest, ScheduleSplitAThenInline) {
   KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
   });
 
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
+  LoopNest l({b}, {a, b});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
   LoopNest::splitWithMask(loops[0], 4);
-  ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
+  ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices");
 }
 
 // Split a Compute then inline another Compute into it.
 TEST(LoopNest, ScheduleSplitBThenInline) {
   KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
   });
 
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0);
+  LoopNest l({b}, {a, b});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0);
   LoopNest::splitWithMask(loops[0], 3);
-  l.computeInline(a->buf());
+  l.computeInline(a.buf());
   l.prepareForCodegen();
   StmtPtr s = IRSimplifier::simplify(l.root_stmt());
 
@@ -1491,32 +1472,32 @@ TEST(LoopNest, ScheduleSplitBThenInline) {
 // Split a Compute twice then inline it.
 TEST(LoopNest, ScheduleSplitTwiceThenInline) {
   KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
   });
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr i_inner;
 
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
+  LoopNest l({b}, {a, b});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
   LoopNest::splitWithMask(loops[0], 4, &i_inner);
   LoopNest::splitWithMask(i_inner, 2);
-  ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
+  ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices");
 }
 
 // Inline a Compute, then split.
 TEST(LoopNest, ScheduleInlineThenSplit) {
   KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
   });
 
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
-  l.computeInline(a->buf());
+  LoopNest l({b}, {a, b});
+  l.computeInline(a.buf());
 
   std::vector<ForPtr> loops = NodeFinder<For>::find(l.root_stmt());
   LoopNest::splitWithMask(loops.back(), 3);
@@ -1534,16 +1515,16 @@ TEST(LoopNest, ScheduleInlineThenSplit) {
 // Split a Compute, inline it, then split the result.
 TEST(LoopNest, ScheduleSplitInlineThenSplit) {
   KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
   });
 
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
+  LoopNest l({b}, {a, b});
   auto loops = NodeFinder<For>::find(l.root_stmt());
   LoopNest::splitWithMask(loops.back(), 2);
-  l.computeInline(a->buf());
+  l.computeInline(a.buf());
 
   loops = NodeFinder<For>::find(l.root_stmt());
   LoopNest::splitWithMask(loops.front(), 2);
@@ -1561,35 +1542,35 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) {
 // Oversplit a loop that is simplified out after inlining.
 TEST(LoopNest, ScheduleSplitInlineSimplify) {
   KernelScope kernel_scope;
-  Tensor* a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) {
+  Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) {
     return ExprHandle(4) * i - ExprHandle(2) * i;
   });
-  Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
-    return a->load(j) - ExprHandle(1);
+  Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
+    return a.load(j) - ExprHandle(1);
   });
 
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
+  LoopNest l({b}, {a, b});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
   LoopNest::splitWithMask(loops[0], 4);
-  ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
+  ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices");
 }
 
 // Inline a Compute with two consumers.
 TEST(LoopNest, ScheduleInlineThreeMixedOnce) {
   KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
   });
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) {
-        return a->load(k) * b->load(l);
+        return a.load(k) * b.load(l);
       });
 
-  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
-  l.computeInline(a->buf());
+  LoopNest l({c}, {a, b, c});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
+  l.computeInline(a.buf());
   l.prepareForCodegen();
 
   StmtPtr s = IRSimplifier::simplify(l.root_stmt());
@@ -1607,20 +1588,20 @@ TEST(LoopNest, ScheduleInlineThreeMixedOnce) {
 // Inline Compute A into B, then inline B into C.
 TEST(LoopNest, ScheduleInlineThreeMixedTwice) {
   KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
   });
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) {
-        return a->load(k) * b->load(l);
+        return a.load(k) * b.load(l);
       });
 
-  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
-  l.computeInline(a->buf());
-  l.computeInline(b->buf());
+  LoopNest l({c}, {a, b, c});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
+  l.computeInline(a.buf());
+  l.computeInline(b.buf());
   l.prepareForCodegen();
 
   StmtPtr s = IRSimplifier::simplify(l.root_stmt());
@@ -1638,19 +1619,19 @@ TEST(LoopNest, ScheduleInlineThreeMixedTwice) {
 // Inline a Compute that is both a producer and consumer.
 TEST(LoopNest, ScheduleInlineThreeMixedInner) {
   KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
   });
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) {
-        return a->load(k) * b->load(l);
+        return a.load(k) * b.load(l);
       });
 
-  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
-  l.computeInline(b->buf());
+  LoopNest l({c}, {a, b, c});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
+  l.computeInline(b.buf());
   l.prepareForCodegen();
 
   StmtPtr s = IRSimplifier::simplify(l.root_stmt());
@@ -1668,25 +1649,25 @@ TEST(LoopNest, ScheduleInlineThreeMixedInner) {
 // Split 3 Computes, then inline the first two into the last.
 TEST(LoopNest, ScheduleInlineThreeMixedSplit) {
   KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
   });
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) {
-        return a->load(k) * b->load(l);
+        return a.load(k) * b.load(l);
       });
 
-  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
+  LoopNest l({c}, {a, b, c});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
   LoopNest::splitWithMask(loops[0], 4);
-  loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0);
   LoopNest::splitWithMask(loops[0], 3);
-  loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
   LoopNest::splitWithMask(loops[0], 2);
 
-  ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
+  ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices");
 }
 
 // Check that inlining works for output tensors too
@@ -1696,21 +1677,21 @@ TEST(LoopNest, ScheduleInlineOutputTensors) {
   const int N = 5;
   const int K = 6;
 
-  Tensor* x = Compute(
+  Tensor x = Compute(
       "x",
       {{M, "m1"}, {N, "n1"}, {K, "k1"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return m * n * k;
       });
-  Tensor* y = Compute(
+  Tensor y = Compute(
       "y",
       {{M, "m2"}, {N, "n2"}, {K, "k2"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m, n, k) + m;
+        return x.load(m, n, k) + m;
       });
 
   LoopNest l1({x, y});
-  l1.computeInline(x->buf());
+  l1.computeInline(x.buf());
 
   // would normally compare results but Rand isn't implemented in the
   // SimpleIREvaluator, even if we could seed it.
@@ -1736,14 +1717,14 @@ TEST(LoopNest, ScheduleFuserStyle) {
 
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
 
-  Tensor* b = Compute(
+  Tensor b = Compute(
       "f", {{kTotalSize, "i"}}, [&](const std::vector<VarHandle>& axes) {
         return a_buf.load(axes[0]) + 11.0f;
       });
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "g", {{kTotalSize, "i"}}, [&](const std::vector<VarHandle>& axes) {
-        return b->load(axes[0]) + 1.0f;
+        return b.load(axes[0]) + 1.0f;
       });
 
   LoopNest l({b, c});
@@ -1772,17 +1753,17 @@ TEST(LoopNest, ScheduleFuserThreeArg) {
   Placeholder c(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder d(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat));
 
-  Tensor* e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
+  Tensor e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
     return a.load(i) + b.load(i);
   });
-  Tensor* f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
-    return e->load(i) + c.load(i);
+  Tensor f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
+    return e.load(i) + c.load(i);
   });
-  Tensor* g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
-    return f->load(i) + d.load(i);
+  Tensor g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
+    return f.load(i) + d.load(i);
   });
 
-  LoopNest l(std::vector<Tensor*>({g}), {e, f, g});
+  LoopNest l({g}, {e, f, g});
   l.computeInline(l.getLoopBodyFor(e));
   l.computeInline(l.getLoopBodyFor(f));
   l.prepareForCodegen();
@@ -1807,7 +1788,7 @@ TEST(LoopNest, ScheduleDynamicShape2D) {
     VarHandle n("n", kInt);
     Placeholder a(BufHandle("a", {m, n}, kFloat));
     Placeholder b(BufHandle("b", {m, n}, kFloat));
-    Tensor* c = Compute(
+    Tensor c = Compute(
         "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) {
           return a.load(i, j) + b.load(i, j);
         });
@@ -1842,12 +1823,12 @@ TEST(LoopNest, LoopNestComputeAt_1) {
   // and the temp should be used in B.
   KernelScope kernel_scope;
   VarHandle N("N", kInt);
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; });
-  Tensor* B = Compute(
-      "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A->load(i_b); });
-  LoopNest l(std::vector<Tensor*>({B}), {A, B});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0);
+  Tensor B = Compute(
+      "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A.load(i_b); });
+  LoopNest l({B}, {A, B});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0);
   LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]);
   l.prepareForCodegen();
   StmtPtr s = l.root_stmt();
@@ -1891,16 +1872,16 @@ TEST(LoopNest, LoopNestComputeAt_2) {
   const int kW = 16, kH = 16;
   VarHandle W("W", kInt);
   VarHandle H("H", kInt);
-  Tensor* p = Compute(
+  Tensor p = Compute(
       "prod",
       {{H + 1, "py"}, {W + 1, "px"}},
       [&](const VarHandle& py, const VarHandle& px) { return px * py; });
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "cons",
       {{H, "cy"}, {W, "cx"}},
       [&](const VarHandle& y, const VarHandle& x) {
-        return p->load(y, x) + p->load(y + 1, x) + p->load(y, x + 1) +
-            p->load(y + 1, x + 1);
+        return p.load(y, x) + p.load(y + 1, x) + p.load(y, x + 1) +
+            p.load(y + 1, x + 1);
       });
 
   std::vector<int> c_ref(kW * kH, 0);
@@ -1909,12 +1890,12 @@ TEST(LoopNest, LoopNestComputeAt_2) {
       c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1);
     }
   }
-  LoopNest orig_loopnest(std::vector<Tensor*>({c}), {p, c});
+  LoopNest orig_loopnest({c}, {p, c});
 
   {
     // First let's try to compute P at axis cy (the outer loop)
     LoopNest l(orig_loopnest);
-    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
     LoopNest::computeAt(l.getLoopBodyFor(p), loops[0]);
     l.prepareForCodegen();
     StmtPtr s = l.root_stmt();
@@ -1940,7 +1921,7 @@ TEST(LoopNest, LoopNestComputeAt_2) {
   {
     // Now let's try to compute P at axis cx (the inner loop)
     LoopNest l(orig_loopnest);
-    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
     LoopNest::computeAt(l.getLoopBodyFor(p), loops[1]);
     l.prepareForCodegen();
     StmtPtr s = l.root_stmt();
@@ -1979,27 +1960,25 @@ TEST(LoopNest, LoopNestComputeAt_3) {
   const int kW = 16, kH = 16;
   VarHandle W("W", kInt);
   VarHandle H("H", kInt);
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A",
       {{H + 1, "ay"}, {W + 1, "ax"}},
       [&](const VarHandle& ay, const VarHandle& ax) { return ax * ay; });
-  Tensor* B = Compute(
+  Tensor B = Compute(
       "B",
       {{H + 1, "by"}, {W + 1, "bx"}},
-      [&](const VarHandle& by, const VarHandle& bx) {
-        return A->load(by, bx);
-      });
-  Tensor* C = Compute(
+      [&](const VarHandle& by, const VarHandle& bx) { return A.load(by, bx); });
+  Tensor C = Compute(
       "C",
       {{H, "cy"}, {W, "cx"}},
       [&](const VarHandle& cy, const VarHandle& cx) {
-        return B->load(cy, cx + 1);
+        return B.load(cy, cx + 1);
       });
-  Tensor* D = Compute(
+  Tensor D = Compute(
       "D",
       {{H, "dy"}, {W, "dx"}},
       [&](const VarHandle& dy, const VarHandle& dx) {
-        return A->load(dy + 1, dx) + C->load(dy, dx);
+        return A.load(dy + 1, dx) + C.load(dy, dx);
       });
 
   std::vector<int> c_ref(kW * kH, 0);
@@ -2009,11 +1988,11 @@ TEST(LoopNest, LoopNestComputeAt_3) {
     }
   }
 
-  LoopNest orig_loopnest(std::vector<Tensor*>({D}), {A, B, C, D});
+  LoopNest orig_loopnest({D}, {A, B, C, D});
   {
     // First let's try to compute A at axis dy (the outer loop)
     LoopNest l(orig_loopnest);
-    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(D->buf()).at(0);
+    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0);
     LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]);
     l.prepareForCodegen();
     StmtPtr s = l.root_stmt();
@@ -2044,7 +2023,7 @@ TEST(LoopNest, LoopNestComputeAt_3) {
   {
     // Now let's try to compute A at axis dx (the inner loop)
     LoopNest l(orig_loopnest);
-    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(D->buf()).at(0);
+    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0);
     LoopNest::computeAt(l.getLoopBodyFor(A), loops[1]);
     l.prepareForCodegen();
     StmtPtr s = l.root_stmt();
@@ -2083,15 +2062,15 @@ TEST(LoopNest, Reduce2dComputeAt) {
   VarHandle W("W", kInt);
   VarHandle H("H", kInt);
 
-  Tensor* p =
+  Tensor p =
       Compute("prod", {{H + 1, "py"}, {W + 1, "px"}}, [&](Axis py, Axis px) {
         return px * py;
       });
-  Tensor* c = Reduce(
+  Tensor c = Reduce(
       "cons",
       {{H, "cy"}, {W, "cx"}},
       Sum(),
-      [&](Axis y, Axis x, Axis r, Axis s) { return p->load(y + r, x + s); },
+      [&](Axis y, Axis x, Axis r, Axis s) { return p.load(y + r, x + s); },
       {{2, "r"}, {2, "s"}});
 
   std::vector<int> c_ref(kW * kH, 0);
@@ -2100,7 +2079,7 @@ TEST(LoopNest, Reduce2dComputeAt) {
       c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1);
     }
   }
-  LoopNest orig_loopnest(std::vector<Tensor*>({c}), {p, c});
+  LoopNest orig_loopnest({c}, {p, c});
   checkIR(orig_loopnest.root_stmt(), R"IR(
 # CHECK: for (int py = 0; py < H + 1; py++) {
 # CHECK:   for (int px = 0; px < W + 1; px++) {
@@ -2122,7 +2101,7 @@ TEST(LoopNest, Reduce2dComputeAt) {
   {
     // First let's try to compute P at axis cy (the outer loop)
     LoopNest l(orig_loopnest);
-    auto loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+    auto loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
     LoopNest::computeAt(l.getLoopBodyFor(p), loops[0]);
     // FIXME: Calling simplify here breaks the IR:
     // MALFORMED INPUT: could not find base node in Load - temp[...]
@@ -2159,7 +2138,7 @@ TEST(LoopNest, Reduce2dComputeAt) {
   {
     // Now let's try to compute P at axis cx (the inner loop)
     LoopNest l(orig_loopnest);
-    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
     LoopNest::computeAt(l.getLoopBodyFor(p), loops[1]);
     l.simplify();
     l.eliminateDeadStores();
@@ -2205,17 +2184,17 @@ TEST(LoopNest, DISABLED_Conv1d_NH) {
   int Pad = 1;
   Placeholder IP("input", kFloat, {H});
 
-  Tensor* A =
+  Tensor A =
       Compute("A", {{N, "np"}, {H + 2 * Pad, "hp"}}, [&](Axis n, Axis h) {
         auto cond = CompareSelect::make(h, Pad, 1, 0, kLT);
         cond = CompareSelect::make(h, H + Pad, 1, cond, kGE);
         return ifThenElse(cond, 0.f, IP.load(n, h - Pad));
       });
-  Tensor* B = Reduce(
+  Tensor B = Reduce(
       "B",
       {{N, "n"}, {H, "h"}},
       Sum(),
-      [&](Axis n, Axis h, Axis r) { return A->load(n, h + r); },
+      [&](Axis n, Axis h, Axis r) { return A.load(n, h + r); },
       {{R, "r"}});
   LoopNest l({B});
   checkIR(l.root_stmt(), R"IR(
@@ -2233,7 +2212,7 @@ TEST(LoopNest, DISABLED_Conv1d_NH) {
 # CHECK:   }
 # CHECK: }
 )IR");
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0);
   LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]);
   // FIXME: The current IR is totally broken.  The body of the inlined loop is:
 
@@ -2292,7 +2271,7 @@ class LoopOrderHelper : public IRVisitor {
 
 TEST(LoopNest, LoopNestReorderAxis1) {
   KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) {
         return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
       });
@@ -2303,7 +2282,7 @@ TEST(LoopNest, LoopNestReorderAxis1) {
   SimpleIREvaluator cg(stmt1, {tensor});
   cg.call({stmt1_output});
 
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::reorderAxis(loops[0], loops[1]);
   StmtPtr stmt2 = Stmt::clone(l.root_stmt());
 
@@ -2324,7 +2303,7 @@ TEST(LoopNest, LoopNestReorderAxis1) {
   }
 
   // Reorder them back.
-  loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::reorderAxis(loops[0], loops[1]);
   StmtPtr stmt3 = l.root_stmt();
 
@@ -2341,7 +2320,7 @@ TEST(LoopNest, LoopNestReorderAxis1) {
 
 TEST(LoopNest, LoopNestReorderPartialAxes) {
   KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f",
       {{2, "x"}, {3, "y"}, {4, "z"}},
       [](const VarHandle& x, const VarHandle& y, const VarHandle& z) {
@@ -2358,7 +2337,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) {
   SimpleIREvaluator cg(stmt1, {tensor});
   cg.call({stmt1_output});
 
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::reorderAxis(loops[0], loops[1]);
   ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,x,z,");
 
@@ -2372,7 +2351,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) {
     ASSERT_EQ(stmt1_output[i], stmt2_output[i]);
   }
 
-  loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::reorderAxis(loops[1], loops[2]);
   ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,z,x,");
 
@@ -2389,7 +2368,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) {
 
 TEST(LoopNest, LoopNestReorderInternalAxis) {
   KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f",
       {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}},
       [](const VarHandle& w,
@@ -2409,7 +2388,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) {
   SimpleIREvaluator cg(stmt1, {tensor});
   cg.call({stmt1_output});
 
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::reorderAxis(loops[2], loops[1]);
   ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "w,y,x,z,");
 
@@ -2426,7 +2405,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) {
 
 TEST(LoopNest, LoopNestReorderEnclosingAxis) {
   KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f",
       {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}},
       [](const VarHandle& w,
@@ -2445,7 +2424,7 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) {
   SimpleIREvaluator cg(stmt1, {tensor});
   cg.call({stmt1_output});
 
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::reorderAxis(loops[0], loops[3]);
   ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "z,x,y,w,");
 
@@ -2462,14 +2441,14 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) {
 
 TEST(LoopNest, LoopNestReorderSameAxis) {
   KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) {
         return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
       });
   LoopNest l({tensor});
   StmtPtr stmt1 = Stmt::clone(l.root_stmt());
 
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::reorderAxis(loops[1], loops[1]);
   StmtPtr stmt2 = Stmt::clone(l.root_stmt());
 
@@ -2492,7 +2471,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) {
 
   KernelScope kernel_scope;
 
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f",
       {{2, "x"}, {3, "y"}, {4, "z"}},
       [](const VarHandle& x, const VarHandle& y, const VarHandle& z) {
@@ -2503,7 +2482,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) {
 
   Placeholder extra(BufHandle("res", {6, 3}, kFloat));
 
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
 
   VarHandle i = VarHandle(loops[0]->var());
 
@@ -2589,7 +2568,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) {
    *
    *
    */
-  loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
   LoopNest::reorderAxis(loops[0], loops[2]);
   StmtPtr stmt3 = Stmt::clone(l.root_stmt());
 
@@ -2628,7 +2607,7 @@ void LoopNestReorderTestHelper(
     int index2) {
   KernelScope kernel_scope;
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "5d",
       {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}},
       [](const std::vector<VarHandle>&) { return -1; });
@@ -2636,7 +2615,7 @@ void LoopNestReorderTestHelper(
 
   Placeholder extra(BufHandle("extra", {5}, kInt));
 
-  auto loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
   int j = 0;
   for (auto l : loops) {
     // Add an increment at each layer of the loop which counts the number of
@@ -2677,7 +2656,7 @@ void LoopNestReorderTestHelper(
     ASSERT_EQ(extra1[i], expected_loops);
   }
 
-  loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
   LoopNest::reorderAxis(loops[index1], loops[index2]);
   StmtPtr stmt2 = Stmt::clone(l.root_stmt());
 
@@ -2752,26 +2731,26 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) {
   Placeholder c_buf("c", kFloat, {M, N});
   Placeholder d_buf("d", kFloat, {M, K});
 
-  Tensor* x = Compute(
+  Tensor x = Compute(
       "x",
       {{M, "m1"}, {N, "n1"}, {K, "k1"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return a_buf.load(m, n) * b_buf.load(n, k);
       });
-  Tensor* y = Compute(
+  Tensor y = Compute(
       "y",
       {{M, "m2"}, {N, "n2"}, {K, "k2"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k);
+        return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k);
       });
-  Tensor* z = Compute(
+  Tensor z = Compute(
       "z",
       {{M, "m3"}, {N, "n3"}, {K, "k3"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m, n, k) + y->load(m, n, k);
+        return x.load(m, n, k) + y.load(m, n, k);
       });
 
-  LoopNest l(std::vector<Tensor*>({z}), {x, y, z});
+  LoopNest l({z}, {x, y, z});
   ForPtr a = nullptr;
   ForPtr b = nullptr;
   auto fors = NodeFinder<For>::find(l.root_stmt());
@@ -2845,14 +2824,14 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) {
 
 TEST(LoopNest, OuterLoopVectorization) {
   KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
       "f", {{8, "X"}, {8, "y"}}, [](const VarHandle& x, const VarHandle& y) {
         return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
       });
   LoopNest l({tensor});
 
   ASSERT_TRUE(
-      LoopNest::vectorize(l.getAllLoopNestsWritingToBuf(tensor->buf())[0][0]));
+      LoopNest::vectorize(l.getAllLoopNestsWritingToBuf(tensor.buf())[0][0]));
 
   StmtPtr root_stmt = l.root_stmt();
   BlockPtr outer_block = to<Block>(root_stmt);
@@ -2899,10 +2878,10 @@ namespace {
 std::string constantUpperBoundLoopIR(int upper_bound_val) {
   KernelScope kernel_scope;
   ExprHandle upper_bound(upper_bound_val);
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; });
   LoopNest l({A});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A->buf())[0];
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A.buf())[0];
   StmtPtr unrolled = nullptr;
   LoopNest::unroll(loops[0], &unrolled);
   std::ostringstream oss;
@@ -2927,12 +2906,12 @@ TEST(LoopNest, UnrollOuter) {
   KernelScope kernel_scope;
   ExprHandle outer_bound(3);
   ExprHandle inner_bound(4);
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A",
       {{outer_bound, "x"}, {inner_bound, "y"}},
       [&](const VarHandle& x, const VarHandle& y) { return x + y; });
   LoopNest l({A});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A->buf())[0];
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A.buf())[0];
   StmtPtr unrolled = nullptr;
   LoopNest::unroll(loops[0], &unrolled);
   checkIR(unrolled, R"IR(
@@ -2951,12 +2930,12 @@ TEST(LoopNest, UnrollInner) {
   KernelScope kernel_scope;
   ExprHandle outer_bound(3);
   ExprHandle inner_bound(4);
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A",
       {{outer_bound, "x"}, {inner_bound, "y"}},
       [&](const VarHandle& x, const VarHandle& y) { return x + y; });
   LoopNest l({A});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A->buf())[0];
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A.buf())[0];
   StmtPtr unrolled = nullptr;
   LoopNest::unroll(
       static_to<For>(loops[0]->body()->stmts().front()), &unrolled);
@@ -3044,10 +3023,10 @@ TEST(LoopNest, UnrollEmpty) {
 TEST(LoopNest, NoUnroll) {
   KernelScope kernel_scope;
   VarHandle upper_bound("N", kInt);
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; });
   LoopNest l({A});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A->buf())[0];
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A.buf())[0];
   StmtPtr unrolled = nullptr;
   ASSERT_THROWS_WITH(
       LoopNest::unroll(loops[0], &unrolled), "non-constant loop");
@@ -3326,7 +3305,7 @@ TEST(LoopNest, NormalizeAndSplitWithTail) {
   // Create a dummy tensor to construct LoopNest.
   ExprHandle n(100);
   Placeholder a(BufHandle("a", {n}, kFloat));
-  Tensor* b =
+  Tensor b =
       Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
   LoopNest l({b});
 
@@ -3615,12 +3594,12 @@ TEST(LoopNest, FlattenReductionLoopNestFromTensor) {
   VarHandle m("m", kInt);
   VarHandle n("n", kInt);
   Placeholder b(BufHandle("b", {m, n}, kFloat));
-  Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
+  Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
   LoopNest loop({c});
   HashProvider hasher;
   auto hash_before = hasher.hash(loop.root_stmt());
 
-  auto loops = loop.getAllLoopNestsWritingToBuf(c->buf())[1];
+  auto loops = loop.getAllLoopNestsWritingToBuf(c.buf())[1];
   ForPtr flattened = nullptr;
   ASSERT_FALSE(LoopNest::flatten(loops, &flattened));
   ASSERT_EQ(flattened, nullptr);
@@ -3673,14 +3652,14 @@ TEST(LoopNest, DetectInlineRankMismatch) {
   const int kTotalSize = 8;
 
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
-  Tensor* a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
+  Tensor a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
     return a_buf.load(i);
   });
-  Tensor* reshape = Compute(
+  Tensor reshape = Compute(
       "reshape",
       {{kTotalSize / 2, "i"}, {2, "j"}},
-      [&](const VarHandle& i, const VarHandle& j) { return a->load(i, j); });
-  LoopNest l(std::vector<Tensor*>({reshape}), {a, reshape});
+      [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j); });
+  LoopNest l({reshape}, {a, reshape});
   ASSERT_THROWS_WITH(
       l.computeInline(l.getLoopBodyFor(a)),
       "Placeholder indexed access is inconsistent with its rank");
@@ -3689,22 +3668,22 @@ TEST(LoopNest, DetectInlineRankMismatch) {
 TEST(LoopNest, CacheReadsSimple) {
   KernelScope kernel_scope;
 
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
       });
-  Tensor* B = Compute(
+  Tensor B = Compute(
       "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 30, j + 3);
+        return A.load(i + 30, j + 3);
       });
-  Tensor* C = Compute(
+  Tensor C = Compute(
       "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
       });
 
-  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
-  StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1];
-  LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
+  LoopNest l({B, C}, {A, B, C});
+  StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][1];
+  LoopNest::cacheAccesses(A.buf(), "A_local", j_loop);
 
   l.prepareForCodegen();
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -3757,22 +3736,22 @@ TEST(LoopNest, CacheReadsSimple) {
 TEST(LoopNest, CacheReadsOuter) {
   KernelScope kernel_scope;
 
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
       });
-  Tensor* B = Compute(
+  Tensor B = Compute(
       "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 30, j + 40) + A->load(i + 31, j + 41);
+        return A.load(i + 30, j + 40) + A.load(i + 31, j + 41);
       });
-  Tensor* C = Compute(
+  Tensor C = Compute(
       "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
       });
 
-  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
-  StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][0];
-  LoopNest::cacheAccesses(A->buf(), "A_local", i_loop);
+  LoopNest l({B, C}, {A, B, C});
+  StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][0];
+  LoopNest::cacheAccesses(A.buf(), "A_local", i_loop);
 
   l.prepareForCodegen();
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -3805,22 +3784,22 @@ TEST(LoopNest, CacheReadsOuter) {
 TEST(LoopNest, CacheReadsInternal) {
   KernelScope kernel_scope;
 
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
       });
-  Tensor* B = Compute(
+  Tensor B = Compute(
       "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 30, j + 40) + A->load(i + 31, j + 41);
+        return A.load(i + 30, j + 40) + A.load(i + 31, j + 41);
       });
-  Tensor* C = Compute(
+  Tensor C = Compute(
       "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
       });
 
-  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
-  StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1];
-  LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
+  LoopNest l({B, C}, {A, B, C});
+  StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][1];
+  LoopNest::cacheAccesses(A.buf(), "A_local", j_loop);
   l.prepareForCodegen();
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
 
@@ -3852,23 +3831,23 @@ TEST(LoopNest, CacheReadsInternal) {
 TEST(LoopNest, CacheReadsInner) {
   KernelScope kernel_scope;
 
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
       });
   // note im changing the offset of the first arg of the first call to A.
-  Tensor* B = Compute(
+  Tensor B = Compute(
       "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 34, j + 40) + A->load(i + 30, j + 41);
+        return A.load(i + 34, j + 40) + A.load(i + 30, j + 41);
       });
-  Tensor* C = Compute(
+  Tensor C = Compute(
       "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
       });
 
-  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
+  LoopNest l({B, C}, {A, B, C});
   StmtPtr body = l.getLoopBodyFor(B);
-  LoopNest::cacheAccesses(A->buf(), "A_local", body);
+  LoopNest::cacheAccesses(A.buf(), "A_local", body);
   l.prepareForCodegen();
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
 
@@ -3900,22 +3879,22 @@ TEST(LoopNest, CacheReadsInner) {
 TEST(LoopNest, CacheWritesSimple) {
   KernelScope kernel_scope;
 
-  Tensor* A = Compute(
+  Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
       });
-  Tensor* B = Compute(
+  Tensor B = Compute(
       "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 30, j + 40) + A->load(i + 31, j + 41);
+        return A.load(i + 30, j + 40) + A.load(i + 31, j + 41);
       });
-  Tensor* C = Compute(
+  Tensor C = Compute(
       "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
       });
 
-  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
-  StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A->buf())[0][1];
-  LoopNest::cacheAccesses(A->buf(), "A_local", a_loop);
+  LoopNest l({B, C}, {A, B, C});
+  StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A.buf())[0][1];
+  LoopNest::cacheAccesses(A.buf(), "A_local", a_loop);
 
   l.prepareForCodegen();
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -4054,7 +4033,7 @@ TEST(LoopNest, CompoundTensorSimple) {
   auto outer_for2 = For::make(x, 0, 10, inner_for2);
   BlockPtr body = Block::make({outer_for1, outer_for2});
 
-  Tensor* A = new Tensor(a_buf.node(), body);
+  Tensor A = Tensor(a_buf.node(), body);
 
   LoopNest l({A});
   l.prepareForCodegen();
@@ -4080,22 +4059,22 @@ TEST(LoopNest, InlineConstantIndex) {
   KernelScope kernel_scope;
   const int N = 10;
   Placeholder x_buf("a", kFloat, {1, N, 1});
-  Tensor* y = Compute(
+  Tensor y = Compute(
       "f",
       {{1, "m"}, {N, "n"}, {1, "o"}},
       [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) {
         return x_buf.load(m, n, o);
       });
-  Tensor* z = Compute(
+  Tensor z = Compute(
       "f",
       {{1, "m"}, {N, "n"}, {1, "o"}},
       [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) {
-        return y->load(m, n, o);
+        return y.load(m, n, o);
       });
 
-  LoopNest l(std::vector<Tensor*>({z}), {y, z});
+  LoopNest l({z}, {y, z});
   l.simplify();
-  ASSERT_TRUE(l.computeInline(y->buf()));
+  ASSERT_TRUE(l.computeInline(y.buf()));
 }
 
 TEST(LoopNest, CompoundTensorUsed) {
@@ -4115,14 +4094,14 @@ TEST(LoopNest, CompoundTensorUsed) {
   auto outer_for2 = For::make(x, 0, 10, inner_for2);
   BlockPtr body = Block::make({outer_for1, outer_for2});
 
-  Tensor* A = new Tensor(a_buf.node(), body);
-  Tensor* B = Compute(
+  Tensor A = Tensor(a_buf.node(), body);
+  Tensor B = Compute(
       "B", {{10, "i"}, {3, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i, j + 1) + A->load(i, j + 2);
+        return A.load(i, j + 1) + A.load(i, j + 2);
       });
 
-  LoopNest l(std::vector<Tensor*>({B}), {A, B});
-  ASSERT_FALSE(l.computeInline(A->buf()));
+  LoopNest l({B}, {A, B});
+  ASSERT_FALSE(l.computeInline(A.buf()));
   l.prepareForCodegen();
 
   std::vector<int> a_data(50, 0);
@@ -4707,12 +4686,10 @@ TEST(LoopNest, OptimizeConditionalsNotNormalized) {
   ASSERT_EQ(hash_before, hash_after);
 }
 
-static std::pair<std::unique_ptr<Placeholder>, Tensor*> colReduce(
-    int M,
-    int N) {
+static std::pair<std::unique_ptr<Placeholder>, Tensor> colReduce(int M, int N) {
   auto a =
       std::make_unique<Placeholder>("a", kFloat, std::vector<ExprHandle>{M, N});
-  Tensor* t = Reduce(
+  Tensor t = Reduce(
       "b",
       {{N, "n"}},
       Sum(),
@@ -4721,10 +4698,10 @@ static std::pair<std::unique_ptr<Placeholder>, Tensor*> colReduce(
   return {std::move(a), t};
 }
 
-static StmtPtr splitTailReorder(Tensor* b) {
+static StmtPtr splitTailReorder(Tensor b) {
   constexpr int kVectorWidth = 8;
   LoopNest nest({b});
-  auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0];
+  auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[0];
   nest.splitWithTail(loops[0], kVectorWidth);
   // Now the loopnests will look like:
   //
@@ -4745,24 +4722,24 @@ static StmtPtr splitTailReorder(Tensor* b) {
   // Write #2: "b[n_outer * 8 + n_inner] = ReduceOp(...)"
   // Loopnest #2: {n_outer, n_inner, m};
   // We will have to reorder n_inner and m.
-  auto loopnests = nest.getAllLoopNestsWritingToBuf(b->buf());
+  auto loopnests = nest.getAllLoopNestsWritingToBuf(b.buf());
   LoopNest::reorderAxis(loopnests[1][1], loopnests[1][2]);
   nest.prepareForCodegen();
   return nest.root_stmt();
 }
 
-static StmtPtr splitMaskReorder(Tensor* b) {
+static StmtPtr splitMaskReorder(Tensor b) {
   constexpr int kVectorWidth = 8;
   LoopNest nest({b});
-  auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[1];
+  auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[1];
   nest.splitWithMask(loops[0], kVectorWidth);
-  loops = nest.getAllLoopNestsWritingToBuf(b->buf())[1];
+  loops = nest.getAllLoopNestsWritingToBuf(b.buf())[1];
   LoopNest::reorderAxis(loops[1], loops[2]);
   nest.prepareForCodegen();
   return nest.root_stmt();
 }
 
-static void checkColReduce(StmtPtr s, Placeholder& p, Tensor* t) {
+static void checkColReduce(StmtPtr s, Placeholder& p, Tensor t) {
   int M = immediateAs<int>(p.dim(0));
   int N = immediateAs<int>(p.dim(1));
   PaddedBuffer<float> a(M, N);
@@ -4893,14 +4870,14 @@ TEST(LoopNest, VectorizeUse) {
   KernelScope kernel_scope;
   constexpr int N = 8;
   Placeholder a("a", kFloat, {N});
-  Tensor* b = Compute(
+  Tensor b = Compute(
       "b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + 1.0f; });
-  Tensor* c = Compute(
-      "c", {{N, "n"}}, [&](const VarHandle& n) { return b->load(n) + 2.0f; });
-  LoopNest nest(std::vector<Tensor*>({c}), {b, c});
-  auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0];
+  Tensor c = Compute(
+      "c", {{N, "n"}}, [&](const VarHandle& n) { return b.load(n) + 2.0f; });
+  LoopNest nest({c}, {b, c});
+  auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[0];
   ASSERT_TRUE(LoopNest::vectorize(loops[0]));
-  loops = nest.getAllLoopNestsWritingToBuf(c->buf())[0];
+  loops = nest.getAllLoopNestsWritingToBuf(c.buf())[0];
   ASSERT_TRUE(LoopNest::vectorize(loops[0]));
   nest.prepareForCodegen();
   // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
@@ -4939,7 +4916,7 @@ TEST(LoopNest, Int64Compute) {
 
   constexpr int64_t N = 12;
   Placeholder a("a", kLong, {N});
-  Tensor* b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) {
+  Tensor b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) {
     return a.load(n) + LongImm::make(1l);
   });
   LoopNest nest({b});
diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp
index 9503f9d57b726..db37b66876976 100644
--- a/test/cpp/tensorexpr/test_memdependency.cpp
+++ b/test/cpp/tensorexpr/test_memdependency.cpp
@@ -2726,28 +2726,28 @@ TEST(MemDependency, MemDependencyCheckerComputeAPI) {
   // Can determine if 2 loops created by Compute are dependent.
   Placeholder a_buf("a", kFloat, {4, 5});
   Placeholder b_buf("b", kFloat, {5, 6});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "broadcast_add",
       {{4, "m"}, {5, "n"}, {6, "k"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return a_buf.load(m, n) + b_buf.load(n, k);
       });
-  Tensor* d = Compute(
+  Tensor d = Compute(
       "d",
       {{4, "m"}, {5, "n"}, {6, "k"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c->load(m, n, k) + 1;
+        return c.load(m, n, k) + 1;
       });
 
-  LoopNest l(std::vector<Tensor*>({d}), {c, d});
+  LoopNest l({d}, {c, d});
 
-  MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()});
+  MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d.buf()});
 
   l.root_stmt()->accept(&analyzer);
 
   // Sanity test: Output depends on input.
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a_buf.data()));
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b_buf.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a_buf.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b_buf.data()));
 
   // Second loop depends on first loop.
   auto c_loop = l.getLoopStmtsFor(c)[0];
@@ -2773,32 +2773,32 @@ TEST(MemDependency, MemDependencyCheckerComputeInline) {
 
   Placeholder a_buf("a", kFloat, {4, 5});
   Placeholder b_buf("b", kFloat, {5, 6});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "broadcast_add",
       {{4, "m"}, {5, "n"}, {6, "k"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return a_buf.load(m, n) + b_buf.load(n, k);
       });
-  Tensor* d = Compute(
+  Tensor d = Compute(
       "d",
       {{4, "m"}, {5, "n"}, {6, "k"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c->load(m, n, k) + 1;
+        return c.load(m, n, k) + 1;
       });
 
-  LoopNest l(std::vector<Tensor*>({d}), {c, d});
-  l.computeInline(c->buf());
+  LoopNest l({d}, {c, d});
+  l.computeInline(c.buf());
 
-  MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()});
+  MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d.buf()});
   l.root_stmt()->accept(&analyzer);
 
   // Sanity test: Output depends on input.
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a_buf.data()));
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b_buf.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a_buf.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b_buf.data()));
 
   // broadcast_add tensor should not appear in trace at all.
   for (auto& wi : analyzer.getHistory()) {
-    ASSERT_NE(wi->var(), c->buf()->base_handle());
+    ASSERT_NE(wi->var(), c.buf()->base_handle());
   }
 }
 
@@ -2810,7 +2810,7 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) {
 
   Placeholder a_buf("a", kFloat, {4, 5});
   Placeholder b_buf("b", kFloat, {5, 6});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "broadcast_add",
       {{4, "m"}, {5, "n"}, {6, "k"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
@@ -2819,13 +2819,12 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) {
 
   LoopNest l({c});
 
-  MemDependencyChecker analyzer_before(
-      {a_buf.data(), b_buf.data()}, {c->buf()});
+  MemDependencyChecker analyzer_before({a_buf.data(), b_buf.data()}, {c.buf()});
   l.root_stmt()->accept(&analyzer_before);
 
   l.splitWithTail(l.getLoopStmtsFor(c)[0], 2);
 
-  MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c->buf()});
+  MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c.buf()});
   StmtPtr stmt = IRSimplifier::simplify(l.root_stmt());
   stmt->accept(&analyzer_after);
 
@@ -2859,7 +2858,7 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) {
 
   Placeholder a_buf("a", kFloat, {4, 5});
   Placeholder b_buf("b", kFloat, {5, 6});
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "broadcast_add",
       {{4, "m"}, {5, "n"}, {6, "k"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
@@ -2868,14 +2867,13 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) {
 
   LoopNest l({c});
 
-  MemDependencyChecker analyzer_before(
-      {a_buf.data(), b_buf.data()}, {c->buf()});
+  MemDependencyChecker analyzer_before({a_buf.data(), b_buf.data()}, {c.buf()});
   l.root_stmt()->accept(&analyzer_before);
 
   auto loops = l.getLoopStmtsFor(c);
   l.reorderAxis(loops[0], loops[1]);
 
-  MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c->buf()});
+  MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c.buf()});
   StmtPtr stmt = IRSimplifier::simplify(l.root_stmt());
   stmt->accept(&analyzer_after);
 
@@ -2928,22 +2926,22 @@ TEST(MemDependency, MemDependencyCheckerComputeReduce) {
   Placeholder a(BufHandle("a", {2, 3, 6}, kFloat));
   Placeholder b(BufHandle("b", {2, 3, 6}, kFloat));
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "scale",
       {{2, "l2"}, {3, "n1"}, {6, "m1"}},
       [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
-  Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}});
-  LoopNest l(std::vector<Tensor*>({d}), {c, d});
+  Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}});
+  LoopNest l({d}, {c, d});
 
-  MemDependencyChecker analyzer({a.data(), b.data()}, {d->buf()});
+  MemDependencyChecker analyzer({a.data(), b.data()}, {d.buf()});
 
   l.root_stmt()->accept(&analyzer);
 
   // Sanity test: Output depends on input.
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a.data()));
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b.data()));
 
   // Second loop depends on first loop.
   auto c_loop = l.getLoopStmtsFor(c)[0];
@@ -2965,7 +2963,7 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) {
 
   Placeholder AP(BufHandle("A", {M, K}, kFloat));
   Placeholder BP(BufHandle("B", {K, N}, kFloat));
-  Tensor* CT = Reduce(
+  Tensor CT = Reduce(
       "gemm",
       {{M, "M"}, {N, "N"}},
       Sum(),
@@ -3011,7 +3009,7 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) {
   }
   {
     auto const& loops = loop.getLoopStmtsFor(CT);
-    loop.cacheAccesses(CT->buf(), "C_regs", loops[2]);
+    loop.cacheAccesses(CT.buf(), "C_regs", loops[2]);
   }
 
   MemDependencyChecker analyzer_unlowered(
@@ -3026,12 +3024,12 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) {
     stmt->accept(&analyzer_unlowered);
 
     // Outputs depend on inputs.
-    ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT->buf(), AP.data()));
-    ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT->buf(), BP.data()));
+    ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT.buf(), AP.data()));
+    ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT.buf(), BP.data()));
 
     // The last write to gemm should cover the total bound of the output.
     std::shared_ptr<AccessInfo> outputAccess =
-        analyzer_unlowered.output(CT->buf());
+        analyzer_unlowered.output(CT.buf());
     // A single dependency.
     ASSERT_EQ(outputAccess->dependencies().size(), 1);
 
diff --git a/test/cpp/tensorexpr/test_ops.cpp b/test/cpp/tensorexpr/test_ops.cpp
index 674dbd9cb0199..122a498276f24 100644
--- a/test/cpp/tensorexpr/test_ops.cpp
+++ b/test/cpp/tensorexpr/test_ops.cpp
@@ -6,7 +6,7 @@
 
 using namespace torch::jit::tensorexpr;
 
-using Tensors = std::vector<Tensor*>;
+using Tensors = std::vector<Tensor>;
 using Args = std::vector<CodeGen::BufferArg>;
 std::unique_ptr<SimpleIREvaluator> compile(
     const Args& inputs,
@@ -28,7 +28,7 @@ TEST(Ops, Sum) {
     constexpr int N = 16;
 
     Placeholder a("a", kFloat, {M, N});
-    Tensor* b = computeSum({a.handle(), dims, false}, c10::kFloat);
+    Tensor b = computeSum({a.handle(), dims, false}, c10::kFloat);
     auto cg = compile({a}, {b});
 
     auto at = at::arange(M * N, at::kFloat).view({M, N});
diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp
index 449edac19823f..6620ef2686a94 100644
--- a/test/cpp/tensorexpr/test_reductions.cpp
+++ b/test/cpp/tensorexpr/test_reductions.cpp
@@ -35,7 +35,7 @@ TEST(Reductions, ReduceSum0D_1) {
 
   std::vector<float> out(M, -1.f);
 
-  Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {});
+  Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {});
   LoopNest loop({c});
   loop.prepareForCodegen();
   StmtPtr s = loop.root_stmt();
@@ -59,7 +59,7 @@ TEST(Reductions, ReduceSum0D_2) {
 
   std::vector<float> out(1, -1.f);
 
-  Tensor* c = Reduce("sum", {}, Sum(), b, {});
+  Tensor c = Reduce("sum", {}, Sum(), b, {});
   LoopNest loop({c});
   loop.prepareForCodegen();
   StmtPtr s = loop.root_stmt();
@@ -83,7 +83,7 @@ TEST(Reductions, ReduceSum1D) {
 
   std::vector<float> out(1, -1.f);
 
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{10, "m"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{10, "m"}});
   LoopNest loop({c});
   loop.prepareForCodegen();
   StmtPtr s = loop.root_stmt();
@@ -114,7 +114,7 @@ TEST(Reductions, ReduceSum2D) {
 
   std::vector<float> out(M, -1.f);
 
-  Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
+  Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
   LoopNest loop({c});
   loop.prepareForCodegen();
   StmtPtr s = loop.root_stmt();
@@ -145,7 +145,7 @@ TEST(Reductions, ReduceSum3D) {
 
   Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
 
-  Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}});
+  Tensor c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}});
   LoopNest loop({c});
   loop.prepareForCodegen();
   StmtPtr s = loop.root_stmt();
@@ -175,7 +175,7 @@ TEST(Reductions, ReduceSum3D) {
     ASSERT_EQ(cData[i], expected);
   }
 
-  Tensor* d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}});
+  Tensor d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}});
   LoopNest loop2({d});
   loop2.prepareForCodegen();
   StmtPtr s2 = loop2.root_stmt();
@@ -192,8 +192,8 @@ TEST(Reductions, ReduceSum3D) {
   }
 
   // This is the same as just reducing the original result across that axis.
-  Placeholder c_buf(BufHandle(c->buf()));
-  Tensor* e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}});
+  Placeholder c_buf(BufHandle(c.buf()));
+  Tensor e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}});
   LoopNest loop3({e});
   loop3.prepareForCodegen();
   StmtPtr s3 = loop3.root_stmt();
@@ -219,7 +219,7 @@ TEST(Reductions, ReduceSum10D) {
   std::vector<float> in(InputSize, 1.f);
   std::vector<float> out(OutputSize, -1.f);
 
-  Tensor* c = Reduce(
+  Tensor c = Reduce(
       "sum",
       {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}},
       Sum(),
@@ -261,7 +261,7 @@ TEST(Reductions, ReduceProduct) {
   Reducer product(
       ExprHandle(1.f), [](ExprHandle a, ExprHandle b) { return a * b; });
 
-  Tensor* c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}});
+  Tensor c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}});
   LoopNest loop({c});
   loop.prepareForCodegen();
   StmtPtr s = loop.root_stmt();
@@ -294,7 +294,7 @@ TEST(Reductions, ReduceMax) {
     in[j] = j;
   }
 
-  Tensor* dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}});
+  Tensor dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}});
 
   LoopNest loop({dm1});
   loop.prepareForCodegen();
@@ -309,7 +309,7 @@ TEST(Reductions, ReduceMax) {
   Placeholder in2_(BufHandle("b", {2, 5}, kFloat));
   std::vector<float> out2(2, -1.f);
 
-  Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}});
+  Tensor m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}});
 
   LoopNest loop2({m2d});
   loop2.prepareForCodegen();
@@ -336,7 +336,7 @@ TEST(Reductions, ReduceMinCustomInitializer) {
     in[j] = 10 + j;
   }
 
-  Tensor* min = Reduce(
+  Tensor min = Reduce(
       "min",
       {},
       Minimum(ExprHandle(minInit)),
@@ -372,7 +372,7 @@ TEST(Reductions, ReduceAnyAll) {
     return CompareSelect::make(a, 1, 1, b, kEQ);
   });
 
-  Tensor* any = Reduce(
+  Tensor any = Reduce(
       "anyEqual",
       {{4, "i"}},
       anyEqSV,
@@ -415,7 +415,7 @@ TEST(Reductions, ReduceAnyAll) {
     return CompareSelect::make(a, 0, 0, b, kEQ);
   });
 
-  Tensor* allGreaterThan = Reduce(
+  Tensor allGreaterThan = Reduce(
       "allGreaterThan",
       {{4, "i"}},
       allGTSV,
@@ -465,7 +465,7 @@ TEST(Reductions, ReduceMatmul2D) {
     }
   }
 
-  Tensor* mm = Reduce(
+  Tensor mm = Reduce(
       "mm",
       {{3, "m"}, {3, "n"}},
       Sum(),
@@ -501,10 +501,10 @@ TEST(Reductions, ReduceRfactorLike) {
   std::vector<float> in_rf_(10, -2.f);
   std::vector<float> out(1, -1.f);
 
-  Tensor* l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}});
-  Placeholder in_rf(BufHandle(l1->buf()));
+  Tensor l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}});
+  Placeholder in_rf(BufHandle(l1.buf()));
 
-  Tensor* l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}});
+  Tensor l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}});
 
   LoopNest loop({l1, l2});
   loop.prepareForCodegen();
@@ -526,14 +526,14 @@ TEST(Reductions, ReduceAsProducer) {
   Placeholder a(BufHandle("a", {2, 3}, kFloat));
   Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
 
-  Tensor* c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}});
-  Tensor* d = Compute(
+  Tensor c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}});
+  Tensor d = Compute(
       "scale",
       {{2, "l2"}, {3, "n1"}},
       [&](const VarHandle& l, const VarHandle& n) {
-        return c->load(l, n) * a.load(l, n);
+        return c.load(l, n) * a.load(l, n);
       });
-  LoopNest loop(std::vector<Tensor*>({d}), {c, d});
+  LoopNest loop({d}, {c, d});
   loop.prepareForCodegen();
   StmtPtr s = loop.root_stmt();
   s = IRSimplifier::simplify(s);
@@ -571,14 +571,14 @@ TEST(Reductions, ReduceAsConsumer) {
   Placeholder a(BufHandle("a", {2, 3, m}, kFloat));
   Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "scale",
       {{2, "l2"}, {3, "n1"}, {m, "m1"}},
       [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
-  Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}});
-  LoopNest loop(std::vector<Tensor*>({d}), {c, d});
+  Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}});
+  LoopNest loop({d}, {c, d});
   loop.prepareForCodegen();
   StmtPtr s = loop.root_stmt();
   s = IRSimplifier::simplify(s);
@@ -626,7 +626,7 @@ TEST(Reductions, SplitReduceAxis) {
   }
   std::vector<float> out(16, -1.f);
 
-  Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
+  Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
   LoopNest l({tensor});
   std::vector<ForPtr> loops = l.getLoopStmtsFor(tensor);
   LoopNest::splitWithTail(loops[1], 2);
@@ -656,7 +656,7 @@ TEST(Reductions, SplitNonReduceAxis) {
     }
   }
   std::vector<float> out(16, -1.f);
-  Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
+  Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
   LoopNest l({tensor});
   std::vector<ForPtr> loops = l.getLoopStmtsFor(tensor);
   LoopNest::splitWithTail(loops[0], 2);
@@ -687,14 +687,14 @@ TEST(Reductions, ReorderedReductionInitializer) {
   Placeholder in(BufHandle("in", {1, 12, 6}, kFloat));
   std::vector<float> in_(12 * 6, 1.f);
 
-  Tensor* tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
+  Tensor tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
   LoopNest l_({tensor_});
 
   l_.prepareForCodegen();
   StmtPtr s_ = Stmt::clone(l_.root_stmt());
   s_ = IRSimplifier::simplify(s_);
 
-  Tensor* tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
+  Tensor tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
   LoopNest l({tensor});
 
   auto loops = l.getLoopStmtsFor(tensor);
@@ -741,10 +741,10 @@ TEST(Reductions, ReduceRfactor) {
 
   std::vector<float> out(1, -1.f);
 
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}});
   LoopNest loop({c});
   std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
   ASSERT_TRUE(loop.rfactor(c_body, loops.at(0)));
   auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
   ASSERT_EQ(rc.size(), 2);
@@ -776,10 +776,10 @@ TEST(Reductions, Reduce3DRfactorInner) {
 
   std::vector<float> out(1, -1.f);
 
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
   LoopNest loop({c});
   std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
   ASSERT_FALSE(loop.rfactor(c_body, loops.at(2)));
   auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
   ASSERT_EQ(rc.size(), 1);
@@ -811,10 +811,10 @@ TEST(Reductions, Reduce3DRfactorOuter) {
 
   std::vector<float> out(1, -1.f);
 
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
   LoopNest loop({c});
   std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
   ASSERT_TRUE(loop.rfactor(c_body, loops.at(0)));
   auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
   ASSERT_EQ(rc.size(), 2);
@@ -837,7 +837,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) {
   std::vector<float> out(1, -1.f);
   std::vector<float> ref(1, -1.f);
 
-  Tensor* c = Reduce(
+  Tensor c = Reduce(
       "sum",
       {},
       Sum(),
@@ -854,7 +854,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) {
         IRSimplifier::simplify(refloop.root_stmt()), {in_, c});
     ref_cg.call({in, ref});
 
-    BufPtr tmp_buf = c->buf();
+    BufPtr tmp_buf = c.buf();
 
     for (int idx = 0; idx < rfac_number; idx++) {
       auto reduce = loop.getAllWritesToBuf(tmp_buf)[1];
@@ -890,7 +890,7 @@ TEST(Reductions, ReduceSplitTail) {
   for (int i = 0; i < 3; ++i) {
     std::vector<float> out(M, -1.f);
 
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
     LoopNest loop({c});
     std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
     LoopNest::splitWithTail(loops[i], 8);
@@ -922,7 +922,7 @@ TEST(Reductions, ReduceSplitNoTail) {
   for (int i = 0; i < 3; ++i) {
     std::vector<float> out(M, -1.f);
 
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
     LoopNest loop({c});
     std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
     LoopNest::splitWithTail(loops[i], 5);
@@ -956,7 +956,7 @@ TEST(Reductions, ReduceOverSplitTail) {
   for (int i = 0; i < 3; ++i) {
     std::vector<float> out(M, -1.f);
 
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
     LoopNest loop({c});
     std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
     LoopNest::splitWithTail(loops[i], 16);
@@ -989,7 +989,7 @@ TEST(Reductions, ReduceSplitMask) {
   for (int i = 0; i < 3; ++i) {
     std::vector<float> out(M, -1.f);
 
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
     LoopNest loop({c});
     std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
     LoopNest::splitWithMask(loops[i], 8);
@@ -1021,7 +1021,7 @@ TEST(Reductions, ReduceSplitNoMask) {
   for (int i = 0; i < 3; ++i) {
     std::vector<float> out(M, -1.f);
 
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
     LoopNest loop({c});
     std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
     LoopNest::splitWithMask(loops[i], 5);
@@ -1054,7 +1054,7 @@ TEST(Reductions, ReduceOverSplitMask) {
   for (int i = 0; i < 3; ++i) {
     std::vector<float> out(M, -1.f);
 
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
     LoopNest loop({c});
     std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
     LoopNest::splitWithMask(loops[i], 16);
@@ -1090,16 +1090,16 @@ TEST(Reductions, ReduceSplitRfactor) {
 
   std::vector<float> out(M, -1.f);
 
-  Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+  Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
   LoopNest loop({c});
   std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
   LoopNest::splitWithTail(loops[2], SPLIT_FACTOR);
 
-  auto c_body = loop.getAllWritesToBuf(c->buf())[2];
-  auto all_loops = loop.getAllLoopNestsWritingToBuf(c->buf());
+  auto c_body = loop.getAllWritesToBuf(c.buf())[2];
+  auto all_loops = loop.getAllLoopNestsWritingToBuf(c.buf());
   ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(2).size() == 3);
   LoopNest::reorderAxis(all_loops[2][1], all_loops[2][2]);
-  all_loops = loop.getAllLoopNestsWritingToBuf(c->buf());
+  all_loops = loop.getAllLoopNestsWritingToBuf(c.buf());
   ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(2).size() == 3);
   ASSERT_TRUE(loop.rfactor(c_body, all_loops[2][1]));
   loop.prepareForCodegen();
@@ -1131,7 +1131,7 @@ TEST(Reductions, ReduceOverSplitRfactor) {
 
   std::vector<float> out(1, -1.f);
 
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}});
   LoopNest loop({c});
   std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -1139,9 +1139,9 @@ TEST(Reductions, ReduceOverSplitRfactor) {
   LoopNest::splitWithTail(loops[1], SPLIT_FACTOR, &i, &t);
   LoopNest::reorderAxis(loops[0], i);
 
-  auto all_loops = loop.getAllLoopNestsWritingToBuf(c->buf());
+  auto all_loops = loop.getAllLoopNestsWritingToBuf(c.buf());
   ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(1).size() == 3);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
   ASSERT_TRUE(loop.rfactor(c_body, all_loops[1][0]));
   LoopNest::reorderAxis(all_loops[1][0], all_loops[1][2]);
 
@@ -1182,9 +1182,9 @@ TEST(Reductions, ReduceInlineReduction) {
   Placeholder a_buf("a", kFloat, {M});
   Placeholder b_buf("b", kFloat, {M, N, K});
 
-  Tensor* x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}});
-  Tensor* y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) {
-    return a_buf.load(m) + x->load(m);
+  Tensor x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}});
+  Tensor y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) {
+    return a_buf.load(m) + x.load(m);
   });
 
   PaddedBuffer<float> a_v(M);
@@ -1201,9 +1201,9 @@ TEST(Reductions, ReduceInlineReduction) {
     }
   }
 
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
+  LoopNest l1({y}, {x, y});
   // Cannot inline a reduction computation
-  ASSERT_FALSE(l1.computeInline(x->buf()));
+  ASSERT_FALSE(l1.computeInline(x.buf()));
 }
 
 TEST(Reductions, ReduceInlineConsumer) {
@@ -1215,13 +1215,13 @@ TEST(Reductions, ReduceInlineConsumer) {
   Placeholder a_buf("a", kFloat, {M, N, K});
   Placeholder b_buf("b", kFloat, {M, N, K});
 
-  Tensor* x = Compute(
+  Tensor x = Compute(
       "x",
       {{M, "m1"}, {N, "n1"}, {K, "k1"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
         return a_buf.load(m, n, k) + b_buf.load(m, n, k);
       });
-  Tensor* y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}});
+  Tensor y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}});
 
   PaddedBuffer<float> a_v(M, N, K);
   PaddedBuffer<float> b_v(M, N, K);
@@ -1235,9 +1235,9 @@ TEST(Reductions, ReduceInlineConsumer) {
     }
   }
 
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
+  LoopNest l1({y}, {x, y});
   LoopNest l2(l1);
-  l2.computeInline(x->buf());
+  l2.computeInline(x.buf());
 
   l1.prepareForCodegen();
   l2.prepareForCodegen();
@@ -1269,7 +1269,7 @@ TEST(Reductions, ReduceInlineReducerInternal) {
   Placeholder a_buf("a", kFloat, {M, N, K});
   Placeholder b_buf("b", kFloat, {M, N, K});
 
-  Tensor* x = Compute(
+  Tensor x = Compute(
       "x",
       {{M, "m1"}, {N, "n1"}, {K, "k1"}},
       [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
@@ -1279,7 +1279,7 @@ TEST(Reductions, ReduceInlineReducerInternal) {
   Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) {
     return Add::make(ExprHandle(1.f), Min::make(a, b, false));
   });
-  Tensor* y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}});
+  Tensor y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}});
 
   PaddedBuffer<float> a_v(M, N, K);
   PaddedBuffer<float> b_v(M, N, K);
@@ -1293,9 +1293,9 @@ TEST(Reductions, ReduceInlineReducerInternal) {
     }
   }
 
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
+  LoopNest l1({y}, {x, y});
   LoopNest l2(l1);
-  l2.computeInline(x->buf());
+  l2.computeInline(x.buf());
 
   l1.prepareForCodegen();
   l2.prepareForCodegen();
@@ -1328,25 +1328,25 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) {
   Placeholder a(BufHandle("a", {L, N, M}, kFloat));
   Placeholder b(BufHandle("b", {L, N, M}, kFloat));
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "scale",
       {{L, "l2"}, {N, "n1"}, {M, "m1"}},
       [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
-  Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
+  Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
 
-  Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
   });
 
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
   LoopNest l_before(l);
   l_before.prepareForCodegen();
   SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
 
   StmtPtr d_loop = l.getLoopStmtsFor(d)[0];
-  l.cacheAccesses(d->buf(), "d_local", d_loop);
+  l.cacheAccesses(d.buf(), "d_local", d_loop);
   l.prepareForCodegen();
 
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1405,25 +1405,25 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) {
   Placeholder a(BufHandle("a", {L, N, M}, kFloat));
   Placeholder b(BufHandle("b", {L, N, M}, kFloat));
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "scale",
       {{L, "l2"}, {N, "n1"}, {M, "m1"}},
       [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
-  Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
+  Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
 
-  Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
   });
 
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
   LoopNest l_before(l);
   l_before.prepareForCodegen();
   SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
 
   StmtPtr d_loop = l.getLoopStmtsFor(d)[1];
-  l.cacheAccesses(d->buf(), "d_local", d_loop);
+  l.cacheAccesses(d.buf(), "d_local", d_loop);
   l.prepareForCodegen();
 
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1480,25 +1480,25 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) {
   Placeholder a(BufHandle("a", {L, N, M}, kFloat));
   Placeholder b(BufHandle("b", {L, N, M}, kFloat));
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "scale",
       {{L, "l2"}, {N, "n1"}, {M, "m1"}},
       [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
-  Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
+  Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
 
-  Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
   });
 
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
   LoopNest l_before(l);
   l_before.prepareForCodegen();
   SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
 
   StmtPtr d_loop = l.getLoopStmtsFor(d)[2];
-  l.cacheAccesses(d->buf(), "d_local", d_loop);
+  l.cacheAccesses(d.buf(), "d_local", d_loop);
   l.prepareForCodegen();
 
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1551,22 +1551,22 @@ TEST(Reductions, ReductionCacheBodyAccess) {
   Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
   Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "scale",
       {{24, "l2"}, {32, "n1"}, {12, "m1"}},
       [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
-  Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
+  Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
 
-  Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
   });
 
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
 
   StmtPtr d_loop = l.getLoopStmtsFor(d)[1];
-  l.cacheAccesses(c->buf(), "scale_local", d_loop);
+  l.cacheAccesses(c.buf(), "scale_local", d_loop);
 
   l.prepareForCodegen();
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1592,24 +1592,24 @@ TEST(Reductions, ReductionCacheConsumerAccess) {
   Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
   Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "scale",
       {{24, "l2"}, {32, "n1"}, {12, "m1"}},
       [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
-  Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
+  Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
 
-  Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
   });
 
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
 
   LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4);
 
   StmtPtr e_loop = l.getLoopStmtsFor(e)[1];
-  l.cacheAccesses(d->buf(), "sum_local", e_loop);
+  l.cacheAccesses(d.buf(), "sum_local", e_loop);
   l.prepareForCodegen();
 
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1633,19 +1633,19 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) {
   Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
   Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "scale",
       {{24, "l2"}, {32, "n1"}, {12, "m1"}},
       [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
-  Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
+  Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
 
-  Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
   });
 
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr inner;
@@ -1656,7 +1656,7 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) {
   // Split reduction consumer.
   LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4, &inner);
 
-  l.cacheAccesses(d->buf(), "sum_local", inner);
+  l.cacheAccesses(d.buf(), "sum_local", inner);
   l.prepareForCodegen();
 
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1681,19 +1681,19 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) {
   Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
   Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
 
-  Tensor* c = Compute(
+  Tensor c = Compute(
       "scale",
       {{24, "l2"}, {32, "n1"}, {12, "m1"}},
       [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
         return b.load(l, n, m) * a.load(l, n, m);
       });
-  Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
+  Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
 
-  Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
   });
 
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ForPtr inner;
@@ -1705,7 +1705,7 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) {
   // Split reduction consumer.
   LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4, &inner);
 
-  l.cacheAccesses(d->buf(), "sum_local", inner);
+  l.cacheAccesses(d.buf(), "sum_local", inner);
   l.prepareForCodegen();
 
   StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1742,13 +1742,13 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) {
 
   std::vector<float> out(1, -1.f);
 
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
   LoopNest loop({c});
 
   std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
   LoopNest::reorderAxis(loops.at(0), loops.at(1));
   loops = loop.getLoopStmtsFor(c);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   BufPtr rfac_buf;
   ASSERT_TRUE(loop.rfactor(c_body, loops.at(0), &rfac_buf));
@@ -1811,10 +1811,10 @@ TEST(Reductions, ReductionRfactorCacheTempInner) {
 
   std::vector<float> out(1, -1.f);
 
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
   LoopNest loop({c});
   std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
 
   LoopNest::reorderAxis(loops.at(0), loops.at(1));
   loops = loop.getLoopStmtsFor(c);
@@ -1871,7 +1871,7 @@ TEST(Reductions, ReductionVectorize) {
 
   Placeholder in(BufHandle("in", {8, 8}, kFloat));
 
-  Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
+  Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
   LoopNest l_before({tensor});
   LoopNest l(l_before);
   l_before.prepareForCodegen();
@@ -1909,7 +1909,7 @@ TEST(Reductions, ReductionVectorizeInner) {
 
   Placeholder in(BufHandle("in", {8, 8}, kFloat));
 
-  Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
+  Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
   LoopNest l({tensor});
 
   ASSERT_FALSE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[1]));
@@ -1929,7 +1929,7 @@ TEST(Reductions, ReductionVectorizeRfactor) {
 
   Placeholder in(BufHandle("in", {8, 8}, kFloat));
 
-  Tensor* tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}});
+  Tensor tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}});
 
   LoopNest l_before({tensor});
   LoopNest l(l_before);
@@ -1944,7 +1944,7 @@ TEST(Reductions, ReductionVectorizeRfactor) {
   std::vector<ForPtr> loops = l.getLoopStmtsFor(tensor);
   LoopNest::reorderAxis(loops[0], loops[1]);
   loops = l.getLoopStmtsFor(tensor);
-  auto tensor_body = l.getAllWritesToBuf(tensor->buf())[1];
+  auto tensor_body = l.getAllWritesToBuf(tensor.buf())[1];
   BufPtr rfac_buf = nullptr;
   ASSERT_TRUE(LoopNest::rfactor(tensor_body, loops.at(0), &rfac_buf));
 
@@ -1988,7 +1988,7 @@ TEST(Reductions, InitFunction) {
   constexpr int N = 16;
   Placeholder A("A", kFloat, {M, N});
   Placeholder B("B", kFloat, {N});
-  Tensor* C = Reduce(
+  Tensor C = Reduce(
       "C",
       {{N, "n"}},
       Sum(),
diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp
index c25ae4f68a1fc..0df9e9242e198 100644
--- a/test/cpp/tensorexpr/test_simplify.cpp
+++ b/test/cpp/tensorexpr/test_simplify.cpp
@@ -3932,7 +3932,7 @@ TEST(Simplify, SimplifyForCleansUp) {
   {
     Placeholder a("a", kFloat, {1, 12, 1});
     VarHandle x("x", kInt);
-    Tensor* b = Compute(
+    Tensor b = Compute(
         // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
         "x",
         {{1, "i"}, {12, "m"}, {1, "n"}},
diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp
index 5a6f257d6a79b..0f0277e37292e 100644
--- a/test/cpp/tensorexpr/tutorial.cpp
+++ b/test/cpp/tensorexpr/tutorial.cpp
@@ -155,8 +155,8 @@ int main(int argc, char* argv[]) {
     ExprPtr body = alloc<Mul>(i, j);
 
     // Finally, we pass all these pieces together to Tensor constructor:
-    Tensor* X = new Tensor(buf, args, body);
-    std::cout << "Tensor computation: " << *X << std::endl;
+    Tensor X = Tensor(buf, args, body);
+    std::cout << "Tensor computation: " << X << std::endl;
     // Prints:
     // Tensor computation: Tensor X[64, 32]:
     // for (int i = 0; i < 64; i++) {
@@ -171,11 +171,11 @@ int main(int argc, char* argv[]) {
     // constructing Exprs, Tensors also have a more convenient API for
     // construction. It is based on Compute API, which takes a name,
     // dimensions, and a lambda specifying the computation body:
-    Tensor* Z = Compute(
+    Tensor Z = Compute(
         "Z",
         {{64, "i"}, {32, "j"}},
         [](const VarHandle& i, const VarHandle& j) { return i / j; });
-    std::cout << "Tensor computation: " << *Z << std::endl;
+    std::cout << "Tensor computation: " << Z << std::endl;
     // Prints:
     // Tensor computation: Tensor Z[64, 32]:
     // for (int i = 0; i < 64; i++) {
@@ -187,13 +187,13 @@ int main(int argc, char* argv[]) {
     // Tensors might access other tensors and external placeholders in their
     // expressions. It can be done like so:
     Placeholder P("P", kInt, {64, 32});
-    Tensor* R = Compute(
+    Tensor R = Compute(
         "R",
         {{64, "i"}, {32, "j"}},
         [&](const VarHandle& i, const VarHandle& j) {
-          return Z->load(i, j) * P.load(i, j);
+          return Z.load(i, j) * P.load(i, j);
         });
-    std::cout << "Tensor computation: " << *R << std::endl;
+    std::cout << "Tensor computation: " << R << std::endl;
     // Prints:
     // Tensor computation: Tensor R[64, 32]:
     // for (int i = 0; i < 64; i++) {
@@ -224,20 +224,20 @@ int main(int argc, char* argv[]) {
     // Let's create a simple tensor expression and construct a loop nest for it.
     Placeholder A("A", kFloat, {64, 32});
     Placeholder B("B", kFloat, {64, 32});
-    Tensor* X = Compute(
+    Tensor X = Compute(
         "X",
         {{64, "i"}, {32, "j"}},
         [&](const VarHandle& i, const VarHandle& j) {
           return A.load(i, j) + B.load(i, j);
         });
-    Tensor* Y = Compute(
+    Tensor Y = Compute(
         "Y",
         {{64, "i"}, {32, "j"}},
         [&](const VarHandle& i, const VarHandle& j) {
-          return sigmoid(X->load(i, j));
+          return sigmoid(X.load(i, j));
         });
-    std::cout << "Tensor computation X: " << *X
-              << "Tensor computation Y: " << *Y << std::endl;
+    std::cout << "Tensor computation X: " << X << "Tensor computation Y: " << Y
+              << std::endl;
     // Prints:
     // Tensor computation X: Tensor X[64, 32]:
     // for (int i = 0; i < 64; i++) {
@@ -256,9 +256,7 @@ int main(int argc, char* argv[]) {
     // Creating a loop nest is as quite simple, we just need to specify a list
     // of all and a list of output tensors:
     // NOLINTNEXTLINE(bugprone-argument-comment)
-    std::vector<Tensor*> outputs = {Y};
-    std::vector<Tensor*> all = {X, Y};
-    LoopNest loopnest(outputs, all);
+    LoopNest loopnest(/*outputs=*/{Y}, /*all=*/{X, Y});
 
     // An IR used in LoopNest is based on tensor statements, represented by
     // `Stmt` class. Statements are used to specify the loop nest structure, and
@@ -357,7 +355,7 @@ int main(int argc, char* argv[]) {
     // Let's start by constructing a simple computation for us to work with:
     Placeholder A("A", kInt, {64, 32});
     Placeholder B("B", kInt, {64, 32});
-    Tensor* X = Compute(
+    Tensor X = Compute(
         "X",
         {{64, "i"}, {32, "j"}},
         [&](const VarHandle& i, const VarHandle& j) {
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 27f6e545ec7bc..0a34f476b0d3b 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -500,7 +500,7 @@ struct TEWrapper {
 
 void optimizePointwise(
     tensorexpr::LoopNest* ln,
-    tensorexpr::Tensor* target,
+    tensorexpr::Tensor target,
     int width) {
   using namespace torch::jit::tensorexpr;
   std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
@@ -513,7 +513,7 @@ void optimizePointwise(
 std::shared_ptr<TEWrapper> wrapTECompute(
     std::shared_ptr<TEWrapper> wrap,
     tensorexpr::Placeholder& in,
-    tensorexpr::Tensor* out,
+    tensorexpr::Tensor out,
     tensorexpr::VarHandle& dim,
     int width = kVectorWidth) {
   using namespace torch::jit::tensorexpr;
@@ -553,7 +553,7 @@ struct TEWrapper {
 std::shared_ptr<TEWrapper> wrapTECompute(
     std::shared_ptr<TEWrapper> wrap,
     tensorexpr::Placeholder& in,
-    tensorexpr::Tensor* out,
+    tensorexpr::Tensor out,
     tensorexpr::VarHandle& dim,
     int width = kVectorWidth) {
   return wrap;
@@ -593,7 +593,7 @@ std::shared_ptr<TEWrapper> createLogit(c10::optional<float> clamp) {
   auto wrap = std::make_shared<TEWrapper>();
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
     auto A_elem = [&]() {
       if (!clamp) {
         return A.load(i);
@@ -619,7 +619,7 @@ std::shared_ptr<TEWrapper> createRelu() {
   wrap = std::make_shared<TEWrapper>();
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
     auto zero = FloatImm::make(0.f);
     auto a = A.load(i);
     return ifThenElse(a < zero, zero, a);
@@ -638,7 +638,7 @@ std::shared_ptr<TEWrapper> createTanh() {
   wrap = std::make_shared<TEWrapper>();
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
     auto a = A.load(i);
     return fast_tanh(a);
   });
@@ -656,7 +656,7 @@ std::shared_ptr<TEWrapper> createSigmoid() {
   wrap = std::make_shared<TEWrapper>();
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
-  Tensor* B =
+  Tensor B =
       Compute("B", {N}, [&](const VarHandle& i) { return sigmoid(A.load(i)); });
   // NNC uses sleef for vectorizing sigmoid, which comes in an 8-wide flavor
   // (Sleef_expf8).
diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h
index 77ba8e173631e..d7cfe783fab8f 100644
--- a/torch/csrc/jit/tensorexpr/codegen.h
+++ b/torch/csrc/jit/tensorexpr/codegen.h
@@ -104,7 +104,7 @@ class TORCH_API CodeGen {
 class CodeGen::BufferArg {
  public:
   BufferArg(const Placeholder& buffer) : buf_(buffer.data()) {}
-  BufferArg(Tensor* tensor) : buf_(tensor->buf()) {}
+  BufferArg(Tensor tensor) : buf_(tensor.buf()) {}
   BufferArg(const VarHandle& var) : var_(var.node()), isVar_(true) {}
   BufferArg(const BufHandle& buf) : buf_(buf.node()) {}
 
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp
index f885246e24d2b..27b56e2f58146 100644
--- a/torch/csrc/jit/tensorexpr/ir_printer.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp
@@ -545,7 +545,7 @@ std::ostream& operator<<(std::ostream& stream, const Stmt& stmt) {
 }
 
 std::ostream& operator<<(std::ostream& stream, const Tensor& t) {
-  stream << std::to_string(&t);
+  stream << std::to_string(t);
   return stream;
 }
 
@@ -568,7 +568,7 @@ void print(StmtPtr stmt) {
   }
 }
 
-void print(const Tensor* t) {
+void print(const Tensor& t) {
   std::cout << std::to_string(t);
 }
 
@@ -589,20 +589,17 @@ std::string to_string(StmtPtr stmt) {
   return oss.str();
 }
 
-std::string to_string(const Tensor* t) {
-  if (!t) {
-    return "(null tensor)\n";
-  }
+std::string to_string(const Tensor& t) {
   std::ostringstream oss;
   // TODO: move this to Buf printer
-  oss << "Tensor " << t->buf()->name_hint() << "[";
-  for (const auto i : c10::irange(t->buf()->ndim())) {
+  oss << "Tensor " << t.buf()->name_hint() << "[";
+  for (const auto i : c10::irange(t.buf()->ndim())) {
     if (i != 0) {
       oss << ", ";
     }
-    oss << *t->buf()->dim(i);
+    oss << *t.buf()->dim(i);
   }
-  oss << "]:\n" << *t->stmt() << "\n";
+  oss << "]:\n" << *t.stmt() << "\n";
   return oss.str();
 }
 } // namespace std
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h
index e76dccab846a1..321d1efe55457 100644
--- a/torch/csrc/jit/tensorexpr/ir_printer.h
+++ b/torch/csrc/jit/tensorexpr/ir_printer.h
@@ -103,7 +103,7 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Tensor&);
 
 TORCH_API void print(ExprPtr expr);
 TORCH_API void print(StmtPtr stmt);
-TORCH_API void print(const Tensor* t);
+TORCH_API void print(const Tensor& t);
 
 } // namespace tensorexpr
 } // namespace jit
@@ -119,5 +119,5 @@ using torch::jit::tensorexpr::Tensor;
 
 TORCH_API std::string to_string(ExprPtr expr);
 TORCH_API std::string to_string(StmtPtr stmt);
-TORCH_API std::string to_string(const Tensor* t);
+TORCH_API std::string to_string(const Tensor& t);
 } // namespace std
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index faacd022e7e0b..8076ba2b71d67 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -960,7 +960,7 @@ std::vector<ExprHandle> TensorExprKernel::broadcastShapesMut(
   return res.first;
 }
 
-Tensor* computeOneOperand(
+Tensor computeOneOperand(
     const std::string& name,
     const std::vector<ArgValue>& inputValues,
     const std::vector<ExprHandle>& outputShape,
@@ -981,7 +981,7 @@ Tensor* computeOneOperand(
       });
 }
 
-Tensor* computeTwoOperand(
+Tensor computeTwoOperand(
     const std::string& name,
     const std::vector<ArgValue>& inputValues,
     const std::vector<ExprHandle>& outputShape,
@@ -1004,7 +1004,7 @@ Tensor* computeTwoOperand(
       });
 }
 
-Tensor* computeTwoOperandWithAlpha(
+Tensor computeTwoOperandWithAlpha(
     const std::string& name,
     const std::vector<ArgValue>& inputValues,
     const std::vector<ExprHandle>& outputShape,
@@ -1028,7 +1028,7 @@ Tensor* computeTwoOperandWithAlpha(
       });
 }
 
-Tensor* computeConditionWithTwoOperand(
+Tensor computeConditionWithTwoOperand(
     const std::string& name,
     const std::vector<ArgValue>& inputValues,
     const std::vector<ExprHandle>& outputShape,
@@ -1055,7 +1055,7 @@ Tensor* computeConditionWithTwoOperand(
       });
 }
 
-Tensor* computeThreeOperand(
+Tensor computeThreeOperand(
     const std::string& name,
     const std::vector<ArgValue>& inputValues,
     const std::vector<ExprHandle>& outputShape,
@@ -1083,7 +1083,7 @@ Tensor* computeThreeOperand(
         return demoteOutput(compute, outputType);
       });
 }
-Tensor* computeFourOperand(
+Tensor computeFourOperand(
     const std::string& name,
     const std::vector<ArgValue>& inputValues,
     const std::vector<ExprHandle>& outputShape,
@@ -1135,7 +1135,7 @@ std::pair<ScalarType, std::vector<BufHandle>> processCatList(
   }
   return {highType, nonEmptyInputs};
 }
-Tensor* computeCatWoConditionals(
+Tensor computeCatWoConditionals(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape) {
   // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
@@ -1164,7 +1164,7 @@ Tensor* computeCatWoConditionals(
   auto output_buf =
       alloc<Buf>("aten_cat", output_sizes_expr, ToDtype(high_type));
   if (non_empty_inputs.size() == 0) {
-    return new Tensor(
+    return Tensor(
         output_buf, alloc<tensorexpr::Block>(std::vector<StmtPtr>({})));
   }
 
@@ -1213,10 +1213,10 @@ Tensor* computeCatWoConditionals(
     concat_dim_size =
         alloc<Add>(concat_dim_size, input_dims[norm_concat_dim].node());
   }
-  return new Tensor(output_buf, IRSimplifier::simplify(block));
+  return Tensor(output_buf, IRSimplifier::simplify(block));
 }
 
-Tensor* computeCat(
+Tensor computeCat(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     at::Device device) {
@@ -1276,7 +1276,7 @@ Tensor* computeCat(
       });
 }
 
-Tensor* computeConv2d(
+Tensor computeConv2d(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType) {
@@ -1319,10 +1319,10 @@ Tensor* computeConv2d(
        dilation[0],
        dilation[1],
        groups});
-  return new Tensor(ResultBuf.node(), s);
+  return Tensor(ResultBuf.node(), s);
 }
 
-Tensor* tensorexpr::computeOperandValue(
+Tensor tensorexpr::computeOperandValue(
     c10::Symbol op,
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
@@ -2391,7 +2391,7 @@ c10::optional<ScalarType> findDtypeForValue(const torch::jit::Value* v) {
   return c10::nullopt;
 }
 
-Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) {
+Tensor TensorExprKernel::computeValue(const torch::jit::Value* v) {
   auto inputs = v->node()->inputs();
   auto op = v->node()->kind();
 
@@ -2703,9 +2703,9 @@ static std::vector<ExprHandle> toExprHandles(const std::vector<T>& sizes) {
   return dims;
 }
 
-Tensor* TensorExprKernel::bindInput(const torch::jit::Value* input) {
+Tensor TensorExprKernel::bindInput(const torch::jit::Value* input) {
   auto const& t = input->type();
-  Tensor* result = nullptr;
+  Tensor result(nullptr, nullptr);
   switch (t->kind()) {
     case TypeKind::TensorType: {
       auto tt = input->type()->cast<TensorType>();
@@ -2744,7 +2744,7 @@ Tensor* TensorExprKernel::bindInput(const torch::jit::Value* input) {
             }
             return inBuffer.load(idx);
           });
-      bufs_.emplace(input, result->buf());
+      bufs_.emplace(input, result.buf());
 
       bufferArgs_.emplace_back(inBuffer);
       break;
@@ -2800,7 +2800,7 @@ bool denseAndNonOverlapping(
   return (strides == at::infer_dense_strides(sizes, strides));
 }
 
-Tensor* TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) {
+Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) {
   const TensorTypePtr& tt = v->type()->expect<TensorType>();
   TORCH_INTERNAL_ASSERT(bufs_.count(v));
   BufPtr buf = bufs_.at(v);
@@ -2816,19 +2816,19 @@ Tensor* TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) {
   auto sizes = *tt->sizes().concrete_sizes();
   std::vector<int64_t> default_strides = TensorType::contiguousStridesOf(sizes);
   if (!tt->strides().concrete_sizes()) {
-    return new Tensor(buf, nullptr);
+    return Tensor(buf, nullptr);
   }
   TORCH_INTERNAL_ASSERT(tt->strides().concrete_sizes());
   const std::vector<int64_t> strides = *tt->strides().concrete_sizes();
   // All Tensors in NNC are layed out in default, contiguous layout.
   // If the output is also default contiguous we don't need to do anything
   if (strides == default_strides) {
-    return new Tensor(buf, nullptr);
+    return Tensor(buf, nullptr);
   }
   // If the tensor is not dense or overlaps, we have
   // no way of matching the profiled striding
   if (!denseAndNonOverlapping(sizes, strides)) {
-    return new Tensor(buf, nullptr);
+    return Tensor(buf, nullptr);
   }
 
   auto dims = c10::fmap<DimArg>(sizesForValue(v));
@@ -2922,8 +2922,9 @@ void TensorExprKernel::compile() {
   nInputs_ = graph_->inputs().size();
   genInputDebugNames();
   for (auto const& input : graph_->inputs()) {
-    if (Tensor* t = bindInput(input)) {
-      block->append_stmt(t->stmt());
+    Tensor t = bindInput(input);
+    if (t.stmt()) {
+      block->append_stmt(t.stmt());
     }
   }
 
@@ -2937,10 +2938,9 @@ void TensorExprKernel::compile() {
     } else {
       for (auto const& output : n->outputs()) {
         if (output->hasUses()) {
-          Tensor* t = computeValue(output);
-          bufs_.emplace(output, t->buf());
-          // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
-          block->append_stmt(t->stmt());
+          Tensor t = computeValue(output);
+          bufs_.emplace(output, t.buf());
+          block->append_stmt(t.stmt());
         }
       }
     }
@@ -2958,12 +2958,12 @@ void TensorExprKernel::compile() {
     // The "strided" tensor will be incorrect if used in NNC,
     // since NNC views it as contiguous. Only convert it to the right
     // strides at the end of the kernel (if already contiguous it's a no-op)
-    Tensor* properly_strided_output = convertOutputToCorrectStrides(output);
-    if (properly_strided_output->stmt()) {
-      block->append_stmt(properly_strided_output->stmt());
+    Tensor properly_strided_output = convertOutputToCorrectStrides(output);
+    if (properly_strided_output.stmt()) {
+      block->append_stmt(properly_strided_output.stmt());
     }
     // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
-    bufs_[output] = properly_strided_output->buf();
+    bufs_[output] = properly_strided_output.buf();
     const auto& tt = output->type()->expect<TensorType>();
     auto sizes = *tt->sizes().concrete_sizes();
     tensorOutputSizes_.push_back(sizes);
diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h
index 7b35e1e44905c..a8a57b9f15a16 100644
--- a/torch/csrc/jit/tensorexpr/kernel.h
+++ b/torch/csrc/jit/tensorexpr/kernel.h
@@ -47,7 +47,7 @@ using ArgValue = c10::variant<
     IntList,
     ArgNone>;
 
-using NNCLoweringFunction = std::function<Tensor*(
+using NNCLoweringFunction = std::function<Tensor(
     const std::vector<ArgValue>&,
     const std::vector<ExprHandle>&,
     const c10::optional<ScalarType>&,
@@ -123,7 +123,7 @@ struct TensorInfo {
   c10::ScalarType dtype;
 };
 
-TORCH_API Tensor* computeOperandValue(
+TORCH_API Tensor computeOperandValue(
     c10::Symbol op,
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
@@ -209,7 +209,7 @@ class TORCH_API TensorExprKernel {
       const torch::jit::Value* v,
       const std::vector<ExprHandle>& axes);
 
-  Tensor* computeValue(const torch::jit::Value* v);
+  Tensor computeValue(const torch::jit::Value* v);
 
   void bindConstant(const torch::jit::Value* v);
 
@@ -222,9 +222,9 @@ class TORCH_API TensorExprKernel {
       std::vector<at::Tensor>& outputs);
   BackendType inferBackendTypeFromDevice(at::Device device);
 
-  Tensor* bindInput(const torch::jit::Value* input);
+  Tensor bindInput(const torch::jit::Value* input);
 
-  Tensor* convertOutputToCorrectStrides(torch::jit::Value* v);
+  Tensor convertOutputToCorrectStrides(torch::jit::Value* v);
 
   // Captures the information for reduction operation nodes.
   struct ReductionInfo {
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index d9d20736057fb..190499998b289 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -47,14 +47,14 @@ LoopNest::LoopNest(StmtPtr stmt, std::unordered_set<BufPtr> output_bufs)
 
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 LoopNest::LoopNest(
-    const std::vector<Tensor*>& output_tensors,
-    const std::vector<Tensor*>& tensors_to_compute) {
+    const std::vector<Tensor>& output_tensors,
+    const std::vector<Tensor>& tensors_to_compute) {
   initialize(output_tensors, tensors_to_compute);
   verify(root_stmt_);
 }
 
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-LoopNest::LoopNest(const std::vector<Tensor*>& output_tensors) {
+LoopNest::LoopNest(const std::vector<Tensor>& output_tensors) {
   initialize(output_tensors, output_tensors);
   verify(root_stmt_);
 }
@@ -486,15 +486,15 @@ bool LoopNest::vectorize(ForPtr f) {
 }
 
 void LoopNest::initialize(
-    const std::vector<Tensor*>& output_tensors,
-    const std::vector<Tensor*>& tensors_to_compute) {
+    const std::vector<Tensor>& output_tensors,
+    const std::vector<Tensor>& tensors_to_compute) {
   for (auto t : output_tensors) {
-    output_bufs_.insert(t->buf());
+    output_bufs_.insert(t.buf());
   }
 
   std::vector<StmtPtr> loops;
-  for (Tensor* t : tensors_to_compute) {
-    StmtPtr loop = t->stmt();
+  for (Tensor t : tensors_to_compute) {
+    StmtPtr loop = t.stmt();
     if (loop->get_parent()) {
       std::cerr << "Error: creating a loopnest from already used Tensors\n";
       loops = {};
@@ -2384,7 +2384,7 @@ void LoopNest::compressAllBuffers(StmtPtr stmt) {
   }
 }
 
-std::vector<ForPtr> LoopNest::getLoopStmtsFor(Tensor* t) const {
+std::vector<ForPtr> LoopNest::getLoopStmtsFor(Tensor t) const {
   StmtPtr cur_stmt = getLoopBodyFor(t);
   return getLoopStmtsFor(cur_stmt);
 }
@@ -2407,8 +2407,8 @@ std::vector<ForPtr> LoopNest::getLoopStmtsFor(StmtPtr s) const {
   return result;
 }
 
-StmtPtr LoopNest::getLoopBodyFor(Tensor* t) const {
-  return getLoopBodyFor(t->buf());
+StmtPtr LoopNest::getLoopBodyFor(Tensor t) const {
+  return getLoopBodyFor(t.buf());
 }
 
 StmtPtr LoopNest::getLoopBodyFor(BufPtr buf) const {
diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h
index c8cf2d8553d2d..42f072d2da7d8 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.h
+++ b/torch/csrc/jit/tensorexpr/loopnest.h
@@ -27,11 +27,11 @@ class TORCH_API LoopNest {
  public:
   // A constructor for building a LoopNest from a list of Tensors
   LoopNest(
-      const std::vector<Tensor*>& output_tensors,
-      const std::vector<Tensor*>& tensors_to_compute);
+      const std::vector<Tensor>& output_tensors,
+      const std::vector<Tensor>& tensors_to_compute);
 
   // A convenience constructor for the case when all tensors are output tensors
-  LoopNest(const std::vector<Tensor*>& output_tensors);
+  LoopNest(const std::vector<Tensor>& output_tensors);
 
   // A constructor for building a LoopNest from an Stmt and a list of output
   // buffers.
@@ -45,10 +45,10 @@ class TORCH_API LoopNest {
     return root_stmt_;
   }
 
-  std::vector<ForPtr> getLoopStmtsFor(Tensor*) const;
+  std::vector<ForPtr> getLoopStmtsFor(Tensor) const;
   std::vector<ForPtr> getLoopStmtsFor(BufPtr) const;
   std::vector<ForPtr> getLoopStmtsFor(StmtPtr) const;
-  StmtPtr getLoopBodyFor(Tensor*) const;
+  StmtPtr getLoopBodyFor(Tensor) const;
   StmtPtr getLoopBodyFor(BufPtr) const;
 
   // Returns the For stmt indexed by 'indices' in the 'root' For stmt.
@@ -547,8 +547,8 @@ class TORCH_API LoopNest {
 
  private:
   void initialize(
-      const std::vector<Tensor*>& output_tensors,
-      const std::vector<Tensor*>& tensors_to_compute);
+      const std::vector<Tensor>& output_tensors,
+      const std::vector<Tensor>& tensors_to_compute);
   StmtPtr insertAllocFree(StmtPtr stmt);
   const std::unordered_set<BufPtr> getIntermediateBufs() const;
 
diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp
index c4af83a8cc6f4..51d323f4130a4 100644
--- a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp
@@ -16,7 +16,7 @@ void assert_dims_constant(const BufHandle& buf) {
 
 using InitFunc = std::function<ExprHandle(const std::vector<VarHandle>&)>;
 
-Tensor* conv2d_depthwise_static(
+Tensor conv2d_depthwise_static(
     BufHandle input,
     BufHandle weight,
     const InitFunc& init_func,
@@ -45,7 +45,7 @@ Tensor* conv2d_depthwise_static(
   auto OH = (H - R + 2 * pad) / stride + 1;
   auto OW = (W - S + 2 * pad) / stride + 1;
 
-  Tensor* conv = Reduce(
+  Tensor conv = Reduce(
       "conv2d_depthwise",
       {{N, "n"}, {K, "k"}, {OH, "oh"}, {OW, "ow"}},
       Sum(),
@@ -83,7 +83,7 @@ Tensor* conv2d_depthwise_static(
   } else if (R == 3 && stride == 1 && pad == 1) {
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     ForPtr main, peeled;
-    auto loops = nest.getAllLoopNestsWritingToBuf(conv->buf());
+    auto loops = nest.getAllLoopNestsWritingToBuf(conv.buf());
     main = loops[1][kLoopW];
     nest.sliceHead(main, 1, &peeled, &main);
     nest.sliceTail(main, 1, &main, &peeled);
@@ -92,10 +92,10 @@ Tensor* conv2d_depthwise_static(
     nest.sliceTail(main, 1, &main, &peeled);
   }
 
-  return new Tensor(conv->buf(), nest.root_stmt());
+  return Tensor(conv.buf(), nest.root_stmt());
 }
 
-Tensor* conv2d_depthwise_dynamic(
+Tensor conv2d_depthwise_dynamic(
     BufHandle input,
     BufHandle weight,
     const InitFunc& init_func,
@@ -144,7 +144,7 @@ Tensor* conv2d_depthwise_dynamic(
 
 } // namespace
 
-Tensor* conv2d_depthwise(
+Tensor conv2d_depthwise(
     BufHandle input,
     BufHandle weight,
     BufHandle bias,
@@ -158,7 +158,7 @@ Tensor* conv2d_depthwise(
   return conv2d_depthwise_static(input, weight, init_func, stride, pad, groups);
 }
 
-Tensor* conv2d_depthwise(
+Tensor conv2d_depthwise(
     BufHandle input,
     BufHandle weight,
     int stride,
@@ -170,7 +170,7 @@ Tensor* conv2d_depthwise(
   return conv2d_depthwise_static(input, weight, init_func, stride, pad, groups);
 }
 
-Tensor* conv2d_depthwise(
+Tensor conv2d_depthwise(
     BufHandle input,
     BufHandle weight,
     BufHandle bias,
@@ -206,7 +206,7 @@ Tensor* conv2d_depthwise(
       groups);
 }
 
-Tensor* conv2d_depthwise(
+Tensor conv2d_depthwise(
     BufHandle input,
     BufHandle weight,
     ExprHandle N,
diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.h b/torch/csrc/jit/tensorexpr/operators/conv2d.h
index 14612fb17ee74..4c2215b38d868 100644
--- a/torch/csrc/jit/tensorexpr/operators/conv2d.h
+++ b/torch/csrc/jit/tensorexpr/operators/conv2d.h
@@ -7,7 +7,7 @@ namespace jit {
 namespace tensorexpr {
 
 // An API to compute 2D depthwise convolutions with bias.
-TORCH_API Tensor* conv2d_depthwise(
+TORCH_API Tensor conv2d_depthwise(
     BufHandle input,
     BufHandle weight,
     BufHandle bias,
@@ -16,14 +16,14 @@ TORCH_API Tensor* conv2d_depthwise(
     int groups);
 
 // An API to compute 2D depthwise convolutions without bias.
-TORCH_API Tensor* conv2d_depthwise(
+TORCH_API Tensor conv2d_depthwise(
     BufHandle input,
     BufHandle weight,
     int stride,
     int pad,
     int groups);
 
-TORCH_API Tensor* conv2d_depthwise(
+TORCH_API Tensor conv2d_depthwise(
     BufHandle input,
     BufHandle weight,
     BufHandle bias,
@@ -39,7 +39,7 @@ TORCH_API Tensor* conv2d_depthwise(
     ExprHandle pad,
     ExprHandle groups);
 
-TORCH_API Tensor* conv2d_depthwise(
+TORCH_API Tensor conv2d_depthwise(
     BufHandle input,
     BufHandle weight,
     ExprHandle N,
diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.cpp b/torch/csrc/jit/tensorexpr/operators/matmul.cpp
index 23cb45564c97c..581514cdcb095 100644
--- a/torch/csrc/jit/tensorexpr/operators/matmul.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/matmul.cpp
@@ -5,7 +5,7 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-Tensor* computeMatmul(
+Tensor computeMatmul(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType) {
@@ -44,13 +44,13 @@ Tensor* computeMatmul(
         },
         {{size_a[1], "K"}});
   } else {
-    return new Tensor(
+    return Tensor(
         ResultBuf.node(),
         ExternalCall::make(ResultBuf, "nnc_aten_matmul", {a, b}, {}));
   }
 }
 
-Tensor* computeAddMM(
+Tensor computeAddMM(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType) {
@@ -59,7 +59,7 @@ Tensor* computeAddMM(
     dtype = Dtype(*outputType);
   }
   BufHandle ResultBuf("addmm", outputShape, dtype);
-  return new Tensor(
+  return Tensor(
       ResultBuf.node(),
       ExternalCall::make(
           ResultBuf,
diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.h b/torch/csrc/jit/tensorexpr/operators/matmul.h
index 35b30f4168914..0b52ad65c43c8 100644
--- a/torch/csrc/jit/tensorexpr/operators/matmul.h
+++ b/torch/csrc/jit/tensorexpr/operators/matmul.h
@@ -6,11 +6,11 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-Tensor* computeMatmul(
+Tensor computeMatmul(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType);
-Tensor* computeAddMM(
+Tensor computeAddMM(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType);
diff --git a/torch/csrc/jit/tensorexpr/operators/norm.cpp b/torch/csrc/jit/tensorexpr/operators/norm.cpp
index d96ebcd9447db..610f928d4e0b8 100644
--- a/torch/csrc/jit/tensorexpr/operators/norm.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/norm.cpp
@@ -4,7 +4,7 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-Tensor* computeBatchNorm(
+Tensor computeBatchNorm(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType) {
diff --git a/torch/csrc/jit/tensorexpr/operators/norm.h b/torch/csrc/jit/tensorexpr/operators/norm.h
index 98d53b4c306e3..7f1412f0aecd0 100644
--- a/torch/csrc/jit/tensorexpr/operators/norm.h
+++ b/torch/csrc/jit/tensorexpr/operators/norm.h
@@ -6,7 +6,7 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-Tensor* computeBatchNorm(
+Tensor computeBatchNorm(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType);
diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.cpp b/torch/csrc/jit/tensorexpr/operators/reduction.cpp
index c1f3f7f4f2630..fe5cb6d286bd5 100644
--- a/torch/csrc/jit/tensorexpr/operators/reduction.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/reduction.cpp
@@ -19,7 +19,7 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-Tensor* computeSum(
+Tensor computeSum(
     const std::vector<ArgValue>& inputs,
     const c10::optional<ScalarType>& outputType) {
   std::vector<size_t> axes;
@@ -100,7 +100,7 @@ Tensor* computeSum(
       reductionDims);
 }
 
-Tensor* computeMean(
+Tensor computeMean(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType) {
@@ -120,13 +120,13 @@ Tensor* computeMean(
       mean_dims_expr.emplace_back(idx);
     }
   }
-  return new Tensor(
+  return Tensor(
       ResultBuf.node(),
       ExternalCall::make(
           ResultBuf, "nnc_aten_mean", {InputBuf}, mean_dims_expr));
 }
 
-Tensor* computeAdaptiveAvgPool2d(
+Tensor computeAdaptiveAvgPool2d(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType) {
@@ -137,7 +137,7 @@ Tensor* computeAdaptiveAvgPool2d(
   BufHandle ResultBuf("adaptive_avgpool2d", outputShape, dtype);
   // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
   auto out_size_param = c10::get<IntList>(inputs[1]);
-  return new Tensor(
+  return Tensor(
       ResultBuf.node(),
       ExternalCall::make(
           ResultBuf,
diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.h b/torch/csrc/jit/tensorexpr/operators/reduction.h
index 4335d7b3bd7f7..d76bac6aa34a1 100644
--- a/torch/csrc/jit/tensorexpr/operators/reduction.h
+++ b/torch/csrc/jit/tensorexpr/operators/reduction.h
@@ -6,14 +6,14 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-TORCH_API Tensor* computeSum(
+TORCH_API Tensor computeSum(
     const std::vector<ArgValue>& inputs,
     const c10::optional<ScalarType>& outputType);
-TORCH_API Tensor* computeMean(
+TORCH_API Tensor computeMean(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType);
-TORCH_API Tensor* computeAdaptiveAvgPool2d(
+TORCH_API Tensor computeAdaptiveAvgPool2d(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     const c10::optional<ScalarType>& outputType);
diff --git a/torch/csrc/jit/tensorexpr/operators/softmax.cpp b/torch/csrc/jit/tensorexpr/operators/softmax.cpp
index d6cb6c0d7d089..c1c2872cc4efe 100644
--- a/torch/csrc/jit/tensorexpr/operators/softmax.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/softmax.cpp
@@ -6,7 +6,7 @@ namespace tensorexpr {
 
 using namespace torch::jit::tensorexpr;
 
-Tensor* computeSoftmax(
+Tensor computeSoftmax(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     bool log_softmax) {
@@ -111,48 +111,43 @@ Tensor* computeSoftmax(
       Compute("aten_softmax_exp", output_dims, [&](ParameterList& indices) {
         auto inp = tensorOrConstant(
             inputs[0], convert_indices_to_expr_handle(indices));
-        return exp(inp - max->load(remove_softmax_dim_index(indices)));
+        return exp(inp - max.load(remove_softmax_dim_index(indices)));
       });
   auto sum = Reduce(
       "aten_softmax_sum",
       non_softmax_dims,
       Sum(),
       [&](ParameterList& indices) {
-        return e->load(move_softmax_dim_index_to_pos(indices));
+        return e.load(move_softmax_dim_index_to_pos(indices));
       },
       {output_dims[softmax_dim]});
   if (!log_softmax) {
     auto result =
         Compute("aten_softmax", output_dims, [&](ParameterList& indices) {
-          return e->load(indices) /
-              sum->load(remove_softmax_dim_index(indices));
+          return e.load(indices) / sum.load(remove_softmax_dim_index(indices));
         });
-    return new Tensor(
-        result->buf(),
+    return Tensor(
+        result.buf(),
         alloc<tensorexpr::Block>(std::vector<StmtPtr>(
-            {max->stmt(), e->stmt(), sum->stmt(), result->stmt()})));
+            {max.stmt(), e.stmt(), sum.stmt(), result.stmt()})));
   }
 
   auto log_sum = Compute(
       "aten_softmax_log_sum", non_softmax_dims, [&](ParameterList& indices) {
-        return log(sum->load(indices));
+        return log(sum.load(indices));
       });
   auto result =
       Compute("aten_log_softmax", output_dims, [&](ParameterList& indices) {
         auto inp = tensorOrConstant(
             inputs[0], convert_indices_to_expr_handle(indices));
         auto non_softmax_indices = remove_softmax_dim_index(indices);
-        return inp - max->load(non_softmax_indices) -
-            log_sum->load(non_softmax_indices);
+        return inp - max.load(non_softmax_indices) -
+            log_sum.load(non_softmax_indices);
       });
-  return new Tensor(
-      result->buf(),
+  return Tensor(
+      result.buf(),
       alloc<tensorexpr::Block>(std::vector<StmtPtr>(
-          {max->stmt(),
-           e->stmt(),
-           sum->stmt(),
-           log_sum->stmt(),
-           result->stmt()})));
+          {max.stmt(), e.stmt(), sum.stmt(), log_sum.stmt(), result.stmt()})));
 }
 
 } // namespace tensorexpr
diff --git a/torch/csrc/jit/tensorexpr/operators/softmax.h b/torch/csrc/jit/tensorexpr/operators/softmax.h
index 07ddd0f95b355..b74a867a91b9b 100644
--- a/torch/csrc/jit/tensorexpr/operators/softmax.h
+++ b/torch/csrc/jit/tensorexpr/operators/softmax.h
@@ -6,7 +6,7 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-Tensor* computeSoftmax(
+Tensor computeSoftmax(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape,
     bool log_softmax);
diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp
index 9df70f81be4a9..ea3902dcf3c0d 100644
--- a/torch/csrc/jit/tensorexpr/tensor.cpp
+++ b/torch/csrc/jit/tensorexpr/tensor.cpp
@@ -48,7 +48,7 @@ StmtPtr Tensor::constructStmt(
   return s;
 }
 
-Tensor* Compute(
+Tensor Compute(
     const std::string& name,
     const std::vector<DimArg>& dim_args,
     const std::function<ExprHandle(const std::vector<VarHandle>&)>& body_func) {
@@ -57,10 +57,10 @@ Tensor* Compute(
   unpack_dim_args(dim_args, &dims, &args);
   ExprPtr body = body_func(VarVectorToVarHandleVector(args)).node();
   BufPtr buf = alloc<Buf>(name, dims, body->dtype());
-  return new Tensor(buf, args, body);
+  return Tensor(buf, args, body);
 }
 
-Tensor* Compute(
+Tensor Compute(
     const std::string& name,
     const std::vector<DimArg>& dim_args,
     const std::function<ExprHandle(const VarHandle&)>& body_func) {
@@ -73,10 +73,10 @@ Tensor* Compute(
   unpack_dim_args(dim_args, &dims, &args);
   ExprPtr body = body_func(VarHandle(args[0])).node();
   BufPtr buf = alloc<Buf>(name, dims, body->dtype());
-  return new Tensor(buf, args, body);
+  return Tensor(buf, args, body);
 }
 
-Tensor* Compute(
+Tensor Compute(
     const std::string& name,
     const std::vector<DimArg>& dim_args,
     const std::function<ExprHandle(const VarHandle&, const VarHandle&)>&
@@ -89,10 +89,10 @@ Tensor* Compute(
   unpack_dim_args(dim_args, &dims, &args);
   ExprPtr body = body_func(VarHandle(args[0]), VarHandle(args[1])).node();
   BufPtr buf = alloc<Buf>(name, dims, body->dtype());
-  return new Tensor(buf, args, body);
+  return Tensor(buf, args, body);
 }
 
-Tensor* Compute(
+Tensor Compute(
     const std::string& name,
     const std::vector<DimArg>& dim_args,
     const std::function<
@@ -108,10 +108,10 @@ Tensor* Compute(
       body_func(VarHandle(args[0]), VarHandle(args[1]), VarHandle(args[2]))
           .node();
   BufPtr buf = alloc<Buf>(name, dims, body->dtype());
-  return new Tensor(buf, args, body);
+  return Tensor(buf, args, body);
 }
 
-Tensor* Compute(
+Tensor Compute(
     const std::string& name,
     const std::vector<DimArg>& dim_args,
     const std::function<ExprHandle(
@@ -132,10 +132,10 @@ Tensor* Compute(
                      VarHandle(args[3]))
                      .node();
   BufPtr buf = alloc<Buf>(name, dims, body->dtype());
-  return new Tensor(buf, args, body);
+  return Tensor(buf, args, body);
 }
 
-Tensor* Reduce(
+Tensor Reduce(
     const std::string& name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
@@ -149,7 +149,7 @@ Tensor* Reduce(
       reduce_args);
 }
 
-Tensor* Reduce(
+Tensor Reduce(
     const std::string& name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
@@ -163,17 +163,17 @@ Tensor* Reduce(
       reduce_args);
 }
 
-Tensor* Reduce(
+Tensor Reduce(
     const std::string& name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
-    Tensor* tensor,
+    Tensor tensor,
     const std::vector<DimArg>& reduce_args) {
   return Reduce(
       name,
       dim_args,
       reducer,
-      [&](ParameterList& p) { return tensor->load(p); },
+      [&](ParameterList& p) { return tensor.load(p); },
       reduce_args);
 }
 
diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h
index 3eb02c69bda78..8d8ffe5cfee44 100644
--- a/torch/csrc/jit/tensorexpr/tensor.h
+++ b/torch/csrc/jit/tensorexpr/tensor.h
@@ -12,7 +12,7 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-class TORCH_API Tensor : KernelScopedObject {
+class TORCH_API Tensor {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
   Tensor(BufPtr buf, const std::vector<VarPtr>& args, ExprPtr body)
@@ -42,9 +42,9 @@ class TORCH_API Tensor : KernelScopedObject {
   }
 
   template <typename T>
-  inline ExprHandle load(const std::vector<T>& args);
+  inline ExprHandle load(const std::vector<T>& args) const;
   template <typename... Ts>
-  inline ExprHandle load(const Ts&... ts);
+  inline ExprHandle load(const Ts&... ts) const;
 
  private:
   StmtPtr constructStmt(
@@ -134,22 +134,22 @@ class Placeholder {
   std::vector<ExprPtr> strides_;
 };
 
-TORCH_API Tensor* Compute(
+TORCH_API Tensor Compute(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const std::function<ExprHandle(const VarHandle&)>& body_func);
-TORCH_API Tensor* Compute(
+TORCH_API Tensor Compute(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const std::function<ExprHandle(const VarHandle&, const VarHandle&)>&
         body_func);
-TORCH_API Tensor* Compute(
+TORCH_API Tensor Compute(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const std::function<
         ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>&
         body_func);
-TORCH_API Tensor* Compute(
+TORCH_API Tensor Compute(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const std::function<ExprHandle(
@@ -157,7 +157,7 @@ TORCH_API Tensor* Compute(
         const VarHandle&,
         const VarHandle&,
         const VarHandle&)>& body_func);
-TORCH_API Tensor* Compute(
+TORCH_API Tensor Compute(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const std::function<ExprHandle(const std::vector<VarHandle>&)>& body_func);
@@ -179,7 +179,7 @@ inline void unpack_dim_args(
 
 // Handle reductions over a Reducer and a body_func which produces values.
 template <typename InitFunc, typename BodyFunc>
-Tensor* Reduce(
+Tensor Reduce(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
@@ -207,7 +207,7 @@ Tensor* Reduce(
             .node();
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     BufPtr func_result = alloc<Buf>(func_name, dims, body->dtype());
-    return new Tensor(func_result, vars, body);
+    return Tensor(func_result, vars, body);
   }
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -227,13 +227,12 @@ Tensor* Reduce(
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ReduceOpPtr reduce_op = reducer(func_result, body, output_args, reduce_vars);
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  Tensor* t =
-      new Tensor(func_result, vars, reduce_dims, reduce_vars, reduce_op);
+  Tensor t = Tensor(func_result, vars, reduce_dims, reduce_vars, reduce_op);
   return t;
 }
 
 template <typename BodyFunc>
-Tensor* Reduce(
+Tensor Reduce(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
@@ -250,7 +249,7 @@ Tensor* Reduce(
 
 // Overload which allows inline lambda functions for the body_func.
 template <typename BodyFunc>
-Tensor* Reduce(
+Tensor Reduce(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
@@ -260,14 +259,14 @@ Tensor* Reduce(
 }
 
 // Overload for the common case of all dimensions of a Placeholder.
-TORCH_API Tensor* Reduce(
+TORCH_API Tensor Reduce(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
     const Placeholder& buffer,
     const std::vector<DimArg>& reduce_args);
 
-TORCH_API Tensor* Reduce(
+TORCH_API Tensor Reduce(
     const std::string& name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
@@ -276,22 +275,22 @@ TORCH_API Tensor* Reduce(
 
 // Overload for the common case of all dimensions of a prevously Computed
 // Tensor.
-TORCH_API Tensor* Reduce(
+TORCH_API Tensor Reduce(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
     const Reducer& reducer,
-    Tensor* tensor,
+    Tensor tensor,
     const std::vector<DimArg>& reduce_args);
 
 template <typename... Ts>
-inline ExprHandle Tensor::load(const Ts&... ts) {
+inline ExprHandle Tensor::load(const Ts&... ts) const {
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   std::vector<ExprHandle> params({ExprHandle(ts)...});
   return Load::make(BufHandle(this->buf()), params);
 }
 
 template <typename T>
-inline ExprHandle Tensor::load(const std::vector<T>& args) {
+inline ExprHandle Tensor::load(const std::vector<T>& args) const {
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   std::vector<ExprHandle> params(args.begin(), args.end());
   return Load::make(BufHandle(this->buf()), params);
diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
index 4e1618a8745d7..c380233cce16a 100644
--- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
+++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
@@ -185,9 +185,9 @@ void initTensorExprBindings(PyObject* module) {
              const std::vector<ExprHandle>& args,
              const ExprHandle& val) { return self.store(args, val); })
       .def("data", [](Placeholder& self) { return BufHandle(self.data()); });
-  py::class_<Tensor, std::unique_ptr<Tensor, py::nodelete>>(te, "Tensor")
-      .def(py::init(
-          [](BufHandle& b, StmtPtr s) { return new Tensor(b.node(), s); }))
+  py::class_<Tensor>(te, "Tensor")
+      .def(
+          py::init([](BufHandle& b, StmtPtr s) { return Tensor(b.node(), s); }))
       .def(
           "load",
           [](Tensor& self, const std::vector<ExprHandle>& v) {
@@ -268,7 +268,7 @@ void initTensorExprBindings(PyObject* module) {
       [](const std::string& func_name,
          const std::vector<DimArg>& dim_args,
          const Reducer& reducer,
-         Tensor* buffer,
+         Tensor buffer,
          const std::vector<DimArg>& reduce_args) {
         return Reduce(func_name, dim_args, reducer, buffer, reduce_args);
       },
@@ -380,7 +380,7 @@ void initTensorExprBindings(PyObject* module) {
       .def(py::init(&ExternalCall::make));
 
   py::class_<LoopNest>(te, "LoopNest")
-      .def(py::init<const std::vector<Tensor*>&>())
+      .def(py::init<const std::vector<Tensor>&>())
       .def(py::init([](StmtPtr s, const std::vector<BufHandle>& bufs) {
         std::unordered_set<BufPtr> buf_nodes;
         for (auto& buf : bufs) {
@@ -392,9 +392,7 @@ void initTensorExprBindings(PyObject* module) {
       .def("prepare_for_codegen", &LoopNest::prepareForCodegen)
       .def(
           "get_loop_body_for",
-          [](const LoopNest& self, Tensor* t) {
-            return self.getLoopBodyFor(t);
-          },
+          [](const LoopNest& self, Tensor t) { return self.getLoopBodyFor(t); },
           py::return_value_policy::reference)
       .def(
           "get_loop_body_for",
@@ -404,7 +402,7 @@ void initTensorExprBindings(PyObject* module) {
           py::return_value_policy::reference)
       .def(
           "get_loops_for",
-          [](const LoopNest& self, Tensor* t) {
+          [](const LoopNest& self, Tensor t) {
             return self.getLoopStmtsFor(t);
           },
           py::return_value_policy::reference)
@@ -760,12 +758,12 @@ void initTensorExprBindings(PyObject* module) {
 
   py::class_<CodeGen::BufferArg>(te, "BufferArg")
       .def(py::init<const Placeholder&>())
-      .def(py::init<Tensor*>())
+      .def(py::init<Tensor>())
       .def(py::init<const VarHandle&>())
       .def(py::init<const BufHandle&>());
 
   py::implicitly_convertible<Placeholder, CodeGen::BufferArg>();
-  py::implicitly_convertible<Tensor*, CodeGen::BufferArg>();
+  py::implicitly_convertible<Tensor, CodeGen::BufferArg>();
   py::implicitly_convertible<VarHandle, CodeGen::BufferArg>();
   py::implicitly_convertible<BufHandle, CodeGen::BufferArg>();
 

From f0d274294d48c7979b47ad0a0257b978d739936d Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Tue, 24 Aug 2021 00:29:22 -0700
Subject: [PATCH 160/530] [TensorExpr] Nuke KernelArena and KernelScope.
 (#63587)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63587

Now that there is no classes using KernelArena for memory management we
can remove it.

Differential Revision:
D30429115
D30429115

Test Plan: Imported from OSS

Reviewed By: navahgar

Pulled By: ZolotukhinM

fbshipit-source-id: 375f6f9294d27790645eeb7cb5a8e87047a57544
---
 benchmarks/cpp/tensorexpr/bench_approx.cpp    |   8 -
 benchmarks/cpp/tensorexpr/bench_batchnorm.cpp |   2 -
 benchmarks/cpp/tensorexpr/bench_compile.cpp   |   2 -
 benchmarks/cpp/tensorexpr/bench_concat.cpp    |   2 -
 benchmarks/cpp/tensorexpr/bench_gemm.cpp      |   5 -
 benchmarks/cpp/tensorexpr/bench_parallel.cpp  |   1 -
 benchmarks/cpp/tensorexpr/bench_reduce.cpp    |   7 -
 test/cpp/tensorexpr/test_approx.cpp           |   1 -
 test/cpp/tensorexpr/test_aten.cpp             |  35 ---
 test/cpp/tensorexpr/test_boundsinference.cpp  |  41 ---
 test/cpp/tensorexpr/test_conv.cpp             |   5 -
 test/cpp/tensorexpr/test_cpp_codegen.cpp      |   3 -
 test/cpp/tensorexpr/test_cuda.cpp             |  27 --
 test/cpp/tensorexpr/test_expr.cpp             |  31 ---
 test/cpp/tensorexpr/test_external_calls.cpp   |  15 --
 test/cpp/tensorexpr/test_graph_opt.cpp        |   7 -
 test/cpp/tensorexpr/test_ir_printer.cpp       |   4 -
 test/cpp/tensorexpr/test_ir_verifier.cpp      |   8 -
 test/cpp/tensorexpr/test_kernel.cpp           |  33 ---
 test/cpp/tensorexpr/test_llvm.cpp             |  83 ------
 test/cpp/tensorexpr/test_loopnest.cpp         | 254 ------------------
 test/cpp/tensorexpr/test_memdependency.cpp    |  40 ---
 test/cpp/tensorexpr/test_ops.cpp              |   2 -
 test/cpp/tensorexpr/test_reductions.cpp       |  83 ------
 test/cpp/tensorexpr/test_registerizer.cpp     |  70 -----
 test/cpp/tensorexpr/test_simplify.cpp         | 114 --------
 test/cpp/tensorexpr/test_te_fuser_pass.cpp    |   1 -
 test/cpp/tensorexpr/test_type.cpp             |  22 --
 test/cpp/tensorexpr/tutorial.cpp              |  13 -
 test/test_tensorexpr_pybind.py                | 154 +++++------
 tools/build_variables.bzl                     |   1 -
 torch/csrc/jit/runtime/static/ops.cpp         |   4 -
 torch/csrc/jit/tensorexpr/expr.h              |   1 -
 torch/csrc/jit/tensorexpr/kernel.cpp          |   5 -
 torch/csrc/jit/tensorexpr/kernel.h            |   1 -
 torch/csrc/jit/tensorexpr/mem_arena.cpp       |  67 -----
 torch/csrc/jit/tensorexpr/mem_arena.h         |  60 -----
 torch/csrc/jit/tensorexpr/tensorexpr_init.cpp |   1 -
 38 files changed, 70 insertions(+), 1143 deletions(-)
 delete mode 100644 torch/csrc/jit/tensorexpr/mem_arena.cpp
 delete mode 100644 torch/csrc/jit/tensorexpr/mem_arena.h

diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp
index 92c26401f1617..425d19faabc30 100644
--- a/benchmarks/cpp/tensorexpr/bench_approx.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_approx.cpp
@@ -29,7 +29,6 @@ void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) {
 }
 
 static void relu_nnc(benchmark::State& state) {
-  KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   auto clamp = 0;
@@ -64,7 +63,6 @@ static void relu_nnc(benchmark::State& state) {
 }
 
 static void log_nnc_sleef(benchmark::State& state) {
-  KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   torch::jit::tensorexpr::Tensor B =
@@ -94,7 +92,6 @@ static void log_nnc_sleef(benchmark::State& state) {
 }
 
 static void log_nnc_fast(benchmark::State& state) {
-  KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   torch::jit::tensorexpr::Tensor B =
@@ -124,7 +121,6 @@ static void log_nnc_fast(benchmark::State& state) {
 }
 
 static void log_nnc_vml(benchmark::State& state) {
-  KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   torch::jit::tensorexpr::Tensor B =
@@ -164,7 +160,6 @@ static void log_aten(benchmark::State& state) {
 }
 
 static void logit_nnc_sleef(benchmark::State& state) {
-  KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   auto clamp = 1e-6f;
@@ -201,7 +196,6 @@ static void logit_nnc_sleef(benchmark::State& state) {
 }
 
 static void logit_nnc_fast(benchmark::State& state) {
-  KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   auto clamp = 1e-6f;
@@ -238,7 +232,6 @@ static void logit_nnc_fast(benchmark::State& state) {
 }
 
 static void logit_nnc_vml(benchmark::State& state) {
-  KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   auto clamp = 1e-6f;
@@ -316,7 +309,6 @@ static void logit_caffe2(benchmark::State& state) {
 }
 
 static void tanh_nnc_fast(benchmark::State& state) {
-  KernelScope ks;
   auto N = VarHandle("N", kInt);
   Placeholder A("A", kFloat, {N});
   torch::jit::tensorexpr::Tensor B =
diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
index 85bf9d326ffac..702ed1cf3ab9d 100644
--- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
@@ -74,7 +74,6 @@ BENCHMARK_DEFINE_F(BatchNorm, ATen)(benchmark::State& state) {
 }
 
 BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) {
-  KernelScope ks;
 
   Placeholder input("input", kFloat, {N_, C_, H_, W_});
   Placeholder weight("weight", kFloat, {C_});
@@ -137,7 +136,6 @@ BENCHMARK_DEFINE_F(BatchNorm, ATenRelu)(benchmark::State& state) {
 }
 
 BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) {
-  KernelScope ks;
 
   Placeholder input("input", kFloat, {N_, C_, H_, W_});
   Placeholder weight("weight", kFloat, {C_});
diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp
index 50d54e57f889a..f204377ab8126 100644
--- a/benchmarks/cpp/tensorexpr/bench_compile.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp
@@ -10,7 +10,6 @@ namespace te = torch::jit::tensorexpr;
 static void BM_CompileSwish(benchmark::State& state) {
   for (auto _ : state) {
     constexpr int N = 512;
-    te::KernelScope ks;
     te::VarHandle n("n", te::kInt);
     te::Placeholder A(te::BufHandle("A", {N}, te::kFloat));
     te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
@@ -40,7 +39,6 @@ static void BM_CompileSwish(benchmark::State& state) {
 
 static void BM_CompileSwishLLVMOnly(benchmark::State& state) {
   constexpr int N = 512;
-  te::KernelScope ks;
   te::VarHandle n("n", te::kInt);
   te::Placeholder A(te::BufHandle("A", {N}, te::kFloat));
   te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp
index 856065d6e789f..c108c867acbf4 100644
--- a/benchmarks/cpp/tensorexpr/bench_concat.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp
@@ -47,7 +47,6 @@ class ConcatBench : public benchmark::Fixture {
   }
 
   void runNNC(benchmark::State& state) {
-    KernelScope ks;
 
     size_t num_inputs = inputs_.size();
     size_t num_dims = 2;
@@ -101,7 +100,6 @@ class ConcatBench : public benchmark::Fixture {
   }
 
   void runNNCLoop(benchmark::State& state) {
-    KernelScope ks;
 
     size_t num_inputs = inputs_.size();
     size_t num_dims = 2;
diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp
index 8646e97b756c2..ec13b09025eea 100644
--- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp
@@ -40,7 +40,6 @@ BENCHMARK_DEFINE_F(Gemm, Torch)(benchmark::State& state) {
 }
 
 BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
-  te::KernelScope ks;
 
   te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
   te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
@@ -64,7 +63,6 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
 }
 
 BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
-  te::KernelScope ks;
 
   te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
   te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
@@ -124,7 +122,6 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
 }
 
 BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
-  te::KernelScope ks;
 
   te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
   te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
@@ -184,7 +181,6 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
 }
 
 BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
-  te::KernelScope ks;
 
   te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
   te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
@@ -252,7 +248,6 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
 }
 
 BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
-  te::KernelScope ks;
 
   te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
   te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp
index 847b66d9ee58b..178a8795edd03 100644
--- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp
@@ -35,7 +35,6 @@ class ParallelAdd : public benchmark::Fixture {
 };
 
 BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
-  KernelScope kernel_scope;
   Placeholder a_buf("a", kFloat, {M});
   Placeholder b_buf("b", kFloat, {M});
   Tensor c_tensor = Compute(
diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
index 9d3570197414d..e053317feca60 100644
--- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
@@ -217,7 +217,6 @@ BENCHMARK_REGISTER_F(Reduce1D, NativeTiled)->Args({1 << 24});
 #endif // USE_AVX2
 
 BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
-  te::KernelScope ks;
 
   int M = A.numel();
 
@@ -250,7 +249,6 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
 BENCHMARK_REGISTER_F(Reduce1D, TeNaive)->Args({1 << 24});
 
 BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
-  te::KernelScope ks;
 
   int M = A.numel();
 
@@ -291,7 +289,6 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
 BENCHMARK_REGISTER_F(Reduce1D, TeSplitTail)->Args({1 << 24});
 
 BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
-  te::KernelScope ks;
 
   int M = A.numel();
 
@@ -332,7 +329,6 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
 BENCHMARK_REGISTER_F(Reduce1D, TeSplitMask)->Args({1 << 24});
 
 BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
-  te::KernelScope ks;
 
   int M = A.numel();
   const int kChunkSize = 8;
@@ -385,7 +381,6 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
 BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV1)->Args({1 << 24});
 
 BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) {
-  te::KernelScope ks;
   const int M = A.numel();
   const int kChunkSize = 8;
 
@@ -450,7 +445,6 @@ BENCHMARK_REGISTER_F(Reduce2DCol, Torch)
 ->Args({1 << 12, 1 << 12});
 
 BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) {
-  te::KernelScope ks;
   constexpr int kCacheSize = 1 << 12;
   te::Placeholder a("A", te::kFloat, {M, N});
   te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
@@ -557,7 +551,6 @@ BENCHMARK_REGISTER_F(Reduce2DRow, Hand)
 ->Args({1 << 18, 1 << 6});
 
 BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) {
-  te::KernelScope ks;
   constexpr int kChunkSize = 8;
   te::Placeholder a("A", te::kFloat, {M, N});
   te::Tensor b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat);
diff --git a/test/cpp/tensorexpr/test_approx.cpp b/test/cpp/tensorexpr/test_approx.cpp
index 2a4ce9485acde..8de395fe92796 100644
--- a/test/cpp/tensorexpr/test_approx.cpp
+++ b/test/cpp/tensorexpr/test_approx.cpp
@@ -30,7 +30,6 @@ std::string diffs(const at::Tensor& a, const at::Tensor& b) {
 }
 
 TEST(Approx, log_vml) {
-  te::KernelScope ks;
   te::VarHandle N("N", te::kInt);
   te::Placeholder A("A", te::kFloat, {N});
   te::Tensor B = te::Compute(
diff --git a/test/cpp/tensorexpr/test_aten.cpp b/test/cpp/tensorexpr/test_aten.cpp
index 9eb141250cb35..040b7b0a920fb 100644
--- a/test/cpp/tensorexpr/test_aten.cpp
+++ b/test/cpp/tensorexpr/test_aten.cpp
@@ -15,7 +15,6 @@ namespace jit {
 using namespace torch::jit::tensorexpr;
 
 TEST(ATen, _cast_Float) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -43,7 +42,6 @@ TEST(ATen, _cast_Float) {
 }
 
 TEST(ATen, negInt) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt));
@@ -71,7 +69,6 @@ TEST(ATen, negInt) {
 }
 
 TEST(ATen, negFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -99,7 +96,6 @@ TEST(ATen, negFloat) {
 }
 
 TEST(ATen, addInt) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt));
@@ -136,7 +132,6 @@ TEST(ATen, addInt) {
 }
 
 TEST(ATen, addFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -173,7 +168,6 @@ TEST(ATen, addFloat) {
 }
 
 TEST(ATen, subInt) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt));
@@ -210,7 +204,6 @@ TEST(ATen, subInt) {
 }
 
 TEST(ATen, subFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -247,7 +240,6 @@ TEST(ATen, subFloat) {
 }
 
 TEST(ATen, lerp) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -284,7 +276,6 @@ TEST(ATen, lerp) {
 }
 
 TEST(ATen, addcmulInt) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt));
@@ -326,7 +317,6 @@ TEST(ATen, addcmulInt) {
 }
 
 TEST(ATen, addcmulFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -368,7 +358,6 @@ TEST(ATen, addcmulFloat) {
 }
 
 TEST(ATen, mulInt) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt));
@@ -400,7 +389,6 @@ TEST(ATen, mulInt) {
 }
 
 TEST(ATen, mulFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -432,7 +420,6 @@ TEST(ATen, mulFloat) {
 }
 
 TEST(ATen, divInt) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt));
@@ -464,7 +451,6 @@ TEST(ATen, divInt) {
 }
 
 TEST(ATen, divFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -496,7 +482,6 @@ TEST(ATen, divFloat) {
 }
 
 TEST(ATen, maxInt) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt));
@@ -528,7 +513,6 @@ TEST(ATen, maxInt) {
 }
 
 TEST(ATen, maxFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -560,7 +544,6 @@ TEST(ATen, maxFloat) {
 }
 
 TEST(ATen, minInt) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt));
@@ -592,7 +575,6 @@ TEST(ATen, minInt) {
 }
 
 TEST(ATen, minFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -624,7 +606,6 @@ TEST(ATen, minFloat) {
 }
 
 void __ubsan_ignore_float_divide_by_zero__ testATenreciprocal() {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -651,7 +632,6 @@ void __ubsan_ignore_float_divide_by_zero__ testATenreciprocal() {
 }
 
 TEST(ATen, reluInt) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt));
@@ -678,7 +658,6 @@ TEST(ATen, reluInt) {
 }
 
 TEST(ATen, reluFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -707,7 +686,6 @@ TEST(ATen, reluFloat) {
 }
 
 TEST(ATen, logFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -734,7 +712,6 @@ TEST(ATen, logFloat) {
 }
 
 TEST(ATen, fastLogFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -766,7 +743,6 @@ TEST(ATen, fastLogFloat) {
 }
 
 TEST(ATen, fastTanhFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -798,7 +774,6 @@ TEST(ATen, fastTanhFloat) {
 }
 
 TEST(ATen, fastSigmoidFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -831,7 +806,6 @@ TEST(ATen, fastSigmoidFloat) {
 }
 
 TEST(ATen, log10Float) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -858,7 +832,6 @@ TEST(ATen, log10Float) {
 }
 
 TEST(ATen, log2Float) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -885,7 +858,6 @@ TEST(ATen, log2Float) {
 }
 
 TEST(ATen, expFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -913,7 +885,6 @@ TEST(ATen, expFloat) {
 }
 
 TEST(ATen, erfFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -941,7 +912,6 @@ TEST(ATen, erfFloat) {
 }
 
 TEST(ATen, cosFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -969,7 +939,6 @@ TEST(ATen, cosFloat) {
 }
 
 TEST(ATen, eqInt) {
-  KernelScope kernel_scope;
   constexpr int N = 128;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -995,7 +964,6 @@ TEST(ATen, eqInt) {
 }
 
 TEST(ATen, geInt) {
-  KernelScope kernel_scope;
   constexpr int N = 128;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -1021,7 +989,6 @@ TEST(ATen, geInt) {
 }
 
 TEST(ATen, gtInt) {
-  KernelScope kernel_scope;
   constexpr int N = 128;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -1047,7 +1014,6 @@ TEST(ATen, gtInt) {
 }
 
 TEST(ATen, leInt) {
-  KernelScope kernel_scope;
   constexpr int N = 128;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -1073,7 +1039,6 @@ TEST(ATen, leInt) {
 }
 
 TEST(ATen, ltInt) {
-  KernelScope kernel_scope;
   constexpr int N = 128;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp
index d038665fad75b..2eb0dfb997da8 100644
--- a/test/cpp/tensorexpr/test_boundsinference.cpp
+++ b/test/cpp/tensorexpr/test_boundsinference.cpp
@@ -46,7 +46,6 @@ TEST(BoundsInference, _1) {
   //   b[i] = a[i]
   // For this loop bounds inference should yield the following:
   // {{b, kStore, 0, 99}, {a, kLoad, 0, 99}}
-  KernelScope kernel_scope;
   ExprHandle n(100);
   Placeholder a(BufHandle("a", {n}, kFloat));
   Tensor b =
@@ -71,7 +70,6 @@ TEST(BoundsInference, _2) {
   //   b[i] = a[i]
   // For this loop bounds inference should yield the following:
   // {{b, kStore, 0, n-1}, {a, kLoad, 0, n-1}}
-  KernelScope kernel_scope;
   VarHandle n("n", kInt);
   Placeholder a(BufHandle("a", {n}, kFloat));
   Tensor b =
@@ -96,7 +94,6 @@ TEST(BoundsInference, _3) {
   //   b[i] = a[i] * a[i+10]
   // For this loop bounds inference should yield the following:
   // {{b, kStore, 0, 99}, {a, kLoad, 0, 109}}
-  KernelScope kernel_scope;
   ExprHandle n(100);
   Placeholder a(BufHandle("a", {n + 10}, kFloat));
   Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) {
@@ -125,7 +122,6 @@ TEST(BoundsInference, _4) {
   // for y in 0..200:
   //   for x in 0..320:
   //     c[y,x] = a[y,x] * b[y,x]
-  KernelScope kernel_scope;
   ExprHandle W(320);
   ExprHandle H(200);
   Placeholder a(BufHandle("a", {H, W}, kFloat));
@@ -205,7 +201,6 @@ TEST(BoundsInference, _5) {
   //     b[i_outer * 16 + i_inner] = a[i_outer * 16 + i_inner]
   // for i_tail in 0..100%16:
   //   b[i_tail + (100/16)*16] = a[i_tail + (100/16)*16];
-  KernelScope kernel_scope;
   ExprHandle n(100);
   Placeholder a(BufHandle("a", {n}, kFloat));
   Tensor b =
@@ -257,7 +252,6 @@ TEST(BoundsInference, _6) {
   // for y in 0..20:
   //   for x in 0..32:
   //     c[y,x] = a[y+100,x+100] * b[y*2,x*5]
-  KernelScope kernel_scope;
   ExprHandle W(320);
   ExprHandle H(200);
   ExprHandle CW(32);
@@ -328,7 +322,6 @@ TEST(BoundsInference, _6) {
 }
 
 TEST(BoundsInference, Adjacent) {
-  KernelScope kernel_scope;
   ExprHandle H(6);
   Placeholder a(BufHandle("a", {20}, kFloat));
   Tensor b =
@@ -388,7 +381,6 @@ TEST(BoundsInference, Adjacent) {
 }
 
 TEST(BoundsInference, MultipleTopLoopLoad) {
-  KernelScope kernel_scope;
   Placeholder a(BufHandle("a", {100}, kFloat));
   Tensor b =
       Compute("b", {{64, "x"}}, [&](const VarHandle& x) { return a.load(x); });
@@ -444,7 +436,6 @@ TEST(BoundsInference, MultipleTopLoopLoad) {
 }
 
 TEST(BoundsInference, MultipleTopLoopStore) {
-  KernelScope kernel_scope;
   BufHandle a("a", {100}, kFloat);
   BufHandle b("b", {100}, kFloat);
   BufHandle c("c", {100}, kFloat);
@@ -504,8 +495,6 @@ TEST(BoundsInference, MultipleTopLoopStore) {
 }
 
 TEST(BoundsInference, CacheReads) {
-  KernelScope kernel_scope;
-
   Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
@@ -570,7 +559,6 @@ TEST(BoundsInference, CacheReads) {
 }
 
 TEST(BoundsInference, Flattened) {
-  KernelScope kernel_scope;
   Tensor b = Compute(
       "b",
       {{3, "z"}, {4, "y"}, {5, "x"}},
@@ -597,7 +585,6 @@ TEST(BoundsInference, Flattened) {
 }
 
 TEST(BoundsInference, GetPotentialHazards) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -649,8 +636,6 @@ TEST(BoundsInference, GetPotentialHazards) {
 }
 
 TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) {
-  KernelScope kernel_scope;
-
   Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
@@ -677,8 +662,6 @@ TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) {
 }
 
 TEST(BoundsInference, GetPotentialHazardsLoopCall) {
-  KernelScope kernel_scope;
-
   Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
@@ -704,8 +687,6 @@ TEST(BoundsInference, GetPotentialHazardsLoopCall) {
 }
 
 TEST(BoundsInference, GetPotentialHazardsLoopSplit) {
-  KernelScope kernel_scope;
-
   Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
@@ -731,8 +712,6 @@ TEST(BoundsInference, GetPotentialHazardsLoopSplit) {
 }
 
 TEST(BoundsInference, HasConflictingOverlapSameBufferWithPartialOverlap) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 10; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -755,8 +734,6 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferWithPartialOverlap) {
 }
 
 TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlap) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 10; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -778,8 +755,6 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlap) {
 }
 
 TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlapRAW) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 10; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -803,8 +778,6 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlapRAW) {
 }
 
 TEST(BoundsInference, HasConflictingOverlapSameBufferNotOverlapping) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 10; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -827,8 +800,6 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferNotOverlapping) {
 }
 
 TEST(BoundsInference, HasConflictingOverlap2DBufferWithOverlap) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 100; j++) {
@@ -868,8 +839,6 @@ TEST(BoundsInference, HasConflictingOverlap2DBufferWithOverlap) {
 }
 
 TEST(BoundsInference, HasConflictingOverlap2DBufferWithNoOverlap) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 100; j++) {
@@ -909,8 +878,6 @@ TEST(BoundsInference, HasConflictingOverlap2DBufferWithNoOverlap) {
 }
 
 TEST(BoundsInference, HasConflictingOverlapDifferentBuffers) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 100; j++) {
@@ -949,8 +916,6 @@ TEST(BoundsInference, HasConflictingOverlapDifferentBuffers) {
 }
 
 TEST(BoundsInference, HasConflictingOverlapDueToRAWDependence) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 0; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -978,8 +943,6 @@ TEST(BoundsInference, HasConflictingOverlapDueToRAWDependence) {
 }
 
 TEST(BoundsInference, HasConflictingOverlapDueToWARDependence) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int k = 0; k < 100; k++) {
   //     B[k] = 20 * A[99-k];
@@ -1007,8 +970,6 @@ TEST(BoundsInference, HasConflictingOverlapDueToWARDependence) {
 }
 
 TEST(BoundsInference, HasConflictingOverlapWithLoads) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int k = 10; k < 100; k++) {
   //     B[k] = 20 * A[99-k];
@@ -1041,8 +1002,6 @@ TEST(BoundsInference, HasConflictingOverlapWithLoads) {
 }
 
 TEST(BoundsInference, IsOverlapping) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 100; i++) {
   //     A[i] = i * 10;               // storeA1
diff --git a/test/cpp/tensorexpr/test_conv.cpp b/test/cpp/tensorexpr/test_conv.cpp
index 293fbe248f176..19372779094a6 100644
--- a/test/cpp/tensorexpr/test_conv.cpp
+++ b/test/cpp/tensorexpr/test_conv.cpp
@@ -21,7 +21,6 @@ static at::Tensor genTestData(c10::IntArrayRef args) {
 #ifdef TORCH_ENABLE_LLVM
 
 TEST(Conv, DepthwiseConv2D) {
-  te::KernelScope kernel_scope;
   constexpr int N = 1, C = 72, H = 56, W = 56;
   constexpr int K = 72, R = 3, S = 3;
   constexpr int kPad = 1, kStride = 2, kGroups = C;
@@ -53,7 +52,6 @@ TEST(Conv, DepthwiseConv2D) {
 }
 
 TEST(Conv, DepthwiseConv2DNoBias) {
-  te::KernelScope kernel_scope;
   constexpr int N = 1, C = 72, H = 56, W = 56;
   constexpr int K = 72, R = 3, S = 3;
   constexpr int kPad = 1, kStride = 2, kGroups = C;
@@ -80,7 +78,6 @@ TEST(Conv, DepthwiseConv2DNoBias) {
 }
 
 TEST(Conv, DepthwiseConv2DDynamicShapes) {
-  te::KernelScope kernel_scope;
   te::VarHandle N_var("N", te::kInt);
   te::VarHandle C_var("C", te::kInt);
   te::VarHandle H_var("H", te::kInt);
@@ -164,8 +161,6 @@ TEST(Conv, DepthwiseConv2DDynamicShapes) {
 #endif
 
 TEST(Conv, Conv2D) {
-  te::KernelScope kernel_scope;
-
   // Input dimensions.
   constexpr int N = 1;
   constexpr int C = 3;
diff --git a/test/cpp/tensorexpr/test_cpp_codegen.cpp b/test/cpp/tensorexpr/test_cpp_codegen.cpp
index 82ea40d995f29..df9166b675859 100644
--- a/test/cpp/tensorexpr/test_cpp_codegen.cpp
+++ b/test/cpp/tensorexpr/test_cpp_codegen.cpp
@@ -3,7 +3,6 @@
 #include <test/cpp/tensorexpr/test_base.h>
 
 #include <torch/csrc/jit/tensorexpr/cpp_codegen.h>
-#include <torch/csrc/jit/tensorexpr/mem_arena.h>
 #include <torch/csrc/jit/tensorexpr/stmt.h>
 #include <torch/csrc/jit/testing/file_check.h>
 
@@ -13,7 +12,6 @@ namespace jit {
 using namespace torch::jit::tensorexpr;
 
 TEST(CppPrinter, AllocateOnStackThenFree) {
-  KernelScope kernel_scope;
   std::vector<ExprPtr> dims = {alloc<IntImm>(2), alloc<IntImm>(3)};
   BufPtr buf = alloc<Buf>("x", dims, kInt);
   AllocatePtr alloc_ = alloc<Allocate>(buf);
@@ -32,7 +30,6 @@ TEST(CppPrinter, AllocateOnStackThenFree) {
 }
 
 TEST(CppPrinter, AllocateOnHeapThenFree) {
-  KernelScope kernel_scope;
   std::vector<ExprPtr> dims = {
       alloc<IntImm>(20), alloc<IntImm>(50), alloc<IntImm>(3)};
   BufPtr buf = alloc<Buf>("y", dims, kLong);
diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp
index ed5c070ea8689..164ff772d5b46 100644
--- a/test/cpp/tensorexpr/test_cuda.cpp
+++ b/test/cpp/tensorexpr/test_cuda.cpp
@@ -27,7 +27,6 @@ using namespace torch::jit::tensorexpr;
 
 template <typename ctype>
 static void testCudaTestVectorAdd01_impl() {
-  KernelScope kernel_scope;
   const int num_iter = 3;
   const int block_count = 16;
   const int block_size = 128;
@@ -93,7 +92,6 @@ float sigmoid(float x) {
 }
 
 TEST(Cuda, Sigmoid_CUDA) {
-  KernelScope kernel_scope;
   const int num_iter = 3;
   const int block_count = 16;
   const int block_size = 128;
@@ -162,7 +160,6 @@ TEST(Cuda, TestVectorAdd01_CUDA) {
 }
 
 static void testCudaTestVectorAdd02_impl(int N, int block_size) {
-  KernelScope kernel_scope;
   Placeholder a_buf("a", kFloat, {N});
   Placeholder b_buf("b", kFloat, {N});
   Tensor c = Compute(
@@ -222,7 +219,6 @@ TEST(Cuda, TestVectorAdd02_CUDA) {
 }
 
 TEST(Cuda, HalfCast_CUDA) {
-  KernelScope ks;
   auto half = ToDtype<at::Half>();
   Placeholder a("a", half, {4});
   Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) {
@@ -261,7 +257,6 @@ TEST(Cuda, HalfCast_CUDA) {
 }
 
 TEST(Cuda, DynamicShape2D_CUDA) {
-  KernelScope kernel_scope;
   auto testWithSize = [](int32_t M, int32_t N) {
     VarHandle m("m", kInt);
     VarHandle n("n", kInt);
@@ -324,7 +319,6 @@ TEST(Cuda, DynamicShape2D_CUDA) {
 }
 
 TEST(Cuda, TestRand01_CUDA) {
-  KernelScope kernel_scope;
   const int num_iter = 3;
   const int block_count = 16;
   const int block_size = 128;
@@ -383,7 +377,6 @@ TEST(Cuda, TestRand01_CUDA) {
 }
 
 TEST(Cuda, DynamicShapeSplit_CUDA) {
-  KernelScope ks;
   constexpr int N = 4096;
   VarHandle n("n", kInt);
   Placeholder a(BufHandle("a", {n}, kFloat));
@@ -434,7 +427,6 @@ TEST(Cuda, DynamicShapeSplit_CUDA) {
 
 TEST(Cuda, OneBlockOneThreadGlobalReduce1_CUDA) {
   const static int N = 1024;
-  KernelScope kernel_scope;
   Placeholder data_buf("data", kFloat, {N});
   Placeholder output_buf("output", kFloat, {1});
 
@@ -501,7 +493,6 @@ TEST(Cuda, OneBlockOneThreadGlobalReduce1_CUDA) {
 
 TEST(Cuda, OneBlockMultiThreadGlobalReduce1_CUDA) {
   const static int N = 1024;
-  KernelScope kernel_scope;
 
   // This test does the following reduction:
   // clang-format off
@@ -578,8 +569,6 @@ TEST(Cuda, OneBlockMultiThreadGlobalReduce1_CUDA) {
 }
 
 TEST(Cuda, NoThreadIdxWrite_1_CUDA) {
-  KernelScope kernel_scope;
-
   // This test does the following reduction:
   //
   // for k in 0..1: // block-idx
@@ -676,7 +665,6 @@ TEST(Cuda, NoThreadIdxWrite_1_CUDA) {
 
 TEST(Cuda, SharedMemReduce_1_CUDA) {
   // FIXME: this test is flaky in CI.
-  KernelScope kernel_scope;
   // This test does the following:
   //  for k in 0..1:  // block-idx
   //    alloc(c, 64)
@@ -814,7 +802,6 @@ TEST(Cuda, SharedMemReduce_1_CUDA) {
 }
 
 TEST(Cuda, LocalMemReduce_1_CUDA) {
-  KernelScope kernel_scope;
   // This test does the following:
   //  for k in 0..1:  // block-idx
   //    b(k) = 0
@@ -925,7 +912,6 @@ TEST(Cuda, LocalMemReduce_1_CUDA) {
 }
 
 TEST(Cuda, HalfSupport_CUDA) {
-  KernelScope ks;
   auto half = ToDtype<at::Half>();
   Placeholder a("a", half, {4});
   Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) {
@@ -983,7 +969,6 @@ TEST(Cuda, HalfSupport_CUDA) {
 }
 
 TEST(Cuda, HalfPropagation_CUDA) {
-  KernelScope kernel_scope;
   auto half = ToDtype<at::Half>();
   Placeholder a("a", half, {4});
   Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) {
@@ -1032,7 +1017,6 @@ TEST(Cuda, HalfPropagation_CUDA) {
 }
 
 TEST(Cuda, UnusedHalfArgument_CUDA) {
-  KernelScope kernel_scope;
   Placeholder a("a", kFloat, {4});
   auto half = ToDtype<at::Half>();
   Placeholder b("b", half, {4});
@@ -1089,7 +1073,6 @@ TEST(Cuda, UnusedHalfArgument_CUDA) {
 }
 
 TEST(Cuda, PrioritizeDependents_CUDA) {
-  KernelScope kernel_scope;
   Placeholder a("a", kFloat, {10});
   Placeholder b("b", kFloat, {12});
   Placeholder c("c", kFloat, {12});
@@ -1163,7 +1146,6 @@ TEST(Cuda, PrioritizeDependents_CUDA) {
 /// Tests the case where there are two loops which have different extents bound
 /// to the same block dimension. We must mask the smaller extent loop body.
 TEST(Cuda, MaskBlockDim_CUDA) {
-  KernelScope kernel_scope;
   int A_SIZE = 100;
   int B_SIZE = 50;
   Placeholder a_buf("a", kFloat, {A_SIZE});
@@ -1256,7 +1238,6 @@ TEST(Cuda, MaskBlockDim_CUDA) {
 /// to the same thread dimension. This is the same as the above - the smaller
 /// rank write should be masked. But this time we also need to syncthreads.
 TEST(Cuda, MaskThreadDim_CUDA) {
-  KernelScope kernel_scope;
   int A_SIZE = 50;
   int B_SIZE = 100;
   Placeholder a_buf("a", kFloat, {A_SIZE});
@@ -1351,7 +1332,6 @@ TEST(Cuda, MaskThreadDim_CUDA) {
 // Note: this is an extremely dumb pattern which we should never see, but is a
 // useful edge case to make sure we've got things covered.
 TEST(Cuda, MaskMultiBlockDim_CUDA) {
-  KernelScope kernel_scope;
   int A_SIZE = 100;
   int B_SIZE = 50;
   Placeholder a_buf("a", kFloat, {A_SIZE});
@@ -1445,7 +1425,6 @@ TEST(Cuda, MaskMultiBlockDim_CUDA) {
 // Note: this is an extremely dumb pattern which we should never see, but is a
 // useful edge case to make sure we've got things covered.
 TEST(Cuda, MaskBlockAndThreadDim_CUDA) {
-  KernelScope kernel_scope;
   int A_SIZE = 100;
   int B_SIZE = 50;
   Placeholder a_buf("a", kFloat, {A_SIZE});
@@ -1537,7 +1516,6 @@ TEST(Cuda, MaskBlockAndThreadDim_CUDA) {
 /// outer loop bound to blockDim.x and the inner loop bound to threadDim.x. In
 /// this case all writes with a rank smaller than the max should be masked.
 TEST(Cuda, MaskMultiDim_CUDA) {
-  KernelScope kernel_scope;
   int OUTER_SIZE = 10;
   int A_SIZE = 100;
   int B_SIZE = 50;
@@ -1667,7 +1645,6 @@ TEST(Cuda, MaskMultiDim_CUDA) {
 // In this case both stores must be masked against the extent of the other loop,
 // incase it is larger.
 TEST(Cuda, MaskMultiDimSymbolic_CUDA) {
-  KernelScope kernel_scope;
   VarHandle OUTER_SIZE("OUTER_SIZE", kInt);
   VarHandle A_SIZE("A_SIZE", kInt);
   VarHandle B_SIZE("B_SIZE", kInt);
@@ -1803,7 +1780,6 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) {
 // extents but are bound to the same thread dimension. The smaller loop should
 // be masked.
 TEST(Cuda, MaskCompoundInnerLoop_CUDA) {
-  KernelScope kernel_scope;
   int OUTER_SIZE = 10;
   int A_SIZE = 100;
   int B_SIZE = 50;
@@ -1942,7 +1918,6 @@ TEST(Cuda, MaskCompoundInnerLoop_CUDA) {
 // the first thread dimensions. This should work just like the MaskThreadDim
 // test where the bigger loop is unmasked but the smaller is masked.
 TEST(Cuda, MaskInnerLoopOneBlock_CUDA) {
-  KernelScope kernel_scope;
   int OUTER_SIZE = 10;
   int A_SIZE = 100;
   int B_SIZE = 50;
@@ -2081,7 +2056,6 @@ TEST(Cuda, MaskInnerLoopOneBlock_CUDA) {
 // this case both bodies must be masked against the other dimension being > 0.
 // Note: this is a bit degenerate no one would actually write this for perf.
 TEST(Cuda, MaskMultiDimMultiAxis_CUDA) {
-  KernelScope kernel_scope;
   int OUTER_SIZE = 10;
   int A_SIZE = 30;
   int B_SIZE = 15;
@@ -2211,7 +2185,6 @@ TEST(Cuda, MaskMultiDimMultiAxis_CUDA) {
 // the second loop is smaller in both cases - the second store must be masked
 // for both the block and thread dimension.
 TEST(Cuda, MaskMultiDimMultiLevel_CUDA) {
-  KernelScope kernel_scope;
   int OUTER_A_SIZE = 10;
   int OUTER_B_SIZE = 5;
   int A_SIZE = 30;
diff --git a/test/cpp/tensorexpr/test_expr.cpp b/test/cpp/tensorexpr/test_expr.cpp
index 7c234fb95cdb1..d2405353e8301 100644
--- a/test/cpp/tensorexpr/test_expr.cpp
+++ b/test/cpp/tensorexpr/test_expr.cpp
@@ -24,7 +24,6 @@ using namespace torch::jit::tensorexpr;
 using SimpleIRExprEval = ExprEval<SimpleIREvaluator>;
 
 TEST(Expr, BasicValueTest) {
-  KernelScope kernel_scope;
   ExprHandle a = IntImm::make(2), b = IntImm::make(3);
   ExprHandle c = Add::make(a, b);
   SimpleIRExprEval eval(c);
@@ -32,7 +31,6 @@ TEST(Expr, BasicValueTest) {
 }
 
 TEST(Expr, BasicValueTest02) {
-  KernelScope kernel_scope;
   ExprHandle a(2.0f);
   ExprHandle b(3.0f);
   ExprHandle c(4.0f);
@@ -43,7 +41,6 @@ TEST(Expr, BasicValueTest02) {
 }
 
 TEST(Expr, LetTest01) {
-  KernelScope kernel_scope;
   VarHandle x("x", kFloat);
   ExprHandle body = ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f));
   SimpleIRExprEval eval(body);
@@ -52,7 +49,6 @@ TEST(Expr, LetTest01) {
 }
 
 TEST(Expr, LetTest02) {
-  KernelScope kernel_scope;
   VarHandle x("x", kFloat);
   VarHandle y("y", kFloat);
   ExprHandle body =
@@ -64,7 +60,6 @@ TEST(Expr, LetTest02) {
 }
 
 TEST(Expr, LetStmtTest01) {
-  KernelScope kernel_scope;
   Placeholder a_buf("a", kFloat, {1});
   Placeholder b_buf("b", kFloat, {1});
 
@@ -88,7 +83,6 @@ TEST(Expr, LetStmtTest01) {
 }
 
 TEST(Expr, IntTest) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   ExprHandle body = ExprHandle(2) + (x * ExprHandle(3) + ExprHandle(4));
   SimpleIRExprEval eval(body);
@@ -97,7 +91,6 @@ TEST(Expr, IntTest) {
 }
 
 TEST(Expr, FloatTest) {
-  KernelScope kernel_scope;
   VarHandle x("x", kFloat);
   ExprHandle body = ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f));
   SimpleIRExprEval eval(body);
@@ -106,7 +99,6 @@ TEST(Expr, FloatTest) {
 }
 
 TEST(Expr, ByteTest) {
-  KernelScope kernel_scope;
   VarHandle x("x", kByte);
   ExprHandle body = ExprHandle((uint8_t)2) +
       (x * ExprHandle((uint8_t)3) + ExprHandle((uint8_t)4));
@@ -116,7 +108,6 @@ TEST(Expr, ByteTest) {
 }
 
 TEST(Expr, CharTest) {
-  KernelScope kernel_scope;
   VarHandle x("x", kChar);
   ExprHandle body = ExprHandle((int8_t)2) +
       (x * ExprHandle((int8_t)3) + ExprHandle((int8_t)4));
@@ -126,7 +117,6 @@ TEST(Expr, CharTest) {
 }
 
 TEST(Expr, ShortTest) {
-  KernelScope kernel_scope;
   VarHandle x("x", kShort);
   ExprHandle body = ExprHandle((int16_t)2) +
       (x * ExprHandle((int16_t)3) + ExprHandle((int16_t)4));
@@ -136,7 +126,6 @@ TEST(Expr, ShortTest) {
 }
 
 TEST(Expr, LongTest) {
-  KernelScope kernel_scope;
   VarHandle x("x", kLong);
   ExprHandle body = ExprHandle((int64_t)2) +
       (x * ExprHandle((int64_t)3) + ExprHandle((int64_t)4));
@@ -146,7 +135,6 @@ TEST(Expr, LongTest) {
 }
 
 TEST(Expr, HalfTest) {
-  KernelScope kernel_scope;
   VarHandle x("x", kHalf);
   ExprHandle body = ExprHandle((at::Half)2) +
       (x * ExprHandle((at::Half)3) + ExprHandle((at::Half)4));
@@ -156,7 +144,6 @@ TEST(Expr, HalfTest) {
 }
 
 TEST(Expr, DoubleTest) {
-  KernelScope kernel_scope;
   VarHandle x("x", kDouble);
   ExprHandle body = ExprHandle((double)2) +
       (x * ExprHandle((double)3) + ExprHandle((double)4));
@@ -166,7 +153,6 @@ TEST(Expr, DoubleTest) {
 }
 
 TEST(Expr, VectorAdd01) {
-  KernelScope kernel_scope;
   const int kVectorSize = 8;
   const int kVectorCount = 128;
   const int kTotalSize = kVectorSize * kVectorCount;
@@ -212,7 +198,6 @@ TEST(Expr, VectorAdd01) {
 }
 
 TEST(Expr, CompareSelectEQ) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -251,7 +236,6 @@ TEST(Expr, CompareSelectDtypes) {
   // This test constructs a CompareSelect expression where the input dtype is
   // different from the output dtype and verifies that it works correctly:
   //   result = ((int)lhs == (int)rhs) ? (float)retval1 : (float)retval2
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -290,7 +274,6 @@ TEST(Expr, CompareSelectDtypes) {
 }
 
 TEST(Expr, IntrinsicsDtypes) {
-  KernelScope kernel_scope;
   constexpr int N = 256;
   Placeholder a(BufHandle("A", {N}, kDouble));
   Placeholder b(BufHandle("B", {N}, kDouble));
@@ -312,7 +295,6 @@ TEST(Expr, IntrinsicsDtypes) {
 }
 
 TEST(Expr, Substitute01) {
-  KernelScope kernel_scope;
   VarPtr x = alloc<Var>("x", kFloat);
   VarPtr y = alloc<Var>("y", kFloat);
   ExprPtr e =
@@ -334,7 +316,6 @@ TEST(Expr, Substitute01) {
 }
 
 TEST(Expr, Math01) {
-  KernelScope kernel_scope;
   ExprHandle v = sin(ExprHandle(1.0f));
 
   std::ostringstream oss;
@@ -348,7 +329,6 @@ TEST(Expr, Math01) {
 }
 
 TEST(Expr, UnaryMath01) {
-  KernelScope kernel_scope;
   struct TestConfig {
     std::function<ExprHandle(const ExprHandle&)> func;
     std::function<float(float)> ref_func;
@@ -416,7 +396,6 @@ TEST(Expr, UnaryMath01) {
 }
 
 TEST(Expr, BinaryMath01) {
-  KernelScope kernel_scope;
   struct TestConfig {
     std::function<ExprHandle(const ExprHandle&, const ExprHandle&)> func;
     std::function<float(float, float)> ref_func;
@@ -440,7 +419,6 @@ TEST(Expr, BinaryMath01) {
 }
 
 TEST(Expr, LogicalOps01) {
-  KernelScope kernel_scope;
   ExprHandle a(23);
   ExprHandle b(11);
   ExprHandle c(0.72f);
@@ -473,7 +451,6 @@ TEST(Expr, LogicalOps01) {
 }
 
 TEST(Expr, LogicalOps02) {
-  KernelScope kernel_scope;
   ExprHandle a(23);
   ExprHandle b(11);
   ExprHandle c(0.72f);
@@ -492,7 +469,6 @@ TEST(Expr, LogicalOps02) {
 }
 
 TEST(Expr, LogicalOps03) {
-  KernelScope kernel_scope;
   ExprHandle a(23);
   ExprHandle b(11);
   ExprHandle c(0.72f);
@@ -550,7 +526,6 @@ TEST(Expr, LogicalOps03) {
 }
 
 TEST(Expr, BitwiseOps) {
-  KernelScope kernel_scope;
   ExprHandle a(59);
   ExprHandle b(11);
   ExprHandle c(101);
@@ -562,7 +537,6 @@ TEST(Expr, BitwiseOps) {
 }
 
 TEST(Expr, DynamicShapeAdd) {
-  KernelScope kernel_scope;
   auto testWithSize = [](int32_t size) {
     VarHandle n("n", kInt);
     Placeholder a(BufHandle("a", {n}, kFloat));
@@ -582,7 +556,6 @@ TEST(Expr, DynamicShapeAdd) {
 }
 
 void testCond01() {
-  KernelScope kernel_scope;
   const int N = 16;
   PaddedBuffer<float> a_v(N);
   Placeholder a_buf("a", kFloat, {N});
@@ -606,7 +579,6 @@ void testCond01() {
 }
 
 void testIfThenElse01() {
-  KernelScope kernel_scope;
   ExprHandle v = ifThenElse(ExprHandle(1), ExprHandle(1.0f), ExprHandle(2.0f));
 
   std::ostringstream oss;
@@ -618,7 +590,6 @@ void testIfThenElse01() {
 }
 
 void testIfThenElse02() {
-  KernelScope kernel_scope;
   ExprHandle v = ifThenElse(ExprHandle(0), ExprHandle(1.0f), ExprHandle(2.0f));
 
   std::ostringstream oss;
@@ -630,7 +601,6 @@ void testIfThenElse02() {
 }
 
 void testIfThenElse03() {
-  KernelScope kernel_scope;
   ExprHandle v =
       ifThenElse(BoolImm::make(false), ExprHandle(1.0f), ExprHandle(2.0f));
 
@@ -643,7 +613,6 @@ void testIfThenElse03() {
 }
 
 void testStmtClone() {
-  KernelScope kernel_scope;
   const int N = 16;
 
   Placeholder a_buf("a", kInt, {N});
diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp
index a170e530fa98f..176158e7fe13a 100644
--- a/test/cpp/tensorexpr/test_external_calls.cpp
+++ b/test/cpp/tensorexpr/test_external_calls.cpp
@@ -20,8 +20,6 @@ namespace jit {
 using namespace torch::jit::tensorexpr;
 
 TEST(ExternalCall, Conv2d_float) {
-  KernelScope kernel_scope;
-
   Placeholder Input("Input", kFloat, {1, 3, 224, 224});
   Placeholder Weight("Weight", kFloat, {16, 3, 3, 3});
   Placeholder Bias("Bias", kFloat, {16});
@@ -84,7 +82,6 @@ TEST(ExternalCall, Conv2d_float) {
 
 TEST(ExternalCall, Conv2d_int) {
   // A similar test, but now using kInt tensors
-  KernelScope kernel_scope;
 
   Placeholder Input("Input", kInt, {1, 3, 224, 224});
   Placeholder Weight("Weight", kInt, {16, 3, 3, 3});
@@ -147,8 +144,6 @@ TEST(ExternalCall, Conv2d_int) {
 }
 
 TEST(ExternalCall, Conv2d_nobias_noargs) {
-  KernelScope kernel_scope;
-
   Placeholder Input("Input", kFloat, {1, 16, 112, 112});
   Placeholder Weight("Weight", kFloat, {16, 16, 1, 1});
   BufHandle ResultBuf("Result", {1, 16, 112, 112}, kFloat);
@@ -194,8 +189,6 @@ TEST(ExternalCall, Conv2d_nobias_noargs) {
 }
 
 TEST(ExternalCall, Addmm_float) {
-  KernelScope kernel_scope;
-
   Placeholder Input("Input", kFloat, {100, 300});
   Placeholder Mat1("Mat1", kFloat, {100, 200});
   Placeholder Mat2("Mat2", kFloat, {200, 300});
@@ -252,8 +245,6 @@ TEST(ExternalCall, Addmm_float) {
 TEST(ExternalCall, Prepacked_Linear_float) {
   using namespace at::native::xnnpack;
 
-  KernelScope kernel_scope;
-
   Placeholder Input("Input", kFloat, {100, 200});
   BufHandle ResultBuf("Result", {100, 300}, kFloat);
 
@@ -317,8 +308,6 @@ TEST(ExternalCall, Prepacked_Linear_float) {
 TEST(ExternalCall, Prepacked_Conv2d_float) {
   using namespace at::native::xnnpack;
 
-  KernelScope kernel_scope;
-
   Placeholder Input("Input", kFloat, {1, 3, 224, 224});
   BufHandle ResultBuf("Result", {1, 16, 112, 112}, kFloat);
   int64_t stride = 2;
@@ -404,7 +393,6 @@ TEST(ExternalCall, Prepacked_Conv2d_float) {
 #endif // USE_XNNPACK
 
 TEST(ExternalCall, BinaryFloat) {
-  KernelScope kernel_scope;
   using TensorFunc = std::function<at::Tensor(at::Tensor, at::Tensor)>;
   using Test = std::tuple<
       std::vector<int64_t>,
@@ -479,7 +467,6 @@ TEST(ExternalCall, BinaryFloat) {
 }
 
 TEST(ExternalCall, UnaryFloat) {
-  KernelScope kernel_scope;
   using TensorFunc = std::function<at::Tensor(at::Tensor)>;
   auto toExprHandleVec = [](std::vector<int64_t> v) {
     auto intV = std::vector<int>(v.begin(), v.end());
@@ -561,7 +548,6 @@ TEST(ExternalCall, UnaryFloat) {
 TEST(ExternalCall, ComputeInterop) {
   // This test verifies that Tensors using external calls can be used by and can
   // use Tensors built with Compute API.
-  KernelScope kernel_scope;
 
   BufHandle ConvResultBuf("ConvResult", {1, 16, 32, 32}, kFloat);
   BufHandle MatmulResultBuf("MatmulResult", {1, 16, 32, 32}, kFloat);
@@ -654,7 +640,6 @@ TEST(ExternalCall, ComputeInterop) {
 TEST(ExternalCall, Inlining) {
   // This test verifies that Tensors using external calls can be used by and
   // can use Tensors built with Compute API.
-  KernelScope kernel_scope;
 
   BufHandle MatmulResultBuf("MatmulResult", {8, 8}, kFloat);
 
diff --git a/test/cpp/tensorexpr/test_graph_opt.cpp b/test/cpp/tensorexpr/test_graph_opt.cpp
index 3175d7f142524..e5a237f5f7541 100644
--- a/test/cpp/tensorexpr/test_graph_opt.cpp
+++ b/test/cpp/tensorexpr/test_graph_opt.cpp
@@ -45,7 +45,6 @@ TEST_F(GraphOpt, OptimizeCat) {
   torch::jit::parseIR(graph_string, g.get());
   g->lint();
 
-  KernelScope kernel_scope;
   TensorExprKernel kernel(g);
 
   // The `aten::log` op must be moved to the inputs of `aten::cat`.
@@ -88,7 +87,6 @@ TEST_F(GraphOpt, OptimizeCat2) {
   torch::jit::parseIR(graph_string, g.get());
   g->lint();
 
-  KernelScope kernel_scope;
   TensorExprKernel kernel(g);
 
   // The `aten::log` and `aten::tanh` ops must be moved to the inputs of
@@ -137,7 +135,6 @@ TEST_F(GraphOpt, OptimizeCat3) {
   torch::jit::parseIR(graph_string, g.get());
   g->lint();
 
-  KernelScope kernel_scope;
   TensorExprKernel kernel(g);
 
   // The `aten::tanh` op must be moved to the inputs of `aten::cat`.
@@ -183,7 +180,6 @@ TEST_F(GraphOpt, OptimizeCatWithTypePromotionInUser) {
   torch::jit::parseIR(graph_string, g.get());
   g->lint();
 
-  KernelScope kernel_scope;
   TensorExprKernel kernel(g);
 
   // The `aten::tanh` op must be moved to the inputs of `aten::cat`.
@@ -227,7 +223,6 @@ TEST_F(GraphOpt, OptimizeCatWithTypePromotionInCat) {
   torch::jit::parseIR(graph_string, g.get());
   g->lint();
 
-  KernelScope kernel_scope;
   TensorExprKernel kernel(g);
 
   // No transformation should have happened because the `aten::cat` op performs
@@ -257,7 +252,6 @@ TEST_F(GraphOpt, OptimizeCatNoSingleTensorElementwiseOp) {
   torch::jit::parseIR(graph_string, g.get());
   g->lint();
 
-  KernelScope kernel_scope;
   TensorExprKernel kernel(g);
 
   // No transformation is expected since the consumers of cat are not
@@ -290,7 +284,6 @@ TEST_F(GraphOpt, OptimizeCatNoSingleTensorElementwiseOp2) {
   torch::jit::parseIR(graph_string, g.get());
   g->lint();
 
-  KernelScope kernel_scope;
   TensorExprKernel kernel(g);
 
   // No transformation is expected since the consumers of cat are not
diff --git a/test/cpp/tensorexpr/test_ir_printer.cpp b/test/cpp/tensorexpr/test_ir_printer.cpp
index e11ba06740181..820f12689acca 100644
--- a/test/cpp/tensorexpr/test_ir_printer.cpp
+++ b/test/cpp/tensorexpr/test_ir_printer.cpp
@@ -17,7 +17,6 @@ namespace jit {
 using namespace torch::jit::tensorexpr;
 
 TEST(IRPrinter, BasicValueTest) {
-  KernelScope kernel_scope;
   ExprHandle a = IntImm::make(2), b = IntImm::make(3);
   ExprHandle c = Add::make(a, b);
 
@@ -27,7 +26,6 @@ TEST(IRPrinter, BasicValueTest) {
 }
 
 TEST(IRPrinter, BasicValueTest02) {
-  KernelScope kernel_scope;
   ExprHandle a(2.0f);
   ExprHandle b(3.0f);
   ExprHandle c(4.0f);
@@ -40,7 +38,6 @@ TEST(IRPrinter, BasicValueTest02) {
 }
 
 TEST(IRPrinter, CastTest) {
-  KernelScope kernel_scope;
   VarHandle x("x", kHalf);
   VarHandle y("y", kFloat);
   ExprHandle body = ExprHandle(2.f) +
@@ -52,7 +49,6 @@ TEST(IRPrinter, CastTest) {
 }
 
 TEST(IRPrinter, FunctionName) {
-  KernelScope kernel_scope;
   int M = 4;
   int N = 20;
 
diff --git a/test/cpp/tensorexpr/test_ir_verifier.cpp b/test/cpp/tensorexpr/test_ir_verifier.cpp
index 2c91d8b24b253..cbe15502ad1f9 100644
--- a/test/cpp/tensorexpr/test_ir_verifier.cpp
+++ b/test/cpp/tensorexpr/test_ir_verifier.cpp
@@ -17,7 +17,6 @@ namespace jit {
 using namespace torch::jit::tensorexpr;
 
 TEST(IRVerifier, BitwiseOps) {
-  KernelScope kernel_scope;
   VarPtr X = alloc<Var>("x", kInt);
   VarPtr Y = alloc<Var>("y", kFloat);
   {
@@ -48,7 +47,6 @@ TEST(IRVerifier, BitwiseOps) {
 }
 
 TEST(IRVerifier, CompareSelect) {
-  KernelScope kernel_scope;
   ExprPtr X = alloc<IntImm>(1);
   ExprPtr Y = alloc<FloatImm>(3.14f);
   {
@@ -64,7 +62,6 @@ TEST(IRVerifier, CompareSelect) {
 }
 
 TEST(IRVerifier, Ramp) {
-  KernelScope kernel_scope;
   VarPtr I = alloc<Var>("i", kInt);
   VarPtr J = alloc<Var>("j", kFloat);
   {
@@ -75,7 +72,6 @@ TEST(IRVerifier, Ramp) {
 }
 
 TEST(IRVerifier, Load) {
-  KernelScope kernel_scope;
   VarPtr I = alloc<Var>("i", kInt);
   VarPtr J = alloc<Var>("j", kLong);
   VarPtr K = alloc<Var>("k", kFloat);
@@ -105,7 +101,6 @@ TEST(IRVerifier, Load) {
 }
 
 TEST(IRVerifier, IfThenElse) {
-  KernelScope kernel_scope;
   VarPtr I = alloc<Var>("i", kInt);
   VarPtr J = alloc<Var>("j", kLong);
   VarPtr K = alloc<Var>("k", kFloat);
@@ -130,7 +125,6 @@ TEST(IRVerifier, IfThenElse) {
 }
 
 TEST(IRVerifier, For) {
-  KernelScope kernel_scope;
   VarPtr I = alloc<Var>("i", kInt);
   VarPtr J = alloc<Var>("j", kInt);
   StmtPtr body = alloc<Block>(std::vector<StmtPtr>({}));
@@ -143,7 +137,6 @@ TEST(IRVerifier, For) {
 }
 
 TEST(IRVerifier, Block) {
-  KernelScope kernel_scope;
   VarPtr I = alloc<Var>("i", kInt);
   BufPtr B = alloc<Buf>("B", std::vector<ExprPtr>({alloc<IntImm>(10)}), kInt);
   {
@@ -160,7 +153,6 @@ TEST(IRVerifier, Block) {
 }
 
 TEST(IRVerifier, Store) {
-  KernelScope kernel_scope;
   VarPtr I = alloc<Var>("i", kInt);
   VarPtr J = alloc<Var>("j", kLong);
   VarPtr K = alloc<Var>("k", kFloat);
diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp
index 765522ecf6cd4..e14282f258893 100644
--- a/test/cpp/tensorexpr/test_kernel.cpp
+++ b/test/cpp/tensorexpr/test_kernel.cpp
@@ -39,7 +39,6 @@ TEST_F(Kernel, InliningIntermediates) {
           %4 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %2)
           %5: Float(5, 3, strides=[3, 1]) = aten::add(%4, %1, %one)
           return (%5))IR";
-    KernelScope kernel_scope;
     auto graph = std::make_shared<Graph>();
     parseIR(graph_string, &*graph);
     TensorExprKernel k(graph);
@@ -63,7 +62,6 @@ TEST_F(Kernel, InliningIntermediates) {
         continue;
       }
 
-      KernelScope kernel_scope;
       TemplateEnv env;
       env.s("device", use_cuda ? "cuda:0" : "cpu");
       const auto graph_string = format(graph_template, env);
@@ -88,8 +86,6 @@ TEST_F(Kernel, InliningIntermediates) {
 }
 
 TEST_F(Kernel, _1) {
-  KernelScope kernel_scope;
-
   const auto graph_string = R"IR(
       graph(%0 : Float(5, 3, strides=[3, 1], device=cpu),
             %1 : Float(5, 3, strides=[3, 1], device=cpu)):
@@ -127,8 +123,6 @@ TEST_F(Kernel, _1) {
 }
 
 TEST_F(Kernel, _2) {
-  KernelScope kernel_scope;
-
   const auto graph_string = R"IR(
       graph(%0 : Float(5, 3, strides=[3, 1], device=cpu),
             %1 : Float(5, 3, strides=[1, 5], device=cpu)):
@@ -167,8 +161,6 @@ TEST_F(Kernel, _2) {
 }
 
 TEST_F(Kernel, _3) {
-  KernelScope kernel_scope;
-
   const auto graph_string = R"IR(
       graph(%0 : Float(5, 3, strides=[3, 1], device=cpu),
             %1 : Float(5, 3, strides=[12, 2], device=cpu)):
@@ -212,8 +204,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) {
   // Test TensorExpr shape inference capabilities: it should only require shapes
   // for the inputs
   {
-    KernelScope kernel_scope;
-
     const auto graph_string = R"IR(
       graph(%0 : Float(5, 3, strides=[3, 1], device=cpu),
             %1 : Float(5, 3, strides=[12, 2], device=cpu)):
@@ -251,8 +241,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) {
     }
   }
   {
-    KernelScope kernel_scope;
-
     const auto graph_string = R"IR(
       graph(%0 : Float(8, 8, strides=[8, 1], device=cpu),
             %1 : Float(8, 8, strides=[8, 1], device=cpu)):
@@ -292,7 +280,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) {
   }
   {
     // Test that shape inference handles aten::unsqueeze
-    KernelScope kernel_scope;
 
     const auto graph_string = R"IR(
       graph(%a : Float(4, 2, strides=[2, 1], device=cpu),
@@ -355,7 +342,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) {
   }
   {
     // Test that shape inference handles aten::cat
-    KernelScope kernel_scope;
 
     const auto graph_string = R"IR(
       graph(%a : Float(5, 3, 2, strides=[6, 2, 1], device=cpu),
@@ -409,7 +395,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) {
   }
   {
     // Test that we throw an error when input list for aten::cat is empty
-    KernelScope kernel_scope;
 
     const auto graph_string = R"IR(
       graph():
@@ -427,7 +412,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) {
   }
   {
     // Test that we throw an error when 'dim' passed to aten::cat is invalid
-    KernelScope kernel_scope;
 
     const auto ir_dim_99 = R"IR(
       graph(%a : Float(5, 3, 2, strides=[6, 2, 1], device=cpu),
@@ -458,7 +442,6 @@ TEST_F(Kernel, DISABLED_Shape_Inference) {
 TEST_F(Kernel, CatInputTypesPromotion) {
   {
     // Test that we properly promote input types for aten::cat
-    KernelScope kernel_scope;
 
     const auto graph_string = R"IR(
       graph(%a : Float(5, 3, 2, strides=[6, 2, 1], device=cpu),
@@ -676,7 +659,6 @@ TEST_F(Kernel, SumAllAxes) {
   auto a = iotaTensor({5, 3}, TensorOptions(kCPU).dtype(at::kFloat));
 
   for (auto scalar_type : {ScalarType::Undefined, ScalarType::Double}) {
-    KernelScope kernel_scope;
     TemplateEnv env;
     env.s("dtype", dtypeConstant(scalar_type));
     if (scalar_type == ScalarType::Undefined) {
@@ -745,7 +727,6 @@ TEST_F(Kernel, SumOneAxis) {
   for (int dim = -a.dim(); dim < a.dim(); ++dim) {
     for (bool keepdim : {false, true}) {
       for (auto scalar_type : {ScalarType::Undefined, ScalarType::Double}) {
-        KernelScope kernel_scope;
         TemplateEnv env;
         env.d("dim", dim);
         env.d("keepdim", keepdim);
@@ -812,7 +793,6 @@ TEST_F(Kernel, SumMultipleAxes) {
   for (int dim1 = 0; dim1 < a.dim(); ++dim1) {
     for (int dim2 = dim1 + 1; dim2 < a.dim(); ++dim2) {
       for (bool keepdim : {false, true}) {
-        KernelScope kernel_scope;
         TemplateEnv env;
         env.d("dim1", dim1);
         env.d("dim2", dim2);
@@ -888,7 +868,6 @@ TEST_F(Kernel, Softmax2D) {
       auto other_dim = (softmax_dim + 1) % a.dim();
       auto ref =
           log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim);
-      KernelScope kernel_scope;
       TemplateEnv env;
       env.d("dim", softmax_dim);
       env.s("op", log_softmax ? "log_softmax" : "softmax");
@@ -964,7 +943,6 @@ TEST_F(Kernel, Softmax3D) {
       auto ref =
           log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim);
 
-      KernelScope kernel_scope;
       TemplateEnv env;
       env.d("dim", softmax_dim);
       env.s("op", log_softmax ? "log_softmax" : "softmax");
@@ -1046,7 +1024,6 @@ TEST_F(Kernel, Softmax4D) {
       auto ref =
           log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim);
 
-      KernelScope kernel_scope;
       TemplateEnv env;
       env.d("dim", softmax_dim);
       env.s("op", log_softmax ? "log_softmax" : "softmax");
@@ -1090,8 +1067,6 @@ TEST_F(Kernel, Softmax4D) {
 }
 
 TEST_F(Kernel, InlineProducerIntoReduction) {
-  KernelScope kernel_scope;
-
   // Inline producer (mul) into reduction (sum).
   const auto graph_string = R"IR(
       graph(%0 : Float(5, 3, strides=[3, 1], device=cpu),
@@ -1129,8 +1104,6 @@ TEST_F(Kernel, InlineProducerIntoReduction) {
 }
 
 TEST_F(Kernel, InlineReductionIntoConsumer) {
-  KernelScope kernel_scope;
-
   // Inline producer (mul %2) into reduction (sum %4) but DO NOT
   // inline the reduction into consumer (mul %4).
   const auto graph_string = R"IR(
@@ -1179,7 +1152,6 @@ TEST_F(Kernel, SanitizeNames_CUDA) {
         %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1)
         %4 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %2)
         return (%4))IR";
-  KernelScope kernel_scope;
   auto graph = std::make_shared<Graph>();
   parseIR(graph_string, &*graph);
   graph->inputs().at(0)->setDebugName("aten::add:");
@@ -1204,7 +1176,6 @@ TEST_F(Kernel, ConstantTensors) {
           %y : Float(16, 16, strides=[16, 1], device=cpu) = aten::ones(%sizes, %none, %none, %none, %none)
           %z : Float(16, 16, strides=[16, 1], device=cpu) = aten::mul(%x, %y)
           return (%z))IR";
-  KernelScope kernel_scope;
   auto graph = std::make_shared<Graph>();
   parseIR(graph_string, &*graph);
   // IRParser doesn't support tensor constants, so we insert a call to
@@ -1237,7 +1208,6 @@ TEST_F(Kernel, ConstantTensorsNonContiguous) {
           %y : Tensor = aten::t(%y_t)
           %z : Float(16, 16, strides=[16, 1], device=cpu) = aten::mul(%x, %y)
           return (%z))IR";
-  KernelScope kernel_scope;
   auto graph = std::make_shared<Graph>();
   parseIR(graph_string, &*graph);
   // IRParser doesn't support tensor constants, so we generate several aten
@@ -1261,7 +1231,6 @@ TEST_F(Kernel, ConstantTensorsNonContiguous) {
 TEST_F(Kernel, RunFast) {
 #ifdef TORCH_ENABLE_LLVM
   // TODO: Implement call_raw in IREval and remove the ifdef
-  KernelScope kernel_scope;
 
   const auto graph_string = R"IR(
       graph(%0 : Float(5, 3, strides=[3, 1], device=cpu),
@@ -1301,7 +1270,6 @@ TEST_F(Kernel, CodegenInspection) {
           %y : Tensor = aten::t(%y_t)
           %z : Float(16, 16, strides=[16, 1], device=cpu) = aten::mul(%x, %y)
           return (%z))IR";
-  KernelScope kernel_scope;
   auto graph = std::make_shared<Graph>();
   parseIR(graph_string, &*graph);
   // IRParser doesn't support tensor constants, so we generate several aten
@@ -1353,7 +1321,6 @@ TEST_F(Kernel, CustomLowering) {
           %y : Float(2, 2, strides=[2, 1], requires_grad=0, device=cpu) = aten::nan_to_num(%x, %none, %none, %none)
           return (%y)
 )IR";
-  KernelScope kernel_scope;
   auto graph = std::make_shared<Graph>();
   parseIR(graph_string, &*graph);
 
diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp
index 6081403c25650..139763b071317 100644
--- a/test/cpp/tensorexpr/test_llvm.cpp
+++ b/test/cpp/tensorexpr/test_llvm.cpp
@@ -36,7 +36,6 @@ using LLVMExprEval = ExprEval<LLVMCodeGen>;
 
 #define IMM_TEST(Type, Name, Val)                  \
   TEST(LLVM, Name##ImmTest) {                      \
-    KernelScope kernel_scope;                      \
     auto a = Name##Imm::make(Val);                 \
     LLVMExprEval cg(a);                            \
     if (std::is_floating_point<decltype(Val)>()) { \
@@ -50,7 +49,6 @@ TEST_LLVM_SCALAR_TYPES(IMM_TEST)
 
 #define ADD_TEST(Type, Name, Val)                  \
   TEST(LLVM, Name##AddTest) {                      \
-    KernelScope kernel_scope;                      \
     auto a = Name##Imm::make(Val);                 \
     auto b = Name##Imm::make(Val * 2);             \
     auto c = Add::make(a, b);                      \
@@ -66,7 +64,6 @@ TEST_LLVM_SCALAR_TYPES(ADD_TEST)
 
 #define SUB_TEST(Type, Name, Val)                  \
   TEST(LLVM, Name##SubTest) {                      \
-    KernelScope kernel_scope;                      \
     auto a = Name##Imm::make(Val * 2);             \
     auto b = Name##Imm::make(Val);                 \
     auto c = Sub::make(a, b);                      \
@@ -82,7 +79,6 @@ TEST_LLVM_SCALAR_TYPES(SUB_TEST)
 
 #define MUL_TEST(Type, Name, Val)                  \
   TEST(LLVM, Name##MulTest) {                      \
-    KernelScope kernel_scope;                      \
     auto a = Name##Imm::make(Val);                 \
     auto b = Name##Imm::make((Type)4);             \
     auto c = Mul::make(a, b);                      \
@@ -98,7 +94,6 @@ TEST_LLVM_SCALAR_TYPES(MUL_TEST)
 
 #define DIV_TEST(Type, Name, Val)                  \
   TEST(LLVM, Name##DivTest) {                      \
-    KernelScope kernel_scope;                      \
     auto a = Name##Imm::make((Type)6);             \
     auto b = Name##Imm::make((Type)3);             \
     auto c = Div::make(a, b);                      \
@@ -113,7 +108,6 @@ TEST_LLVM_SCALAR_TYPES(DIV_TEST)
 #undef DIV_TEST
 
 TEST(LLVM, IntToFloatCastTest) {
-  KernelScope kernel_scope;
   auto a = IntImm::make(2);
   auto b = Cast::make(kFloat, a);
   LLVMExprEval cg(b, {});
@@ -121,7 +115,6 @@ TEST(LLVM, IntToFloatCastTest) {
 }
 
 TEST(LLVM, FloatToIntCastTest) {
-  KernelScope kernel_scope;
   auto a = FloatImm::make(2.0);
   auto b = Cast::make(kInt, a);
   LLVMExprEval cg(b);
@@ -129,7 +122,6 @@ TEST(LLVM, FloatToIntCastTest) {
 }
 
 TEST(LLVM, IntToLongCastTest) {
-  KernelScope kernel_scope;
   auto a = IntImm::make(12345);
   auto b = Cast::make(kLong, a);
   LLVMExprEval cg(b);
@@ -137,7 +129,6 @@ TEST(LLVM, IntToLongCastTest) {
 }
 
 TEST(LLVM, ByteToCharCastTest) {
-  KernelScope kernel_scope;
   auto a = ByteImm::make(250);
   auto b = Cast::make(kChar, a);
   LLVMExprEval cg(b);
@@ -145,7 +136,6 @@ TEST(LLVM, ByteToCharCastTest) {
 }
 
 TEST(LLVM, HalfToLongCastTest) {
-  KernelScope kernel_scope;
   auto a = HalfImm::make(2.0);
   auto b = Cast::make(kLong, a);
   LLVMExprEval cg(b);
@@ -153,7 +143,6 @@ TEST(LLVM, HalfToLongCastTest) {
 }
 
 TEST(LLVM, ByteToDoubleCastTest) {
-  KernelScope kernel_scope;
   auto a = ByteImm::make(2);
   auto b = Cast::make(kDouble, a);
   LLVMExprEval cg(b);
@@ -170,7 +159,6 @@ TEST(LLVM, BitCast) {
 
   // this is broken
   /*{
-    KernelScope kernel_scope;
     at::Half k_;
     at::Half* k = &k_;
     *reinterpret_cast<int16_t*>(k) = ref16;
@@ -181,7 +169,6 @@ TEST(LLVM, BitCast) {
   }*/
 
   {
-    KernelScope kernel_scope;
     float k = raw_bitcast<float>(ref32);
     auto a = FloatImm::make(k);
     auto b = BitCast::make(kInt, a);
@@ -190,7 +177,6 @@ TEST(LLVM, BitCast) {
   }
 
   {
-    KernelScope kernel_scope;
     double k = raw_bitcast<double>(ref64);
     auto a = DoubleImm::make(k);
     auto b = BitCast::make(kLong, a);
@@ -199,7 +185,6 @@ TEST(LLVM, BitCast) {
   }
 
   {
-    KernelScope kernel_scope;
     int64_t k = raw_bitcast<int64_t>(reff64);
     auto a = LongImm::make(k);
     auto b = BitCast::make(kDouble, a);
@@ -208,7 +193,6 @@ TEST(LLVM, BitCast) {
   }
 
   {
-    KernelScope kernel_scope;
     int32_t k = raw_bitcast<int32_t>(reff32);
     auto a = IntImm::make(k);
     auto b = BitCast::make(kFloat, a);
@@ -218,7 +202,6 @@ TEST(LLVM, BitCast) {
 }
 
 TEST(LLVM, fastLogFloat) {
-  KernelScope kernel_scope;
   const int kTotalSize = 128 * 128;
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
   Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
@@ -250,8 +233,6 @@ TEST(LLVM, fastLogFloat) {
 }
 
 TEST(LLVM, LetTest01) {
-  KernelScope kernel_scope;
-
   Placeholder a(BufHandle("A", {1}, kFloat));
   std::vector<float> v = {1, 0};
   std::vector<void*> args({v.data()});
@@ -267,8 +248,6 @@ TEST(LLVM, LetTest01) {
 }
 
 TEST(LLVM, LetTest02) {
-  KernelScope kernel_scope;
-
   Placeholder a(BufHandle("A", {1}, kFloat));
   std::vector<float> v = {1, 0};
   std::vector<void*> args({v.data()});
@@ -287,8 +266,6 @@ TEST(LLVM, LetTest02) {
 }
 
 TEST(LLVM, LetTestMultitype) {
-  KernelScope kernel_scope;
-
   Placeholder a(BufHandle("A", {1}, kDouble));
   std::vector<double> v = {1, 0};
   std::vector<void*> args({v.data()});
@@ -310,7 +287,6 @@ TEST(LLVM, LetTestMultitype) {
 }
 
 TEST(LLVM, BufferTest) {
-  KernelScope kernel_scope;
   Placeholder a(BufHandle("A", {32}, kFloat));
   std::vector<int32_t> v(5);
   std::vector<void*> args({v.data()});
@@ -320,7 +296,6 @@ TEST(LLVM, BufferTest) {
 }
 
 TEST(LLVM, BlockTest) {
-  KernelScope kernel_scope;
   Placeholder a(BufHandle("A", {32}, kInt));
   std::vector<int32_t> v = {1, 2};
   std::vector<void*> args({v.data()});
@@ -338,7 +313,6 @@ TEST(LLVM, BlockTest) {
 }
 
 TEST(LLVM, LoadStoreTest) {
-  KernelScope kernel_scope;
   Placeholder a(BufHandle("A", {1}, kInt));
   Placeholder b(BufHandle("B", {1}, kInt));
   std::vector<int32_t> a_buffer = {42};
@@ -353,7 +327,6 @@ TEST(LLVM, LoadStoreTest) {
 }
 
 TEST(LLVM, IfThenElseTest) {
-  KernelScope kernel_scope;
   Placeholder a(BufHandle("A", {1}, kInt));
   Placeholder b(BufHandle("B", {1}, kInt));
   Placeholder c(BufHandle("C", {1}, kInt));
@@ -371,8 +344,6 @@ TEST(LLVM, IfThenElseTest) {
 
 // if (x < 10) x = x + 1
 TEST(LLVM, CondNoFalseBlockTest) {
-  KernelScope kernel_scope;
-
   Placeholder x(BufHandle("X", {1}, kInt));
   auto cmp = CompareSelect::make(x.load(0), 10, CompareSelectOperation::kLT);
   auto cond = Cond::make(cmp, x.store({0}, x.load(0) + 1), nullptr);
@@ -396,8 +367,6 @@ TEST(LLVM, CondNoFalseBlockTest) {
 //   x = x - 1;
 // }
 TEST(LLVM, CondTest) {
-  KernelScope kernel_scope;
-
   Placeholder x(BufHandle("X", {1}, kInt));
   auto cmp = CompareSelect::make(x.load(0), 10, CompareSelectOperation::kLT);
   auto cond =
@@ -434,8 +403,6 @@ TEST(LLVM, CondTest) {
 //   }
 // }
 TEST(LLVM, CondNestedTest) {
-  KernelScope kernel_scope;
-
   Placeholder x(BufHandle("X", {1}, kInt));
   auto true_cmp =
       CompareSelect::make(x.load(0), 5, CompareSelectOperation::kGT);
@@ -470,7 +437,6 @@ TEST(LLVM, CondNestedTest) {
 }
 
 TEST(LLVM, DirectVectorization) {
-  KernelScope ks;
   constexpr int M = 3;
   constexpr int N = 64;
   BufHandle a("a", {M, N}, kFloat);
@@ -491,7 +457,6 @@ TEST(LLVM, DirectVectorization) {
 }
 
 TEST(LLVM, VecLoadStoreTest) {
-  KernelScope kernel_scope;
   Placeholder a(BufHandle("A", {1}, kInt));
   Placeholder b(BufHandle("B", {1}, kInt));
   std::vector<int32_t> a_buffer = {1, 1, 1, 1};
@@ -513,7 +478,6 @@ TEST(LLVM, VecLoadStoreTest) {
 
 #define FLOAT_INTRINSICS_TEST(Name, Lanes)                                   \
   TEST(LLVM, VecFloat_##Name##Lane##Lanes##Test) {                           \
-    KernelScope kernel_scope;                                                \
     Placeholder a(BufHandle("A", {1}, kFloat));                              \
     Placeholder b(BufHandle("B", {1}, kFloat));                              \
     float val = 0.5f;                                                        \
@@ -552,7 +516,6 @@ FLOAT_INTRINSICS_TEST(lgamma, 8)
 
 #define DOUBLE_INTRINSICS_TEST(Name, Lanes)                                  \
   TEST(LLVM, VecDouble_##Name##Lane##Lanes##Test) {                          \
-    KernelScope kernel_scope;                                                \
     Placeholder a(BufHandle("A", {1}, kDouble));                             \
     Placeholder b(BufHandle("B", {1}, kDouble));                             \
     float val = 0.5f;                                                        \
@@ -590,7 +553,6 @@ DOUBLE_INTRINSICS_TEST(lgamma, 4)
 #undef DOUBLE_INTRINSICS_TEST
 
 TEST(LLVM, VectorizerLoadStoreTest) {
-  KernelScope kernel_scope;
   Placeholder a(BufHandle("A", {1}, kInt));
 
   Tensor c =
@@ -613,7 +575,6 @@ TEST(LLVM, VectorizerLoadStoreTest) {
 }
 
 TEST(LLVM, VectorizeBitCast) {
-  KernelScope kernel_scope;
   Placeholder a(BufHandle("A", {128}, kInt));
 
   Tensor c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) {
@@ -639,7 +600,6 @@ TEST(LLVM, VectorizeBitCast) {
 }
 
 TEST(LLVM, MemcpyTest) {
-  KernelScope kernel_scope;
   constexpr int N = 32;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -661,7 +621,6 @@ TEST(LLVM, MemcpyTest) {
 }
 
 TEST(LLVM, BzeroTest) {
-  KernelScope kernel_scope;
   constexpr int N = 32;
   Placeholder b(BufHandle("B", {N}, kInt));
   std::vector<int32_t> b_buffer(N, 11);
@@ -679,7 +638,6 @@ TEST(LLVM, BzeroTest) {
 }
 
 TEST(LLVM, ElemwiseAdd) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -705,7 +663,6 @@ TEST(LLVM, ElemwiseAdd) {
 }
 
 TEST(LLVM, ElemwiseAddFloat) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kFloat));
   Placeholder b(BufHandle("B", {N}, kFloat));
@@ -731,7 +688,6 @@ TEST(LLVM, ElemwiseAddFloat) {
 }
 
 TEST(LLVM, ElemwiseLog10Float) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kFloat));
   Placeholder b(BufHandle("B", {N}, kFloat));
@@ -758,7 +714,6 @@ TEST(LLVM, ElemwiseLog10Float) {
 }
 
 TEST(LLVM, ElemwiseLog1pFloat) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kFloat));
   Placeholder b(BufHandle("B", {N}, kFloat));
@@ -785,7 +740,6 @@ TEST(LLVM, ElemwiseLog1pFloat) {
 }
 
 TEST(LLVM, ElemwiseMaxInt) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -812,7 +766,6 @@ TEST(LLVM, ElemwiseMaxInt) {
 }
 
 TEST(LLVM, ElemwiseMinInt) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -839,7 +792,6 @@ TEST(LLVM, ElemwiseMinInt) {
 }
 
 TEST(LLVM, ElemwiseMaxFloat) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kFloat));
   Placeholder b(BufHandle("B", {N}, kFloat));
@@ -866,7 +818,6 @@ TEST(LLVM, ElemwiseMaxFloat) {
 }
 
 TEST(LLVM, ElemwiseMaxNaNFloat) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kFloat));
   Placeholder b(BufHandle("B", {N}, kFloat));
@@ -894,7 +845,6 @@ TEST(LLVM, ElemwiseMaxNaNFloat) {
 }
 
 TEST(LLVM, ElemwiseMinFloat) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kFloat));
   Placeholder b(BufHandle("B", {N}, kFloat));
@@ -921,7 +871,6 @@ TEST(LLVM, ElemwiseMinFloat) {
 }
 
 TEST(LLVM, ElemwiseMinNaNFloat) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kFloat));
   Placeholder b(BufHandle("B", {N}, kFloat));
@@ -949,7 +898,6 @@ TEST(LLVM, ElemwiseMinNaNFloat) {
 }
 
 TEST(LLVM, ElemwiseMod) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -975,7 +923,6 @@ TEST(LLVM, ElemwiseMod) {
 }
 
 TEST(LLVM, CompareSelectIntEQ) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kInt));
   Placeholder b(BufHandle("B", {N}, kInt));
@@ -1016,7 +963,6 @@ TEST(LLVM, CompareSelectIntEQ) {
 }
 
 TEST(LLVM, CompareSelectFloatEQ) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kFloat));
   Placeholder b(BufHandle("B", {N}, kFloat));
@@ -1050,7 +996,6 @@ TEST(LLVM, CompareSelectFloatEQ) {
 }
 
 TEST(LLVM, CompareSelectByteGT) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kByte));
   Placeholder b(BufHandle("B", {N}, kByte));
@@ -1091,7 +1036,6 @@ TEST(LLVM, CompareSelectByteGT) {
 }
 
 TEST(LLVM, CompareSelectByteGE) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kByte));
   Placeholder b(BufHandle("B", {N}, kByte));
@@ -1127,7 +1071,6 @@ TEST(LLVM, CompareSelectByteGE) {
 }
 
 TEST(LLVM, CompareSelectByteLT) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kByte));
   Placeholder b(BufHandle("B", {N}, kByte));
@@ -1168,7 +1111,6 @@ TEST(LLVM, CompareSelectByteLT) {
 }
 
 TEST(LLVM, CompareSelectByteLE) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   Placeholder a(BufHandle("A", {N}, kByte));
   Placeholder b(BufHandle("B", {N}, kByte));
@@ -1204,7 +1146,6 @@ TEST(LLVM, CompareSelectByteLE) {
 }
 
 TEST(LLVM, StoreFloat) {
-  KernelScope kernel_scope;
   Placeholder result(BufHandle("result", {1}, kFloat));
   std::vector<float> result_buffer = {0.0f};
   auto expr = result.store({0}, FloatImm::make(3.14f));
@@ -1215,7 +1156,6 @@ TEST(LLVM, StoreFloat) {
 }
 
 TEST(LLVM, SimpleMath01) {
-  KernelScope kernel_scope;
   const int N = 1024;
   Tensor tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) {
     return cast<float>(i * i + 1);
@@ -1237,7 +1177,6 @@ TEST(LLVM, SimpleMath01) {
 }
 
 TEST(LLVM, ComputeMul) {
-  KernelScope kernel_scope;
   const int N = 1024;
   Placeholder a(BufHandle("a", {N}, kFloat));
   Placeholder b(BufHandle("b", {N}, kFloat));
@@ -1260,7 +1199,6 @@ TEST(LLVM, ComputeMul) {
 }
 
 TEST(LLVM, BroadcastAdd) {
-  KernelScope kernel_scope;
   const int M = 32;
   const int N = 1024;
   Placeholder a(BufHandle("a", {M, N}, kFloat));
@@ -1293,7 +1231,6 @@ TEST(LLVM, BroadcastAdd) {
 }
 
 TEST(LLVM, BitwiseOps) {
-  KernelScope kernel_scope;
   auto a = IntImm::make(59);
   auto b = IntImm::make(11);
   auto c = IntImm::make(101);
@@ -1306,7 +1243,6 @@ TEST(LLVM, BitwiseOps) {
 }
 
 TEST(LLVM, ArithmeticRightShift) {
-  KernelScope ks;
   auto a = CharImm::make(-4);
   auto b = CharImm::make(1);
   ExprHandle f = a >> b;
@@ -1315,7 +1251,6 @@ TEST(LLVM, ArithmeticRightShift) {
 }
 
 TEST(LLVM, LogicalRightShift) {
-  KernelScope ks;
   auto a = ByteImm::make(0xfc);
   auto b = ByteImm::make(1);
   ExprHandle f = a >> b;
@@ -1324,7 +1259,6 @@ TEST(LLVM, LogicalRightShift) {
 }
 
 TEST(LLVM, DynamicShapeAdd) {
-  KernelScope kernel_scope;
   auto testWithSize = [](int32_t size) {
     VarHandle n("n", kInt);
     Placeholder a(BufHandle("a", {n}, kFloat));
@@ -1346,7 +1280,6 @@ TEST(LLVM, DynamicShapeAdd) {
 }
 
 TEST(LLVM, BindDynamicShapeAdd) {
-  KernelScope kernel_scope;
   auto testWithSize = [](int32_t size) {
     VarHandle n("n", kInt);
     Placeholder a(BufHandle("a", {n}, kFloat));
@@ -1367,7 +1300,6 @@ TEST(LLVM, BindDynamicShapeAdd) {
 }
 
 TEST(LLVM, TensorDynamicShapeAdd) {
-  KernelScope kernel_scope;
   auto testWithSize = [](int32_t size) {
     VarHandle n("n", kInt);
     Placeholder a(BufHandle("a", {n}, kFloat));
@@ -1390,7 +1322,6 @@ TEST(LLVM, TensorDynamicShapeAdd) {
 }
 
 TEST(LLVM, DynamicShape2D) {
-  KernelScope kernel_scope;
   auto testWithSize = [](int32_t M, int32_t N) {
     VarHandle m("m", kInt);
     VarHandle n("n", kInt);
@@ -1416,7 +1347,6 @@ TEST(LLVM, DynamicShape2D) {
 }
 
 TEST(LLVM, EmptyStmt) {
-  KernelScope kernel_scope;
   StmtPtr s = alloc<Block>(std::vector<StmtPtr>({}));
 
   LLVMCodeGen cg(s, {});
@@ -1425,7 +1355,6 @@ TEST(LLVM, EmptyStmt) {
 }
 
 TEST(LLVM, EliminatedStmt) {
-  KernelScope kernel_scope;
   Placeholder a(BufHandle("a", {1}, kFloat));
 
   Tensor c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; });
@@ -1441,8 +1370,6 @@ TEST(LLVM, EliminatedStmt) {
 }
 
 TEST(LLVM, SimpleReduction) {
-  KernelScope kernel_scope;
-
   int M = 128;
   int N = 64;
   const int kTotalSize = M * N;
@@ -1480,8 +1407,6 @@ TEST(LLVM, SimpleReduction) {
 }
 
 TEST(LLVM, RFactorReduction) {
-  KernelScope kernel_scope;
-
   int M = 128;
   int N = 64;
   const int kTotalSize = M * N;
@@ -1530,8 +1455,6 @@ TEST(LLVM, RFactorReduction) {
 }
 
 TEST(LLVM, RFactorVectorizedReduction) {
-  KernelScope kernel_scope;
-
   int M = 128;
   int N = 64;
   const int kTotalSize = M * N;
@@ -1582,7 +1505,6 @@ TEST(LLVM, SimpleParallel) {
   for (int test_cfg = 0; test_cfg < 4; test_cfg++) {
     // Compute a simple operation, and try all loop-axis combination to be
     // parallel or sequential.
-    KernelScope kernel_scope;
     const int M = 4;
     const int N = 6;
     Tensor f = Compute(
@@ -1623,7 +1545,6 @@ TEST(LLVM, CompositeParallel) {
   // Compute a composite operation, and try all loop-axis combination to be
   // parallel or sequential.
   for (int test_cfg = 0; test_cfg < test_count; test_cfg++) {
-    KernelScope kernel_scope;
     int M = 5;
     int N = 7;
     Tensor t1 =
@@ -1687,8 +1608,6 @@ TEST(LLVM, CompositeParallel) {
 }
 
 TEST(LLVM, VectorizedGEMM) {
-  KernelScope ks;
-
   int M = 32;
   int N = 32;
   int K = 48;
@@ -1771,7 +1690,6 @@ TEST(LLVM, VectorizedGEMM) {
 }
 
 TEST(LLVM, CallRaw) {
-  KernelScope kernel_scope;
   const int M = 32;
   VarHandle N("N", kInt);
   Placeholder a(BufHandle("a", {M, N}, kFloat));
@@ -1813,7 +1731,6 @@ TEST(LLVM, CallRaw) {
 }
 
 TEST(LLVM, CustomTarget) {
-  KernelScope kernel_scope;
   constexpr int M = 16;
   Placeholder a("a", kFloat, {M});
   Placeholder b("b", kFloat, {M});
diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index 7c3eefaab3b1c..28934f622d057 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -41,7 +41,6 @@ void checkExprIR(const ExprHandle& e, const std::string& pattern) {
 }
 
 TEST(LoopNest, ExprSimple01) {
-  KernelScope kernel_scope;
   Tensor tensor = Compute(
       "f", {{16, "X"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) {
         return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
@@ -54,7 +53,6 @@ TEST(LoopNest, ExprSimple01) {
 }
 
 TEST(LoopNest, ExprLower01) {
-  KernelScope kernel_scope;
   Tensor tensor = Compute(
       "f", {{16, "x"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) {
         return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
@@ -68,7 +66,6 @@ TEST(LoopNest, ExprLower01) {
 }
 
 TEST(LoopNest, ExprSimple02) {
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x, const ExprHandle& y) {
     return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
   };
@@ -162,7 +159,6 @@ void assertForRanges(
 }
 
 TEST(LoopNest, ExprSliceHeadWithLoopOptions) {
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -186,7 +182,6 @@ TEST(LoopNest, ExprSliceHeadWithLoopOptions) {
 }
 
 TEST(LoopNest, ExprSliceTailWithLoopOptions) {
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -219,7 +214,6 @@ TEST(LoopNest, ExprSliceTailWithLoopOptions) {
 TEST(LoopNest, ExprSliceHeadWhenFactorEqualsSize) {
   // When factor equals the For loop's original size, keep using the original
   // For loop.
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -240,7 +234,6 @@ TEST(LoopNest, ExprSliceHeadWhenFactorEqualsSize) {
 }
 
 TEST(LoopNest, ExprSliceHeadWhenFactorLargerThanSize) {
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -261,7 +254,6 @@ TEST(LoopNest, ExprSliceHeadWhenFactorLargerThanSize) {
 }
 
 TEST(LoopNest, ExprSliceHead) {
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -284,7 +276,6 @@ TEST(LoopNest, ExprSliceHead) {
 }
 
 TEST(LoopNest, ExprSliceHeadWithNonZeroStart) {
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -311,7 +302,6 @@ TEST(LoopNest, ExprSliceHeadWithNonZeroStart) {
 TEST(LoopNest, ExprSliceTailWhenFactorEqualsSize) {
   // When factor equals the For loop's original size, keep using the original
   // For loop.
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -334,7 +324,6 @@ TEST(LoopNest, ExprSliceTailWhenFactorEqualsSize) {
 TEST(LoopNest, ExprSliceTailWhenFactorLargerThanSize) {
   // When factor equals the For loop's original size, keep using the original
   // For loop.
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -355,7 +344,6 @@ TEST(LoopNest, ExprSliceTailWhenFactorLargerThanSize) {
 }
 
 TEST(LoopNest, ExprSliceTail) {
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -381,7 +369,6 @@ TEST(LoopNest, ExprSplitAndSlice) {
   // 0: splitWithTail
   // 1: sliceTail on inner loop
   // 2: sliceHead on outer loop
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -434,7 +421,6 @@ TEST(LoopNest, ExprSplitAndSlice) {
 TEST(LoopNest, ExprSliceAndNormalize) {
   // 0: sliceHead
   // 1: normalize tail
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -467,7 +453,6 @@ TEST(LoopNest, ExprSliceWithVariableDimension) {
   auto testWithDimension =
       [](int dimension,
          const std::vector<std::pair<int, int>>& expected_for_ranges) {
-        KernelScope kernel_scope;
         VarHandle dim("dim", kInt);
         Tensor tensor =
             Compute("f", {{dim, "x"}}, [](const ExprHandle& x) { return x; });
@@ -504,7 +489,6 @@ TEST(LoopNest, ExprSliceWithVariableDimension) {
 }
 
 TEST(LoopNest, ExprSplitWithTail) {
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x) {
     return ExprHandle(1.0f) + cast<float>(x);
   };
@@ -534,7 +518,6 @@ TEST(LoopNest, ExprSplitWithTail) {
 }
 
 TEST(LoopNest, ExprSplitWithTailNone) {
-  KernelScope kernel_scope;
   auto func = [](const ExprHandle& x, const ExprHandle& y) {
     return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
   };
@@ -592,7 +575,6 @@ TEST(LoopNest, ExprSplitWithTailNone) {
 }
 
 TEST(LoopNest, ExprSplitWithMask01) {
-  KernelScope kernel_scope;
   const int M = 26;
   const int N = 5;
   Placeholder a_buf("a", kFloat, {M, N});
@@ -628,7 +610,6 @@ TEST(LoopNest, ExprSplitWithMask01) {
 // Tests the case where we split a loop cleanly multiple times, we should not
 // insert any masks.
 TEST(LoopNest, ExprSplitWithMaskRepeatedNoMask) {
-  KernelScope kernel_scope;
   const int M = 64;
   Placeholder a_buf("a", kFloat, {M});
   Placeholder b_buf("b", kFloat, {M});
@@ -655,8 +636,6 @@ TEST(LoopNest, ExprSplitWithMaskRepeatedNoMask) {
 }
 
 TEST(LoopNest, getLoopAt) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //  for (int i = 0; i < 100; i++) {
   //    for (int j = 0; j < 100; j++) {
@@ -714,7 +693,6 @@ TEST(LoopNest, getLoopAt) {
 }
 
 TEST(LoopNest, TileSimple) {
-  KernelScope kernel_scope;
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   const int M = 64, N = 64;
   Placeholder a_buf("a", kFloat, {M, N});
@@ -760,7 +738,6 @@ TEST(LoopNest, TileSimple) {
 }
 
 TEST(LoopNest, TileWithTails) {
-  KernelScope kernel_scope;
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   const int M = 64, N = 64;
   Placeholder a_buf("a", kFloat, {M, N});
@@ -807,7 +784,6 @@ TEST(LoopNest, TileWithTails) {
 }
 
 TEST(LoopNest, TileInMiddle) {
-  KernelScope kernel_scope;
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   const int M = 8, N = 8, L = 8, K = 8;
   Placeholder a_buf("a", kFloat, {M, N, L, K});
@@ -868,7 +844,6 @@ TEST(LoopNest, TileInMiddle) {
 }
 
 TEST(LoopNest, SplitWithTailWithLoopOptions) {
-  KernelScope kernel_scope;
   const int M = 21;
   Placeholder a_buf("a", kFloat, {M});
   Placeholder b_buf("b", kFloat, {M});
@@ -899,7 +874,6 @@ TEST(LoopNest, SplitWithTailWithLoopOptions) {
 }
 
 TEST(LoopNest, SplitWithMaskWithLoopOptions) {
-  KernelScope kernel_scope;
   const int M = 21;
   Placeholder a_buf("a", kFloat, {M});
   Placeholder b_buf("b", kFloat, {M});
@@ -924,7 +898,6 @@ TEST(LoopNest, SplitWithMaskWithLoopOptions) {
 }
 
 TEST(LoopNest, ScheduleBroadcastAddBuffer) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -973,7 +946,6 @@ TEST(LoopNest, ScheduleBroadcastAddBuffer) {
 }
 
 TEST(LoopNest, ScheduleFunctionCall01) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1030,7 +1002,6 @@ TEST(LoopNest, ScheduleFunctionCall01) {
 }
 
 TEST(LoopNest, ScheduleInlineSimple) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1111,7 +1082,6 @@ static std::string remove_space(const std::string& str) {
 }
 
 void InlineFunc01Helper(const std::vector<std::string>& inline_order) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1230,7 +1200,6 @@ TEST(LoopNest, ScheduleInlineFunc01) {
 
 // Make sure we cache random vars if we should.
 TEST(LoopNest, ScheduleInlineRandom) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1266,7 +1235,6 @@ TEST(LoopNest, ScheduleInlineRandom) {
 
 // Make sure we don't cache random vars that are not being inlined.
 TEST(LoopNest, ScheduleInlineRandomUnrelated) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1303,7 +1271,6 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) {
 // Make sure we generate the right number of random values == the dimensionality
 // of the production tensor.
 TEST(LoopNest, ScheduleInlineRandomLowerDimensions) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1336,7 +1303,6 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) {
 
 // Make sure we don't screw up intrinsics thinking they're rand.
 TEST(LoopNest, ScheduleInlineIntrinsics) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1397,7 +1363,6 @@ TEST(LoopNest, ScheduleInlineIntrinsics) {
 
 // Make sure we can handle rand and non-rand intrinsics.
 TEST(LoopNest, ScheduleInlineRandWithIntrinsics) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1431,7 +1396,6 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) {
 
 // Split a Compute then inline it into another compute.
 TEST(LoopNest, ScheduleSplitAThenInline) {
-  KernelScope kernel_scope;
   Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
   Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
@@ -1446,7 +1410,6 @@ TEST(LoopNest, ScheduleSplitAThenInline) {
 
 // Split a Compute then inline another Compute into it.
 TEST(LoopNest, ScheduleSplitBThenInline) {
-  KernelScope kernel_scope;
   Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
   Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
@@ -1471,7 +1434,6 @@ TEST(LoopNest, ScheduleSplitBThenInline) {
 
 // Split a Compute twice then inline it.
 TEST(LoopNest, ScheduleSplitTwiceThenInline) {
-  KernelScope kernel_scope;
   Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
   Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
@@ -1489,7 +1451,6 @@ TEST(LoopNest, ScheduleSplitTwiceThenInline) {
 
 // Inline a Compute, then split.
 TEST(LoopNest, ScheduleInlineThenSplit) {
-  KernelScope kernel_scope;
   Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
   Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
@@ -1514,7 +1475,6 @@ TEST(LoopNest, ScheduleInlineThenSplit) {
 
 // Split a Compute, inline it, then split the result.
 TEST(LoopNest, ScheduleSplitInlineThenSplit) {
-  KernelScope kernel_scope;
   Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
   Tensor b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) {
@@ -1541,7 +1501,6 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) {
 
 // Oversplit a loop that is simplified out after inlining.
 TEST(LoopNest, ScheduleSplitInlineSimplify) {
-  KernelScope kernel_scope;
   Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) {
     return ExprHandle(4) * i - ExprHandle(2) * i;
   });
@@ -1557,7 +1516,6 @@ TEST(LoopNest, ScheduleSplitInlineSimplify) {
 
 // Inline a Compute with two consumers.
 TEST(LoopNest, ScheduleInlineThreeMixedOnce) {
-  KernelScope kernel_scope;
   Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
   Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
@@ -1587,7 +1545,6 @@ TEST(LoopNest, ScheduleInlineThreeMixedOnce) {
 
 // Inline Compute A into B, then inline B into C.
 TEST(LoopNest, ScheduleInlineThreeMixedTwice) {
-  KernelScope kernel_scope;
   Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
   Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
@@ -1618,7 +1575,6 @@ TEST(LoopNest, ScheduleInlineThreeMixedTwice) {
 
 // Inline a Compute that is both a producer and consumer.
 TEST(LoopNest, ScheduleInlineThreeMixedInner) {
-  KernelScope kernel_scope;
   Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
   Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
@@ -1648,7 +1604,6 @@ TEST(LoopNest, ScheduleInlineThreeMixedInner) {
 
 // Split 3 Computes, then inline the first two into the last.
 TEST(LoopNest, ScheduleInlineThreeMixedSplit) {
-  KernelScope kernel_scope;
   Tensor a =
       Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
   Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
@@ -1672,7 +1627,6 @@ TEST(LoopNest, ScheduleInlineThreeMixedSplit) {
 
 // Check that inlining works for output tensors too
 TEST(LoopNest, ScheduleInlineOutputTensors) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1710,7 +1664,6 @@ TEST(LoopNest, ScheduleInlineOutputTensors) {
 }
 
 TEST(LoopNest, ScheduleFuserStyle) {
-  KernelScope kernel_scope;
   const int kVectorSize = 8;
   const int kVectorCount = 128;
   const int kTotalSize = kVectorSize * kVectorCount;
@@ -1743,7 +1696,6 @@ TEST(LoopNest, ScheduleFuserStyle) {
 }
 
 TEST(LoopNest, ScheduleFuserThreeArg) {
-  KernelScope kernel_scope;
   const int kVectorSize = 8;
   const int kVectorCount = 128;
   const int kTotalSize = kVectorSize * kVectorCount;
@@ -1782,7 +1734,6 @@ TEST(LoopNest, ScheduleFuserThreeArg) {
 }
 
 TEST(LoopNest, ScheduleDynamicShape2D) {
-  KernelScope kernel_scope;
   auto testWithSize = [](int32_t M, int32_t N) {
     VarHandle m("m", kInt);
     VarHandle n("n", kInt);
@@ -1821,7 +1772,6 @@ TEST(LoopNest, LoopNestComputeAt_1) {
   // should be in that loop after the transformation. Also, computation of A
   // should not be inlined into B. Instead, it should be computed into the temp,
   // and the temp should be used in B.
-  KernelScope kernel_scope;
   VarHandle N("N", kInt);
   Tensor A = Compute(
       "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; });
@@ -1867,7 +1817,6 @@ TEST(LoopNest, LoopNestComputeAt_2) {
   //                 p[cy,cx+1] + p[cy+1,cx+1]
   //   }
   // }
-  KernelScope kernel_scope;
 
   const int kW = 16, kH = 16;
   VarHandle W("W", kInt);
@@ -1955,7 +1904,6 @@ TEST(LoopNest, LoopNestComputeAt_3) {
   // D(x,y) = A(x, y+1) + C(x, y)
   //
   // i.e. when 'A' comes to 'D' directly and indirectly through 'C'.
-  KernelScope kernel_scope;
 
   const int kW = 16, kH = 16;
   VarHandle W("W", kInt);
@@ -2056,8 +2004,6 @@ TEST(LoopNest, LoopNestComputeAt_3) {
 using Axis = const VarHandle&;
 
 TEST(LoopNest, Reduce2dComputeAt) {
-  KernelScope kernel_scope;
-
   const int kW = 16, kH = 16;
   VarHandle W("W", kInt);
   VarHandle H("H", kInt);
@@ -2176,7 +2122,6 @@ TEST(LoopNest, DISABLED_Conv1d_NH) {
   // Lots of stuff is broken here.  The computeAt swaps the axes for some odd
   // reason.  Even without that, the index flattener fails due to "dimensions
   // mismatch in flatten index".
-  KernelScope kernel_scope;
 
   int N = 4;
   int H = 256;
@@ -2270,7 +2215,6 @@ class LoopOrderHelper : public IRVisitor {
 };
 
 TEST(LoopNest, LoopNestReorderAxis1) {
-  KernelScope kernel_scope;
   Tensor tensor = Compute(
       "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) {
         return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
@@ -2319,7 +2263,6 @@ TEST(LoopNest, LoopNestReorderAxis1) {
 }
 
 TEST(LoopNest, LoopNestReorderPartialAxes) {
-  KernelScope kernel_scope;
   Tensor tensor = Compute(
       "f",
       {{2, "x"}, {3, "y"}, {4, "z"}},
@@ -2367,7 +2310,6 @@ TEST(LoopNest, LoopNestReorderPartialAxes) {
 }
 
 TEST(LoopNest, LoopNestReorderInternalAxis) {
-  KernelScope kernel_scope;
   Tensor tensor = Compute(
       "f",
       {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}},
@@ -2404,7 +2346,6 @@ TEST(LoopNest, LoopNestReorderInternalAxis) {
 }
 
 TEST(LoopNest, LoopNestReorderEnclosingAxis) {
-  KernelScope kernel_scope;
   Tensor tensor = Compute(
       "f",
       {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}},
@@ -2440,7 +2381,6 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) {
 }
 
 TEST(LoopNest, LoopNestReorderSameAxis) {
-  KernelScope kernel_scope;
   Tensor tensor = Compute(
       "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) {
         return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
@@ -2469,8 +2409,6 @@ TEST(LoopNest, LoopNestReorderExtraStatements) {
    *     Stmt 4
    */
 
-  KernelScope kernel_scope;
-
   Tensor tensor = Compute(
       "f",
       {{2, "x"}, {3, "y"}, {4, "z"}},
@@ -2605,8 +2543,6 @@ void LoopNestReorderTestHelper(
     bool append,
     int index1,
     int index2) {
-  KernelScope kernel_scope;
-
   Tensor c = Compute(
       "5d",
       {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}},
@@ -2722,7 +2658,6 @@ TEST(LoopNest, LoopNestReorderLongStringFull) {
 }
 
 TEST(LoopNest, LoopNestReorderInternalLoopNest) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -2823,7 +2758,6 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) {
 }
 
 TEST(LoopNest, OuterLoopVectorization) {
-  KernelScope kernel_scope;
   Tensor tensor = Compute(
       "f", {{8, "X"}, {8, "y"}}, [](const VarHandle& x, const VarHandle& y) {
         return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
@@ -2851,8 +2785,6 @@ TEST(LoopNest, OuterLoopVectorization) {
 }
 
 TEST(LoopNest, VectorizeLoopNotNormalized) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 10; i++) {
   //     for (int j = 1; j < 5; j++) {
@@ -2876,7 +2808,6 @@ TEST(LoopNest, VectorizeLoopNotNormalized) {
 namespace {
 
 std::string constantUpperBoundLoopIR(int upper_bound_val) {
-  KernelScope kernel_scope;
   ExprHandle upper_bound(upper_bound_val);
   Tensor A = Compute(
       "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; });
@@ -2903,7 +2834,6 @@ TEST(LoopNest, Unroll) {
 }
 
 TEST(LoopNest, UnrollOuter) {
-  KernelScope kernel_scope;
   ExprHandle outer_bound(3);
   ExprHandle inner_bound(4);
   Tensor A = Compute(
@@ -2927,7 +2857,6 @@ TEST(LoopNest, UnrollOuter) {
 }
 
 TEST(LoopNest, UnrollInner) {
-  KernelScope kernel_scope;
   ExprHandle outer_bound(3);
   ExprHandle inner_bound(4);
   Tensor A = Compute(
@@ -2949,7 +2878,6 @@ TEST(LoopNest, UnrollInner) {
 }
 
 TEST(LoopNest, UnrollMultipleStatements) {
-  KernelScope kernel_scope;
   const int kTotalSize = 3;
   BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt);
   BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kInt);
@@ -2975,8 +2903,6 @@ TEST(LoopNest, UnrollMultipleStatements) {
 }
 
 TEST(LoopNest, UnrollNonLiteralConstantBounds) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 2 - 1; i < 12 / 3; i++) {
   //     for (int j = 0; j < 4; j++) {
@@ -3021,7 +2947,6 @@ TEST(LoopNest, UnrollEmpty) {
 }
 
 TEST(LoopNest, NoUnroll) {
-  KernelScope kernel_scope;
   VarHandle upper_bound("N", kInt);
   Tensor A = Compute(
       "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; });
@@ -3033,7 +2958,6 @@ TEST(LoopNest, NoUnroll) {
 }
 
 TEST(LoopNest, UnrollWithLet) {
-  KernelScope kernel_scope;
   const int kTotalSize = 3;
   BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kInt);
   BufHandle b_buf("B", {ExprHandle(kTotalSize)}, kInt);
@@ -3076,8 +3000,6 @@ TEST(LoopNest, UnrollWithLet) {
 }
 
 TEST(LoopNest, IsNormalized) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 50; i < 100; i++) {
   //     A[i] = B[i];
@@ -3100,8 +3022,6 @@ TEST(LoopNest, IsNormalized) {
 }
 
 TEST(LoopNest, NormalizeStartPositive) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int x = 50; x < 100; x++) {
   //     A[x] = B[x];
@@ -3132,8 +3052,6 @@ TEST(LoopNest, NormalizeStartPositive) {
 }
 
 TEST(LoopNest, NormalizeStartNegative) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int x = -50; x < 100; x++) {
   //     A[x + 50] = B[x + 50];
@@ -3164,8 +3082,6 @@ TEST(LoopNest, NormalizeStartNegative) {
 }
 
 TEST(LoopNest, NormalizeStartZero) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int x = 0; x < 100; x++) {
   //     A[x] = B[x];
@@ -3198,8 +3114,6 @@ TEST(LoopNest, NormalizeStartZero) {
 }
 
 TEST(LoopNest, NormalizeStartVariable) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int x = y; x < 100; x++) {
   //     A[x] = B[x];
@@ -3232,8 +3146,6 @@ TEST(LoopNest, NormalizeStartVariable) {
 }
 
 TEST(LoopNest, NormalizeOnNestedOuterLoop) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int x = 50; x < 100; x++) {
   //     for (int y = 10; y < 100; y++) {
@@ -3266,8 +3178,6 @@ TEST(LoopNest, NormalizeOnNestedOuterLoop) {
 }
 
 TEST(LoopNest, NormalizeOnNestedInnerLoop) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int x = 50; x < 100; x++) {
   //     for (int y = 10; y < 100; y++) {
@@ -3300,8 +3210,6 @@ TEST(LoopNest, NormalizeOnNestedInnerLoop) {
 }
 
 TEST(LoopNest, NormalizeAndSplitWithTail) {
-  KernelScope kernel_scope;
-
   // Create a dummy tensor to construct LoopNest.
   ExprHandle n(100);
   Placeholder a(BufHandle("a", {n}, kFloat));
@@ -3349,8 +3257,6 @@ TEST(LoopNest, NormalizeAndSplitWithTail) {
 }
 
 TEST(LoopNest, FlattenSimpleLoopNest2D) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 10; i++) {
   //     for (int j = 0; j < 5; j++) {
@@ -3392,8 +3298,6 @@ TEST(LoopNest, FlattenSimpleLoopNest2D) {
 }
 
 TEST(LoopNest, FlattenSimpleLoopNest3D) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 10; i++) {
   //     for (int j = 0; j < 5; j++) {
@@ -3439,8 +3343,6 @@ TEST(LoopNest, FlattenSimpleLoopNest3D) {
 }
 
 TEST(LoopNest, FlattenLoopNestAfterNormalize) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 2; i < 10; i++) {
   //     for (int j = 3; j < 15; j++) {
@@ -3482,8 +3384,6 @@ TEST(LoopNest, FlattenLoopNestAfterNormalize) {
 }
 
 TEST(LoopNest, FlattenLoopNestWithNonLiteralConstantBounds) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 15-5; i++) {
   //     for (int j = 0; j < 20/4; j++) {
@@ -3524,8 +3424,6 @@ TEST(LoopNest, FlattenLoopNestWithNonLiteralConstantBounds) {
 }
 
 TEST(LoopNest, FlattenImperfectLoopNest) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 10; i++) {
   //     A[i, i] = 0;
@@ -3555,8 +3453,6 @@ TEST(LoopNest, FlattenImperfectLoopNest) {
 }
 
 TEST(LoopNest, FlattenReductionLoopNest) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 10; i++) {
   //     S[i] = 0;
@@ -3588,7 +3484,6 @@ TEST(LoopNest, FlattenReductionLoopNest) {
 }
 
 TEST(LoopNest, FlattenReductionLoopNestFromTensor) {
-  KernelScope kernel_scope;
   const int M = 3;
   const int N = 7;
   VarHandle m("m", kInt);
@@ -3608,8 +3503,6 @@ TEST(LoopNest, FlattenReductionLoopNestFromTensor) {
 }
 
 TEST(LoopNest, FlattenIncorrectLoopsAsInput) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 10; i++) {
   //     for (int j = 0; j < 5; j++) {
@@ -3648,7 +3541,6 @@ TEST(LoopNest, FlattenIncorrectLoopsAsInput) {
 }
 
 TEST(LoopNest, DetectInlineRankMismatch) {
-  KernelScope kernel_scope;
   const int kTotalSize = 8;
 
   Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
@@ -3666,8 +3558,6 @@ TEST(LoopNest, DetectInlineRankMismatch) {
 }
 
 TEST(LoopNest, CacheReadsSimple) {
-  KernelScope kernel_scope;
-
   Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
@@ -3734,8 +3624,6 @@ TEST(LoopNest, CacheReadsSimple) {
 }
 
 TEST(LoopNest, CacheReadsOuter) {
-  KernelScope kernel_scope;
-
   Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
@@ -3782,8 +3670,6 @@ TEST(LoopNest, CacheReadsOuter) {
 }
 
 TEST(LoopNest, CacheReadsInternal) {
-  KernelScope kernel_scope;
-
   Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
@@ -3829,8 +3715,6 @@ TEST(LoopNest, CacheReadsInternal) {
 }
 
 TEST(LoopNest, CacheReadsInner) {
-  KernelScope kernel_scope;
-
   Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
@@ -3877,8 +3761,6 @@ TEST(LoopNest, CacheReadsInner) {
 }
 
 TEST(LoopNest, CacheWritesSimple) {
-  KernelScope kernel_scope;
-
   Tensor A = Compute(
       "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
         return i * j;
@@ -3929,7 +3811,6 @@ TEST(LoopNest, CacheWritesSimple) {
 }
 
 TEST(LoopNest, DeadStoreElimination) {
-  KernelScope kernel_scope;
   VarHandle y("y", kInt);
   VarHandle x("x_tail", kInt);
   BufHandle f("f", {26, 5}, kInt);
@@ -3970,7 +3851,6 @@ TEST(LoopNest, DeadStoreElimination) {
 }
 
 TEST(LoopNest, DeadStoreEliminationWithIntermediates) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   VarHandle z("z", kInt);
@@ -4017,8 +3897,6 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) {
 }
 
 TEST(LoopNest, CompoundTensorSimple) {
-  KernelScope kernel_scope;
-
   BufHandle a_buf("A", {10, 5}, kInt);
   VarHandle i("i", kInt);
   VarHandle j("j", kInt);
@@ -4056,7 +3934,6 @@ TEST(LoopNest, CompoundTensorSimple) {
 }
 
 TEST(LoopNest, InlineConstantIndex) {
-  KernelScope kernel_scope;
   const int N = 10;
   Placeholder x_buf("a", kFloat, {1, N, 1});
   Tensor y = Compute(
@@ -4078,8 +3955,6 @@ TEST(LoopNest, InlineConstantIndex) {
 }
 
 TEST(LoopNest, CompoundTensorUsed) {
-  KernelScope kernel_scope;
-
   BufHandle a_buf("A", {10, 5}, kInt);
   VarHandle i("i", kInt);
   VarHandle j("j", kInt);
@@ -4124,8 +3999,6 @@ TEST(LoopNest, CompoundTensorUsed) {
 }
 
 TEST(LoopNest, InlineFromLoad) {
-  KernelScope kernel_scope;
-
   constexpr int N = 1024;
   BufHandle a("A", {N}, kInt);
   BufHandle b("B", {N}, kInt);
@@ -4150,8 +4023,6 @@ TEST(LoopNest, InlineFromLoad) {
 }
 
 TEST(LoopNest, OptimizeConditionalsSimple) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = IfThenElse(i<5 ? 1 : 0, B[i], C[i-5])
@@ -4192,8 +4063,6 @@ TEST(LoopNest, OptimizeConditionalsSimple) {
 }
 
 TEST(LoopNest, OptimizeConditionalsNestedConditions) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = IfThenElse(i<10, IfThenElse(i<5, B[i], C[i-5]), D[i-10])
@@ -4241,8 +4110,6 @@ TEST(LoopNest, OptimizeConditionalsNestedConditions) {
 }
 
 TEST(LoopNest, OptimizeConditionalsMultipleStores) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = IfThenElse(i<5 ? 1 : 0, B[i], C[i-5])
@@ -4301,8 +4168,6 @@ TEST(LoopNest, OptimizeConditionalsMultipleStores) {
 }
 
 TEST(LoopNest, OptimizeConditionalsMultipleStoresInOneLoop) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 50; i++) {
   //     A[i] = IfThenElse(i<5 ? 1 : 0, B[i], C[i-5])
@@ -4355,8 +4220,6 @@ TEST(LoopNest, OptimizeConditionalsMultipleStoresInOneLoop) {
 }
 
 TEST(LoopNest, OptimizeConditionalsOuterLoopVar) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 100; j++) {
@@ -4400,8 +4263,6 @@ TEST(LoopNest, OptimizeConditionalsOuterLoopVar) {
 }
 
 TEST(LoopNest, OptimizeConditionalsCompValuesNotOrdered) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = IfThenElse(i<5, IfThenElse(i<10, B[i], C[i-5]), D[i-10])
@@ -4441,8 +4302,6 @@ TEST(LoopNest, OptimizeConditionalsCompValuesNotOrdered) {
 }
 
 TEST(LoopNest, OptimizeConditionalsCompValuesNotConstants) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = IfThenElse(i<N, IfThenElse(i<5, B[i], C[i-5]), D[i-10])
@@ -4483,8 +4342,6 @@ TEST(LoopNest, OptimizeConditionalsCompValuesNotConstants) {
 }
 
 TEST(LoopNest, OptimizeConditionalsInvalidCondition) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = IfThenElse(i<10, IfThenElse(i>5, B[i], C[i-5]), D[i-10])
@@ -4524,8 +4381,6 @@ TEST(LoopNest, OptimizeConditionalsInvalidCondition) {
 }
 
 TEST(LoopNest, OptimizeConditionalsInvalidCondition2) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = IfThenElse(10<i, IfThenElse(i<5, B[i], C[i-5]), D[i-10])
@@ -4566,8 +4421,6 @@ TEST(LoopNest, OptimizeConditionalsInvalidCondition2) {
 }
 
 TEST(LoopNest, OptimizeConditionalsInvalidCondition3) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = IfThenElse(i<10, IfThenElse(k<5, B[i], C[i-5]), D[i-10])
@@ -4609,8 +4462,6 @@ TEST(LoopNest, OptimizeConditionalsInvalidCondition3) {
 }
 
 TEST(LoopNest, OptimizeConditionalsInvalidCondition4) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = IfThenElse(k<10, IfThenElse(k<5, B[i], C[i-5]), D[i-10])
@@ -4652,8 +4503,6 @@ TEST(LoopNest, OptimizeConditionalsInvalidCondition4) {
 }
 
 TEST(LoopNest, OptimizeConditionalsNotNormalized) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 2; i < 20; i++) {
   //     A[i] = IfThenElse(i<5 ? 1 : 0, B[i], C[i-5])
@@ -4761,7 +4610,6 @@ static void checkColReduce(StmtPtr s, Placeholder& p, Tensor t) {
 }
 
 TEST(LoopNest, ColReduceSplitTailEvenReorder) {
-  KernelScope kernel_scope;
   constexpr int M = 76, N = 128;
   auto p = colReduce(M, N);
   StmtPtr s = splitTailReorder(p.second);
@@ -4784,7 +4632,6 @@ TEST(LoopNest, ColReduceSplitTailEvenReorder) {
 }
 
 TEST(LoopNest, ColReduceSplitTailUnevenReorder) {
-  KernelScope kernel_scope;
   constexpr int M = 76, N = 100;
   auto p = colReduce(M, N);
   StmtPtr s = splitTailReorder(p.second);
@@ -4810,7 +4657,6 @@ TEST(LoopNest, ColReduceSplitTailUnevenReorder) {
 }
 
 TEST(LoopNest, ColReduceSplitMaskEvenReorder) {
-  KernelScope kernel_scope;
   constexpr int M = 76, N = 128;
   auto p = colReduce(M, N);
   StmtPtr s = splitMaskReorder(p.second);
@@ -4818,7 +4664,6 @@ TEST(LoopNest, ColReduceSplitMaskEvenReorder) {
 }
 
 TEST(LoopNest, ColReduceSplitMaskUnevenReorder) {
-  KernelScope kernel_scope;
   constexpr int M = 76, N = 100;
   auto p = colReduce(M, N);
   StmtPtr s = splitMaskReorder(p.second);
@@ -4826,8 +4671,6 @@ TEST(LoopNest, ColReduceSplitMaskUnevenReorder) {
 }
 
 TEST(LoopNest, ReorderAxisWithMultipleConds) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     if i > 5 {
@@ -4867,7 +4710,6 @@ TEST(LoopNest, ReorderAxisWithMultipleConds) {
 }
 
 TEST(LoopNest, VectorizeUse) {
-  KernelScope kernel_scope;
   constexpr int N = 8;
   Placeholder a("a", kFloat, {N});
   Tensor b = Compute(
@@ -4898,8 +4740,6 @@ const char* int64Loop = R"IR(
 )IR";
 
 TEST(LoopNest, Int64Direct) {
-  KernelScope kernel_scope;
-
   constexpr int64_t N = 12;
   Placeholder a("a", kLong, {N});
   Placeholder b("b", kLong, {N});
@@ -4912,8 +4752,6 @@ TEST(LoopNest, Int64Direct) {
 }
 
 TEST(LoopNest, Int64Compute) {
-  KernelScope kernel_scope;
-
   constexpr int64_t N = 12;
   Placeholder a("a", kLong, {N});
   Tensor b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) {
@@ -4928,8 +4766,6 @@ TEST(LoopNest, Int64Compute) {
 }
 
 TEST(LoopNest, DistributeLoopWithAllStmtsAsPivots) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = 0;
@@ -4990,8 +4826,6 @@ TEST(LoopNest, DistributeLoopWithAllStmtsAsPivots) {
 }
 
 TEST(LoopNest, DistributeLoopWithOneStmtAsPivot) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = 0;
@@ -5049,8 +4883,6 @@ TEST(LoopNest, DistributeLoopWithOneStmtAsPivot) {
 }
 
 TEST(LoopNest, DistributeLoopWithoutAnyPivot) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = 0;
@@ -5111,8 +4943,6 @@ TEST(LoopNest, DistributeLoopWithoutAnyPivot) {
 }
 
 TEST(LoopNest, DistributeLoopOverInnerLoops) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = 0;
@@ -5170,8 +5000,6 @@ TEST(LoopNest, DistributeLoopOverInnerLoops) {
 }
 
 TEST(LoopNest, DistributeLoopAndParentsWithoutAnyPivot) {
-  KernelScope kernel_scope;
-
   // Input IR:
   // for (int m = 0; m < 50; m++) {
   //   for (int i = 0; i < 20; i++) {
@@ -5281,8 +5109,6 @@ TEST(LoopNest, DistributeLoopAndParentsWithoutAnyPivot) {
 }
 
 TEST(LoopNest, fuseLoopsSimple) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 0; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -5317,8 +5143,6 @@ TEST(LoopNest, fuseLoopsSimple) {
 }
 
 TEST(LoopNest, fuseLoopsMultiple) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 100; i++) {
   //     A[i+100] = 20 + i;
@@ -5360,8 +5184,6 @@ TEST(LoopNest, fuseLoopsMultiple) {
 }
 
 TEST(LoopNest, fuseLoopsNested) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int m = 0; m < 20; m++) {
   //     A[m] = 0;
@@ -5422,8 +5244,6 @@ TEST(LoopNest, fuseLoopsNested) {
 }
 
 TEST(LoopNest, fuseLoopsNested2D) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 100; j++) {
@@ -5482,8 +5302,6 @@ TEST(LoopNest, fuseLoopsNested2D) {
 }
 
 TEST(LoopNest, fuseLoopsNested2DInner) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 100; j++) {
@@ -5524,8 +5342,6 @@ TEST(LoopNest, fuseLoopsNested2DInner) {
 }
 
 TEST(LoopNest, fuseLoopsDifferentStopBounds) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 0; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -5547,8 +5363,6 @@ TEST(LoopNest, fuseLoopsDifferentStopBounds) {
 }
 
 TEST(LoopNest, fuseLoopsDifferentStartBounds) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 0; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -5570,8 +5384,6 @@ TEST(LoopNest, fuseLoopsDifferentStartBounds) {
 }
 
 TEST(LoopNest, fuseLoopsNotContiguous) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 0; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -5595,8 +5407,6 @@ TEST(LoopNest, fuseLoopsNotContiguous) {
 }
 
 TEST(LoopNest, fuseLoopsWithDifferentParents) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 50; i++) {
   //     for (int j = 0; j < 100; j++) {
@@ -5624,8 +5434,6 @@ TEST(LoopNest, fuseLoopsWithDifferentParents) {
 }
 
 TEST(LoopNest, fuseLoopsWithVariableBounds) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 0; j < N; j++) {
   //     A[j] = 10 * j;
@@ -5662,8 +5470,6 @@ TEST(LoopNest, fuseLoopsWithVariableBounds) {
 }
 
 TEST(LoopNest, fuseLoopsWithExprBounds) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 0; j < M + N; j++) {
   //     A[j] = 10 * j;
@@ -5700,8 +5506,6 @@ TEST(LoopNest, fuseLoopsWithExprBounds) {
 }
 
 TEST(LoopNest, fuseLoopsWithDifferentExprBounds) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = M; j < N * 2; j++) {
   //     A[j] = 10 * j;
@@ -5739,8 +5543,6 @@ TEST(LoopNest, fuseLoopsWithDifferentExprBounds) {
 }
 
 TEST(LoopNest, fuseLoopsWithNonOverlappingBufferAccesses) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 10; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -5776,8 +5578,6 @@ TEST(LoopNest, fuseLoopsWithNonOverlappingBufferAccesses) {
 }
 
 TEST(LoopNest, fuseLoopsWithNonOverlapping2DBufferAccesses) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 100; j++) {
@@ -5826,8 +5626,6 @@ TEST(LoopNest, fuseLoopsWithNonOverlapping2DBufferAccesses) {
 }
 
 TEST(LoopNest, fuseLoopsWithReductions) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     A[i] = 0
@@ -5874,8 +5672,6 @@ TEST(LoopNest, fuseLoopsWithReductions) {
 }
 
 TEST(LoopNest, fuseLoopsWith2DReductions) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 50; j++) {
@@ -5934,8 +5730,6 @@ TEST(LoopNest, fuseLoopsWith2DReductions) {
 }
 
 TEST(LoopNest, fuseLoopsWithComplexIndices) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 20; j++) {
@@ -5982,8 +5776,6 @@ TEST(LoopNest, fuseLoopsWithComplexIndices) {
 }
 
 TEST(LoopNest, fuseLoopsWithMixedLoopVarsAsIndices) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 20; j++) {
@@ -6013,8 +5805,6 @@ TEST(LoopNest, fuseLoopsWithMixedLoopVarsAsIndices) {
 }
 
 TEST(LoopNest, fuseLoopsWithTranspose) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 20; j++) {
@@ -6044,8 +5834,6 @@ TEST(LoopNest, fuseLoopsWithTranspose) {
 }
 
 TEST(LoopNest, fuseLoopsThatViolateDependencies1) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 10; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -6067,8 +5855,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies1) {
 }
 
 TEST(LoopNest, fuseLoopsThatViolateDependencies2) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 10; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -6090,8 +5876,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies2) {
 }
 
 TEST(LoopNest, fuseLoopsThatViolateDependencies3) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int m = 0; m < 20; m++) {
   //     A[m] = 0;
@@ -6135,8 +5919,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies3) {
 }
 
 TEST(LoopNest, fuseLoopsThatViolateDependencies4) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 100; j++) {
@@ -6179,8 +5961,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies4) {
 }
 
 TEST(LoopNest, fuseLoopsThatViolateDependencies5) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 100; j++) {
@@ -6209,8 +5989,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies5) {
 }
 
 TEST(LoopNest, fuseLoopsThatViolateDependencies6) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int j = 0; j < 100; j++) {
   //     A[j] = 10 * j;
@@ -6237,8 +6015,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies6) {
 }
 
 TEST(LoopNest, fuseLoopsThatViolateDependencies7) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int k = 0; k < 100; k++) {
   //     B[k] = 20 * A[99-k];
@@ -6265,8 +6041,6 @@ TEST(LoopNest, fuseLoopsThatViolateDependencies7) {
 }
 
 TEST(LoopNest, areLoopsPerfectlyNested) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 30; j++) {
@@ -6309,8 +6083,6 @@ TEST(LoopNest, areLoopsPerfectlyNested) {
 }
 
 TEST(LoopNest, reorderNestedLoops2D) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 30; j++) {
@@ -6335,8 +6107,6 @@ TEST(LoopNest, reorderNestedLoops2D) {
 }
 
 TEST(LoopNest, reorderNestedLoops3D) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 30; j++) {
@@ -6366,8 +6136,6 @@ TEST(LoopNest, reorderNestedLoops3D) {
 }
 
 TEST(LoopNest, reorderNestedLoops4D) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 30; j++) {
@@ -6405,8 +6173,6 @@ TEST(LoopNest, reorderNestedLoops4D) {
 }
 
 TEST(LoopNest, reorderTrivialPermutation) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 30; j++) {
@@ -6436,8 +6202,6 @@ TEST(LoopNest, reorderTrivialPermutation) {
 }
 
 TEST(LoopNest, reorderInvalidPermutations) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 30; j++) {
@@ -6475,8 +6239,6 @@ TEST(LoopNest, reorderInvalidPermutations) {
 }
 
 TEST(LoopNest, reorderInvalidLoopNest) {
-  KernelScope kernel_scope;
-
   // Input IR:
   //   for (int i = 0; i < 20; i++) {
   //     for (int j = 0; j < 30; j++) {
@@ -6518,8 +6280,6 @@ TEST(LoopNest, reorderInvalidLoopNest) {
 }
 
 TEST(LoopNest, compressBufferSimple) {
-  KernelScope kernel_scope;
-
   // Input IR:
   // for (int i = 0; i < 100; ++i) {
   //   for (int j = 0; j < 200; ++j) {
@@ -6564,8 +6324,6 @@ TEST(LoopNest, compressBufferSimple) {
 }
 
 TEST(LoopNest, compressBufferMultipleDims) {
-  KernelScope kernel_scope;
-
   // Input IR:
   // for (int i = 0; i < 100; ++i) {
   //   for (int j = 0; j < 200; ++j) {
@@ -6604,8 +6362,6 @@ TEST(LoopNest, compressBufferMultipleDims) {
 }
 
 TEST(LoopNest, compressBufferMultipleDims2) {
-  KernelScope kernel_scope;
-
   // Input IR:
   // for (int i = 0; i < 100; ++i) {
   //   for (int j = 0; j < 200; ++j) {
@@ -6654,8 +6410,6 @@ TEST(LoopNest, compressBufferMultipleDims2) {
 }
 
 TEST(LoopNest, compressBufferDifferentOrderIndices) {
-  KernelScope kernel_scope;
-
   // Input IR:
   // for (int i = 0; i < 100; ++i) {
   //   for (int j = 0; j < 200; ++j) {
@@ -6700,8 +6454,6 @@ TEST(LoopNest, compressBufferDifferentOrderIndices) {
 }
 
 TEST(LoopNest, compressBufferVariableBounds) {
-  KernelScope kernel_scope;
-
   // Input IR:
   // for (int i = 0; i < M; ++i) {
   //   for (int j = 0; j < N; ++j) {
@@ -6749,8 +6501,6 @@ TEST(LoopNest, compressBufferVariableBounds) {
 }
 
 TEST(LoopNest, compressBufferNoCommonParentLoops) {
-  KernelScope kernel_scope;
-
   // Input IR:
   // for (int i = 0; i < 100; ++i) {
   //   for (int j = 0; j < 200; ++j) {
@@ -6800,8 +6550,6 @@ TEST(LoopNest, compressBufferNoCommonParentLoops) {
 }
 
 TEST(LoopNest, compressBufferIndicesMixed) {
-  KernelScope kernel_scope;
-
   // Input IR:
   // for (int i = 0; i < 100; ++i) {
   //   for (int j = 0; j < 200; ++j) {
@@ -6848,8 +6596,6 @@ TEST(LoopNest, compressBufferIndicesMixed) {
 }
 
 TEST(LoopNest, compressMultipleBuffers) {
-  KernelScope kernel_scope;
-
   // Input IR:
   // for (int i = 0; i < 100; ++i) {
   //   for (int j = 0; j < 200; ++j) {
diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp
index db37b66876976..c9990dcacfb41 100644
--- a/test/cpp/tensorexpr/test_memdependency.cpp
+++ b/test/cpp/tensorexpr/test_memdependency.cpp
@@ -19,8 +19,6 @@ using namespace torch::jit::tensorexpr;
 // larger and fully encloses B, while ContainedOrEqual is the reverse. Equal
 // ranges are ContainedOrEqual.
 TEST(MemDependency, BoundOverlap) {
-  KernelScope kernel_scope;
-
   using namespace analysis;
 
   auto CB = [](int s, int e) {
@@ -79,7 +77,6 @@ TEST(MemDependency, BoundOverlap) {
 }
 
 TEST(MemDependency, BoundOverlapSymbolic) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   VarHandle z("z", kInt);
@@ -116,8 +113,6 @@ TEST(MemDependency, BoundOverlapSymbolic) {
 // This uses boundOverlap on each dimension and return the "lowest" kind of
 // overlap.
 TEST(MemDependency, BoundOverlapMultiDim) {
-  KernelScope kernel_scope;
-
   using namespace analysis;
 
   auto CB = [](int s, int e) {
@@ -189,8 +184,6 @@ TEST(MemDependency, BoundOverlapMultiDim) {
 // Test the helper we use to subtract bounds: returns the regions(s) of A which
 // remain after removing the region of B.
 TEST(MemDependency, BoundSubtract) {
-  KernelScope kernel_scope;
-
   using namespace analysis;
 
   auto CB = [](int s, int e) {
@@ -224,7 +217,6 @@ TEST(MemDependency, BoundSubtract) {
 }
 
 TEST(MemDependency, BoundSubtractSymbolic) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   VarHandle z("z", kInt);
@@ -273,8 +265,6 @@ TEST(MemDependency, BoundSubtractSymbolic) {
 // Tests the helper function that does subtraction, but for multi dimensional
 // indices bounds.
 TEST(MemDependency, BoundSubtractMultiDim) {
-  KernelScope kernel_scope;
-
   using namespace analysis;
 
   auto CB = [](int s, int e) {
@@ -335,7 +325,6 @@ TEST(MemDependency, BoundSubtractMultiDim) {
 // Tests the multi dimensional subtraction code for bounds that cannot be fully
 // materialized.
 TEST(MemDependency, BoundSubtractMultiDimSymbolic) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
 
@@ -403,7 +392,6 @@ TEST(MemDependency, BoundSubtractMultiDimSymbolic) {
 
 // Simple check that the analyzer does anything at all...
 TEST(MemDependency, MemDependencyCheckerSimple) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   BufHandle b("B", {1}, kInt);
 
@@ -429,7 +417,6 @@ TEST(MemDependency, MemDependencyCheckerSimple) {
 
 // Check that there is a difference between direct and indirect dependence.
 TEST(MemDependency, MemDependencyCheckerMultiStmt) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   BufHandle b("B", {1}, kInt);
   BufHandle c("C", {1}, kInt);
@@ -466,7 +453,6 @@ TEST(MemDependency, MemDependencyCheckerMultiStmt) {
 
 // Verify that we do filter writes that are totally overlapped by later writes.
 TEST(MemDependency, MemDependencyCheckerOverlap) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   BufHandle b("B", {1}, kInt);
 
@@ -499,7 +485,6 @@ TEST(MemDependency, MemDependencyCheckerOverlap) {
 // Verify that bounds match loop iterations, and that dependencies progress
 // across loop scopes.
 TEST(MemDependency, MemDependencyCheckerLoop) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   BufHandle b("B", {1}, kInt);
   VarHandle x("x", kInt);
@@ -541,7 +526,6 @@ TEST(MemDependency, MemDependencyCheckerLoop) {
 
 // Reductions should promote dependencies as well.
 TEST(MemDependency, MemDependencyCheckerLoopReduce) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -600,7 +584,6 @@ TEST(MemDependency, MemDependencyCheckerLoopReduce) {
 
 // Lowering a reduction doesn't affect dependency analysis.
 TEST(MemDependency, MemDependencyCheckerLoopReduceExpanded) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -654,7 +637,6 @@ TEST(MemDependency, MemDependencyCheckerLoopReduceExpanded) {
 
 // Can determine dependencies of outputs, through to inputs.
 TEST(MemDependency, MemDependencyCheckerInputsOutputs) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -707,7 +689,6 @@ TEST(MemDependency, MemDependencyCheckerInputsOutputs) {
 
 // Can tell if an output does not depend on an input.
 TEST(MemDependency, MemDependencyCheckerOutputDoesntDepend) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -746,7 +727,6 @@ TEST(MemDependency, MemDependencyCheckerOutputDoesntDepend) {
 // Verify different loop extents produce accesses with different bounds, and
 // that later accesses find dependencies that overlap their entire bound range.
 TEST(MemDependency, MemDependencyCheckerLoopBounds) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   BufHandle c("C", {10}, kInt);
@@ -928,7 +908,6 @@ TEST(MemDependency, MemDependencyCheckerLoopBounds) {
 
 // Verify that we can still infer bounds when the loop var is offset.
 TEST(MemDependency, MemDependencyCheckerLoopBoundsIndexShift) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -1111,7 +1090,6 @@ TEST(MemDependency, MemDependencyCheckerLoopBoundsIndexShift) {
 // iteration. This is affected by whether or not we can trust the execution
 // order of the loop.
 TEST(MemDependency, MemDependencyCheckerLoopSelfDependency) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   VarHandle x("x", kInt);
@@ -1749,7 +1727,6 @@ TEST(MemDependency, MemDependencyCheckerLoopSelfDependency) {
 // TODO: actually this only works because of the size of the ranges, revist this
 // test after strided overlap is implemented.
 TEST(MemDependency, MemDependencyCheckerLoopDistinctStrides) {
-  KernelScope kernel_scope;
   BufHandle a("A", {20}, kInt);
   BufHandle b("B", {20}, kInt);
   VarHandle x("x", kInt);
@@ -1775,7 +1752,6 @@ TEST(MemDependency, MemDependencyCheckerLoopDistinctStrides) {
 
 /* TODO(nickg) - this test will fail due to the lack of stride math in Bound
 TEST(MemDependency, MemDependencyCheckerLoopDistinctStrides) {
-  KernelScope kernel_scope;
   BufHandle a("A", {20}, kInt);
   BufHandle b("B", {20}, kInt);
   BufHandle c("C", {10}, kInt);
@@ -1806,7 +1782,6 @@ TEST(MemDependency, MemDependencyCheckerLoopDistinctStrides) {
 
 // analysis on Stmts using Cond.
 TEST(MemDependency, MemDependencyCheckerLoopBoundsCond) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   BufHandle c("C", {10}, kInt);
@@ -2002,7 +1977,6 @@ TEST(MemDependency, MemDependencyCheckerLoopBoundsCond) {
 
 // Stmts using IfThenElse.
 TEST(MemDependency, MemDependencyCheckerIfThenElse) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   BufHandle c("C", {10}, kInt);
@@ -2112,7 +2086,6 @@ TEST(MemDependency, MemDependencyCheckerIfThenElse) {
 
 // Cutting a loop with single elem writes
 TEST(MemDependency, MemDependencyCheckerCutLoop) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -2194,7 +2167,6 @@ TEST(MemDependency, MemDependencyCheckerCutLoop) {
 
 // Dynamic shapes (load in indices).
 TEST(MemDependency, MemDependencyCheckerDynamicShapes) {
-  KernelScope kernel_scope;
   BufHandle a("A", {100}, kInt);
   BufHandle b("B", {100}, kInt);
   BufHandle c("C", {100}, kInt);
@@ -2436,7 +2408,6 @@ TEST(MemDependency, MemDependencyCheckerDynamicShapes) {
 
 // Verify multi dimensional bounds work.
 TEST(MemDependency, MemDependencyCheckerMultiDim) {
-  KernelScope kernel_scope;
   int M = 10, N = 9, K = 12;
   BufHandle a("A", {M, N, K}, kInt);
   BufHandle b("B", {M, N, K}, kInt);
@@ -2703,8 +2674,6 @@ TEST(MemDependency, MemDependencyCheckerMultiDim) {
 
 // Various tests using the external Compute/Reduce API.
 TEST(MemDependency, MemDependencyCheckerComputeAPI) {
-  KernelScope kernel_scope;
-
   using namespace analysis;
 
   /* for (int m = 0; m < 4; m++) {
@@ -2756,8 +2725,6 @@ TEST(MemDependency, MemDependencyCheckerComputeAPI) {
 }
 
 TEST(MemDependency, MemDependencyCheckerComputeInline) {
-  KernelScope kernel_scope;
-
   using namespace analysis;
 
   /* for (int m = 0; m < 4; m++) {
@@ -2803,8 +2770,6 @@ TEST(MemDependency, MemDependencyCheckerComputeInline) {
 }
 
 TEST(MemDependency, MemDependencyCheckerComputeSplit) {
-  KernelScope kernel_scope;
-
   using namespace analysis;
   // Split an axis, so the number of loops != the number of dimensions.
 
@@ -2851,8 +2816,6 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) {
 }
 
 TEST(MemDependency, MemDependencyCheckerComputeReorder) {
-  KernelScope kernel_scope;
-
   using namespace analysis;
   // Reorder an axis, so the loop order doesn't match the indexing order.
 
@@ -2900,8 +2863,6 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) {
 }
 
 TEST(MemDependency, MemDependencyCheckerComputeReduce) {
-  KernelScope kernel_scope;
-
   using namespace analysis;
   /* for (int l2 = 0; l2 < 2; l2++) {
    *   for (int n1 = 0; n1 < 3; n1++) {
@@ -2955,7 +2916,6 @@ TEST(MemDependency, MemDependencyCheckerComputeReduce) {
 }
 
 TEST(MemDependency, MemDependencyCheckerComputeGEMM) {
-  KernelScope kernel_scope;
   int M = 1024;
   int N = 1024;
   int K = 2048;
diff --git a/test/cpp/tensorexpr/test_ops.cpp b/test/cpp/tensorexpr/test_ops.cpp
index 122a498276f24..586c093e213d1 100644
--- a/test/cpp/tensorexpr/test_ops.cpp
+++ b/test/cpp/tensorexpr/test_ops.cpp
@@ -20,8 +20,6 @@ std::unique_ptr<SimpleIREvaluator> compile(
 }
 
 TEST(Ops, Sum) {
-  KernelScope ks;
-
   std::vector<IntList> testDims = {{0}, {1}, {0, 1}};
   for (auto const& dims : testDims) {
     constexpr int M = 8;
diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp
index 6620ef2686a94..411b58db57f9e 100644
--- a/test/cpp/tensorexpr/test_reductions.cpp
+++ b/test/cpp/tensorexpr/test_reductions.cpp
@@ -24,7 +24,6 @@ namespace jit {
 using namespace torch::jit::tensorexpr;
 
 TEST(Reductions, ReduceSum0D_1) {
-  KernelScope kernel_scope;
   const int M = 10;
 
   Placeholder b(BufHandle("b", {M}, kFloat));
@@ -50,7 +49,6 @@ TEST(Reductions, ReduceSum0D_1) {
 }
 
 TEST(Reductions, ReduceSum0D_2) {
-  KernelScope kernel_scope;
   const int M = 10;
 
   Placeholder b(BufHandle("b", {}, kFloat));
@@ -73,8 +71,6 @@ TEST(Reductions, ReduceSum0D_2) {
 
 // Sum an array to a single value.
 TEST(Reductions, ReduceSum1D) {
-  KernelScope kernel_scope;
-
   Placeholder b(BufHandle("b", {10}, kFloat));
   std::vector<float> in(10);
   for (int j = 0; j < 10; ++j) {
@@ -96,8 +92,6 @@ TEST(Reductions, ReduceSum1D) {
 }
 // Sum a 2D tensor to a 1D tensor with dynamic shapes.
 TEST(Reductions, ReduceSum2D) {
-  KernelScope kernel_scope;
-
   const int M = 3;
   const int N = 7;
 
@@ -138,8 +132,6 @@ TEST(Reductions, ReduceSum2D) {
 // Sum a 3D tensor to both a 2D and 1D tensor, then reduce the 2D tensor flat to
 // check our work.
 TEST(Reductions, ReduceSum3D) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   VarHandle m("m", kInt);
 
@@ -209,8 +201,6 @@ TEST(Reductions, ReduceSum3D) {
 
 // Sum a large (10 D) Tensor 5 dimensions in.
 TEST(Reductions, ReduceSum10D) {
-  KernelScope kernel_scope;
-
   Placeholder in_(BufHandle("in_", {2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, kFloat));
   const int InputSize = 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3;
   Placeholder out_(BufHandle("out_", {2, 3, 2, 3, 2}, kFloat));
@@ -243,8 +233,6 @@ TEST(Reductions, ReduceSum10D) {
 
 // Reduce via Mul rather than Add using a custom Reducer.
 TEST(Reductions, ReduceProduct) {
-  KernelScope kernel_scope;
-
   const int M = 4;
   const int N = 4;
 
@@ -284,8 +272,6 @@ TEST(Reductions, ReduceProduct) {
 
 // Maximum reductions.
 TEST(Reductions, ReduceMax) {
-  KernelScope kernel_scope;
-
   Placeholder in_(BufHandle("b", {10}, kFloat));
 
   std::vector<float> in(10);
@@ -325,8 +311,6 @@ TEST(Reductions, ReduceMax) {
 
 // Minimum reduction, with custom initialization.
 TEST(Reductions, ReduceMinCustomInitializer) {
-  KernelScope kernel_scope;
-
   VarHandle minInit("minInit", kFloat);
   Placeholder in_(BufHandle("b", {10}, kFloat));
 
@@ -363,8 +347,6 @@ TEST(Reductions, ReduceMinCustomInitializer) {
 // Example implementation of Any/All.
 // TODO: this is very awkward without logical And/Or operators.
 TEST(Reductions, ReduceAnyAll) {
-  KernelScope kernel_scope;
-
   VarHandle searchValue("searchValue", kInt);
   Placeholder b(BufHandle("b", {4, 10}, kInt));
 
@@ -449,8 +431,6 @@ TEST(Reductions, ReduceAnyAll) {
 }
 
 TEST(Reductions, ReduceMatmul2D) {
-  KernelScope kernel_scope;
-
   Placeholder tA(BufHandle("tA", {3, 2}, kFloat));
   Placeholder tB(BufHandle("tB", {2, 3}, kFloat));
 
@@ -491,8 +471,6 @@ TEST(Reductions, ReduceMatmul2D) {
 }
 
 TEST(Reductions, ReduceRfactorLike) {
-  KernelScope kernel_scope;
-
   Placeholder in(BufHandle("in", {10, 10}, kFloat));
   std::vector<float> in_(100);
   for (int i = 0; i < 100; ++i) {
@@ -518,8 +496,6 @@ TEST(Reductions, ReduceRfactorLike) {
 }
 
 TEST(Reductions, ReduceAsProducer) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   VarHandle m("m", kInt);
 
@@ -563,8 +539,6 @@ TEST(Reductions, ReduceAsProducer) {
 }
 
 TEST(Reductions, ReduceAsConsumer) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   VarHandle m("m", kInt);
 
@@ -614,8 +588,6 @@ TEST(Reductions, ReduceAsConsumer) {
 }
 
 TEST(Reductions, SplitReduceAxis) {
-  KernelScope kernel_scope;
-
   Placeholder in(BufHandle("in", {16, 8}, kFloat));
 
   std::vector<float> in_(16 * 8);
@@ -645,8 +617,6 @@ TEST(Reductions, SplitReduceAxis) {
 }
 
 TEST(Reductions, SplitNonReduceAxis) {
-  KernelScope kernel_scope;
-
   Placeholder in(BufHandle("in", {16, 8}, kFloat));
 
   std::vector<float> in_(16 * 8);
@@ -676,7 +646,6 @@ TEST(Reductions, SplitNonReduceAxis) {
 }
 
 TEST(Reductions, ReorderedReductionInitializer) {
-  KernelScope kernel_scope;
   /* From the quip:
   for k in 0..1:  // blockIdx
     for m in 0..128:
@@ -726,8 +695,6 @@ TEST(Reductions, ReorderedReductionInitializer) {
 }
 
 TEST(Reductions, ReduceRfactor) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   VarHandle m("m", kInt);
@@ -759,8 +726,6 @@ TEST(Reductions, ReduceRfactor) {
 }
 
 TEST(Reductions, Reduce3DRfactorInner) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   const int K = 10;
@@ -794,8 +759,6 @@ TEST(Reductions, Reduce3DRfactorInner) {
 }
 
 TEST(Reductions, Reduce3DRfactorOuter) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   const int K = 10;
@@ -828,8 +791,6 @@ TEST(Reductions, Reduce3DRfactorOuter) {
 }
 
 TEST(Reductions, ReduceRepeatedInternalRfactor) {
-  KernelScope kernel_scope;
-
   Placeholder in_(BufHandle("in_", {2, 3, 4, 5, 6}, kFloat));
   const int InputSize = 2 * 3 * 4 * 5 * 6;
 
@@ -875,8 +836,6 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) {
 
 // Split a reduction axis with a tail loop.
 TEST(Reductions, ReduceSplitTail) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   const int K = 10;
@@ -908,8 +867,6 @@ TEST(Reductions, ReduceSplitTail) {
 
 // Split a reduction axis cleanly so there is no tail loop.
 TEST(Reductions, ReduceSplitNoTail) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   const int K = 10;
@@ -941,8 +898,6 @@ TEST(Reductions, ReduceSplitNoTail) {
 // Split a reduction axis with only a tail loop (the split loop will be size 0
 // and eliminated out).
 TEST(Reductions, ReduceOverSplitTail) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   const int K = 10;
@@ -974,8 +929,6 @@ TEST(Reductions, ReduceOverSplitTail) {
 
 // Split a reduction axis with a mask.
 TEST(Reductions, ReduceSplitMask) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   const int K = 10;
@@ -1007,8 +960,6 @@ TEST(Reductions, ReduceSplitMask) {
 
 // Split a reduction axis cleanly not requiring a mask.
 TEST(Reductions, ReduceSplitNoMask) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   const int K = 10;
@@ -1039,8 +990,6 @@ TEST(Reductions, ReduceSplitNoMask) {
 
 // Split a reduction axis with all logic in the mask.
 TEST(Reductions, ReduceOverSplitMask) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   const int K = 10;
@@ -1073,8 +1022,6 @@ TEST(Reductions, ReduceOverSplitMask) {
 // Test an rfactor when there are two ReduceOps in the graph due to a
 // splitWithTail.
 TEST(Reductions, ReduceSplitRfactor) {
-  KernelScope kernel_scope;
-
   const int M = 2;
   const int N = 10;
   const int K = 10;
@@ -1117,8 +1064,6 @@ TEST(Reductions, ReduceSplitRfactor) {
 // Test an rfactor which ends up being eliminated since the total loop size is
 // smaller than the split factor.
 TEST(Reductions, ReduceOverSplitRfactor) {
-  KernelScope kernel_scope;
-
   const int N = 10;
   const int K = 10;
   const int SPLIT_FACTOR = 16;
@@ -1174,7 +1119,6 @@ TEST(Reductions, ReduceOverSplitRfactor) {
 }
 
 TEST(Reductions, ReduceInlineReduction) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1207,7 +1151,6 @@ TEST(Reductions, ReduceInlineReduction) {
 }
 
 TEST(Reductions, ReduceInlineConsumer) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1261,7 +1204,6 @@ TEST(Reductions, ReduceInlineConsumer) {
 }
 
 TEST(Reductions, ReduceInlineReducerInternal) {
-  KernelScope kernel_scope;
   const int M = 4;
   const int N = 5;
   const int K = 6;
@@ -1319,8 +1261,6 @@ TEST(Reductions, ReduceInlineReducerInternal) {
 }
 
 TEST(Reductions, ReductionCacheAccessesOperatorAxis) {
-  KernelScope kernel_scope;
-
   int L = 4;
   int N = 3;
   int M = 2;
@@ -1396,8 +1336,6 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) {
 }
 
 TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) {
-  KernelScope kernel_scope;
-
   int L = 4;
   int N = 3;
   int M = 2;
@@ -1471,8 +1409,6 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) {
 }
 
 TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) {
-  KernelScope kernel_scope;
-
   int L = 4;
   int N = 3;
   int M = 2;
@@ -1546,8 +1482,6 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) {
 }
 
 TEST(Reductions, ReductionCacheBodyAccess) {
-  KernelScope kernel_scope;
-
   Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
   Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
 
@@ -1587,8 +1521,6 @@ TEST(Reductions, ReductionCacheBodyAccess) {
 }
 
 TEST(Reductions, ReductionCacheConsumerAccess) {
-  KernelScope kernel_scope;
-
   Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
   Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
 
@@ -1628,8 +1560,6 @@ TEST(Reductions, ReductionCacheConsumerAccess) {
 }
 
 TEST(Reductions, ReductionSplitCacheConsumerAccess) {
-  KernelScope kernel_scope;
-
   Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
   Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
 
@@ -1676,8 +1606,6 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) {
 }
 
 TEST(Reductions, ReductionReorderCacheConsumerAccess) {
-  KernelScope kernel_scope;
-
   Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
   Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
 
@@ -1725,8 +1653,6 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) {
 }
 
 TEST(Reductions, ReductionRfactorCacheTempOuter) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   const int K = 10;
@@ -1794,8 +1720,6 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) {
 }
 
 TEST(Reductions, ReductionRfactorCacheTempInner) {
-  KernelScope kernel_scope;
-
   const int M = 10;
   const int N = 10;
   const int K = 10;
@@ -1858,8 +1782,6 @@ TEST(Reductions, ReductionRfactorCacheTempInner) {
 }
 
 TEST(Reductions, ReductionVectorize) {
-  KernelScope kernel_scope;
-
   std::vector<float> in_(8 * 8);
   for (int i = 0; i < 8; ++i) {
     for (int j = 0; j < 8; ++j) {
@@ -1905,8 +1827,6 @@ TEST(Reductions, ReductionVectorize) {
 }
 
 TEST(Reductions, ReductionVectorizeInner) {
-  KernelScope kernel_scope;
-
   Placeholder in(BufHandle("in", {8, 8}, kFloat));
 
   Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
@@ -1916,8 +1836,6 @@ TEST(Reductions, ReductionVectorizeInner) {
 }
 
 TEST(Reductions, ReductionVectorizeRfactor) {
-  KernelScope kernel_scope;
-
   std::vector<float> in_(8 * 8);
   for (int i = 0; i < 8; ++i) {
     for (int j = 0; j < 8; ++j) {
@@ -1983,7 +1901,6 @@ TEST(Reductions, ReductionVectorizeRfactor) {
 }
 
 TEST(Reductions, InitFunction) {
-  KernelScope ks;
   constexpr int M = 32;
   constexpr int N = 16;
   Placeholder A("A", kFloat, {M, N});
diff --git a/test/cpp/tensorexpr/test_registerizer.cpp b/test/cpp/tensorexpr/test_registerizer.cpp
index 98a53058a1a65..1338b6d19c929 100644
--- a/test/cpp/tensorexpr/test_registerizer.cpp
+++ b/test/cpp/tensorexpr/test_registerizer.cpp
@@ -13,7 +13,6 @@ using namespace torch::jit::tensorexpr;
 
 // Can replace a simple scalar access with a local variable.
 TEST(Registerizer, RegisterizerSimple) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -58,7 +57,6 @@ TEST(Registerizer, RegisterizerSimple) {
 
 // Won't do replacement of a loop access.
 TEST(Registerizer, RegisterizerLoop) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -105,7 +103,6 @@ TEST(Registerizer, RegisterizerLoop) {
 // Won't replace even if the load is a fixed scalar, since the store could
 // invalidate it.
 TEST(Registerizer, RegisterizerLoopFixedLoad) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -152,7 +149,6 @@ TEST(Registerizer, RegisterizerLoopFixedLoad) {
 // We can registerize accesses that occur entirely within inner scopes, even if
 // they depend on the loop var.
 TEST(Registerizer, RegisterizerLoopInternal) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make({For::make(
@@ -203,7 +199,6 @@ TEST(Registerizer, RegisterizerLoopInternal) {
 // An access can be overlapped by another read in the same Expr. In this case
 // B[z] and B[y] overlap and prevent registerization of both accesses.
 TEST(Registerizer, RegisterizerLoopInternalLoadOverlap) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -235,7 +230,6 @@ TEST(Registerizer, RegisterizerLoopInternalLoadOverlap) {
 }
 
 TEST(Registerizer, RegisterizerLoopInternalRepeated) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -307,7 +301,6 @@ TEST(Registerizer, RegisterizerLoopInternalRepeated) {
 }
 
 TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapLoopVar) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -353,7 +346,6 @@ TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapLoopVar) {
 }
 
 TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapOther) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
@@ -400,7 +392,6 @@ TEST(Registerizer, RegisterizerLoopInternalRepeatedOverlapOther) {
 
 // Will registerize multiple accesses of different items of the same buffer.
 TEST(Registerizer, RegisterizerMultiVar) {
-  KernelScope kernel_scope;
   BufHandle a("A", {2}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make({
@@ -456,7 +447,6 @@ TEST(Registerizer, RegisterizerMultiVar) {
 
 // Will registerize the valid accesses while skipping invalid replacements.
 TEST(Registerizer, RegisterizerVariableLoad) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -512,7 +502,6 @@ TEST(Registerizer, RegisterizerVariableLoad) {
 
 // Can registerize variable accesses so long as the variable does not change.
 TEST(Registerizer, RegisterizerSymbolicIndices) {
-  KernelScope kernel_scope;
   VarHandle i("i", kInt);
   VarHandle N("N", kInt);
   BufHandle a("A", {N}, kInt);
@@ -559,7 +548,6 @@ TEST(Registerizer, RegisterizerSymbolicIndices) {
 
 // Can registerize accesses dependent on multiple loop vars.
 TEST(Registerizer, RegisterizerMultiLoop) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
@@ -616,7 +604,6 @@ TEST(Registerizer, RegisterizerMultiLoop) {
 
 // Can registerize correctly if scalars already exist in the program.
 TEST(Registerizer, RegisterizerRepeated) {
-  KernelScope kernel_scope;
   BufHandle a("A", {2}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make({
@@ -673,7 +660,6 @@ TEST(Registerizer, RegisterizerRepeated) {
 
 // Can registerize the load of A.
 TEST(Registerizer, RegisterizerNoLoads) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -714,7 +700,6 @@ TEST(Registerizer, RegisterizerNoLoads) {
 
 // Can registerize the load of A but not the store of B.
 TEST(Registerizer, RegisterizerNoRepeatedStores) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -763,7 +748,6 @@ TEST(Registerizer, RegisterizerNoRepeatedStores) {
 
 // Won't registerize if there are multiple accesses which may overlap.
 TEST(Registerizer, RegisterizerMultiVarOverlap) {
-  KernelScope kernel_scope;
   BufHandle a("A", {2}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make({
@@ -792,8 +776,6 @@ TEST(Registerizer, RegisterizerMultiVarOverlap) {
 }
 
 TEST(Registerizer, RegisterizerAllocs) {
-  KernelScope kernel_scope;
-
   BufHandle a("A", {2}, kInt);
   BufHandle c("C", {1}, kInt);
   VarHandle x("x", kInt);
@@ -860,7 +842,6 @@ TEST(Registerizer, RegisterizerAllocs) {
 }
 
 TEST(Registerizer, RegisterizerNoInitializer) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make({For::make(
@@ -900,7 +881,6 @@ TEST(Registerizer, RegisterizerNoInitializer) {
 }
 
 TEST(Registerizer, RegisterizerNoInitializerLoopVar) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make({For::make(
@@ -929,7 +909,6 @@ TEST(Registerizer, RegisterizerNoInitializerLoopVar) {
 }
 
 TEST(Registerizer, RegisterizerLoadThenStore) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   BufHandle b("B", {1}, kInt);
   VarHandle x("x", kInt);
@@ -980,7 +959,6 @@ TEST(Registerizer, RegisterizerLoadThenStore) {
 }
 
 TEST(Registerizer, RegisterizerParallelized) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   LoopOptions loopOpts;
@@ -1009,7 +987,6 @@ TEST(Registerizer, RegisterizerParallelized) {
 // Should be able to registerize this since the scalar would exist before the
 // branch.
 TEST(Registerizer, RegisterizerConditionAfter) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1059,7 +1036,6 @@ TEST(Registerizer, RegisterizerConditionAfter) {
 // Should be able to registerize this since the scalar exists in the same form
 // after the branch and there is no overlap.
 TEST(Registerizer, RegisterizerConditionBefore) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1111,7 +1087,6 @@ TEST(Registerizer, RegisterizerConditionBefore) {
 
 // Should be able to registerize this as the combination of the two above rules.
 TEST(Registerizer, RegisterizerConditionInside) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1171,7 +1146,6 @@ TEST(Registerizer, RegisterizerConditionInside) {
 // condition, and both sides are large enough to be registerized but cannot be
 // because there is no safe place to put the initializer or finalizer.
 TEST(Registerizer, RegisterizerConditionInsideOverlap1) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1231,7 +1205,6 @@ TEST(Registerizer, RegisterizerConditionInsideOverlap1) {
 // the condition, and the first group must be finalized before the Cond, the
 // second initialized after it.
 TEST(Registerizer, RegisterizerConditionInsideOverlap2) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1317,7 +1290,6 @@ TEST(Registerizer, RegisterizerConditionInsideOverlap2) {
 // the accesses in it don't need to be valid (think size checks on the index).
 // In this case the accesses cannot be registerized.
 TEST(Registerizer, RegisterizerConditionHidden) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1359,7 +1331,6 @@ TEST(Registerizer, RegisterizerConditionHidden) {
 // the user's fault). It "unhides" the conditional accesses, allowing
 // registerization to occur.
 TEST(Registerizer, RegisterizerConditionUnhidden) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1420,7 +1391,6 @@ TEST(Registerizer, RegisterizerConditionUnhidden) {
 
 // Can registerize a load that occurs in the condition of a Cond.
 TEST(Registerizer, RegisterizerCondCondition) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1471,7 +1441,6 @@ TEST(Registerizer, RegisterizerCondCondition) {
 // Appearing in the condition of a Cond makes it visible to the enclosing scope,
 // and so we can registerize internal usages.
 TEST(Registerizer, RegisterizerCondConditionUnhidden) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1520,7 +1489,6 @@ TEST(Registerizer, RegisterizerCondConditionUnhidden) {
 
 // Conditional hiding also works for IfThenElse exprs.
 TEST(Registerizer, RegisterizerIfThenElseHidden) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1562,7 +1530,6 @@ TEST(Registerizer, RegisterizerIfThenElseHidden) {
 
 // Conditional unhiding also works for IfThenElse exprs.
 TEST(Registerizer, RegisterizerIfThenElseUnhidden) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1617,7 +1584,6 @@ TEST(Registerizer, RegisterizerIfThenElseUnhidden) {
 
 // Nested IfThenElse exprs can't promote to higher level scopes.
 TEST(Registerizer, RegisterizerIfThenElseNested) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1661,7 +1627,6 @@ TEST(Registerizer, RegisterizerIfThenElseNested) {
 // to check that we don't promote the initializer/finalizer to the enclosing
 // Block.
 TEST(Registerizer, RegisterizerIfThenElseInternal) {
-  KernelScope kernel_scope;
   // Making these floats so they don't get simplified to a single access.
   BufHandle a("A", {5}, kFloat);
   BufHandle b("B", {5}, kFloat);
@@ -1740,7 +1705,6 @@ TEST(Registerizer, RegisterizerIfThenElseInternal) {
 
 // Can registerize a load that occurs in the condition of an IfThenElse;
 TEST(Registerizer, RegisterizerIfThenElseCondition) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1786,7 +1750,6 @@ TEST(Registerizer, RegisterizerIfThenElseCondition) {
 // Appearing in the condition of a Cond makes it visible to the enclosing scope,
 // and so we can registerize internal usages.
 TEST(Registerizer, RegisterizerIfThenElseConditionUnhidden) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1826,7 +1789,6 @@ TEST(Registerizer, RegisterizerIfThenElseConditionUnhidden) {
 // Cannot promote accesses internal to IfThenElse branches even if the enclosing
 // scope if conditional.
 TEST(Registerizer, RegisterizerConditionBranchOnly) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make({For::make(
@@ -1877,7 +1839,6 @@ TEST(Registerizer, RegisterizerConditionBranchOnly) {
 // We can registerize an IfThenElse that appears in the condition branch of a
 // Cond. This is a weird but valid thing to do.
 TEST(Registerizer, RegisterizerCondIfThenElse) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   BufHandle c("C", {5}, kInt);
@@ -1927,7 +1888,6 @@ TEST(Registerizer, RegisterizerCondIfThenElse) {
 // Can registerize a conditional access in the RHS of a store unhidden by it's
 // LHS, and hoist it out of a loop.
 TEST(Registerizer, RegisterizerIfThenElseLoop) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   VarHandle x("x", kInt);
@@ -1977,7 +1937,6 @@ TEST(Registerizer, RegisterizerIfThenElseLoop) {
 
 // Cannot registerize if the RHS overlaps the access creating visibility.
 TEST(Registerizer, RegisterizerIfThenElseLoopCut) {
-  KernelScope kernel_scope;
   BufHandle a("A", {5}, kInt);
   BufHandle b("B", {5}, kInt);
   VarHandle x("x", kInt);
@@ -2016,7 +1975,6 @@ TEST(Registerizer, RegisterizerIfThenElseLoopCut) {
 // Simple case where an access is cut by an overlapping access later in the
 // program, we can registerize up until the overlap.
 TEST(Registerizer, RegisterizerPartialAfter) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2073,7 +2031,6 @@ TEST(Registerizer, RegisterizerPartialAfter) {
 // We can registerize an access which overlaps a previous access, the
 // initializer must be inserted after the previous access.
 TEST(Registerizer, RegisterizerPartialBefore) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2130,7 +2087,6 @@ TEST(Registerizer, RegisterizerPartialBefore) {
 // The combination of the previous two tests, an access is cut by an overlapping
 // access in both directions.
 TEST(Registerizer, RegisterizerPartialInside) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x1("x1", kInt);
   VarHandle x2("x2", kInt);
@@ -2200,7 +2156,6 @@ TEST(Registerizer, RegisterizerPartialInside) {
 // access, we should break this into two scalars and write back to the buffer
 // before the condition.
 TEST(Registerizer, RegisterizerPartialCondition) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2270,7 +2225,6 @@ TEST(Registerizer, RegisterizerPartialCondition) {
 // Tests case where an access is cut by an internal conditional access which
 // itself is registerized.
 TEST(Registerizer, RegisterizerPartialConditionInternalCut) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2333,7 +2287,6 @@ TEST(Registerizer, RegisterizerPartialConditionInternalCut) {
 // First statment in condition closes outer access, but can be registerized with
 // later statements.
 TEST(Registerizer, RegisterizerPartialConditionInternalStart) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2397,7 +2350,6 @@ TEST(Registerizer, RegisterizerPartialConditionInternalStart) {
 
 // An access cuts two open overlaps and creates four scalar variables.
 TEST(Registerizer, RegisterizerPartialOverlapsTwo) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2468,7 +2420,6 @@ TEST(Registerizer, RegisterizerPartialOverlapsTwo) {
 // Nested blocks will automatically be flattened and do not provent
 // registerization of enclosed accesses.
 TEST(Registerizer, RegisterizerNestedBlocks) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2522,7 +2473,6 @@ TEST(Registerizer, RegisterizerNestedBlocks) {
 // The access can be registerized internally to a condition, but must ensure
 // that both initializer and finalizer are within the same condition.
 TEST(Registerizer, RegisterizerNestedConditions) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make({Cond::make(
@@ -2578,7 +2528,6 @@ TEST(Registerizer, RegisterizerNestedConditions) {
 // If an access exists outside the scope of the condition then we can lift
 // nested conditional usages into the same scalar.
 TEST(Registerizer, RegisterizerNestedConditionsUnhidden) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2634,7 +2583,6 @@ TEST(Registerizer, RegisterizerNestedConditionsUnhidden) {
 }
 
 TEST(Registerizer, RegisterizerNestedConditionsHiddenFirst) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2677,7 +2625,6 @@ TEST(Registerizer, RegisterizerNestedConditionsHiddenFirst) {
 }
 
 TEST(Registerizer, RegisterizerNestedConditionsHiddenSecond) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2722,7 +2669,6 @@ TEST(Registerizer, RegisterizerNestedConditionsHiddenSecond) {
 // If an access is cut by another access internal to a condition block, it still
 // cuts the access.
 TEST(Registerizer, RegisterizerNestedConditionsCut) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -2761,7 +2707,6 @@ TEST(Registerizer, RegisterizerNestedConditionsCut) {
 }
 
 TEST(Registerizer, RegisterizerNestedConditionLoopHidden) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -2808,7 +2753,6 @@ TEST(Registerizer, RegisterizerNestedConditionLoopHidden) {
 // Three loops and four element regions, three of which should be registerized
 // at different levels of the IR.
 TEST(Registerizer, RegisterizerNestedConditionThreeDeep) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -2908,7 +2852,6 @@ TEST(Registerizer, RegisterizerNestedConditionThreeDeep) {
 // Can replace a simple scalar access with a local variable even when that
 // variable is an outer loop var.
 TEST(Registerizer, RegisterizerNestedLoopSimple) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
@@ -2963,7 +2906,6 @@ TEST(Registerizer, RegisterizerNestedLoopSimple) {
 // conditional access can be hoisted up through a loop to match an existing
 // access in a higher scope and the two can be registerized.
 TEST(Registerizer, RegisterizerHiddenAccessYes) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -3046,7 +2988,6 @@ TEST(Registerizer, RegisterizerHiddenAccessYes) {
 // never unhidden at a higher scope and registerization occurs at the lower
 // scope.
 TEST(Registerizer, RegisterizerHiddenAccessNo) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -3126,7 +3067,6 @@ TEST(Registerizer, RegisterizerHiddenAccessNo) {
 // two accesses here one is unhidden and the other isnt. A[0] can be
 // registerized but B[0] cannot.
 TEST(Registerizer, RegisterizerHiddenAccessMultiLoop) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   VarHandle x("x", kInt);
@@ -3208,7 +3148,6 @@ TEST(Registerizer, RegisterizerHiddenAccessMultiLoop) {
 // Accesses are registerized inside two conditions, but the immeidate parent is
 // not a condition.
 TEST(Registerizer, RegisterizerTwoConditionalLoops) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -3280,7 +3219,6 @@ TEST(Registerizer, RegisterizerTwoConditionalLoops) {
 
 // Accesses are registerized inside two conditions, cut in the middle.
 TEST(Registerizer, RegisterizerTwoConditionalLoopsCut) {
-  KernelScope kernel_scope;
   BufHandle a("A", {1}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -3362,7 +3300,6 @@ TEST(Registerizer, RegisterizerTwoConditionalLoopsCut) {
 // references a Let var in a local scope which cannot be hoisted out of the
 // loop.
 TEST(Registerizer, RegisterizerLoopLetVar) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
@@ -3396,7 +3333,6 @@ TEST(Registerizer, RegisterizerLoopLetVar) {
 // references a Let var in an outer scope that does not prevent hoisting the
 // initializer.
 TEST(Registerizer, RegisterizerLoopLetVarOuter) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
@@ -3444,7 +3380,6 @@ TEST(Registerizer, RegisterizerLoopLetVarOuter) {
 // Okay so the registerizer generally goes after index flattening, but just in
 // case. Test multi index registerization.
 TEST(Registerizer, RegisterizerMultiDim) {
-  KernelScope kernel_scope;
   BufHandle a("A", {3, 4, 5}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -3490,7 +3425,6 @@ TEST(Registerizer, RegisterizerMultiDim) {
 // Wont registerize if only some dims match, but will still registerize distinct
 // elements.
 TEST(Registerizer, RegisterizerMultiDimPartial) {
-  KernelScope kernel_scope;
   BufHandle a("A", {3, 4, 5}, kInt);
   VarHandle x("x", kInt);
   StmtPtr stmt = Block::make(
@@ -3538,7 +3472,6 @@ TEST(Registerizer, RegisterizerMultiDimPartial) {
 
 // If they could overlap across all dimensions we cannot registerize.
 TEST(Registerizer, RegisterizerMultiDimOverlap) {
-  KernelScope kernel_scope;
   BufHandle a("A", {3, 4, 5}, kInt);
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
@@ -3573,7 +3506,6 @@ TEST(Registerizer, RegisterizerMultiDimOverlap) {
 
 // But, if one dimension is known to be distinct they do not overlap.
 TEST(Registerizer, RegisterizerMultiDimPartialOverlap) {
-  KernelScope kernel_scope;
   BufHandle a("A", {3, 4, 5}, kInt);
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
@@ -3619,7 +3551,6 @@ TEST(Registerizer, RegisterizerMultiDimPartialOverlap) {
 
 // A 3D reduction with different input dimensionality.
 TEST(Registerizer, RegisterizerMultiDim3DReduction1) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10, 10}, kInt);
   BufHandle c("C", {10, 10, 10}, kInt);
@@ -3691,7 +3622,6 @@ TEST(Registerizer, RegisterizerMultiDim3DReduction1) {
 // A 3D reduction with the same smaller dimensionality using different loop
 // vars.
 TEST(Registerizer, RegisterizerMultiDim3DReduction2) {
-  KernelScope kernel_scope;
   BufHandle a("A", {10}, kInt);
   BufHandle b("B", {10}, kInt);
   BufHandle c("C", {10}, kInt);
diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp
index 0df9e9242e198..48983c8f4ba33 100644
--- a/test/cpp/tensorexpr/test_simplify.cpp
+++ b/test/cpp/tensorexpr/test_simplify.cpp
@@ -14,7 +14,6 @@ using namespace torch::jit::tensorexpr;
 using SimpleIRExprEval = ExprEval<SimpleIREvaluator>;
 
 TEST(Simplify, ConstantFoldSimple) {
-  KernelScope kernel_scope;
   ExprHandle a(2.0f);
   ExprHandle b(3.0f);
   ExprHandle f = (a + b);
@@ -28,7 +27,6 @@ TEST(Simplify, ConstantFoldSimple) {
 }
 
 TEST(Simplify, ConstantFoldTwoLayer) {
-  KernelScope kernel_scope;
   ExprHandle a(2.0f);
   ExprHandle b(3.0f);
   ExprHandle c(4.0f);
@@ -44,7 +42,6 @@ TEST(Simplify, ConstantFoldTwoLayer) {
 }
 
 TEST(Simplify, ConstantFoldShifts) {
-  KernelScope kernel_scope;
   ExprHandle a(7);
   ExprHandle b(2);
   ExprHandle c(3);
@@ -59,7 +56,6 @@ TEST(Simplify, ConstantFoldShifts) {
 }
 
 TEST(Simplify, ConstantFoldBitwise) {
-  KernelScope kernel_scope;
   ExprHandle a(59);
   ExprHandle b(22);
   ExprHandle c(101);
@@ -74,7 +70,6 @@ TEST(Simplify, ConstantFoldBitwise) {
 }
 
 TEST(Simplify, ConstantFoldMultiOp) {
-  KernelScope kernel_scope;
   ExprHandle a(2.0f);
   ExprHandle b(3.0f);
   ExprHandle c(4.0f);
@@ -93,7 +88,6 @@ TEST(Simplify, ConstantFoldMultiOp) {
 }
 
 TEST(Simplify, ConstantFoldMinMax) {
-  KernelScope kernel_scope;
   ExprHandle a(12.0f);
   ExprHandle b(15.0f);
   ExprHandle c(17.0f);
@@ -113,7 +107,6 @@ TEST(Simplify, ConstantFoldMinMax) {
 }
 
 TEST(Simplify, ConstantFoldIntrinsics) {
-  KernelScope kernel_scope;
   ExprHandle a(2.0f);
   ExprHandle b(3.0f);
   ExprHandle c(4.0f);
@@ -135,7 +128,6 @@ TEST(Simplify, ConstantFoldIntrinsics) {
 }
 
 TEST(Simplify, ConstantFoldCastToBool) {
-  KernelScope kernel_scope;
   ExprHandle f = Cast::make(kBool, IntImm::make(0));
   ExprHandle newF = IRSimplifier::simplify(f);
   SimpleIRExprEval eval(newF);
@@ -143,7 +135,6 @@ TEST(Simplify, ConstantFoldCastToBool) {
 }
 
 TEST(Simplify, ConstantFoldWithVar) {
-  KernelScope kernel_scope;
   {
     VarHandle x("x", kInt);
     ExprHandle body = x * (ExprHandle(2) + ExprHandle(4));
@@ -174,7 +165,6 @@ TEST(Simplify, ConstantFoldWithVar) {
 }
 
 TEST(Simplify, ConditionalSelectFoldSimple) {
-  KernelScope kernel_scope;
   ExprHandle a(3.0f);
   ExprHandle b(4.0f);
   ExprHandle c(3.0f);
@@ -221,7 +211,6 @@ TEST(Simplify, ConditionalSelectFoldSimple) {
 }
 
 TEST(Simplify, ConditionalSelectFoldTwoLayer) {
-  KernelScope kernel_scope;
   ExprHandle a(3.0f);
   ExprHandle b(2.0f);
   ExprHandle c(2.0f);
@@ -269,7 +258,6 @@ TEST(Simplify, ConditionalSelectFoldTwoLayer) {
 }
 
 TEST(Simplify, ConditionalSelectFoldWithVar) {
-  KernelScope kernel_scope;
   VarHandle x("x", kFloat);
   ExprHandle f = x < 4.f;
 
@@ -290,7 +278,6 @@ TEST(Simplify, ConditionalSelectFoldWithVar) {
 }
 
 TEST(Simplify, UnFoldableExpr) {
-  KernelScope kernel_scope;
   VarHandle x("x", kFloat);
   VarHandle y("y", kFloat);
   ExprHandle body = (ExprHandle(3) * x) + (ExprHandle(5) * y);
@@ -308,7 +295,6 @@ TEST(Simplify, UnFoldableExpr) {
 }
 
 TEST(Simplify, HashSimple) {
-  KernelScope kernel_scope;
   VarHandle x("x", kFloat);
   ExprHandle a(2.0f);
   ExprHandle b(3.0f);
@@ -329,7 +315,6 @@ TEST(Simplify, HashSimple) {
 }
 
 TEST(Simplify, HashEquivalence) {
-  KernelScope kernel_scope;
   VarHandle x("x", kFloat);
   VarHandle y("y", kFloat);
   ExprHandle f = (x * y) + (x * y);
@@ -366,7 +351,6 @@ TEST(Simplify, HashEquivalence) {
 }
 
 TEST(Simplify, HashEquivalenceRand) {
-  KernelScope kernel_scope;
   ExprHandle f =
       Intrinsics::make(kRand, kFloat) + Intrinsics::make(kRand, kInt);
 
@@ -386,7 +370,6 @@ TEST(Simplify, HashEquivalenceRand) {
 }
 
 TEST(Simplify, HashEquivalenceAfterFolding) {
-  KernelScope kernel_scope;
   VarHandle x("x", kFloat);
   ExprHandle a(2.0f);
   ExprHandle b(3.0f);
@@ -412,8 +395,6 @@ TEST(Simplify, HashEquivalenceAfterFolding) {
 }
 
 TEST(Simplify, HashDifferenceTypes) {
-  KernelScope kernel_scope;
-
   HashProvider hasher;
   std::vector<ExprPtr> immediates;
 
@@ -446,7 +427,6 @@ TEST(Simplify, HashDifferenceTypes) {
 }
 
 TEST(Simplify, HashLargeExpression) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   BufHandle a("A", {N}, kInt);
   BufHandle b("B", {N}, kInt);
@@ -490,7 +470,6 @@ TEST(Simplify, HashLargeExpression) {
 }
 
 TEST(Simplify, HashForLoopOptions) {
-  KernelScope kernel_scope;
   constexpr int N = 1024;
   BufHandle a("A", {N}, kInt);
   BufHandle b("B", {N}, kInt);
@@ -532,7 +511,6 @@ TEST(Simplify, HashForLoopOptions) {
 
 /// (2 + x) + 4 => x + 6
 TEST(Simplify, SimplifyAdd) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
 
@@ -558,7 +536,6 @@ TEST(Simplify, SimplifyAdd) {
 
 /// (2 - x) - 4 => -2 - x
 TEST(Simplify, SimplifySub) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   ExprHandle body = (ExprHandle(2) - x) - ExprHandle(4);
 
@@ -575,7 +552,6 @@ TEST(Simplify, SimplifySub) {
 
 /// 2 * (1 - x) - 4 => 2 * (-3 - x)
 TEST(Simplify, SimplifyMultiLayer) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   ExprHandle body = ExprHandle(2) * ((ExprHandle(1) - x) - ExprHandle(4));
   ExprHandle simplified = IRSimplifier::simplify(body);
@@ -588,7 +564,6 @@ TEST(Simplify, SimplifyMultiLayer) {
 
 /// 2 * (3 * x) - (x * 4) => 2 * x
 TEST(Simplify, SimplifyMultiTerm) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   ExprHandle body =
       (ExprHandle(2) * ((ExprHandle(3) * x)) - (x * ExprHandle(4)));
@@ -606,7 +581,6 @@ TEST(Simplify, SimplifyMultiTerm) {
 
 /// 2 * (3 * (long)x) - (x * 4) => 2 * x
 TEST(Simplify, SimplifyCasts) {
-  KernelScope kernel_scope;
   VarHandle x("x", kLong);
   ExprHandle body =
       (ExprHandle(2) * ((ExprHandle(3) * x)) - (x * ExprHandle(4)));
@@ -624,7 +598,6 @@ TEST(Simplify, SimplifyCasts) {
 
 /// (x + 0) * 1 => x
 TEST(Simplify, SimplifyEliminatesNoOps) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   ExprHandle body = (x + ExprHandle(0)) * 1;
 
@@ -636,7 +609,6 @@ TEST(Simplify, SimplifyEliminatesNoOps) {
 
 /// Cannot simplify this.
 TEST(Simplify, SimplifyMultiVar) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   ExprHandle body = x * 24 + y * 34;
@@ -659,7 +631,6 @@ TEST(Simplify, SimplifyMultiVar) {
 
 // x + 2 + y => x + y + 2
 TEST(Simplify, DISABLED_SimplifyReorderings) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   ExprHandle body = x + 2 + y;
@@ -676,7 +647,6 @@ TEST(Simplify, DISABLED_SimplifyReorderings) {
 
 /// y + x * 0 => y
 TEST(Simplify, SimplifyEliminatesVar) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   ExprHandle body = y + x * ExprHandle(0);
@@ -686,7 +656,6 @@ TEST(Simplify, SimplifyEliminatesVar) {
 }
 
 TEST(Simplify, SimplifyAdds) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
 
@@ -757,7 +726,6 @@ TEST(Simplify, SimplifyAdds) {
 }
 
 TEST(Simplify, SimplifyMuls) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
 
@@ -959,7 +927,6 @@ TEST(Simplify, SimplifyMuls) {
 
 // Sub an expr from itself will result in zero.
 TEST(Simplify, SimplifySubs) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
 
@@ -1125,7 +1092,6 @@ TEST(Simplify, SimplifySubs) {
 }
 
 TEST(Simplify, SimplifyDiv) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
 
   {
@@ -1144,7 +1110,6 @@ TEST(Simplify, SimplifyDiv) {
 }
 
 TEST(Simplify, SimplifyDivWithLoopContext1) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 6; i++) {
   //  A[i] = (i + 24) / 6;
@@ -1166,7 +1131,6 @@ TEST(Simplify, SimplifyDivWithLoopContext1) {
 }
 
 TEST(Simplify, SimplifyDivWithLoopContext2) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 5; i++) {
   //  A[i] = (i + 25) / 6;
@@ -1188,7 +1152,6 @@ TEST(Simplify, SimplifyDivWithLoopContext2) {
 }
 
 TEST(Simplify, SimplifyDivWithLoopContext3) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 6; i++) {
   //  A[i] = (i + 24) / (-6);
@@ -1210,7 +1173,6 @@ TEST(Simplify, SimplifyDivWithLoopContext3) {
 }
 
 TEST(Simplify, SimplifyDivWithLoopContext4) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 5; i++) {
   //  A[i] = (i - 5) / 6;
@@ -1232,7 +1194,6 @@ TEST(Simplify, SimplifyDivWithLoopContext4) {
 }
 
 TEST(Simplify, SimplifyDivWithLoopContext5) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 6; i++) {
   //  for (int j = 0; j < 10; j++) {
@@ -1259,7 +1220,6 @@ TEST(Simplify, SimplifyDivWithLoopContext5) {
 }
 
 TEST(Simplify, SimplifyDivWithLoopContext6) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 6; i++) {
   //  for (int j = -1; j < 9; j++) {
@@ -1287,7 +1247,6 @@ TEST(Simplify, SimplifyDivWithLoopContext6) {
 }
 
 TEST(Simplify, SimplifyDivWithLoopContext7) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 6; i++) {
   //  for (int j = 0; j < 10; j++) {
@@ -1315,7 +1274,6 @@ TEST(Simplify, SimplifyDivWithLoopContext7) {
 }
 
 TEST(Simplify, SimplifyModWithLoopContext0) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 100; i++) {
   //  A[i] = i % 100;
@@ -1337,7 +1295,6 @@ TEST(Simplify, SimplifyModWithLoopContext0) {
 }
 
 TEST(Simplify, SimplifyModWithLoopContext1) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 6; i++) {
   //  A[i] = (i + 24) % 6;
@@ -1359,7 +1316,6 @@ TEST(Simplify, SimplifyModWithLoopContext1) {
 }
 
 TEST(Simplify, SimplifyModWithLoopContext2) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 5; i++) {
   //  A[i] = (i + 25) % 6;
@@ -1381,7 +1337,6 @@ TEST(Simplify, SimplifyModWithLoopContext2) {
 }
 
 TEST(Simplify, SimplifyModWithLoopContext3) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 6; i++) {
   //  A[i] = (i + 24) % (-6);
@@ -1403,7 +1358,6 @@ TEST(Simplify, SimplifyModWithLoopContext3) {
 }
 
 TEST(Simplify, SimplifyModWithLoopContext4) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 5; i++) {
   //  A[i] = (i - 5) % 6;
@@ -1425,7 +1379,6 @@ TEST(Simplify, SimplifyModWithLoopContext4) {
 }
 
 TEST(Simplify, SimplifyModWithLoopContext5) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 6; i++) {
   //  for (int j = 0; j < 10; j++) {
@@ -1452,7 +1405,6 @@ TEST(Simplify, SimplifyModWithLoopContext5) {
 }
 
 TEST(Simplify, SimplifyModWithLoopContext6) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 6; i++) {
   //  for (int j = -1; j < 9; j++) {
@@ -1480,7 +1432,6 @@ TEST(Simplify, SimplifyModWithLoopContext6) {
 }
 
 TEST(Simplify, SimplifyModWithLoopContext7) {
-  KernelScope kernel_scope;
   // Stmt to simplify:
   // for (int i = 0; i < 6; i++) {
   //  for (int j = 0; j < 10; j++) {
@@ -1508,7 +1459,6 @@ TEST(Simplify, SimplifyModWithLoopContext7) {
 }
 
 TEST(Simplify, SimplifyMod) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   VarHandle z("z", kInt);
@@ -1635,7 +1585,6 @@ TEST(Simplify, SimplifyMod) {
 
 // Test that mixing ops together simplifies as expected.
 TEST(Simplify, SimplifyMultiOp) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
 
@@ -1704,7 +1653,6 @@ TEST(Simplify, SimplifyMultiOp) {
 
 // Test that chaining many ops together works as expected.
 TEST(Simplify, SimplifyManyOps) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
 
@@ -1752,7 +1700,6 @@ TEST(Simplify, SimplifyManyOps) {
 }
 
 TEST(Simplify, SimplifyFactorization) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
 
@@ -1874,7 +1821,6 @@ TEST(Simplify, SimplifyFactorization) {
 
 // (4 * x + y + z * 2) + (4 * x + y + z * 4) => 2 * (y + 3 * z + 4 * x)
 TEST(Simplify, SimplifyFactorizeUneven) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   VarHandle z("z", kInt);
@@ -1901,7 +1847,6 @@ TEST(Simplify, SimplifyFactorizeUneven) {
 // (x * y) + (2 * x) * (x + y) => 2 * (x * x) + 3 * (x * y)
 // This is kind of a placeholder test for variable factorization.
 TEST(Simplify, SimplifyDeeperTerms) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   ExprHandle body = (x * y) + (ExprHandle(2) * x) * (x + y);
@@ -1925,7 +1870,6 @@ TEST(Simplify, SimplifyDeeperTerms) {
 // Tests the difference between two less trivial expressions.
 // (m * (1 * n_1) + (n  + 1)) - (m *  (1 * n_1) + n) => 1
 TEST(Simplify, SimplifyDeeperDifference) {
-  KernelScope kernel_scope;
   VarHandle n("n", kInt);
   VarHandle n_1("n_1", kInt);
   VarHandle m("m", kInt);
@@ -1939,7 +1883,6 @@ TEST(Simplify, SimplifyDeeperDifference) {
 // Test constant folding into the difference between expressions.
 // 2 + char((m * (1 * n_1) + (n  + 1)) - (m *  (1 * n_1) + n)) => 3
 TEST(Simplify, SimplifyFoldComplexDifference) {
-  KernelScope kernel_scope;
   VarHandle n("n", kInt);
   VarHandle n_1("n_1", kInt);
   VarHandle m("m", kInt);
@@ -1954,7 +1897,6 @@ TEST(Simplify, SimplifyFoldComplexDifference) {
 }
 
 TEST(Simplify, SimplifyIfComponents) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   ExprHandle body = IfThenElse::make(
@@ -1976,7 +1918,6 @@ TEST(Simplify, SimplifyIfComponents) {
 }
 
 TEST(Simplify, SimplifyOpaqueTerms) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
 
@@ -2002,8 +1943,6 @@ TEST(Simplify, SimplifyOpaqueTerms) {
 }
 
 TEST(Simplify, SimplifySymbolicMinMax) {
-  KernelScope kernel_scope;
-
   {
     // Minimum with constant difference between terms.
     VarHandle x("x", kInt);
@@ -2038,7 +1977,6 @@ TEST(Simplify, SimplifySymbolicMinMax) {
 }
 
 TEST(Simplify, SimplifyNestedMax) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   VarHandle z("z", kInt);
@@ -2315,7 +2253,6 @@ TEST(Simplify, SimplifyNestedMax) {
 }
 
 TEST(Simplify, SimplifyNestedMin) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
   VarHandle z("z", kInt);
@@ -2592,8 +2529,6 @@ TEST(Simplify, SimplifyNestedMin) {
 }
 
 TEST(Simplify, SimplifyWontReorderFloat) {
-  KernelScope kernel_scope;
-
   {
     // 3 * (3 * x) - 3 * (3 * y) => 9 * (x - y)
     // This is an expression we can simplify.
@@ -2704,8 +2639,6 @@ TEST(Simplify, SimplifyWontReorderFloat) {
 }
 
 TEST(Simplify, SimplifyRoundModPattern) {
-  KernelScope kernel_scope;
-
   {
     // (x/y)*y + x%y => x.
     VarHandle x("x", kInt);
@@ -2887,8 +2820,6 @@ TEST(Simplify, SimplifyRoundModPattern) {
 }
 
 TEST(Simplify, SimplifyRoundModPatternFactorization) {
-  KernelScope kernel_scope;
-
   {
     // Full factorization.
     // 2 * (x/y * y) + 2 * (x%y) => 2 * x.
@@ -2947,8 +2878,6 @@ TEST(Simplify, SimplifyRoundModPatternFactorization) {
 }
 
 TEST(Simplify, SimplifyRoundModPatternMultivar) {
-  KernelScope kernel_scope;
-
   {
     // Multivar.
     // (x/8) * 8 + (y/5)*5 + x%8 + y%5 => x + y.
@@ -2997,8 +2926,6 @@ TEST(Simplify, SimplifyRoundModPatternMultivar) {
 }
 
 TEST(Simplify, SimplifyModRoundModPattern) {
-  KernelScope kernel_scope;
-
   {
     // t/7 % 9 * 7 + t % 7 => t%63
     VarHandle t("t", kInt);
@@ -3085,8 +3012,6 @@ TEST(Simplify, SimplifyModRoundModPattern) {
 }
 
 TEST(Simplify, SimplifyModRoundModPatternFactorization) {
-  KernelScope kernel_scope;
-
   {
     // 2 * (t /7 % 9 * 7) + 2 * (t % 7) => 2 * (t % 63)
     VarHandle t("t", kInt);
@@ -3154,8 +3079,6 @@ TEST(Simplify, SimplifyModRoundModPatternFactorization) {
 }
 
 TEST(Simplify, SimplifyModRoundModPatternMultivar) {
-  KernelScope kernel_scope;
-
   {
     // t/7 % 9 * 7 + t % 7 + t => t % 63 + t
     VarHandle t("t", kInt);
@@ -3260,8 +3183,6 @@ TEST(Simplify, SimplifyModRoundModPatternMultivar) {
 }
 
 TEST(Simplify, SimplifyDivisionScalarFactorization) {
-  KernelScope kernel_scope;
-
   {
     // Simple factorization of numerator and denominator.
     // 8x / 4y => 2x / y.
@@ -3332,8 +3253,6 @@ TEST(Simplify, SimplifyDivisionScalarFactorization) {
 }
 
 TEST(Simplify, SimplifyConstantBranches) {
-  KernelScope kernel_scope;
-
   {
     // If the condition is constant true then take the true_value.
     // 1 ? x : y => x
@@ -3390,8 +3309,6 @@ TEST(Simplify, SimplifyConstantBranches) {
 }
 
 TEST(Simplify, SimplifyConstantCond) {
-  KernelScope kernel_scope;
-
   {
     // If the condition is constant true then take the true_value.
     // 1 ? A[0] = 1 : B[0] = 1 => A[0] = 1
@@ -3508,7 +3425,6 @@ TEST(Simplify, SimplifyConstantCond) {
 }
 
 TEST(Simplify, SimplifyEliminateEmptyCond) {
-  KernelScope kernel_scope;
   // If the branches are empty in different ways, eliminate.
   {
     VarHandle x("x", kInt);
@@ -3536,8 +3452,6 @@ TEST(Simplify, SimplifyEliminateEmptyCond) {
 }
 
 TEST(Simplify, SimplifyConstantComparisons) {
-  KernelScope kernel_scope;
-
   auto ComparisonTest =
       [](ExprHandle a, ExprHandle b, CompareSelectOperation op, int result) {
         ExprHandle body = CompareSelect::make(a, b, op);
@@ -3582,7 +3496,6 @@ TEST(Simplify, SimplifyConstantComparisons) {
 }
 
 TEST(Simplify, SimplifySymbolicComparisons) {
-  KernelScope kernel_scope;
   VarHandle x("x", kInt);
   VarHandle y("y", kInt);
 
@@ -3720,8 +3633,6 @@ TEST(Simplify, SimplifySymbolicComparisons) {
 }
 
 TEST(Simplify, SimplifyEliminateZeroLengthFor) {
-  KernelScope kernel_scope;
-
   {
     // Will eliminate zero loop For.
     BufHandle a("A", {4}, kInt);
@@ -3780,8 +3691,6 @@ TEST(Simplify, SimplifyEliminateZeroLengthFor) {
 }
 
 TEST(Simplify, SimplifyOneLoopFor) {
-  KernelScope kernel_scope;
-
   {
     // Will remove the loop if the body is run once.
     BufHandle a("A", {4}, kInt);
@@ -3849,8 +3758,6 @@ TEST(Simplify, SimplifyOneLoopFor) {
 }
 
 TEST(Simplify, SimplifyForWontLoseLoopOptions) {
-  KernelScope kernel_scope;
-
   {
     // Sanity check does nothing if the condition is not met.
     BufHandle a("A", {4}, kInt);
@@ -3868,8 +3775,6 @@ TEST(Simplify, SimplifyForWontLoseLoopOptions) {
 }
 
 TEST(Simplify, SimplifyMultilevelFor) {
-  KernelScope kernel_scope;
-
   {
     // Multiple layers of For will be simplified out.
     BufHandle a("A", {4}, kInt);
@@ -3927,8 +3832,6 @@ TEST(Simplify, SimplifyMultilevelFor) {
 }
 
 TEST(Simplify, SimplifyForCleansUp) {
-  KernelScope kernel_scope;
-
   {
     Placeholder a("a", kFloat, {1, 12, 1});
     VarHandle x("x", kInt);
@@ -3957,8 +3860,6 @@ TEST(Simplify, SimplifyForCleansUp) {
 }
 
 TEST(Simplify, SimplifyEliminateEmptyFor) {
-  KernelScope kernel_scope;
-
   {
     // Flatten many layers around an empty block to an empty block.
     StmtPtr last = alloc<Block>(std::vector<StmtPtr>({}));
@@ -3974,8 +3875,6 @@ TEST(Simplify, SimplifyEliminateEmptyFor) {
 }
 
 TEST(Simplify, SimplifyFlattenBlock) {
-  KernelScope kernel_scope;
-
   {
     // Flatten multiple blocks down to one.
     // { { { stmt1, stmt2 } } } =>  { stmt1, stmt2 }
@@ -4059,8 +3958,6 @@ TEST(Simplify, SimplifyFlattenBlock) {
 }
 
 TEST(Simplify, SimplifyEliminateZeroLengthAlloc) {
-  KernelScope kernel_scope;
-
   {
     // Simple positive case.
     BufHandle b("x", {0}, kInt);
@@ -4135,8 +4032,6 @@ TEST(Simplify, SimplifyEliminateZeroLengthAlloc) {
 }
 
 TEST(Simplify, DontSimplifyRand) {
-  KernelScope kernel_scope;
-
   {
     // rand() + rand() = rand() + rand() NOT 2 * rand().
     ExprHandle body =
@@ -4169,7 +4064,6 @@ TEST(Simplify, DontSimplifyRand) {
 }
 
 TEST(Simplify, SimplifyReorderForCond) {
-  KernelScope kernel_scope;
   BufHandle a("A", {4}, kInt);
   BufHandle b("B", {1}, kInt);
   BufHandle c("C", {4}, kInt);
@@ -4368,7 +4262,6 @@ TEST(Simplify, SimplifyReorderForCond) {
 }
 
 TEST(Simplify, SimplifyFuseConditions) {
-  KernelScope kernel_scope;
   BufHandle a("A", {2}, kInt);
   BufHandle b("B", {2}, kInt);
   VarHandle i("i", kInt);
@@ -4778,7 +4671,6 @@ TEST(Simplify, SimplifyFuseConditions) {
 }
 
 TEST(Simplify, SimplifySyncThreads) {
-  KernelScope kernel_scope;
   BufHandle a("A", {4}, kInt);
   VarHandle i("i", kInt);
 
@@ -4876,7 +4768,6 @@ TEST(Simplify, SimplifySyncThreads) {
 }
 
 TEST(Simplify, SimplifyRampSubBroadcast) {
-  KernelScope kernel_scope;
   int num_lanes = 4;
   ExprHandle ramp = Ramp::make(ExprHandle(0), ExprHandle(6), num_lanes);
   ExprHandle broadcast = Broadcast::make(ExprHandle(-5), num_lanes);
@@ -4890,7 +4781,6 @@ TEST(Simplify, SimplifyRampSubBroadcast) {
 }
 
 TEST(Simplify, SimplifyBroadcastTermExpander) {
-  KernelScope kernel_scope;
   int num_lanes = 8;
   ExprHandle bc0 = Broadcast::make(ExprHandle(0), num_lanes);
   ExprHandle bc1 = Broadcast::make(ExprHandle(1), num_lanes);
@@ -4920,7 +4810,6 @@ TEST(Simplify, DISABLED_CompareSelectCondAlwaysInLoopBounds) {
   //   for (int n = 1; n < N; n++) {
   //     b[n] = 1.f;
   //   }
-  KernelScope kernel_scope;
   constexpr int N = 8;
   Placeholder b("b", kFloat, {N});
   VarHandle n("n", kInt);
@@ -4945,7 +4834,6 @@ TEST(Simplify, DISABLED_IfThenCondAlwaysInLoopBounds) {
   //   for (int n = 1; n < N; n++) {
   //     b[n] = 1.f;
   //   }
-  KernelScope kernel_scope;
   constexpr int N = 8;
   Placeholder b("b", kFloat, {N});
   VarHandle n("n", kInt);
@@ -4974,7 +4862,6 @@ TEST(Simplify, DISABLED_MultiClauseCondAlwaysInLoopBounds) {
   //   for (int i = 1; i < 7; i++) {
   //     for (int j = 1; j < 7; j++) {
   //       b[i, j] = 1.f;
-  KernelScope kernel_scope;
   constexpr int N = 8;
   Placeholder b("b", kFloat, {N, N});
   VarHandle i("i", kInt);
@@ -5010,7 +4897,6 @@ TEST(Simplify, DISABLED_SimplifyLoopBounds) {
   //   for (int i = 1; i < 3; i++) {
   //     for (int j = 1; j < 3; j++) {
   //       b[i, j] = (b[i, j]) + 1.f;
-  KernelScope kernel_scope;
   constexpr int N = 8;
   constexpr int K = 3;
   Placeholder a("a", kFloat, {N, N});
diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp
index 8dd616453362b..723a8fef81bea 100644
--- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp
+++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp
@@ -5,7 +5,6 @@
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/ir/irparser.h>
 #include <torch/csrc/jit/passes/tensorexpr_fuser.h>
-#include <torch/csrc/jit/tensorexpr/mem_arena.h>
 #include <torch/csrc/jit/testing/file_check.h>
 #include <sstream>
 
diff --git a/test/cpp/tensorexpr/test_type.cpp b/test/cpp/tensorexpr/test_type.cpp
index cc8a6967b7255..67c1a0a528b7c 100644
--- a/test/cpp/tensorexpr/test_type.cpp
+++ b/test/cpp/tensorexpr/test_type.cpp
@@ -9,7 +9,6 @@ namespace jit {
 using namespace torch::jit::tensorexpr;
 
 TEST(Type, Test01) {
-  KernelScope kernel_scope;
   {
     Dtype dt1 = kInt;
     ASSERT_EQ(dt1, kInt);
@@ -45,28 +44,24 @@ TEST(Type, Test01) {
 
 TEST(Type, BitCasting) {
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kFloat);
     ExprHandle y = bitcast<int32_t>(x);
     // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
     ASSERT_EQ(y.dtype(), kInt);
   }
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kInt);
     ExprHandle y = bitcast<float>(x);
     // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
     ASSERT_EQ(y.dtype(), kFloat);
   }
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kShort);
     ExprHandle y = bitcast<at::Half>(x);
     // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
     ASSERT_EQ(y.dtype(), kHalf);
   }
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kHalf);
     ExprHandle y = bitcast<int16_t>(x);
     // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
@@ -82,7 +77,6 @@ TEST(Type, BitCasting) {
   using SimpleIRExprEval = ExprEval<SimpleIREvaluator>;
   // this is broken
   /*{
-    KernelScope kernel_scope;
     at::Half k_;
     at::Half* k = &k_;
     *reinterpret_cast<int16_t*>(k) = ref16;
@@ -93,7 +87,6 @@ TEST(Type, BitCasting) {
   }*/
 
   {
-    KernelScope kernel_scope;
     float k = raw_bitcast<float>(ref32);
     auto a = FloatImm::make(k);
     auto b = BitCast::make(kInt, a);
@@ -102,7 +95,6 @@ TEST(Type, BitCasting) {
   }
 
   {
-    KernelScope kernel_scope;
     double k = raw_bitcast<double>(ref64);
     auto a = DoubleImm::make(k);
     auto b = BitCast::make(kLong, a);
@@ -111,7 +103,6 @@ TEST(Type, BitCasting) {
   }
 
   {
-    KernelScope kernel_scope;
     int64_t k = raw_bitcast<int64_t>(reff64);
     auto a = LongImm::make(k);
     auto b = BitCast::make(kDouble, a);
@@ -120,7 +111,6 @@ TEST(Type, BitCasting) {
   }
 
   {
-    KernelScope kernel_scope;
     int32_t k = raw_bitcast<int32_t>(reff32);
     auto a = IntImm::make(k);
     auto b = BitCast::make(kFloat, a);
@@ -130,27 +120,22 @@ TEST(Type, BitCasting) {
 
   // This segfaults :(
   /*{
-    KernelScope kernel_scope;
     VarHandle x("x", kDouble);
     ASSERT_ANY_THROW(ExprHandle y = bitcast<int32_t>(x));
   }
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kFloat);
     ASSERT_ANY_THROW(ExprHandle y = bitcast<int64_t>(x));
   }
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kLong);
     ASSERT_ANY_THROW(ExprHandle y = bitcast<float>(x));
   }
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kShort);
     ASSERT_ANY_THROW(ExprHandle y = bitcast<float>(x));
   }
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kInt);
     ASSERT_ANY_THROW(ExprHandle y = bitcast<at::Half>(x));
   }*/
@@ -159,7 +144,6 @@ TEST(Type, BitCasting) {
 TEST(Type, Propagation) {
   // Same types:
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kFloat);
     VarHandle y("y", kFloat);
     ExprHandle body = FloatImm::make(2.f) +
@@ -168,7 +152,6 @@ TEST(Type, Propagation) {
   }
   // Int to bigger int:
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kShort);
     VarHandle y("y", kLong);
     ExprHandle body =
@@ -177,7 +160,6 @@ TEST(Type, Propagation) {
   }
   // Float to bigger float:
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kHalf);
     VarHandle y("y", kDouble);
     ExprHandle body =
@@ -186,7 +168,6 @@ TEST(Type, Propagation) {
   }
   // Int to Float:
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kFloat);
     VarHandle y("y", kInt);
     ExprHandle body =
@@ -195,7 +176,6 @@ TEST(Type, Propagation) {
   }
   // Smaller float, bigger Int:
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kHalf);
     VarHandle y("y", kLong);
     ExprHandle body =
@@ -204,7 +184,6 @@ TEST(Type, Propagation) {
   }
   // Bigger float, smaller Int:
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kChar);
     VarHandle y("y", kDouble);
     ExprHandle body =
@@ -213,7 +192,6 @@ TEST(Type, Propagation) {
   }
   // Sign change char/byte upgrades to short:
   {
-    KernelScope kernel_scope;
     VarHandle x("x", kChar);
     VarHandle y("y", kByte);
     ExprHandle body =
diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp
index 0f0277e37292e..16605e5e6d501 100644
--- a/test/cpp/tensorexpr/tutorial.cpp
+++ b/test/cpp/tensorexpr/tutorial.cpp
@@ -49,19 +49,6 @@
 using namespace torch::jit::tensorexpr;
 
 int main(int argc, char* argv[]) {
-  // Memory management for tensor expressions is currently done with memory
-  // arenas. That is, whenever an object is created it registers itself in an
-  // arena and the object is kept alive as long as the arena is alive. When the
-  // arena gets destructed, it deletes all objects registered in it.
-  //
-  // The easiest way to set up a memory arena is to use `KernelScope` class - it
-  // is a resource guard that creates a new arena on construction and restores
-  // the previously set arena on destruction.
-  //
-  // We will create a kernel scope here, and thus we'll set up a mem arena for
-  // the entire tutorial.
-  KernelScope kernel_scope;
-
   std::cout << "*** Structure of tensor expressions ***" << std::endl;
   {
     // A tensor expression is a tree of expressions. Each expression has a type,
diff --git a/test/test_tensorexpr_pybind.py b/test/test_tensorexpr_pybind.py
index 0ae59e1c56484..6a348053c01fd 100644
--- a/test/test_tensorexpr_pybind.py
+++ b/test/test_tensorexpr_pybind.py
@@ -9,14 +9,6 @@
 LLVM_ENABLED = torch._C._llvm_enabled()
 
 
-class kernel_arena_scope(object):
-    def __enter__(self):
-        self.scope = torch._C._te.KernelScope()
-
-    def __exit__(self, typ, val, traceback):
-        self.scope = None
-
-
 def construct_adder(n: int, dtype=te.Dtype.Float):
     dN = te.ExprHandle.int(n)
     A = te.Placeholder('A', dtype, [dN])
@@ -36,85 +28,80 @@ def compute(i):
 
 class TestTensorExprPyBind(JitTestCase):
     def test_simple_sum(self):
-        with kernel_arena_scope():
-            n = 32
-            cg = construct_adder(n)
+        n = 32
+        cg = construct_adder(n)
 
-            tA = torch.randn(n)
-            tB = torch.randn(n)
-            tC = torch.empty(n)
-            cg.call([tA, tB, tC])
-            torch.testing.assert_close(tA + tB, tC)
+        tA = torch.randn(n)
+        tB = torch.randn(n)
+        tC = torch.empty(n)
+        cg.call([tA, tB, tC])
+        torch.testing.assert_close(tA + tB, tC)
 
     def test_call_raw(self):
-        with kernel_arena_scope():
-            n = 16
-            cg = construct_adder(n, dtype=torch.float64)
+        n = 16
+        cg = construct_adder(n, dtype=torch.float64)
 
-            tA = torch.randn(n, dtype=torch.float64)
-            tB = torch.randn(n, dtype=torch.float64)
-            tC = torch.empty(n, dtype=torch.float64)
-            cg.call_raw([tA.data_ptr(), tB.data_ptr(), tC.data_ptr()])
-            torch.testing.assert_close(tA + tB, tC)
+        tA = torch.randn(n, dtype=torch.float64)
+        tB = torch.randn(n, dtype=torch.float64)
+        tC = torch.empty(n, dtype=torch.float64)
+        cg.call_raw([tA.data_ptr(), tB.data_ptr(), tC.data_ptr()])
+        torch.testing.assert_close(tA + tB, tC)
 
     def test_external_calls(self):
-        with kernel_arena_scope():
-            dtype = torch.float32
+        dtype = torch.float32
 
-            ONE = te.ExprHandle.int(1)
-            FOUR = te.ExprHandle.int(4)
-            A = te.BufHandle('A', [ONE, FOUR], dtype)
-            B = te.BufHandle('B', [FOUR, ONE], dtype)
-            C = te.BufHandle('C', [ONE, ONE], dtype)
+        ONE = te.ExprHandle.int(1)
+        FOUR = te.ExprHandle.int(4)
+        A = te.BufHandle('A', [ONE, FOUR], dtype)
+        B = te.BufHandle('B', [FOUR, ONE], dtype)
+        C = te.BufHandle('C', [ONE, ONE], dtype)
 
-            s = te.ExternalCall(C, "nnc_aten_matmul", [A, B], [])
+        s = te.ExternalCall(C, "nnc_aten_matmul", [A, B], [])
 
-            loopnest = te.LoopNest(s, [C])
-            loopnest.prepare_for_codegen()
-            codegen = te.construct_codegen('ir_eval', s, [te.BufferArg(x) for x in [A, B, C]])
+        loopnest = te.LoopNest(s, [C])
+        loopnest.prepare_for_codegen()
+        codegen = te.construct_codegen('ir_eval', s, [te.BufferArg(x) for x in [A, B, C]])
 
-            tA = torch.ones(1, 4)
-            tB = torch.ones(4, 1)
-            tC = torch.empty(1, 1)
-            codegen.call([tA, tB, tC])
-            torch.testing.assert_close(torch.matmul(tA, tB), tC)
+        tA = torch.ones(1, 4)
+        tB = torch.ones(4, 1)
+        tC = torch.empty(1, 1)
+        codegen.call([tA, tB, tC])
+        torch.testing.assert_close(torch.matmul(tA, tB), tC)
 
     def test_dynamic_shape(self):
-        with kernel_arena_scope():
-            dN = te.VarHandle(torch.int32)
-            A = te.BufHandle(torch.float64)
-            B = te.BufHandle(torch.float64)
+        dN = te.VarHandle(torch.int32)
+        A = te.BufHandle(torch.float64)
+        B = te.BufHandle(torch.float64)
 
-            def compute(i):
-                return A.load(i) - B.load(i)
+        def compute(i):
+            return A.load(i) - B.load(i)
 
-            C = te.Compute('C', [dN], compute)
+        C = te.Compute('C', [dN], compute)
 
-            loopnest = te.LoopNest([C])
-            loopnest.prepare_for_codegen()
+        loopnest = te.LoopNest([C])
+        loopnest.prepare_for_codegen()
 
-            cg = te.construct_codegen(
-                'ir_eval',
-                loopnest.simplify(),
-                [A, B, C, dN])
+        cg = te.construct_codegen(
+            'ir_eval',
+            loopnest.simplify(),
+            [A, B, C, dN])
 
-            def test_with_shape(n):
-                tA = torch.randn(n, dtype=torch.double)
-                tB = torch.randn(n, dtype=torch.double)
-                tC = torch.empty(n, dtype=torch.double)
-                cg.call([tA, tB, tC, n])
-                torch.testing.assert_close(tA - tB, tC)
+        def test_with_shape(n):
+            tA = torch.randn(n, dtype=torch.double)
+            tB = torch.randn(n, dtype=torch.double)
+            tC = torch.empty(n, dtype=torch.double)
+            cg.call([tA, tB, tC, n])
+            torch.testing.assert_close(tA - tB, tC)
 
-            test_with_shape(8)
-            test_with_shape(31)
+        test_with_shape(8)
+        test_with_shape(31)
 
     def test_dtype_error(self):
-        with kernel_arena_scope():
-            one = te.ExprHandle.int(1)
-            te.Placeholder([one], torch.float32)  # ok
-            te.Placeholder([one])  # ok
-            self.assertRaises(TypeError,
-                              lambda: te.Placeholder([one], "float55"))
+        one = te.ExprHandle.int(1)
+        te.Placeholder([one], torch.float32)  # ok
+        te.Placeholder([one])  # ok
+        self.assertRaises(TypeError,
+                          lambda: te.Placeholder([one], "float55"))
 
     @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled")
     def test_kernel_with_tensor_inputs(self):
@@ -396,23 +383,22 @@ def f(a):
 
     @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled")
     def test_alloc_in_loop(self):
-        with kernel_arena_scope():
-            a, tmp, b = [
-                te.Placeholder(name, te.Dtype.Float, [te.ExprHandle.int(1)])
-                for name in ["a", "tmp", "b"]]
-            t0, t100 = [te.ExprHandle.int(n) for n in [0, 100]]
-            body = te.Block([
-                tmp.store([t0], a.load([t0])),
-                b.store([t0], tmp.load([t0]))
-            ])
-            for _ in range(4):
-                i = te.VarHandle("i", te.Dtype.Int)
-                body = te.For.make(i, t0, t100, body)
-            nest = te.LoopNest(body, [b.data()])
-            nest.prepare_for_codegen()
-            f = te.construct_codegen("llvm", nest.simplify(), [a, b])
-            ta, tb = [torch.ones(1) for _ in range(2)]
-            f.call([ta.data_ptr(), tb.data_ptr()])
+        a, tmp, b = [
+            te.Placeholder(name, te.Dtype.Float, [te.ExprHandle.int(1)])
+            for name in ["a", "tmp", "b"]]
+        t0, t100 = [te.ExprHandle.int(n) for n in [0, 100]]
+        body = te.Block([
+            tmp.store([t0], a.load([t0])),
+            b.store([t0], tmp.load([t0]))
+        ])
+        for _ in range(4):
+            i = te.VarHandle("i", te.Dtype.Int)
+            body = te.For.make(i, t0, t100, body)
+        nest = te.LoopNest(body, [b.data()])
+        nest.prepare_for_codegen()
+        f = te.construct_codegen("llvm", nest.simplify(), [a, b])
+        ta, tb = [torch.ones(1) for _ in range(2)]
+        f.call([ta.data_ptr(), tb.data_ptr()])
 
 if __name__ == '__main__':
     run_tests()
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 2eabbd0a8b230..157c30663ce1c 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -301,7 +301,6 @@ core_sources_full_mobile_no_backend_interface = [
     "torch/csrc/jit/tensorexpr/llvm_codegen.cpp",
     "torch/csrc/jit/tensorexpr/llvm_jit.cpp",
     "torch/csrc/jit/tensorexpr/loopnest.cpp",
-    "torch/csrc/jit/tensorexpr/mem_arena.cpp",
     "torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp",
     "torch/csrc/jit/tensorexpr/operators/conv2d.cpp",
     "torch/csrc/jit/tensorexpr/operators/matmul.cpp",
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 0a34f476b0d3b..5ef770c6755fc 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -481,8 +481,6 @@ static constexpr int kVectorWidth = 16;
 #ifdef TORCH_ENABLE_LLVM
 
 struct TEWrapper {
-  tensorexpr::KernelArena ka;
-  tensorexpr::KernelScope ks;
   std::unique_ptr<tensorexpr::LLVMCodeGen> cg;
   TEWrapper() = default;
   void update(std::unique_ptr<tensorexpr::LLVMCodeGen>&& cg_) {
@@ -534,8 +532,6 @@ std::shared_ptr<TEWrapper> wrapTECompute(
 #else
 
 struct TEWrapper {
-  tensorexpr::KernelArena ka;
-  tensorexpr::KernelScope ks;
   TEWrapper() = default;
   template <typename... Ts>
   void operator()(const Ts&... ts) {
diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h
index 1b942eaf353fc..108236e2e17f8 100644
--- a/torch/csrc/jit/tensorexpr/expr.h
+++ b/torch/csrc/jit/tensorexpr/expr.h
@@ -8,7 +8,6 @@
 #include <torch/csrc/jit/tensorexpr/fwd_decls.h>
 #include <torch/csrc/jit/tensorexpr/ir_mutator.h>
 #include <torch/csrc/jit/tensorexpr/ir_visitor.h>
-#include <torch/csrc/jit/tensorexpr/mem_arena.h>
 #include <torch/csrc/jit/tensorexpr/types.h>
 
 namespace torch {
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index 8076ba2b71d67..fed5e1e139d3d 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -2909,7 +2909,6 @@ void TensorExprKernel::bindConstant(const torch::jit::Value* v) {
 }
 
 void TensorExprKernel::compile() {
-  KernelScope kernelScope(&kernelArena_);
   GRAPH_DUMP("TensorExprKernel graph:", graph_);
 
   device_ = *pickDeviceType(graph_);
@@ -3080,8 +3079,6 @@ StmtPtr TensorExprKernel::getCodeGenStmt() {
 }
 
 void TensorExprKernel::runKernel(Stack& stack) {
-  KernelScope kernelScope(&kernelArena_);
-
   // Set up arguments (inputs, then outputs) for kernel call.
   auto inputs = last(stack, nInputs_);
   std::vector<at::Tensor> outputs;
@@ -3101,8 +3098,6 @@ void TensorExprKernel::runKernel(Stack& stack) {
 void TensorExprKernel::runFast(
     const std::vector<void*>& inputs,
     const std::vector<void*>& outputs) {
-  KernelScope kernelScope(&kernelArena_);
-
   std::vector<void*> args(inputs);
   args.reserve(inputs.size() + outputs.size() + constants_.size());
   args.insert(args.end(), outputs.begin(), outputs.end());
diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h
index a8a57b9f15a16..99a3b123a6816 100644
--- a/torch/csrc/jit/tensorexpr/kernel.h
+++ b/torch/csrc/jit/tensorexpr/kernel.h
@@ -266,7 +266,6 @@ class TORCH_API TensorExprKernel {
   std::unordered_map<const torch::jit::Value*, std::string> input_name_map_;
   std::unique_ptr<CodeGen> codegen_;
   at::Device device_ = at::kCPU;
-  KernelArena kernelArena_;
   std::shared_ptr<Graph> graph_;
   Code code_;
   bool allow_fallback_{false};
diff --git a/torch/csrc/jit/tensorexpr/mem_arena.cpp b/torch/csrc/jit/tensorexpr/mem_arena.cpp
deleted file mode 100644
index 1769563424f5c..0000000000000
--- a/torch/csrc/jit/tensorexpr/mem_arena.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-#include <c10/util/Exception.h>
-#include <torch/csrc/jit/tensorexpr/mem_arena.h>
-#include <stdexcept>
-
-namespace torch {
-namespace jit {
-namespace tensorexpr {
-
-namespace {
-// Define in an anonymous namespace to hide this symbol from other compilation
-// units
-thread_local KernelArena* current_arena = nullptr;
-} // namespace
-
-KernelArena::~KernelArena() {
-  for (KernelScopedObject* p : kernel_objects_) {
-    delete p;
-  }
-}
-
-KernelScopedObject::KernelScopedObject() {
-  KernelArena* kernel = KernelArena::GetCurrentKernelArena();
-  if (kernel == nullptr) {
-    throw std::runtime_error(
-        "KernelScope() must be constructed before calling this");
-  }
-  kernel->kernel_objects_.push_back(this);
-}
-
-void KernelArena::SetCurrentKernelArena(KernelArena* new_kernel_arena) {
-  current_arena = new_kernel_arena;
-}
-
-KernelArena* KernelArena::GetCurrentKernelArena() {
-  return current_arena;
-}
-
-KernelScope::KernelScope()
-    : kernel_arena_(new KernelArena()),
-      old_kernel_arena_(KernelArena::GetCurrentKernelArena()),
-      owning_(true) {
-  KernelArena::SetCurrentKernelArena(kernel_arena_);
-}
-
-KernelScope::KernelScope(KernelArena* arena_)
-    : kernel_arena_(arena_),
-      old_kernel_arena_(KernelArena::GetCurrentKernelArena()),
-      owning_(false) {
-  KernelArena::SetCurrentKernelArena(kernel_arena_);
-}
-
-KernelScope::~KernelScope() {
-  if (KernelArena::GetCurrentKernelArena() != kernel_arena_) {
-    // This should be an error, but it gets triggered in
-    // caffe2/benchmarks/static_runtime:static_runtime_cpptest
-    TORCH_WARN("KernelScope() destructed out of order, leaking memory");
-    return;
-  }
-  KernelArena::SetCurrentKernelArena(old_kernel_arena_);
-  if (owning_) {
-    delete kernel_arena_;
-  }
-}
-
-} // namespace tensorexpr
-} // namespace jit
-} // namespace torch
diff --git a/torch/csrc/jit/tensorexpr/mem_arena.h b/torch/csrc/jit/tensorexpr/mem_arena.h
deleted file mode 100644
index a39ab6f0068c7..0000000000000
--- a/torch/csrc/jit/tensorexpr/mem_arena.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#pragma once
-#include <torch/csrc/WindowsTorchApiMacro.h>
-#include <vector>
-
-namespace torch {
-namespace jit {
-namespace tensorexpr {
-
-class KernelScopedObject;
-
-// An arena that manages all the underlying kernel-scoped objects.
-// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-class KernelArena {
- public:
-  static KernelArena* GetCurrentKernelArena();
-  static void SetCurrentKernelArena(KernelArena* new_arena);
-  TORCH_API KernelArena() = default;
-  TORCH_API ~KernelArena();
-  KernelArena(const KernelArena&) = delete;
-  KernelArena& operator=(const KernelArena&) = delete;
-
- private:
-  friend class KernelScopedObject;
-  std::vector<KernelScopedObject*> kernel_objects_; // owned
-};
-
-// A RAII convenience wrapper on top of a kernel.
-// It either creates or takes an existing Kernel and sets it as the current
-// Kernel. When this object is destroyed, the previous Kernel is set as current,
-// and the created kernel is freed. If the kernel was passed, it stays alive.
-class KernelScope {
- public:
-  TORCH_API KernelScope();
-  TORCH_API explicit KernelScope(KernelArena* arena_);
-  TORCH_API ~KernelScope();
-  KernelScope(const KernelScope&) = delete;
-  KernelScope& operator=(const KernelScope&) = delete;
-
- private:
-  KernelArena* kernel_arena_; // maybe owned
-  KernelArena* old_kernel_arena_; // previous arena, restored in destructor
-  bool owning_; // determines whether the arena will be freed along with
-                // the scope object
-};
-
-// The base object managed by the Kernel.
-// The object must be created through "new", and when the Kernel is destroyed,
-// All its registered objects are destroyed through "delete".
-class TORCH_API KernelScopedObject {
- public:
-  KernelScopedObject();
-  virtual ~KernelScopedObject() = default;
-
-  KernelScopedObject(const KernelScopedObject&) = delete;
-  KernelScopedObject& operator=(const KernelScopedObject&) = delete;
-};
-
-} // namespace tensorexpr
-} // namespace jit
-} // namespace torch
diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
index c380233cce16a..c7f48824303d4 100644
--- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
+++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
@@ -61,7 +61,6 @@ void initTensorExprBindings(PyObject* module) {
 
   // Tensor Expr Classes
   auto te = m.def_submodule("_te");
-  py::class_<KernelScope>(te, "KernelScope").def(py::init<>());
 
   auto dtype_class =
       py::class_<Dtype>(te, "Dtype").def(py::init(&parsePythonDtype));

From c545b099aac56a7394875346613789bfc7f71cb7 Mon Sep 17 00:00:00 2001
From: Marjan Fariborz <marjanf@fb.com>
Date: Tue, 24 Aug 2021 01:43:33 -0700
Subject: [PATCH 161/530] Separating quantization test from distributed_test
 (#63058)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63058

Dedicating separate tests for different quantization methods. Currently supporting FP16 method.
ghstack-source-id: 136499767

Test Plan: uck test mode/dev //caffe2/test/distributed/algorithms/quantization:quantization_gloo_fork -- name_of_the_test

Reviewed By: wanchaol

Differential Revision: D30142580

fbshipit-source-id: 3aacec1a231a662067d2b48c001f0c69fefcdd60
---
 .../quantization/test_quantization.py         | 180 ++++++++++++++++++
 .../{ => quantization}/quantization.py        |   3 -
 .../_internal/distributed/distributed_test.py |  36 +---
 3 files changed, 183 insertions(+), 36 deletions(-)
 create mode 100644 test/distributed/algorithms/quantization/test_quantization.py
 rename torch/distributed/algorithms/{ => quantization}/quantization.py (99%)

diff --git a/test/distributed/algorithms/quantization/test_quantization.py b/test/distributed/algorithms/quantization/test_quantization.py
new file mode 100644
index 0000000000000..7872920f21141
--- /dev/null
+++ b/test/distributed/algorithms/quantization/test_quantization.py
@@ -0,0 +1,180 @@
+import torch
+import os
+import torch.cuda
+import sys
+import torch.distributed as dist
+import torch.distributed.algorithms.quantization.quantization as quant
+from torch.distributed.algorithms.quantization.quantization import DQuantType
+from torch.testing._internal.common_distributed import (
+    MultiProcessTestCase,
+    requires_gloo,
+    skip_if_lt_x_gpu,
+    requires_nccl,
+)
+from torch.testing._internal.distributed.distributed_test import (
+    apply_hack_for_nccl
+)
+from torch.testing._internal.common_utils import sandcastle_skip_if, run_tests, TEST_WITH_DEV_DBG_ASAN, NO_MULTIPROCESSING_SPAWN
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+def _build_tensor(size, value=None, dtype=torch.float, device_id=None):
+    if value is None:
+        value = size
+    if device_id is None:
+        return torch.empty(size, size, size, dtype=dtype).fill_(value)
+    else:
+        return torch.empty(size, size, size, dtype=dtype).fill_(value).cuda(device_id)
+if TEST_WITH_DEV_DBG_ASAN:
+    print("Skip dev-asan as torch + multiprocessing spawn have known issues", file=sys.stderr)
+    sys.exit(0)
+
+if NO_MULTIPROCESSING_SPAWN:
+    print("Spawn not available, skipping tests.", file=sys.stderr)
+    sys.exit(0)
+
+BACKEND = os.environ["BACKEND"]
+
+if BACKEND == "gloo" or BACKEND == "nccl":
+    class DistQuantizationTests(MultiProcessTestCase):
+
+        def setUp(self):
+            super(DistQuantizationTests, self).setUp()
+            self._spawn_processes()
+            torch.backends.cudnn.flags(allow_tf32=False).__enter__()
+
+        def tearDown(self):
+            super(DistQuantizationTests, self).tearDown()
+            try:
+                os.remove(self.file_name)
+            except OSError:
+                pass
+
+        @property
+        def op_timeout_sec(self):
+            return 1
+
+        @property
+        def world_size(self):
+            return 2
+
+        def _init_multigpu_helper(self):
+            """Multigpu tests are designed to simulate the multi nodes with multi
+            GPUs on each node. Nccl backend requires equal #GPUs in each process.
+            On a single node, all visible GPUs are evenly
+            divided to subsets, each process only uses a subset.
+            """
+            nGPUs = torch.cuda.device_count()
+            world_size = dist.get_world_size()
+            visible_devices = range(nGPUs)
+
+            if BACKEND == "nccl":
+                apply_hack_for_nccl()
+
+            # If rank is lesser than or equal to number of available GPU's
+            # then each rank can be mapped to corresponding GPU.
+            nGPUs_per_process = 1
+            if world_size > nGPUs:
+                nGPUs_per_process = nGPUs // world_size
+            rank_to_GPU = {
+                i: list(
+                    visible_devices[i * nGPUs_per_process : (i + 1) * nGPUs_per_process]
+                )
+                for i in range(world_size)
+            }
+            return rank_to_GPU
+
+        @requires_gloo()
+        @sandcastle_skip_if(BACKEND != "gloo", "Only gloo backend supports all_gather_fp16")
+        def test_all_gather_fp16(self):
+            store = dist.FileStore(self.file_name, int(self.world_size))
+            dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
+            device = torch.device(f"cuda:{self.rank}")
+            group = list(range(0, self.world_size))
+            group_id = dist.group.WORLD
+            self._test_all_gather(group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.FP16)
+
+        @requires_nccl()
+        @sandcastle_skip_if(BACKEND != "nccl", "Only nccl backend supports all_to_all_fp16")
+        @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
+        def test_all_to_all_fp16(self):
+            store = dist.FileStore(self.file_name, int(self.world_size))
+            dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='nccl')
+            device = torch.device(f"cuda:{self.rank}")
+            group = list(range(0, self.world_size))
+            group_id = dist.new_group(range(self.world_size))
+            rank_to_GPU = self._init_multigpu_helper()
+            self._test_all_to_all(
+                group,
+                group_id,
+                self.rank,
+                cuda=True,
+                rank_to_GPU=rank_to_GPU,
+                dtype=torch.float32,
+                qtype=DQuantType.FP16)
+
+        def _test_all_gather(
+                self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float, qtype=None):
+            for dest in group:
+                tensor = _build_tensor(dest + 1, rank, dtype=dtype)
+                tensors = [_build_tensor(dest + 1, -1, dtype=dtype) for i in group]
+                expected_tensors = [_build_tensor(dest + 1, i, dtype=dtype) for i in group]
+                if (qtype is not None):
+                    allgather = quant.auto_quantize(dist.all_gather, qtype, quant_loss=None)
+                else:
+                    allgather = dist.all_gather
+                if cuda:
+                    tensor = tensor.cuda(rank_to_GPU[rank][0])
+                    tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors]
+                if tensors[0].dtype == torch.complex64:
+                    tensor_shapes = [torch.view_as_real(tensors[0]).shape]
+                else:
+                    tensor_shapes = [tensors[0].shape]
+                allgather(tensors, tensor, group=group_id, async_op=False)
+
+                for t1, t2 in zip(tensors, expected_tensors):
+                    self.assertEqual(t1, t2)
+
+        def _test_all_to_all(
+            self,
+            group,
+            group_id,
+            rank,
+            cuda=False,
+            rank_to_GPU=None,
+            dtype=torch.float,
+            qtype=None
+        ):
+            if group_id is not None:
+                size = len(group)
+                in_splits = [i + 1 for i in group]
+                in_tensors = [
+                    torch.ones([in_splits[i], size], dtype=dtype) * rank
+                    for i, _ in enumerate(group)
+                ]
+                out_tensors = [
+                    torch.ones([(rank + 1), size], dtype=dtype) for _ in group
+                ]
+                expected_tensors = [
+                    torch.ones([rank + 1, size], dtype=dtype) * i for i in group
+                ]
+                if cuda:
+                    in_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in in_tensors]
+                    expected_tensors = [
+                        t.cuda(rank_to_GPU[rank][0]) for t in expected_tensors
+                    ]
+                    out_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in out_tensors]
+                if(qtype is not None):
+                    quantize_alltoall = quant.auto_quantize(dist.all_to_all, qtype, quant_loss=None)
+                    quantize_alltoall(out_tensors, in_tensors, group=group_id)
+                else:
+                    dist.all_to_all(out_tensors, in_tensors, group=group_id)
+                for t1, t2 in zip(out_tensors, expected_tensors):
+                    self.assertEqual(t1, t2)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/algorithms/quantization.py b/torch/distributed/algorithms/quantization/quantization.py
similarity index 99%
rename from torch/distributed/algorithms/quantization.py
rename to torch/distributed/algorithms/quantization/quantization.py
index dead78af600b2..724d6aa362487 100644
--- a/torch/distributed/algorithms/quantization.py
+++ b/torch/distributed/algorithms/quantization/quantization.py
@@ -86,16 +86,13 @@ def auto_quantize(func, qtype, quant_loss=None):
     """
     This is a prototype API that automatically quantize the input tensors, choose the precision types, and
     pass other necessary arguments and then dequantizes the output.
-
     Currently it only supports:
         . FP16 quantization method
         . all_gather, all_to_all collective ops
-
     Args:
         func (callable): A function representing collective operations.
         qtype (QuantType): Quantization method
         quant_loss (float, optional): This can be used to improve accuracy in the dequantization.
-
     Returns:
         (callable): the same collective as func but enables automatic quantization/dequantization.
     """
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 1631983d32ec7..f4bc073a4317e 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -19,7 +19,6 @@
 import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD
 import torch.distributed.algorithms.model_averaging.averagers as averagers
 import torch.distributed.algorithms.model_averaging.utils as model_averaging_utils
-import torch.distributed.algorithms.quantization as quant
 import torch.nn as nn
 import torch.nn.functional as F
 from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR
@@ -29,7 +28,6 @@
 from torch.distributed.algorithms.ddp_comm_hooks import (
     quantization as quantization_hooks,
 )
-from torch.distributed.algorithms.quantization import DQuantType
 from torch.distributed.distributed_c10d import (
     get_world_size,
     _get_default_group,
@@ -2764,15 +2762,12 @@ def test_gather_full_group(self):
 
         # ALL GATHER
         def _test_all_gather_helper(
-            self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float, qtype=None
+            self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float
         ):
             for dest in group:
                 tensor = _build_tensor(dest + 1, rank, dtype=dtype)
                 tensors = [_build_tensor(dest + 1, -1, dtype=dtype) for i in group]
-                if qtype is not None:
-                    allgather = quant.auto_quantize(dist.all_gather, qtype, quant_loss=None)
-                else:
-                    allgather = dist.all_gather
+                allgather = dist.all_gather
                 if cuda:
                     tensor = tensor.cuda(rank_to_GPU[rank][0])
                     tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors]
@@ -2838,12 +2833,6 @@ def test_all_gather_full_group(self):
             group, group_id, rank = self._init_full_group_test()
             self._test_all_gather_helper(group, group_id, rank)
 
-        @sandcastle_skip_if(BACKEND == "nccl", "Nccl does not support CPU tensors")
-        @sandcastle_skip_if(BACKEND == "mpi", "all_gather_quantized does not support MPI")
-        def test_all_gather_quantized(self):
-            group, group_id, rank = self._init_global_test()
-            self._test_all_gather_helper(group, group_id, rank, dtype=torch.float32, qtype=DQuantType.FP16)
-
         def _run_all_gather_coalesced_and_verify(
             self, output_tensor_lists, input_tensors, expected_tensors, group_id
         ):
@@ -3046,7 +3035,6 @@ def _test_all_to_all_helper(
             cuda=False,
             rank_to_GPU=None,
             dtype=torch.float,
-            qtype=None
         ):
             if group_id is not None:
                 size = len(group)
@@ -3067,11 +3055,7 @@ def _test_all_to_all_helper(
                         t.cuda(rank_to_GPU[rank][0]) for t in expected_tensors
                     ]
                     out_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in out_tensors]
-                if(qtype is not None):
-                    quantize_alltoall = quant.auto_quantize(dist.all_to_all, qtype, quant_loss=None)
-                    quantize_alltoall(out_tensors, in_tensors, group=group_id)
-                else:
-                    dist.all_to_all(out_tensors, in_tensors, group=group_id)
+                dist.all_to_all(out_tensors, in_tensors, group=group_id)
                 for t1, t2 in zip(out_tensors, expected_tensors):
                     self.assertEqual(t1, t2)
             self._barrier()
@@ -3154,20 +3138,6 @@ def test_all_to_all(self):
             group, group_id, rank = self._init_global_test()
             self._test_all_to_all_helper(group, group_id, rank)
 
-        @sandcastle_skip_if(BACKEND != "nccl", "Only NCCL supports all_to_all")
-        @skip_if_rocm
-        def test_all_to_all_quantized(self):
-            group, group_id, rank = self._init_global_test()
-            rank_to_GPU = self._init_multigpu_helper()
-            self._test_all_to_all_helper(
-                group,
-                group_id,
-                rank,
-                cuda=True,
-                rank_to_GPU=rank_to_GPU,
-                dtype=torch.float32,
-                qtype=DQuantType.FP16)
-
         @sandcastle_skip_if(BACKEND != "nccl", "Only NCCL supports CUDA all_to_all")
         @skip_if_rocm
         def test_all_to_all_cuda(self):

From 83d9bad44a1e1e6202103cd22e4dbd2bd3d7dae0 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Tue, 24 Aug 2021 06:52:38 -0700
Subject: [PATCH 162/530] Add a common autograd TLS state (#63114)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63114

This PR collapses the GradMode and InferenceMode thread local booleans into a single thread local uint8.
This helps reducing the number of thread local variable accesses done when we propagate ThreadLocalStates.

Note that this is even more beneficial as we will add a forward mode AD TLS (similar to GradMode) higher in this stack and this new structure should reduce the perf impact of adding this new TLS.

Here is the full benchmark result between master and the top of this stack: https://gist.github.com/albanD/e421101e9ed344e94999bef3a54bf0f3
tl;dr: give a benefit in most cases. It is never detrimental.

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30388099

Pulled By: albanD

fbshipit-source-id: 8e03f940150ff063c2edd792733663413ae2f486
---
 aten/src/ATen/ThreadLocalState.cpp | 22 +++++++++++------
 aten/src/ATen/ThreadLocalState.h   |  5 +---
 c10/core/AutogradState.cpp         | 19 +++++++++++++++
 c10/core/AutogradState.h           | 39 ++++++++++++++++++++++++++++++
 c10/core/GradMode.cpp              |  7 +++---
 c10/core/InferenceMode.cpp         |  8 +-----
 c10/core/InferenceMode.h           | 19 +++++++--------
 7 files changed, 87 insertions(+), 32 deletions(-)
 create mode 100644 c10/core/AutogradState.cpp
 create mode 100644 c10/core/AutogradState.h

diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp
index ba7be1a06b8a1..fc4b8fa9c27ec 100644
--- a/aten/src/ATen/ThreadLocalState.cpp
+++ b/aten/src/ATen/ThreadLocalState.cpp
@@ -12,15 +12,12 @@ namespace at {
 ThreadLocalState::ThreadLocalState(bool keep_grad_mode)
     : dispatch_key_(c10::impl::tls_local_dispatch_key_set()),
       debug_info_(c10::ThreadLocalDebugInfo::current()),
-      inference_mode_enabled_(c10::InferenceMode::is_enabled()) {
+      autograd_tls_(c10::AutogradState::get_tls_state()) {
   rf_tls_ = at::get_record_function_tls_();
   saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks();
 
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   keep_grad_mode_ = keep_grad_mode;
-  if (keep_grad_mode_) {
-    grad_mode_enabled_ = GradMode::is_enabled();
-  }
 #endif
   bumped_record_all_functions_ = at::checkRecordAllFunctions();
 }
@@ -28,10 +25,23 @@ ThreadLocalState::ThreadLocalState(bool keep_grad_mode)
 /* static */
 void ThreadLocalState::setThreadLocalState(
     const ThreadLocalState& state) {
+  // Note that setting the InferenceMode TLS in this function is ONLY ok because we always
+  // restore the dispatch key set TLS at the same time.
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   if (state.keep_grad_mode_) {
-    GradMode::set_enabled(state.grad_mode_enabled_);
+    c10::AutogradState::set_tls_state(state.autograd_tls_);
+  } else {
+    auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(),
+                                        /* inference_mode */ state.autograd_tls_.get_inference_mode());
+    c10::AutogradState::set_tls_state(new_state);
   }
+#else
+  // The mobile build explicitly ignore grad_mode but fails if we propagate
+  // its value across threads or set it to a fixed value.
+  // So we have to make sure the grad_mode value is not changed here.
+  auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(),
+                                      /* inference_mode */ state.autograd_tls_.get_inference_mode());
+  c10::AutogradState::set_tls_state(new_state);
 #endif
 
   at::set_record_function_tls_(state.rf_tls_);
@@ -43,8 +53,6 @@ void ThreadLocalState::setThreadLocalState(
   c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_);
 
   c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_);
-
-  c10::InferenceMode::_set_enabled(state.inference_mode_enabled_);
 }
 
 } // namespace at
diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h
index f30f5e3442cc1..4942399cbd6d7 100644
--- a/aten/src/ATen/ThreadLocalState.h
+++ b/aten/src/ATen/ThreadLocalState.h
@@ -35,14 +35,11 @@ class TORCH_API ThreadLocalState {
   // RecordFunction TLS
   RecordFunctionTLS rf_tls_;
 
+  AutogradState autograd_tls_;
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   bool keep_grad_mode_ = true;
-  bool grad_mode_enabled_;
 #endif
 
-  // TLS for InferenceMode
-  bool inference_mode_enabled_;
-
   // TLS for saved tensors default hooks
   std::pair<PyObject*, PyObject*> saved_tensors_default_hooks_;
 
diff --git a/c10/core/AutogradState.cpp b/c10/core/AutogradState.cpp
new file mode 100644
index 0000000000000..9684a76b78564
--- /dev/null
+++ b/c10/core/AutogradState.cpp
@@ -0,0 +1,19 @@
+#include <c10/core/AutogradState.h>
+
+namespace c10 {
+
+namespace {
+// By default, grad mode is enabled and inference mode is disabled
+thread_local AutogradState autograd_state_tls =
+    AutogradState(/* grad_mode */ true, /* inference_mode */ false);
+} // namespace
+
+AutogradState& AutogradState::get_tls_state() {
+  return autograd_state_tls;
+}
+
+void AutogradState::set_tls_state(AutogradState state) {
+  autograd_state_tls = state;
+}
+
+} // namespace c10
diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h
new file mode 100644
index 0000000000000..83ea3607cd2af
--- /dev/null
+++ b/c10/core/AutogradState.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+
+#include <cstdint>
+
+namespace c10 {
+
+// Structure used to pack all the thread local boolean
+// flags used by autograd
+struct TORCH_API AutogradState {
+  static AutogradState& get_tls_state();
+  static void set_tls_state(AutogradState state);
+
+  AutogradState(bool grad_mode, bool inference_mode)
+      : grad_mode_(grad_mode), inference_mode_(inference_mode) {}
+
+  void set_grad_mode(bool enabled) {
+    grad_mode_ = enabled;
+  }
+
+  void set_inference_mode(bool enabled) {
+    inference_mode_ = enabled;
+  }
+
+  bool get_grad_mode() const {
+    return grad_mode_;
+  }
+
+  bool get_inference_mode() const {
+    return inference_mode_;
+  }
+
+ private:
+  bool grad_mode_ : 1;
+  bool inference_mode_ : 1;
+};
+
+} // namespace c10
diff --git a/c10/core/GradMode.cpp b/c10/core/GradMode.cpp
index 32747a6698afa..a5db198083b2b 100644
--- a/c10/core/GradMode.cpp
+++ b/c10/core/GradMode.cpp
@@ -1,16 +1,15 @@
+#include <c10/core/AutogradState.h>
 #include <c10/core/GradMode.h>
 
 #include <stdexcept>
 
 namespace c10 {
 
-thread_local bool GradMode_enabled = true;
-
 bool GradMode::is_enabled() {
-  return GradMode_enabled;
+  return AutogradState::get_tls_state().get_grad_mode();
 }
 
 void GradMode::set_enabled(bool enabled) {
-  GradMode_enabled = enabled;
+  AutogradState::get_tls_state().set_grad_mode(enabled);
 }
 } // namespace c10
diff --git a/c10/core/InferenceMode.cpp b/c10/core/InferenceMode.cpp
index b588ab4da54b5..59eca760cf504 100644
--- a/c10/core/InferenceMode.cpp
+++ b/c10/core/InferenceMode.cpp
@@ -2,18 +2,12 @@
 #include <stdexcept>
 
 namespace c10 {
-thread_local bool InferenceMode_enabled = false;
-
 // Invariant:
 //   is_enabled() ==
 //   !c10::impl::tls_is_dispatch_key_included(DispatchKey::ADInplaceOrView);
 // InferenceMode::is_enabled() is in perf critical path (TensorImpl constructor)
 // so it worths a separate TLS to skip the DispatchKeySet check.
 bool InferenceMode::is_enabled() {
-  return InferenceMode_enabled;
-}
-
-void InferenceMode::_set_enabled(bool enabled) {
-  InferenceMode_enabled = enabled;
+  return AutogradState::get_tls_state().get_inference_mode();
 }
 } // namespace c10
diff --git a/c10/core/InferenceMode.h b/c10/core/InferenceMode.h
index 7a9c2c593a453..9748d6eccfb54 100644
--- a/c10/core/InferenceMode.h
+++ b/c10/core/InferenceMode.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <c10/core/AutogradState.h>
 #include <c10/core/GradMode.h>
 #include <c10/core/impl/LocalDispatchKeySet.h>
 #include <c10/macros/Macros.h>
@@ -50,10 +51,12 @@ struct TORCH_API InferenceMode {
   //    are applicable to InferenceMode as well, e.g.
   //    `tensorTypeInCurrentExecutionContext` in interpreter.cpp.
   InferenceMode(bool enabled = true)
-      : prev_mode(InferenceMode::is_enabled()),
-        prev_keyset(c10::impl::tls_local_dispatch_key_set()),
-        grad_mode(at::AutoGradMode(!enabled)) {
-    _set_enabled(enabled);
+      : prev_mode(AutogradState::get_tls_state()),
+        prev_keyset(c10::impl::tls_local_dispatch_key_set()) {
+    // Enabling inference mode means disabling grad mode
+    // And disabling inference mode means enabling grad mode
+    AutogradState::set_tls_state(
+        AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled));
     DispatchKeySet included = enabled
         ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView)
         : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView);
@@ -67,17 +70,13 @@ struct TORCH_API InferenceMode {
   }
 
   ~InferenceMode() {
-    _set_enabled(prev_mode);
+    AutogradState::set_tls_state(prev_mode);
     c10::impl::_force_tls_local_dispatch_key_set(prev_keyset);
   }
   static bool is_enabled();
-  // _set_enabled() is not user facing and should be only used in
-  // ThreadLocalState.cpp.
-  static void _set_enabled(bool enabled);
 
  private:
-  bool prev_mode;
+  AutogradState prev_mode;
   c10::impl::LocalDispatchKeySet prev_keyset;
-  at::AutoGradMode grad_mode;
 };
 } // namespace c10

From 9914fb66155ac2dc674f62e2705f78f837d00c3f Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 24 Aug 2021 06:58:05 -0700
Subject: [PATCH 163/530] ENH Adds no_batch_dim tests/docs for LPPool1d and
 Identity (#62190)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/60585

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62190

Reviewed By: ejguan

Differential Revision: D29942385

Pulled By: jbschlosser

fbshipit-source-id: 00df6f6f01ad039631bb8679f8de94863aac7650
---
 torch/nn/modules/linear.py           | 4 ++++
 torch/nn/modules/pooling.py          | 4 ++--
 torch/testing/_internal/common_nn.py | 8 ++++++++
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/torch/nn/modules/linear.py b/torch/nn/modules/linear.py
index 07fe1063283fc..21425f2be2aad 100644
--- a/torch/nn/modules/linear.py
+++ b/torch/nn/modules/linear.py
@@ -16,6 +16,10 @@ class Identity(Module):
         args: any argument (unused)
         kwargs: any keyword argument (unused)
 
+    Shape:
+        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
+        - Output: :math:`(*)`, same shape as the input.
+
     Examples::
 
         >>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False)
diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py
index d09e257452e44..3665e893fa5ec 100644
--- a/torch/nn/modules/pooling.py
+++ b/torch/nn/modules/pooling.py
@@ -882,8 +882,8 @@ class LPPool1d(_LPPoolNd):
         ceil_mode: when True, will use `ceil` instead of `floor` to compute the output shape
 
     Shape:
-        - Input: :math:`(N, C, L_{in})`
-        - Output: :math:`(N, C, L_{out})`, where
+        - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`.
+        - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where
 
           .. math::
               L_{out} = \left\lfloor\frac{L_{in} - \text{kernel\_size}}{\text{stride}} + 1\right\rfloor
diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py
index aeaf6616e28b1..6b1bcf66f8bd5 100644
--- a/torch/testing/_internal/common_nn.py
+++ b/torch/testing/_internal/common_nn.py
@@ -2246,6 +2246,14 @@ def single_batch_reference_fn(input, parameters, module):
         cpp_constructor_args='torch::nn::LPPool1dOptions(2, 2).stride(3)',
         input_size=(1, 3, 7),
     ),
+    dict(
+        module_name='LPPool1d',
+        constructor_args=(2, 2, 3),
+        cpp_constructor_args='torch::nn::LPPool1dOptions(2, 2).stride(3)',
+        input_size=(3, 7),
+        reference_fn=single_batch_reference_fn,
+        desc='no_batch_dim',
+    ),
     dict(
         module_name='LocalResponseNorm',
         constructor_args=(3, ),

From 688f06cac353149d92dfda4793a3fb003b4c0e5a Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Tue, 24 Aug 2021 07:20:56 -0700
Subject: [PATCH 164/530] Revert D30388099: Add a common autograd TLS state

Test Plan: revert-hammer

Differential Revision:
D30388099 (https://github.com/pytorch/pytorch/commit/83d9bad44a1e1e6202103cd22e4dbd2bd3d7dae0)

Original commit changeset: 8e03f940150f

fbshipit-source-id: f6d60fec66e8292f5268335bb8a3e7e1a662f23b
---
 aten/src/ATen/ThreadLocalState.cpp | 22 ++++++-----------
 aten/src/ATen/ThreadLocalState.h   |  5 +++-
 c10/core/AutogradState.cpp         | 19 ---------------
 c10/core/AutogradState.h           | 39 ------------------------------
 c10/core/GradMode.cpp              |  7 +++---
 c10/core/InferenceMode.cpp         |  8 +++++-
 c10/core/InferenceMode.h           | 19 ++++++++-------
 7 files changed, 32 insertions(+), 87 deletions(-)
 delete mode 100644 c10/core/AutogradState.cpp
 delete mode 100644 c10/core/AutogradState.h

diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp
index fc4b8fa9c27ec..ba7be1a06b8a1 100644
--- a/aten/src/ATen/ThreadLocalState.cpp
+++ b/aten/src/ATen/ThreadLocalState.cpp
@@ -12,12 +12,15 @@ namespace at {
 ThreadLocalState::ThreadLocalState(bool keep_grad_mode)
     : dispatch_key_(c10::impl::tls_local_dispatch_key_set()),
       debug_info_(c10::ThreadLocalDebugInfo::current()),
-      autograd_tls_(c10::AutogradState::get_tls_state()) {
+      inference_mode_enabled_(c10::InferenceMode::is_enabled()) {
   rf_tls_ = at::get_record_function_tls_();
   saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks();
 
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   keep_grad_mode_ = keep_grad_mode;
+  if (keep_grad_mode_) {
+    grad_mode_enabled_ = GradMode::is_enabled();
+  }
 #endif
   bumped_record_all_functions_ = at::checkRecordAllFunctions();
 }
@@ -25,23 +28,10 @@ ThreadLocalState::ThreadLocalState(bool keep_grad_mode)
 /* static */
 void ThreadLocalState::setThreadLocalState(
     const ThreadLocalState& state) {
-  // Note that setting the InferenceMode TLS in this function is ONLY ok because we always
-  // restore the dispatch key set TLS at the same time.
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   if (state.keep_grad_mode_) {
-    c10::AutogradState::set_tls_state(state.autograd_tls_);
-  } else {
-    auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(),
-                                        /* inference_mode */ state.autograd_tls_.get_inference_mode());
-    c10::AutogradState::set_tls_state(new_state);
+    GradMode::set_enabled(state.grad_mode_enabled_);
   }
-#else
-  // The mobile build explicitly ignore grad_mode but fails if we propagate
-  // its value across threads or set it to a fixed value.
-  // So we have to make sure the grad_mode value is not changed here.
-  auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(),
-                                      /* inference_mode */ state.autograd_tls_.get_inference_mode());
-  c10::AutogradState::set_tls_state(new_state);
 #endif
 
   at::set_record_function_tls_(state.rf_tls_);
@@ -53,6 +43,8 @@ void ThreadLocalState::setThreadLocalState(
   c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_);
 
   c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_);
+
+  c10::InferenceMode::_set_enabled(state.inference_mode_enabled_);
 }
 
 } // namespace at
diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h
index 4942399cbd6d7..f30f5e3442cc1 100644
--- a/aten/src/ATen/ThreadLocalState.h
+++ b/aten/src/ATen/ThreadLocalState.h
@@ -35,11 +35,14 @@ class TORCH_API ThreadLocalState {
   // RecordFunction TLS
   RecordFunctionTLS rf_tls_;
 
-  AutogradState autograd_tls_;
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   bool keep_grad_mode_ = true;
+  bool grad_mode_enabled_;
 #endif
 
+  // TLS for InferenceMode
+  bool inference_mode_enabled_;
+
   // TLS for saved tensors default hooks
   std::pair<PyObject*, PyObject*> saved_tensors_default_hooks_;
 
diff --git a/c10/core/AutogradState.cpp b/c10/core/AutogradState.cpp
deleted file mode 100644
index 9684a76b78564..0000000000000
--- a/c10/core/AutogradState.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#include <c10/core/AutogradState.h>
-
-namespace c10 {
-
-namespace {
-// By default, grad mode is enabled and inference mode is disabled
-thread_local AutogradState autograd_state_tls =
-    AutogradState(/* grad_mode */ true, /* inference_mode */ false);
-} // namespace
-
-AutogradState& AutogradState::get_tls_state() {
-  return autograd_state_tls;
-}
-
-void AutogradState::set_tls_state(AutogradState state) {
-  autograd_state_tls = state;
-}
-
-} // namespace c10
diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h
deleted file mode 100644
index 83ea3607cd2af..0000000000000
--- a/c10/core/AutogradState.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#include <c10/macros/Macros.h>
-
-#include <cstdint>
-
-namespace c10 {
-
-// Structure used to pack all the thread local boolean
-// flags used by autograd
-struct TORCH_API AutogradState {
-  static AutogradState& get_tls_state();
-  static void set_tls_state(AutogradState state);
-
-  AutogradState(bool grad_mode, bool inference_mode)
-      : grad_mode_(grad_mode), inference_mode_(inference_mode) {}
-
-  void set_grad_mode(bool enabled) {
-    grad_mode_ = enabled;
-  }
-
-  void set_inference_mode(bool enabled) {
-    inference_mode_ = enabled;
-  }
-
-  bool get_grad_mode() const {
-    return grad_mode_;
-  }
-
-  bool get_inference_mode() const {
-    return inference_mode_;
-  }
-
- private:
-  bool grad_mode_ : 1;
-  bool inference_mode_ : 1;
-};
-
-} // namespace c10
diff --git a/c10/core/GradMode.cpp b/c10/core/GradMode.cpp
index a5db198083b2b..32747a6698afa 100644
--- a/c10/core/GradMode.cpp
+++ b/c10/core/GradMode.cpp
@@ -1,15 +1,16 @@
-#include <c10/core/AutogradState.h>
 #include <c10/core/GradMode.h>
 
 #include <stdexcept>
 
 namespace c10 {
 
+thread_local bool GradMode_enabled = true;
+
 bool GradMode::is_enabled() {
-  return AutogradState::get_tls_state().get_grad_mode();
+  return GradMode_enabled;
 }
 
 void GradMode::set_enabled(bool enabled) {
-  AutogradState::get_tls_state().set_grad_mode(enabled);
+  GradMode_enabled = enabled;
 }
 } // namespace c10
diff --git a/c10/core/InferenceMode.cpp b/c10/core/InferenceMode.cpp
index 59eca760cf504..b588ab4da54b5 100644
--- a/c10/core/InferenceMode.cpp
+++ b/c10/core/InferenceMode.cpp
@@ -2,12 +2,18 @@
 #include <stdexcept>
 
 namespace c10 {
+thread_local bool InferenceMode_enabled = false;
+
 // Invariant:
 //   is_enabled() ==
 //   !c10::impl::tls_is_dispatch_key_included(DispatchKey::ADInplaceOrView);
 // InferenceMode::is_enabled() is in perf critical path (TensorImpl constructor)
 // so it worths a separate TLS to skip the DispatchKeySet check.
 bool InferenceMode::is_enabled() {
-  return AutogradState::get_tls_state().get_inference_mode();
+  return InferenceMode_enabled;
+}
+
+void InferenceMode::_set_enabled(bool enabled) {
+  InferenceMode_enabled = enabled;
 }
 } // namespace c10
diff --git a/c10/core/InferenceMode.h b/c10/core/InferenceMode.h
index 9748d6eccfb54..7a9c2c593a453 100644
--- a/c10/core/InferenceMode.h
+++ b/c10/core/InferenceMode.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <c10/core/AutogradState.h>
 #include <c10/core/GradMode.h>
 #include <c10/core/impl/LocalDispatchKeySet.h>
 #include <c10/macros/Macros.h>
@@ -51,12 +50,10 @@ struct TORCH_API InferenceMode {
   //    are applicable to InferenceMode as well, e.g.
   //    `tensorTypeInCurrentExecutionContext` in interpreter.cpp.
   InferenceMode(bool enabled = true)
-      : prev_mode(AutogradState::get_tls_state()),
-        prev_keyset(c10::impl::tls_local_dispatch_key_set()) {
-    // Enabling inference mode means disabling grad mode
-    // And disabling inference mode means enabling grad mode
-    AutogradState::set_tls_state(
-        AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled));
+      : prev_mode(InferenceMode::is_enabled()),
+        prev_keyset(c10::impl::tls_local_dispatch_key_set()),
+        grad_mode(at::AutoGradMode(!enabled)) {
+    _set_enabled(enabled);
     DispatchKeySet included = enabled
         ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView)
         : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView);
@@ -70,13 +67,17 @@ struct TORCH_API InferenceMode {
   }
 
   ~InferenceMode() {
-    AutogradState::set_tls_state(prev_mode);
+    _set_enabled(prev_mode);
     c10::impl::_force_tls_local_dispatch_key_set(prev_keyset);
   }
   static bool is_enabled();
+  // _set_enabled() is not user facing and should be only used in
+  // ThreadLocalState.cpp.
+  static void _set_enabled(bool enabled);
 
  private:
-  AutogradState prev_mode;
+  bool prev_mode;
   c10::impl::LocalDispatchKeySet prev_keyset;
+  at::AutoGradMode grad_mode;
 };
 } // namespace c10

From f4aff3a346a0525e37d6071f318f7a4c54d5e1fb Mon Sep 17 00:00:00 2001
From: "Rong Rong (AI Infra)" <rongr@fb.com>
Date: Tue, 24 Aug 2021 08:01:36 -0700
Subject: [PATCH 165/530] [BE] add distributed run_test options (#63147)

Summary:
Currently distributed tests are mixed within test_python.
We would like to split the distributed tests into its own batch thus we need to split them out.

Adding an option to include/exclude distributed tests with CUSTOM_HANDLERS.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63147

Test Plan:
- locally run with the addition run_test.py options.
- CI

Dependency: found a bug in mpiexec test and need https://github.com/pytorch/pytorch/issues/63580 to fix it first.

Reviewed By: bdhirsh

Differential Revision: D30496178

Pulled By: walterddr

fbshipit-source-id: 7903a57b619f2425028028f944211938823918a6
---
 .jenkins/pytorch/test.sh | 11 ++++++++---
 test/run_test.py         | 34 +++++++++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 4fce9ab00dcc9..e27ba3e0cd838 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -158,17 +158,17 @@ test_python_legacy_jit() {
 }
 
 test_python_shard1() {
-  time python test/run_test.py --exclude-jit-executor --shard 1 2 --verbose --determine-from="$DETERMINE_FROM"
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard 1 2 --verbose --determine-from="$DETERMINE_FROM"
   assert_git_not_dirty
 }
 
 test_python_shard2() {
-  time python test/run_test.py --exclude-jit-executor --shard 2 2 --verbose --determine-from="$DETERMINE_FROM"
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard 2 2 --verbose --determine-from="$DETERMINE_FROM"
   assert_git_not_dirty
 }
 
 test_python() {
-  time python test/run_test.py --exclude-jit-executor --verbose --determine-from="$DETERMINE_FROM"
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --verbose --determine-from="$DETERMINE_FROM"
   assert_git_not_dirty
 }
 
@@ -297,6 +297,10 @@ test_vulkan() {
 }
 
 test_distributed() {
+  echo "Testing distributed python tests"
+  time python test/run_test.py --distributed-tests --verbose --determine-from="$DETERMINE_FROM"
+  assert_git_not_dirty
+
   if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
     echo "Testing distributed C++ tests"
     ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
@@ -505,6 +509,7 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || "$
   test_without_numpy
   install_torchvision
   test_python_shard1
+  test_distributed
   test_aten
 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || "${SHARD_NUMBER}" == 2 ]]; then
   install_torchvision
diff --git a/test/run_test.py b/test/run_test.py
index ad3cbb90b6d30..f3b7bf72bbbba 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -394,6 +394,11 @@
     'test_jit_fuser_legacy',
 ]
 
+DISTRIBUTED_TESTS = [
+    'distributed/test_distributed_fork',
+    'distributed/test_distributed_spawn',
+]
+
 # Dictionary matching test modules (in TESTS) to lists of test cases (within that test_module) that would be run when
 # options.run_specified_test_cases is enabled.
 # For example:
@@ -640,6 +645,11 @@ def parse_args():
         '--jit',
         action='store_true',
         help='run all jit tests')
+    parser.add_argument(
+        '--distributed-tests',
+        '--distributed-tests',
+        action='store_true',
+        help='run all distributed tests')
     parser.add_argument(
         '-pt', '--pytest', action='store_true',
         help='If true, use `pytest` to execute the tests. E.g., this runs '
@@ -723,6 +733,11 @@ def parse_args():
         action='store_true',
         help='exclude tests that are run for a specific jit config'
     )
+    parser.add_argument(
+        '--exclude-distributed-tests',
+        action='store_true',
+        help='exclude distributed tests'
+    )
     parser.add_argument(
         '--run-specified-test-cases',
         nargs='?',
@@ -800,6 +815,7 @@ def exclude_tests(exclude_list, selected_tests, exclude_message=None):
 
 
 def get_selected_tests(options):
+    # First make sure run specific test cases options are processed.
     if options.run_specified_test_cases:
         if options.use_specified_test_cases_by == 'include':
             options.include = list(SPECIFIED_TEST_CASES_DICT.keys())
@@ -808,6 +824,16 @@ def get_selected_tests(options):
 
     selected_tests = options.include
 
+    # filter if there's JIT only and distributed only test options
+    if options.jit:
+        selected_tests = list(
+            filter(lambda test_name: "jit" in test_name, selected_tests))
+
+    if options.distributed_tests:
+        selected_tests = list(
+            filter(lambda test_name: test_name in DISTRIBUTED_TESTS, selected_tests))
+
+    # process reordering
     if options.bring_to_front:
         to_front = set(options.bring_to_front)
         selected_tests = options.bring_to_front + list(filter(lambda name: name not in to_front,
@@ -821,9 +847,13 @@ def get_selected_tests(options):
         last_index = find_test_index(options.last, selected_tests, find_last_index=True)
         selected_tests = selected_tests[:last_index + 1]
 
+    # process exclusion
     if options.exclude_jit_executor:
         options.exclude.extend(JIT_EXECUTOR_TESTS)
 
+    if options.exclude_distributed_tests:
+        options.exclude.extend(DISTRIBUTED_TESTS)
+
     selected_tests = exclude_tests(options.exclude, selected_tests)
 
     if sys.platform == 'win32' and not options.ignore_win_blocklist:
@@ -840,6 +870,7 @@ def get_selected_tests(options):
     elif TEST_WITH_ROCM:
         selected_tests = exclude_tests(ROCM_BLOCKLIST, selected_tests, 'on ROCm')
 
+    # sharding
     if options.shard:
         assert len(options.shard) == 2, "Unexpected shard format"
         assert min(options.shard) > 0, "Shards must be positive numbers"
@@ -1030,9 +1061,6 @@ def main():
     if options.coverage and not PYTORCH_COLLECT_COVERAGE:
         shell(['coverage', 'erase'])
 
-    if options.jit:
-        selected_tests = filter(lambda test_name: "jit" in test_name, TESTS)
-
     if options.determine_from is not None and os.path.exists(options.determine_from):
         slow_tests = get_slow_tests_based_on_S3(TESTS, TARGET_DET_LIST, SLOW_TEST_THRESHOLD)
         print('Added the following tests to target_det tests as calculated based on S3:')

From 1385f9fb12e6607c98d2d9d5edaaaab2bc07386f Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Tue, 24 Aug 2021 08:19:38 -0700
Subject: [PATCH 166/530] [JIT] Add variadic stack op (#63578)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63578

Added a new op `prim::VarStack` and a pass that transforms instances of `aten::stack(list, dim)` into `prim::VarStack(list[0], ..., list[n], dim)`. Also provided a JIT interpreter implementation.

Most of the implementation/tests are the same as `prim::VarConcat`.

Test Plan: `buck test caffe2/test/cpp/jit:jit -- TestStackOpt`

Reviewed By: navahgar

Differential Revision: D30426232

fbshipit-source-id: 9829a7db6e0a5038c9b7528c43c25b0c221aa2ce
---
 aten/src/ATen/core/interned_strings.h        |   1 +
 test/cpp/jit/CMakeLists.txt                  |   1 +
 test/cpp/jit/test_stack_opt.cpp              | 308 +++++++++++++++++++
 torch/csrc/jit/passes/variadic_ops.cpp       |   9 +
 torch/csrc/jit/passes/variadic_ops.h         |   7 +
 torch/csrc/jit/runtime/register_prim_ops.cpp |  12 +
 torch/csrc/jit/runtime/static/ops.cpp        |   2 +-
 7 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 test/cpp/jit/test_stack_opt.cpp

diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h
index c1dbc75a26b99..69e5f97f7127a 100644
--- a/aten/src/ATen/core/interned_strings.h
+++ b/aten/src/ATen/core/interned_strings.h
@@ -84,6 +84,7 @@ namespace c10 {
   _(prim, NumToTensor)               \
   _(prim, Uninitialized)             \
   _(prim, VarConcat)                 \
+  _(prim, VarStack)                  \
   _(prim, With)                      \
   _(prim, Enter)                     \
   _(prim, Exit)                      \
diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt
index 08115433312f5..e766f33a250b2 100644
--- a/test/cpp/jit/CMakeLists.txt
+++ b/test/cpp/jit/CMakeLists.txt
@@ -62,6 +62,7 @@ set(JIT_TEST_SRCS
   ${JIT_TEST_ROOT}/test_qualified_name.cpp
   ${JIT_TEST_ROOT}/test_save_load.cpp
   ${JIT_TEST_ROOT}/test_schema_matching.cpp
+  ${JIT_TEST_ROOT}/test_stack_opt.cpp
   ${JIT_TEST_ROOT}/test_subgraph_matcher.cpp
   ${JIT_TEST_ROOT}/test_subgraph_rewriter.cpp
   ${JIT_TEST_ROOT}/test_subgraph_utils.cpp
diff --git a/test/cpp/jit/test_stack_opt.cpp b/test/cpp/jit/test_stack_opt.cpp
new file mode 100644
index 0000000000000..fea1bb5f81042
--- /dev/null
+++ b/test/cpp/jit/test_stack_opt.cpp
@@ -0,0 +1,308 @@
+#include <gtest/gtest.h>
+
+#include <test/cpp/jit/test_utils.h>
+#include <torch/csrc/jit/ir/irparser.h>
+#include <torch/csrc/jit/passes/variadic_ops.h>
+#include <torch/csrc/jit/runtime/interpreter.h>
+#include <torch/csrc/jit/testing/file_check.h>
+
+namespace torch {
+namespace jit {
+
+TEST(StackOptTest, UseVariadicStack) {
+  auto graph = std::make_shared<Graph>();
+
+  const std::string input =
+      R"IR(
+        graph(%0: Float(56, 56, 56),
+              %1: Float(56, 56, 56),
+              %2: Float(56, 56, 56),
+              %3: Float(56, 56, 56),
+              %4: Float(56, 56, 56),
+              %5: Float(56, 56, 56)):
+          %10 : int = prim::Constant[value=0]()
+          %input : Tensor[] = prim::ListConstruct(%0, %1, %2, %3, %4, %5)
+          %stack : Float(5, 56, 56, 56) = aten::stack(%input, %10)
+          return (%stack)
+      )IR";
+  parseIR(input, graph.get());
+  std::vector<at::Tensor> inputs = {
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU)};
+  auto orig_outputs = runGraph(graph, inputs);
+
+  ASSERT_TRUE(UseVariadicStack(graph));
+  graph->lint();
+  auto opt_outputs = runGraph(graph, inputs);
+
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
+
+  // After replacing `aten::stack` with `prim::VarStack` we should have the
+  // following graph:
+  //
+  //  graph(%0 : ...,
+  //        %1 : ...):
+  //    %zero : int = prim:Constant[value=0]()
+  //    %varstack : Tensor = prim::VarStack(%0, %1, %2, %3, %4, %5, %zero)
+  //    return (%varstack)
+  testing::FileCheck()
+      .check_count("= prim::VarStack(", 1, /*exactly*/ true)
+      ->check_count("= aten::stack(", 0, /*exactly*/ true)
+      ->check_count("= prim::ListConstruct(", 0, /*exactly*/ true)
+      ->run(*graph);
+}
+
+TEST(StackOptTest, UseVariadicStackReplaceMultiple) {
+  auto graph = std::make_shared<Graph>();
+
+  const std::string input =
+      R"IR(
+        graph(%0: Float(56, 56, 56),
+              %1: Float(56, 56, 56),
+              %2: Float(56, 56, 56),
+              %3: Float(56, 56, 56)):
+          %10 : int = prim::Constant[value=0]()
+          %input1 : Tensor[] = prim::ListConstruct(%0, %1)
+          %stack1 : Float(4, 56, 56, 56) = aten::stack(%input1, %10)
+          %input2 : Tensor[] = prim::ListConstruct(%2, %3)
+          %stack2 : Float(4, 56, 56, 56) = aten::stack(%input2, %10)
+          return (%stack1, %stack2)
+      )IR";
+  parseIR(input, graph.get());
+  std::vector<at::Tensor> inputs = {
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU)};
+  auto orig_outputs = runGraph(graph, inputs);
+
+  ASSERT_TRUE(UseVariadicStack(graph));
+  graph->lint();
+  auto opt_outputs = runGraph(graph, inputs);
+
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
+
+  // After full stack optimization we should have the following graph:
+  //
+  //  graph(%0 : ...,
+  //        %1 : ...,
+  //        %2 : ...,
+  //        %3 : ....):
+  //    %zero : int = prim:Constant[value=0]()
+  //    %varcat1 : Tensor = prim::VarStack(%0, %1, %zero)
+  //    %varcat2 : Tensor = prim::VarStack(%2, %3, %zero)
+  //    return (%varcat1, %varcat2)
+  testing::FileCheck()
+      .check_count("= prim::VarStack(", 2, /*exactly*/ true)
+      ->check_count("= aten::stack(", 0, /*exactly*/ true)
+      ->check_count("= prim::ListConstruct(", 0, /*exactly*/ true)
+      ->run(*graph);
+}
+
+TEST(StackOptTest, UseVariadicStackWithMultipleListUses) {
+  auto graph = std::make_shared<Graph>();
+
+  const std::string input =
+      R"IR(
+        graph(%0: Float(56, 56, 56),
+              %1: Float(56, 56, 56)):
+          %2 : int = prim::Constant[value=0]()
+          %input : Tensor[] = prim::ListConstruct(%0, %1)
+          %stack : Float(2, 56, 56, 56) = aten::stack(%input, %2)
+          return (%stack, %input)
+      )IR";
+  parseIR(input, graph.get());
+  std::vector<at::Tensor> inputs = {
+      at::rand({56, 56, 56}, at::kCPU), at::rand({56, 56, 56}, at::kCPU)};
+  auto orig_outputs = runGraph(graph, inputs);
+
+  ASSERT_TRUE(UseVariadicStack(graph));
+  graph->lint();
+  auto opt_outputs = runGraph(graph, inputs);
+
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
+
+  // After replacing `aten::stack` with `prim::VarStack` we should have the
+  // following graph:
+  //
+  //  graph(%0 : ...,
+  //        %1 : ...):
+  //    %zero : int = prim:Constant[value=0]()
+  //    %input : Tensor[] = prim::ListConstruct(%0, %1)
+  //    %varcat : Tensor = prim::VarStack(%0, %1, %zero)
+  //    return (%varcat, %input)
+  testing::FileCheck()
+      .check_count("= prim::ListConstruct(", 1, /*exactly*/ true)
+      ->check_count("= prim::VarStack(", 1, /*exactly*/ true)
+      ->check_count("= aten::stack(", 0, /*exactly*/ true)
+      ->run(*graph);
+}
+
+TEST(StackOptTest, UseVariadicStackWithListMutationAfterCat) {
+  auto graph = std::make_shared<Graph>();
+
+  const std::string input =
+      R"IR(
+        graph(%0: Float(56, 56, 56),
+              %1: Float(56, 56, 56),
+              %2: Float(56, 56, 56)):
+          %10 : int = prim::Constant[value=0]()
+          %input : Tensor[] = prim::ListConstruct(%0, %1)
+          %stack : Float(3, 56, 56, 56) = aten::stack(%input, %10)
+          %11 : Tensor = aten::append(%input, %2)
+          return (%stack, %input)
+      )IR";
+  parseIR(input, graph.get());
+  std::vector<at::Tensor> inputs = {
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU)};
+  auto orig_outputs = runGraph(graph, inputs);
+
+  ASSERT_TRUE(UseVariadicStack(graph));
+  graph->lint();
+  auto opt_outputs = runGraph(graph, inputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
+
+  // The input list to `aten::stack` is mutated only after `aten::stack` op. So,
+  // it should have been replaced with `prim::VarStack`. The transformed graph
+  // should look like the following:
+  //
+  //  graph(%0 : ...,
+  //        %1 : ...,
+  //        %2 : ...):
+  //    %3 : int = prim:Constant[value=0]()
+  //    %4 : Tensor[] = prim::ListConstruct(%0, %1)
+  //    %7 : Tensor = prim::VarStack(%0, %1, %3)
+  //    %6 : Tensor = aten::append(%4, %2)
+  //    return (%7, %4)
+  testing::FileCheck()
+      .check_count("= prim::ListConstruct(", 1, /*exactly*/ true)
+      ->check_count("= prim::VarStack(", 1, /*exactly*/ true)
+      ->check_count("= aten::stack(", 0, /*exactly*/ true)
+      ->run(*graph);
+}
+
+TEST(StackOptTest, UseVariadicStackWithListMutationBeforeCat) {
+  auto graph = std::make_shared<Graph>();
+
+  const std::string input =
+      R"IR(
+        graph(%0: Float(56, 56, 56),
+              %1: Float(56, 56, 56),
+              %2: Float(56, 56, 56)):
+          %10 : int = prim::Constant[value=0]()
+          %input : Tensor[] = prim::ListConstruct(%0, %1)
+          %11 : Tensor = aten::append(%input, %2)
+          %stack : Float(3, 56, 56, 56) = aten::stack(%input, %10)
+          return (%stack)
+      )IR";
+  parseIR(input, graph.get());
+  std::vector<at::Tensor> inputs = {
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU)};
+  auto orig_outputs = runGraph(graph, inputs);
+
+  {
+    ASSERT_FALSE(UseVariadicStack(graph));
+    graph->lint();
+    auto opt_outputs = runGraph(graph, inputs);
+    ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
+
+    // No transformation should have happened since the `prim::ListConstruct` is
+    // mutated before `aten::stack`.
+    testing::FileCheck()
+        .check_count("= prim::ListConstruct(", 1, /*exactly*/ true)
+        ->check_count("= aten::stack(", 1, /*exactly*/ true)
+        ->check_count("= prim::VarStack(", 0, /*exactly*/ true)
+        ->run(*graph);
+  }
+
+  {
+    ASSERT_TRUE(RemoveListMutationAndUseVariadicStack(graph));
+    graph->lint();
+    auto opt_outputs = runGraph(graph, inputs);
+    ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
+
+    // The mutation of the list must be removed and the `aten::stack` op must
+    // be replaced with the `prim::VarStack` op in the graph. The transformed
+    // graph should look like the following:
+    //
+    //  graph(%0 : ...,
+    //        %1 : ...,
+    //        %2 : ...):
+    //    %3 : int = prim:Constant[value=0]()
+    //    %7 : Tensor = prim::VarStack(%0, %1, %2, %3)
+    //    return (%7)
+    testing::FileCheck()
+        .check_count("= prim::VarStack(", 1, /*exactly*/ true)
+        ->check_count("= prim::ListConstruct(", 0, /*exactly*/ true)
+        ->check_count("= aten::stack(", 0, /*exactly*/ true)
+        ->run(*graph);
+  }
+}
+
+TEST(StackOptTest, UseVariadicStackWithMultipleListMutations) {
+  auto graph = std::make_shared<Graph>();
+
+  const std::string input =
+      R"IR(
+        graph(%0: Float(56, 56, 56),
+              %1: Float(56, 56, 56),
+              %2: Float(56, 56, 56),
+              %3: Float(56, 56, 56),
+              %4: Float(56, 56, 56)):
+          %10 : int = prim::Constant[value=0]()
+          %input : Tensor[] = prim::ListConstruct(%0, %1)
+          %stack.1 : Float(5, 56, 56, 56) = aten::stack(%input, %10)
+          %11 : Tensor = aten::append(%input, %2)
+          %stack.2 : Float(5, 56, 56, 56) = aten::stack(%input, %10)
+          %12 : Tensor = aten::append(%input, %3)
+          %stack.3 : Float(5, 56, 56, 56) = aten::stack(%input, %10)
+          %13 : Tensor = aten::append(%input, %4)
+          %stack.4 : Float(5, 56, 56, 56) = aten::stack(%input, %10)
+          return (%stack.1, %stack.2, %stack.3, %stack.4)
+      )IR";
+  parseIR(input, graph.get());
+  std::vector<at::Tensor> inputs = {
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU),
+      at::rand({56, 56, 56}, at::kCPU)};
+  auto orig_outputs = runGraph(graph, inputs);
+
+  ASSERT_TRUE(RemoveListMutationAndUseVariadicStack(graph));
+  graph->lint();
+  auto opt_outputs = runGraph(graph, inputs);
+  ASSERT_TRUE(exactlyEqual(orig_outputs, opt_outputs));
+
+  // All the mutations of the list must be removed and the `aten::stack` ops
+  // must be replaced with `prim::VarStack` ops in the graph. The transformed
+  // graph should look like the following:
+  //
+  //  graph(%0 : ...,
+  //        %1 : ...,
+  //        %2 : ...,
+  //        %3 : ...,
+  //        %4 : ...):
+  //    %10 : int = prim:Constant[value=0]()
+  //    %5 : Tensor = prim::VarStack(%0, %1, %10)
+  //    %6 : Tensor = prim::VarStack(%0, %1, %2, %10)
+  //    %7 : Tensor = prim::VarStack(%0, %1, %2, %3, %10)
+  //    %8 : Tensor = prim::VarStack(%0, %1, %2, %3, %4, %10)
+  //    return (%5, %6, %7, %8)
+  testing::FileCheck()
+      .check_count("= prim::VarStack(", 4, /*exactly*/ true)
+      ->check_count("= prim::ListConstruct(", 0, /*exactly*/ true)
+      ->check_count("= aten::stack(", 0, /*exactly*/ true)
+      ->run(*graph);
+}
+
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/passes/variadic_ops.cpp b/torch/csrc/jit/passes/variadic_ops.cpp
index aeb70747b3a57..6f4d23cec7b66 100644
--- a/torch/csrc/jit/passes/variadic_ops.cpp
+++ b/torch/csrc/jit/passes/variadic_ops.cpp
@@ -122,5 +122,14 @@ bool RemoveListMutationAndUseVariadicCat(const std::shared_ptr<Graph>& graph) {
   return RemoveListMutationAndUseVariadicOp(graph, aten::cat, prim::VarConcat);
 }
 
+bool UseVariadicStack(const std::shared_ptr<Graph>& graph) {
+  return UseVariadicOp(graph, aten::stack, prim::VarStack);
+}
+
+bool RemoveListMutationAndUseVariadicStack(
+    const std::shared_ptr<Graph>& graph) {
+  return RemoveListMutationAndUseVariadicOp(graph, aten::stack, prim::VarStack);
+}
+
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/passes/variadic_ops.h b/torch/csrc/jit/passes/variadic_ops.h
index 1c52e9513ae2b..20cc6648dddb4 100644
--- a/torch/csrc/jit/passes/variadic_ops.h
+++ b/torch/csrc/jit/passes/variadic_ops.h
@@ -12,5 +12,12 @@ TORCH_API bool UseVariadicCat(const std::shared_ptr<Graph>& graph);
 TORCH_API bool RemoveListMutationAndUseVariadicCat(
     const std::shared_ptr<Graph>& graph);
 
+// Replaces the `aten::stack` ops in the given graph with variadic cat ops.
+// Returns true if the graph is modified.
+TORCH_API bool UseVariadicStack(const std::shared_ptr<Graph>& graph);
+
+TORCH_API bool RemoveListMutationAndUseVariadicStack(
+    const std::shared_ptr<Graph>& graph);
+
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp
index 984073fbf72c1..60458a0ae11e5 100644
--- a/torch/csrc/jit/runtime/register_prim_ops.cpp
+++ b/torch/csrc/jit/runtime/register_prim_ops.cpp
@@ -776,6 +776,18 @@ RegisterOperators reg(
            push(stack, at::cat(inputs, dim));
          },
          aliasAnalysisFromSchema()),
+     OperatorGenerator(
+         TORCH_SELECTIVE_SCHEMA("prim::VarStack(...) -> Tensor"),
+         [](Stack* stack) {
+           auto num_inputs = pop(stack).toInt();
+           auto dim = pop(stack).toInt();
+           std::vector<at::Tensor> inputs(num_inputs - 1);
+           for (int i = 0; i < num_inputs - 1; ++i) {
+             inputs[num_inputs - 2 - i] = pop(stack).toTensor();
+           }
+           push(stack, at::stack(inputs, dim));
+         },
+         aliasAnalysisFromSchema()),
      OperatorGenerator(
          TORCH_SELECTIVE_SCHEMA(
              "aten::eq.enum(AnyEnumType a, AnyEnumType b) -> bool"),
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 5ef770c6755fc..62d86126a0577 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -214,7 +214,7 @@ std::function<void(ProcessedNode*)> getOutOfPlaceOperation(Node* n) {
 
 // Returns true if the node represents an op with variadic arguments.
 bool hasVarArgs(Node* n) {
-  if (n->kind() == prim::VarConcat) {
+  if (n->kind() == prim::VarConcat || n->kind() == prim::VarStack) {
     return true;
   }
   return false;

From d3be02d1004aa84cb2e001a2d72a04356d4d4cd3 Mon Sep 17 00:00:00 2001
From: mingfeima <mingfei.ma@intel.com>
Date: Tue, 24 Aug 2021 08:22:47 -0700
Subject: [PATCH 167/530] fix batchnorm2d issue when input is non contiguous
 (#63392)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63392

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30476317

Pulled By: VitalyFedyunin

fbshipit-source-id: 03055a0aec21cf2c029b6f32315da2b09cb722d0
---
 aten/src/ATen/native/Normalization.cpp        | 14 ++-
 .../src/ATen/native/cpu/batch_norm_kernel.cpp | 95 ++++++++-----------
 test/test_nn.py                               | 19 ++++
 3 files changed, 71 insertions(+), 57 deletions(-)

diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp
index 611faf010abaf..25ae1a765e85f 100644
--- a/aten/src/ATen/native/Normalization.cpp
+++ b/aten/src/ATen/native/Normalization.cpp
@@ -74,6 +74,13 @@ static inline bool is_contiguous(const Tensor& t) {
   return t.is_contiguous() || t.is_contiguous(at::MemoryFormat::ChannelsLast);
 }
 
+// For some ambiguous cases, it is possible a channels last contiguous Tensor has
+//   `suggest_memory_format` of Contiguous.
+// See https://github.com/pytorch/pytorch/issues/63224 for details.
+static inline MemoryFormat suggest_memory_format_contig(const Tensor& t) {
+  return t.is_contiguous() ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+}
+
 template<typename scalar_t>
 std::tuple<Tensor,Tensor,Tensor> batch_norm_cpu_transform_input_template(
     const Tensor& input, const Tensor& weight, const Tensor& bias,
@@ -87,10 +94,9 @@ std::tuple<Tensor,Tensor,Tensor> batch_norm_cpu_transform_input_template(
       && running_mean.is_contiguous()
       && running_var.is_contiguous();
 
-  Tensor output = at::empty_like(input, input.suggest_memory_format());
-
   // inference contiguous path
   if (all_contiguous) {
+    Tensor output = at::empty_like(input, suggest_memory_format_contig(input));
     batch_norm_cpu_stub(kCPU, output, input, weight, bias,
         save_mean, save_invstd, running_mean, running_var, train, eps);
     return std::make_tuple(output, save_mean, save_invstd);
@@ -120,6 +126,7 @@ std::tuple<Tensor,Tensor,Tensor> batch_norm_cpu_transform_input_template(
   auto b = bias.defined() ? as_nd(bias) :
       at::detail::scalar_tensor_static(0, input.scalar_type(), kCPU);
 
+  Tensor output = at::empty_like(input, input.suggest_memory_format());
   auto iter = TensorIteratorConfig()
     .add_output(output)
     .add_input(input)
@@ -250,6 +257,9 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
       && input.suggest_memory_format() == grad_out_.suggest_memory_format();
 
   if (all_contiguous) {
+    if (grad_input_mask[0]) {
+      grad_input = at::empty_like(input, suggest_memory_format_contig(input));
+    }
     batch_norm_cpu_backward_stub(kCPU, grad_input, grad_weight, grad_bias,
         grad_out_, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps);
     return std::make_tuple(grad_input, grad_weight, grad_bias);
diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp
index 2d1275538d89f..75037606d3ff4 100644
--- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp
+++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp
@@ -611,48 +611,38 @@ void batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad
 void batch_norm_cpu_kernel(Tensor& output, const Tensor& input,
     const Tensor& weight, const Tensor& bias, const Tensor& save_mean,  const Tensor& save_invstd,
     const Tensor& running_mean, const Tensor& running_var, bool train, double eps) {
-  switch (input.suggest_memory_format()) {
-    case at::MemoryFormat::Contiguous: {
-      AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_contiguous", [&] {
-        batch_norm_cpu_contiguous_impl<scalar_t>(output, input, weight, bias,
-            save_mean, save_invstd, running_mean, running_var, train, eps);
-      });
-      break;
-    }
-    case at::MemoryFormat::ChannelsLast: {
-      AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_channels_last", [&] {
-        batch_norm_cpu_channels_last_impl<scalar_t>(output, input, weight, bias,
-            save_mean, save_invstd, running_mean, running_var, train, eps);
-      });
-      break;
-    }
-    default:
-      TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
+  if (input.is_contiguous()) {
+    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_contiguous", [&] {
+      batch_norm_cpu_contiguous_impl<scalar_t>(output, input, weight, bias,
+          save_mean, save_invstd, running_mean, running_var, train, eps);
+    });
+  } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) {
+    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_channels_last", [&] {
+      batch_norm_cpu_channels_last_impl<scalar_t>(output, input, weight, bias,
+          save_mean, save_invstd, running_mean, running_var, train, eps);
+    });
+  } else {
+    TORCH_CHECK(false, "batch_norm_cpu_kernel: expecting input to be contiguous.");
   }
 }
 
 void batch_norm_cpu_collect_stats_kernel(
     Tensor& mean, Tensor& var_sum, const Tensor& input) {
   int64_t image_size = input.numel() / input.size(0) / input.size(1);
-  switch (input.suggest_memory_format()) {
-    case at::MemoryFormat::Contiguous: {
-      AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_contiguous", [&] {
-        if (image_size == 1) { // NC11 is also channels last
-          batch_norm_cpu_collect_stats_channels_last_impl<scalar_t>(mean, var_sum, input);
-        } else {
-          batch_norm_cpu_collect_stats_contiguous_impl<scalar_t>(mean, var_sum, input);
-        }
-      });
-      break;
-    }
-    case at::MemoryFormat::ChannelsLast: {
-      AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_channels_last", [&] {
+  if (input.is_contiguous()) {
+    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_contiguous", [&] {
+      if (image_size == 1) { // NC11 is also channels last
         batch_norm_cpu_collect_stats_channels_last_impl<scalar_t>(mean, var_sum, input);
-      });
-      break;
-    }
-    default:
-      TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
+      } else {
+        batch_norm_cpu_collect_stats_contiguous_impl<scalar_t>(mean, var_sum, input);
+      }
+    });
+  } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) {
+    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_channels_last", [&] {
+      batch_norm_cpu_collect_stats_channels_last_impl<scalar_t>(mean, var_sum, input);
+    });
+  } else {
+    TORCH_CHECK(false, "batch_norm_cpu_collect_stats_kernel: expecting input to be contiguous.");
   }
 }
 
@@ -661,28 +651,23 @@ void batch_norm_cpu_backward_kernel(Tensor& grad_input, Tensor& grad_weight, Ten
     const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd,
     bool train, double eps) {
   int64_t image_size = input.numel() / input.size(0) / input.size(1);
-  switch (input.suggest_memory_format()) {
-    case at::MemoryFormat::Contiguous: {
-      AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_contiguous", [&] {
-        if (image_size == 1) { // NC11 is also channels last
-          batch_norm_cpu_backward_channels_last_impl<scalar_t>(grad_input, grad_weight, grad_bias,
-              grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps);
-        } else {
-          batch_norm_cpu_backward_contiguous_impl<scalar_t>(grad_input, grad_weight, grad_bias,
-              grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps);
-        }
-      });
-      break;
-    }
-    case at::MemoryFormat::ChannelsLast: {
-      AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_channels_last", [&] {
+  if (input.is_contiguous()) {
+    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_contiguous", [&] {
+      if (image_size == 1) { // NC11 is also channels last
         batch_norm_cpu_backward_channels_last_impl<scalar_t>(grad_input, grad_weight, grad_bias,
             grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps);
-      });
-      break;
-    }
-    default:
-      TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
+      } else {
+        batch_norm_cpu_backward_contiguous_impl<scalar_t>(grad_input, grad_weight, grad_bias,
+            grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps);
+      }
+    });
+  } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) {
+    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_channels_last", [&] {
+      batch_norm_cpu_backward_channels_last_impl<scalar_t>(grad_input, grad_weight, grad_bias,
+          grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps);
+    });
+  } else {
+    TORCH_CHECK(false, "batch_norm_cpu_backward_kernel: expecting input to be contiguous.");
   }
 }
 
diff --git a/test/test_nn.py b/test/test_nn.py
index f4691e6a5fa03..07a2b48cc6a20 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -8929,6 +8929,25 @@ def helper(self, size):
         helper(self, (4, 1, 9, 9))
         helper(self, (4, 9, 1, 1))
 
+    def test_batchnorm_non_contig_cpu(self):
+        input = torch.arange(6, dtype=torch.float).reshape(1, 3, 2, 1).cpu()
+        input = input.permute(0, 2, 1, 3)
+
+        bn = torch.nn.BatchNorm2d(2).cpu().float().eval()
+        bn.weight.data.uniform_()
+        bn.bias.data.uniform_()
+
+        ref_input = input.detach().clone().contiguous()
+        ref_bn = nn.BatchNorm2d(2).cpu().float().eval()
+        ref_bn.load_state_dict(bn.state_dict())
+
+        out = bn(input)
+        ref_out = ref_bn(ref_input)
+
+        self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
+        self.assertTrue(ref_out.is_contiguous())
+        self.assertEqual(out, ref_out)
+
     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
     @skipIfRocm

From 2ca2761f3c448e58d587b440d459d328ae8efc95 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 24 Aug 2021 08:26:21 -0700
Subject: [PATCH 168/530] ENH Adds no_batch_dim for NLLLoss (#62651)

Summary:
Towards https://github.com/pytorch/pytorch/issues/60585

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62651

Reviewed By: VitalyFedyunin

Differential Revision: D30303340

Pulled By: jbschlosser

fbshipit-source-id: 7ab478cf63bf6cd1f850cad5fd101e74a2cfe3f5
---
 aten/src/ATen/native/LossNLL.cpp     | 32 +++++++++++++++++-----------
 aten/src/ATen/native/cuda/Loss.cu    |  1 -
 torch/nn/modules/loss.py             |  5 +++--
 torch/testing/_internal/common_nn.py | 11 ++++++++--
 4 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp
index 7c306c2bb863c..c7c65f7b8cc22 100644
--- a/aten/src/ATen/native/LossNLL.cpp
+++ b/aten/src/ATen/native/LossNLL.cpp
@@ -22,10 +22,12 @@ TORCH_META_FUNC(nll_loss_forward)
   TORCH_CHECK(
       self.dim() > 0 && self.dim() <= 2, "input tensor should be 1D or 2D");
   TORCH_CHECK(
-      target.dim() == 1,
-      "1D target tensor expected, multi-target not supported");
+      target.dim() <= 1,
+      "0D or 1D target tensor expected, multi-target not supported");
+
+  auto no_batch_dim = self.dim() == 1  && target.dim() == 0;
   TORCH_CHECK(
-      self.size(0) == target.size(0),
+      no_batch_dim || (self.size(0) == target.size(0)),
       "size mismatch (got input: ",
       self.sizes(),
       ", target: ",
@@ -66,10 +68,12 @@ TORCH_META_FUNC(nll_loss_backward)
   TORCH_CHECK(
       self.dim() > 0 && self.dim() <= 2, "input tensor should be 1D or 2D");
   TORCH_CHECK(
-      target.dim() == 1,
-      "1D target tensor expected, multi-target not supported");
+      target.dim() <= 1,
+      "0D or 1D target tensor expected, multi-target not supported");
+
+  auto no_batch_dim = self.dim() == 1  && target.dim() == 0;
   TORCH_CHECK(
-      self.size(0) == target.size(0),
+      no_batch_dim || (self.size(0) == target.size(0)),
       "size mismatch (got input: ",
       self.sizes(),
       ", target: ",
@@ -181,7 +185,6 @@ static void nll_loss_out_frame(
   const int64_t ndim = input.dim();
   TORCH_CHECK(ndim <= 2);
   const int64_t batch_size = ndim == 1 ? 1 : input.size(0);
-  TORCH_CHECK(target.size(0) == batch_size);
 
   constexpr int64_t cascade_sum_num_levels = 8;
   const int64_t level_power =
@@ -298,7 +301,11 @@ static void nll_loss_backward_out_frame(
   const auto n_dims = input.dim();
   const auto n_classes = input.size(-1);
 
-  auto target_acc = target.accessor<target_t, 1>();
+  auto target_ = target;
+  if (target.dim() == 0) {
+    target_ = target.unsqueeze(0);
+  }
+  auto target_acc = target_.accessor<target_t, 1>();
 
   auto weight_contiguous = optional_contiguous(weight);
   const scalar_t* weight_data = optional_data<scalar_t>(weight_contiguous);
@@ -349,7 +356,6 @@ static void nll_loss_backward_out_frame(
     auto grad_input_acc = grad_input.accessor<scalar_t, 2>();
 
     const auto batch_size = input.size(0);
-    TORCH_CHECK(target.size(0) == batch_size);
 
     for (int64_t i = 0; i < batch_size; i++) {
       const auto cur_target = target_acc[i];
@@ -548,12 +554,12 @@ Tensor nll_loss_nd(
     const c10::optional<Tensor>& weight,
     int64_t reduction,
     int64_t ignore_index) {
-  if (self.dim() < 2) {
+  if (self.dim() < 1) {
     TORCH_CHECK_VALUE(
-        false, "Expected 2 or more dimensions (got ", self.dim(), ")");
+        false, "Expected 1 or more dimensions (got ", self.dim(), ")");
   }
 
-  if (self.sizes()[0] != target.sizes()[0]) {
+  if (self.dim() != 1 && self.sizes()[0] != target.sizes()[0]) {
     TORCH_CHECK_VALUE(
         false,
         "Expected input batch_size (",
@@ -566,7 +572,7 @@ Tensor nll_loss_nd(
   Tensor ret;
   Tensor input_ = self;
   Tensor target_ = target;
-  if (input_.dim() == 2) {
+  if (input_.dim() == 1 || input_.dim() == 2) {
     ret = at::nll_loss(input_, target_, weight, reduction, ignore_index);
   } else if (input_.dim() == 4) {
     ret = at::nll_loss2d(input_, target_, weight, reduction, ignore_index);
diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu
index d814eae01f4ec..ac9c3c0d8130f 100644
--- a/aten/src/ATen/native/cuda/Loss.cu
+++ b/aten/src/ATen/native/cuda/Loss.cu
@@ -468,7 +468,6 @@ void nll_loss_backward_out_cuda_template(
   int64_t n_dims = input.dim();
   int64_t n_classes = input.size(-1);
   int64_t batch_size = n_dims == 1 ? 1 : input.size(0);
-  int64_t num_targets = target.size(0);
 
   auto weight_ = weight.defined() ? weight.contiguous() : weight;
 
diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 7f39db405c861..03732b6d192d8 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -164,10 +164,11 @@ class NLLLoss(_WeightedLoss):
             :attr:`reduction`. Default: ``'mean'``
 
     Shape:
-        - Input: :math:`(N, C)` where `C = number of classes`, or
+        - Input: :math:`(N, C)` or :math:`(C)`, where `C = number of classes`, or
           :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
           in the case of `K`-dimensional loss.
-        - Target: :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`, or
+        - Target: :math:`(N)` or :math:`()`, where each value is
+          :math:`0 \leq \text{targets}[i] \leq C-1`, or
           :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
           K-dimensional loss.
         - Output: If :attr:`reduction` is ``'none'``, shape :math:`(N)` or
diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py
index 6b1bcf66f8bd5..90024dea510ed 100644
--- a/torch/testing/_internal/common_nn.py
+++ b/torch/testing/_internal/common_nn.py
@@ -97,6 +97,7 @@ def get_weight(m):
 # - `test_cpp_api_parity`: if `False`, skips the C++ parity test for this test dict. Default: True.
 # - `has_parity`: if `False`, expects this test dict to fail the C++ parity test. Default: True.
 
+
 module_tests = [
     dict(
         module_name='Linear',
@@ -1308,6 +1309,7 @@ def single_batch_reference_fn(input, parameters, module):
     with freeze_rng_state():
         return module(single_batch_input).squeeze(0)
 
+
 new_module_tests = [
     poissonnllloss_no_reduce_test(),
     bceloss_no_reduce_test(),
@@ -4055,6 +4057,7 @@ def kldivloss_reference(input, target, reduction='mean'):
         return result.sum() / result.size(0)
     return result
 
+
 def kldivloss_log_target_reference(input, target, reduction='mean'):
     result = torch.exp(target) * (target - input)
     if reduction == 'mean':
@@ -5182,6 +5185,7 @@ def single_batch_reference_criterion_fn(*args):
     ('HingeEmbeddingLoss', lambda: torch.randn(9), lambda: torch.tensor([-1, 1, 1] * 3)),
     ('MultiLabelMarginLoss', lambda: torch.randn(4), lambda: torch.tensor([3, 0, -1, 1])),
     ('SoftMarginLoss', lambda: torch.randn(9), lambda: torch.tensor([-1, 1, 1] * 3)),
+    ('NLLLoss', lambda: F.log_softmax(torch.randn(3), dim=0), lambda: torch.tensor(1)),
 ]
 classification_criterion_no_batch_extra_info: Dict[str, dict] = {
     'MultiLabelMarginLoss': {'check_gradgrad': False},
@@ -5580,6 +5584,7 @@ def test_cuda(self, test_case):
 
         self.test_noncontig(test_case, gpu_module, gpu_input_tuple)
 
+
 class InputVariableMixin(object):
     def _get_input(self):
         input = TestBase._get_input(self, False)  # type: ignore[arg-type]
@@ -5888,8 +5893,10 @@ def convert_dtype(obj, dtype, requires_grad=False):
         test_case.assertEqualIgnoreType(cpu_output, gpu_output,
                                         atol=1e-1 if dtype in {torch.half, torch.bfloat16} else 4e-4, rtol=0)
 
-        cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_output, cpu_target, extra_args=extra_args)
-        gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_output, gpu_target, extra_args=extra_args)
+        cpu_gradInput = test_case._backward_criterion(
+            cpu_module, cpu_input, cpu_output, cpu_target, extra_args=extra_args)
+        gpu_gradInput = test_case._backward_criterion(
+            gpu_module, gpu_input, gpu_output, gpu_target, extra_args=extra_args)
         # dtype used to be able to be None, so set precision in this way instead of a precision map
         # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
         test_case.assertEqualIgnoreType(cpu_gradInput, gpu_gradInput,

From 33a163d886d1a7b236bc34e69d5a7415a133bc23 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Tue, 24 Aug 2021 08:32:33 -0700
Subject: [PATCH 169/530] Enable BFloat16 LeakyReLU and RReLU in CPU path
 (#61514)

Summary:
Enable and optimize BFloat16 LeakyReLU and RReLU in CPU path.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/61514

Reviewed By: ejguan

Differential Revision: D30257612

Pulled By: VitalyFedyunin

fbshipit-source-id: 8cc0d1faacd02dcc9827af724a86d95b6952748f
---
 aten/src/ATen/native/Activation.cpp           |  2 +-
 aten/src/ATen/native/cpu/Activation.cpp       | 93 +++++++++++++------
 test/cpp/api/functional.cpp                   | 70 +++++++-------
 test/cpp/api/modules.cpp                      | 80 ++++++++--------
 test/test_autograd.py                         |  6 ++
 .../_internal/common_methods_invocations.py   |  1 +
 6 files changed, 154 insertions(+), 98 deletions(-)

diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp
index e6ae3c9ebc3d7..37700bb586793 100644
--- a/aten/src/ATen/native/Activation.cpp
+++ b/aten/src/ATen/native/Activation.cpp
@@ -524,7 +524,7 @@ Tensor& rrelu_with_noise_out_cpu(const Tensor& self,
     c10::optional<Generator> generator,
     Tensor& output) {
   if (training) {
-    AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "rrelu_with_noise_out_cpu", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "rrelu_with_noise_out_cpu", [&] {
       _rrelu_with_noise_train<scalar_t>(output, self.contiguous(), noise, lower, upper, generator);
     });
     return output;
diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
index ae1403d1a25d1..fc5cc0d1924fb 100644
--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@@ -401,41 +401,80 @@ void hardswish_backward_kernel(TensorIterator& iter) {
 }
 
 static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) {
-  AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "leaky_relu_cpu", [&] {
-    using Vec = Vectorized<scalar_t>;
-    auto zero_vec = Vec((scalar_t)(0));
-    auto one_vec = Vec((scalar_t)(1));
-    scalar_t negval = negval_.to<scalar_t>();
-    Vec negval_v = Vec(negval);
+  if (iter.common_dtype() == kBFloat16) {
+    auto zero_vec = Vectorized<float>((float)(0));
+    auto one_vec = Vectorized<float>((float)(1));
+    float negval = negval_.to<float>();
+    Vectorized<float> negval_v = Vectorized<float>(negval);
     cpu_kernel_vec(
         iter,
-        [&](scalar_t a) -> scalar_t {
-          return a > scalar_t(0) ? a : a * negval;
+        [&](BFloat16 a) -> BFloat16 {
+          return float(a) > float(0) ? float(a) : float(a) * negval;
         },
-        [&](Vec a) -> Vec {
-          auto r = Vec::blendv(negval_v, one_vec, a > zero_vec);
-          return a * r;
+        [&](Vectorized<BFloat16> a) -> Vectorized<BFloat16> {
+          Vectorized<float> a0, a1;
+          std::tie(a0, a1) = convert_bfloat16_float(a);
+          auto res0 = a0 * (Vectorized<float>::blendv(negval_v, one_vec, a0 > zero_vec));
+          auto res1 = a1 * (Vectorized<float>::blendv(negval_v, one_vec, a1 > zero_vec));
+          return convert_float_bfloat16(res0, res1);
         });
-  });
+  } else {
+    AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "leaky_relu_cpu", [&] {
+      using Vec = Vectorized<scalar_t>;
+      auto zero_vec = Vec((scalar_t)(0));
+      auto one_vec = Vec((scalar_t)(1));
+      scalar_t negval = negval_.to<scalar_t>();
+      Vec negval_v = Vec(negval);
+      cpu_kernel_vec(
+          iter,
+          [&](scalar_t a) -> scalar_t {
+            return a > scalar_t(0) ? a : a * negval;
+          },
+          [&](Vec a) -> Vec {
+            auto r = Vec::blendv(negval_v, one_vec, a > zero_vec);
+            return a * r;
+          });
+    });
+  }
 }
 
 static void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) {
-  AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "leaky_relu_backward_cpu", [&] {
-    using Vec = Vectorized<scalar_t>;
-    auto zero_vec = Vec((scalar_t)(0));
-    auto one_vec = Vec((scalar_t)(1));
-    scalar_t negval = negval_.to<scalar_t>();
-    Vec negval_v = Vec(negval);
+  if (iter.common_dtype() == kBFloat16) {
+    auto zero_vec = Vectorized<float>((float)(0));
+    auto one_vec = Vectorized<float>((float)(1));
+    float negval = negval_.to<float>();
+    Vectorized<float> negval_v = Vectorized<float>(negval);
     cpu_kernel_vec(
-        iter,
-        [&](scalar_t a, scalar_t b) -> scalar_t {
-          return a > scalar_t(0) ? b : b * negval;
-        },
-        [&](Vec a, Vec b) -> Vec {
-          auto r = Vec::blendv(negval_v, one_vec, a > zero_vec);
-          return b * r;
-        });
-  });
+      iter,
+      [&](BFloat16 a, BFloat16 b) -> BFloat16 {
+        return float(a) > float(0) ? float(b) : float(b) * negval;
+      },
+      [&](Vectorized<BFloat16> a, Vectorized<BFloat16> b) -> Vectorized<BFloat16> {
+        Vectorized<float> a0, a1, b0, b1;
+        std::tie(a0, a1) = convert_bfloat16_float(a);
+        std::tie(b0, b1) = convert_bfloat16_float(b);
+        auto res0 = b0 * (Vectorized<float>::blendv(negval_v, one_vec, a0 > zero_vec));
+        auto res1 = b1 * (Vectorized<float>::blendv(negval_v, one_vec, a1 > zero_vec));
+        return convert_float_bfloat16(res0, res1);
+      });
+  } else {
+    AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "leaky_relu_backward_cpu", [&] {
+      using Vec = Vectorized<scalar_t>;
+      auto zero_vec = Vec((scalar_t)(0));
+      auto one_vec = Vec((scalar_t)(1));
+      scalar_t negval = negval_.to<scalar_t>();
+      Vec negval_v = Vec(negval);
+      cpu_kernel_vec(
+          iter,
+          [&](scalar_t a, scalar_t b) -> scalar_t {
+            return a > scalar_t(0) ? b : b * negval;
+          },
+          [&](Vec a, Vec b) -> Vec {
+            auto r = Vec::blendv(negval_v, one_vec, a > zero_vec);
+            return b * r;
+          });
+    });
+  }
 }
 
 void softplus_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_) {
diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp
index 582b1eebdb784..2ecb84189c55a 100644
--- a/test/cpp/api/functional.cpp
+++ b/test/cpp/api/functional.cpp
@@ -1034,17 +1034,19 @@ TEST_F(FunctionalTest, LeakyReLU) {
   const auto size = 3;
   for (const auto negative_slope : {0.0, 0.42, 1.0}) {
     for (const auto inplace : {false, true}) {
-      auto x = torch::linspace(-10.0, 10.0, size * size * size);
-      x.resize_({size, size, size});
-      auto y_exp = (x < 0) * x * negative_slope + (x >= 0) * x;
-      auto y = F::leaky_relu(x, F::LeakyReLUFuncOptions()
-        .negative_slope(negative_slope).inplace(inplace));
+      for (const auto type : {torch::kFloat, torch::kBFloat16}) {
+        auto x = torch::linspace(-10.0, 10.0, size * size * size).to(type);
+        x.resize_({size, size, size});
+        auto y_exp = (x < 0) * x * negative_slope + (x >= 0) * x;
+        auto y = F::leaky_relu(x, F::LeakyReLUFuncOptions()
+          .negative_slope(negative_slope).inplace(inplace));
 
-      ASSERT_EQ(y.ndimension(), 3);
-      ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
-      ASSERT_TRUE(torch::allclose(y, y_exp));
-      if (inplace) {
-        ASSERT_TRUE(torch::allclose(x, y_exp));
+        ASSERT_EQ(y.ndimension(), 3);
+        ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
+        ASSERT_TRUE(torch::allclose(y, y_exp));
+        if (inplace) {
+          ASSERT_TRUE(torch::allclose(x, y_exp));
+        }
       }
     }
   }
@@ -1443,19 +1445,21 @@ TEST_F(FunctionalTest, RReLU) {
   for (const auto lower : {0.01, 0.1, 0.2}) {
     for (const auto upper : {0.3, 0.4, 0.5}) {
       for (const auto inplace : {false, true}) {
-        auto x = torch::linspace(-10.0, 10.0, size * size * size);
-        x.resize_({size, size, size});
-        auto x_copy = x.clone();
-        auto y = F::rrelu(x, F::RReLUFuncOptions().lower(lower)
-          .upper(upper).inplace(inplace));
-        auto z = ((x_copy >= 0) * (x_copy == y) +
-          (x_copy < 0) * (y >= x_copy * upper) * (y <= lower * x_copy)) * 1.0;
-
-        ASSERT_EQ(y.ndimension(), 3);
-        ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
-        ASSERT_TRUE(torch::allclose(z, torch::ones_like(z)));
-        if (inplace) {
-          ASSERT_TRUE(torch::allclose(x, y));
+        for (const auto type : {torch::kFloat, torch::kBFloat16}) {
+          auto x = torch::linspace(-10.0, 10.0, size * size * size).to(type);
+          x.resize_({size, size, size});
+          auto x_copy = x.clone();
+          auto y = F::rrelu(x, F::RReLUFuncOptions().lower(lower)
+            .upper(upper).inplace(inplace));
+          auto z = ((x_copy >= 0) * (x_copy == y) +
+            (x_copy < 0) * (y >= x_copy * upper) * (y <= lower * x_copy)) * 1.0;
+
+          ASSERT_EQ(y.ndimension(), 3);
+          ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
+          ASSERT_TRUE(torch::allclose(z, torch::ones_like(z)));
+          if (inplace) {
+            ASSERT_TRUE(torch::allclose(x, y));
+          }
         }
       }
     }
@@ -1467,16 +1471,18 @@ TEST_F(FunctionalTest, RReLUDefaultOptions) {
   const auto size = 3;
   const auto lower = 1.0 / 8.0;
   const auto upper = 1.0 / 3.0;
-  auto x = torch::linspace(-10.0, 10.0, size * size * size);
-  x.resize_({size, size, size});
-  auto x_copy = x.clone();
-  auto y = F::rrelu(x);
-  auto z = ((x_copy >= 0) * (x_copy == y) +
-    (x_copy < 0) * (y >= x_copy * upper) * (y <= lower * x_copy)) * 1.0;
+  for (const auto type : {torch::kFloat, torch::kBFloat16}) {
+    auto x = torch::linspace(-10.0, 10.0, size * size * size).to(type);
+    x.resize_({size, size, size});
+    auto x_copy = x.clone();
+    auto y = F::rrelu(x);
+    auto z = ((x_copy >= 0) * (x_copy == y) +
+      (x_copy < 0) * (y >= x_copy * upper) * (y <= lower * x_copy)) * 1.0;
 
-  ASSERT_EQ(y.ndimension(), 3);
-  ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
-  ASSERT_TRUE(torch::allclose(z, torch::ones_like(z)));
+    ASSERT_EQ(y.ndimension(), 3);
+    ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
+    ASSERT_TRUE(torch::allclose(z, torch::ones_like(z)));
+  }
 }
 
 TEST_F(FunctionalTest, CELU) {
diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp
index 036ff5e4bf2ec..23d75efeee21f 100644
--- a/test/cpp/api/modules.cpp
+++ b/test/cpp/api/modules.cpp
@@ -2521,25 +2521,27 @@ TEST_F(ModulesTest, LeakyReLU) {
   const auto size = 3;
   for (const auto inplace : {false, true}) {
     for (const auto negative_slope : {0.0, 0.42, 1.0}) {
-      LeakyReLU model {LeakyReLUOptions().negative_slope(negative_slope).inplace(inplace)};
-      auto x = torch::linspace(-10.0, 10.0, size * size * size);
-      x.resize_({size, size, size});
-      if (!inplace) {
-        x.requires_grad_(true);
-      }
-      auto x_orig = x.clone();
-      auto y = model(x);
-      torch::Tensor s = y.sum();
+      for (const auto type : {torch::kFloat, torch::kBFloat16}) {
+        LeakyReLU model {LeakyReLUOptions().negative_slope(negative_slope).inplace(inplace)};
+        auto x = torch::linspace(-10.0, 10.0, size * size * size).to(type);
+        x.resize_({size, size, size});
+        if (!inplace) {
+          x.requires_grad_(true);
+        }
+        auto x_orig = x.clone();
+        auto y = model(x);
+        torch::Tensor s = y.sum();
 
-      ASSERT_EQ(s.ndimension(), 0);
-      ASSERT_EQ(y.ndimension(), 3);
-      ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
-      auto y_exp = (x_orig < 0) * x_orig * negative_slope + (x_orig >= 0) * x_orig;
-      ASSERT_TRUE(torch::allclose(y, y_exp));
-      if (inplace) {
-        ASSERT_TRUE(torch::allclose(x, y_exp));
-      } else {
-        s.backward();
+        ASSERT_EQ(s.ndimension(), 0);
+        ASSERT_EQ(y.ndimension(), 3);
+        ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
+        auto y_exp = (x_orig < 0) * x_orig * negative_slope + (x_orig >= 0) * x_orig;
+        ASSERT_TRUE(torch::allclose(y, y_exp));
+        if (inplace) {
+          ASSERT_TRUE(torch::allclose(x, y_exp));
+        } else {
+          s.backward();
+        }
       }
     }
   }
@@ -2740,26 +2742,28 @@ TEST_F(ModulesTest, RReLU) {
   for (const auto lower : {0.01, 0.1, 0.2}) {
     for (const auto upper : {0.3, 0.4, 0.5}) {
       for (const auto inplace : {false, true}) {
-        RReLU model {RReLUOptions().lower(lower).upper(upper).inplace(inplace)};
-        auto x = torch::linspace(-10.0, 10.0, size * size * size);
-        x.resize_({size, size, size});
-        if (!inplace) {
-          x.requires_grad_(true);
-        }
-        auto x_orig = x.clone();
-        auto y = model(x);
-        torch::Tensor s = y.sum();
-
-        ASSERT_EQ(s.ndimension(), 0);
-        ASSERT_EQ(y.ndimension(), 3);
-        ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
-        auto z = ((x_orig >= 0) * (x_orig == y) +
-          (x_orig < 0) * (y >= x_orig * upper) * (y <= lower * x_orig)) * 1.0;
-        ASSERT_TRUE(torch::allclose(z, torch::ones_like(z)));
-        if (inplace) {
-          ASSERT_TRUE(torch::allclose(x, y));
-        } else {
-          s.backward();
+        for (const auto type : {torch::kFloat, torch::kBFloat16}) {
+          RReLU model {RReLUOptions().lower(lower).upper(upper).inplace(inplace)};
+          auto x = torch::linspace(-10.0, 10.0, size * size * size).to(type);
+          x.resize_({size, size, size});
+          if (!inplace) {
+            x.requires_grad_(true);
+          }
+          auto x_orig = x.clone();
+          auto y = model(x);
+          torch::Tensor s = y.sum();
+
+          ASSERT_EQ(s.ndimension(), 0);
+          ASSERT_EQ(y.ndimension(), 3);
+          ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
+          auto z = ((x_orig >= 0) * (x_orig == y) +
+            (x_orig < 0) * (y >= x_orig * upper) * (y <= lower * x_orig)) * 1.0;
+          ASSERT_TRUE(torch::allclose(z, torch::ones_like(z)));
+          if (inplace) {
+            ASSERT_TRUE(torch::allclose(x, y));
+          } else {
+            s.backward();
+          }
         }
       }
     }
diff --git a/test/test_autograd.py b/test/test_autograd.py
index 8b7aeb4159f23..126d9230fe687 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -8256,6 +8256,12 @@ def test_leaky_relu_inplace_with_zero_slope(self, device):
         expected = torch.tensor([0., 0., 1.], device=device)
         self.assertEqual(a.grad, expected)
 
+        a_bf16 = torch.tensor([-2., 0., 2.], device=device, dtype=torch.bfloat16, requires_grad=True)
+        b_bf16 = torch.nn.functional.leaky_relu_(a_bf16.clone(), 0.0)
+        b_bf16.backward(torch.ones(3, device=device))
+        expected_bf16 = torch.tensor([0., 0., 1.], device=device, dtype=torch.bfloat16)
+        self.assertEqual(a_bf16.grad, expected_bf16)
+
     @onlyOnCPUAndCUDA
     def test_elu_inplace_with_neg_alpha(self, device):
         a = torch.tensor([-1., 1.], device=device, requires_grad=True)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 63af3965a2e7d..8dd9db3303456 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -7154,6 +7154,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            aten_name="leaky_relu",
            dtypes=floating_types(),
            sample_inputs_func=sample_inputs_leaky_relu,
+           dtypesIfCPU=floating_types_and(torch.bfloat16),
            dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
            supports_autograd=True,
            assert_autodiffed=True,

From 94d621584a8d2780252546aa787aab23203221b2 Mon Sep 17 00:00:00 2001
From: mingfeima <mingfei.ma@intel.com>
Date: Tue, 24 Aug 2021 08:54:36 -0700
Subject: [PATCH 170/530] optimize BFloat16 elemwise operators CPU: sigmoid,
 sigmoid_backward, tanh_backward, addcmul, addcdiv (#55221)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55221

Test Plan: Imported from OSS

Reviewed By: bdhirsh

Differential Revision: D28836797

Pulled By: VitalyFedyunin

fbshipit-source-id: 6b79098c902ffe65d228668118ef36fb49bab800
---
 aten/src/ATen/native/cpu/BinaryOpsKernel.cpp  | 65 ++++++++++----
 .../ATen/native/cpu/PointwiseOpsKernel.cpp    | 84 ++++++++++++++-----
 aten/src/ATen/native/cpu/UnaryOpsKernel.cpp   | 36 ++++++--
 .../_internal/common_methods_invocations.py   |  4 +-
 4 files changed, 143 insertions(+), 46 deletions(-)

diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
index 2a8f73cb88dd0..16efa2511899f 100644
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -684,19 +684,35 @@ void sigmoid_backward_kernel(TensorIteratorBase& iter) {
           return a * ((one_vec - b) * b).conj();
         });
     });
+  } else if (iter.dtype() == kBFloat16) {
+    auto one_vec = Vectorized<float>((float)(1));
+    cpu_kernel_vec(
+      iter,
+      [=](BFloat16 a, BFloat16 b) -> BFloat16 {
+         float a0 = static_cast<float>(a);
+         float b0 = static_cast<float>(b);
+         return a0 * (float(1) - b0) * b0;
+      },
+      [=](Vectorized<BFloat16> a, Vectorized<BFloat16> b) {
+         Vectorized<float> a0, a1, b0, b1;
+         std::tie(a0, a1) = convert_bfloat16_float(a);
+         std::tie(b0, b1) = convert_bfloat16_float(b);
+         a0 = a0 * (one_vec - b0) * b0;
+         a1 = a1 * (one_vec - b1) * b1;
+         return convert_float_bfloat16(a0, a1);
+      });
   } else {
-    AT_DISPATCH_FLOATING_TYPES_AND2(
-        kBFloat16, kHalf, iter.dtype(), "sigmoid_backward_cpu", [&]() {
-          auto one_vec = Vectorized<scalar_t>((scalar_t)(1));
-          cpu_kernel_vec(
-              iter,
-              [=](scalar_t a, scalar_t b) -> scalar_t {
-                return a * (scalar_t(1) - b) * b;
-              },
-              [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b) {
-                return a * (one_vec - b) * b;
-              });
+    AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "sigmoid_backward_cpu", [&]() {
+      auto one_vec = Vectorized<scalar_t>((scalar_t)(1));
+      cpu_kernel_vec(
+        iter,
+        [=](scalar_t a, scalar_t b) -> scalar_t {
+          return a * (scalar_t(1) - b) * b;
+        },
+        [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b) {
+          return a * (one_vec - b) * b;
         });
+    });
   }
 }
 
@@ -754,15 +770,32 @@ void tanh_backward_kernel(TensorIteratorBase& iter) {
   if (isComplexType(iter.dtype())) {
     AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "tanh_backward_cpu", [&]() {
       auto one_vec = Vectorized<scalar_t>(scalar_t{1});
+      cpu_kernel_vec(
+        iter,
+        [=](scalar_t a, scalar_t b) -> scalar_t {
+          return a * std::conj(scalar_t{1} - b * b);
+        },
+        [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b) {
+          return a * (one_vec - b * b).conj();
+        });
+    });
+  } else if (iter.dtype() == kBFloat16) {
+    auto one_vec = Vectorized<float>(float{1});
     cpu_kernel_vec(
       iter,
-      [=](scalar_t a, scalar_t b) -> scalar_t {
-        return a * std::conj(scalar_t{1} - b * b);
+      [=](BFloat16 a, BFloat16 b) -> BFloat16 {
+        float a0 = float(a);
+        float b0 = float(b);
+        return a0 * (float{1} - b0 * b0);
       },
-      [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b) {
-        return a * (one_vec - b * b).conj();
+      [=](Vectorized<BFloat16> a, Vectorized<BFloat16> b) {
+        Vectorized<float> a0, a1, b0, b1;
+        std::tie(a0, a1) = convert_bfloat16_float(a);
+        std::tie(b0, b1) = convert_bfloat16_float(b);
+        a0 = a0 * (one_vec - b0 * b0);
+        a1 = a1 * (one_vec - b1 * b1);
+        return convert_float_bfloat16(a0, a1);
       });
-  });
   } else {
     AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "tanh_backward_cpu", [&]() {
       auto one_vec = Vectorized<scalar_t>(scalar_t{1});
diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp
index e0807d14b1b5a..0d0508adb7c11 100644
--- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp
@@ -12,38 +12,82 @@ namespace {
 
 static void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) {
   ScalarType dtype = iter.dtype(0);
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX(dtype, "addcmul_cpu_out", [&] {
-    scalar_t scalar_val = value.to<scalar_t>();
-    auto scalar_vec = Vectorized<scalar_t>(scalar_val);
+  if (iter.dtype() == kBFloat16) {
+    float float_val = value.to<float>();
+    auto float_vec = Vectorized<float>(float_val);
     cpu_kernel_vec(
         iter,
-        [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t {
-          return self_val + scalar_val * t1_val * t2_val;
+        [=](BFloat16 self_val, BFloat16 t1_val, BFloat16 t2_val) -> BFloat16 {
+          return float(self_val) + float_val * float(t1_val) * float(t2_val);
         },
-        [=](Vectorized<scalar_t> self_vec,
-            Vectorized<scalar_t> t1_vec,
-            Vectorized<scalar_t> t2_vec) {
-          return self_vec + scalar_vec * t1_vec * t2_vec;
+        [=](Vectorized<BFloat16> self_vec,
+          Vectorized<BFloat16> t1_vec,
+          Vectorized<BFloat16> t2_vec) {
+          Vectorized<float> self_vec0, self_vec1;
+          std::tie(self_vec0, self_vec1) = convert_bfloat16_float(self_vec);
+          Vectorized<float> t1_vec0, t1_vec1, t2_vec0, t2_vec1;
+          std::tie(t1_vec0, t1_vec1) = convert_bfloat16_float(t1_vec);
+          std::tie(t2_vec0, t2_vec1) = convert_bfloat16_float(t2_vec);
+          self_vec0 = self_vec0 + float_vec * t1_vec0 * t2_vec0;
+          self_vec1 = self_vec1 + float_vec * t1_vec1 * t2_vec1;
+          return convert_float_bfloat16(self_vec0, self_vec1);
         });
-  });
+  } else {
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX(dtype, "addcmul_cpu_out", [&] {
+      scalar_t scalar_val = value.to<scalar_t>();
+      auto scalar_vec = Vectorized<scalar_t>(scalar_val);
+      cpu_kernel_vec(
+          iter,
+          [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t {
+            return self_val + scalar_val * t1_val * t2_val;
+          },
+          [=](Vectorized<scalar_t> self_vec,
+              Vectorized<scalar_t> t1_vec,
+              Vectorized<scalar_t> t2_vec) {
+            return self_vec + scalar_vec * t1_vec * t2_vec;
+          });
+    });
+  }
 }
 
 static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) {
   ScalarType dtype = iter.dtype(0);
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX(dtype, "addcdiv_cpu_out", [&] {
-    scalar_t scalar_val = value.to<scalar_t>();
-    auto scalar_vec = Vectorized<scalar_t>(scalar_val);
+  if (dtype == kBFloat16) {
+    float float_val = value.to<float>();
+    auto float_vec = Vectorized<float>(float_val);
     cpu_kernel_vec(
         iter,
-        [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t {
-          return self_val + scalar_val * t1_val / t2_val;
+        [=](BFloat16 self_val, BFloat16 t1_val, BFloat16 t2_val) -> BFloat16 {
+          return float(self_val) + float_val * float(t1_val) / float(t2_val);
         },
-        [=](Vectorized<scalar_t> self_vec,
-            Vectorized<scalar_t> t1_vec,
-            Vectorized<scalar_t> t2_vec) {
-          return self_vec + scalar_vec * t1_vec / t2_vec;
+        [=](Vectorized<BFloat16> self_vec,
+            Vectorized<BFloat16> t1_vec,
+            Vectorized<BFloat16> t2_vec) {
+            Vectorized<float> self_vec0, self_vec1;
+          std::tie(self_vec0, self_vec1) = convert_bfloat16_float(self_vec);
+          Vectorized<float> t1_vec0, t1_vec1, t2_vec0, t2_vec1;
+          std::tie(t1_vec0, t1_vec1) = convert_bfloat16_float(t1_vec);
+          std::tie(t2_vec0, t2_vec1) = convert_bfloat16_float(t2_vec);
+          self_vec0 = self_vec0 + float_vec * t1_vec0 / t2_vec0;
+          self_vec1 = self_vec1 + float_vec * t1_vec1 / t2_vec1;
+          return convert_float_bfloat16(self_vec0, self_vec1);
         });
-  });
+  } else {
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX(dtype, "addcdiv_cpu_out", [&] {
+      scalar_t scalar_val = value.to<scalar_t>();
+      auto scalar_vec = Vectorized<scalar_t>(scalar_val);
+      cpu_kernel_vec(
+          iter,
+          [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t {
+            return self_val + scalar_val * t1_val / t2_val;
+          },
+          [=](Vectorized<scalar_t> self_vec,
+              Vectorized<scalar_t> t1_vec,
+              Vectorized<scalar_t> t2_vec) {
+            return self_vec + scalar_vec * t1_vec / t2_vec;
+          });
+    });
+  }
 }
 
 static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) {
diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
index a867a2a0ce519..007e444d5cd33 100644
--- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
@@ -35,18 +35,36 @@ namespace CPU_CAPABILITY {
 using namespace vec;
 
 static void sigmoid_kernel(TensorIteratorBase& iter) {
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.common_dtype(), "sigmoid_cpu", [&]() {
+  if (iter.common_dtype() == kBFloat16) {
     cpu_kernel_vec(
         iter,
-        [=](scalar_t a) -> scalar_t { return (static_cast<scalar_t>(1) / (static_cast<scalar_t>(1) + std::exp((-a)))); },
-        [=](Vectorized<scalar_t> a) {
-          a = Vectorized<scalar_t>(static_cast<scalar_t>(0)) - a;
-          a = a.exp();
-          a = Vectorized<scalar_t>(static_cast<scalar_t>(1)) + a;
-          a = a.reciprocal();
-          return a;
+        [=](BFloat16 a) -> BFloat16 {
+          float a0 = static_cast<float>(a);
+          return static_cast<float>(1) / (static_cast<float>(1) + std::exp((-a0)));
+        },
+        [=](Vectorized<BFloat16> a) {
+          Vectorized<float> a0, a1;
+          std::tie(a0, a1) = convert_bfloat16_float(a);
+          a0 = (Vectorized<float>(static_cast<float>(1)) + a0.neg().exp()).reciprocal();
+          a1 = (Vectorized<float>(static_cast<float>(1)) + a1.neg().exp()).reciprocal();
+          return convert_float_bfloat16(a0, a1);
         });
-  });
+  } else {
+    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.common_dtype(), "sigmoid_cpu", [&]() {
+      cpu_kernel_vec(
+          iter,
+          [=](scalar_t a) -> scalar_t {
+            return (static_cast<scalar_t>(1) / (static_cast<scalar_t>(1) + std::exp((-a))));
+          },
+          [=](Vectorized<scalar_t> a) {
+            a = Vectorized<scalar_t>(static_cast<scalar_t>(0)) - a;
+            a = a.exp();
+            a = Vectorized<scalar_t>(static_cast<scalar_t>(1)) + a;
+            a = a.reciprocal();
+            return a;
+          });
+    });
+  }
 }
 
 #if AT_MKL_ENABLED()
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 8dd9db3303456..b725c4831d25f 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -5689,6 +5689,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            gradcheck_nondet_tol=GRADCHECK_NONDET_TOL),
     OpInfo('addcmul',
            dtypes=all_types_and_complex(),
+           dtypesIfCPU=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=all_types_and_complex_and(torch.float16, torch.bfloat16),
            assert_autodiffed=True,
            supports_forward_ad=True,
@@ -5699,6 +5700,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            sample_inputs_func=sample_inputs_addcmul_addcdiv),
     OpInfo('addcdiv',
            dtypes=floating_and_complex_types(),
+           dtypesIfCPU=floating_and_complex_types_and(torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            supports_inplace_autograd=False,
            supports_forward_ad=True,
@@ -7677,7 +7679,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                    dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
                    dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
                    # "tanh_backward_cpu" not implemented for 'BFloat16'
-                   backward_dtypesIfCPU=all_types_and_complex_and(torch.bool),
+                   backward_dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16),
                    assert_autodiffed=True,
                    safe_casts_outputs=True,
                    supports_forward_ad=True,

From 227cb268bccd22feb8aa8651773a202ec1e09c7f Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Tue, 24 Aug 2021 09:24:50 -0700
Subject: [PATCH 171/530] [Reland] Embedding thrust->cub migration (#63806)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63427

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63806

Reviewed By: bdhirsh

Differential Revision: D30498255

Pulled By: ngimel

fbshipit-source-id: 78b7085a92a168cf0163f53dcb712bac922f5235
---
 aten/src/ATen/cuda/cub.cuh                    | 19 +++-
 aten/src/ATen/native/cuda/Embedding.cu        | 91 +++++--------------
 .../native/cuda/EmbeddingBackwardKernel.cuh   |  4 -
 aten/src/ATen/native/cuda/Indexing.cu         |  3 -
 .../ATen/native/cuda/LegacyThrustHelpers.cu   | 43 +++++++++
 aten/src/ATen/native/cuda/Randperm.cu         |  2 -
 aten/src/ATen/native/cuda/UniqueCub.cu        | 13 +--
 torch/testing/_internal/common_nn.py          |  8 ++
 8 files changed, 95 insertions(+), 88 deletions(-)

diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh
index 62da28d34e8e5..38e5852260f3a 100644
--- a/aten/src/ATen/cuda/cub.cuh
+++ b/aten/src/ATen/cuda/cub.cuh
@@ -3,6 +3,7 @@
 #include <cstddef>
 #include <type_traits>
 #include <iterator>
+#include <limits>
 
 // include cub in a safe manner, see:
 // https://github.com/pytorch/pytorch/pull/55292
@@ -102,6 +103,8 @@ static inline void sort_keys(
     const key_t *keys_in, key_t *keys_out,
     int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8
 ) {
+  TORCH_CHECK(n <= std::numeric_limits<int>::max(),
+    "cub sort does not support sorting more than INT_MAX elements");
   using key_t_ = typename detail::cuda_type<key_t>::type;
 
   const key_t_ *keys_in_ = reinterpret_cast<const key_t_*>(keys_in);
@@ -124,6 +127,8 @@ static inline void sort_pairs(
     const value_t *values_in, value_t *values_out,
     int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8
 ) {
+  TORCH_CHECK(n <= std::numeric_limits<int>::max(),
+    "cub sort does not support sorting more than INT_MAX elements");
   using key_t_ = typename detail::cuda_type<key_t>::type;
 
   auto allocator = c10::cuda::CUDACachingAllocator::get();
@@ -156,6 +161,10 @@ static inline void segmented_sort_pairs(
     OffsetIteratorT begin_offsets, OffsetIteratorT end_offsets,
     bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8
 ) {
+  TORCH_CHECK(num_elements <= std::numeric_limits<int>::max(),
+    "cub sort does not support sorting more than INT_MAX elements");
+  TORCH_CHECK(num_segments <= std::numeric_limits<int>::max(),
+    "cub sort does not support sorting more than INT_MAX elements");
   using key_t_ = typename detail::cuda_type<key_t>::type;
 
   auto allocator = c10::cuda::CUDACachingAllocator::get();
@@ -305,4 +314,12 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
   }
 }
 
-}}}
+template<typename InputIteratorT , typename OutputIteratorT , typename NumSelectedIteratorT >
+inline void unique(InputIteratorT input, OutputIteratorT output, NumSelectedIteratorT num_selected_out, int64_t num_items) {
+  TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
+    "cub unique does not support more than INT_MAX elements");
+  CUB_WRAPPER(NO_ROCM(detail)::cub::DeviceSelect::Unique,
+    input, output, num_selected_out, num_items, at::cuda::getCurrentCUDAStream());
+}
+
+}}}  // namespace at::cuda::cub
diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu
index 10a42b8914e62..ba79fa10f926a 100644
--- a/aten/src/ATen/native/cuda/Embedding.cu
+++ b/aten/src/ATen/native/cuda/Embedding.cu
@@ -7,12 +7,9 @@
 
 #include <THC/THCDeviceUtils.cuh>
 #include <THC/THCTensorMathReduce.cuh>
-#include <THC/THCThrustAllocator.cuh>
 #include <THC/THCReduceApplyUtils.cuh>
 
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/unique.h>
+#include <ATen/cuda/cub.cuh>
 
 #include <ATen/native/cuda/EmbeddingBackwardKernel.cuh>
 #include <ATen/native/cuda/SortingCommon.cuh>
@@ -224,14 +221,19 @@ __global__ void renorm_kernel(
 
 } // anonymous namespace
 
-Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices,
+template<typename index_t>
+void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
+
+Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_,
                                int64_t num_weights, int64_t padding_idx,
                                bool scale_grad_by_freq) {
   auto grad_arg = TensorArg(grad_, "grad", 1);
-  auto indices_arg = TensorArg(indices, "indices", 1);
+  auto indices_arg = TensorArg(indices_, "indices", 1);
   checkScalarTypes("embedding_backward", indices_arg, {kLong, kInt});
   checkSameGPU("embedding_backward", grad_arg, indices_arg);
 
+  auto indices = indices_.contiguous();
+
   auto num_indices = indices.numel();
   auto grad = grad_.contiguous().view({num_indices, grad_.size(-1)});
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -272,59 +274,16 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
   auto orig_indices = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   Tensor count;
   AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
-    using device_ptr = thrust::device_ptr<index_t>;
-
-    // Sort the inputs into sorted with the corresponding indices; we
-    // don't need a stable or multidimensional sort, so just use Thrust
-    // directly
-    {
-        sorted_indices.copy_(indices);
-
-        auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());
-        auto policy = thrust::cuda::par(allocator).on(stream);
-
-        // Fill sortedOrigIndices with sequential indices
-        auto count_iter = thrust::counting_iterator<index_t>(0);
-        auto orig_data = device_ptr(orig_indices.data_ptr<index_t>());
-        thrust::copy(policy, count_iter, count_iter + num_indices, orig_data);
-
-        // Sort; a stable sort is not required
-        auto sorted_data = device_ptr(sorted_indices.data_ptr<index_t>());
-        thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data,
-                            LTOp<index_t>());
-    }
+    auto range = at::arange(num_indices, indices.options());
+    int64_t nbits = cuda::cub::get_num_bits(num_weights);
+    cuda::cub::sort_pairs(
+      indices.data_ptr<index_t>(), sorted_indices.data_ptr<index_t>(),
+      range.data_ptr<index_t>(), orig_indices.data_ptr<index_t>(),
+      num_indices, false/*, 0, nbits*/);
 
     if (scale_grad_by_freq) {
       count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-
-      auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());
-      auto policy = thrust::cuda::par(allocator).on(stream);
-
-      // Compute an increasing sequence per unique item in sortedIndices:
-      // sorted: 2 5 5 5 7 7 8 9 9
-      //  count: 1 1 2 3 1 2 1 1 2
-      auto sorted_data = device_ptr(sorted_indices.data_ptr<index_t>());
-      auto count_data = device_ptr(count.data_ptr<index_t>());
-      thrust::inclusive_scan_by_key(
-        policy,
-        sorted_data,
-        sorted_data + num_indices,
-        thrust::make_constant_iterator(1),
-        count_data
-      );
-
-      // Take the maximum of each count per unique key in reverse:
-      // sorted: 2 5 5 5 7 7 8 9 9
-      //  count: 1 3 3 3 2 2 1 2 2
-      thrust::inclusive_scan_by_key(
-        policy,
-        thrust::make_reverse_iterator(sorted_data + num_indices),
-        thrust::make_reverse_iterator(sorted_data),
-        thrust::make_reverse_iterator(count_data + num_indices),
-        thrust::make_reverse_iterator(count_data + num_indices),
-        thrust::equal_to<index_t>(),
-        thrust::maximum<index_t>()
-      );
+      embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
     }
   });
 
@@ -340,23 +299,23 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices,
   checkSameGPU("embedding_renorm", self_arg, indices_arg);
 
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());
-  auto policy = thrust::cuda::par(allocator).on(stream);
 
   AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_renorm_cuda_", [&] () {
-    using device_ptr = thrust::device_ptr<index_t>;
 
     auto num_indices = indices.numel();
     auto indices_contig = std::get<0>(indices.sort()).contiguous();
-    auto indices_data = device_ptr(indices_contig.data_ptr<index_t>());
-
     auto unique_indices = at::empty(indices.numel(), indices.options());
-    auto unique_data = device_ptr(unique_indices.data_ptr<index_t>());
-    auto end = thrust::unique_copy(policy, indices_data, indices_data + num_indices, unique_data);
-    auto num_unique_indices = static_cast<int>(end - unique_data);
+    auto num_unique_indices = at::empty({}, indices.options().dtype(kLong));
+
+    cuda::cub::unique(
+      indices_contig.data_ptr<index_t>(),
+      unique_indices.data_ptr<index_t>(),
+      num_unique_indices.data_ptr<int64_t>(),
+      num_indices
+    );
 
-    dim3 grid(num_unique_indices);
-    dim3 block(128);
+    dim3 grid = num_unique_indices.item<int64_t>();
+    dim3 block = 128;
     int dim = self.stride(0);
 
     AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "embedding_backward", [&] {
diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh
index f06b850668591..c79bf83cc8a6a 100644
--- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh
+++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh
@@ -10,10 +10,6 @@
 #include <THC/THCThrustAllocator.cuh>
 #include <THC/THCAtomics.cuh>
 
-#include <thrust/execution_policy.h>
-#include <thrust/unique.h>
-#include <thrust/device_vector.h>
-
 #pragma once
 
 namespace at {
diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu
index 95ab33e512f02..57654f2fb9b74 100644
--- a/aten/src/ATen/native/cuda/Indexing.cu
+++ b/aten/src/ATen/native/cuda/Indexing.cu
@@ -218,9 +218,6 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<c10::optional<Ten
   std::tie(linearIndex, src, nElemBefore, strideBefore, sliceSize, inversePerm) = makeLinearIndex(self, indices, !unsafe);
   int64_t num_indices = linearIndex.numel();
 
-  TORCH_CHECK(num_indices <= std::numeric_limits<int>::max(),
-    "index_put of tensors larger than INT_MAX is not supported yet in pytorch");
-
   if (num_indices > 0 && sliceSize > 0) {
       const bool permuted = !src.is_contiguous();
       auto src_ = permuted ? src.contiguous() : src;
diff --git a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu
index 582dc9ebe0498..446aa085a31d3 100644
--- a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu
+++ b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu
@@ -5,6 +5,8 @@
 #include <thrust/device_ptr.h>
 #include <thrust/execution_policy.h>
 #include <thrust/sort.h>
+#include <thrust/unique.h>
+#include <thrust/device_ptr.h>
 
 namespace at { namespace native {
 
@@ -30,4 +32,45 @@ void index_put_with_sort_kernel_thrust_helper(Tensor &linearIndex, Tensor &orig_
   thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data, LTOp<int64_t>());
 }
 
+template<typename index_t>
+void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) {
+  using device_ptr = thrust::device_ptr<index_t>;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());
+  auto policy = thrust::cuda::par(allocator).on(stream);
+
+  auto num_indices = count.numel();
+
+  // Compute an increasing sequence per unique item in sortedIndices:
+  // sorted: 2 5 5 5 7 7 8 9 9
+  //  count: 1 1 2 3 1 2 1 1 2
+  auto sorted_data = device_ptr(sorted_indices.data_ptr<index_t>());
+  auto count_data = device_ptr(count.data_ptr<index_t>());
+  thrust::inclusive_scan_by_key(
+    policy,
+    sorted_data,
+    sorted_data + num_indices,
+    thrust::make_constant_iterator(1),
+    count_data
+  );
+
+  // Take the maximum of each count per unique key in reverse:
+  // sorted: 2 5 5 5 7 7 8 9 9
+  //  count: 1 3 3 3 2 2 1 2 2
+  thrust::inclusive_scan_by_key(
+    policy,
+    thrust::make_reverse_iterator(sorted_data + num_indices),
+    thrust::make_reverse_iterator(sorted_data),
+    thrust::make_reverse_iterator(count_data + num_indices),
+    thrust::make_reverse_iterator(count_data + num_indices),
+    thrust::equal_to<index_t>(),
+    thrust::maximum<index_t>()
+  );
+}
+
+template
+void embedding_dense_backward_cuda_scan<int>(Tensor &sorted_indices, Tensor &count);
+template
+void embedding_dense_backward_cuda_scan<int64_t>(Tensor &sorted_indices, Tensor &count);
+
 }}
diff --git a/aten/src/ATen/native/cuda/Randperm.cu b/aten/src/ATen/native/cuda/Randperm.cu
index 4c5e16a1ceed0..56b8eb20faae6 100644
--- a/aten/src/ATen/native/cuda/Randperm.cu
+++ b/aten/src/ATen/native/cuda/Randperm.cu
@@ -47,8 +47,6 @@ template <int N> struct alignas(N) OpaqueType { char data[N]; };
 
 Tensor& randperm_out_cuda(int64_t n, c10::optional<Generator> generator, Tensor& result) {
   TORCH_CHECK(n >= 0, "n must be non-negative, got", n);
-  TORCH_CHECK(n <= std::numeric_limits<int>::max(),
-    "randperm of tensors larger than INT_MAX is not supported yet in pytorch");
 
   check_supported_max_int_with_precision(n, result);
 
diff --git a/aten/src/ATen/native/cuda/UniqueCub.cu b/aten/src/ATen/native/cuda/UniqueCub.cu
index 1b9619b29812b..eb31fd2f76bb8 100644
--- a/aten/src/ATen/native/cuda/UniqueCub.cu
+++ b/aten/src/ATen/native/cuda/UniqueCub.cu
@@ -94,13 +94,7 @@ std::tuple<Tensor, Tensor, Tensor, int64_t> compute_unique(
   Tensor length = at::empty({1}, options);
   int64_t num_out;
   if (!return_counts) {
-    CUB_WRAPPER(
-        cub::DeviceSelect::Unique,
-        data,
-        data_out.data_ptr<scalar_t>(),
-        length.data_ptr<int64_t>(),
-        num_inp,
-        stream);
+    cuda::cub::unique(data, data_out.data_ptr<scalar_t>(), length.data_ptr<int64_t>(), num_inp);
     num_out = length.item<int64_t>();
   } else {
     counts.resize_(num_inp);
@@ -135,11 +129,6 @@ std::tuple<Tensor, Tensor, Tensor> unique_cuda_template(
 
   auto options = self.options().dtype(kLong);
   int64_t num_inp = self.numel();
-  TORCH_CHECK(
-      num_inp <= INT_MAX,
-      "num_inp ",
-      num_inp,
-      " is too big to be handled by cub");
   Tensor sorted;
   Tensor self_c = self.contiguous();
   if (consecutive) {
diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py
index 90024dea510ed..e0d09b7ba03fc 100644
--- a/torch/testing/_internal/common_nn.py
+++ b/torch/testing/_internal/common_nn.py
@@ -2774,6 +2774,14 @@ def single_batch_reference_fn(input, parameters, module):
         input_fn=lambda: torch.empty(2, 3, dtype=torch.long).random_(4),
         check_gradgrad=False,
     ),
+    dict(
+        module_name='Embedding',
+        constructor_args=(4, 3),
+        cpp_constructor_args='torch::nn::EmbeddingOptions(4, 3)',
+        input_fn=lambda: torch.empty(1, 512, dtype=torch.long).random_(4).expand(7, 512),
+        check_gradgrad=False,
+        desc='discontiguous'
+    ),
     dict(
         module_name='EmbeddingBag',
         constructor_args=(4, 3),

From 7774a4e95b69d7a61ae1644a27f9b924d93ca600 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Tue, 24 Aug 2021 09:38:25 -0700
Subject: [PATCH 172/530] [Static Runtime] Implement prim::VarStack out variant
 (#63579)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63579

Provide a static runtime out variant implementation for the new op introduced in D30426232 (https://github.com/pytorch/pytorch/commit/1385f9fb12e6607c98d2d9d5edaaaab2bc07386f).

Test Plan: `buck test //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- IndividualOps_VarStack`

Reviewed By: navahgar

Differential Revision: D30410525

fbshipit-source-id: bc59a3d8ad23e3d94561ec2dca9cc20687dbadf8
---
 benchmarks/static_runtime/test_scripts.h      |  5 +++
 .../static_runtime/test_static_runtime.cc     | 16 ++++++++++
 torch/csrc/jit/runtime/static/impl.cpp        |  1 +
 torch/csrc/jit/runtime/static/ops.cpp         | 31 ++++++++++++++++---
 4 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 9e01d3b8d0b87..c82dd57752bd6 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -587,6 +587,11 @@ const auto var_cat_script = R"JIT(
    return torch.cat([inp1, inp2], dim).clone()
 )JIT";
 
+const auto var_stack_script = R"JIT(
+  def forward(self, inp1: Tensor, inp2: Tensor, dim: int):
+   return torch.stack([inp1, inp2], dim).clone()
+)JIT";
+
 const auto isinstance_int_script = R"JIT(
   def forward(self, a: Any):
       return isinstance(a, int)
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 71102215b4e2e..701231e7720d1 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1195,3 +1195,19 @@ TEST(StaticRuntime, QuantizedLinear) {
 
   testStaticRuntime(quantize_script, {input, weight}, {input_2, weight_2});
 }
+
+TEST(StaticRuntime, IndividualOps_VarStack) {
+  // 2D tensors - stack dim = 0
+  std::vector<IValue> args1 = {at::randn({6, 6}), at::randn({6, 6}), 0};
+  testStaticRuntime(var_stack_script, args1);
+
+  // 3D tensors - stack dim = 1
+  std::vector<IValue> args2 = {at::randn({4, 5, 6}), at::randn({4, 5, 6}), 1};
+  testStaticRuntime(var_stack_script, args2);
+
+  // 3D tensors - stack dim = 2
+  std::vector<IValue> args3 = {at::randn({4, 5, 6}), at::randn({4, 5, 6}), 2};
+  testStaticRuntime(var_stack_script, args3);
+
+  testStaticRuntime(var_stack_script, args1, args2);
+}
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index 1b5ee724b45a4..cb9342b364cc5 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -66,6 +66,7 @@ void OptimizeGraph(
   EliminateDeadCode(graph);
   FuseInferenceOpsForSparseNN(graph);
   UseVariadicCat(graph);
+  UseVariadicStack(graph);
 
   // TODO: we can avoid this guard by moving operations
   // to exposed folders.
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 62d86126a0577..e6af641083fc2 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -452,6 +452,29 @@ SROperator aten_stack(Node* n) {
 
 REGISTER_OPERATOR_FUNCTOR(aten::stack, aten_stack, aten_stack);
 
+REGISTER_OPERATOR_FUNCTOR(
+    prim::VarStack,
+    prim_VarStack,
+    [](Node* n) -> SROperator {
+      return [](ProcessedNode* p_node) {
+        const size_t num_inputs = p_node->inputs().size();
+
+        std::vector<at::Tensor> inputs(num_inputs - 1);
+        for (size_t i = 0; i < num_inputs - 1; ++i) {
+          inputs[i] = p_node->Input(i).toTensor();
+        }
+
+        const auto dim = p_node->Input(num_inputs - 1).toInt();
+        if (p_node->Output(0).isNone()) {
+          p_node->Output(0) = at::native::_stack_cpu(inputs, dim);
+        } else {
+          auto& out_t = p_node->Output(0).toTensor();
+          fastResizeToZero(out_t);
+          at::native::_stack_out_cpu(inputs, dim, out_t);
+        }
+      };
+    });
+
 REGISTER_OPERATOR_FUNCTOR(aten::leaky_relu, aten_leaky_relu, [](Node* n) -> SROperator {
   if (!n->matches(torch::schema(
           "aten::leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor"))) {
@@ -472,10 +495,10 @@ REGISTER_OPERATOR_FUNCTOR(aten::leaky_relu, aten_leaky_relu, [](Node* n) -> SROp
 
 namespace {
 
-// Use the width of an AVX-512 vector by default; this happens to work OK for
-// AVX2 as well. Some ops benefit from using multiple AVX ports, in which case
-// they are vectorized by twice this constant.  An exception is logit, since it
-// contains FP divide, which is single-ported.
+// Use the width of an AVX-512 vector by default; this happens to work OK
+// for AVX2 as well. Some ops benefit from using multiple AVX ports, in
+// which case they are vectorized by twice this constant.  An exception is
+// logit, since it contains FP divide, which is single-ported.
 static constexpr int kVectorWidth = 16;
 
 #ifdef TORCH_ENABLE_LLVM

From 5dee15401c7730ae2a0e49e774c9da2d067527b7 Mon Sep 17 00:00:00 2001
From: Karen Zhou <kazhou@fb.com>
Date: Tue, 24 Aug 2021 10:17:28 -0700
Subject: [PATCH 173/530] [pruner] refactor `ActivationReconstruction` forward
 hooks (#63158)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63158

Combined functionality for `ActivationReconstruction` for both Linear and Conv2d in one class. The only difference between the old classes was the size and indexing of the reconstructed tensor -- that logic can be generalized by iterating over the size of `output`.
ghstack-source-id: 136467465

Test Plan:
`buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner`

https://pxl.cl/1MSSv

Reviewed By: raghuramank100

Differential Revision: D30282765

fbshipit-source-id: 08a1e4e0650511019fff85cf52b41dd818b0c7f8
---
 torch/ao/sparsity/__init__.py                 |  3 +--
 .../experimental/pruner/base_pruner.py        | 10 +++-----
 .../experimental/pruner/parametrization.py    | 25 +++++++++----------
 3 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/torch/ao/sparsity/__init__.py b/torch/ao/sparsity/__init__.py
index ef03c71c64732..9ba05f2cde5ab 100644
--- a/torch/ao/sparsity/__init__.py
+++ b/torch/ao/sparsity/__init__.py
@@ -17,8 +17,7 @@
 
 # Parametrizations
 from .experimental.pruner.parametrization import PruningParametrization
-from .experimental.pruner.parametrization import LinearActivationReconstruction
-from .experimental.pruner.parametrization import Conv2dActivationReconstruction
+from .experimental.pruner.parametrization import ActivationReconstruction
 
 # Pruner
 from .experimental.pruner.base_pruner import BasePruner
diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py
index 075a7ceae305a..92e1945666262 100644
--- a/torch/ao/sparsity/experimental/pruner/base_pruner.py
+++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py
@@ -8,7 +8,7 @@
 
 from torch.nn.modules.container import ModuleDict, ModuleList
 
-from .parametrization import PruningParametrization, LinearActivationReconstruction, Conv2dActivationReconstruction
+from .parametrization import PruningParametrization, ActivationReconstruction
 
 SUPPORTED_MODULES = {
     nn.Linear,
@@ -140,13 +140,9 @@ def prepare(self, use_path=False, *args, **kwargs):
 
             assert isinstance(module.parametrizations, ModuleDict)  # make mypy happy
             assert isinstance(module.parametrizations.weight, ModuleList)
-            if isinstance(module, nn.Linear):
+            if isinstance(module, tuple(SUPPORTED_MODULES)):
                 self.activation_handles.append(module.register_forward_hook(
-                    LinearActivationReconstruction(module.parametrizations.weight[0])
-                ))
-            elif isinstance(module, nn.Conv2d):
-                self.activation_handles.append(module.register_forward_hook(
-                    Conv2dActivationReconstruction(module.parametrizations.weight[0])
+                    ActivationReconstruction(module.parametrizations.weight[0])
                 ))
             else:
                 raise NotImplementedError("This module type is not supported yet.")
diff --git a/torch/ao/sparsity/experimental/pruner/parametrization.py b/torch/ao/sparsity/experimental/pruner/parametrization.py
index 1156ea8af4ef1..d4bebb27725cd 100644
--- a/torch/ao/sparsity/experimental/pruner/parametrization.py
+++ b/torch/ao/sparsity/experimental/pruner/parametrization.py
@@ -1,5 +1,6 @@
 import torch
 from torch import nn
+from typing import Any, List
 
 
 class PruningParametrization(nn.Module):
@@ -13,27 +14,25 @@ def forward(self, x):
         return x[list(valid_outputs)]
 
 
-class LinearActivationReconstruction:
+class ActivationReconstruction:
     def __init__(self, parametrization):
         self.param = parametrization
 
     def __call__(self, module, input, output):
         max_outputs = self.param.original_outputs
         pruned_outputs = self.param.pruned_outputs
-        reconstructed_tensor = torch.zeros((output.shape[0], len(max_outputs)))
         valid_columns = list(max_outputs - pruned_outputs)
-        reconstructed_tensor[:, valid_columns] = output
-        return reconstructed_tensor
 
+        # get size of reconstructed output
+        sizes = list(output.shape)
+        sizes[1] = len(max_outputs)
 
-class Conv2dActivationReconstruction:
-    def __init__(self, parametrization):
-        self.param = parametrization
+        # get valid indices of reconstructed output
+        indices: List[Any] = []
+        for size in output.shape:
+            indices.append(slice(0, size, 1))
+        indices[1] = valid_columns
 
-    def __call__(self, module, input, output):
-        max_outputs = self.param.original_outputs
-        pruned_outputs = self.param.pruned_outputs
-        reconstructed_tensor = torch.zeros((output.shape[0], len(max_outputs), output.shape[2], output.shape[3]))
-        valid_columns = list(max_outputs - pruned_outputs)
-        reconstructed_tensor[:, valid_columns, :, :] = output
+        reconstructed_tensor = torch.zeros(sizes)
+        reconstructed_tensor[indices] = output
         return reconstructed_tensor

From 16ba20507a7a8fcb62d88c719eceab578e09e210 Mon Sep 17 00:00:00 2001
From: Karen Zhou <kazhou@fb.com>
Date: Tue, 24 Aug 2021 10:17:28 -0700
Subject: [PATCH 174/530] [pruner] amend base pruner API to match base
 sparsifier (#63178)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63178

Update base pruner API to match base sparsifier API as defined in D28970960 / PR58955

Changes include:
- `enable_mask_update = True` in `__init__`
- `prepare` takes model and config instead of constructor
- convert functionality renamed to `squash_mask`, `convert` method call now raises Error
- `activation_handles` ad `bias_handles` initialized in `_prepare` instead of constructor
ghstack-source-id: 136467595

Test Plan:
Function names updates according to changes

`buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner`

https://pxl.cl/1MTgH

TODO will need to modify `fbcode/scripts/kazhou/fusion_tests.py` to use new API

Reviewed By: z-a-f

Differential Revision: D30287179

fbshipit-source-id: d4727bea1873b500f2d4bb784db26d532bf26cce
---
 test/ao/sparsity/test_pruner.py               |  61 +++++-----
 torch/ao/sparsity/__init__.py                 |   2 +
 .../experimental/pruner/base_pruner.py        | 109 +++---------------
 .../ao/sparsity/sparsifier/base_sparsifier.py |  24 +---
 torch/ao/sparsity/sparsifier/utils.py         |  18 +++
 5 files changed, 66 insertions(+), 148 deletions(-)

diff --git a/test/ao/sparsity/test_pruner.py b/test/ao/sparsity/test_pruner.py
index 8f5f6dd19abbe..c358df6ac95ae 100644
--- a/test/ao/sparsity/test_pruner.py
+++ b/test/ao/sparsity/test_pruner.py
@@ -161,7 +161,7 @@ def _check_pruner_prepared(self, model, pruner, device):
             # Assume that this is the 1st/only parametrization
             assert type(module.parametrizations.weight[0]) == PruningParametrization
 
-    def _check_pruner_converted(self, model, pruner, device):
+    def _check_pruner_mask_squashed(self, model, pruner, device):
         for g in pruner.module_groups:
             module = g['module']
             assert module.weight.device == device
@@ -184,16 +184,18 @@ def _test_constructor_on_device(self, model, device):
         self.assertRaisesRegex(TypeError, 'with abstract methods update_mask',
                                BasePruner)
         model = model.to(device)
-        pruner = SimplePruner(model, None, None)
+        pruner = SimplePruner(None)
+        pruner.prepare(model, None)
         for g in pruner.module_groups:
             module = g['module']
             assert module.weight.device == device
         assert len(pruner.module_groups) == 2
         pruner.step()
         # Can instantiate the model with configs
-        pruner = SimplePruner(model, [model.linear], {'test': 3})
+        pruner = SimplePruner({'test': 3})
+        pruner.prepare(model, [model.linear])
         assert len(pruner.module_groups) == 1
-        assert pruner.module_groups[0]['path'] == 'linear'
+        assert pruner.module_groups[0]['fqn'] == 'linear'
         assert 'test' in pruner.module_groups[0]
         assert pruner.module_groups[0]['test'] == 3
 
@@ -205,8 +207,8 @@ def test_constructor(self):
     def _test_prepare_linear_on_device(self, model, device):
         model = model.to(device)
         x = torch.ones(128, 16)
-        pruner = SimplePruner(model, None, None)
-        pruner.prepare()
+        pruner = SimplePruner(None)
+        pruner.prepare(model, None)
         self._check_pruner_prepared(model, pruner, device)
         assert model(x).shape == (128, 16)
 
@@ -219,8 +221,8 @@ def test_prepare_linear(self):
     def _test_prepare_conv2d_on_device(self, model, device):
         model = model.to(device)
         x = torch.ones((1, 1, 28, 28))
-        pruner = SimplePruner(model, None, None)
-        pruner.prepare()
+        pruner = SimplePruner(None)
+        pruner.prepare(model, None)
         self._check_pruner_prepared(model, pruner, device)
         assert model(x).shape == (1, 64, 24, 24)
 
@@ -230,51 +232,49 @@ def test_prepare_conv2d(self):
             for model in models:
                 self._test_prepare_conv2d_on_device(model, torch.device(device))
 
-    def _test_convert_linear_on_device(self, model, device):
+    def _test_squash_mask_linear_on_device(self, model, device):
         model = model.to(device)
         x = torch.ones(128, 16)
-        pruner = SimplePruner(model, None, None)
-        pruner.prepare()
-        pruner.convert()
-        self._check_pruner_converted(model, pruner, device)
+        pruner = SimplePruner(None)
+        pruner.prepare(model, None)
+        pruner.squash_mask()
+        self._check_pruner_mask_squashed(model, pruner, device)
         assert model(x).shape == (128, 16)
 
-    def test_convert_linear(self):
+    def test_squash_mask_linear(self):
         models = [Linear(), LinearB()]  # without and with bias
         for device in DEVICES:
             for model in models:
-                self._test_convert_linear_on_device(model, torch.device(device))
+                self._test_squash_mask_linear_on_device(model, torch.device(device))
 
-    def _test_convert_conv2d_on_device(self, model, device):
+    def _test_squash_mask_conv2d_on_device(self, model, device):
         model = model.to(device)
         x = torch.ones((1, 1, 28, 28))
-        pruner = SimplePruner(model, None, None)
-        pruner.prepare()
-        pruner.convert()
-        self._check_pruner_converted(model, pruner, device)
+        pruner = SimplePruner(None)
+        pruner.prepare(model, None)
+        pruner.squash_mask()
+        self._check_pruner_mask_squashed(model, pruner, device)
         assert model(x).shape == (1, 64, 24, 24)
 
-    def test_convert_conv2d(self):
+    def test_squash_mask_conv2d(self):
         models = [Conv2dA(), Conv2dB(), Conv2dC()]
         for device in DEVICES:
             for model in models:
-                self._test_convert_conv2d_on_device(model, torch.device(device))
+                self._test_squash_mask_conv2d_on_device(model, torch.device(device))
 
     def _test_step_linear_on_device(self, model, is_basic, device):
         model = model.to(device)
         if is_basic:
             x = torch.ones(16, 16)
-            pruner = SimplePruner(model, None, None)
-            pruner.prepare()
-            pruner.enable_mask_update = True
+            pruner = SimplePruner(None)
+            pruner.prepare(model, None)
             self._check_pruner_valid_before_step(model, pruner, device)
             pruner.step()
             self._check_pruner_valid_after_step(model, pruner, {1}, device)
         else:
             x = torch.ones(7, 7)
-            pruner = MultiplePruner(model, None, None)
-            pruner.prepare()
-            pruner.enable_mask_update = True
+            pruner = MultiplePruner(None)
+            pruner.prepare(model, None)
             self._check_pruner_valid_before_step(model, pruner, device)
             pruner.step()
             self._check_pruner_valid_after_step(model, pruner, {1, 2}, device)
@@ -291,9 +291,8 @@ def test_step_linear(self):
     def _test_step_conv2d_on_device(self, model, device):
         model = model.to(device)
         x = torch.ones((1, 1, 28, 28))
-        pruner = SimplePruner(model, None, None)
-        pruner.prepare()
-        pruner.enable_mask_update = True
+        pruner = SimplePruner(None)
+        pruner.prepare(model, None)
         self._check_pruner_valid_before_step(model, pruner, device)
         pruner.step()
         self._check_pruner_valid_after_step(model, pruner, {1}, device)
diff --git a/torch/ao/sparsity/__init__.py b/torch/ao/sparsity/__init__.py
index 9ba05f2cde5ab..55b8d7059c9ae 100644
--- a/torch/ao/sparsity/__init__.py
+++ b/torch/ao/sparsity/__init__.py
@@ -12,6 +12,8 @@
 
 # Parametrizations
 from .sparsifier.utils import FakeSparsity
+from .sparsifier.utils import module_to_fqn
+from .sparsifier.utils import fqn_to_module
 
 # === Experimental ===
 
diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py
index 92e1945666262..d89b3cc86d550 100644
--- a/torch/ao/sparsity/experimental/pruner/base_pruner.py
+++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py
@@ -1,6 +1,5 @@
 
 import abc
-import copy
 
 import torch
 from torch import nn
@@ -10,31 +9,15 @@
 
 from .parametrization import PruningParametrization, ActivationReconstruction
 
+from torch.ao.sparsity import BaseSparsifier, fqn_to_module
+
 SUPPORTED_MODULES = {
     nn.Linear,
     nn.Conv2d
 }
 
-def _module_to_path(model, layer, prefix=''):
-    for name, child in model.named_children():
-        new_name = prefix + '.' + name
-        if child is layer:
-            return new_name
-        child_path = _module_to_path(child, layer, prefix=new_name)
-        if child_path is not None:
-            return child_path
-    return None
-
-def _path_to_module(model, path):
-    path = path.split('.')
-    for name in path:
-        model = getattr(model, name, None)
-        if model is None:
-            return None
-    return model
-
-
-class BasePruner(abc.ABC):
+
+class BasePruner(BaseSparsifier):
     r"""Base class for all pruners.
 
     Abstract methods that need to be implemented:
@@ -53,66 +36,8 @@ class BasePruner(abc.ABC):
             be updated.
 
     """
-    def __init__(self, model, config, defaults):
-        super().__init__()
-        self.config = config
-        self.defaults = defaults
-        if self.defaults is None:
-            self.defaults = dict()
-
-        self.module_groups = []
-        self.enable_mask_update = False
-        self.activation_handles = []
-        self.bias_handles = []
-
-        self.model = model
-        # If no config -- try getting all the supported layers
-        if self.config is None:
-            # Add all models to the config
-            self.config = []
-            stack = [model]
-            while stack:
-                module = stack.pop()
-                for name, child in module.named_children():
-                    if type(child) in SUPPORTED_MODULES:
-                        self.config.append(child)
-                    else:
-                        stack.append(child)
-
-        for module_config in self.config:
-            if isinstance(module_config, nn.Module):
-                module_config = {'module': module_config}
-            local_args = copy.deepcopy(self.defaults)
-            local_args.update(module_config)
-            module = local_args['module']
-            module_path = _module_to_path(self.model, module)
-            if module_path and module_path[0] == '.':
-                module_path = module_path[1:]
-            local_args['path'] = module_path
-            self.module_groups.append(local_args)
-
-    def __getstate__(self):
-        return {
-            'defaults': self.defaults,
-            'module_groups': self.module_groups,
-        }
-
-    def __setstate__(self, state):
-        self.__dict__.update(state)
-
-    def __repr__(self):
-        format_string = self.__class__.__name__ + ' ('
-        for i, sparse_args in enumerate(self.module_groups):
-            module = sparse_args['module']
-            format_string += '\n'
-            format_string += f'\tModule Group {i}\n'
-            format_string += f'\t    module: {module}\n'
-            for key in sorted(sparse_args.keys()):
-                if key == 'module':
-                    continue
-                format_string += f'\t    {key}: {sparse_args[key]}\n'
-        format_string += ')'
-        return format_string
+    def __init__(self, defaults):
+        super().__init__(defaults)
 
     def bias_hook(self, module, input, output):
         if getattr(module, '_bias', None) is not None:
@@ -122,12 +47,15 @@ def bias_hook(self, module, input, output):
             output += bias
         return output
 
-    def prepare(self, use_path=False, *args, **kwargs):
+    def _prepare(self, use_path=False, *args, **kwargs):
         r"""Adds mask parametrization to the layer weight
         """
+        self.activation_handles = []  # store removable hook handles
+        self.bias_handles = []
+
         for config in self.module_groups:
             if use_path:
-                module = _path_to_module(self.model, config['path'])
+                module = fqn_to_module(self.model, config['fqn'])
             else:
                 module = config['module']
 
@@ -152,10 +80,10 @@ def prepare(self, use_path=False, *args, **kwargs):
                 module.bias = None
             self.bias_handles.append(module.register_forward_hook(self.bias_hook))
 
-    def convert(self, use_path=False, *args, **kwargs):
+    def squash_mask(self, use_path=False, *args, **kwargs):
         for config in self.module_groups:
             if use_path:
-                module = _path_to_module(self.model, config['path'])
+                module = fqn_to_module(self.model, config['fqn'])
             else:
                 module = config['module']
             parametrize.remove_parametrizations(module, 'weight',
@@ -166,17 +94,6 @@ def convert(self, use_path=False, *args, **kwargs):
                 del module._buffers['mask']
             delattr(module, 'mask')
 
-    def step(self, use_path=True):
-        if not self.enable_mask_update:
-            return
-        with torch.no_grad():
-            for config in self.module_groups:
-                if use_path:
-                    module = _path_to_module(self.model, config['path'])
-                else:
-                    module = config['module']
-                self.update_mask(module, **config)
-
     @abc.abstractmethod
     def update_mask(self, layer, **kwargs):
         pass
diff --git a/torch/ao/sparsity/sparsifier/base_sparsifier.py b/torch/ao/sparsity/sparsifier/base_sparsifier.py
index d6bc7d75248cf..1d01b71daae25 100644
--- a/torch/ao/sparsity/sparsifier/base_sparsifier.py
+++ b/torch/ao/sparsity/sparsifier/base_sparsifier.py
@@ -8,30 +8,12 @@
 from torch import nn
 from torch.nn.utils import parametrize
 
-from .utils import FakeSparsity
+from .utils import FakeSparsity, module_to_fqn, fqn_to_module
 
 SUPPORTED_MODULES = {
     nn.Linear
 }
 
-def _module_to_fqn(model, layer, prefix=''):
-    for name, child in model.named_children():
-        new_name = prefix + '.' + name
-        if child is layer:
-            return new_name
-        child_path = _module_to_fqn(child, layer, prefix=new_name)
-        if child_path is not None:
-            return child_path
-    return None
-
-def _fqn_to_module(model, path):
-    path = path.split('.')
-    for name in path:
-        model = getattr(model, name, None)
-        if model is None:
-            return None
-    return model
-
 
 class BaseSparsifier(abc.ABC):
     r"""Base class for all sparsifiers.
@@ -136,7 +118,7 @@ def load_state_dict(self, state_dict, strict=True):
         module_groups = copy.deepcopy(state_dict['module_groups'])
         states = state_dict['state']
         for fqn, s in states.items():
-            layer = _fqn_to_module(self.model, fqn)
+            layer = fqn_to_module(self.model, fqn)
             if strict and layer is None:
                 raise RuntimeError(f'Error loading {fqn} into the model')
 
@@ -186,7 +168,7 @@ def prepare(self, model, config):
             local_args = copy.deepcopy(self.defaults)
             local_args.update(module_config)
             module = local_args['module']
-            module_fqn = _module_to_fqn(model, module)
+            module_fqn = module_to_fqn(model, module)
             if module_fqn and module_fqn[0] == '.':
                 module_fqn = module_fqn[1:]
             local_args['fqn'] = module_fqn
diff --git a/torch/ao/sparsity/sparsifier/utils.py b/torch/ao/sparsity/sparsifier/utils.py
index 6271a8d502f0d..3124b1b767b0f 100644
--- a/torch/ao/sparsity/sparsifier/utils.py
+++ b/torch/ao/sparsity/sparsifier/utils.py
@@ -1,5 +1,23 @@
 from torch import nn
 
+def module_to_fqn(model, layer, prefix=''):
+    for name, child in model.named_children():
+        new_name = prefix + '.' + name
+        if child is layer:
+            return new_name
+        child_path = module_to_fqn(child, layer, prefix=new_name)
+        if child_path is not None:
+            return child_path
+    return None
+
+def fqn_to_module(model, path):
+    path = path.split('.')
+    for name in path:
+        model = getattr(model, name, None)
+        if model is None:
+            return None
+    return model
+
 # Parametrizations
 class FakeSparsity(nn.Module):
     r"""Parametrization for the weights. Should be attached to the 'weight' or

From 1256dcd50967b18c2ca335662558e77aeefe4f13 Mon Sep 17 00:00:00 2001
From: Karen Zhou <kazhou@fb.com>
Date: Tue, 24 Aug 2021 10:17:28 -0700
Subject: [PATCH 175/530] [pruner] modify base pruner to prune bias by default
 (#63202)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63202

By default, the prune will also prune biases, such that the whole output channel is removed. The user can manually set `also_prune_bias` to False when calling `prepare` if they don't want the bias to be pruned.
ghstack-source-id: 136466671

Test Plan:
`buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner`

https://pxl.cl/1MV32

modify `fusion_tests` according to API change
`buck test mode/opt //scripts/kazhou:fusion_tests`

https://pxl.cl/1NbKz

Reviewed By: z-a-f

Differential Revision: D30294494

fbshipit-source-id: c84655648bee0035559195ca855b98fb7edaa134
---
 torch/ao/sparsity/__init__.py                 |  1 +
 .../experimental/pruner/base_pruner.py        | 22 +++++--------------
 .../experimental/pruner/parametrization.py    | 22 +++++++++++++++++++
 3 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/torch/ao/sparsity/__init__.py b/torch/ao/sparsity/__init__.py
index 55b8d7059c9ae..06854a42cf9ce 100644
--- a/torch/ao/sparsity/__init__.py
+++ b/torch/ao/sparsity/__init__.py
@@ -20,6 +20,7 @@
 # Parametrizations
 from .experimental.pruner.parametrization import PruningParametrization
 from .experimental.pruner.parametrization import ActivationReconstruction
+from .experimental.pruner.parametrization import BiasHook
 
 # Pruner
 from .experimental.pruner.base_pruner import BasePruner
diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py
index d89b3cc86d550..a8a7b69141be3 100644
--- a/torch/ao/sparsity/experimental/pruner/base_pruner.py
+++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py
@@ -7,7 +7,7 @@
 
 from torch.nn.modules.container import ModuleDict, ModuleList
 
-from .parametrization import PruningParametrization, ActivationReconstruction
+from .parametrization import PruningParametrization, ActivationReconstruction, BiasHook
 
 from torch.ao.sparsity import BaseSparsifier, fqn_to_module
 
@@ -26,26 +26,16 @@ class BasePruner(BaseSparsifier):
         `module_groups`.
 
     Args:
-        - model [nn.Module]: model to configure. The model itself is not saved
-            but used for the state_dict saving / loading.
-        - config [list]: configuration elements could either be instances of
-            nn.Module or dict maps. The dicts must have a key 'module' with the
-            value being an instance of a nn.Module.
         - defaults [dict]: default configurations will be attached to the
             configuration. Only the keys that don't exist in the `config` will
             be updated.
+        - also_prune_bias [bool]: whether to prune bias in addition to weights (to prune full output channel)
+            or not; default=True.
 
     """
-    def __init__(self, defaults):
+    def __init__(self, defaults, also_prune_bias=True):
         super().__init__(defaults)
-
-    def bias_hook(self, module, input, output):
-        if getattr(module, '_bias', None) is not None:
-            idx = [1] * len(output.shape)
-            idx[1] = output.shape[1]
-            bias = module._bias.reshape(idx)
-            output += bias
-        return output
+        self.prune_bias = also_prune_bias
 
     def _prepare(self, use_path=False, *args, **kwargs):
         r"""Adds mask parametrization to the layer weight
@@ -78,7 +68,7 @@ def _prepare(self, use_path=False, *args, **kwargs):
             if module.bias is not None:
                 module.register_parameter('_bias', nn.Parameter(module.bias.detach()))
                 module.bias = None
-            self.bias_handles.append(module.register_forward_hook(self.bias_hook))
+            self.bias_handles.append(module.register_forward_hook(BiasHook(module.parametrizations.weight[0], self.prune_bias)))
 
     def squash_mask(self, use_path=False, *args, **kwargs):
         for config in self.module_groups:
diff --git a/torch/ao/sparsity/experimental/pruner/parametrization.py b/torch/ao/sparsity/experimental/pruner/parametrization.py
index d4bebb27725cd..696b16e1edccc 100644
--- a/torch/ao/sparsity/experimental/pruner/parametrization.py
+++ b/torch/ao/sparsity/experimental/pruner/parametrization.py
@@ -36,3 +36,25 @@ def __call__(self, module, input, output):
         reconstructed_tensor = torch.zeros(sizes)
         reconstructed_tensor[indices] = output
         return reconstructed_tensor
+
+
+class BiasHook:
+    def __init__(self, parametrization, prune_bias):
+        self.param = parametrization
+        self.prune_bias = prune_bias
+
+    def __call__(self, module, input, output):
+        pruned_outputs = self.param.pruned_outputs
+
+        if getattr(module, '_bias', None) is not None:
+            bias = module._bias.data
+            if self.prune_bias:
+                bias[list(pruned_outputs)] = 0
+
+            # reshape bias to broadcast over output dimensions
+            idx = [1] * len(output.shape)
+            idx[1] = -1
+            bias = bias.reshape(idx)
+
+            output += bias
+        return output

From 14d4723abde40375118afd097d8e47a99145c222 Mon Sep 17 00:00:00 2001
From: mingfeima <mingfei.ma@intel.com>
Date: Tue, 24 Aug 2021 10:30:18 -0700
Subject: [PATCH 176/530] add bf16 support for bucketize (#55588)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/55588

Test Plan: Imported from OSS

Reviewed By: bdhirsh

Differential Revision: D28836796

Pulled By: VitalyFedyunin

fbshipit-source-id: c9ae5b969c30a45473533be5f29bb497f8da5143
---
 aten/src/ATen/native/Bucketization.cpp |  4 ++--
 test/test_reductions.py                | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/Bucketization.cpp b/aten/src/ATen/native/Bucketization.cpp
index 7dc76a7577aa2..c11ce253f1d4a 100644
--- a/aten/src/ATen/native/Bucketization.cpp
+++ b/aten/src/ATen/native/Bucketization.cpp
@@ -74,12 +74,12 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens
 
 void dispatch(Tensor& result, const Tensor& input, const Tensor& boundaries, bool out_int32, bool right) {
   if (!out_int32) {
-    AT_DISPATCH_ALL_TYPES(input.scalar_type(), "searchsorted_out_cpu", [&] {
+    AT_DISPATCH_ALL_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "searchsorted_out_cpu", [&] {
       searchsorted_cpu_contiguous<scalar_t, int64_t>(result, input, boundaries, right);
     });
   }
   else {
-    AT_DISPATCH_ALL_TYPES(input.scalar_type(), "searchsorted_out_cpu", [&] {
+    AT_DISPATCH_ALL_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "searchsorted_out_cpu", [&] {
       searchsorted_cpu_contiguous<scalar_t, int>(result, input, boundaries, right);
     });
   }
diff --git a/test/test_reductions.py b/test/test_reductions.py
index f3f0d4c936451..1497ed6ad419d 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -1011,6 +1011,23 @@ def test_output_dtype(dtype, is_int32):
         test_output_dtype(torch.int32, False)
         test_output_dtype(torch.int64, True)
 
+        # scalar type bfloat16
+        if self.device_type == 'cpu':
+            def test_dtype_bfloat16(values_bf16=False, boundaries_bf16=False):
+                values_1d_float = values_1d.to(torch.float32)
+                boundaries = torch.tensor([0.9, 1, 2, 2, 3, 3, 4, 4.1, 9, 9], device=device, dtype=torch.float32)
+                if values_bf16:
+                    values_1d_float = values_1d_float.to(torch.bfloat16)
+                if boundaries_bf16:
+                    boundaries = boundaries.to(torch.bfloat16)
+                expected_result = torch.tensor([1, 2, 4, 6, 8, 8, 8, 8, 8], device=device, dtype=torch.int32)
+                self.assertEqual(torch.searchsorted(boundaries, values_1d_float, out_int32=True), expected_result)
+                self.assertEqual(torch.bucketize(values_1d_float, boundaries, out_int32=True), expected_result)
+
+            test_dtype_bfloat16(True, False)
+            test_dtype_bfloat16(False, True)
+            test_dtype_bfloat16(True, True)
+
     @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_complex=False))
     def test_nansum(self, device, dtype):
         args = product(

From e6dc7bc61b106427bb44bb6822451369dfab0eda Mon Sep 17 00:00:00 2001
From: peterjc123 <peterghost86@gmail.com>
Date: Tue, 24 Aug 2021 10:44:45 -0700
Subject: [PATCH 177/530] Subprocess encoding fixes for cpp extension (#63756)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63584

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63756

Reviewed By: bdhirsh

Differential Revision: D30485046

Pulled By: ezyang

fbshipit-source-id: 4f0ac383da4e8843e2a602dceae85f389d7434ee
---
 torch/utils/cpp_extension.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index b313423426caa..bb0a85982c665 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -37,6 +37,8 @@
 BUILD_SPLIT_CUDA = os.getenv('BUILD_SPLIT_CUDA') or (os.path.exists(os.path.join(
     TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cu{CLIB_EXT}')) and os.path.exists(os.path.join(TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cpp{CLIB_EXT}')))
 
+SUBPROCESS_DECODE_ARGS = ('oem',) if IS_WINDOWS else ()
+
 # Taken directly from python stdlib < 3.9
 # See https://github.com/pytorch/pytorch/issues/48617
 def _nt_quote_args(args: Optional[List[str]]) -> List[str]:
@@ -60,7 +62,7 @@ def _find_cuda_home() -> Optional[str]:
             which = 'where' if IS_WINDOWS else 'which'
             with open(os.devnull, 'w') as devnull:
                 nvcc = subprocess.check_output([which, 'nvcc'],
-                                               stderr=devnull).decode().rstrip('\r\n')
+                                               stderr=devnull).decode(*SUBPROCESS_DECODE_ARGS).rstrip('\r\n')
                 cuda_home = os.path.dirname(os.path.dirname(nvcc))
         except Exception:
             # Guess #3
@@ -90,7 +92,7 @@ def _find_rocm_home() -> Optional[str]:
                 ["which hipcc | xargs readlink -f"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
             hipcc, _ = pipe_hipcc.communicate()
             # this will be either <ROCM_HOME>/hip/bin/hipcc or <ROCM_HOME>/bin/hipcc
-            rocm_home = os.path.dirname(os.path.dirname(hipcc.decode().rstrip('\r\n')))
+            rocm_home = os.path.dirname(os.path.dirname(hipcc.decode(*SUBPROCESS_DECODE_ARGS).rstrip('\r\n')))
             if os.path.basename(rocm_home) == 'hip':
                 rocm_home = os.path.dirname(rocm_home)
         except Exception:
@@ -251,12 +253,12 @@ def check_compiler_ok_for_platform(compiler: str) -> bool:
         return True
     which = subprocess.check_output(['which', compiler], stderr=subprocess.STDOUT)
     # Use os.path.realpath to resolve any symlinks, in particular from 'c++' to e.g. 'g++'.
-    compiler_path = os.path.realpath(which.decode().strip())
+    compiler_path = os.path.realpath(which.decode(*SUBPROCESS_DECODE_ARGS).strip())
     # Check the compiler name
     if any(name in compiler_path for name in _accepted_compilers_for_platform()):
         return True
     # If ccache is used the compiler path is /usr/bin/ccache. Check by -v flag.
-    version_string = subprocess.check_output([compiler, '-v'], stderr=subprocess.STDOUT).decode()
+    version_string = subprocess.check_output([compiler, '-v'], stderr=subprocess.STDOUT).decode(*SUBPROCESS_DECODE_ARGS)
     if sys.platform.startswith('linux'):
         # Check for 'gcc' or 'g++'
         pattern = re.compile("^COLLECT_GCC=(.*)$", re.MULTILINE)
@@ -303,11 +305,11 @@ def check_compiler_abi_compatibility(compiler) -> bool:
         if sys.platform.startswith('linux'):
             minimum_required_version = MINIMUM_GCC_VERSION
             versionstr = subprocess.check_output([compiler, '-dumpfullversion', '-dumpversion'])
-            version = versionstr.decode().strip().split('.')
+            version = versionstr.decode(*SUBPROCESS_DECODE_ARGS).strip().split('.')
         else:
             minimum_required_version = MINIMUM_MSVC_VERSION
             compiler_info = subprocess.check_output(compiler, stderr=subprocess.STDOUT)
-            match = re.search(r'(\d+)\.(\d+)\.(\d+)', compiler_info.decode().strip())
+            match = re.search(r'(\d+)\.(\d+)\.(\d+)', compiler_info.decode(*SUBPROCESS_DECODE_ARGS).strip())
             version = (0, 0, 0) if match is None else match.groups()
     except Exception:
         _, error, _ = sys.exc_info()
@@ -767,7 +769,7 @@ def _check_abi(self):
     def _check_cuda_version(self):
         if CUDA_HOME:
             nvcc = os.path.join(CUDA_HOME, 'bin', 'nvcc')
-            cuda_version_str = subprocess.check_output([nvcc, '--version']).strip().decode()
+            cuda_version_str = subprocess.check_output([nvcc, '--version']).strip().decode(*SUBPROCESS_DECODE_ARGS)
             cuda_version = re.search(r'release (\d+[.]\d+)', cuda_version_str)
             if cuda_version is not None:
                 cuda_str_version = cuda_version.group(1)
@@ -1727,7 +1729,7 @@ def _run_ninja_build(build_directory: str, verbose: bool, error_prefix: str) ->
         # `error` is a CalledProcessError (which has an `ouput`) attribute, but
         # mypy thinks it's Optional[BaseException] and doesn't narrow
         if hasattr(error, 'output') and error.output:  # type: ignore[union-attr]
-            message += f": {error.output.decode()}"  # type: ignore[union-attr]
+            message += f": {error.output.decode(*SUBPROCESS_DECODE_ARGS)}"  # type: ignore[union-attr]
         raise RuntimeError(message) from e
 
 
@@ -1996,7 +1998,7 @@ def sanitize_flags(flags):
         link_rule = ['rule link']
         if IS_WINDOWS:
             cl_paths = subprocess.check_output(['where',
-                                                'cl']).decode().split('\r\n')
+                                                'cl']).decode(*SUBPROCESS_DECODE_ARGS).split('\r\n')
             if len(cl_paths) >= 1:
                 cl_path = os.path.dirname(cl_paths[0]).replace(':', '$:')
             else:

From 73431449b357de30ccb5a775c7395f343cb24d73 Mon Sep 17 00:00:00 2001
From: Yi Zhang <zhanyi@microsoft.com>
Date: Tue, 24 Aug 2021 10:50:57 -0700
Subject: [PATCH 178/530] update readme and contributing.md (#63843)

Summary:
1. In fact, Visual Studio isn't supported as CMAKE generator
2. I was asked many times why there's error as 'Could NOT find OpenMP'
3. Add Newly added Best Practices link in contributing.md

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63843

Reviewed By: seemethere, heitorschueroff

Differential Revision: D30514095

Pulled By: janeyx99

fbshipit-source-id: 76715a1d8c049122546e5a7778cafe54e4dfd5d6
---
 CONTRIBUTING.md | 1 +
 README.md       | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index baafcefdc59fe..7cf3aecabd7c1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -197,6 +197,7 @@ with `brew install cmake` if you are developing on MacOS or Linux system.
     Could not find .../pytorch/third_party/pybind11/CMakeLists.txt
     ```
     remove any `submodule.*` settings in your local git config (`.git/config` of your pytorch repo) and try again.
+* If you're a Windows contributor, please check out [Best Practices](https://github.com/pytorch/pytorch/wiki/Best-Practices-to-Edit-and-Compile-Pytorch-Source-Code-On-Windows).
 
 ## Nightly Checkout & Pull
 
diff --git a/README.md b/README.md
index 9b2a854ef3557..ed793fb8874e6 100644
--- a/README.md
+++ b/README.md
@@ -291,9 +291,10 @@ You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob
 ```cmd
 cmd
 
-:: [Optional] If you want to build with the VS 2017 generator for old CUDA and PyTorch, please change the value in the next line to `Visual Studio 15 2017`.
-:: Note: This value is useless if Ninja is detected. However, you can force that by using `set USE_NINJA=OFF`.
-set CMAKE_GENERATOR=Visual Studio 16 2019
+:: Set the environment variables after you have downloaded and upzipped the mkl package,
+:: else CMake would throw error as `Could NOT find OpenMP`.
+set CMAKE_INCLUDE_PATH={Your directory}\mkl\include
+set LIB={Your directory}\mkl\lib;%LIB%
 
 :: Read the content in the previous section carefully before you proceed.
 :: [Optional] If you want to override the underlying toolset used by Ninja and Visual Studio with CUDA, please run the following script block.

From d08a36f831cbcb4516fc1b68e3e3deff8ab45aba Mon Sep 17 00:00:00 2001
From: Aayush Prakash <aayushp@fb.com>
Date: Tue, 24 Aug 2021 11:19:34 -0700
Subject: [PATCH 179/530] Removing tensor.data usage in utils with tensor set_
 method (#63867)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63867

When updating the model parameter, updating `parameter.data` is no longer recommended, because this `data` field will be deprecated in the future.

The replacement is `tensor.set_`.

ghstack-source-id: 136531233

Test Plan: buck test mode/dev-nosan //caffe2/test/distributed:distributed_nccl_spawn -- test_periodic_model_averager

Reviewed By: SciPioneer

Differential Revision: D30513613

fbshipit-source-id: 402efb9c30fafc3f285bebc631639f656ceae585
---
 torch/distributed/algorithms/model_averaging/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/distributed/algorithms/model_averaging/utils.py b/torch/distributed/algorithms/model_averaging/utils.py
index 44ee422b9e92d..5d796e885d2ae 100644
--- a/torch/distributed/algorithms/model_averaging/utils.py
+++ b/torch/distributed/algorithms/model_averaging/utils.py
@@ -29,5 +29,6 @@ def average_parameters(
 
     offset = 0
     for p in params_it2:
-        p.data = flat_params[offset : offset + p.numel()].view_as(p)
+        with torch.no_grad():
+            p.set_(flat_params[offset : offset + p.numel()].view_as(p))
         offset += p.numel()

From 835dac0869fa155ab9bdb434a230f141dd5afad3 Mon Sep 17 00:00:00 2001
From: Bo Wang <bowangbj@fb.com>
Date: Tue, 24 Aug 2021 11:45:54 -0700
Subject: [PATCH 180/530] Merge common fields from TensorInitParams and
 ShardedTensorMetadata into TensorProperties (#63731)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63731
1) Follow up [PR/63378 last comment](https://github.com/pytorch/pytorch/pull/63378#discussion_r693143053)
2) Also updated the caller side (usage of ShardedTensorMetadta) in fbcode

Ref: [landing workflow 3](https://www.internalfb.com/intern/wiki/PyTorch/PyTorchDev/Workflow/Landing/#landing-your-prs-from-gi-1)

Test Plan:
Imported from OSS

OSS: (pytorch).. $ python test/distributed/_sharded_tensor/test_sharded_tensor.py --v
FB:  fbcode $ buck test mode/dev //aiplatform/modelstore/checkpointing/pyper/tests:checkpoint_utils_test

Reviewed By: wanchaol, heitorschueroff

Differential Revision: D30472281

fbshipit-source-id: 727fb0e7f10eab4eb7a10476194e9008f2ac1fb5
---
 .../_sharded_tensor/test_sharded_tensor.py    |  65 ++++---
 torch/distributed/_sharded_tensor/__init__.py |  62 +++----
 torch/distributed/_sharded_tensor/api.py      | 161 +++++++++---------
 3 files changed, 154 insertions(+), 134 deletions(-)

diff --git a/test/distributed/_sharded_tensor/test_sharded_tensor.py b/test/distributed/_sharded_tensor/test_sharded_tensor.py
index 26a176b1455c1..6c03d9fdf631c 100644
--- a/test/distributed/_sharded_tensor/test_sharded_tensor.py
+++ b/test/distributed/_sharded_tensor/test_sharded_tensor.py
@@ -19,6 +19,7 @@
 from torch.distributed._sharded_tensor.api import (
     CreateOp,
     TensorInitParams,
+    TensorProperties,
     _create_tensor_from_params,
 )
 from torch.testing._internal.common_distributed import (
@@ -125,13 +126,14 @@ def wrapper(self):
 class TestCreateTensorFromParams(TestCase):
     @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
     def test_empty(self):
-        tensor_init_params = TensorInitParams(
-            create_op=CreateOp.EMPTY,
+        tensor_properties = TensorProperties(
             dtype=torch.double,
             layout=torch.strided,
             requires_grad=False,
             pin_memory=False,
             memory_format=torch.contiguous_format, )
+        tensor_init_params = TensorInitParams(create_op=CreateOp.EMPTY,
+                                              tensor_properties=tensor_properties)
         local_device = torch.device('cuda:0')
         local_tensor = _create_tensor_from_params(
             5, 10, local_device=local_device, tensor_init_params=tensor_init_params)
@@ -142,13 +144,14 @@ def test_empty(self):
 
     @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
     def test_ones(self):
-        tensor_init_params = TensorInitParams(
-            create_op=CreateOp.ONES,
+        tensor_properties = TensorProperties(
             dtype=torch.double,
             layout=torch.strided,
             requires_grad=False,
             pin_memory=False,
             memory_format=torch.contiguous_format, )
+        tensor_init_params = TensorInitParams(
+            create_op=CreateOp.ONES, tensor_properties=tensor_properties)
         local_device = torch.device('cuda:0')
         local_tensor = _create_tensor_from_params(
             5, 10, local_device=local_device, tensor_init_params=tensor_init_params)
@@ -1267,15 +1270,18 @@ def test_init_from_local_shards(self):
 
         local_shards = [_sharded_tensor.Shard(torch.randn(5, 5, device=f"cuda:{self.rank}"), local_shard_metadata)]
 
-        sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata(
-            shards_metadata=shards_metadata,
-            size=torch.Size([10, 10]),
+        tensor_properties = TensorProperties(
             dtype=torch.get_default_dtype(),
             layout=torch.strided,
             requires_grad=False,
             memory_format=torch.contiguous_format,
             pin_memory=False,
         )
+        sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata(
+            shards_metadata=shards_metadata,
+            size=torch.Size([10, 10]),
+            tensor_properties=tensor_properties,
+        )
 
         sharded_tensor = _sharded_tensor.init_from_local_shards(local_shards, sharded_tensor_metadata, init_rrefs=True)
         self.assertEqual((10, 10), sharded_tensor.size())
@@ -1334,15 +1340,19 @@ def test_init_from_local_shards_new_group(self):
             local_shard_metadata = rank1_shard_metadata if self.rank == 1 else rank3_shard_metadata
             local_shards.append(_sharded_tensor.Shard(torch.randn(5, 5, device=f"cuda:{self.rank}"), local_shard_metadata))
 
-        sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata(
-            shards_metadata=shards_metadata,
-            size=torch.Size([10, 5]),
+        tensor_properties = TensorProperties(
             dtype=torch.get_default_dtype(),
             layout=torch.strided,
             requires_grad=False,
             memory_format=torch.contiguous_format,
             pin_memory=False,
         )
+
+        sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata(
+            shards_metadata=shards_metadata,
+            size=torch.Size([10, 5]),
+            tensor_properties=tensor_properties
+        )
         sharded_tensor = _sharded_tensor.init_from_local_shards(local_shards, sharded_tensor_metadata, new_pg, init_rrefs=True)
 
         if self.rank == 1 or self.rank == 3:
@@ -1403,15 +1413,18 @@ def test_init_from_local_shards_invalid_shards(self):
                     placement=f"rank:{r}/cuda:{r}"
                 ))
 
-        sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata(
-            shards_metadata=shards_metadata,
-            size=torch.Size([10, 10]),
+        tensor_properties = TensorProperties(
             dtype=torch.get_default_dtype(),
             layout=torch.strided,
             requires_grad=False,
             memory_format=torch.contiguous_format,
             pin_memory=False,
         )
+        sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata(
+            shards_metadata=shards_metadata,
+            size=torch.Size([10, 10]),
+            tensor_properties=tensor_properties
+        )
 
         empty_local_shards = []
         with self.assertRaisesRegex(RuntimeError, 'does not match number of local shards metadata'):
@@ -1435,7 +1448,7 @@ def test_init_from_local_shards_invalid_shards(self):
         wrong_dtype_shards = [
             _sharded_tensor.Shard(torch.ones(5, 5, device=f"cuda:{self.rank}", dtype=torch.int), local_shard_metadata)
         ]
-        with self.assertRaisesRegex(ValueError, 'Local shard tensor dtype does not match with sharded_tensor_metadata'):
+        with self.assertRaisesRegex(ValueError, 'Local shard tensor dtype does not match with tensor_properties!'):
             sharded_tensor = _sharded_tensor.init_from_local_shards(wrong_dtype_shards, sharded_tensor_metadata, init_rrefs=True)
 
         indices = [[0, 1, 1], [2, 0, 2]]
@@ -1445,21 +1458,21 @@ def test_init_from_local_shards_invalid_shards(self):
         wrong_layout_shards = [
             _sharded_tensor.Shard(sparse_tensor, local_shard_metadata)
         ]
-        with self.assertRaisesRegex(ValueError, 'Local shard tensor layout does not match with sharded_tensor_metadata'):
+        with self.assertRaisesRegex(ValueError, 'Local shard tensor layout does not match with tensor_properties!'):
             sharded_tensor = _sharded_tensor.init_from_local_shards(
                 wrong_layout_shards, sharded_tensor_metadata, init_rrefs=True)
 
         wrong_requires_grad_shards = [
             _sharded_tensor.Shard(torch.randn(5, 5, device=f"cuda:{self.rank}", requires_grad=True), local_shard_metadata)
         ]
-        with self.assertRaisesRegex(ValueError, 'Local shard tensor requires_grad does not match with sharded_tensor_metadata'):
+        with self.assertRaisesRegex(ValueError, 'Local shard tensor requires_grad does not match with tensor_properties!'):
             sharded_tensor = _sharded_tensor.init_from_local_shards(
                 wrong_requires_grad_shards, sharded_tensor_metadata, init_rrefs=True)
 
         wrong_pin_memory_shards = [
             _sharded_tensor.Shard(torch.randn(5, 5, pin_memory=True), local_shard_metadata)
         ]
-        with self.assertRaisesRegex(ValueError, 'Local shard tensor pin_memory does not match with sharded_tensor_metadata'):
+        with self.assertRaisesRegex(ValueError, 'Local shard tensor pin_memory does not match with tensor_properties!'):
             sharded_tensor = _sharded_tensor.init_from_local_shards(
                 wrong_pin_memory_shards, sharded_tensor_metadata, init_rrefs=True)
 
@@ -1492,15 +1505,18 @@ def test_init_from_local_shards_invalid_shards_overlap(self):
                     placement=f"rank:{r}/cuda:{r}"
                 ))
 
-        sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata(
-            shards_metadata=shards_metadata,
-            size=torch.Size([10, 10]),
+        tensor_properties = TensorProperties(
             dtype=torch.get_default_dtype(),
             layout=torch.strided,
             requires_grad=False,
             memory_format=torch.contiguous_format,
             pin_memory=False,
         )
+        sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata(
+            shards_metadata=shards_metadata,
+            size=torch.Size([10, 10]),
+            tensor_properties=tensor_properties
+        )
 
         local_shard_size = (5, 5) if self.rank != 0 else (6, 6)
 
@@ -1531,15 +1547,18 @@ def test_init_from_local_shards_invalid_shards_gaps(self):
                     placement=f"rank:{r}/cuda:{r}"
                 ))
 
-        sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata(
-            shards_metadata=shards_metadata,
-            size=torch.Size([10, 10]),
+        tensor_properties = TensorProperties(
             dtype=torch.get_default_dtype(),
             layout=torch.strided,
             requires_grad=False,
             memory_format=torch.contiguous_format,
             pin_memory=False,
         )
+        sharded_tensor_metadata = _sharded_tensor.ShardedTensorMetadata(
+            shards_metadata=shards_metadata,
+            size=torch.Size([10, 10]),
+            tensor_properties=tensor_properties
+        )
 
         local_shard_size = (5, 5) if self.rank != 0 else (4, 4)
 
diff --git a/torch/distributed/_sharded_tensor/__init__.py b/torch/distributed/_sharded_tensor/__init__.py
index ecb7ea1fed8c6..4cbdded8ba1c4 100644
--- a/torch/distributed/_sharded_tensor/__init__.py
+++ b/torch/distributed/_sharded_tensor/__init__.py
@@ -8,20 +8,20 @@
     ShardedTensor,
     ShardedTensorMetadata,
     TensorInitParams,
+    TensorProperties,
     load_with_process_group,
 )
 
 
-def empty(
-        sharding_spec: ShardingSpec,
-        *size,
-        dtype=None,
-        layout=torch.strided,
-        requires_grad=False,
-        pin_memory=False,
-        memory_format=torch.contiguous_format,
-        process_group=None,
-        init_rrefs=False):
+def empty(sharding_spec: ShardingSpec,
+          *size,
+          dtype=None,
+          layout=torch.strided,
+          requires_grad=False,
+          pin_memory=False,
+          memory_format=torch.contiguous_format,
+          process_group=None,
+          init_rrefs=False):
     """
     Creates an empty :class:`ShardedTensor`. Needs to be called on all ranks in an SPMD fashion.
 
@@ -52,9 +52,10 @@ def empty(
     Returns:
         A :class:`ShardedTensor` object on each rank
     """
-    tensor_init_params = TensorInitParams(create_op=CreateOp.EMPTY, dtype=dtype, layout=layout,
-                                          requires_grad=requires_grad,
-                                          pin_memory=pin_memory, memory_format=memory_format)
+    tensor_properties = TensorProperties(dtype=dtype, layout=layout,
+                                         requires_grad=requires_grad,
+                                         pin_memory=pin_memory, memory_format=memory_format, )
+    tensor_init_params = TensorInitParams(create_op=CreateOp.EMPTY, tensor_properties=tensor_properties, )
     return ShardedTensor(
         sharding_spec,
         *size,
@@ -63,16 +64,15 @@ def empty(
         init_rrefs=init_rrefs,
     )
 
-def ones(
-        sharding_spec: ShardingSpec,
-        *size,
-        dtype=None,
-        layout=torch.strided,
-        requires_grad=False,
-        pin_memory=False,
-        memory_format=torch.contiguous_format,
-        process_group=None,
-        init_rrefs=False):
+def ones(sharding_spec: ShardingSpec,
+         *size,
+         dtype=None,
+         layout=torch.strided,
+         requires_grad=False,
+         pin_memory=False,
+         memory_format=torch.contiguous_format,
+         process_group=None,
+         init_rrefs=False):
     """
     Creates a ones :class:`ShardedTensor`. Needs to be called on all ranks in an SPMD fashion.
 
@@ -101,9 +101,10 @@ def ones(
     Returns:
         A :class:`ShardedTensor` object on each rank
     """
-    tensor_init_params = TensorInitParams(create_op=CreateOp.ONES, dtype=dtype, layout=layout,
-                                          requires_grad=requires_grad,
-                                          pin_memory=pin_memory, memory_format=memory_format)
+    tensor_properties = TensorProperties(dtype=dtype, layout=layout,
+                                         requires_grad=requires_grad,
+                                         pin_memory=pin_memory, memory_format=memory_format, )
+    tensor_init_params = TensorInitParams(create_op=CreateOp.ONES, tensor_properties=tensor_properties)
     return ShardedTensor(
         sharding_spec,
         *size,
@@ -112,11 +113,10 @@ def ones(
         init_rrefs=init_rrefs,
     )
 
-def init_from_local_shards(
-        local_shards: List[Shard],
-        sharded_tensor_metadata: ShardedTensorMetadata,
-        process_group=None,
-        init_rrefs=False):
+def init_from_local_shards(local_shards: List[Shard],
+                           sharded_tensor_metadata: ShardedTensorMetadata,
+                           process_group=None,
+                           init_rrefs=False):
     """
     Creates an :class:`ShardedTensor` from local shards and the global metadata.
     Needs to be called on all ranks in an SPMD fashion.
diff --git a/torch/distributed/_sharded_tensor/api.py b/torch/distributed/_sharded_tensor/api.py
index 5f501b7689e4e..ae1a3a9f38844 100644
--- a/torch/distributed/_sharded_tensor/api.py
+++ b/torch/distributed/_sharded_tensor/api.py
@@ -59,6 +59,17 @@ class Shard(object):
     tensor: torch.Tensor
     metadata: ShardMetadata
 
+@dataclass
+class TensorProperties(object):
+    """ Properties used to create :class:`Tensor` """
+
+    # Regular tensor fields
+    dtype: torch.dtype = field(default=torch.get_default_dtype())
+    layout: torch.layout = field(default=torch.strided)
+    requires_grad: bool = False
+    memory_format: torch.memory_format = field(default=torch.contiguous_format)
+    pin_memory: bool = False
+
 @dataclass
 class ShardedTensorMetadata(object):
     """
@@ -71,50 +82,55 @@ class ShardedTensorMetadata(object):
     # Size of each dim of the overall Tensor.
     size: torch.Size = field(default=torch.Size([]))
 
-    # Regular tensor fields
-    dtype: torch.dtype = field(default=torch.get_default_dtype())
-    layout: torch.layout = field(default=torch.strided)
-    requires_grad: bool = False
-    memory_format: torch.memory_format = field(default=torch.contiguous_format)
-    pin_memory: bool = False
+    tensor_properties: TensorProperties = field(
+        default=TensorProperties(dtype=torch.get_default_dtype(),
+                                 layout=torch.strided,
+                                 requires_grad=False,
+                                 memory_format=torch.contiguous_format,
+                                 pin_memory=False))
 
     def __getstate__(self):
         # Since torch.memory_format cannot be pickled!
-        if self.memory_format == torch.contiguous_format:
+        memory_format = self.tensor_properties.memory_format
+        if memory_format == torch.contiguous_format:
             mem_format_encoding = 0
-        elif self.memory_format == torch.channels_last:
+        elif memory_format == torch.channels_last:
             mem_format_encoding = 1
-        elif self.memory_format == torch.preserve_format:
+        elif memory_format == torch.preserve_format:
             mem_format_encoding = 1
         else:
-            raise RuntimeError(f'Invalid torch.memory_format: {self.memory_format}')
+            raise RuntimeError(f'Invalid torch.memory_format: {memory_format}')
 
+        # Keep old seriazation to ensure backward compatibility
         return (
             self.shards_metadata,
             self.size,
-            self.dtype,
-            self.layout,
-            self.requires_grad,
+            self.tensor_properties.dtype,
+            self.tensor_properties.layout,
+            self.tensor_properties.requires_grad,
             mem_format_encoding,
-            self.pin_memory,
+            self.tensor_properties.pin_memory,
         )
 
     def __setstate__(
         self,
         state,
     ):
-        (self.shards_metadata, self.size, self.dtype, self.layout,
-            self.requires_grad, mem_format_encoding, self.pin_memory) = state
+        (self.shards_metadata, self.size, dtype, layout, requires_grad, mem_format_encoding, pin_memory) = state
 
         if mem_format_encoding == 0:
-            self.memory_format = torch.contiguous_format
+            memory_format = torch.contiguous_format
         elif mem_format_encoding == 1:
-            self.memory_format = torch.channels_last
+            memory_format = torch.channels_last
         elif mem_format_encoding == 2:
-            self.memory_format = torch.preserve_format
+            memory_format = torch.preserve_format
         else:
             raise RuntimeError(f'Invalid torch.memory_format encoding: {mem_format_encoding}')
 
+        self.tensor_properties = TensorProperties(
+            dtype=dtype, layout=layout, requires_grad=requires_grad,
+            memory_format=memory_format, pin_memory=pin_memory, )
+
 
 def _register_remote_shards(sharded_tensor_id: int, rrefs: List[rpc.RRef[Shard]], rpc_rank: int):
     with _sharded_tensor_lock:
@@ -134,15 +150,10 @@ class CreateOp(Enum):
 class TensorInitParams(object):
     """ Container for list of common params to create new local tensor. """
 
-    __slots__ = ['create_op', 'dtype', 'layout', 'requires_grad', 'pin_memory',
-                 'memory_format']
+    __slots__ = ['create_op', 'tensor_properties']
 
     create_op: CreateOp
-    dtype: torch.dtype
-    layout: torch.layout
-    requires_grad: bool
-    pin_memory: bool
-    memory_format: torch.memory_format
+    tensor_properties: TensorProperties
 
 
 class ShardedTensor(object):
@@ -188,13 +199,16 @@ def __init__(
         # _process_group, _local_shards, etc.
         self._prepare_init(process_group=process_group, init_rrefs=init_rrefs)
 
-        if tensor_init_params.dtype is None:
-            tensor_init_params.dtype = torch.get_default_dtype()
+        if tensor_init_params.tensor_properties is None:
+            raise ValueError('tensor_properties must not be None.')
+
+        if tensor_init_params.tensor_properties.dtype is None:
+            tensor_init_params.tensor_properties.dtype = torch.get_default_dtype()
 
-        if tensor_init_params.layout != torch.strided:
+        if tensor_init_params.tensor_properties.layout != torch.strided:
             raise ValueError('Only torch.strided layout is currently supported')
 
-        if tensor_init_params.memory_format != torch.contiguous_format:
+        if tensor_init_params.tensor_properties.memory_format != torch.contiguous_format:
             raise ValueError('Only torch.contiguous_format memory_format is currently supported')
 
         if len(size) == 1 and isinstance(size[0], collections.Sequence):
@@ -309,11 +323,12 @@ def _init_from_local_shards(
         init_rrefs=False,
     ):
         shards_metadata = sharded_tensor_metadata.shards_metadata
+        tensor_properties = sharded_tensor_metadata.tensor_properties
 
         if len(shards_metadata) == 0:
             raise ValueError("shards_metadata must not be empty!")
 
-        if sharded_tensor_metadata.layout != torch.strided:
+        if tensor_properties.layout != torch.strided:
             raise ValueError('Only torch.strided layout is currently supported')
 
         sharded_tensor = cls.__new__(cls)
@@ -354,11 +369,11 @@ def _init_from_local_shards(
             assert shard_meta in local_shard_metadatas, \
                 "local shard metadata not in sharded_tensor_metadata!"
 
-            if local_shard_tensor.layout != sharded_tensor_metadata.layout:
+            if local_shard_tensor.layout != tensor_properties.layout:
                 raise ValueError(
-                    f'Local shard tensor layout does not match with sharded_tensor_metadata! '
+                    f'Local shard tensor layout does not match with tensor_properties! '
                     f'local shard tensor layout: {local_shard_tensor.dtype}, '
-                    f'sharded_tensor_metadata layout: {sharded_tensor_metadata.layout}'
+                    f'tensor_properties layout: {tensor_properties.layout}'
                 )
 
             if not local_shard_tensor.is_contiguous():
@@ -371,11 +386,11 @@ def _init_from_local_shards(
                     f'local ShardMetadata shard lengths: {shard_meta.shard_lengths}'
                 )
 
-            if local_shard_tensor.is_pinned() != sharded_tensor_metadata.pin_memory:
+            if local_shard_tensor.is_pinned() != tensor_properties.pin_memory:
                 raise ValueError(
-                    f'Local shard tensor pin_memory does not match with sharded_tensor_metadata! '
+                    f'Local shard tensor pin_memory does not match with tensor_properties! '
                     f'local shard tensor pin_memory: {local_shard_tensor.is_pinned()}, '
-                    f'sharded_tensor_metadata pin_memory: {sharded_tensor_metadata.pin_memory}'
+                    f'tensor_properties pin_memory: {tensor_properties.pin_memory}'
                 )
 
             if local_shard_tensor.device != local_device:
@@ -385,18 +400,18 @@ def _init_from_local_shards(
                     f'local shard metadata placement device: {local_device}'
                 )
 
-            if local_shard_tensor.dtype != sharded_tensor_metadata.dtype:
+            if local_shard_tensor.dtype != tensor_properties.dtype:
                 raise ValueError(
-                    f'Local shard tensor dtype does not match with sharded_tensor_metadata! '
+                    f'Local shard tensor dtype does not match with tensor_properties! '
                     f'local shard tensor dtype: {local_shard_tensor.dtype}, '
-                    f'sharded_tensor_metadata dtype: {sharded_tensor_metadata.dtype}'
+                    f'tensor_properties dtype: {tensor_properties.dtype}'
                 )
 
-            if local_shard_tensor.requires_grad != sharded_tensor_metadata.requires_grad:
+            if local_shard_tensor.requires_grad != tensor_properties.requires_grad:
                 raise ValueError(
-                    f'Local shard tensor requires_grad does not match with sharded_tensor_metadata! '
+                    f'Local shard tensor requires_grad does not match with tensor_properties! '
                     f'local shard tensor requires_grad: {local_shard_tensor.requires_grad}, '
-                    f'sharded_tensor_metadata requires_grad: {sharded_tensor_metadata.requires_grad}'
+                    f'tensor_properties requires_grad: {tensor_properties.requires_grad}'
                 )
 
         # check if shards_metadata have overlap shards
@@ -459,14 +474,7 @@ def _init_chunked(self, dims, tensor_init_params: TensorInitParams, ):
 
         # Build overall metadata
         self._metadata = ShardedTensorMetadata(
-            shards_metadata,
-            dims,
-            tensor_init_params.dtype,
-            tensor_init_params.layout,
-            tensor_init_params.requires_grad,
-            tensor_init_params.memory_format,
-            tensor_init_params.pin_memory,
-        )
+            shards_metadata, dims, tensor_init_params.tensor_properties, )
 
     def _init_enumerable(self, dims, tensor_init_params: TensorInitParams):
         # Validate the sharding spec is compatible with the tensor.
@@ -488,14 +496,7 @@ def _init_enumerable(self, dims, tensor_init_params: TensorInitParams):
 
         # Build overall metadata
         self._metadata = ShardedTensorMetadata(
-            shards_metadata,
-            dims,
-            tensor_init_params.dtype,
-            tensor_init_params.layout,
-            tensor_init_params.requires_grad,
-            tensor_init_params.memory_format,
-            tensor_init_params.pin_memory,
-        )
+            shards_metadata, dims, tensor_init_params.tensor_properties, )
 
     def _parse_and_validate_remote_device(self, remote_device: torch.distributed._remote_device):
 
@@ -555,14 +556,14 @@ def is_pinned(self) -> bool:
         """
         Returns True if the sharded tensor (each local shard) resides in pinned memory.
         """
-        return self._metadata.pin_memory
+        return self._metadata.tensor_properties.pin_memory
 
     def is_contiguous(self) -> bool:
         """
         Returns True if the sharded tensor (each local shard) is contiguous in memory
         in the order specified by memory format.
         """
-        return self._metadata.memory_format == torch.contiguous_format
+        return self._metadata.tensor_properties.memory_format == torch.contiguous_format
 
     @property
     def shape(self):
@@ -570,15 +571,15 @@ def shape(self):
 
     @property
     def requires_grad(self):
-        return self._metadata.requires_grad
+        return self._metadata.tensor_properties.requires_grad
 
     @property
     def dtype(self):
-        return self._metadata.dtype
+        return self._metadata.tensor_properties.dtype
 
     @property
     def layout(self):
-        return self._metadata.layout
+        return self._metadata.tensor_properties.layout
 
     def _register_remote_shards(self, remote_shards: List[rpc.RRef[Shard]], rpc_rank: int):
         self._remote_shards[rpc_rank] = remote_shards
@@ -667,21 +668,21 @@ def __setstate__(self, state):
 def _create_tensor_from_params(*size, local_device, tensor_init_params: TensorInitParams):
     """ Helper to construct tensor from size, device and common params. """
 
-    if tensor_init_params.create_op == CreateOp.ONES:
-        return torch.ones(*size,
-                          dtype=tensor_init_params.dtype,
-                          layout=tensor_init_params.layout,
-                          device=local_device,
-                          pin_memory=tensor_init_params.pin_memory,
-                          requires_grad=tensor_init_params.requires_grad,)
-    elif tensor_init_params.create_op == CreateOp.EMPTY:
-        return torch.empty(*size,
-                           dtype=tensor_init_params.dtype,
-                           layout=tensor_init_params.layout,
-                           device=local_device,
-                           requires_grad=tensor_init_params.requires_grad,
-                           # Note memory_format param is not accepted by torch.ones
-                           memory_format=tensor_init_params.memory_format,
-                           pin_memory=tensor_init_params.pin_memory,)
+    create_op = tensor_init_params.create_op
+    dtype = tensor_init_params.tensor_properties.dtype
+    layout = tensor_init_params.tensor_properties.layout
+    requires_grad = tensor_init_params.tensor_properties.requires_grad
+    memory_format = tensor_init_params.tensor_properties.memory_format
+    pin_memory = tensor_init_params.tensor_properties.pin_memory
+
+    if create_op == CreateOp.ONES:
+        return torch.ones(*size, dtype=dtype, layout=layout,
+                          device=local_device, pin_memory=pin_memory,
+                          requires_grad=requires_grad,)
+    elif create_op == CreateOp.EMPTY:
+        return torch.empty(*size, dtype=dtype, layout=layout,
+                           device=local_device, requires_grad=requires_grad,
+                           # NB: memory_format param is not accepted by torch.ones
+                           memory_format=memory_format, pin_memory=pin_memory,)
     else:
         raise ValueError(f'Unsupported create_op: {tensor_init_params.create_op}')

From 699c764d2ef1e489ef2766d360701f61f602a7d7 Mon Sep 17 00:00:00 2001
From: Edward Yang <ezyang@fb.com>
Date: Tue, 24 Aug 2021 12:19:16 -0700
Subject: [PATCH 181/530] Revert D30513613: Removing tensor.data usage in utils
 with tensor set_ method

Test Plan: revert-hammer

Differential Revision:
D30513613 (https://github.com/pytorch/pytorch/commit/d08a36f831cbcb4516fc1b68e3e3deff8ab45aba)

Original commit changeset: 402efb9c30fa

fbshipit-source-id: 911c66a9852de77dc5274b5fb373258c0c97739a
---
 torch/distributed/algorithms/model_averaging/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/torch/distributed/algorithms/model_averaging/utils.py b/torch/distributed/algorithms/model_averaging/utils.py
index 5d796e885d2ae..44ee422b9e92d 100644
--- a/torch/distributed/algorithms/model_averaging/utils.py
+++ b/torch/distributed/algorithms/model_averaging/utils.py
@@ -29,6 +29,5 @@ def average_parameters(
 
     offset = 0
     for p in params_it2:
-        with torch.no_grad():
-            p.set_(flat_params[offset : offset + p.numel()].view_as(p))
+        p.data = flat_params[offset : offset + p.numel()].view_as(p)
         offset += p.numel()

From 4a0776100e347c887378b676f3deabd14b88d7b2 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Tue, 24 Aug 2021 12:43:27 -0700
Subject: [PATCH 182/530] Migrate legacy lstsq from THC to ATen (CUDA) (#63504)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63504

Closes gh-24592

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D30441304

Pulled By: ngimel

fbshipit-source-id: ec176596f54bc084af48a73d1dbb0dcb82fec593
---
 BUILD.bazel                                   |  1 -
 aten/src/ATen/LegacyTHFunctionsCUDA.h         |  5 --
 aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp  | 77 -----------------
 .../ATen/native/cuda/BatchLinearAlgebra.cu    | 78 +++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |  4 +-
 aten/src/THC/CMakeLists.txt                   |  3 -
 aten/src/THC/THCTensorMath.h                  |  3 -
 aten/src/THC/THCTensorMathMagma.cpp           | 16 ----
 aten/src/THC/THCTensorMathMagma.h             | 20 -----
 aten/src/THC/generic/THCTensorMathMagma.cpp   | 83 -------------------
 aten/src/THC/generic/THCTensorMathMagma.h     | 17 ----
 11 files changed, 80 insertions(+), 227 deletions(-)
 delete mode 100644 aten/src/THC/THCTensorMathMagma.h
 delete mode 100644 aten/src/THC/generic/THCTensorMathMagma.cpp
 delete mode 100644 aten/src/THC/generic/THCTensorMathMagma.h

diff --git a/BUILD.bazel b/BUILD.bazel
index 5acbe4082d38e..afdd4699b160f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -393,7 +393,6 @@ filegroup(
         "aten/src/THC/THCTensor.cu.cc",
         "aten/src/THC/THCTensorCopy.cu.cc",
         "aten/src/THC/THCTensorMath.cu.cc",
-        "aten/src/THC/THCTensorMathMagma.cu.cc",
         "aten/src/THC/THCTensorMathPairwise.cu.cc",
         "aten/src/THC/THCTensorMathScan.cu.cc",
         "aten/src/THC/THCTensorScatterGather.cu.cc",
diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h
index 1a20e0bb8fa0b..41cbdd6f4ffe1 100644
--- a/aten/src/ATen/LegacyTHFunctionsCUDA.h
+++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h
@@ -18,12 +18,7 @@ namespace native {
 namespace legacy {
 namespace cuda {
 
-std::tuple<Tensor &,Tensor &> _th_gels_out(const Tensor & self, const Tensor & A, Tensor & res1, Tensor & res2);
-std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
-Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper);
-Tensor _th_potri(const Tensor & self, bool upper);
 Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src);
-Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training);
 
 } // namespace th
 } // namespace legacy
diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
index 0ad6dc8256ff0..c4e9dfe78cebe 100644
--- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
+++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
@@ -39,83 +39,6 @@ namespace {
   }
 }
 
-std::tuple<Tensor &,Tensor &> _th_gels_out(const Tensor & self, const Tensor & A, Tensor & res1, Tensor & res2) {
-    TORCH_WARN_ONCE(
-      "torch.lstsq is deprecated in favor of torch.linalg.lstsq and will be removed in a future PyTorch release.\n",
-      "torch.linalg.lstsq has reversed arguments and does not return the QR decomposition in "
-      "the returned tuple (although it returns other information about the problem).\n",
-      "To get the qr decomposition consider using torch.linalg.qr.\n",
-      "The returned solution in torch.lstsq stored the residuals of the solution in the ",
-      "last m - n columns of the returned value whenever m > n. In torch.linalg.lstsq, the ",
-      "residuals in the field 'residuals' of the returned named tuple.\n",
-      "The unpacking of the solution, as in\n",
-      "X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n",
-      "should be replaced with\n",
-      "X = torch.linalg.lstsq(A, B).solution"
-    );
-    // DeviceGuard omitted
-    auto dispatch_scalar_type = infer_scalar_type(self);
-
-    switch (dispatch_scalar_type) {
-        case ScalarType::Double: {
-            auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaDoubleTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
-            break;
-        }
-        case ScalarType::Float: {
-            auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels_out", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
-            break;
-        }
-        default:
-            AT_ERROR("_th_gels_out not supported on CUDAType for ", dispatch_scalar_type);
-    }
-    return std::tuple<Tensor &, Tensor &>(res1, res2);
-}
-std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A) {
-    TORCH_WARN_ONCE(
-      "torch.lstsq is deprecated in favor of torch.linalg.lstsq and will be removed in a future PyTorch release.\n",
-      "torch.linalg.lstsq has reversed arguments and does not return the QR decomposition in "
-      "the returned tuple (although it returns other information about the problem).\n",
-      "To get the qr decomposition consider using torch.linalg.qr.\n",
-      "The returned solution in torch.lstsq stored the residuals of the solution in the ",
-      "last m - n columns of the returned value whenever m > n. In torch.linalg.lstsq, the ",
-      "residuals in the field 'residuals' of the returned named tuple.\n",
-      "The unpacking of the solution, as in\n",
-      "X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n",
-      "should be replaced with\n",
-      "X = torch.linalg.lstsq(A, B).solution"
-    );
-    // DeviceGuard omitted
-    auto dispatch_scalar_type = infer_scalar_type(self);
-    auto res1_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
-    auto res1 = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(res1_));
-    auto res2_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
-    auto res2 = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(res2_));
-    switch (dispatch_scalar_type) {
-        case ScalarType::Double: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaDoubleTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
-            break;
-        }
-        case ScalarType::Float: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto A_ = checked_dense_tensor_unwrap(A, "A", 2, "_th_gels", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaTensor_gels(globalContext().getTHCState(), res1_, res2_, self_, A_);
-            break;
-        }
-        default:
-            AT_ERROR("_th_gels not supported on CUDAType for ", dispatch_scalar_type);
-    }
-    return std::tuple<Tensor, Tensor>(res1, res2);
-}
 Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src) {
     // DeviceGuard omitted
     auto dispatch_scalar_type = infer_scalar_type(self);
diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu
index 0dae7a2aa3c11..4e806f000c5ae 100644
--- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu
+++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu
@@ -3114,6 +3114,84 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul
 
 REGISTER_DISPATCH(lstsq_stub, &lstsq_kernel);
 
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ legacy_lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+std::tuple<Tensor, Tensor> legacy_lstsq_cuda(const Tensor &B, const Tensor &A) {
+  TORCH_WARN_ONCE(
+      "torch.lstsq is deprecated in favor of torch.linalg.lstsq and will be removed in a future PyTorch release.\n",
+      "torch.linalg.lstsq has reversed arguments and does not return the QR decomposition in "
+      "the returned tuple (although it returns other information about the problem).\n",
+      "To get the qr decomposition consider using torch.linalg.qr.\n",
+      "The returned solution in torch.lstsq stored the residuals of the solution in the ",
+      "last m - n columns of the returned value whenever m > n. In torch.linalg.lstsq, the ",
+      "residuals in the field 'residuals' of the returned named tuple.\n",
+      "The unpacking of the solution, as in\n",
+      "X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n",
+      "should be replaced with\n",
+      "X = torch.linalg.lstsq(A, B).solution"
+    );
+
+#ifndef USE_MAGMA
+  TORCH_CHECK(false, "solve: MAGMA library not found in "
+              "compilation. Please rebuild with MAGMA.");
+#else
+  const auto dtype = A.scalar_type();
+  TORCH_CHECK(B.scalar_type() == dtype, "exepected A and B dtypes to match but found ",
+              dtype, " and ", B.scalar_type());
+  TORCH_CHECK(A.numel() > 0 && A.dim() == 2, "A should be (non-empty) 2 dimensional");
+  TORCH_CHECK(B.numel() > 0 && B.dim() == 2, "B should be (non-empty) 2 dimensional");
+  auto a_sizes = A.sizes();
+  auto b_sizes = B.sizes();
+  TORCH_CHECK(a_sizes[0] == b_sizes[0], "Expected A and b to have same size "
+      "at dim 0, but A has ", a_sizes[0], " rows and B has ", b_sizes[0], " rows");
+  TORCH_CHECK(a_sizes[0] >= a_sizes[1], "Expected A with shape (m x n) to have "
+      "m >= n. The case for m < n is not implemented yet.");
+
+  Tensor A_working = cloneBatchedColumnMajor(A);
+  Tensor B_working = cloneBatchedColumnMajor(B);
+
+  int64_t m = a_sizes[0];
+  int64_t n = a_sizes[1];
+  int64_t nrhs = b_sizes[1];
+
+  int info;
+  AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "legacy_lstsq_cuda", [&] {
+    scalar_t *a_data = A_working.data_ptr<scalar_t>();
+    scalar_t *b_data = B_working.data_ptr<scalar_t>();
+    scalar_t wkopt;
+    magmaGels(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info);
+
+    const auto hwork_size = static_cast<magma_int_t>(wkopt);
+    scalar_t *hwork = nullptr;
+    ALLOCATE_ARRAY(hwork, scalar_t, hwork_size);
+
+    magmaGels(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, hwork_size, &info);
+  });
+
+  TORCH_CHECK(info == 0, "MAGMA gels : Argument %d : illegal value", -info);
+  return std::tuple<Tensor, Tensor>(B_working, A_working);
+#endif  // USE_MAGMA
+}
+
+std::tuple<Tensor&, Tensor&> legacy_lstsq_out_cuda(
+    const Tensor& B, const Tensor& A, Tensor& B_out, Tensor& A_out) {
+  const auto dtype = A.scalar_type();
+  TORCH_CHECK(B.scalar_type() == dtype, "exepected A and B dtypes to match but found ",
+              A.scalar_type(), " and ", B.scalar_type());
+  TORCH_CHECK(A_out.scalar_type() == dtype, "A_out to have scalar type ", dtype,
+              " but found", A_out.scalar_type());
+  TORCH_CHECK(B_out.scalar_type() == dtype, "A_out to have scalar type ", dtype,
+              " but found", B_out.scalar_type());
+  Tensor A_tmp, B_tmp;
+  std::tie(B_tmp, A_tmp) = native::legacy_lstsq_cuda(B, A);
+  resize_output(A_out, A_tmp.sizes());
+  A_out.copy_(A_tmp);
+  resize_output(B_out, B_tmp.sizes());
+  B_out.copy_(B_tmp);
+  return std::tuple<Tensor&, Tensor&>(B_out, A_out);
+}
+
+
 }}  // namespace at::native
 
 #undef ALLOCATE_ARRAY
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 9bce764b1ee1a..4f7d7e66a7d5e 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -6658,13 +6658,13 @@
 - func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR)
   dispatch:
     CPU: legacy_lstsq_out
-    CUDA: legacy::cuda::_th_gels_out
+    CUDA: legacy_lstsq_out_cuda
 
 - func: lstsq(Tensor self, Tensor A) -> (Tensor solution, Tensor QR)
   variants: method, function
   dispatch:
     CPU: legacy_lstsq
-    CUDA: legacy::cuda::_th_gels
+    CUDA: legacy_lstsq_cuda
 
 - func: triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient)
   dispatch:
diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt
index 786506027ea8f..f34b040e484ce 100644
--- a/aten/src/THC/CMakeLists.txt
+++ b/aten/src/THC/CMakeLists.txt
@@ -66,7 +66,6 @@ install(FILES
           THCNumerics.cuh
           THCTensorInfo.cuh
           THCTensorTypeUtils.cuh
-          THCTensorMathMagma.h
           THCThrustAllocator.cuh
           # See Note [TH abstraction violation]
           THCTensor.hpp
@@ -88,8 +87,6 @@ install(FILES
           generic/THCTensorCopy.h
           generic/THCTensorMath.h
           generic/THCTensorMath.cu
-          generic/THCTensorMathMagma.h
-          generic/THCTensorMathMagma.cpp
           generic/THCTensorMathPairwise.h
           generic/THCTensorMathPairwise.cu
           DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THC/generic")
diff --git a/aten/src/THC/THCTensorMath.h b/aten/src/THC/THCTensorMath.h
index 422a423959457..b70d4d14b02d9 100644
--- a/aten/src/THC/THCTensorMath.h
+++ b/aten/src/THC/THCTensorMath.h
@@ -13,9 +13,6 @@
 #include <THC/generic/THCTensorMath.h>
 #include <THC/THCGenerateBFloat16Type.h>
 
-#include <THC/generic/THCTensorMathMagma.h>
-#include <THC/THCGenerateAllTypes.h>
-
 #include <THC/generic/THCTensorMathPairwise.h>
 #include <THC/THCGenerateAllTypes.h>
 
diff --git a/aten/src/THC/THCTensorMathMagma.cpp b/aten/src/THC/THCTensorMathMagma.cpp
index ca0cc8a621282..43607531bd60e 100644
--- a/aten/src/THC/THCTensorMathMagma.cpp
+++ b/aten/src/THC/THCTensorMathMagma.cpp
@@ -1,23 +1,10 @@
 #include <THC/THCGeneral.h>
-#include <THC/THCTensorMath.h>
-#include <THC/THCTensorCopy.h>
-#include <THC/THCTensorMathMagma.h>
-#include <THC/THCTensor.hpp>
-#include <THC/THCStorage.hpp>
-#include <algorithm>
-#include <ATen/native/cuda/MiscUtils.h>
 #include <ATen/cuda/detail/CUDAHooks.h>
 
 #ifdef USE_MAGMA
 #include <magma_v2.h>
 #endif
 
-#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y) - 1) / (y))
-#endif
-
-#define NoMagma(name) "No CUDA implementation of '" #name "'. Install MAGMA and rebuild cutorch (http://icl.cs.utk.edu/magma/)"
-
 namespace {
 void _THCMagma_init() {
 #ifdef USE_MAGMA
@@ -31,6 +18,3 @@ struct Initializer {
   };
 } initializer;
 } // anonymous namespace
-
-#include <THC/generic/THCTensorMathMagma.cpp>
-#include <THC/THCGenerateAllTypes.h>
diff --git a/aten/src/THC/THCTensorMathMagma.h b/aten/src/THC/THCTensorMathMagma.h
deleted file mode 100644
index 1fb5821afce56..0000000000000
--- a/aten/src/THC/THCTensorMathMagma.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef THC_TENSOR_MATH_MAGMA_CUH
-#define THC_TENSOR_MATH_MAGMA_CUH
-
-#ifdef USE_MAGMA
-#include <magma_v2.h>
-#endif
-
-#ifdef USE_MAGMA
-template <typename T>
-static inline T* th_magma_malloc_pinned(size_t n)
-{
-  void* ptr;
-  if (MAGMA_SUCCESS != magma_malloc_pinned(&ptr, n * sizeof(T)))
-    THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456);
-  return reinterpret_cast<T*>(ptr);
-}
-
-#endif
-
-#endif // THC_TENSOR_MATH_MAGMA_CUH
diff --git a/aten/src/THC/generic/THCTensorMathMagma.cpp b/aten/src/THC/generic/THCTensorMathMagma.cpp
deleted file mode 100644
index 0d94fc320e53b..0000000000000
--- a/aten/src/THC/generic/THCTensorMathMagma.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "THC/generic/THCTensorMathMagma.cpp"
-#else
-
-#include <c10/cuda/CUDAException.h>
-
-#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
-
-static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src)
-{
-  THAssert(src->dim() == 2);
-  if (self == src && self->stride(0) == 1 && self->stride(1) == self->size(0))
-  {
-    THCTensor_(retain)(state, self);
-    return self;
-  }
-
-  if (self == src)
-    self = THCTensor_(new)(state);
-  else
-    THCTensor_(retain)(state, self);
-
-  int64_t size[2] = { src->size(0), src->size(1) };
-  int64_t stride[2] = { 1, src->size(0) };
-
-  THCTensor_(resizeNd)(state, self, 2, size, stride);
-  THCTensor_(copy)(state, self, src);
-  return self;
-}
-
-void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_)
-{
-#ifdef USE_MAGMA
-  THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional");
-  THArgCheck(!b_->is_empty() && b_->dim() == 2, 1, "b should be (non-empty) 2 dimensional");
-  TORCH_CHECK(a_->size(0) == b_->size(0), "Expected A and b to have same size "
-      "at dim 0, but A has ", a_->size(0), " rows and B has ", b_->size(0), " rows");
-  THArgCheck(a_->size(0) >= a_->size(1), 2, "Expected A with shape (m x n) to have "
-      "m >= n. The case for m < n is not implemented yet.");
-
-  THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_);
-  THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_);
-  scalar_t *a_data = THCTensor_(data)(state, a);
-  scalar_t *b_data = THCTensor_(data)(state, b);
-
-  int64_t m = a->size(0);
-  int64_t n = a->size(1);
-  int64_t nrhs = b->size(1);
-  scalar_t wkopt;
-
-  int info;
-  {
-    at::native::MagmaStreamSyncGuard guard;
-#if defined(THC_REAL_IS_FLOAT)
-    magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info);
-#else
-    magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info);
-#endif
-
-    scalar_t *hwork = th_magma_malloc_pinned<scalar_t>((size_t)wkopt);
-
-#if defined(THC_REAL_IS_FLOAT)
-    magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info);
-#else
-    magma_dgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info);
-#endif
-
-    magma_free_pinned(hwork);
-  }
-
-  if (info != 0)
-    THError("MAGMA gels : Argument %d : illegal value", -info);
-
-  THCTensor_(freeCopyTo)(state, a, ra_);
-  THCTensor_(freeCopyTo)(state, b, rb_);
-#else
-  THError(NoMagma(gels));
-#endif
-}
-
-#endif
-
-#endif
diff --git a/aten/src/THC/generic/THCTensorMathMagma.h b/aten/src/THC/generic/THCTensorMathMagma.h
deleted file mode 100644
index 585d02ceff7a7..0000000000000
--- a/aten/src/THC/generic/THCTensorMathMagma.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "THC/generic/THCTensorMathMagma.h"
-#else
-
-#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
-
-// MAGMA (i.e. CUDA implementation of LAPACK functions)
-TORCH_CUDA_CU_API void THCTensor_(gels)(
-    THCState* state,
-    THCTensor* rb_,
-    THCTensor* ra_,
-    THCTensor* b_,
-    THCTensor* a_);
-
-#endif // defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
-
-#endif

From 5be17ec1fca4fcce5464cd679ee3f3dd6f102059 Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Tue, 24 Aug 2021 13:02:27 -0700
Subject: [PATCH 183/530] Do not modify saved variables in-place for spectral
 norm during power iteration (#62293)

Summary:
Interestingly enough, the original code did have a mechanism that aims to prevent this very issue:
but it performs a clone AFTER modifying u and v in-place.
This wouldn't work though because we can later use the cloned u and v in operations that save for backward, and the next time we execute forward, we modify the same cloned u and v in-place.
So if the idea is that we want to avoid modifying saved variable in-place we should clone it BEFORE the in-place operation.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62293

Reviewed By: bdhirsh

Differential Revision: D30489750

Pulled By: soulitzer

fbshipit-source-id: cbe8dea885aef97adda8481f7a822e5bd91f7889
---
 test/test_nn.py                    | 3 +++
 torch/nn/utils/parametrizations.py | 9 +++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 07a2b48cc6a20..43e105a676ced 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -4220,6 +4220,9 @@ def fn(input):
                     out1 = wrapped_m(input)
                     return out0 + out1
 
+                # Make sure we can compute gradients wrt to all the parameters in the case
+                # of double forward
+                fn(input.clone().requires_grad_()).sum().backward()
                 gradcheck(fn, (input.clone().requires_grad_(),), check_batched_grad=False)
 
                 # test removing
diff --git a/torch/nn/utils/parametrizations.py b/torch/nn/utils/parametrizations.py
index 7941f41f19cac..de3d5c7144f9e 100644
--- a/torch/nn/utils/parametrizations.py
+++ b/torch/nn/utils/parametrizations.py
@@ -84,6 +84,7 @@ def _power_method(self, weight_mat: torch.Tensor, n_power_iterations: int) -> No
 
         # Precondition
         assert weight_mat.ndim > 1
+
         for _ in range(n_power_iterations):
             # Spectral norm of weight equals to `u^T W v`, where `u` and `v`
             # are the first left and right singular vectors.
@@ -92,9 +93,6 @@ def _power_method(self, weight_mat: torch.Tensor, n_power_iterations: int) -> No
                                   dim=0, eps=self.eps, out=self._u)   # type: ignore[has-type]
             self._v = F.normalize(torch.mv(weight_mat.t(), self._u),
                                   dim=0, eps=self.eps, out=self._v)   # type: ignore[has-type]
-        # See above on why we need to clone
-        self._u = self._u.clone(memory_format=torch.contiguous_format)
-        self._v = self._v.clone(memory_format=torch.contiguous_format)
 
     def forward(self, weight: torch.Tensor) -> torch.Tensor:
         if weight.ndim == 1:
@@ -104,10 +102,13 @@ def forward(self, weight: torch.Tensor) -> torch.Tensor:
             weight_mat = self._reshape_weight_to_matrix(weight)
             if self.training:
                 self._power_method(weight_mat, self.n_power_iterations)
+            # See above on why we need to clone
+            u = self._u.clone(memory_format=torch.contiguous_format)
+            v = self._v.clone(memory_format=torch.contiguous_format)
             # The proper way of computing this should be through F.bilinear, but
             # it seems to have some efficiency issues:
             # https://github.com/pytorch/pytorch/issues/58093
-            sigma = torch.dot(self._u, torch.mv(weight_mat, self._v))
+            sigma = torch.dot(u, torch.mv(weight_mat, v))
             return weight / sigma
 
     def right_inverse(self, value: torch.Tensor) -> torch.Tensor:

From 4e37a015c7ed7ac4a4f57057cb726f53b94e693b Mon Sep 17 00:00:00 2001
From: James Reed <jamesreed@fb.com>
Date: Tue, 24 Aug 2021 13:44:52 -0700
Subject: [PATCH 184/530] [FX] Fix _replicate_for_data_parallel (#63821)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63821

Test Plan: Imported from OSS

Reviewed By: suo

Differential Revision: D30502115

Pulled By: jamesr66a

fbshipit-source-id: 0f004f95def6e1ba21ccbeab40cb0a739a0ad20c
---
 test/test_fx.py          | 15 +++++++++++++++
 torch/fx/graph_module.py |  5 +++++
 2 files changed, 20 insertions(+)

diff --git a/test/test_fx.py b/test/test_fx.py
index c55e97dc7da84..27f64e1cd1827 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -2296,6 +2296,21 @@ def forward(self, x):
                             r"Call using an FX-traced Module, line .* of the "
                             r"traced Module's generated forward function:")
 
+    def test_graph_module_replicate_for_dp(self):
+        class Foo(torch.nn.Module):
+            def forward(self, x):
+                return torch.relu(x)
+
+        gm = torch.fx.symbolic_trace(Foo())
+
+        x = torch.randn(5, 3)
+        out = gm(x)
+
+        replica = gm._replicate_for_data_parallel()
+        out_replica = replica(x)
+
+        torch.testing.assert_allclose(out_replica, out)
+
     def test_ast_rewriter_rewrites_assert(self):
         class M(torch.nn.Module):
             def forward(self, x: torch.Tensor, y: int, z: int):
diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
index b87aeaaa78a03..c91857342ffcd 100644
--- a/torch/fx/graph_module.py
+++ b/torch/fx/graph_module.py
@@ -656,6 +656,11 @@ def __str__(self) -> str:
         orig_str = super().__str__()
         return '\n'.join([orig_str, self._code])
 
+    def _replicate_for_data_parallel(self):
+        new_gm = self.__copy__()
+        new_gm._is_replica = True
+        return new_gm
+
 # workarounds for issues in __torch_function__
 
 # WAR for __torch_function__ not handling tensor lists,

From 865d127a66e961b12f895747c3c59bd7503223dd Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Tue, 24 Aug 2021 14:13:04 -0700
Subject: [PATCH 185/530] .github: Enable with-ssh for Windows (#63440)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63440

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

Test Plan: Imported from OSS

Reviewed By: janeyx99

Differential Revision: D30521460

Pulled By: seemethere

fbshipit-source-id: e987e170e73fb4f9d9f024bed0e58404ed206848
---
 .github/scripts/kill_active_ssh_sessions.ps1  | 11 +++++++
 .github/scripts/wait_for_ssh_to_drain.ps1     | 17 +++++++++++
 .github/templates/windows_ci_workflow.yml.j2  | 24 +++++++++++++++
 ...rated-periodic-win-vs2019-cuda11.3-py3.yml | 30 +++++++++++++++++++
 .../generated-win-vs2019-cpu-py3.yml          | 30 +++++++++++++++++++
 .../generated-win-vs2019-cuda10.1-py3.yml     | 30 +++++++++++++++++++
 .../generated-win-vs2019-cuda11.1-py3.yml     | 30 +++++++++++++++++++
 7 files changed, 172 insertions(+)
 create mode 100644 .github/scripts/kill_active_ssh_sessions.ps1
 create mode 100644 .github/scripts/wait_for_ssh_to_drain.ps1

diff --git a/.github/scripts/kill_active_ssh_sessions.ps1 b/.github/scripts/kill_active_ssh_sessions.ps1
new file mode 100644
index 0000000000000..09cc63e94bc1f
--- /dev/null
+++ b/.github/scripts/kill_active_ssh_sessions.ps1
@@ -0,0 +1,11 @@
+function Get-SSH-Sessions {
+    Get-Process sshd -IncludeUserName |
+        Where-Object UserName -notLike "*SYSTEM*" |
+        Select-Object Id
+}
+
+$runningSessions = Get-SSH-Sessions
+
+foreach ($session in $runningSessions) {
+    Stop-Process -id $session.Id
+}
diff --git a/.github/scripts/wait_for_ssh_to_drain.ps1 b/.github/scripts/wait_for_ssh_to_drain.ps1
new file mode 100644
index 0000000000000..ab3ab41f355ce
--- /dev/null
+++ b/.github/scripts/wait_for_ssh_to_drain.ps1
@@ -0,0 +1,17 @@
+function Get-SSH-Users {
+    # Gets ssh sessions for all users not named SYSTEM
+    Get-CimInstance -ClassName Win32_Process -Filter "Name = 'sshd.exe'" |
+        Get-CimAssociatedInstance -Association Win32_SessionProcess |
+        Get-CimAssociatedInstance -Association Win32_LoggedOnUser |
+        Where-Object {$_.Name -ne 'SYSTEM'} |
+        Measure-Object
+}
+
+$usersLoggedOn = Get-SSH-Users
+
+Write-Output "Holding runner until all ssh sessions have logged out"
+while ($usersLoggedOn.Count -gt 0) {
+    $usersLoggedOn = Get-SSH-Users
+    Write-Output "."
+    Start-Sleep -s 5
+}
diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2
index f00f4b19a903d..6756bf4720ac6 100644
--- a/.github/templates/windows_ci_workflow.yml.j2
+++ b/.github/templates/windows_ci_workflow.yml.j2
@@ -4,6 +4,20 @@
 {# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #}
 {%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%}
 
+{%- macro wait_and_kill_ssh() -%}
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
+{%- endmacro -%}
+
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: !{{ build_environment }}
@@ -76,6 +90,10 @@ jobs:
       http_proxy: "!{{ squid_proxy }}"
       https_proxy: "!{{ squid_proxy }}"
     steps:
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
@@ -123,6 +141,7 @@ jobs:
           if-no-files-found: error
           name: ${{ env.BUILD_ENVIRONMENT }}
           path: C:\${{ github.run_id }}\build-results
+      !{{ wait_and_kill_ssh() }}
       - name: Cleanup build-results and workspaces
         if: always()
         shell: bash
@@ -193,6 +212,10 @@ jobs:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
       !{{ common.display_ec2_information() }}
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -251,6 +274,7 @@ jobs:
           if-no-files-found: error
           path:
             pytorch-${{ github.run_id }}/test-reports-*.zip
+      !{{ wait_and_kill_ssh() }}
       - name: Cleanup workspace
         if: always()
         shell: bash
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
index 407aace6e4aee..6d1eff302dba9 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
@@ -51,6 +51,10 @@ jobs:
       http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
     steps:
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
@@ -99,6 +103,17 @@ jobs:
           if-no-files-found: error
           name: ${{ env.BUILD_ENVIRONMENT }}
           path: C:\${{ github.run_id }}\build-results
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
       - name: Cleanup build-results and workspaces
         if: always()
         shell: bash
@@ -163,6 +178,10 @@ jobs:
         shell: bash
         run: |
           .github/scripts/display_ec2_information.sh
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -219,6 +238,17 @@ jobs:
           if-no-files-found: error
           path:
             pytorch-${{ github.run_id }}/test-reports-*.zip
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
       - name: Cleanup workspace
         if: always()
         shell: bash
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index 35f9feccaf26c..53acdd8a961b7 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -51,6 +51,10 @@ jobs:
       http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
     steps:
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
@@ -91,6 +95,17 @@ jobs:
           if-no-files-found: error
           name: ${{ env.BUILD_ENVIRONMENT }}
           path: C:\${{ github.run_id }}\build-results
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
       - name: Cleanup build-results and workspaces
         if: always()
         shell: bash
@@ -155,6 +170,10 @@ jobs:
         shell: bash
         run: |
           .github/scripts/display_ec2_information.sh
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -203,6 +222,17 @@ jobs:
           if-no-files-found: error
           path:
             pytorch-${{ github.run_id }}/test-reports-*.zip
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
       - name: Cleanup workspace
         if: always()
         shell: bash
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index 26b703500b0d1..a3447bc41f616 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -53,6 +53,10 @@ jobs:
       http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
     steps:
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
@@ -101,6 +105,17 @@ jobs:
           if-no-files-found: error
           name: ${{ env.BUILD_ENVIRONMENT }}
           path: C:\${{ github.run_id }}\build-results
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
       - name: Cleanup build-results and workspaces
         if: always()
         shell: bash
@@ -165,6 +180,10 @@ jobs:
         shell: bash
         run: |
           .github/scripts/display_ec2_information.sh
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -221,6 +240,17 @@ jobs:
           if-no-files-found: error
           path:
             pytorch-${{ github.run_id }}/test-reports-*.zip
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
       - name: Cleanup workspace
         if: always()
         shell: bash
diff --git a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml
index d4175aca5f02d..2b3a30c6187fd 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml
@@ -53,6 +53,10 @@ jobs:
       http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
     steps:
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
         uses: actions/checkout@v2
         with:
@@ -101,6 +105,17 @@ jobs:
           if-no-files-found: error
           name: ${{ env.BUILD_ENVIRONMENT }}
           path: C:\${{ github.run_id }}\build-results
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
       - name: Cleanup build-results and workspaces
         if: always()
         shell: bash
@@ -165,6 +180,10 @@ jobs:
         shell: bash
         run: |
           .github/scripts/display_ec2_information.sh
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -221,6 +240,17 @@ jobs:
           if-no-files-found: error
           path:
             pytorch-${{ github.run_id }}/test-reports-*.zip
+      - name: Wait until all sessions have drained
+        shell: powershell
+        if: always()
+        timeout-minutes: 120
+        run: |
+          .github\scripts\wait_for_ssh_to_drain.ps1
+      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
+        shell: powershell
+        if: always()
+        run: |
+          .github\scripts\kill_active_ssh_sessions.ps1
       - name: Cleanup workspace
         if: always()
         shell: bash

From 41ffec07ce8abfffd3f4f450ae442a8f5982a074 Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Tue, 24 Aug 2021 15:32:42 -0700
Subject: [PATCH 186/530] Add a common autograd TLS state (#63860)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63860

Test Plan: Imported from OSS

Reviewed By: heitorschueroff

Differential Revision: D30513253

Pulled By: albanD

fbshipit-source-id: 97d76ed54dfbdf4ba3fc7051ce3b9bb636cefb4b
---
 aten/src/ATen/ThreadLocalState.cpp | 22 +++++++++++------
 aten/src/ATen/ThreadLocalState.h   |  5 +---
 c10/core/AutogradState.cpp         | 19 +++++++++++++++
 c10/core/AutogradState.h           | 39 ++++++++++++++++++++++++++++++
 c10/core/GradMode.cpp              |  7 +++---
 c10/core/InferenceMode.cpp         |  8 +-----
 c10/core/InferenceMode.h           | 19 +++++++--------
 7 files changed, 87 insertions(+), 32 deletions(-)
 create mode 100644 c10/core/AutogradState.cpp
 create mode 100644 c10/core/AutogradState.h

diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp
index ba7be1a06b8a1..fc4b8fa9c27ec 100644
--- a/aten/src/ATen/ThreadLocalState.cpp
+++ b/aten/src/ATen/ThreadLocalState.cpp
@@ -12,15 +12,12 @@ namespace at {
 ThreadLocalState::ThreadLocalState(bool keep_grad_mode)
     : dispatch_key_(c10::impl::tls_local_dispatch_key_set()),
       debug_info_(c10::ThreadLocalDebugInfo::current()),
-      inference_mode_enabled_(c10::InferenceMode::is_enabled()) {
+      autograd_tls_(c10::AutogradState::get_tls_state()) {
   rf_tls_ = at::get_record_function_tls_();
   saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks();
 
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   keep_grad_mode_ = keep_grad_mode;
-  if (keep_grad_mode_) {
-    grad_mode_enabled_ = GradMode::is_enabled();
-  }
 #endif
   bumped_record_all_functions_ = at::checkRecordAllFunctions();
 }
@@ -28,10 +25,23 @@ ThreadLocalState::ThreadLocalState(bool keep_grad_mode)
 /* static */
 void ThreadLocalState::setThreadLocalState(
     const ThreadLocalState& state) {
+  // Note that setting the InferenceMode TLS in this function is ONLY ok because we always
+  // restore the dispatch key set TLS at the same time.
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   if (state.keep_grad_mode_) {
-    GradMode::set_enabled(state.grad_mode_enabled_);
+    c10::AutogradState::set_tls_state(state.autograd_tls_);
+  } else {
+    auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(),
+                                        /* inference_mode */ state.autograd_tls_.get_inference_mode());
+    c10::AutogradState::set_tls_state(new_state);
   }
+#else
+  // The mobile build explicitly ignore grad_mode but fails if we propagate
+  // its value across threads or set it to a fixed value.
+  // So we have to make sure the grad_mode value is not changed here.
+  auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(),
+                                      /* inference_mode */ state.autograd_tls_.get_inference_mode());
+  c10::AutogradState::set_tls_state(new_state);
 #endif
 
   at::set_record_function_tls_(state.rf_tls_);
@@ -43,8 +53,6 @@ void ThreadLocalState::setThreadLocalState(
   c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_);
 
   c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_);
-
-  c10::InferenceMode::_set_enabled(state.inference_mode_enabled_);
 }
 
 } // namespace at
diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h
index f30f5e3442cc1..4942399cbd6d7 100644
--- a/aten/src/ATen/ThreadLocalState.h
+++ b/aten/src/ATen/ThreadLocalState.h
@@ -35,14 +35,11 @@ class TORCH_API ThreadLocalState {
   // RecordFunction TLS
   RecordFunctionTLS rf_tls_;
 
+  AutogradState autograd_tls_;
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   bool keep_grad_mode_ = true;
-  bool grad_mode_enabled_;
 #endif
 
-  // TLS for InferenceMode
-  bool inference_mode_enabled_;
-
   // TLS for saved tensors default hooks
   std::pair<PyObject*, PyObject*> saved_tensors_default_hooks_;
 
diff --git a/c10/core/AutogradState.cpp b/c10/core/AutogradState.cpp
new file mode 100644
index 0000000000000..9684a76b78564
--- /dev/null
+++ b/c10/core/AutogradState.cpp
@@ -0,0 +1,19 @@
+#include <c10/core/AutogradState.h>
+
+namespace c10 {
+
+namespace {
+// By default, grad mode is enabled and inference mode is disabled
+thread_local AutogradState autograd_state_tls =
+    AutogradState(/* grad_mode */ true, /* inference_mode */ false);
+} // namespace
+
+AutogradState& AutogradState::get_tls_state() {
+  return autograd_state_tls;
+}
+
+void AutogradState::set_tls_state(AutogradState state) {
+  autograd_state_tls = state;
+}
+
+} // namespace c10
diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h
new file mode 100644
index 0000000000000..1447594433fe4
--- /dev/null
+++ b/c10/core/AutogradState.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+
+#include <cstdint>
+
+namespace c10 {
+
+// Structure used to pack all the thread local boolean
+// flags used by autograd
+struct C10_API AutogradState {
+  static AutogradState& get_tls_state();
+  static void set_tls_state(AutogradState state);
+
+  AutogradState(bool grad_mode, bool inference_mode)
+      : grad_mode_(grad_mode), inference_mode_(inference_mode) {}
+
+  void set_grad_mode(bool enabled) {
+    grad_mode_ = enabled;
+  }
+
+  void set_inference_mode(bool enabled) {
+    inference_mode_ = enabled;
+  }
+
+  bool get_grad_mode() const {
+    return grad_mode_;
+  }
+
+  bool get_inference_mode() const {
+    return inference_mode_;
+  }
+
+ private:
+  bool grad_mode_ : 1;
+  bool inference_mode_ : 1;
+};
+
+} // namespace c10
diff --git a/c10/core/GradMode.cpp b/c10/core/GradMode.cpp
index 32747a6698afa..a5db198083b2b 100644
--- a/c10/core/GradMode.cpp
+++ b/c10/core/GradMode.cpp
@@ -1,16 +1,15 @@
+#include <c10/core/AutogradState.h>
 #include <c10/core/GradMode.h>
 
 #include <stdexcept>
 
 namespace c10 {
 
-thread_local bool GradMode_enabled = true;
-
 bool GradMode::is_enabled() {
-  return GradMode_enabled;
+  return AutogradState::get_tls_state().get_grad_mode();
 }
 
 void GradMode::set_enabled(bool enabled) {
-  GradMode_enabled = enabled;
+  AutogradState::get_tls_state().set_grad_mode(enabled);
 }
 } // namespace c10
diff --git a/c10/core/InferenceMode.cpp b/c10/core/InferenceMode.cpp
index b588ab4da54b5..59eca760cf504 100644
--- a/c10/core/InferenceMode.cpp
+++ b/c10/core/InferenceMode.cpp
@@ -2,18 +2,12 @@
 #include <stdexcept>
 
 namespace c10 {
-thread_local bool InferenceMode_enabled = false;
-
 // Invariant:
 //   is_enabled() ==
 //   !c10::impl::tls_is_dispatch_key_included(DispatchKey::ADInplaceOrView);
 // InferenceMode::is_enabled() is in perf critical path (TensorImpl constructor)
 // so it worths a separate TLS to skip the DispatchKeySet check.
 bool InferenceMode::is_enabled() {
-  return InferenceMode_enabled;
-}
-
-void InferenceMode::_set_enabled(bool enabled) {
-  InferenceMode_enabled = enabled;
+  return AutogradState::get_tls_state().get_inference_mode();
 }
 } // namespace c10
diff --git a/c10/core/InferenceMode.h b/c10/core/InferenceMode.h
index 7a9c2c593a453..9748d6eccfb54 100644
--- a/c10/core/InferenceMode.h
+++ b/c10/core/InferenceMode.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <c10/core/AutogradState.h>
 #include <c10/core/GradMode.h>
 #include <c10/core/impl/LocalDispatchKeySet.h>
 #include <c10/macros/Macros.h>
@@ -50,10 +51,12 @@ struct TORCH_API InferenceMode {
   //    are applicable to InferenceMode as well, e.g.
   //    `tensorTypeInCurrentExecutionContext` in interpreter.cpp.
   InferenceMode(bool enabled = true)
-      : prev_mode(InferenceMode::is_enabled()),
-        prev_keyset(c10::impl::tls_local_dispatch_key_set()),
-        grad_mode(at::AutoGradMode(!enabled)) {
-    _set_enabled(enabled);
+      : prev_mode(AutogradState::get_tls_state()),
+        prev_keyset(c10::impl::tls_local_dispatch_key_set()) {
+    // Enabling inference mode means disabling grad mode
+    // And disabling inference mode means enabling grad mode
+    AutogradState::set_tls_state(
+        AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled));
     DispatchKeySet included = enabled
         ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView)
         : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView);
@@ -67,17 +70,13 @@ struct TORCH_API InferenceMode {
   }
 
   ~InferenceMode() {
-    _set_enabled(prev_mode);
+    AutogradState::set_tls_state(prev_mode);
     c10::impl::_force_tls_local_dispatch_key_set(prev_keyset);
   }
   static bool is_enabled();
-  // _set_enabled() is not user facing and should be only used in
-  // ThreadLocalState.cpp.
-  static void _set_enabled(bool enabled);
 
  private:
-  bool prev_mode;
+  AutogradState prev_mode;
   c10::impl::LocalDispatchKeySet prev_keyset;
-  at::AutoGradMode grad_mode;
 };
 } // namespace c10

From 6d58c830072eda05786a336c252b624c105a6fac Mon Sep 17 00:00:00 2001
From: Xiaodong Wang <xdwang@fb.com>
Date: Tue, 24 Aug 2021 15:45:59 -0700
Subject: [PATCH 187/530] Turn off layer norm in jit symbolic differentiation
 (#63816)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63816

Test Plan:
Confirmed this can rescue the NE:

https://www.internalfb.com/mast/job/torchx_xdwang-SparseNNApplication_72cf593d

Reviewed By: ngimel

Differential Revision: D30498746

fbshipit-source-id: 4a387f32ee2f70685de6104459c7f21bfbddc187
---
 torch/csrc/jit/runtime/symbolic_script.cpp           | 2 +-
 torch/testing/_internal/jit_metaprogramming_utils.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch/csrc/jit/runtime/symbolic_script.cpp b/torch/csrc/jit/runtime/symbolic_script.cpp
index 29ce74a7d3ef7..6f2acca134738 100644
--- a/torch/csrc/jit/runtime/symbolic_script.cpp
+++ b/torch/csrc/jit/runtime/symbolic_script.cpp
@@ -1141,7 +1141,7 @@ const std::vector<std::string> functions = {
 
             return output, backward
 
-        def layer_norm(input : Tensor,
+        def layer_norm_disabled(input : Tensor,
                        normalized_shape : List[int],
                        weight : Optional[Tensor],
                        bias : Optional[Tensor],
diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py
index 350866cdbf083..75b1615d065d5 100644
--- a/torch/testing/_internal/jit_metaprogramming_utils.py
+++ b/torch/testing/_internal/jit_metaprogramming_utils.py
@@ -144,14 +144,14 @@
         'with_only_weight_inference', (True, 'aten::_batch_norm_impl_index')),
     ('instance_norm', (S, S, S), (non_differentiable(torch.zeros(S)), non_differentiable(torch.ones(S))),),
     ('layer_norm', (S, S, S, S), ([5],), '',
-     (True, ['aten::native_layer_norm'])),
+     (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])),
     ('layer_norm', (S, S, S, S), ([5], non_differentiable(torch.rand(S)),), 'with_only_weight',
-     (True, ['aten::native_layer_norm'])),
+     (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])),
     ('layer_norm', (S, S, S, S), ([5], None, non_differentiable(torch.rand(S)),), 'with_only_bias',
-     (True, ['aten::native_layer_norm'])),
+     (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])),
     ('layer_norm', (S, S, S, S), ([5], non_differentiable(torch.rand(S)),
                                   non_differentiable(torch.rand(S))), 'with_weight_and_bias',
-     (True, ['aten::native_layer_norm'])),
+     (False, ['aten::contiguous', 'aten::_batch_norm_impl_index', 'aten::addcmul'])),
     ('group_norm', (S, S, S), (1, torch.rand(5),),),
     ('local_response_norm', (S, S, S), (2, ),),
     ('nll_loss', F.log_softmax(torch.randn(3, 5), dim=0), (torch.tensor([1, 0, 4]),), '',),

From 956c8fa01ee6122122d96043f9b192fd106eb139 Mon Sep 17 00:00:00 2001
From: Harut Movsisyan <harutm@fb.com>
Date: Tue, 24 Aug 2021 16:20:13 -0700
Subject: [PATCH 188/530] Microbenchmarking matrix mult (einsum, torch.mult,
 torch.mm) (#63654)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63654

Test Plan:
```
> buck run mode/opt caffe2/benchmarks/operator_benchmark/pt:matrix_mult_test

# ----------------------------------------
# PyTorch/Caffe2 Operator Micro-benchmarks
# ----------------------------------------
# Tag : short

# Benchmarking PyTorch: einsum_bmm
# Mode: Eager
# Name: einsum_bmm_B4_M5_N3_K2_cpu
# Input: B: 4, M: 5, N: 3, K: 2, device: cpu
Forward Execution Time (us) : 27.970

# Benchmarking PyTorch: einsum_bmm
# Mode: Eager
# Name: einsum_bmm_B32_M25_N20_K30_cpu
# Input: B: 32, M: 25, N: 20, K: 30, device: cpu
Forward Execution Time (us) : 41.830

# Benchmarking PyTorch: einsum_bmm
# Mode: Eager
# Name: einsum_bmm_B128_M100_N120_K110_cpu
# Input: B: 128, M: 100, N: 120, K: 110, device: cpu
Forward Execution Time (us) : 499.114

# Benchmarking PyTorch: bmm
# Mode: Eager
# Name: bmm_B4_M5_N3_K2_cpu
# Input: B: 4, M: 5, N: 3, K: 2, device: cpu
Forward Execution Time (us) : 6.268

# Benchmarking PyTorch: bmm
# Mode: Eager
# Name: bmm_B32_M25_N20_K30_cpu
# Input: B: 32, M: 25, N: 20, K: 30, device: cpu
Forward Execution Time (us) : 12.676

# Benchmarking PyTorch: bmm
# Mode: Eager
# Name: bmm_B128_M100_N120_K110_cpu
# Input: B: 128, M: 100, N: 120, K: 110, device: cpu
Forward Execution Time (us) : 438.219

# Benchmarking PyTorch: einsum_elementwise
# Mode: Eager
# Name: einsum_elementwise_B4_M5_N3_cpu
# Input: B: 4, M: 5, N: 3, device: cpu
Forward Execution Time (us) : 7.657

# Benchmarking PyTorch: einsum_elementwise
# Mode: Eager
# Name: einsum_elementwise_B32_M25_N20_cpu
# Input: B: 32, M: 25, N: 20, device: cpu
Forward Execution Time (us) : 18.523

# Benchmarking PyTorch: einsum_elementwise
# Mode: Eager
# Name: einsum_elementwise_B100_M90_N110_cpu
# Input: B: 100, M: 90, N: 110, device: cpu
Forward Execution Time (us) : 55.103

# Benchmarking PyTorch: mul
# Mode: Eager
# Name: mul_B4_M5_N3_cpu
# Input: B: 4, M: 5, N: 3, device: cpu
Forward Execution Time (us) : 2.501

# Benchmarking PyTorch: mul
# Mode: Eager
# Name: mul_B32_M25_N20_cpu
# Input: B: 32, M: 25, N: 20, device: cpu
Forward Execution Time (us) : 10.589

# Benchmarking PyTorch: mul
# Mode: Eager
# Name: mul_B100_M90_N110_cpu
# Input: B: 100, M: 90, N: 110, device: cpu
Forward Execution Time (us) : 50.102

Reviewed By: ajyu

Differential Revision: D30455179

fbshipit-source-id: 9f2d92b2d2b860f41a8e59be2cc086d75b587f7b
---
 .../operator_benchmark/pt/matrix_mult_test.py | 119 ++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 benchmarks/operator_benchmark/pt/matrix_mult_test.py

diff --git a/benchmarks/operator_benchmark/pt/matrix_mult_test.py b/benchmarks/operator_benchmark/pt/matrix_mult_test.py
new file mode 100644
index 0000000000000..ad7d42318140d
--- /dev/null
+++ b/benchmarks/operator_benchmark/pt/matrix_mult_test.py
@@ -0,0 +1,119 @@
+import operator_benchmark as op_bench
+import torch
+
+"""
+Microbenchmarks for batch matrix mult with einsum and torch.bmm.
+"""
+
+batch_mm_configs_short = op_bench.config_list(
+    attr_names=["B", "M", "N", "K"],
+    attrs=[
+        [4, 5, 3, 2],
+        [32, 25, 20, 30],
+        [128, 100, 120, 110],
+    ],
+    cross_product_configs={
+        'device': ['cpu', 'cuda'],
+    },
+    tags=["short"],
+)
+
+batch_mm_configs_long = op_bench.config_list(
+    attr_names=["B", "M", "N", "K"],
+    attrs=[
+        [128, 256, 128, 256],
+        [512, 1024, 1024, 512],
+    ],
+    cross_product_configs={
+        'device': ['cpu', 'cuda'],
+    },
+    tags=["long"],
+)
+
+batch_mm_op_list = op_bench.op_list(
+    attr_names=['op_name', 'op_func'],
+    attrs=[
+        ['einsum_bmm', torch.einsum],
+        ['bmm', torch.bmm],
+    ],
+)
+
+class BatchMatrixMultBenchmark(op_bench.TorchBenchmarkBase):
+    def init(self, B, M, N, K, device, op_func):
+        self.inputs = {
+            "input_one": torch.rand(B, M, N, device=device),
+            "input_two": torch.rand(B, N, K, device=device)
+        }
+        self.op_func = op_func
+
+    def forward(self, input_one, input_two):
+        if self.op_func.__name__ == "einsum":
+            return torch.einsum('bij,bjk->bik', input_one, input_two)
+        else:
+            return torch.bmm(input_one, input_two)
+
+
+"""
+Microbenchmarks for element-wise matrix mult with einsum and torch.mul.
+"""
+
+batch_elementwise_configs_short = op_bench.config_list(
+    attr_names=["B", "M", "N"],
+    attrs=[
+        [4, 5, 3],
+        [32, 25, 20],
+        [100, 90, 110],
+    ],
+    cross_product_configs={
+        'device': ['cpu', 'cuda'],
+    },
+    tags=["short"],
+)
+
+
+batch_elementwise_configs_long = op_bench.cross_product_configs(
+    B=[128, 512, 1024],
+    M=[128, 512, 1024],
+    N=[128, 512, 1024],
+    device=['cpu', 'cuda'],
+    tags=['long']
+)
+
+batch_elementwise_op_list = op_bench.op_list(
+    attr_names=['op_name', 'op_func'],
+    attrs=[
+        ['einsum_elementwise', torch.einsum],
+        ['mul', torch.mul],
+    ],
+)
+
+class BatchElementWiseBenchmark(op_bench.TorchBenchmarkBase):
+    def init(self, B, M, N, device, op_func):
+        self.inputs = {
+            "input_one": torch.rand(B, M, N, device=device),
+            "input_two": torch.rand(B, M, N, device=device)
+        }
+        self.op_func = op_func
+
+    def forward(self, input_one, input_two):
+        if self.op_func.__name__ == "einsum":
+            return torch.einsum('bij,bij->bij', input_one, input_two)
+        else:
+            return torch.mul(input_one, input_two)
+
+
+op_bench.generate_pt_tests_from_op_list(
+    batch_mm_op_list,
+    batch_mm_configs_short + batch_mm_configs_long,
+    BatchMatrixMultBenchmark,
+)
+
+op_bench.generate_pt_tests_from_op_list(
+    batch_elementwise_op_list,
+    batch_elementwise_configs_short + batch_elementwise_configs_long,
+    BatchElementWiseBenchmark,
+)
+
+
+if __name__ == "__main__":
+    op_bench.benchmark_runner.main()

From 6fa646ad547f5ea9975f59cbece7e287959503fe Mon Sep 17 00:00:00 2001
From: Hao Lu <hlu@fb.com>
Date: Tue, 24 Aug 2021 17:06:18 -0700
Subject: [PATCH 189/530] [StaticRuntime] Fix bug in HasInplaceOp (#63842)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63842

Reviewed By: mikeiovine

Differential Revision: D30506914

fbshipit-source-id: b2e358cfb991dacdb295b61bbc37beb36b73b852
---
 benchmarks/static_runtime/test_scripts.h         | 16 ++++++++++++++++
 benchmarks/static_runtime/test_static_runtime.cc |  1 +
 torch/csrc/jit/runtime/static/passes.cpp         |  4 +++-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index c82dd57752bd6..90f93b20c94c0 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -138,6 +138,22 @@ const auto reshape_inplace_script = R"JIT(
       return (d, e, f)
 )JIT";
 
+const auto reshape_inplace_script_1 = R"JIT(
+  def forward(self, inp: Tensor, shape: List[int], flag: bool):
+    if flag:
+      a = inp + inp
+      b = a.reshape(shape)
+      c = b.sigmoid()
+    else:
+      a = inp * inp
+      b = a.sigmoid_()
+      c = b.reshape(shape)
+    d = c + c
+    e = a + a
+    f = b + b
+    return (d, e, f)
+)JIT";
+
 const auto sigmoid_inplace_script = R"JIT(
   def forward(self, inp: Tensor):
       a = torch.sigmoid(inp, out=inp).clone()
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 701231e7720d1..f6ec677bbb7bc 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -69,6 +69,7 @@ Node* getNodeWithKind(const StaticModule& smodule, const std::string& kind) {
 
 TEST(StaticRuntime, InPlace) {
   EXPECT_TRUE(testHasInplaceOp(reshape_inplace_script));
+  EXPECT_TRUE(testHasInplaceOp(reshape_inplace_script_1));
   EXPECT_TRUE(testHasInplaceOp(sigmoid_inplace_script));
   EXPECT_FALSE(testHasInplaceOp(sigmoid_out_script));
 }
diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp
index 2e9eb5746d276..c8e1107199528 100644
--- a/torch/csrc/jit/runtime/static/passes.cpp
+++ b/torch/csrc/jit/runtime/static/passes.cpp
@@ -12,7 +12,9 @@ namespace {
 bool HasInplaceOp(Block* block, const AliasDb& alias_db) {
   for (auto* node : block->nodes()) {
     for (Block* sub_block : node->blocks()) {
-      return HasInplaceOp(sub_block, alias_db);
+      if (HasInplaceOp(sub_block, alias_db)) {
+        return true;
+      }
     }
     auto inputs = node->inputs();
     // check if node modifies inputs (both inplace ops and certain out variants

From 5b28e3c18359ef863946f540717f80fd1dcaa193 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Tue, 24 Aug 2021 18:20:43 -0700
Subject: [PATCH 190/530] [quant][graphmode][fx] Add reference option support
 for binary ops (#62698)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62698

We also removed the special handling in match_utils for binary ops

Test Plan:
python test/test_quantize.py TestQuantizeFx
python test/test_quantize.py TestQuantizeFxOps

Imported from OSS

Reviewed By: vkuzo

Differential Revision: D30093781

fbshipit-source-id: 58cc972de8211a80dd4d111e25dc4ad36057933f
---
 test/quantization/fx/test_numeric_suite_fx.py |   4 +-
 torch/quantization/fx/convert.py              |  15 +-
 torch/quantization/fx/match_utils.py          |  67 ++------
 torch/quantization/fx/prepare.py              |   4 +-
 .../quantization/fx/quantization_patterns.py  | 145 +++++++++---------
 torch/quantization/ns/mappings.py             |   1 +
 6 files changed, 99 insertions(+), 137 deletions(-)

diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py
index d605eba34d922..61062fba781e5 100644
--- a/test/quantization/fx/test_numeric_suite_fx.py
+++ b/test/quantization/fx/test_numeric_suite_fx.py
@@ -646,7 +646,6 @@ def _op_is_unmatchable(op):
                 # these ops do not have quantized equivalents
                 ops_to_skip = [
                     torch.bmm,
-                    torch.sum,
                     torch.div,
                     torch.sub,
                     operator.truediv,
@@ -662,6 +661,9 @@ def _op_is_unmatchable(op):
                 # RNNDynamicQuantizeHandler
                 pass
             elif qhandler_cls == qp.DefaultNodeQuantizeHandler:
+                # torch.sum does not have quantized equivalents
+                if base_op == torch.sum:
+                    continue
                 self.assertTrue(
                     _op_in_base_sets_of_related_ops(base_op),
                     f"{base_op} not in sets of related ops")
diff --git a/torch/quantization/fx/convert.py b/torch/quantization/fx/convert.py
index 976ca0c6aeca7..671c2704d7da7 100644
--- a/torch/quantization/fx/convert.py
+++ b/torch/quantization/fx/convert.py
@@ -335,11 +335,18 @@ def node_arg_is_quantized(node_arg: Any) -> bool:
         else:
             return False
 
-    def is_output_quantized(node: Node, obj: QuantizeHandler, qconfig: QConfigAny, modules: Dict[str, torch.nn.Module]) -> bool:
+    def is_output_quantized(
+            node: Node, obj: QuantizeHandler, qconfig: QConfigAny,
+            modules: Dict[str, torch.nn.Module], is_reference=False) -> bool:
         """ Check if output node is quantized or not """
         assert modules is not None
-        # by default the output for a quantizable node is expected to be quantized
-        quantized = True
+        # for some ops the output is quantized only when `is_reference` is True
+        # and when `is_reference` is False, it has limited qconfig
+        # support, for example `add`
+        # ideally this check should not happen here, it should happen either in
+        # prepare or during lowering, we don't need this check
+        # after the default path is changed to produce reference patterns
+        quantized = obj.is_output_quantized(qconfig, is_reference)
 
         # Need to get correct quantized/non-quantized state forn the output
         # of FixedQParamsQuantizeHandler
@@ -454,7 +461,7 @@ def insert_quantize_node(node: Node, modules: Dict[str, torch.nn.Module]) -> Non
                     node, qconfig, modules, quantized_graph, node_name_to_scope, load_arg, is_reference=is_reference,
                     convert_custom_config_dict=convert_custom_config_dict)
                 if not is_observed_standalone_module_node:
-                    quantized = is_output_quantized(node, obj, qconfig, modules)
+                    quantized = is_output_quantized(node, obj, qconfig, modules, is_reference)
 
             if quantized:
                 env[node.name][activation_dtype(qconfig)] = result
diff --git a/torch/quantization/fx/match_utils.py b/torch/quantization/fx/match_utils.py
index dd8501c9b8bf1..4aa9275870c26 100644
--- a/torch/quantization/fx/match_utils.py
+++ b/torch/quantization/fx/match_utils.py
@@ -9,9 +9,6 @@
     QuantizeHandler,
     CustomModuleQuantizeHandler,
     StandaloneModuleQuantizeHandler,
-    BinaryOpQuantizeHandler,
-    binary_op_supported_dtypes,
-    binary_reference_op_supported_dtypes,
 )
 from ..qconfig import (
     QConfigAny,
@@ -19,7 +16,6 @@
 from .graph_module import (
     is_observed_standalone_module,
 )
-from ..utils import get_qconfig_dtypes
 
 from typing import Any, Dict, List, Callable, Optional, Tuple, Set
 
@@ -135,60 +131,15 @@ def record_match(pattern, node, matched):
         if node.name not in match_map and node.name not in all_matched:
             for pattern, value in patterns.items():
                 if is_match(modules, node, pattern):
-                    skip_this_match = False
-                    if value is BinaryOpQuantizeHandler:
-
-                        # to properly check for dtype support, we need to
-                        # navigate to the base node of an add-relu or mul-relu
-                        # pattern
-                        base_node = node
-                        if (
-                            (node.op == 'call_function' and
-                             node.target is torch.nn.functional.relu) or
-                            (node.op == 'call_module' and
-                             isinstance(modules[node.target], torch.nn.ReLU))
-                        ):
-                            base_node = node.args[0]
-
-                        this_node_qconfig = \
-                            qconfig_map[base_node.name]
-                        if this_node_qconfig:
-                            dtypes = get_qconfig_dtypes(this_node_qconfig)
-                            # TODO(future PR): update the pattern to quantize
-                            # handler logic to take this into account.
-
-
-                            # This needs to handle 3 cases
-                            # 1) op and dtype is in either [is_ref or non-ref] list -> don't skip
-                            # 2) op is not in either list (i.e. relu) -> don't skip
-                            # 3) op is in non-ref list, but not for dtype, and op+dtype not in is_ref list -> skip
-
-                            # note: the value of is_reference is unknown at prepare, so we have to cover both cases
-                            # handle is_reference = False
-                            skip_match_not_is_reference = (
-                                (base_node.target in binary_op_supported_dtypes) and
-                                (dtypes not in binary_op_supported_dtypes[base_node.target])
-                            )
-
-                            # handle is_reference = True
-                            supported_is_reference = (
-                                (base_node.target in binary_reference_op_supported_dtypes) and
-                                (dtypes in binary_reference_op_supported_dtypes[base_node.target])
-                            )
-
-                            # only skip if not reference says skip and is_reference doesn't support
-                            skip_this_match = skip_match_not_is_reference and not supported_is_reference
-
-                    if not skip_this_match:
-                        matched: List[Any] = []
-                        record_match(pattern, node, matched)
-                        for n in matched:
-                            match_map[n.name] = (
-                                node, matched, pattern, value(node, modules),  # type: ignore[operator]
-                                qconfig_map[n.name])
-                            all_matched.add(n.name)
-                        # break after finding the first match
-                        break
+                    matched: List[Any] = []
+                    record_match(pattern, node, matched)
+                    for n in matched:
+                        match_map[n.name] = (
+                            node, matched, pattern, value(node, modules),  # type: ignore[operator]
+                            qconfig_map[n.name])
+                        all_matched.add(n.name)
+                    # break after finding the first match
+                    break
 
     # add custom module instances to the match result
     assert modules is not None
diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py
index 29600b8797c52..86abac2d20991 100644
--- a/torch/quantization/fx/prepare.py
+++ b/torch/quantization/fx/prepare.py
@@ -87,7 +87,7 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 def is_activation_post_process_node(node: Node, modules: Dict[str, torch.nn.Module]) -> bool:
-    return node.op == "call_module" and \
+    return isinstance(node, torch.fx.Node) and node.op == "call_module" and \
         is_activation_post_process(modules[str(node.target)])
 
 def node_arg_is_weight(node: Node, arg: Any) -> bool:
@@ -772,6 +772,8 @@ def maybe_make_input_output_share_observers(
     # we need to navigate up to the first observer
     iteration_guard = 0
     while not is_activation_post_process_node(first_arg_arg, modules):
+        if not isinstance(first_arg_arg, Node):
+            return False
         # did not find an activation_post_process for the op
         if first_arg_arg.op == "placeholder":
             return False
diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index a68eea2bbf44c..1ce43cadc8e98 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -155,6 +155,15 @@ def get_activation_ctr(
         """
         return qconfig.activation
 
+    def is_output_quantized(self, qconfig, is_reference):
+        """ Returns true if the output node of convert is quantized
+        when is_reference is False, we would return float node when a certain dtype
+        combination is not supported (since fbgemm/qnnpack only support certain dtype
+        combinations), so the output may be float, but when is_reference is True,
+        we support all dtype combinations so the output will always be quantized.
+        """
+        return True
+
 
     @abstractmethod
     def convert(self,
@@ -180,34 +189,52 @@ def convert(self,
 
 # tuple (activation_dtype, weight_dtype, compute_dtype)
 # these are supported types for common binary ops like add/mul etc.
-binary_op_all_dtypes = [
+all_dtypes = [
     (torch.quint8, torch.qint8, None),
     (torch.float16, torch.float16, None),
 ]
-binary_op_float16_dtypes = [
+fp16_dtypes = [
     (torch.float16, torch.float16, None)
 ]
-binary_op_int8_dtypes = [
+int8_dtypes = [
     (torch.quint8, torch.qint8, None),
 ]
 binary_op_supported_dtypes : Dict[Union[Callable, str], List[Tuple[torch.dtype, torch.dtype, None]]] = {
-    operator.add: binary_op_all_dtypes,
-    torch.add: binary_op_all_dtypes,
-    operator.mul: binary_op_all_dtypes,
-    torch.mul: binary_op_all_dtypes,
-    torch.bmm: binary_op_float16_dtypes,
-    torch.sub: binary_op_float16_dtypes,
-    operator.sub: binary_op_float16_dtypes,
-    torch.div: binary_op_float16_dtypes,
-    operator.truediv: binary_op_float16_dtypes,
-    torch.sum: binary_op_float16_dtypes
+    operator.add: all_dtypes,
+    torch.add: all_dtypes,
+    operator.mul: all_dtypes,
+    torch.mul: all_dtypes,
+    torch.bmm: fp16_dtypes,
+    torch.sub: fp16_dtypes,
+    operator.sub: fp16_dtypes,
+    torch.div: fp16_dtypes,
+    operator.truediv: fp16_dtypes,
 }
-binary_reference_op_supported_dtypes : Dict[Union[Callable, str], List[Tuple[torch.dtype, torch.dtype, None]]] = {
-    torch.bmm: binary_op_int8_dtypes,
-    operator.add: binary_op_int8_dtypes,
-    torch.add: binary_op_int8_dtypes,
-    operator.mul: binary_op_int8_dtypes,
-    torch.mul: binary_op_int8_dtypes,
+
+default_op_supported_dtypes = {
+    torch.nn.ConvTranspose1d: int8_dtypes,
+    torch.nn.ConvTranspose2d: int8_dtypes,
+    torch.nn.ELU: int8_dtypes,
+    torch.nn.LeakyReLU: int8_dtypes,
+    torch.nn.Hardswish: int8_dtypes,
+    torch.nn.InstanceNorm1d: int8_dtypes,
+    torch.nn.InstanceNorm2d: int8_dtypes,
+    torch.nn.InstanceNorm3d: int8_dtypes,
+    torch.nn.LayerNorm: all_dtypes,
+    torch.nn.SiLU: fp16_dtypes,
+    torch.nn.Mish: fp16_dtypes,
+    torch.nn.GELU: int8_dtypes,
+    torch.nn.Softmax: int8_dtypes,
+    torch.nn.functional.elu: int8_dtypes,
+    torch.nn.functional.hardswish: int8_dtypes,
+    torch.nn.functional.instance_norm: int8_dtypes,
+    torch.nn.functional.layer_norm: all_dtypes,
+    torch.nn.functional.leaky_relu: int8_dtypes,
+    torch.nn.functional.silu: fp16_dtypes,
+    torch.nn.functional.mish: fp16_dtypes,
+    torch.nn.functional.gelu: int8_dtypes,
+    torch.nn.functional.softmax: int8_dtypes,
+    torch.sum: fp16_dtypes,
 }
 
 QAT_CONV_MODULE_CLASSES = \
@@ -266,7 +293,6 @@ def _get_name():
 @register_quant_pattern(torch.sub)
 @register_quant_pattern(torch.mul)
 @register_quant_pattern(torch.div)
-@register_quant_pattern(torch.sum)
 @register_quant_pattern(torch.bmm)
 @register_quant_pattern((torch.nn.ReLU, operator.add))
 @register_quant_pattern((torch.nn.ReLU, operator.mul))
@@ -344,6 +370,13 @@ def input_output_observed(self):
         # for x + y where x and y are scalars, we do not observe anything
         return self.num_tensor_args > 0
 
+    def is_output_quantized(self, qconfig, is_reference):
+        dtypes = get_qconfig_dtypes(qconfig)
+        if not is_reference:
+            return self.binary_op in binary_op_supported_dtypes and \
+                dtypes in binary_op_supported_dtypes[self.binary_op]
+        return True
+
     def convert(self,
                 node: Node,
                 qconfig: QConfigAny,
@@ -361,11 +394,14 @@ def convert(self,
 
         dtypes = get_qconfig_dtypes(qconfig)
 
-        if is_reference and self.binary_op in binary_reference_op_supported_dtypes and \
-                dtypes in binary_reference_op_supported_dtypes[self.binary_op]:
-            if dtypes in binary_op_int8_dtypes:
-                # make sure both inputs are quantized to torch.quint8
-                load_arg(quantized={0: torch.quint8, 1: torch.quint8})(self.binary_op_node.args)
+        if is_reference:
+            act_dtype = activation_dtype(qconfig)
+            if act_dtype == torch.float:
+                return quantized_graph.node_copy(node, load_arg(quantized=torch.float))
+            else:
+                if self.num_tensor_args == 2:
+                    # make sure both inputs are quantized to act_dtype
+                    load_arg(quantized={0: act_dtype, 1: act_dtype})(self.binary_op_node.args)
                 args = load_arg(quantized=torch.float)(self.binary_op_node.args)
                 kwargs = load_arg(quantized=torch.float)(self.binary_op_node.kwargs)
                 op_out = quantized_graph.node_copy(self.binary_op_node, load_arg(quantized=torch.float))
@@ -384,12 +420,6 @@ def modified_load_arg(n: Node):
                 return quantize_node(
                     op_out, activation_post_process,
                     node, modules, quantized_graph, node_name_to_scope, is_input=False)
-            else:
-                warnings.warn(
-                    "No implementation found for dtype combination: {}"
-                    "for op {} with is_reference={} despite it being listed as supported"
-                    "this should not happen".format(dtypes, self.binary_op, is_reference))
-                return quantized_graph.node_copy(node, load_arg(quantized=torch.float))
         elif not is_reference and self.binary_op in binary_op_supported_dtypes and \
                 dtypes in binary_op_supported_dtypes[self.binary_op]:
             if dtypes in [(torch.quint8, torch.qint8, None)]:
@@ -445,15 +475,10 @@ def modified_load_arg(n: Node):
                 "dtype combination: {} is not "
                 "supported by {} for is_reference={}. "
                 "Supported non-reference dtype combinations are: {} "
-                "Supported reference dtype combinations are: {}"
                 "".format(dtypes,
                           self.binary_op,
                           is_reference,
-                          binary_op_supported_dtypes[self.binary_op],
-                          (
-                              [] if self.binary_op not in binary_reference_op_supported_dtypes.keys()
-                              else binary_reference_op_supported_dtypes[self.binary_op]
-                          )
+                          binary_op_supported_dtypes[self.binary_op]
                           )
             )
             if self.relu_node:
@@ -1226,6 +1251,7 @@ def convert(self,
 # until they receive a proper fp16 kernel. To use the reference pattern, use a custom qconfig
 # @register_quant_pattern(torch.nn.functional.gelu)
 # @register_quant_pattern(torch.nn.functional.softmax)
+@register_quant_pattern(torch.sum)
 class DefaultNodeQuantizeHandler(QuantizeHandler):
     """ Common quantized op, first input and first output will be quantized
     """
@@ -1239,6 +1265,13 @@ def __init__(
         elif node.op == "call_module":
             self.op = type(modules[str(node.target)])
 
+    def is_output_quantized(self, qconfig, is_reference):
+        dtypes = get_qconfig_dtypes(qconfig)
+        if not is_reference:
+            return self.op in default_op_supported_dtypes and \
+                dtypes in default_op_supported_dtypes[self.op]
+        return True
+
     def convert(self,
                 node: Node,
                 qconfig: QConfigAny,
@@ -1256,46 +1289,12 @@ def convert(self,
             convert_custom_config_dict = {}
         additional_static_quant_mapping = convert_custom_config_dict.get("static", {})
 
-        all_dtypes = [
-            (torch.quint8, torch.qint8, None),
-            (torch.float16, torch.float16, None)
-        ]
-        int8_dtypes = [
-            (torch.quint8, torch.qint8, None)
-        ]
-        fp16_dtypes = [
-            (torch.float16, torch.float16, None)
-        ]
-        supported_dtypes = {
-            torch.nn.ConvTranspose1d: int8_dtypes,
-            torch.nn.ConvTranspose2d: int8_dtypes,
-            torch.nn.ELU: int8_dtypes,
-            torch.nn.LeakyReLU: int8_dtypes,
-            torch.nn.Hardswish: int8_dtypes,
-            torch.nn.InstanceNorm1d: int8_dtypes,
-            torch.nn.InstanceNorm2d: int8_dtypes,
-            torch.nn.InstanceNorm3d: int8_dtypes,
-            torch.nn.LayerNorm: all_dtypes,
-            torch.nn.SiLU: fp16_dtypes,
-            torch.nn.Mish: fp16_dtypes,
-            torch.nn.GELU: int8_dtypes,
-            torch.nn.Softmax: int8_dtypes,
-            torch.nn.functional.elu: int8_dtypes,
-            torch.nn.functional.hardswish: int8_dtypes,
-            torch.nn.functional.instance_norm: int8_dtypes,
-            torch.nn.functional.layer_norm: all_dtypes,
-            torch.nn.functional.leaky_relu: int8_dtypes,
-            torch.nn.functional.silu: fp16_dtypes,
-            torch.nn.functional.mish: fp16_dtypes,
-            torch.nn.functional.gelu: int8_dtypes,
-            torch.nn.functional.softmax: int8_dtypes,
-        }
         dtypes = get_qconfig_dtypes(qconfig)
-        if not is_reference and dtypes not in supported_dtypes[self.op]:
+        if not is_reference and dtypes not in default_op_supported_dtypes[self.op]:
             warnings.warn(
                 "dtype combination: {} is not "
                 "supported by {} "
-                "supported dtype combinations are: {}".format(dtypes, self.op, supported_dtypes[self.op]))
+                "supported dtype combinations are: {}".format(dtypes, self.op, default_op_supported_dtypes[self.op]))
             return quantized_graph.node_copy(node, load_arg(quantized=torch.float))
         # TODO: make helper functions for (torch.quint8, torch.qint8, None)
         if not is_reference:
diff --git a/torch/quantization/ns/mappings.py b/torch/quantization/ns/mappings.py
index 2a7c859347f3d..399ddca22668e 100644
--- a/torch/quantization/ns/mappings.py
+++ b/torch/quantization/ns/mappings.py
@@ -419,6 +419,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
         # uncomment below
         # operator.add,
         # operator.mul,
+        torch.sum,
     ])
 
     FUNS_IO_TYPE_FP16: Set[NSNodeTargetType] = set()

From d454c9e76e19f51ad95644509ae497f49bcd1924 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Tue, 24 Aug 2021 18:48:25 -0700
Subject: [PATCH 191/530] Migrate THCTensor_copyIgnoringOverlaps to ATen
 (#63505)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63505

This isn't a public operator, just a helper function used in CUDA_tensor_apply.

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D30441305

Pulled By: ngimel

fbshipit-source-id: 84fabc701cbd8479e02d80f373a3dd62d70df2ce
---
 aten/src/ATen/LegacyTHFunctionsCUDA.h         |  26 -----
 aten/src/ATen/cuda/CUDAApplyUtils.cuh         |  18 +--
 aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp  | 104 ------------------
 aten/src/ATen/native/Copy.cpp                 |  16 +++
 aten/src/ATen/native/Copy.h                   |   2 +
 .../ATen/native/cuda/DistributionBernoulli.cu |   1 -
 .../native/cuda/DistributionCauchyKernel.cu   |   1 -
 .../cuda/DistributionExponentialKernel.cu     |   1 -
 .../cuda/DistributionGeometricKernel.cu       |   1 -
 .../cuda/DistributionLogNormalKernel.cu       |   1 -
 .../ATen/native/cuda/DistributionNormal.cu    |   1 -
 .../native/cuda/DistributionRandomKernel.cu   |   1 -
 aten/src/ATen/native/cuda/Distributions.cu    |   1 -
 .../src/ATen/native/cuda/MultinomialKernel.cu |   1 -
 aten/src/ATen/native/cuda/Sort.cu             |   1 -
 aten/src/ATen/native/cuda/TensorTopK.cu       |   1 -
 .../src/ATen/native/cuda/UpSampleNearest3d.cu |   1 -
 .../ATen/templates/RegisterDispatchKey.cpp    |   1 -
 tools/codegen/gen.py                          |   3 -
 tools/codegen/gen_backend_stubs.py            |   1 -
 20 files changed, 23 insertions(+), 160 deletions(-)
 delete mode 100644 aten/src/ATen/LegacyTHFunctionsCUDA.h
 delete mode 100644 aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp

diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h
deleted file mode 100644
index 41cbdd6f4ffe1..0000000000000
--- a/aten/src/ATen/LegacyTHFunctionsCUDA.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-
-#include <ATen/Context.h>
-#include <c10/core/ScalarType.h>
-#include <c10/core/TensorOptions.h>
-
-namespace c10 {
-class Scalar;
-}
-namespace at {
-struct Generator;
-class Tensor;
-struct Type;
-} // namespace at
-
-namespace at {
-namespace native {
-namespace legacy {
-namespace cuda {
-
-Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src);
-
-} // namespace th
-} // namespace legacy
-} // namespace native
-} // namespace at
diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh
index 2617870eea519..2b1538ec15ade 100644
--- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh
+++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh
@@ -5,7 +5,7 @@
 #include <THC/THCAtomics.cuh>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/macros/Macros.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
+#include <ATen/native/Copy.h>
 
 #include <math.h>
 
@@ -453,13 +453,11 @@ inline bool CUDA_tensor_apply2(at::Tensor a,
 
   if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) {
     // Must perform in contiguous space
-    oldA = a;
-    a = a.contiguous();
+    oldA = std::exchange(a, a.contiguous());
   }
   if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) {
     // Must perform in contiguous space
-    oldB = b;
-    b = b.contiguous();
+    oldB = std::exchange(b, b.contiguous());
   }
 
   // It is possible that the tensor dimensions are able to be collapsed,
@@ -547,17 +545,11 @@ inline bool CUDA_tensor_apply2(at::Tensor a,
 #undef HANDLE_A_CASE
 
   if (oldA.defined()) {
-    // Ignore overlaps when copying back; if we use copy
-    // instead, it will recursively try and invoke ourselves to make
-    // oldA contiguous.
-    at::native::legacy::cuda::_th_copy_ignoring_overlaps_(oldA, a);
+    at::native::copy_ignoring_overlaps(oldA, a);
   }
 
   if (oldB.defined()) {
-    // Ignore overlaps when copying back; if we use copy
-    // instead, it will recursively try and invoke ourselves to make
-    // oldB contiguous.
-    at::native::legacy::cuda::_th_copy_ignoring_overlaps_(oldB, b);
+    at::native::copy_ignoring_overlaps(oldB, b);
   }
 
   return true;
diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
deleted file mode 100644
index c4e9dfe78cebe..0000000000000
--- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-#include <ATen/LegacyTHFunctionsCUDA.h>
-
-#include <ATen/ATen.h>
-#include <ATen/Utils.h>
-#include <ATen/NamedTensorUtils.h>
-#include <ATen/CUDAGeneratorImpl.h>
-#include <ATen/ExpandUtils.h>
-#include <THC/THC.h>
-#include <THC/THCTensor.hpp>
-#undef THNN_
-#undef THCIndexTensor_
-#include <ATen/DeviceGuard.h>
-#include <ATen/cuda/ATenCUDAGeneral.h>
-#include <ATen/cuda/CUDADevice.h>
-#include <ATen/cuda/CUDAContext.h>
-
-namespace at {
-namespace native {
-namespace legacy {
-namespace cuda {
-
-namespace {
-  ScalarType infer_scalar_type(const Tensor & t) {
-    return t.scalar_type();
-  }
-  ScalarType infer_scalar_type(const TensorList & tl) {
-    TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors");
-    return tl[0].scalar_type();
-  }
-
-  TensorOptions options(ScalarType s) {
-    return TensorOptions().dtype(s)
-                          .device(DeviceType::CUDA)
-                          .layout(kStrided);
-  }
-
-  Allocator* allocator() {
-    return at::cuda::getCUDADeviceAllocator();
-  }
-}
-
-Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src) {
-    // DeviceGuard omitted
-    auto dispatch_scalar_type = infer_scalar_type(self);
-
-    switch (dispatch_scalar_type) {
-        case ScalarType::Byte: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaByteTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_);
-            break;
-        }
-        case ScalarType::Char: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaCharTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_);
-            break;
-        }
-        case ScalarType::Double: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaDoubleTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_);
-            break;
-        }
-        case ScalarType::Float: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_);
-            break;
-        }
-        case ScalarType::Int: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaIntTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_);
-            break;
-        }
-        case ScalarType::Long: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaLongTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_);
-            break;
-        }
-        case ScalarType::Short: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaShortTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_);
-            break;
-        }
-        case ScalarType::Half: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            auto src_ = checked_dense_tensor_unwrap(src, "src", 2, "_th_copy_ignoring_overlaps_", false, DeviceType::CUDA, dispatch_scalar_type);
-            THCudaHalfTensor_copyIgnoringOverlaps(globalContext().getTHCState(), self_, src_);
-            break;
-        }
-        default:
-            AT_ERROR("_th_copy_ignoring_overlaps_ not supported on CUDAType for ", dispatch_scalar_type);
-    }
-    return self;
-}
-
-} // namespace th
-} // namespace legacy
-} // namespace native
-} // namespace at
diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp
index 1b8538ec07601..7fa952d020ef9 100644
--- a/aten/src/ATen/native/Copy.cpp
+++ b/aten/src/ATen/native/Copy.cpp
@@ -253,6 +253,22 @@ Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) {
   return self;
 }
 
+void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src) {
+  // Called when we are copying into an overlapping index `dst`, but we don't
+  // care which writer wins. Hacky but it works. This is only used by
+  // CUDA_tensor_apply2 in case that there are write overlaps.
+  // FIXME: really, overlapping writes should be illegal/an error in Torch
+  auto iter = TensorIteratorConfig()
+      .add_output(dst)
+      .add_input(src)
+      .resize_outputs(false)
+      .set_check_mem_overlap(false)
+      .check_all_same_dtype(true)
+      .check_all_same_device(true)
+      .build();
+  copy_stub(iter.device_type(), iter, /*non_blocking=*/false);
+}
+
 DEFINE_DISPATCH(copy_stub);
 
 } // namespace native
diff --git a/aten/src/ATen/native/Copy.h b/aten/src/ATen/native/Copy.h
index 2dfd9e9f4922b..938466102b469 100644
--- a/aten/src/ATen/native/Copy.h
+++ b/aten/src/ATen/native/Copy.h
@@ -13,5 +13,7 @@ using copy_fn = void (*)(TensorIterator&, bool non_blocking);
 
 DECLARE_DISPATCH(copy_fn, copy_stub);
 
+TORCH_API void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src);
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu
index 3acf87c3c4b40..0baaf2e049b04 100644
--- a/aten/src/ATen/native/cuda/DistributionBernoulli.cu
+++ b/aten/src/ATen/native/cuda/DistributionBernoulli.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>
diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu
index 35a1e6ef5a98c..6f43ee664cb2c 100644
--- a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>
diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu
index b4cf288bcb7b8..6e1823032a789 100644
--- a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>
diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu
index eb71ab3231f12..9086e2a35c8d3 100644
--- a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>
diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu
index 89b9c04b3a687..9497cf83cc405 100644
--- a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>
diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu
index da647277c1762..32d223c5d0a93 100644
--- a/aten/src/ATen/native/cuda/DistributionNormal.cu
+++ b/aten/src/ATen/native/cuda/DistributionNormal.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>
diff --git a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu
index 8d6614b9010d8..57d0701329d91 100644
--- a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>
diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu
index cf1281d320b14..a48a3778305ab 100644
--- a/aten/src/ATen/native/cuda/Distributions.cu
+++ b/aten/src/ATen/native/cuda/Distributions.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>
diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu
index 3912af58e1d99..65c45e7027964 100644
--- a/aten/src/ATen/native/cuda/MultinomialKernel.cu
+++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu
@@ -1,6 +1,5 @@
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/CUDAFunctions.h>
 #include <ATen/cuda/CUDAContext.h>
diff --git a/aten/src/ATen/native/cuda/Sort.cu b/aten/src/ATen/native/cuda/Sort.cu
index f53f7b478dadf..83fce65d33b6c 100644
--- a/aten/src/ATen/native/cuda/Sort.cu
+++ b/aten/src/ATen/native/cuda/Sort.cu
@@ -3,7 +3,6 @@
 #include <ATen/ATen.h>
 #include <ATen/MemoryOverlap.h>
 #include <ATen/WrapDimUtils.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 #include <ATen/core/Array.h>
 #include <ATen/cuda/cub.cuh>
 #include <ATen/cuda/CUDAContext.h>
diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu
index c0bc353110b6f..121208dd58dc2 100644
--- a/aten/src/ATen/native/cuda/TensorTopK.cu
+++ b/aten/src/ATen/native/cuda/TensorTopK.cu
@@ -1,7 +1,6 @@
 #include <ATen/ATen.h>
 #include <ATen/cuda/detail/TensorInfo.cuh>
 #include <ATen/cuda/detail/OffsetCalculator.cuh>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/cuda/SortingCommon.cuh>
 #include <ATen/native/cuda/SortingRadixSelect.cuh>
diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu
index 522225b5fd85a..6270bba9eafee 100644
--- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu
+++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu
@@ -1,6 +1,5 @@
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/Utils.h>
diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp
index c702a68063c31..1abc3ee391ae2 100644
--- a/aten/src/ATen/templates/RegisterDispatchKey.cpp
+++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp
@@ -33,7 +33,6 @@
 #include <ATen/core/op_registration/adaption.h>
 #include <torch/library.h>
 $extra_cuda_headers
-$legacy_th_headers
 $external_backend_headers
 $namespaced_headers
 
diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py
index c0ce886c3d50a..44bb3b4f87e1d 100644
--- a/tools/codegen/gen.py
+++ b/tools/codegen/gen.py
@@ -1096,9 +1096,6 @@ def make_file_manager(install_dir: str) -> FileManager:
 
         fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: {
             'extra_cuda_headers': extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else '',
-            'legacy_th_headers':
-                '#include <ATen/LegacyTHFunctionsCUDA.h>' if dispatch_key == DispatchKey.CUDA else
-                '',
             'external_backend_headers': '',
             'namespaced_headers': f'#include <ATen/{dispatch_key}Functions.h>' if dispatch_key in functions_keys else '',
             'DispatchKey': dispatch_key,
diff --git a/tools/codegen/gen_backend_stubs.py b/tools/codegen/gen_backend_stubs.py
index a712a239ad565..51f81c702e122 100644
--- a/tools/codegen/gen_backend_stubs.py
+++ b/tools/codegen/gen_backend_stubs.py
@@ -227,7 +227,6 @@ def make_file_manager(install_dir: str) -> FileManager:
         for dispatch_key in [backend_dispatch_key, autograd_dispatch_key]:
             fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: {
                 'extra_cuda_headers': '',
-                'legacy_th_headers': '',
                 'external_backend_headers': f'#include "{output_dir}/{backend_key}NativeFunctions.h"',
                 'namespaced_headers': '',
                 'DispatchKey': dispatch_key,

From 543130511a97a0aab57a6b2345156cc638b8cf7f Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Tue, 24 Aug 2021 18:52:29 -0700
Subject: [PATCH 192/530] [nnc] Disable erf and erfc (#63775)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63775

These introduce small accuracy differences that cause some internal
tests to fail, and it's not worth fixing the tests right now because they're
slower than the ATen ops anyways.
ghstack-source-id: 136526229

Test Plan:
```
buck test mode/dev //aml/eccv/mcm/training:tests -- --exact 'aml/eccv/mcm/training:tests - test_build_torch_script_model (aml.eccv.mcm.training.tests.publish_helper_tests.TransformerPredictorPublishHelperTests)'
```

Reviewed By: navahgar

Differential Revision: D30484557

fbshipit-source-id: 095a9c810539a499105b76e1d96843dbc61b0079
---
 test/test_jit_fuser_te.py                  |  3 +++
 torch/csrc/jit/passes/tensorexpr_fuser.cpp | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index 5e8204a4c7b14..f2dce12673d51 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -1274,8 +1274,11 @@ def apply(fn):
             lambda x: torch.threshold(x, 0, -10),
             lambda x: torch.clamp(x, -10, 10),
         ]
+        gpu_only = {torch.erf, torch.erfc}
         sizes = [(1,), (2,), (4, 4)]
         for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes):
+            if op in gpu_only and device == "cpu":
+                continue
             try:
                 x = self.data_for(dtype, device, size=size)
                 fn = apply(op)
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index d4add03506c4f..3f0cd14668169 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -948,6 +948,14 @@ class TensorExprFuser {
       "aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor",
       "aten::matmul(Tensor self, Tensor other) -> Tensor",
     };
+    static const OperatorSet gpu_only_operator_set{
+      // On CPU, these are slower and less accurate than ATen kernels, because
+      // ATen is able to use MKL-VML, whereas the fuser currently can't.  The
+      // fuser uses sleef instead because sleef provides functions that operate
+      // on vectors, instead of large buffers.
+      "aten::erf(Tensor self) -> Tensor",
+      "aten::erfc(Tensor self) -> Tensor",
+    };
     static const OperatorSet pow{
       "aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor",
     };
@@ -1026,6 +1034,17 @@ class TensorExprFuser {
       }
     }
 
+    // Operator is only supported on GPU.
+    if (node->isMemberOf(gpu_only_operator_set)) {
+      auto device = tensorexpr::pickDeviceType(node->inputs());
+      if (!device) {
+        device = tensorexpr::pickDeviceType(node->outputs());
+      }
+      if (!device || !device->is_cuda()) {
+        return false;
+      }
+    }
+
     if (node->kind() == aten::to) {
       // only support same-device conversion
       auto device = tensorexpr::pickDeviceType(node->inputs());

From 1787b905c4a571ff1ae09ddc56ce56cb04e52136 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Tue, 24 Aug 2021 18:52:29 -0700
Subject: [PATCH 193/530] Don't switch executors mid test (#63830)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63830

It's really not safe to change the executor out from under models that may have
already been partially compiled.
ghstack-source-id: 136526228

Test Plan:
```
DEBUG=1 CFLAGS="-fsanitize=address" CXXFLAGS="-fsanitize=address" USE_LLVM=$(realpath ../llvm-project/install) CMAKE_PREFIX_PATH=$CONDA_PREFIX python setup.py install
LD_PRELOAD=/lib64/libasan.so.5 numactl -C3 pytest -v --cov --cov-report xml:test/coverage.xml --cov-append onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset11 -s
```

Reviewed By: desertfire

Differential Revision: D30504489

fbshipit-source-id: 188581cb53f0cf5bd3442d1e9d46e8c0c7e124f8
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 9d56c1169dec2..ffeef00cc9ac6 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -531,10 +531,6 @@ def test_faster_rcnn(self):
                       dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5)
 
     def test_paste_mask_in_image(self):
-        # disable profiling
-        torch._C._jit_set_profiling_executor(False)
-        torch._C._jit_set_profiling_mode(False)
-
         masks = torch.rand(10, 1, 26, 26)
         boxes = torch.rand(10, 4)
         boxes[:, 2:] += torch.rand(10, 2)
@@ -582,10 +578,6 @@ def test_mask_rcnn(self):
                                     "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5)
 
     def test_heatmaps_to_keypoints(self):
-        # disable profiling
-        torch._C._jit_set_profiling_executor(False)
-        torch._C._jit_set_profiling_mode(False)
-
         maps = torch.rand(10, 1, 26, 26)
         rois = torch.rand(10, 4)
         from torchvision.models.detection.roi_heads import heatmaps_to_keypoints

From 8dda299d9631e0f6e121dcb9f8f94bbdd8435515 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Tue, 24 Aug 2021 18:52:29 -0700
Subject: [PATCH 194/530] Re-apply: [nnc] Support thread level parallelism in
 fused kernels (#63776)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63776

I reverted this out of an abundance of caution because some test
failures occurred, but they were all due to precision issues fixed lower in
this stack.  Let's try again.

I've rolled the elimination of the allow-parallelism-in-fusions toggle into
this diff since they're pretty tightly coupled.
ghstack-source-id: 136529847

Test Plan: CI

Reviewed By: huiguoo

Differential Revision: D30484555

fbshipit-source-id: 38fd33520f710585d1130c365a8c60c9ce794a59
---
 test/cpp/tensorexpr/test_kernel.cpp        | 28 +++++++
 test/cpp/tensorexpr/test_te_fuser_pass.cpp |  6 +-
 test/jit/test_profiler.py                  |  3 -
 test/test_jit_fuser_te.py                  |  5 --
 test/test_tensorexpr.py                    |  4 -
 torch/csrc/jit/passes/tensorexpr_fuser.cpp | 19 +----
 torch/csrc/jit/passes/tensorexpr_fuser.h   |  2 -
 torch/csrc/jit/python/init.cpp             |  2 -
 torch/csrc/jit/tensorexpr/kernel.cpp       | 87 ++++++++++++++++++++++
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 24 ++++--
 torch/csrc/jit/tensorexpr/llvm_jit.h       |  8 +-
 torch/csrc/jit/tensorexpr/loopnest.cpp     |  7 ++
 12 files changed, 148 insertions(+), 47 deletions(-)

diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp
index e14282f258893..8cdf2ef90df11 100644
--- a/test/cpp/tensorexpr/test_kernel.cpp
+++ b/test/cpp/tensorexpr/test_kernel.cpp
@@ -198,6 +198,34 @@ TEST_F(Kernel, _3) {
   }
 }
 
+TEST_F(Kernel, ParallelStrided) {
+  const auto graph_string = R"IR(
+      graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu),
+            %1 : Float(5, 3, 40005, strides=[960120, 160020, 2], device=cpu)):
+        %2 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %1)
+        %3 : Float(5, 3, 40005, strides=[120015, 40005, 1]) = aten::mul(%0, %2)
+        return (%3))IR";
+  auto graph = std::make_shared<Graph>();
+  parseIR(graph_string, &*graph);
+
+  auto a = at::rand({5, 3, 40005}, TensorOptions(kCPU).dtype(at::kFloat));
+  auto b = at::rand({10, 6, 80010}, TensorOptions(kCPU).dtype(at::kFloat))
+               .index(
+                   {Slice(None, None, 2),
+                    Slice(None, None, 2),
+                    Slice(None, None, 2)});
+  auto ref = a * (a * b);
+  auto o = at::zeros_like(ref);
+  TensorExprKernel k(graph);
+  std::vector<at::Tensor> inputs = {a, b};
+  std::vector<IValue> stack = fmap<IValue>(inputs);
+  k.run(stack);
+  o = stack[0].toTensor();
+  for (size_t i = 0; i < 5 * 3; i++) {
+    CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]);
+  }
+}
+
 TEST_F(Kernel, DISABLED_Shape_Inference) {
   // disabled: doesn't do stride propagation, and isn't being used currently
 
diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp
index 723a8fef81bea..b82d383bc99b0 100644
--- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp
+++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp
@@ -14,19 +14,15 @@ namespace jit {
 using namespace torch::jit::tensorexpr;
 
 struct WithCPUFuser {
-  WithCPUFuser(bool val = true)
-      : cpuFuserEnabled(canFuseOnCPU()), parallel(texprParallelCPUEnabled()) {
+  WithCPUFuser(bool val = true) : cpuFuserEnabled(canFuseOnCPU()) {
     overrideCanFuseOnCPU(val);
-    setTexprParallelCPUEnabled(true);
   }
 
   ~WithCPUFuser() {
     overrideCanFuseOnCPU(cpuFuserEnabled);
-    setTexprParallelCPUEnabled(parallel);
   }
 
   bool cpuFuserEnabled;
-  bool parallel;
 };
 
 TEST(TEFuserPass, FuserPass_1) {
diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py
index aa8be0518385f..b9ed9d0b78eb5 100644
--- a/test/jit/test_profiler.py
+++ b/test/jit/test_profiler.py
@@ -29,8 +29,6 @@ def setUp(self):
         torch._C._debug_set_fusion_group_inlining(False)
         self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu()
         torch._C._jit_set_te_must_use_llvm_cpu(False)
-        self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled()
-        torch._C._jit_set_texpr_parallel_cpu_enabled(True)
 
     def tearDown(self):
         torch._C._jit_set_profiling_executor(self.prev_exec)
@@ -42,7 +40,6 @@ def tearDown(self):
         torch._C._jit_set_texpr_reductions_enabled(self.old_reduction_enabled)
         torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining)
         torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu)
-        torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel)
 
     def test_tensor_type_not_determined_by_inputs(self):
         @torch.jit.script
diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index f2dce12673d51..014f142cf1443 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -85,10 +85,6 @@ def setUp(self):
         self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu()
         torch._C._jit_set_te_must_use_llvm_cpu(False)
 
-        # TODO: CPU fuser currently is disabled when multithreading.
-        self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled()
-        torch._C._jit_set_texpr_parallel_cpu_enabled(True)
-
         self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
         self.int_dtypes = [
             torch.int8,
@@ -116,7 +112,6 @@ def tearDown(self):
 
         torch._C._jit_set_texpr_fuser_enabled(self.texpr_fuser_state)
         torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu)
-        torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel)
 
     def assertLastGraphAllFused(self):
         self.assertAllFused(torch.jit.last_executed_optimized_graph())
diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py
index 6353113a1ec4c..47c7e689aa6a4 100644
--- a/test/test_tensorexpr.py
+++ b/test/test_tensorexpr.py
@@ -24,9 +24,6 @@ def setUp(self):
         torch._C._debug_set_fusion_group_inlining(False)
         self.old_te_must_use_llvm_cpu = torch._C._jit_get_te_must_use_llvm_cpu()
         torch._C._jit_set_te_must_use_llvm_cpu(False)
-        # TODO: CPU fuser currently is disabled when multithreading.
-        self.old_fuse_parallel = torch._C._jit_texpr_parallel_cpu_enabled()
-        torch._C._jit_set_texpr_parallel_cpu_enabled(True)
 
         self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
 
@@ -39,7 +36,6 @@ def tearDown(self):
         torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuser_state)
         torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining)
         torch._C._jit_set_te_must_use_llvm_cpu(self.old_te_must_use_llvm_cpu)
-        torch._C._jit_set_texpr_parallel_cpu_enabled(self.old_fuse_parallel)
 
     def assertLastGraphAllFused(self):
         self.assertAllFused(torch.jit.last_executed_optimized_graph())
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index 3f0cd14668169..085291afbdcf8 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -1,6 +1,5 @@
 #include <torch/csrc/jit/passes/tensorexpr_fuser.h>
 
-#include <ATen/Parallel.h>
 #include <ATen/core/interned_strings.h>
 #include <ATen/record_function.h>
 #include <c10/util/FunctionRef.h>
@@ -250,15 +249,6 @@ bool isSupported(Node* node) {
 } // namespace tensorexpr
 
 static bool texpr_fuser_enabled_ = true;
-static bool texpr_parallel_cpu_enabled = false;
-
-bool texprParallelCPUEnabled() {
-  return texpr_parallel_cpu_enabled;
-}
-
-void setTexprParallelCPUEnabled(bool val) {
-  texpr_parallel_cpu_enabled = val;
-}
 
 void setTensorExprFuserEnabled(bool val) {
   texpr_fuser_enabled_ = val;
@@ -898,14 +888,7 @@ class TensorExprFuser {
       return false;
     }
     if (device->is_cpu()) {
-      // CPU fusion is only supported for single-thread.
-      if (!canFuseOnCPU()) {
-        return false;
-      }
-      if (at::get_num_threads() == 1 || texprParallelCPUEnabled()) {
-        return true;
-      }
-      return false;
+      return canFuseOnCPU();
     } else if (device->is_cuda()) {
       return canFuseOnGPU();
     } else if (device->is_xpu()) {
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.h b/torch/csrc/jit/passes/tensorexpr_fuser.h
index 3f6538b7e587a..254aebd91d12f 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.h
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.h
@@ -24,8 +24,6 @@ TORCH_API void setTensorExprFuserEnabled(bool val);
 TORCH_API bool tensorExprFuserEnabled();
 TORCH_API bool setTexprReductionsEnabled(bool value);
 TORCH_API bool texprReductionsEnabled();
-TORCH_API bool texprParallelCPUEnabled();
-TORCH_API void setTexprParallelCPUEnabled(bool val);
 
 TORCH_API void RemoveProfileNodesAndSpecializeTypes(
     std::shared_ptr<Graph>& graph);
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index baea47d63ed18..645fea2274fb2 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -714,8 +714,6 @@ void initJITBindings(PyObject* module) {
       .def("_jit_texpr_set_fallback_allowed", &tensorexpr::setFallbackAllowed)
       .def("_jit_set_texpr_reductions_enabled", &setTexprReductionsEnabled)
       .def("_jit_texpr_reductions_enabled", &texprReductionsEnabled)
-      .def("_jit_set_texpr_parallel_cpu_enabled", &setTexprParallelCPUEnabled)
-      .def("_jit_texpr_parallel_cpu_enabled", &texprParallelCPUEnabled)
       .def(
           "_jit_set_te_generate_block_code",
           [](bool gen_block_code) {
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index fed5e1e139d3d..d53e857d75a48 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -2,6 +2,7 @@
 #include <torch/csrc/jit/tensorexpr/kernel.h>
 
 #include <ATen/ExpandUtils.h>
+#include <ATen/Parallel.h>
 #include <ATen/TensorGeometry.h>
 #include <c10/util/irange.h>
 #include <c10/util/string_utils.h>
@@ -2487,6 +2488,86 @@ void fuseAllLoops(StmtPtr st) {
   }
 }
 
+// Compute the trip count of a loop if it is a constant.
+c10::optional<int64_t> tripCount(ForPtr loop) {
+  auto tc = IRSimplifier::simplify(
+      cast<int64_t>(ExprHandle(loop->stop()) - ExprHandle(loop->start())));
+  if (auto val = to<LongImm>(tc.node())) {
+    return val->value();
+  }
+  return c10::nullopt;
+}
+
+// Prune innermost loops until iterations satisfies a minimum grain size.
+static void pruneByGrainSize(std::vector<ForPtr>& loops) {
+  constexpr int64_t minGrainSize = 32768;
+  int64_t grainSize = 1;
+  for (int64_t i = loops.size(); i > 0; i--) {
+    auto tc = tripCount(loops[i - 1]);
+    if (!tc) {
+      break;
+    }
+    grainSize *= *tc;
+    if (grainSize < minGrainSize) {
+      loops.pop_back();
+    }
+  }
+}
+
+// Retain enough outermost loops to fill the number of threads.
+static void pruneByThreadCount(std::vector<ForPtr>& loops) {
+  int64_t trips = 1;
+  auto threads = at::get_num_threads();
+  auto it = loops.begin();
+  for (; it != loops.end(); it++) {
+    if (trips >= threads) {
+      break;
+    }
+    auto tc = tripCount(*it);
+    if (!tc) {
+      break;
+    }
+    trips *= *tc;
+  }
+  loops.erase(it, loops.end());
+}
+
+// Flatten and parallelize outer loops, subject to a minimum number of elements
+// in the inner loop, and a maximum level of thread-level parallelism in the
+// outer loops.
+template <typename Bufs>
+static void parallelizeOuterLoops(LoopNest& l, Bufs&& bufs) {
+  for (auto const& buf : bufs) {
+    auto loops = l.getLoopStmtsFor(buf);
+    pruneByGrainSize(loops);
+    pruneByThreadCount(loops);
+
+    // There are no loops to parallelize; give up.
+    if (loops.size() == 0) {
+      continue;
+    }
+    // The loop nest contains a reduction; give up.
+    auto reductions = NodeFinder<ReduceOp>::find(loops[0]);
+    if (reductions.size() > 0) {
+      continue;
+    }
+    // The loop nest has loop carried dependences; give up.
+    if (LoopNest::hasLoopCarriedDependence(loops[0])) {
+      continue;
+    }
+    // Try to flatten the outer loops and parallelize them if successful.
+    ForPtr flattened = nullptr;
+    if (loops.size() == 1) {
+      flattened = loops[0];
+    } else {
+      LoopNest::flatten(loops, &flattened);
+    }
+    if (flattened) {
+      flattened->set_parallel();
+    }
+  }
+}
+
 StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
   torch::jit::tensorexpr::LoopNest l(st, bufOutputs_);
   GRAPH_DEBUG("Original Stmt:\n", std::to_string(l.root_stmt()), "\n");
@@ -2528,6 +2609,8 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
   if (backendType == kLLVMCodeGen) {
     fuseAllLoops(l.root_stmt());
     GRAPH_DEBUG("after fuse", *l.root_stmt());
+    parallelizeOuterLoops(l, bufOutputs_);
+    GRAPH_DEBUG("after parallelize", *l.root_stmt());
   }
 
   if (backendType == kCudaCodeGen) {
@@ -2602,9 +2685,13 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
   }
 
   l.prepareForCodegen();
+  GRAPH_DEBUG("after prepareForCodegen", *l.root_stmt());
+  l.simplify();
+  GRAPH_DEBUG("after simplification", *l.root_stmt());
 
   if (backendType == kLLVMCodeGen && !hasReduction) {
     l.vectorizeInnerLoops();
+    GRAPH_DEBUG("after vectorization", *l.root_stmt());
   }
 
   StmtPtr stmt = l.root_stmt();
diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index 4ab2d53cc4942..5346d3668ec7e 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -274,15 +274,24 @@ class LLVMCodeGenImpl : public IRVisitor {
   }
 };
 
+extern "C" {
 typedef void (*ParallelCallee)(int index, int8_t* packed_data);
-void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data) {
+void DispatchParallel(
+    int8_t* func,
+    int start,
+    int stop,
+    int8_t* packed_data) noexcept {
   // TODO: preserve the func type.
-  ParallelCallee callee = reinterpret_cast<ParallelCallee>(func);
-  at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) {
-    for (int index = f_begin; index < f_end; index++) {
-      callee(index, packed_data);
-    }
-  });
+  try {
+    ParallelCallee callee = reinterpret_cast<ParallelCallee>(func);
+    at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) {
+      for (int index = f_begin; index < f_end; index++) {
+        callee(index, packed_data);
+      }
+    });
+  } catch (...) {
+  }
+}
 }
 
 } // namespace tensorexpr
@@ -1288,6 +1297,7 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) {
       module_->getOrInsertFunction("DispatchParallel", dispatcher_fntype);
   llvm::Function* dispatcher =
       llvm::cast<llvm::Function>(dispatcher_callee.getCallee());
+  dispatcher->addFnAttr(llvm::Attribute::NoUnwind);
   irb_.CreateCall(
       dispatcher, {func_value, start, stop, packed_caller_args_ptr});
   value_ = llvm::ConstantInt::get(IntTy_, 0);
diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.h b/torch/csrc/jit/tensorexpr/llvm_jit.h
index 30ad5317a1b3c..8585900abc8d6 100644
--- a/torch/csrc/jit/tensorexpr/llvm_jit.h
+++ b/torch/csrc/jit/tensorexpr/llvm_jit.h
@@ -17,7 +17,13 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-void DispatchParallel(int8_t* func, int start, int stop, int8_t* packed_data);
+extern "C" {
+void DispatchParallel(
+    int8_t* func,
+    int start,
+    int stop,
+    int8_t* packed_data) noexcept;
+}
 
 inline std::string formatError(llvm::Error&& err, const char* msg) {
   static constexpr char* defaultErrorMsg = "Unexpected failure in LLVM JIT";
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index 190499998b289..d3a4b919bef33 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -179,6 +179,13 @@ class Vectorizer : public IRMutator {
     });
   }
 
+  ExprPtr mutate(ModPtr v) override {
+    std::vector<ExprPtr> inputs = {v->lhs(), v->rhs()};
+    return try_vectorize(v, inputs, [&]() {
+      return ExprHandle(inputs[0]) % ExprHandle(inputs[1]);
+    });
+  }
+
   ExprPtr mutate(AndPtr v) override {
     std::vector<ExprPtr> inputs = {v->lhs(), v->rhs()};
     return try_vectorize(v, inputs, [&]() {

From 58ef99bd5aaf94c2cf5744b938ba4774773eb98d Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 24 Aug 2021 18:55:23 -0700
Subject: [PATCH 195/530] TST Adds pickle testing for ModuleInfo (#63736)

Summary:
Follow up to https://github.com/pytorch/pytorch/pull/61935

This PR adds `test_pickle` to `test_modules`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63736

Reviewed By: heitorschueroff

Differential Revision: D30522462

Pulled By: jbschlosser

fbshipit-source-id: a03b66ea0d81c6d0845c4fddf0ddc3714bbf0ab1
---
 test/test_modules.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/test/test_modules.py b/test/test_modules.py
index bb0fe5f1f9689..52520dad080de 100644
--- a/test/test_modules.py
+++ b/test/test_modules.py
@@ -1,3 +1,5 @@
+import tempfile
+
 import torch
 from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_modules import module_db, modules
@@ -108,6 +110,36 @@ def test_factory_kwargs(self, device, dtype, module_info):
                             buffer.dtype, dtype,
                             f'Buffer {name} is of dtype {buffer.dtype} instead of the expected dtype {dtype}')
 
+    @modules(module_db)
+    def test_pickle(self, device, dtype, module_info):
+        # Test that module can be pickled and unpickled.
+        module_cls = module_info.module_cls
+        module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
+                                                       requires_grad=False)
+        for module_input in module_inputs:
+            if module_input.forward_input is None:
+                continue
+
+            args, kwargs = module_input.constructor_input.args, module_input.constructor_input.kwargs
+
+            with freeze_rng_state():
+                # === Instantiate the module. ===
+                args, kwargs = module_input.constructor_input.args, module_input.constructor_input.kwargs
+                m = module_cls(*args, **kwargs)
+                m.to(device).to(dtype)
+
+                # === Do forward pass. ===
+                args, kwargs = module_input.forward_input.args, module_input.forward_input.kwargs
+                output = m(*args, **kwargs)
+
+                # === Check unpickled module gives the same output. ===
+                with tempfile.TemporaryFile() as f:
+                    torch.save(m, f)
+                    f.seek(0)
+                    m_copy = torch.load(f)
+                    output_from_copy = m_copy(*args, **kwargs)
+                    self.assertEqual(output, output_from_copy)
+
 
 instantiate_device_type_tests(TestModule, globals())
 

From 544af391b5649c8c407fa36b36631a2307997a09 Mon Sep 17 00:00:00 2001
From: Joel Schlosser <jbschlosser@fb.com>
Date: Tue, 24 Aug 2021 19:00:33 -0700
Subject: [PATCH 196/530] Allow arbitrary objects in state_dicts (#62976)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/62094

Introduces functionality for adding arbitrary objects to module state_dicts. To take advantage of this, the following functions can be defined on a module:
* `get_extra_state(self) -> dict` - Returns a dict defining any extra state this module wants to save
* `set_extra_state(self, state)` - Subsumes the given state within the module

In the details, a sub-dictionary is stored in the state_dict under the key `_extra_state` for each module that requires extra state.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62976

Reviewed By: heitorschueroff

Differential Revision: D30518657

Pulled By: jbschlosser

fbshipit-source-id: 5fb35ab8e3d36f35e3e96dcd4498f8c917d1f386
---
 test/test_nn.py            | 86 ++++++++++++++++++++++++++++++++++++++
 torch/jit/_script.py       |  2 +
 torch/nn/modules/module.py | 51 +++++++++++++++++++++-
 3 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 43e105a676ced..d577493fd531c 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -5465,6 +5465,92 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata,
         self.assertEqual(mm[0].param[0].item(), 10)
         self.assertEqual(mm[0].sub.weight[0, 0].item(), 555)
 
+    def test_extra_state(self):
+
+        class SubModule(torch.nn.Module):
+            def __init__(self, foo):
+                super().__init__()
+                self.foo = foo
+
+            def get_extra_state(self):
+                return {
+                    'foo': self.foo
+                }
+
+            def set_extra_state(self, state):
+                self.foo = state['foo']
+
+        class MyModule(torch.nn.Module):
+            def __init__(self, foo, bar):
+                super().__init__()
+                self.sub = SubModule(foo)
+                self.bar = bar
+
+            def get_extra_state(self):
+                return {
+                    'bar': self.bar
+                }
+
+            def set_extra_state(self, state):
+                self.bar = state['bar']
+
+        # Ensure state_dict contains the extra state by loading it into another module.
+        m = MyModule(3, 'something')
+        m2 = MyModule(5, 'something else')
+        m2.load_state_dict(m.state_dict())
+        self.assertEqual(m.state_dict(), m2.state_dict())
+        self.assertEqual(m2.bar, m.bar)
+        self.assertEqual(m2.sub.foo, m.sub.foo)
+
+    def test_extra_state_non_dict(self):
+
+        class MyModule(torch.nn.Module):
+            def __init__(self, foo):
+                super().__init__()
+                self.foo = foo
+
+            def get_extra_state(self):
+                return self.foo
+
+            def set_extra_state(self, state):
+                self.foo = state
+
+        # Test various types of extra state.
+        for state in ('something', 5, MyModule(3)):
+            m = MyModule(state)
+            m2 = MyModule('something else')
+            m2.load_state_dict(m.state_dict())
+            self.assertEqual(m.state_dict(), m2.state_dict())
+            self.assertEqual(m.foo, m2.foo)
+
+    def test_extra_state_missing_set_extra_state(self):
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def get_extra_state(self):
+                return {
+                    'foo': 5
+                }
+
+        m = MyModule()
+        with self.assertRaisesRegex(RuntimeError, 'Unexpected key'):
+            m.load_state_dict(m.state_dict())
+
+    def test_extra_state_missing_get_extra_state(self):
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def set_extra_state(self):
+                pass
+
+        m = MyModule()
+        with self.assertRaisesRegex(RuntimeError, 'Missing key'):
+            m.load_state_dict(m.state_dict())
+
     def test_parameter_assignment(self):
         l = nn.Linear(5, 5)
 
diff --git a/torch/jit/_script.py b/torch/jit/_script.py
index 0c3e5ef7f0726..3d173ae27bd01 100644
--- a/torch/jit/_script.py
+++ b/torch/jit/_script.py
@@ -912,6 +912,8 @@ def _get_methods(cls):
         "_tracing_name",
         "eval",
         "train",
+        "get_extra_state",
+        "set_extra_state"
     }
 
     def _make_fail(name):
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index 2376422117306..28b220e24037f 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -46,6 +46,8 @@ def _addindent(s_, numSpaces):
 _global_forward_pre_hooks: Dict[int, Callable] = OrderedDict()
 _global_forward_hooks: Dict[int, Callable] = OrderedDict()
 
+_EXTRA_STATE_KEY_SUFFIX = '_extra_state'
+
 
 def register_module_forward_pre_hook(hook: Callable[..., None]) -> RemovableHandle:
     r"""Registers a forward pre-hook common to all modules.
@@ -528,6 +530,41 @@ def get_buffer(self, target: str) -> "Tensor":
 
         return buffer
 
+    def get_extra_state(self) -> Any:
+        """
+        Returns any extra state to include in the module's state_dict.
+        Implement this and a corresponding :func:`set_extra_state` for your module
+        if you need to store extra state. This function is called when building the
+        module's `state_dict()`.
+
+        Note that extra state should be pickleable to ensure working serialization
+        of the state_dict. We only provide provide backwards compatibility guarantees
+        for serializing Tensors; other objects may break backwards compatibility if
+        their serialized pickled form changes.
+
+        Returns:
+            object: Any extra state to store in the module's state_dict
+        """
+        raise RuntimeError(
+            "Reached a code path in Module.get_extra_state() that should never be called. "
+            "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.md "
+            "to report this bug.")
+
+    def set_extra_state(self, state: Any):
+        """
+        This function is called from :func:`load_state_dict` to handle any extra state
+        found within the `state_dict`. Implement this function and a corresponding
+        :func:`get_extra_state` for your module if you need to store extra state within its
+        `state_dict`.
+
+        Args:
+            state (dict): Extra state from the `state_dict`
+        """
+        raise RuntimeError(
+            "Reached a code path in Module.set_extra_state() that should never be called. "
+            "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.md "
+            "to report this bug.")
+
     def _apply(self, fn):
         for module in self.children():
             module._apply(fn)
@@ -1228,6 +1265,9 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
         for name, buf in self._buffers.items():
             if buf is not None and name not in self._non_persistent_buffers_set:
                 destination[prefix + name] = buf if keep_vars else buf.detach()
+        extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX
+        if getattr(self.__class__, "get_extra_state", Module.get_extra_state) is not Module.get_extra_state:
+            destination[extra_state_key] = self.get_extra_state()
 
     # The user can pass an optional arbitrary mappable object to `state_dict`, in which case `state_dict` returns
     # back that same object. But if they pass nothing, an `OrederedDict` is created and returned.
@@ -1365,9 +1405,18 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
             elif strict:
                 missing_keys.append(key)
 
+        extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX
+        if getattr(self.__class__, "set_extra_state", Module.set_extra_state) is not Module.set_extra_state:
+            if extra_state_key in state_dict:
+                self.set_extra_state(state_dict[extra_state_key])
+            elif strict:
+                missing_keys.append(extra_state_key)
+        elif strict and (extra_state_key in state_dict):
+            unexpected_keys.append(extra_state_key)
+
         if strict:
             for key in state_dict.keys():
-                if key.startswith(prefix):
+                if key.startswith(prefix) and key != extra_state_key:
                     input_name = key[len(prefix):]
                     input_name = input_name.split('.', 1)[0]  # get the name of param/buffer/child
                     if input_name not in self._modules and input_name not in local_state:

From ba126df61448ca3442ec77374bc32f43fcdd9773 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Tue, 24 Aug 2021 19:03:07 -0700
Subject: [PATCH 197/530] TST Adds more modules into common module tests
 (#62999)

Summary:
This PR moves some modules into `common_modules` to see what it looks like.

While migrating some no batch modules into `common_modules`, I noticed that `desc` is not used for the name. This means we can not use `-k` to filter tests. This PR moves the sample generation into `_parametrize_test`, and passes in the already generated `module_input` into users of `modules(modules_db)`.

I can see this is a little different from opsinfo and would be happy to revert to the original implementation of `modules`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62999

Reviewed By: heitorschueroff

Differential Revision: D30522737

Pulled By: jbschlosser

fbshipit-source-id: 7ed1aeb3753fc97a4ad6f1a3c789727c78e1bc73
---
 torch/testing/_internal/common_modules.py | 100 +++++++++++++++++++++-
 1 file changed, 98 insertions(+), 2 deletions(-)

diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py
index 088e66f962592..99525a7b68756 100644
--- a/torch/testing/_internal/common_modules.py
+++ b/torch/testing/_internal/common_modules.py
@@ -5,8 +5,8 @@
 from torch.testing import floating_types
 from torch.testing._internal.common_device_type import (
     _TestParametrizer, _dtype_test_suffix, _update_param_kwargs, skipIf)
-from torch.testing._internal.common_nn import nllloss_reference
-from torch.testing._internal.common_utils import make_tensor
+from torch.testing._internal.common_nn import nllloss_reference, get_reduction
+from torch.testing._internal.common_utils import make_tensor, freeze_rng_state
 from types import ModuleType
 from typing import List, Tuple, Type, Set, Dict
 
@@ -46,6 +46,7 @@
 
 class modules(_TestParametrizer):
     """ PROTOTYPE: Decorator for specifying a list of modules over which to run a test. """
+
     def __init__(self, module_info_list):
         self.module_info_list = module_info_list
 
@@ -199,8 +200,103 @@ def reference_fn(m, p, i, t, constructor_kwargs=constructor_kwargs):
     return module_inputs
 
 
+def no_batch_dim_reference_fn(m, p, *args, **kwargs):
+    """Reference function for modules supporting no batch dimensions.
+
+    The module is passed the input and target in batched form with a single item.
+    The output is squeezed to compare with the no-batch input.
+    """
+    single_batch_input_args = [input.unsqueeze(0) for input in args]
+    with freeze_rng_state():
+        return m(*single_batch_input_args).squeeze(0)
+
+
+def no_batch_dim_reference_criterion_fn(m, *args, **kwargs):
+    """Reference function for criterion supporting no batch dimensions."""
+    output = no_batch_dim_reference_fn(m, *args, **kwargs)
+    reduction = get_reduction(m)
+    if reduction == 'none':
+        return output.squeeze(0)
+    # reduction is 'sum' or 'mean' which results in a 0D tensor
+    return output
+
+
+def generate_regression_criterion_inputs(make_input):
+    return [
+        ModuleInput(
+            constructor_input=FunctionInput(reduction=reduction),
+            forward_input=FunctionInput(make_input(size=(4, )), make_input(size=4,)),
+            reference_fn=no_batch_dim_reference_criterion_fn,
+            desc='no_batch_dim_{}'.format(reduction)
+        ) for reduction in ['none', 'mean', 'sum']]
+
+
+def module_inputs_torch_nn_AvgPool1d(module_info, device, dtype, requires_grad, **kwargs):
+    make_input = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    return [
+        ModuleInput(constructor_input=FunctionInput(kernel_size=2),
+                    forward_input=FunctionInput(make_input(size=(3, 6))),
+                    desc='no_batch_dim',
+                    reference_fn=no_batch_dim_reference_fn)]
+
+
+def module_inputs_torch_nn_ELU(module_info, device, dtype, requires_grad, **kwargs):
+    make_input = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    return [
+        ModuleInput(constructor_input=FunctionInput(alpha=2.),
+                    forward_input=FunctionInput(make_input(size=(3, 2, 5))),
+                    reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2 * (i.exp() - 1))),
+        ModuleInput(constructor_input=FunctionInput(alpha=2.),
+                    forward_input=FunctionInput(make_input(size=())),
+                    desc='scalar'),
+        ModuleInput(constructor_input=FunctionInput(),
+                    forward_input=FunctionInput(make_input(size=(3,))),
+                    desc='no_batch_dim',
+                    reference_fn=no_batch_dim_reference_fn)]
+
+
+def module_inputs_torch_nn_CELU(module_info, device, dtype, requires_grad, **kwargs):
+    make_input = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    return [
+        ModuleInput(constructor_input=FunctionInput(alpha=2.),
+                    forward_input=FunctionInput(make_input(size=(3, 2, 5))),
+                    reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2. * ((.5 * i).exp() - 1))),
+        ModuleInput(constructor_input=FunctionInput(alpha=2.),
+                    forward_input=FunctionInput(make_input(size=())),
+                    reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2 * (i.exp() - 1)),
+                    desc='scalar'),
+        ModuleInput(constructor_input=FunctionInput(alpha=2.),
+                    forward_input=FunctionInput(make_input(size=(3,))),
+                    desc='no_batch_dim',
+                    reference_fn=no_batch_dim_reference_fn)]
+
+
+def module_inputs_torch_nn_L1Loss(module_info, device, dtype, requires_grad, **kwargs):
+    make_input = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    return [
+        ModuleInput(constructor_input=FunctionInput(),
+                    forward_input=FunctionInput(make_input(size=(2, 3, 4)),
+                                                make_input(size=(2, 3, 4))),
+                    reference_fn=lambda m, p, i, t: 1. / i.numel() * sum((a - b).abs().sum()
+                                                                         for a, b in zip(i, t))),
+        ModuleInput(constructor_input=FunctionInput(),
+                    forward_input=FunctionInput(make_input(size=()), make_input(size=())),
+                    reference_fn=lambda m, p, i, t: 1. / i.numel() * (i - t).abs().sum(),
+                    desc='scalar')] + generate_regression_criterion_inputs(make_input)
+
+
 # Database of ModuleInfo entries in alphabetical order.
 module_db: List[ModuleInfo] = [
+    ModuleInfo(torch.nn.AvgPool1d,
+               module_inputs_func=module_inputs_torch_nn_AvgPool1d),
+    ModuleInfo(torch.nn.ELU,
+               module_inputs_func=module_inputs_torch_nn_ELU),
+    ModuleInfo(torch.nn.L1Loss,
+               module_inputs_func=module_inputs_torch_nn_L1Loss),
     ModuleInfo(torch.nn.Linear,
                module_inputs_func=module_inputs_torch_nn_Linear),
     ModuleInfo(torch.nn.NLLLoss,

From e69a1398cbe534874060460faf36af21d24ce6e7 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Tue, 24 Aug 2021 19:37:54 -0700
Subject: [PATCH 198/530] compute reduction intermediate buffer size in
 elements (#63885)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63869
`iter` strides are in bytes, and we are additionally multiplying size computed using those strides by `sizeof(arg_t)`. Computing `output_memory_size` in elements should be enough.
This doesn't fix the still real problem of allocating large intermediate tensor, but it makes this tensor smaller by typically a factor of 4.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63885

Reviewed By: mruberry

Differential Revision: D30526034

Pulled By: ngimel

fbshipit-source-id: 0aca7f887974b7776e380463bbd82d32a5786ee8
---
 aten/src/ATen/native/cuda/Reduce.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
index 8c423061a79f6..161a896094976 100644
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -923,6 +923,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
       for (int dim = 0; dim < iter.ndim(); dim++) {
         output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]);
       }
+      output_memory_size /= iter.element_size(0); //iter.strides is in bytes
       owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t),
                                                  sizeof(out_scalar_t),
                                                  (char*) iter.data_ptr(0),

From c8527bc39837e6c1e00fb770c0e158508279ba2c Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Tue, 24 Aug 2021 21:05:14 -0700
Subject: [PATCH 199/530] [qunat][graphmode][fx] Add a separate
 lower_to_native_backend function for relu (#62861)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62861

This PR adds a lower_to_native_backend function to lower a quantized reference model
to a model that uses fbgemm/qnnpack ops. We'll gradually add support and remove
the fbgemm/qnnpack specific handling in quantization_patterns.py

Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps

Imported from OSS

Reviewed By: vkuzo

Differential Revision: D30165828

fbshipit-source-id: de1149cd7e7c1840c17c251cd4d35004afd015b7
---
 test/quantization/fx/test_quantize_fx.py      | 22 +++++++++++++
 .../fx/_lower_to_native_backend.py            | 14 +++++++++
 torch/quantization/fx/convert.py              |  3 ++
 torch/quantization/fx/lower_to_fbgemm.py      |  8 +++++
 torch/quantization/fx/lower_to_qnnpack.py     |  8 +++++
 .../quantization/fx/quantization_patterns.py  |  4 ++-
 ...ntized_fusion_patterns_and_replacements.py | 31 +++++++++++++++++++
 7 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 torch/quantization/fx/_lower_to_native_backend.py
 create mode 100644 torch/quantization/fx/lower_to_fbgemm.py
 create mode 100644 torch/quantization/fx/lower_to_qnnpack.py
 create mode 100644 torch/quantization/fx/quantized_fusion_patterns_and_replacements.py

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index bf15a06831bac..1bc6b610d1662 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -2861,6 +2861,28 @@ def forward(self, inputs: torch.Tensor, state: List[torch.Tensor]):
             if n.target == "lstm":
                 self.assertEqual(type(n.args[1]), tuple)
 
+    def test_lowering(self):
+        class M(torch.nn.Module):
+            def forward(self, x):
+                return torch.nn.functional.relu(x)
+
+        m = M().eval()
+        m = prepare_fx(m, {"": default_qconfig})
+        m_copy = copy.deepcopy(m)
+        m = convert_fx(m)
+        m_ref = convert_fx(m_copy, is_reference=True)
+        node_occurrence = {
+            ns.call_function(torch.quantize_per_tensor): 1,
+            ns.call_method("dequantize"): 1
+        }
+        node_occurrence_ref = {
+            ns.call_function(torch.quantize_per_tensor): 2,
+            ns.call_method("dequantize"): 2
+        }
+
+        self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence)
+        self.checkGraphModuleNodes(m_ref, expected_node_occurrence=node_occurrence_ref)
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     """Unit tests for individual ops
diff --git a/torch/quantization/fx/_lower_to_native_backend.py b/torch/quantization/fx/_lower_to_native_backend.py
new file mode 100644
index 0000000000000..a5518996bc44e
--- /dev/null
+++ b/torch/quantization/fx/_lower_to_native_backend.py
@@ -0,0 +1,14 @@
+from torch.fx import subgraph_rewriter
+from .graph_module import QuantizedGraphModule
+from .quantized_fusion_patterns_and_replacements import get_fbgemm_patterns_and_replacements
+
+def _lower_to_native_backend(model: QuantizedGraphModule) -> QuantizedGraphModule:
+    """ Lower a quantized reference model (with reference quantized operator patterns)
+    to the native backend in PyTorch (fbgemm/qnnpack), both backends shares the same
+    operator signature so they can be lowered with the same function
+    """
+    module_dict = dict(model.named_modules())
+    for pattern, replacement in get_fbgemm_patterns_and_replacements():
+        subgraph_rewriter.replace_pattern(model, pattern, replacement)
+    model.graph.lint()
+    return model
diff --git a/torch/quantization/fx/convert.py b/torch/quantization/fx/convert.py
index 671c2704d7da7..867b0b24cf7ad 100644
--- a/torch/quantization/fx/convert.py
+++ b/torch/quantization/fx/convert.py
@@ -45,6 +45,8 @@
     activation_dtype,
 )
 
+from .lower_to_fbgemm import lower_to_fbgemm
+
 # weight prepacking ops
 WEIGHT_PREPACK_OPS = {
     torch._ops.ops.quantized.linear_prepack,
@@ -535,4 +537,5 @@ def load_arg_remove(a: Argument) -> Argument:
     model = QuantizedGraphModule(model, act_post_process_removed_graph, preserved_attributes)
     if not is_reference:
         model = fold_weight(model, node_name_to_scope)
+        model = lower_to_fbgemm(model)
     return model
diff --git a/torch/quantization/fx/lower_to_fbgemm.py b/torch/quantization/fx/lower_to_fbgemm.py
new file mode 100644
index 0000000000000..fc76d135ee809
--- /dev/null
+++ b/torch/quantization/fx/lower_to_fbgemm.py
@@ -0,0 +1,8 @@
+from ._lower_to_native_backend import _lower_to_native_backend
+from .graph_module import QuantizedGraphModule
+
+def lower_to_fbgemm(model: QuantizedGraphModule) -> QuantizedGraphModule:
+    """ Lower a quantized reference model (with reference quantized operator patterns)
+    to fbgemm
+    """
+    return _lower_to_native_backend(model)
diff --git a/torch/quantization/fx/lower_to_qnnpack.py b/torch/quantization/fx/lower_to_qnnpack.py
new file mode 100644
index 0000000000000..0a0ea9cd248cd
--- /dev/null
+++ b/torch/quantization/fx/lower_to_qnnpack.py
@@ -0,0 +1,8 @@
+from ._lower_to_native_backend import _lower_to_native_backend
+from .graph_module import QuantizedGraphModule
+
+def lower_to_qnnpack(model: QuantizedGraphModule) -> QuantizedGraphModule:
+    """ Lower a quantized reference model (with reference quantized operator patterns)
+    to qnnpack
+    """
+    return _lower_to_native_backend(model)
diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index 1ce43cadc8e98..1a7d714136501 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -1496,7 +1496,9 @@ def convert(self,
                 load_arg: Callable,
                 is_reference: bool = False,
                 convert_custom_config_dict: Dict[str, Any] = None) -> Node:
-        if is_reference:
+        # always produce reference pattern for relu
+        is_relu = node.op == "call_function" and node.target == torch.nn.functional.relu
+        if is_reference or is_relu:
             # when activation dtype is torch.float, the node does not require
             # observation
             # e.g. dynamic quantization or weight_only quantization
diff --git a/torch/quantization/fx/quantized_fusion_patterns_and_replacements.py b/torch/quantization/fx/quantized_fusion_patterns_and_replacements.py
new file mode 100644
index 0000000000000..07c109ec4f922
--- /dev/null
+++ b/torch/quantization/fx/quantized_fusion_patterns_and_replacements.py
@@ -0,0 +1,31 @@
+import torch
+
+def relu_inplace_pattern(x, scale, zero_point):
+    x = x.dequantize()
+    x = torch.nn.functional.relu(x, inplace=True)
+    x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8)
+    return x
+
+def relu_non_inplace_pattern(x, scale, zero_point):
+    x = x.dequantize()
+    x = torch.nn.functional.relu(x, inplace=False)
+    x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8)
+    return x
+
+def relu_replacement(x, scale, zero_point):
+    x = torch.nn.functional.relu(x)
+    return x
+
+
+def _get_all_patterns_and_replacements():
+    return [
+        (relu_inplace_pattern, relu_replacement),
+        (relu_non_inplace_pattern, relu_replacement)
+    ]
+
+
+def get_fbgemm_patterns_and_replacements():
+    return _get_all_patterns_and_replacements()
+
+def get_qnnpack_patterns_and_replacements():
+    return _get_all_patterns_and_replacements()

From d388a1a5df0e0255c07dc123e65002ec7396ad05 Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Tue, 24 Aug 2021 21:21:57 -0700
Subject: [PATCH 200/530] [TensorExpr] LLVMCodegen: Use addFnAttr instead of
 addAttribute which was deleted. (#63886)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63886

cc gmagogsfm

Test Plan: Imported from OSS

Reviewed By: bertmaher

Differential Revision: D30523135

Pulled By: ZolotukhinM

fbshipit-source-id: 62e125f917b2a0153eb30879d93cf956587a05e0
---
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index 5346d3668ec7e..a93fd64df0a68 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -424,9 +424,7 @@ LLVMCodeGenImpl::LLVMCodeGenImpl(
   llvm::FunctionType* fntype = llvm::FunctionType::get(retTy, params, false);
   fn_ = llvm::Function::Create(
       fntype, llvm::Function::PrivateLinkage, "pytorch", module_.get());
-  fn_->addAttribute(
-      llvm::AttributeList::AttrIndex::FunctionIndex,
-      llvm::Attribute::AlwaysInline);
+  fn_->addFnAttr(llvm::Attribute::AlwaysInline);
   for (const auto i : c10::irange(args.size())) {
     if (!args[i].isVar()) {
       fn_->addParamAttr(i, llvm::Attribute::NoAlias);

From 0301c3bc01329613c29c59cffa2c77f3ae2d0829 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Tue, 24 Aug 2021 21:28:40 -0700
Subject: [PATCH 201/530] [quant][graphmode][fx] Make maxpool and flatten
 produce the reference pattern (#63501)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63501

Currently some of the ops are considered as working with both float and quantized input,
so we may have things like "quant - some_op - dequant" this might not work well with the backend,
we may consider change everything to produce "quant - dequant - some_op - quant - dequant" instead
in the future, this PR fixes it for maxpool and flatten only to unblock resnet benchmarking on TensorRT

Test Plan:
python test/test_quantization.py TestQuantizeFxOps

Imported from OSS

Reviewed By: mruberry

Differential Revision: D30402788

fbshipit-source-id: 892c5ff6552775070e2c1453f65846590fb12735
---
 torch/quantization/fx/quantization_patterns.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index 1a7d714136501..09ca190a73668 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -1447,6 +1447,9 @@ def convert(self,
 @register_quant_pattern(torch.nn.AvgPool3d)
 @register_quant_pattern(torch.nn.Dropout)
 @register_quant_pattern(torch.nn.Hardtanh)
+@register_quant_pattern(torch.nn.MaxPool1d)
+@register_quant_pattern(torch.nn.MaxPool2d)
+@register_quant_pattern(torch.nn.MaxPool3d)
 @register_quant_pattern(torch.nn.ReLU)
 @register_quant_pattern(torch.nn.ReLU6)
 @register_quant_pattern(torch.adaptive_avg_pool1d)
@@ -1456,12 +1459,16 @@ def convert(self,
 @register_quant_pattern(torch.nn.functional.hardtanh)
 @register_quant_pattern(torch.nn.functional.hardtanh_)
 @register_quant_pattern(torch.nn.functional.interpolate)
+@register_quant_pattern(torch.nn.functional.max_pool1d)
+@register_quant_pattern(torch.nn.functional.max_pool2d)
+@register_quant_pattern(torch.nn.functional.max_pool3d)
 @register_quant_pattern(torch.nn.functional.relu)
 @register_quant_pattern(torch.nn.functional.relu6)
 @register_quant_pattern(torch.avg_pool1d)
 @register_quant_pattern(torch._C._nn.avg_pool2d)
 @register_quant_pattern(torch._C._nn.avg_pool3d)
 @register_quant_pattern(torch.clamp)
+@register_quant_pattern(torch.flatten)
 @register_quant_pattern(torch.max)
 @register_quant_pattern(torch.mean)
 @register_quant_pattern(torch.min)
@@ -1556,15 +1563,8 @@ def convert(self,
         # module attribute like module._QUANTIZED_INPUT_INDEXES
         return quantized_graph.node_copy(node, load_arg(quantized=None))
 
-@register_quant_pattern(torch.nn.MaxPool1d)
-@register_quant_pattern(torch.nn.MaxPool2d)
-@register_quant_pattern(torch.nn.MaxPool3d)
 @register_quant_pattern(torch.nn.Identity)
-@register_quant_pattern(torch.nn.functional.max_pool1d)
-@register_quant_pattern(torch.nn.functional.max_pool2d)
-@register_quant_pattern(torch.nn.functional.max_pool3d)
 @register_quant_pattern(torch.chunk)
-@register_quant_pattern(torch.flatten)
 @register_quant_pattern(torch.transpose)
 @register_quant_pattern(torch.repeat_interleave)
 @register_quant_pattern(torch.sort)

From 10dfa58eba055a1bbc1cc89df033cd2815cbb403 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Tue, 24 Aug 2021 21:33:12 -0700
Subject: [PATCH 202/530] [fx2trt] Add a test for quantized resnet18 (#63446)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63446

Add a test for quantized resnet18 running in TensorRT

Test Plan: buck run mode/opt -c python.package_style=inplace caffe2:fx2trt_quantized_resnet_test

Reviewed By: 842974287

Differential Revision: D30384746

fbshipit-source-id: 1a863877711618cd23d887694269ed9e44ee606c
---
 .../fx2trt/converters/acc_ops_converters.py   |  15 +--
 .../fx2trt/example/quantized_resnet_test.py   | 117 ++++++++++++++++++
 2 files changed, 121 insertions(+), 11 deletions(-)
 create mode 100644 torch/fx/experimental/fx2trt/example/quantized_resnet_test.py

diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
index 566359bf2af0d..33a817d4ccdb5 100644
--- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
+++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
@@ -1300,15 +1300,11 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name):
     if q_zero_point != 0:
         raise RuntimeError(f"Only support zero_point == 0, get {q_zero_point}")
 
-    # temporarily set q_scale to 1 to make sure the q_scale is different
-    # for quantize and dequantize to avoid the error
-    # TODO: follow up with nvidia TensorRT team to repro and fix the problem
-    q_scale = 1
     scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([float(q_scale)], dtype=np.float32)))
     scale_layer.name = input_val.name + ".quant.scale"
     scale = scale_layer.get_output(0)
-    assert trt.__version__ > "8.0", "Explicit quantize op is only supported in "
-    "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
+    # assert trt.__version__ > "8.0", "Explicit quantize op is only supported in "
+    # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
     layer = network.add_quantize(input=input_val, scale=scale)
     layer.axis = 0
     layer.name = input_val.name + ".quant"
@@ -1316,9 +1312,6 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name):
 
 @tensorrt_converter(acc_ops.dequantize)
 def acc_ops_dequantize(network, target, args, kwargs, name):
-    """
-    Currently just a no-op.
-    """
     input_val = kwargs["input"]
 
     if not isinstance(input_val, trt.tensorrt.ITensor):
@@ -1339,8 +1332,8 @@ def acc_ops_dequantize(network, target, args, kwargs, name):
     scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([q_scale], dtype=np.float32)))
     scale_layer.name = input_val.name + ".dequant.scale"
     scale = scale_layer.get_output(0)
-    assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in "
-    "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
+    # assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in "
+    # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
     layer = network.add_dequantize(input=input_val, scale=scale)
     layer.name = input_val.name + ".dequant"
     layer.axis = 0
diff --git a/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py b/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py
new file mode 100644
index 0000000000000..39553dfd9dfb6
--- /dev/null
+++ b/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py
@@ -0,0 +1,117 @@
+import torch.fx
+import torchvision.models as models
+from torch.fx.experimental.fx2trt.fx2trt import TRTInterpreter, InputTensorSpec, TRTModule
+from torch.quantization.quantize_fx import prepare_fx, convert_fx
+import torch.fx.experimental.fx_acc.acc_tracer as acc_tracer
+import copy
+from torch.fx.passes import shape_prop
+from torch.fx.experimental.normalize import NormalizeArgs
+
+rn18 = models.resnet18().eval()
+
+def build_fp16_trt(rn18):
+    rn18 = copy.deepcopy(rn18)
+    rn18 = acc_tracer.trace(rn18, [torch.randn(1, 3, 224, 224)])
+    interp = TRTInterpreter(rn18, [InputTensorSpec([3, 224, 224], torch.float, has_batch_dim=False)])
+    engine, input_names, output_names = interp.run(fp16_mode=True)
+    return TRTModule(engine, input_names, output_names)
+
+@torch.no_grad()
+def build_int8_trt(rn18):
+    rn18 = copy.deepcopy(rn18)
+    data = torch.randn(1, 3, 224, 224)
+    # data = torch.randn(1, 64, 10, 10)
+    # TensorRT only supports symmetric quantization
+    qconfig = torch.quantization.QConfig(
+        activation=torch.quantization.observer.HistogramObserver.with_args(
+            qscheme=torch.per_tensor_symmetric, dtype=torch.qint8
+        ),
+        weight=torch.quantization.default_weight_observer
+    )
+    prepared = prepare_fx(rn18, {"": qconfig})
+    for _ in range(10):
+        prepared(data)
+    quantized_rn18 = convert_fx(prepared, is_reference=True)
+    print("quantized model:", quantized_rn18)
+
+    quantized_rn18 = acc_tracer.trace(quantized_rn18, [data])
+    interp = TRTInterpreter(quantized_rn18, [InputTensorSpec(data.shape[1:], torch.float, has_batch_dim=False)])
+    engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True)
+    return TRTModule(engine, input_names, output_names)
+
+@torch.no_grad()
+def build_int8_trt_implicit_quant(rn18):
+    rn18 = copy.deepcopy(rn18)
+    data = torch.randn(1, 3, 224, 224)
+    # Quantization
+    qconfig = torch.quantization.QConfig(
+        activation=torch.quantization.observer.HistogramObserver.with_args(
+            qscheme=torch.per_tensor_symmetric, reduce_range=True
+        ),
+        weight=torch.quantization.default_per_channel_weight_observer
+    )
+    prepared = prepare_fx(rn18, {"": qconfig})
+    for _ in range(10):
+        prepared(data)
+    quantized_rn18 = convert_fx(prepared, is_reference=True)
+
+    # Build trt int8 model
+    traced_rn18 = torch.fx.symbolic_trace(quantized_rn18)
+    shape_prop.ShapeProp(traced_rn18).propagate(data)
+    traced_rn18 = NormalizeArgs(traced_rn18).transform()
+    interp = TRTInterpreter(traced_rn18, InputTensorSpec.from_tensors([data]))
+    engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True, strict_type_constraints=True)
+    trt_mod = TRTModule(engine, input_names, output_names)
+    return trt_mod
+
+class M(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 3, 3, padding=1)
+
+    def forward(self, x):
+        out = self.conv(x)
+        # out = torch.nn.functional.relu(out)
+        out += x
+        out += out
+        out = torch.nn.functional.relu(out)
+        return out
+
+# rn18 = M().eval()
+# rn18 = rn18.layer1
+int8_trt = build_int8_trt(rn18)
+implicit_int8_trt = build_int8_trt_implicit_quant(rn18)
+fp16_trt = build_fp16_trt(rn18)
+x = torch.randn(5, 3, 224, 224, device="cuda")
+rn18 = rn18.cuda()
+
+import time
+NITER = 100
+
+torch.cuda.synchronize()
+s = time.time()
+for _ in range(NITER):
+    fp16_trt(x)
+    torch.cuda.synchronize()
+print('trt fp16 time (ms/iter)', (time.time() - s) / NITER * 1000)
+
+torch.cuda.synchronize()
+s = time.time()
+for _ in range(NITER):
+    int8_trt(x)
+    torch.cuda.synchronize()
+print('trt int8 time (ms/iter)', (time.time() - s) / NITER * 1000)
+
+torch.cuda.synchronize()
+s = time.time()
+for _ in range(NITER):
+    implicit_int8_trt(x)
+    torch.cuda.synchronize()
+print('trt implicit int8 time (ms/iter)', (time.time() - s) / NITER * 1000)
+
+torch.cuda.synchronize()
+s = time.time()
+for _ in range(NITER):
+    rn18(x)
+    torch.cuda.synchronize()
+print('PyTorch time (ms/iter)', (time.time() - s) / NITER * 1000)

From 839eaa2e91556ecd4532596b4fef18a1c3f6e1c1 Mon Sep 17 00:00:00 2001
From: Linbin Yu <linbin@fb.com>
Date: Wed, 25 Aug 2021 00:42:03 -0700
Subject: [PATCH 203/530] Revert D30384746: [fx2trt] Add a test for quantized
 resnet18

Test Plan: revert-hammer

Differential Revision:
D30384746 (https://github.com/pytorch/pytorch/commit/10dfa58eba055a1bbc1cc89df033cd2815cbb403)

Original commit changeset: 1a8638777116

fbshipit-source-id: b93235323e229b391f5456f6e3543988062dd0d4
---
 .../fx2trt/converters/acc_ops_converters.py   |  15 ++-
 .../fx2trt/example/quantized_resnet_test.py   | 117 ------------------
 2 files changed, 11 insertions(+), 121 deletions(-)
 delete mode 100644 torch/fx/experimental/fx2trt/example/quantized_resnet_test.py

diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
index 33a817d4ccdb5..566359bf2af0d 100644
--- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
+++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
@@ -1300,11 +1300,15 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name):
     if q_zero_point != 0:
         raise RuntimeError(f"Only support zero_point == 0, get {q_zero_point}")
 
+    # temporarily set q_scale to 1 to make sure the q_scale is different
+    # for quantize and dequantize to avoid the error
+    # TODO: follow up with nvidia TensorRT team to repro and fix the problem
+    q_scale = 1
     scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([float(q_scale)], dtype=np.float32)))
     scale_layer.name = input_val.name + ".quant.scale"
     scale = scale_layer.get_output(0)
-    # assert trt.__version__ > "8.0", "Explicit quantize op is only supported in "
-    # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
+    assert trt.__version__ > "8.0", "Explicit quantize op is only supported in "
+    "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
     layer = network.add_quantize(input=input_val, scale=scale)
     layer.axis = 0
     layer.name = input_val.name + ".quant"
@@ -1312,6 +1316,9 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name):
 
 @tensorrt_converter(acc_ops.dequantize)
 def acc_ops_dequantize(network, target, args, kwargs, name):
+    """
+    Currently just a no-op.
+    """
     input_val = kwargs["input"]
 
     if not isinstance(input_val, trt.tensorrt.ITensor):
@@ -1332,8 +1339,8 @@ def acc_ops_dequantize(network, target, args, kwargs, name):
     scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([q_scale], dtype=np.float32)))
     scale_layer.name = input_val.name + ".dequant.scale"
     scale = scale_layer.get_output(0)
-    # assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in "
-    # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
+    assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in "
+    "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
     layer = network.add_dequantize(input=input_val, scale=scale)
     layer.name = input_val.name + ".dequant"
     layer.axis = 0
diff --git a/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py b/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py
deleted file mode 100644
index 39553dfd9dfb6..0000000000000
--- a/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import torch.fx
-import torchvision.models as models
-from torch.fx.experimental.fx2trt.fx2trt import TRTInterpreter, InputTensorSpec, TRTModule
-from torch.quantization.quantize_fx import prepare_fx, convert_fx
-import torch.fx.experimental.fx_acc.acc_tracer as acc_tracer
-import copy
-from torch.fx.passes import shape_prop
-from torch.fx.experimental.normalize import NormalizeArgs
-
-rn18 = models.resnet18().eval()
-
-def build_fp16_trt(rn18):
-    rn18 = copy.deepcopy(rn18)
-    rn18 = acc_tracer.trace(rn18, [torch.randn(1, 3, 224, 224)])
-    interp = TRTInterpreter(rn18, [InputTensorSpec([3, 224, 224], torch.float, has_batch_dim=False)])
-    engine, input_names, output_names = interp.run(fp16_mode=True)
-    return TRTModule(engine, input_names, output_names)
-
-@torch.no_grad()
-def build_int8_trt(rn18):
-    rn18 = copy.deepcopy(rn18)
-    data = torch.randn(1, 3, 224, 224)
-    # data = torch.randn(1, 64, 10, 10)
-    # TensorRT only supports symmetric quantization
-    qconfig = torch.quantization.QConfig(
-        activation=torch.quantization.observer.HistogramObserver.with_args(
-            qscheme=torch.per_tensor_symmetric, dtype=torch.qint8
-        ),
-        weight=torch.quantization.default_weight_observer
-    )
-    prepared = prepare_fx(rn18, {"": qconfig})
-    for _ in range(10):
-        prepared(data)
-    quantized_rn18 = convert_fx(prepared, is_reference=True)
-    print("quantized model:", quantized_rn18)
-
-    quantized_rn18 = acc_tracer.trace(quantized_rn18, [data])
-    interp = TRTInterpreter(quantized_rn18, [InputTensorSpec(data.shape[1:], torch.float, has_batch_dim=False)])
-    engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True)
-    return TRTModule(engine, input_names, output_names)
-
-@torch.no_grad()
-def build_int8_trt_implicit_quant(rn18):
-    rn18 = copy.deepcopy(rn18)
-    data = torch.randn(1, 3, 224, 224)
-    # Quantization
-    qconfig = torch.quantization.QConfig(
-        activation=torch.quantization.observer.HistogramObserver.with_args(
-            qscheme=torch.per_tensor_symmetric, reduce_range=True
-        ),
-        weight=torch.quantization.default_per_channel_weight_observer
-    )
-    prepared = prepare_fx(rn18, {"": qconfig})
-    for _ in range(10):
-        prepared(data)
-    quantized_rn18 = convert_fx(prepared, is_reference=True)
-
-    # Build trt int8 model
-    traced_rn18 = torch.fx.symbolic_trace(quantized_rn18)
-    shape_prop.ShapeProp(traced_rn18).propagate(data)
-    traced_rn18 = NormalizeArgs(traced_rn18).transform()
-    interp = TRTInterpreter(traced_rn18, InputTensorSpec.from_tensors([data]))
-    engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True, strict_type_constraints=True)
-    trt_mod = TRTModule(engine, input_names, output_names)
-    return trt_mod
-
-class M(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv = torch.nn.Conv2d(3, 3, 3, padding=1)
-
-    def forward(self, x):
-        out = self.conv(x)
-        # out = torch.nn.functional.relu(out)
-        out += x
-        out += out
-        out = torch.nn.functional.relu(out)
-        return out
-
-# rn18 = M().eval()
-# rn18 = rn18.layer1
-int8_trt = build_int8_trt(rn18)
-implicit_int8_trt = build_int8_trt_implicit_quant(rn18)
-fp16_trt = build_fp16_trt(rn18)
-x = torch.randn(5, 3, 224, 224, device="cuda")
-rn18 = rn18.cuda()
-
-import time
-NITER = 100
-
-torch.cuda.synchronize()
-s = time.time()
-for _ in range(NITER):
-    fp16_trt(x)
-    torch.cuda.synchronize()
-print('trt fp16 time (ms/iter)', (time.time() - s) / NITER * 1000)
-
-torch.cuda.synchronize()
-s = time.time()
-for _ in range(NITER):
-    int8_trt(x)
-    torch.cuda.synchronize()
-print('trt int8 time (ms/iter)', (time.time() - s) / NITER * 1000)
-
-torch.cuda.synchronize()
-s = time.time()
-for _ in range(NITER):
-    implicit_int8_trt(x)
-    torch.cuda.synchronize()
-print('trt implicit int8 time (ms/iter)', (time.time() - s) / NITER * 1000)
-
-torch.cuda.synchronize()
-s = time.time()
-for _ in range(NITER):
-    rn18(x)
-    torch.cuda.synchronize()
-print('PyTorch time (ms/iter)', (time.time() - s) / NITER * 1000)

From 87a661c79f486f2fcc11f125a86a2b5f0ec53d83 Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Wed, 25 Aug 2021 07:15:18 -0700
Subject: [PATCH 204/530] Revert D30526034: [pytorch][PR] compute reduction
 intermediate buffer size in elements

Test Plan: revert-hammer

Differential Revision:
D30526034 (https://github.com/pytorch/pytorch/commit/e69a1398cbe534874060460faf36af21d24ce6e7)

Original commit changeset: 0aca7f887974

fbshipit-source-id: a22472723818d6fe0c11a6e134080df1ac408038
---
 aten/src/ATen/native/cuda/Reduce.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
index 161a896094976..8c423061a79f6 100644
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -923,7 +923,6 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
       for (int dim = 0; dim < iter.ndim(); dim++) {
         output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]);
       }
-      output_memory_size /= iter.element_size(0); //iter.strides is in bytes
       owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t),
                                                  sizeof(out_scalar_t),
                                                  (char*) iter.data_ptr(0),

From 3926fdbaa46d47483cec310f245ac60acf3ee13b Mon Sep 17 00:00:00 2001
From: zhouzhuojie <zhouzhuojie@gmail.com>
Date: Wed, 25 Aug 2021 08:50:00 -0700
Subject: [PATCH 205/530] [skip ci] Add generated comment to ruleset json
 (#63896)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63896

Reviewed By: heitorschueroff

Differential Revision: D30529820

Pulled By: zhouzhuojie

fbshipit-source-id: 7529803af23ea36a7bcb673cd399da80da8e3feb
---
 .github/generated-ciflow-ruleset.json    | 1 +
 .github/scripts/generate_ci_workflows.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json
index 80b2cabfff788..70aa7767483e3 100644
--- a/.github/generated-ciflow-ruleset.json
+++ b/.github/generated-ciflow-ruleset.json
@@ -1,4 +1,5 @@
 {
+  "__comment": "@generated DO NOT EDIT MANUALLY, Generation script: .github/scripts/generate_ci_workflows.py",
   "label_rules": {
     "ciflow/all": [
       "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index fce50ac7811e5..097974d88acf1 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -90,7 +90,9 @@ def add_label_rule(self, labels: Set[str], workflow_name: str) -> None:
                 self.label_rules[label] = {workflow_name}
 
     def generate_json(self) -> None:
+        GENERATED = "generated"  # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file
         output = {
+            "__comment": f"@{GENERATED} DO NOT EDIT MANUALLY, Generation script: .github/scripts/generate_ci_workflows.py",
             "version": self.version,
             "label_rules": {
                 label: sorted(list(workflows))

From 8c897d254dda8c3f667a85e05a4a31739a2d85c5 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@fb.com>
Date: Wed, 25 Aug 2021 09:00:13 -0700
Subject: [PATCH 206/530] Swap CUDA 11.1 and 11.3 in CI to make 11.1 periodic
 (#63900)

Summary:
Preparing for supporting 11.3 in the next release.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63900

Reviewed By: malfet

Differential Revision: D30541437

Pulled By: janeyx99

fbshipit-source-id: a7297da7f7818a4291b1c321d62d76fc2c0f1f90
---
 .github/generated-ciflow-ruleset.json         | 48 +++++++++----------
 .github/scripts/generate_ci_workflows.py      | 24 +++++-----
 ...orch-linux-xenial-cuda11.3-py3.6-gcc7.yml} | 10 ++--
 ...ated-linux-xenial-cuda11.3-py3.6-gcc7.yml} | 16 +++----
 ...orch-linux-xenial-cuda11.1-py3.6-gcc7.yml} | 10 ++--
 ...odic-linux-xenial-cuda11.1-py3.6-gcc7.yml} | 16 +++----
 ...ated-periodic-win-vs2019-cuda11.1-py3.yml} | 14 +++---
 ... => generated-win-vs2019-cuda11.3-py3.yml} | 14 +++---
 8 files changed, 76 insertions(+), 76 deletions(-)
 rename .github/workflows/{generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml => generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml} (96%)
 rename .github/workflows/{generated-linux-xenial-cuda11.1-py3.6-gcc7.yml => generated-linux-xenial-cuda11.3-py3.6-gcc7.yml} (97%)
 rename .github/workflows/{generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml => generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml} (96%)
 rename .github/workflows/{generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml => generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml} (97%)
 rename .github/workflows/{generated-periodic-win-vs2019-cuda11.3-py3.yml => generated-periodic-win-vs2019-cuda11.1-py3.yml} (97%)
 rename .github/workflows/{generated-win-vs2019-cuda11.1-py3.yml => generated-win-vs2019-cuda11.3-py3.yml} (97%)

diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json
index 70aa7767483e3..d13561190d01f 100644
--- a/.github/generated-ciflow-ruleset.json
+++ b/.github/generated-ciflow-ruleset.json
@@ -3,19 +3,19 @@
   "label_rules": {
     "ciflow/all": [
       "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
-      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-bionic-cuda10.2-py3.9-gcc7",
       "linux-bionic-py3.8-gcc9-coverage",
       "linux-xenial-cuda10.2-py3.6-gcc7",
-      "linux-xenial-cuda11.1-py3.6-gcc7",
+      "linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
-      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
-      "periodic-linux-xenial-cuda11.3-py3.6-gcc7",
-      "periodic-win-vs2019-cuda11.3-py3",
+      "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "periodic-linux-xenial-cuda11.1-py3.6-gcc7",
+      "periodic-win-vs2019-cuda11.1-py3",
       "win-vs2019-cpu-py3",
       "win-vs2019-cuda10.1-py3",
-      "win-vs2019-cuda11.1-py3"
+      "win-vs2019-cuda11.3-py3"
     ],
     "ciflow/bazel": [
       "linux-xenial-py3.6-gcc7-bazel-test"
@@ -31,19 +31,19 @@
     ],
     "ciflow/cuda": [
       "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
-      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-bionic-cuda10.2-py3.9-gcc7",
       "linux-xenial-cuda10.2-py3.6-gcc7",
-      "linux-xenial-cuda11.1-py3.6-gcc7",
-      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
-      "periodic-linux-xenial-cuda11.3-py3.6-gcc7",
-      "periodic-win-vs2019-cuda11.3-py3",
+      "linux-xenial-cuda11.3-py3.6-gcc7",
+      "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "periodic-linux-xenial-cuda11.1-py3.6-gcc7",
+      "periodic-win-vs2019-cuda11.1-py3",
       "win-vs2019-cuda10.1-py3",
-      "win-vs2019-cuda11.1-py3"
+      "win-vs2019-cuda11.3-py3"
     ],
     "ciflow/default": [
       "linux-bionic-py3.8-gcc9-coverage",
-      "linux-xenial-cuda11.1-py3.6-gcc7",
+      "linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
       "win-vs2019-cpu-py3",
@@ -51,35 +51,35 @@
     ],
     "ciflow/libtorch": [
       "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
-      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
-      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7"
+      "libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
+      "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7"
     ],
     "ciflow/linux": [
       "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
-      "libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-bionic-cuda10.2-py3.9-gcc7",
       "linux-bionic-py3.8-gcc9-coverage",
       "linux-xenial-cuda10.2-py3.6-gcc7",
-      "linux-xenial-cuda11.1-py3.6-gcc7",
+      "linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
-      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
-      "periodic-linux-xenial-cuda11.3-py3.6-gcc7"
+      "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "periodic-linux-xenial-cuda11.1-py3.6-gcc7"
     ],
     "ciflow/scheduled": [
-      "periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
-      "periodic-linux-xenial-cuda11.3-py3.6-gcc7",
-      "periodic-win-vs2019-cuda11.3-py3"
+      "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+      "periodic-linux-xenial-cuda11.1-py3.6-gcc7",
+      "periodic-win-vs2019-cuda11.1-py3"
     ],
     "ciflow/slow": [
       "linux-bionic-cuda10.2-py3.9-gcc7",
       "linux-xenial-cuda10.2-py3.6-gcc7"
     ],
     "ciflow/win": [
-      "periodic-win-vs2019-cuda11.3-py3",
+      "periodic-win-vs2019-cuda11.1-py3",
       "win-vs2019-cpu-py3",
       "win-vs2019-cuda10.1-py3",
-      "win-vs2019-cuda11.1-py3"
+      "win-vs2019-cuda11.3-py3"
     ]
   },
   "version": "v1"
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index 097974d88acf1..e24c2e5af3893 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -214,8 +214,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
     ),
     CIWorkflow(
         arch="windows",
-        build_environment="win-vs2019-cuda11.1-py3",
-        cuda_version="11.1",
+        build_environment="win-vs2019-cuda11.3-py3",
+        cuda_version="11.3",
         test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
         num_test_shards=2,
         on_pull_request=True,
@@ -227,8 +227,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
     ),
     CIWorkflow(
         arch="windows",
-        build_environment="periodic-win-vs2019-cuda11.3-py3",
-        cuda_version="11.3",
+        build_environment="periodic-win-vs2019-cuda11.1-py3",
+        cuda_version="11.1",
         test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
         num_test_shards=2,
         is_scheduled="45 0,4,8,12,16,20 * * *",
@@ -337,8 +337,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
     ),
     CIWorkflow(
         arch="linux",
-        build_environment="linux-xenial-cuda11.1-py3.6-gcc7",
-        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
+        build_environment="linux-xenial-cuda11.3-py3.6-gcc7",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7",
         test_runner_type=LINUX_CUDA_TEST_RUNNER,
         num_test_shards=2,
         on_pull_request=True,
@@ -349,8 +349,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
     ),
     CIWorkflow(
         arch="linux",
-        build_environment="libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
-        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
+        build_environment="libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7",
         test_runner_type=LINUX_CUDA_TEST_RUNNER,
         is_libtorch=True,
         on_pull_request=True,
@@ -362,8 +362,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
     ),
     CIWorkflow(
         arch="linux",
-        build_environment="periodic-linux-xenial-cuda11.3-py3.6-gcc7",
-        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7",
+        build_environment="periodic-linux-xenial-cuda11.1-py3.6-gcc7",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
         test_runner_type=LINUX_CUDA_TEST_RUNNER,
         num_test_shards=2,
         is_scheduled="45 0,4,8,12,16,20 * * *",
@@ -376,8 +376,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
     ),
     CIWorkflow(
         arch="linux",
-        build_environment="periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
-        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7",
+        build_environment="periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
         test_runner_type=LINUX_CUDA_TEST_RUNNER,
         is_libtorch=True,
         is_scheduled="45 0,4,8,12,16,20 * * *",
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
similarity index 96%
rename from .github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
rename to .github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
index ba59027969b7a..95261026f3862 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -2,7 +2,7 @@
 
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
-name: libtorch-linux-xenial-cuda11.1-py3.6-gcc7
+name: libtorch-linux-xenial-cuda11.3-py3.6-gcc7
 
 on:
   pull_request:
@@ -14,8 +14,8 @@ on:
   workflow_dispatch:
 
 env:
-  BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda11.1-py3.6-gcc7
-  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7
+  BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda11.3-py3.6-gcc7
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
@@ -26,7 +26,7 @@ env:
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
 
 concurrency:
-  group: libtorch-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
   cancel-in-progress: true
 
 jobs:
@@ -120,7 +120,7 @@ jobs:
     needs: [calculate-docker-image, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
-      JOB_BASE_NAME: libtorch-linux-xenial-cuda11.1-py3.6-gcc7-build
+      JOB_BASE_NAME: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-build
     steps:
       - name: Log in to ECR
         run: |
diff --git a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
similarity index 97%
rename from .github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
rename to .github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 4275cc31ebddd..3273cb0395437 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -2,7 +2,7 @@
 
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
-name: linux-xenial-cuda11.1-py3.6-gcc7
+name: linux-xenial-cuda11.3-py3.6-gcc7
 
 on:
   pull_request:
@@ -14,8 +14,8 @@ on:
   workflow_dispatch:
 
 env:
-  BUILD_ENVIRONMENT: linux-xenial-cuda11.1-py3.6-gcc7
-  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7
+  BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
@@ -26,7 +26,7 @@ env:
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
 
 concurrency:
-  group: linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
   cancel-in-progress: true
 
 jobs:
@@ -120,7 +120,7 @@ jobs:
     needs: [calculate-docker-image, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
-      JOB_BASE_NAME: linux-xenial-cuda11.1-py3.6-gcc7-build
+      JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-build
     steps:
       - name: Log in to ECR
         run: |
@@ -256,7 +256,7 @@ jobs:
     runs-on: ${{ matrix.runner }}
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
-      JOB_BASE_NAME: linux-xenial-cuda11.1-py3.6-gcc7-test
+      JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-test
       TEST_CONFIG: ${{ matrix.config }}
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
@@ -323,7 +323,7 @@ jobs:
           env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
-          BUILD_ENVIRONMENT: linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }}
+          BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
@@ -468,7 +468,7 @@ jobs:
         env:
           AWS_DEFAULT_REGION: us-east-1
           CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          JOB_BASE_NAME: linux-xenial-cuda11.1-py3.6-gcc7-test
+          JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-test
           CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
similarity index 96%
rename from .github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
rename to .github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 1f4c6d270ec54..de0aa4bb3333c 100644
--- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -2,7 +2,7 @@
 
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
-name: periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7
+name: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7
 
 on:
   pull_request:
@@ -12,8 +12,8 @@ on:
   workflow_dispatch:
 
 env:
-  BUILD_ENVIRONMENT: periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7
-  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7
+  BUILD_ENVIRONMENT: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
@@ -24,7 +24,7 @@ env:
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
 
 concurrency:
-  group: periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
   cancel-in-progress: true
 
 jobs:
@@ -118,7 +118,7 @@ jobs:
     needs: [calculate-docker-image, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
-      JOB_BASE_NAME: periodic-libtorch-linux-xenial-cuda11.3-py3.6-gcc7-build
+      JOB_BASE_NAME: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-build
     steps:
       - name: Log in to ECR
         run: |
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
similarity index 97%
rename from .github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
rename to .github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 0282b206a117c..3f1b5b4a85f68 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -2,7 +2,7 @@
 
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
-name: periodic-linux-xenial-cuda11.3-py3.6-gcc7
+name: periodic-linux-xenial-cuda11.1-py3.6-gcc7
 
 on:
   pull_request:
@@ -12,8 +12,8 @@ on:
   workflow_dispatch:
 
 env:
-  BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.3-py3.6-gcc7
-  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7
+  BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
@@ -24,7 +24,7 @@ env:
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
 
 concurrency:
-  group: periodic-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
   cancel-in-progress: true
 
 jobs:
@@ -118,7 +118,7 @@ jobs:
     needs: [calculate-docker-image, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
-      JOB_BASE_NAME: periodic-linux-xenial-cuda11.3-py3.6-gcc7-build
+      JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.6-gcc7-build
     steps:
       - name: Log in to ECR
         run: |
@@ -254,7 +254,7 @@ jobs:
     runs-on: ${{ matrix.runner }}
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
-      JOB_BASE_NAME: periodic-linux-xenial-cuda11.3-py3.6-gcc7-test
+      JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.6-gcc7-test
       TEST_CONFIG: ${{ matrix.config }}
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
@@ -321,7 +321,7 @@ jobs:
           env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
-          BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }}
+          BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
@@ -466,7 +466,7 @@ jobs:
         env:
           AWS_DEFAULT_REGION: us-east-1
           CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          JOB_BASE_NAME: periodic-linux-xenial-cuda11.3-py3.6-gcc7-test
+          JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.6-gcc7-test
           CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
similarity index 97%
rename from .github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
rename to .github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
index 6d1eff302dba9..4bf74faae1843 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
@@ -2,7 +2,7 @@
 
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
-name: periodic-win-vs2019-cuda11.3-py3
+name: periodic-win-vs2019-cuda11.1-py3
 
 on:
   pull_request:
@@ -12,9 +12,9 @@ on:
   workflow_dispatch:
 
 env:
-  BUILD_ENVIRONMENT: periodic-win-vs2019-cuda11.3-py3
+  BUILD_ENVIRONMENT: periodic-win-vs2019-cuda11.1-py3
   BUILD_WHEEL: 1
-  CUDA_VERSION: "11.3"
+  CUDA_VERSION: "11.1"
   IN_CI: 1
   INSTALL_WINDOWS_SDK: 1
   PYTHON_VERSION: "3.8"
@@ -29,7 +29,7 @@ env:
   USE_CUDA: 1
 
 concurrency:
-  group: periodic-win-vs2019-cuda11.3-py3-${{ github.event.pull_request.number || github.sha }}
+  group: periodic-win-vs2019-cuda11.1-py3-${{ github.event.pull_request.number || github.sha }}
   cancel-in-progress: true
 
 jobs:
@@ -47,7 +47,7 @@ jobs:
         working-directory: pytorch-${{ github.run_id }}
     needs: [ciflow_should_run]
     env:
-      JOB_BASE_NAME: periodic-win-vs2019-cuda11.3-py3-build
+      JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-build
       http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
     steps:
@@ -150,7 +150,7 @@ jobs:
 
   test:
     env:
-      JOB_BASE_NAME: periodic-win-vs2019-cuda11.3-py3-test
+      JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       TEST_CONFIG: ${{ matrix.config }}
@@ -316,7 +316,7 @@ jobs:
         env:
           AWS_DEFAULT_REGION: us-east-1
           CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          JOB_BASE_NAME: periodic-win-vs2019-cuda11.3-py3-test
+          JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test
           CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
diff --git a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
similarity index 97%
rename from .github/workflows/generated-win-vs2019-cuda11.1-py3.yml
rename to .github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index 2b3a30c6187fd..1b423008fe5fd 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -2,7 +2,7 @@
 
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
-name: win-vs2019-cuda11.1-py3
+name: win-vs2019-cuda11.3-py3
 
 on:
   pull_request:
@@ -14,9 +14,9 @@ on:
   workflow_dispatch:
 
 env:
-  BUILD_ENVIRONMENT: win-vs2019-cuda11.1-py3
+  BUILD_ENVIRONMENT: win-vs2019-cuda11.3-py3
   BUILD_WHEEL: 1
-  CUDA_VERSION: "11.1"
+  CUDA_VERSION: "11.3"
   IN_CI: 1
   INSTALL_WINDOWS_SDK: 1
   PYTHON_VERSION: "3.8"
@@ -31,7 +31,7 @@ env:
   USE_CUDA: 1
 
 concurrency:
-  group: win-vs2019-cuda11.1-py3-${{ github.event.pull_request.number || github.sha }}
+  group: win-vs2019-cuda11.3-py3-${{ github.event.pull_request.number || github.sha }}
   cancel-in-progress: true
 
 jobs:
@@ -49,7 +49,7 @@ jobs:
         working-directory: pytorch-${{ github.run_id }}
     needs: [ciflow_should_run]
     env:
-      JOB_BASE_NAME: win-vs2019-cuda11.1-py3-build
+      JOB_BASE_NAME: win-vs2019-cuda11.3-py3-build
       http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
     steps:
@@ -152,7 +152,7 @@ jobs:
 
   test:
     env:
-      JOB_BASE_NAME: win-vs2019-cuda11.1-py3-test
+      JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       TEST_CONFIG: ${{ matrix.config }}
@@ -318,7 +318,7 @@ jobs:
         env:
           AWS_DEFAULT_REGION: us-east-1
           CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          JOB_BASE_NAME: win-vs2019-cuda11.1-py3-test
+          JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test
           CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}

From 01c35115d82c7e782304ea68ee23cccb735f5432 Mon Sep 17 00:00:00 2001
From: Ansley Ussery <ansley@fb.com>
Date: Wed, 25 Aug 2021 09:01:50 -0700
Subject: [PATCH 207/530] Fix bug in `check_empty_containers` (#63492)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63492

Test Plan: Imported from OSS

Reviewed By: bdhirsh

Differential Revision: D30402749

Pulled By: ansley

fbshipit-source-id: 7de533355fe91ca4f45b2bafc3bfb205a028c1ed
---
 test/jit/test_isinstance.py | 9 +++++++++
 test/test_jit.py            | 1 -
 torch/_jit_internal.py      | 2 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/test/jit/test_isinstance.py b/test/jit/test_isinstance.py
index 93b2605748516..5fd2b87965607 100644
--- a/test/jit/test_isinstance.py
+++ b/test/jit/test_isinstance.py
@@ -310,3 +310,12 @@ def fn(x: Any):
             x: int = 2
             fn(x)
             self.assertEqual(len(w), 0)
+
+    def test_empty_container_special_cases(self):
+        # Should not throw "Boolean value of Tensor with no values is
+        # ambiguous" error
+        torch._jit_internal.check_empty_containers(torch.Tensor([]))
+
+        # Should not throw "Boolean value of Tensor with more than
+        # one value is ambiguous" error
+        torch._jit_internal.check_empty_containers(torch.rand(2, 3))
diff --git a/test/test_jit.py b/test/test_jit.py
index 06afe656a8d3c..28de1722dde47 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -5944,7 +5944,6 @@ def test_bool_arith_not(lhs):
         self.assertEqual(test_bool_arith_not(torch.zeros(3)), 1)
         self.assertTrue(str(test_bool_arith_not.graph).count('if') == 0)
 
-
     def test_conditional_casting(self):
         def test_bool_cast_tensor(x):
             if x:
diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py
index bd7b616996a24..418607add7373 100644
--- a/torch/_jit_internal.py
+++ b/torch/_jit_internal.py
@@ -1133,7 +1133,7 @@ def check_args_exist(target_type) -> None:
 
 
 def check_empty_containers(obj) -> None:
-    if not obj:
+    if obj == [] or obj == {} or obj == ():
         warnings.warn("The inner type of a container is lost when "
                       "calling torch.jit.isinstance in eager mode. For "
                       "example, List[int] would become list and "

From 34ed16ffef80cf11c86a1f48d1b4930b71e19866 Mon Sep 17 00:00:00 2001
From: "Rong Rong (AI Infra)" <rongr@fb.com>
Date: Wed, 25 Aug 2021 09:04:28 -0700
Subject: [PATCH 208/530] Temporary fix for remote gpu execution issue (#63899)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63899

See: T99020845

Test Plan: sandcastle

Reviewed By: heitorschueroff

Differential Revision: D30527384

fbshipit-source-id: ce9933e5e181322c02d4ed17f3fdaabe4c5ba29e
---
 torch/testing/_internal/common_device_type.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index 8ec6e71d121ff..b5d61273afd3e 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -12,7 +12,7 @@
 import torch
 from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \
     skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \
-    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, DeterministicGuard, TEST_SKIP_NOARCH
+    IS_SANDCASTLE, IS_FBCODE, DeterministicGuard, TEST_SKIP_NOARCH
 from torch.testing._internal.common_cuda import _get_torch_cuda_version
 from torch.testing import \
     (get_all_dtypes)
@@ -469,13 +469,9 @@ def get_device_type_test_bases():
     test_bases: List[Any] = list()
 
     if IS_SANDCASTLE or IS_FBCODE:
-        if IS_REMOTE_GPU:
-            # Skip if sanitizer is enabled
-            if not TEST_WITH_ASAN and not TEST_WITH_TSAN and not TEST_WITH_UBSAN:
-                test_bases.append(CUDATestBase)
-        else:
-            test_bases.append(CPUTestBase)
-            test_bases.append(MetaTestBase)
+        # temporarily disable IS_REMOTE_GPU, see T99020845
+        test_bases.append(CPUTestBase)
+        test_bases.append(MetaTestBase)
     else:
         test_bases.append(CPUTestBase)
         if not TEST_SKIP_NOARCH:

From 5ab356ffe6001cc54d99096f1981ad41d9b69e93 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Wed, 25 Aug 2021 09:24:27 -0700
Subject: [PATCH 209/530] Update CMake minimum version to 3.10 (#63660)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63660

Test Plan: Imported from OSS

Reviewed By: janeyx99, mruberry

Differential Revision: D30543878

fbshipit-source-id: a7d938807653f39727f2cc7d7ca167200567b6a0
---
 .circleci/docker/build.sh                | 16 +++++++++++++++-
 .circleci/docker/common/install_cmake.sh |  3 +++
 .circleci/docker/common/install_conda.sh |  4 ++--
 .circleci/docker/ubuntu-cuda/Dockerfile  |  6 ++++++
 .jenkins/pytorch/build-asan.sh           |  3 +++
 .jenkins/pytorch/build.sh                |  2 ++
 CMakeLists.txt                           |  8 ++++----
 c10/CMakeLists.txt                       |  2 +-
 torch/CMakeLists.txt                     |  2 +-
 9 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh
index 2b916a19ae117..18d19ae5d586f 100755
--- a/.circleci/docker/build.sh
+++ b/.circleci/docker/build.sh
@@ -78,11 +78,13 @@ TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/u
 case "$image" in
   pytorch-linux-xenial-py3.8)
     ANACONDA_PYTHON_VERSION=3.8
+    CMAKE_VERSION=3.10.3
     GCC_VERSION=7
     # Do not install PROTOBUF, DB, and VISION as a test
     ;;
   pytorch-linux-xenial-py3.6-gcc5.4)
     ANACONDA_PYTHON_VERSION=3.6
+    CMAKE_VERSION=3.10.3
     GCC_VERSION=5
     PROTOBUF=yes
     DB=yes
@@ -91,11 +93,13 @@ case "$image" in
     ;;
   pytorch-linux-xenial-py3.6-gcc7.2)
     ANACONDA_PYTHON_VERSION=3.6
+    CMAKE_VERSION=3.10.3
     GCC_VERSION=7
     # Do not install PROTOBUF, DB, and VISION as a test
     ;;
   pytorch-linux-xenial-py3.6-gcc7)
     ANACONDA_PYTHON_VERSION=3.6
+    CMAKE_VERSION=3.10.3
     GCC_VERSION=7
     PROTOBUF=yes
     DB=yes
@@ -105,6 +109,7 @@ case "$image" in
     CUDA_VERSION=10.2
     CUDNN_VERSION=7
     ANACONDA_PYTHON_VERSION=3.6
+    CMAKE_VERSION=3.10.3
     GCC_VERSION=7
     PROTOBUF=yes
     DB=yes
@@ -115,6 +120,7 @@ case "$image" in
     CUDA_VERSION=11.1
     CUDNN_VERSION=8
     ANACONDA_PYTHON_VERSION=3.6
+    CMAKE_VERSION=3.10.3
     GCC_VERSION=7
     PROTOBUF=yes
     DB=yes
@@ -125,6 +131,7 @@ case "$image" in
     CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names
     CUDNN_VERSION=8
     ANACONDA_PYTHON_VERSION=3.6
+    CMAKE_VERSION=3.10.3
     GCC_VERSION=7
     PROTOBUF=yes
     DB=yes
@@ -134,6 +141,7 @@ case "$image" in
   pytorch-linux-xenial-py3-clang5-asan)
     ANACONDA_PYTHON_VERSION=3.6
     CLANG_VERSION=5.0
+    CMAKE_VERSION=3.10.3
     PROTOBUF=yes
     DB=yes
     VISION=yes
@@ -141,6 +149,7 @@ case "$image" in
   pytorch-linux-xenial-py3-clang7-asan)
     ANACONDA_PYTHON_VERSION=3.6
     CLANG_VERSION=7
+    CMAKE_VERSION=3.10.3
     PROTOBUF=yes
     DB=yes
     VISION=yes
@@ -148,6 +157,7 @@ case "$image" in
   pytorch-linux-xenial-py3-clang7-onnx)
     ANACONDA_PYTHON_VERSION=3.6
     CLANG_VERSION=7
+    CMAKE_VERSION=3.10.3
     PROTOBUF=yes
     DB=yes
     VISION=yes
@@ -155,16 +165,17 @@ case "$image" in
   pytorch-linux-xenial-py3-clang5-android-ndk-r19c)
     ANACONDA_PYTHON_VERSION=3.6
     CLANG_VERSION=5.0
+    CMAKE_VERSION=3.10.3
     LLVMDEV=yes
     PROTOBUF=yes
     ANDROID=yes
     ANDROID_NDK_VERSION=r19c
     GRADLE_VERSION=6.8.3
-    CMAKE_VERSION=3.7.0
     NINJA_VERSION=1.9.0
     ;;
   pytorch-linux-xenial-py3.6-clang7)
     ANACONDA_PYTHON_VERSION=3.6
+    CMAKE_VERSION=3.10.3
     CLANG_VERSION=7
     PROTOBUF=yes
     DB=yes
@@ -244,6 +255,9 @@ case "$image" in
     DB=yes
     VISION=yes
     echo "image '$image' did not match an existing build configuration"
+    if [[ "$image" == *xenial* ]]; then
+      CMAKE_VERSION=3.10.3
+    fi
     if [[ "$image" == *py* ]]; then
       extract_version_from_image_name py ANACONDA_PYTHON_VERSION
     fi
diff --git a/.circleci/docker/common/install_cmake.sh b/.circleci/docker/common/install_cmake.sh
index 3ef71031db38f..5aa564d7c478c 100755
--- a/.circleci/docker/common/install_cmake.sh
+++ b/.circleci/docker/common/install_cmake.sh
@@ -4,6 +4,9 @@ set -ex
 
 [ -n "$CMAKE_VERSION" ]
 
+# Remove system cmake install so it won't get used instead
+apt-get remove cmake -y
+
 # Turn 3.6.3 into v3.6
 path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/')
 file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz"
diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh
index 86dbb153b2925..f12ae38aa58bd 100755
--- a/.circleci/docker/common/install_conda.sh
+++ b/.circleci/docker/common/install_conda.sh
@@ -69,8 +69,8 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
   }
 
   # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
-  # DO NOT install cmake here as it would install a version newer than 3.5, but
-  # we want to pin to version 3.5.
+  # DO NOT install cmake here as it would install a version newer than 3.10, but
+  # we want to pin to version 3.10.
   SCIPY_VERSION=1.1.0
   if [ "$ANACONDA_PYTHON_VERSION" = "3.9" ]; then
     # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
diff --git a/.circleci/docker/ubuntu-cuda/Dockerfile b/.circleci/docker/ubuntu-cuda/Dockerfile
index 003538f576bd5..84075db161358 100644
--- a/.circleci/docker/ubuntu-cuda/Dockerfile
+++ b/.circleci/docker/ubuntu-cuda/Dockerfile
@@ -65,6 +65,12 @@ ADD ./common/install_openssl.sh install_openssl.sh
 ENV OPENSSL_ROOT_DIR /opt/openssl
 RUN bash ./install_openssl.sh
 
+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+ADD ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # Install ccache/sccache (do this last, so we get priority in PATH)
 ADD ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh
index 37dfeebdbd332..8d0bcd2555342 100755
--- a/.jenkins/pytorch/build-asan.sh
+++ b/.jenkins/pytorch/build-asan.sh
@@ -16,6 +16,9 @@ clang --version
 # detect_leaks=0: Python is very leaky, so we need suppress it
 # symbolize=1: Gives us much better errors when things go wrong
 export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_odr_violation=0
+if [ -n "$(which conda)" ]; then
+  export CMAKE_PREFIX_PATH=/opt/conda
+fi
 
 # FIXME: Remove the hardcoded "-pthread" option.
 # With asan build, the cmake thread CMAKE_HAVE_LIBC_CREATE[1] checking will
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index f6ac52aed99c4..d7b66e7c9177e 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -88,6 +88,8 @@ if ! which conda; then
   else
     export USE_MKLDNN=0
   fi
+else
+  export CMAKE_PREFIX_PATH=/opt/conda
 fi
 
 if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4d1653ffaded3..db38d592c55b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
 #cmake_policy(SET CMP0022 NEW)
 #cmake_policy(SET CMP0023 NEW)
 
@@ -323,9 +323,9 @@ option(WERROR "Build with -Werror supported by the compiler" OFF)
 if(USE_CCACHE)
   find_program(CCACHE_PROGRAM ccache)
   if(CCACHE_PROGRAM)
-    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
-    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
-    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "C compiler launcher")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "CXX compiler launcher")
+    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "CUDA compiler launcher")
   else()
     message(STATUS "Could not find ccache. Consider installing ccache to speed up compilation.")
   endif()
diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt
index 3d2d4352ffef4..23a0e024d35ed 100644
--- a/c10/CMakeLists.txt
+++ b/c10/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
 project(c10 CXX)
 
 set(CMAKE_CXX_STANDARD 14)
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 90504f025c4a3..761605fadcce8 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -2,7 +2,7 @@
 # Now it only builds the Torch python bindings.
 
 if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO)
-  cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+  cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
   project(torch CXX C)
   find_package(torch REQUIRED)
   option(USE_CUDA "Use CUDA" ON)

From c1dfd58715c73dba3c089b2993e62d03a8647407 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Wed, 25 Aug 2021 09:35:26 -0700
Subject: [PATCH 210/530] Minor OptionalTensorRef updates (#63611)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63611

A few minor updates to `OptionalTensorRef`:
1. use `Tensor`'s `unsafe_borrow_t` constructor which avoids an unnecesary `nullptr` check.
2. copy constructor cannot defer to the `const Tensor&` constructor because it checks the tensor is
defined, and so would fail for disengaged optionals.
3. use copy-swap idiom to avoid issues with self-assignment. `x = x` should be a no-op, but the old
version would clear `x`.
4. Add pointer-like access for consistency with `optional` and `MaybeOwned`

Test Plan: Imported from OSS

Reviewed By: bdhirsh

Differential Revision: D30484704

Pulled By: ezyang

fbshipit-source-id: 738f4bd22359eaecd0a519a04e89a4b44d92da5b
---
 aten/src/ATen/core/Tensor.h          | 24 +++++++++++++-----------
 aten/src/ATen/templates/TensorBody.h |  2 ++
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h
index fb0f86952bea4..fa2479c800c05 100644
--- a/aten/src/ATen/core/Tensor.h
+++ b/aten/src/ATen/core/Tensor.h
@@ -6,28 +6,22 @@
 namespace at {
 class TORCH_API OptionalTensorRef {
  public:
-  OptionalTensorRef() {}
+  OptionalTensorRef() = default;
 
   ~OptionalTensorRef() {
     ref_.unsafeReleaseTensorImpl();
   }
 
   OptionalTensorRef(const Tensor& src)
-      : ref_(c10::intrusive_ptr<TensorImpl>(
-            src.unsafeGetTensorImpl(),
-            c10::raw::DontIncreaseRefcount{})) {
+      : ref_(Tensor::unsafe_borrow_t{}, src) {
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src.defined());
   }
 
   OptionalTensorRef(const OptionalTensorRef& rhs)
-      : OptionalTensorRef(rhs.ref_) {}
+      : ref_(Tensor::unsafe_borrow_t{}, rhs.ref_) {}
 
-  OptionalTensorRef& operator=(const OptionalTensorRef& rhs) {
-    // Need to call unsafeReleaseTensorImpl on ref_ since we are reassigning it
-    // (which does not call the destructor).
-    ref_.unsafeReleaseTensorImpl();
-    ref_ = Tensor(c10::intrusive_ptr<TensorImpl>(
-        rhs.ref_.unsafeGetTensorImpl(), c10::raw::DontIncreaseRefcount{}));
+  OptionalTensorRef& operator=(OptionalTensorRef rhs) {
+    std::swap(ref_, rhs.ref_);
     return *this;
   }
 
@@ -39,6 +33,14 @@ class TORCH_API OptionalTensorRef {
     return ref_;
   }
 
+  const Tensor& operator*() const & {
+    return ref_;
+  }
+
+  const Tensor* operator->() const & {
+    return &ref_;
+  }
+
   operator bool() const {
     return ref_.defined();
   }
diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h
index a6e6583c7b19c..95312ff5d10f3 100644
--- a/aten/src/ATen/templates/TensorBody.h
+++ b/aten/src/ATen/templates/TensorBody.h
@@ -52,6 +52,7 @@ struct Node;
 
 namespace at {
 
+class OptionalTensorRef;
 class Tensor;
 using TensorList = ArrayRef<Tensor>;
 
@@ -96,6 +97,7 @@ class TORCH_API Tensor {
   explicit Tensor(unsafe_borrow_t, const Tensor& rhs)
       : impl_(c10::intrusive_ptr<at::TensorImpl, UndefinedTensorImpl>::reclaim(rhs.impl_.get())) {}
   friend MaybeOwnedTraits<Tensor>;
+  friend OptionalTensorRef;
 
  public:
   Tensor(){};

From 83b132b112c2e035a23dcab4a88393209c4325ee Mon Sep 17 00:00:00 2001
From: Karen Zhou <kazhou@fb.com>
Date: Wed, 25 Aug 2021 09:55:02 -0700
Subject: [PATCH 211/530] [pruner] add support for pruning BatchNorm2d (#63519)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63519

If the pruner should be pruning biases along with weights, then if the model has BatchNorm2d following pruned Conv2d layers, then the corresponding channels of the BatchNorm must also be pruned.

Specifically, they need to zeroed out, rather than fully removed, since in eager mode, the dimensions between layers need to be preserved.

To do this, we add a pruning parametrization called `ZeroesParametrization` which zeroes out pruned channels, rather than removing them.

The user must provide in the config, a tuple of the Conv2d and BatchNorm layers that go together. The `prepare` method will add the tuple to the `module_groups`; then it will add a PruningParametrization to the Conv2d layer, and a ZeroesParametrization to BatchNorm, and then set their pruned sets to be the same set. That way, during `step`, both masks are updated with the same pruned indices.

ghstack-source-id: 136562278

Test Plan:
`buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner`

https://pxl.cl/1N1P6

Reviewed By: z-a-f

Differential Revision: D30349855

fbshipit-source-id: 3199d3688d5a70963f9b32d7a8fdac3962ae6a65
---
 test/ao/sparsity/test_pruner.py               | 157 ++++++++++----
 torch/ao/sparsity/__init__.py                 |   1 +
 .../experimental/pruner/base_pruner.py        | 200 +++++++++++++++---
 .../experimental/pruner/parametrization.py    |  13 ++
 4 files changed, 298 insertions(+), 73 deletions(-)

diff --git a/test/ao/sparsity/test_pruner.py b/test/ao/sparsity/test_pruner.py
index c358df6ac95ae..663c6f033a9ce 100644
--- a/test/ao/sparsity/test_pruner.py
+++ b/test/ao/sparsity/test_pruner.py
@@ -4,7 +4,7 @@
 
 import torch
 from torch import nn
-from torch.ao.sparsity import BasePruner, PruningParametrization
+from torch.ao.sparsity import BasePruner, PruningParametrization, ZeroesParametrization
 from torch.nn.utils import parametrize
 
 from torch.testing._internal.common_utils import TestCase
@@ -13,8 +13,13 @@
 
 DEVICES = {"cpu", "cuda" if torch.cuda.is_available() else "cpu"}
 
+NEEDS_ZEROS = {  # these layers should have pruned indices zero-ed, not removed
+    nn.BatchNorm2d
+}
+
 
 class Linear(nn.Module):
+    r"""Model with Linear layers, in Sequential and outside, without biases"""
     def __init__(self):
         super().__init__()
         self.seq = nn.Sequential(
@@ -29,6 +34,7 @@ def forward(self, x):
 
 
 class LinearB(nn.Module):
+    r"""Model with Linear layers, in Sequential and outside, with biases"""
     def __init__(self):
         super().__init__()
         self.seq = nn.Sequential(
@@ -43,6 +49,8 @@ def forward(self, x):
 
 
 class MultipleLinear(nn.Module):
+    r"""Model with multiple Linear layers, in Sequential and outside, without biases
+    and with activation functions"""
     def __init__(self):
         super().__init__()
         self.seq = nn.Sequential(
@@ -61,6 +69,8 @@ def forward(self, x):
 
 
 class MultipleLinearB(nn.Module):
+    r"""Model with multiple Linear layers, in Sequential and outside, with biases
+    and with activation functions"""
     def __init__(self):
         super().__init__()
         self.seq = nn.Sequential(
@@ -79,6 +89,8 @@ def forward(self, x):
 
 
 class MultipleLinearMixed(nn.Module):
+    r"""Model with multiple Linear layers, in Sequential and outside, some with biases
+    and with activation functions"""
     def __init__(self):
         super().__init__()
         self.seq = nn.Sequential(
@@ -97,6 +109,7 @@ def forward(self, x):
 
 
 class Conv2dA(nn.Module):
+    r"""Model with Conv2d layers, in Sequential and outside, without biases"""
     def __init__(self):
         super().__init__()
         self.seq = nn.Sequential(
@@ -111,6 +124,7 @@ def forward(self, x):
 
 
 class Conv2dB(nn.Module):
+    r"""Model with Conv2d layers, in Sequential and outside, with biases"""
     def __init__(self):
         super().__init__()
         self.seq = nn.Sequential(
@@ -125,6 +139,7 @@ def forward(self, x):
 
 
 class Conv2dC(nn.Module):
+    r"""Model with Conv2d layers, in Sequential and outside, with and without biases"""
     def __init__(self):
         super().__init__()
         self.seq = nn.Sequential(
@@ -138,6 +153,24 @@ def forward(self, x):
         return x
 
 
+class Conv2dBN(nn.Module):
+    r"""Model with Conv2d layers and BatchNorms"""
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, bias=True),
+            nn.BatchNorm2d(32)
+        )
+        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=True)
+        self.bn = nn.BatchNorm2d(64)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d(x)
+        x = self.bn(x)
+        return x
+
+
 class SimplePruner(BasePruner):
     def update_mask(self, layer, **kwargs):
         layer.parametrizations.weight[0].pruned_outputs.add(1)
@@ -150,35 +183,66 @@ def update_mask(self, layer, **kwargs):
 
 class TestBasePruner(TestCase):
     def _check_pruner_prepared(self, model, pruner, device):
-        for g in pruner.module_groups:
-            module = g['module']
-            assert module.weight.device == device
-            # Check mask exists
-            assert hasattr(module, 'mask')
-            # Check parametrization exists and is correct
-            assert parametrize.is_parametrized(module)
-            assert hasattr(module, "parametrizations")
-            # Assume that this is the 1st/only parametrization
-            assert type(module.parametrizations.weight[0]) == PruningParametrization
+        for config in pruner.module_groups:
+            modules = []
+            if type(config['module']) is tuple:
+                for module in config['module']:
+                    modules.append(module)
+            else:
+                module = config['module']
+                modules.append(module)
+            for module in modules:
+                assert module.weight.device == device
+                # Check mask exists
+                assert hasattr(module, 'mask')
+                # Check parametrization exists and is correct
+                assert parametrize.is_parametrized(module)
+                assert hasattr(module, "parametrizations")
+                # Assume that this is the 1st/only parametrization
+                if isinstance(module, tuple(NEEDS_ZEROS)):
+                    assert type(module.parametrizations.weight[0]) == ZeroesParametrization
+                else:
+                    assert type(module.parametrizations.weight[0]) == PruningParametrization
 
     def _check_pruner_mask_squashed(self, model, pruner, device):
-        for g in pruner.module_groups:
-            module = g['module']
-            assert module.weight.device == device
-            assert not hasattr(module, "parametrizations")
-            assert not hasattr(module, 'mask')
+        for config in pruner.module_groups:
+            modules = []
+            if type(config['module']) is tuple:
+                for module in config['module']:
+                    modules.append(module)
+            else:
+                module = config['module']
+                modules.append(module)
+            for module in modules:
+                assert module.weight.device == device
+                assert not hasattr(module, "parametrizations")
+                assert not hasattr(module, 'mask')
 
     def _check_pruner_valid_before_step(self, model, pruner, device):
-        for g in pruner.module_groups:
-            module = g['module']
-            assert module.weight.device == device
-            assert module.parametrizations.weight[0].pruned_outputs == set()
+        for config in pruner.module_groups:
+            modules = []
+            if type(config['module']) is tuple:
+                for module in config['module']:
+                    modules.append(module)
+            else:
+                module = config['module']
+                modules.append(module)
+            for module in modules:
+                assert module.weight.device == device
+                assert module.parametrizations.weight[0].pruned_outputs == set()
 
     def _check_pruner_valid_after_step(self, model, pruner, pruned_set, device):
-        for g in pruner.module_groups:
-            module = g['module']
-            assert module.weight.device == device
-            assert module.parametrizations.weight[0].pruned_outputs == pruned_set
+        for config in pruner.module_groups:
+            modules = []
+            if type(config['module']) is tuple:
+                for module in config['module']:
+                    modules.append(module)
+            else:
+                module = config['module']
+                modules.append(module)
+            for module in modules:
+                assert module.weight.device == device
+                assert module.parametrizations.weight[0].pruned_outputs == pruned_set
 
     def _test_constructor_on_device(self, model, device):
         self.assertRaisesRegex(TypeError, 'with abstract methods update_mask',
@@ -218,19 +282,23 @@ def test_prepare_linear(self):
             for model in models:
                 self._test_prepare_linear_on_device(model, torch.device(device))
 
-    def _test_prepare_conv2d_on_device(self, model, device):
+    def _test_prepare_conv2d_on_device(self, model, config, device):
         model = model.to(device)
         x = torch.ones((1, 1, 28, 28))
         pruner = SimplePruner(None)
-        pruner.prepare(model, None)
+        pruner.prepare(model, config)
         self._check_pruner_prepared(model, pruner, device)
         assert model(x).shape == (1, 64, 24, 24)
 
     def test_prepare_conv2d(self):
-        models = [Conv2dA(), Conv2dB(), Conv2dC()]
+        bn_model = Conv2dBN()
+        bn_config = [(bn_model.seq[0], bn_model.seq[1]), (bn_model.conv2d, bn_model.bn)]
+
+        models = [Conv2dA(), Conv2dB(), Conv2dC(), bn_model]
+        configs = [None, None, None, bn_config]
         for device in DEVICES:
-            for model in models:
-                self._test_prepare_conv2d_on_device(model, torch.device(device))
+            for model, config in zip(models, configs):
+                self._test_prepare_conv2d_on_device(model, config, torch.device(device))
 
     def _test_squash_mask_linear_on_device(self, model, device):
         model = model.to(device)
@@ -247,20 +315,24 @@ def test_squash_mask_linear(self):
             for model in models:
                 self._test_squash_mask_linear_on_device(model, torch.device(device))
 
-    def _test_squash_mask_conv2d_on_device(self, model, device):
+    def _test_squash_mask_conv2d_on_device(self, model, config, device):
         model = model.to(device)
         x = torch.ones((1, 1, 28, 28))
         pruner = SimplePruner(None)
-        pruner.prepare(model, None)
+        pruner.prepare(model, config)
         pruner.squash_mask()
         self._check_pruner_mask_squashed(model, pruner, device)
         assert model(x).shape == (1, 64, 24, 24)
 
     def test_squash_mask_conv2d(self):
-        models = [Conv2dA(), Conv2dB(), Conv2dC()]
+        bn_model = Conv2dBN()
+        bn_config = [(bn_model.seq[0], bn_model.seq[1]), (bn_model.conv2d, bn_model.bn)]
+
+        models = [Conv2dA(), Conv2dB(), Conv2dC(), bn_model]
+        configs = [None, None, None, bn_config]
         for device in DEVICES:
-            for model in models:
-                self._test_squash_mask_conv2d_on_device(model, torch.device(device))
+            for model, config in zip(models, configs):
+                self._test_squash_mask_conv2d_on_device(model, config, torch.device(device))
 
     def _test_step_linear_on_device(self, model, is_basic, device):
         model = model.to(device)
@@ -288,18 +360,25 @@ def test_step_linear(self):
             for model in complex_models:
                 self._test_step_linear_on_device(model, False, torch.device(device))
 
-    def _test_step_conv2d_on_device(self, model, device):
+    def _test_step_conv2d_on_device(self, model, config, device):
         model = model.to(device)
         x = torch.ones((1, 1, 28, 28))
         pruner = SimplePruner(None)
-        pruner.prepare(model, None)
+        pruner.prepare(model, config)
         self._check_pruner_valid_before_step(model, pruner, device)
         pruner.step()
+        if type(model) is Conv2dBN:
+            assert model.seq[1].parametrizations.weight[0].pruned_outputs == model.seq[0].parametrizations.weight[0].pruned_outputs
+            assert model.bn.parametrizations.weight[0].pruned_outputs == model.conv2d.parametrizations.weight[0].pruned_outputs
         self._check_pruner_valid_after_step(model, pruner, {1}, device)
         assert model(x).shape == (1, 64, 24, 24)
 
     def test_step_conv2d(self):
-        models = [Conv2dA(), Conv2dB(), Conv2dC()]
+        bn_model = Conv2dBN()
+        bn_config = [(bn_model.seq[0], bn_model.seq[1]), (bn_model.conv2d, bn_model.bn)]
+
+        models = [Conv2dA(), Conv2dB(), Conv2dC(), bn_model]
+        configs = [None, None, None, bn_config]
         for device in DEVICES:
-            for model in models:
-                self._test_step_conv2d_on_device(model, torch.device(device))
+            for model, config in zip(models, configs):
+                self._test_step_conv2d_on_device(model, config, torch.device(device))
diff --git a/torch/ao/sparsity/__init__.py b/torch/ao/sparsity/__init__.py
index 06854a42cf9ce..80aa30814eac8 100644
--- a/torch/ao/sparsity/__init__.py
+++ b/torch/ao/sparsity/__init__.py
@@ -19,6 +19,7 @@
 
 # Parametrizations
 from .experimental.pruner.parametrization import PruningParametrization
+from .experimental.pruner.parametrization import ZeroesParametrization
 from .experimental.pruner.parametrization import ActivationReconstruction
 from .experimental.pruner.parametrization import BiasHook
 
diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py
index a8a7b69141be3..6baeb6efda849 100644
--- a/torch/ao/sparsity/experimental/pruner/base_pruner.py
+++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py
@@ -1,4 +1,6 @@
 
+import copy
+import warnings
 import abc
 
 import torch
@@ -7,13 +9,18 @@
 
 from torch.nn.modules.container import ModuleDict, ModuleList
 
-from .parametrization import PruningParametrization, ActivationReconstruction, BiasHook
+from .parametrization import PruningParametrization, ZeroesParametrization, ActivationReconstruction, BiasHook
 
-from torch.ao.sparsity import BaseSparsifier, fqn_to_module
+from torch.ao.sparsity import BaseSparsifier, module_to_fqn, fqn_to_module
 
-SUPPORTED_MODULES = {
+SUPPORTED_MODULES = {  # added to config if None given
     nn.Linear,
-    nn.Conv2d
+    nn.Conv2d,
+    nn.BatchNorm2d,  # will need manual update to match conv2d
+}
+
+NEEDS_ZEROS = {  # these layers should have pruned indices zero-ed, not removed
+    nn.BatchNorm2d
 }
 
 
@@ -44,45 +51,170 @@ def _prepare(self, use_path=False, *args, **kwargs):
         self.bias_handles = []
 
         for config in self.module_groups:
+            modules = []
             if use_path:
-                module = fqn_to_module(self.model, config['fqn'])
+                if type(config['module']) is tuple:  # (Conv2d, BN)
+                    for fqn in config['fqn']:
+                        module = fqn_to_module(self.model, fqn)
+                        modules.append(module)
+                else:
+                    module = fqn_to_module(self.model, config['fqn'])
+                    modules.append(module)
             else:
-                module = config['module']
-
-            if getattr(module, 'mask', None) is None:
-                module.register_buffer('mask', torch.tensor(module.weight.shape[0]))
-            param = config.get('parametrization', PruningParametrization)
-            parametrize.register_parametrization(module, 'weight',
-                                                 param(module.mask),
-                                                 unsafe=True)
-
-            assert isinstance(module.parametrizations, ModuleDict)  # make mypy happy
-            assert isinstance(module.parametrizations.weight, ModuleList)
-            if isinstance(module, tuple(SUPPORTED_MODULES)):
-                self.activation_handles.append(module.register_forward_hook(
-                    ActivationReconstruction(module.parametrizations.weight[0])
-                ))
+                if type(config['module']) is tuple:
+                    for module in config['module']:
+                        modules.append(module)
+                else:
+                    module = config['module']
+                    modules.append(module)
+
+            for module in modules:
+                if not isinstance(module, tuple(NEEDS_ZEROS)):
+                    # add pruning parametrization and forward hooks
+                    if getattr(module, 'mask', None) is None:
+                        module.register_buffer('mask', torch.tensor(module.weight.shape[0]))
+                    param = config.get('parametrization', PruningParametrization)
+                    parametrize.register_parametrization(module, 'weight', param(module.mask), unsafe=True)
+
+                    assert isinstance(module.parametrizations, ModuleDict)  # make mypy happy
+                    assert isinstance(module.parametrizations.weight, ModuleList)
+                    if isinstance(module, tuple(SUPPORTED_MODULES)):
+                        self.activation_handles.append(module.register_forward_hook(
+                            ActivationReconstruction(module.parametrizations.weight[0])
+                        ))
+                    else:
+                        raise NotImplementedError("This module type is not supported yet.")
+
+                else:  # needs zeros
+                    if getattr(module, 'mask', None) is None:
+                        module.register_buffer('mask', torch.tensor(module.weight.shape[0]))
+                    param = config.get('parametrization', ZeroesParametrization)
+                    parametrize.register_parametrization(module, 'weight', param(module.mask), unsafe=True)
+
+                if module.bias is not None:
+                    module.register_parameter('_bias', nn.Parameter(module.bias.detach()))
+                    module.bias = None
+                self.bias_handles.append(module.register_forward_hook(BiasHook(module.parametrizations.weight[0], self.prune_bias)))
+
+            if len(modules) == 2:  # (Conv2d, BN)
+                # should have the same set of pruned outputs
+                modules[1].parametrizations.weight[0].pruned_outputs = modules[0].parametrizations.weight[0].pruned_outputs
+
+
+    def prepare(self, model, config):
+        r"""Prepares a model, by adding the parametrizations and forward post-hooks.
+        Note::
+            The model is modified inplace. If you need to preserve the original
+            model, use copy.deepcopy.
+
+        Args:
+        - model [nn.Module]: model to configure. The model itself is not saved
+            but used for the state_dict saving / loading.
+        - config [list]: configuration elements could either be instances of
+            nn.Module or dict maps. The dicts must have a key 'module' with the
+            value being an instance of a nn.Module.
+        """
+        self.model = model  # TODO: Need to figure out how to load without this.
+        self.config = config
+
+        # If no config -- try getting all the supported layers
+        if self.config is None:
+            # Add all models to the config
+            self.config = []
+            stack = [model]
+            while stack:
+                module = stack.pop()
+                for name, child in module.named_children():
+                    if type(child) in SUPPORTED_MODULES:
+                        self.config.append(child)
+                    else:
+                        if type(child) in NEEDS_ZEROS and self.prune_bias:
+                            warnings.warn(f"Models with {type(child)} layers have config provided by user.")
+                        stack.append(child)
+
+        for module_config in self.config:
+            if type(module_config) is tuple:
+                first_layer, next_layer = module_config
+                assert isinstance(first_layer, nn.Conv2d) and isinstance(next_layer, nn.BatchNorm2d)
+                module_config = {'module': module_config}
+                local_args = copy.deepcopy(self.defaults)
+                local_args.update(module_config)
+                fqn_list = []
+                for module in local_args['module']:
+                    module_fqn = module_to_fqn(model, module)
+                    if module_fqn and module_fqn[0] == '.':
+                        module_fqn = module_fqn[1:]
+                    fqn_list.append(module_fqn)
+                local_args['fqn'] = fqn_list
             else:
-                raise NotImplementedError("This module type is not supported yet.")
+                if isinstance(module_config, nn.Module):
+                    module_config = {'module': module_config}
+                local_args = copy.deepcopy(self.defaults)
+                local_args.update(module_config)
+                module = local_args['module']
+                module_fqn = module_to_fqn(model, module)
+                if module_fqn and module_fqn[0] == '.':
+                    module_fqn = module_fqn[1:]
+                local_args['fqn'] = module_fqn
+
+            self.module_groups.append(local_args)
 
-            if module.bias is not None:
-                module.register_parameter('_bias', nn.Parameter(module.bias.detach()))
-                module.bias = None
-            self.bias_handles.append(module.register_forward_hook(BiasHook(module.parametrizations.weight[0], self.prune_bias)))
+        self._prepare()
 
     def squash_mask(self, use_path=False, *args, **kwargs):
         for config in self.module_groups:
+            modules = []
             if use_path:
-                module = fqn_to_module(self.model, config['fqn'])
+                if type(config['module']) is tuple:  # (Conv2d, BN)
+                    for fqn in config['fqn']:
+                        module = fqn_to_module(self.model, fqn)
+                        modules.append(module)
+                else:
+                    module = fqn_to_module(self.model, config['fqn'])
+                    modules.append(module)
             else:
-                module = config['module']
-            parametrize.remove_parametrizations(module, 'weight',
-                                                leave_parametrized=True)
-            if getattr(module._parameters, 'mask', None):
-                del module._parameters['mask']
-            elif getattr(module._buffers, 'mask', None):
-                del module._buffers['mask']
-            delattr(module, 'mask')
+                if type(config['module']) is tuple:
+                    for module in config['module']:
+                        modules.append(module)
+                else:
+                    module = config['module']
+                    modules.append(module)
+
+            for module in modules:
+                parametrize.remove_parametrizations(module, 'weight',
+                                                    leave_parametrized=True)
+                if getattr(module._parameters, 'mask', None):
+                    del module._parameters['mask']
+                elif getattr(module._buffers, 'mask', None):
+                    del module._buffers['mask']
+                delattr(module, 'mask')
+
+    def step(self, use_path=False):
+        if not self.enable_mask_update:
+            return
+        with torch.no_grad():
+            for config in self.module_groups:
+                modules = []
+                if use_path:
+                    if type(config['module']) is tuple:  # (Conv2d, BN)
+                        for fqn in config['fqn']:
+                            module = fqn_to_module(self.model, fqn)
+                            modules.append(module)
+                    else:
+                        module = fqn_to_module(self.model, config['fqn'])
+                        modules.append(module)
+                else:
+                    if type(config['module']) is tuple:
+                        for module in config['module']:
+                            modules.append(module)
+                    else:
+                        module = config['module']
+                        modules.append(module)
+
+                # only need to update the first module in modules if len(modules) > 1
+                # since they should share the same set of pruned outputs
+                module = modules[0]
+                self.update_mask(module, **config)
 
     @abc.abstractmethod
     def update_mask(self, layer, **kwargs):
diff --git a/torch/ao/sparsity/experimental/pruner/parametrization.py b/torch/ao/sparsity/experimental/pruner/parametrization.py
index 696b16e1edccc..0ee937a4a8ae4 100644
--- a/torch/ao/sparsity/experimental/pruner/parametrization.py
+++ b/torch/ao/sparsity/experimental/pruner/parametrization.py
@@ -14,6 +14,19 @@ def forward(self, x):
         return x[list(valid_outputs)]
 
 
+class ZeroesParametrization(nn.Module):
+    r"""Zero out pruned channels instead of removing.
+    E.g. used for Batch Norm pruning, which should match previous Conv2d layer."""
+    def __init__(self, original_outputs):
+        super().__init__()
+        self.original_outputs = set(range(original_outputs.item()))
+        self.pruned_outputs = set()  # Will contain indicies of outputs to prune
+
+    def forward(self, x):
+        x.data[list(self.pruned_outputs)] = 0
+        return x
+
+
 class ActivationReconstruction:
     def __init__(self, parametrization):
         self.param = parametrization

From eebac46282a9166ae330816a7203da13e7b272ad Mon Sep 17 00:00:00 2001
From: Karen Zhou <kazhou@fb.com>
Date: Wed, 25 Aug 2021 09:55:02 -0700
Subject: [PATCH 212/530] [pruner] add getter for pruned outputs in base pruner
 (#63520)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63520

Rather than having to call `module.parametrizations.weight[0].pruned_outputs` each time we need to access the set of pruned indices, we add a getter `get_module_pruned_outputs` which takes the module as an argument and returns the set.

This is used for testing.
ghstack-source-id: 136561130

Test Plan:
` buck test mode/dev-nosan //caffe2/test:ao -- TestBasePruner`

https://pxl.cl/1N4gK

Reviewed By: z-a-f

Differential Revision: D30374558

fbshipit-source-id: e38dfee0879cadde52b942e899a3d8d7151ee493
---
 test/ao/sparsity/test_pruner.py                     |  4 ++--
 .../ao/sparsity/experimental/pruner/base_pruner.py  | 13 +++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/test/ao/sparsity/test_pruner.py b/test/ao/sparsity/test_pruner.py
index 663c6f033a9ce..55364536b6191 100644
--- a/test/ao/sparsity/test_pruner.py
+++ b/test/ao/sparsity/test_pruner.py
@@ -368,8 +368,8 @@ def _test_step_conv2d_on_device(self, model, config, device):
         self._check_pruner_valid_before_step(model, pruner, device)
         pruner.step()
         if type(model) is Conv2dBN:
-            assert model.seq[1].parametrizations.weight[0].pruned_outputs == model.seq[0].parametrizations.weight[0].pruned_outputs
-            assert model.bn.parametrizations.weight[0].pruned_outputs == model.conv2d.parametrizations.weight[0].pruned_outputs
+            assert pruner.get_module_pruned_outputs(model.seq[1]) == pruner.get_module_pruned_outputs(model.seq[0])
+            assert pruner.get_module_pruned_outputs(model.bn) == pruner.get_module_pruned_outputs(model.conv2d)
         self._check_pruner_valid_after_step(model, pruner, {1}, device)
         assert model(x).shape == (1, 64, 24, 24)
 
diff --git a/torch/ao/sparsity/experimental/pruner/base_pruner.py b/torch/ao/sparsity/experimental/pruner/base_pruner.py
index 6baeb6efda849..6017e8f53ae69 100644
--- a/torch/ao/sparsity/experimental/pruner/base_pruner.py
+++ b/torch/ao/sparsity/experimental/pruner/base_pruner.py
@@ -189,6 +189,19 @@ def squash_mask(self, use_path=False, *args, **kwargs):
                     del module._buffers['mask']
                 delattr(module, 'mask')
 
+    def get_module_pruned_outputs(self, module):
+        r"""Returns the set of pruned indices of module"""
+        assert parametrize.is_parametrized(module)  # can only get pruned indices of pruned module
+        modules = {config['module'] for config in self.module_groups}
+        module_list = set()
+        for m in modules:
+            if type(m) is tuple:
+                module_list.update(m)
+            else:
+                module_list.add(m)
+        assert module in module_list  # check that module is in pruner.module_groups
+        return module.parametrizations.weight[0].pruned_outputs  # assume only one parametrization attached
+
     def step(self, use_path=False):
         if not self.enable_mask_update:
             return

From 6324d98e9e736214ee7a161482b02aaf8cebbc9d Mon Sep 17 00:00:00 2001
From: riship <riship@nvidia.com>
Date: Wed, 25 Aug 2021 09:56:41 -0700
Subject: [PATCH 213/530] bf16 Error message cleanup as well as addition of
 is_bf16_supported (#63798)

Summary:
ngimel

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63798

Reviewed By: heitorschueroff

Differential Revision: D30526187

Pulled By: ngimel

fbshipit-source-id: c484aec14638097c96c720095d3491249b6b2d14
---
 torch/autocast_mode.py          | 12 ++++++------
 torch/cpu/amp/autocast_mode.py  |  4 ++--
 torch/cuda/__init__.py          |  8 ++++++++
 torch/cuda/amp/autocast_mode.py |  4 ++--
 4 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/torch/autocast_mode.py b/torch/autocast_mode.py
index ec9fdb0326d62..97d51b8f1ca7b 100644
--- a/torch/autocast_mode.py
+++ b/torch/autocast_mode.py
@@ -80,7 +80,7 @@ def forward(self, input):
         c_float32 = torch.rand((8, 8), device="cpu")
         d_float32 = torch.rand((8, 8), device="cpu")
 
-        with autocast(fast_dtype=torch.bfloat16, device_type="cpu"):
+        with autocast(dtype=torch.bfloat16, device_type="cpu"):
             # torch.mm is on autocast's list of ops that should run in bfloat16.
             # Inputs are float32, but the op runs in bfloat16 and produces bfloat16 output.
             # No manual casts are required.
@@ -125,7 +125,7 @@ def forward(self, input):
     Args:
         device_type(string, required):  Whether to use 'cuda' or 'cpu' device
         enabled(bool, optional, default=True)":  Whether autocasting should be enabled in the region.
-        fast_dtype(torch_dtype, optional):  Whether to use torch.float16 or torch.bfloat16
+        dtype(torch_dtype, optional):  Whether to use torch.float16 or torch.bfloat16
     """
     def __init__(self, device_type, enabled=True, **kwargs):
         self.device = device_type
@@ -139,9 +139,9 @@ def __init__(self, device_type, enabled=True, **kwargs):
             warnings.warn('User provided device_type of \'cuda\', but CUDA is not available. Disabling')
             enabled = False
         for key, value in kwargs.items():
-            if key == 'fast_dtype':
+            if key == 'dtype':
                 self.fast_dtype = value
-            if not (key == 'fast_dtype'):
+            if not (key == 'dtype'):
                 raise RuntimeError('Unrecognized optional argument supplied to autocast context manager: ' + str(key))
 
         if self.device == 'cpu':
@@ -152,8 +152,8 @@ def __init__(self, device_type, enabled=True, **kwargs):
                 warnings.warn(error_message)
                 enabled = False
         if self.device == 'cuda':
-            if self.fast_dtype == torch.bfloat16 and torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
-                raise RuntimeError('Current CUDA Device does not support bfloat16. Switching fast_dtype to float16.')
+            if self.fast_dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported():
+                raise RuntimeError('Current CUDA Device does not support bfloat16. Please switch dtype to float16.')
         self._enabled = enabled
 
     def __enter__(self):
diff --git a/torch/cpu/amp/autocast_mode.py b/torch/cpu/amp/autocast_mode.py
index 027ef382f1599..08ea200a2bdc4 100644
--- a/torch/cpu/amp/autocast_mode.py
+++ b/torch/cpu/amp/autocast_mode.py
@@ -5,5 +5,5 @@ class autocast(torch.autocast_mode.autocast):
     See :class:`torch.autocast`.
     ``torch.cpu.amp.autocast(args...)`` is equivalent to ``torch.autocast("cpu", args...)``
     """
-    def __init__(self, enabled=True, fast_dtype=torch.float16):
-        super().__init__("cpu", enabled=enabled, fast_dtype=fast_dtype)
+    def __init__(self, enabled=True, dtype=torch.float16):
+        super().__init__("cpu", enabled=enabled, dtype=dtype)
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 29e112fc67abd..d5a9cbb52f34f 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -78,6 +78,14 @@ def is_available() -> bool:
     # be initialized
     return torch._C._cuda_getDeviceCount() > 0
 
+def is_bf16_supported():
+    r"""Returns a bool indicating if the current CUDA device supports dtype bfloat16"""
+    cu_vers = torch.version.cuda
+    if cu_vers is not None:
+        cuda_maj_decide = int(cu_vers.split(',')[0]) >= 11
+    else:
+        cuda_maj_decide = False
+    return torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 8 and cuda_maj_decide
 
 def _sleep(cycles):
     torch._C._cuda_sleep(cycles)
diff --git a/torch/cuda/amp/autocast_mode.py b/torch/cuda/amp/autocast_mode.py
index e9bfe06a0a352..ca8a2fcaf29d5 100644
--- a/torch/cuda/amp/autocast_mode.py
+++ b/torch/cuda/amp/autocast_mode.py
@@ -13,8 +13,8 @@ class autocast(torch.autocast_mode.autocast):
     See :class:`torch.autocast`.
     ``torch.cuda.amp.autocast(args...)`` is equivalent to ``torch.autocast("cuda", args...)``
     """
-    def __init__(self, enabled=True, fast_dtype=torch.float16):
-        super().__init__("cuda", enabled=enabled, fast_dtype=fast_dtype)
+    def __init__(self, enabled=True, dtype=torch.float16):
+        super().__init__("cuda", enabled=enabled, dtype=dtype)
 
 
 # Casts Tensors and containers of Tensors.  Special-cases passthroughs for strings and np.ndarrays, which

From c06dfd7c26102ac2436ca25609c92fa794e972ca Mon Sep 17 00:00:00 2001
From: Shiyan Deng <dsy842974287@fb.com>
Date: Wed, 25 Aug 2021 10:22:17 -0700
Subject: [PATCH 214/530] [fx2trt] Check input device in TRTModule (#63893)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63893

Add a check to ensure all the inputs are on cuda device.

Test Plan: CI

Reviewed By: kflu, houseroad

Differential Revision: D30525265

fbshipit-source-id: 6e50b70fd535defc1f802d51e8bb991b2dd73741
---
 torch/fx/experimental/fx2trt/fx2trt.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py
index 0e7cc24c18be5..ede99fd6f1700 100644
--- a/torch/fx/experimental/fx2trt/fx2trt.py
+++ b/torch/fx/experimental/fx2trt/fx2trt.py
@@ -86,6 +86,7 @@ def forward(self, *inputs):
         bindings: List[Any] = [None] * (len(self.input_names) + len(self.output_names))
 
         for i, input_name in enumerate(self.input_names):
+            assert inputs[i].is_cuda, f"{i}th input is not on cuda device."
             idx = self.engine.get_binding_index(input_name)
             bindings[idx] = contiguous_inputs[i].data_ptr()
 

From ab954cb0d176a5632f123ac19d9469e6f863d39a Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Wed, 25 Aug 2021 11:07:24 -0700
Subject: [PATCH 215/530] clean up engine.cpp thread state (#63115)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63115

This actually changes:
- callbacks now run with proper grad mode even in worker threads
- graphtask's Future callbacks now run with proper TLS when erroring
  out from a worker thread

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D30388100

Pulled By: albanD

fbshipit-source-id: 7ae9c461c2f0040548dd9e1e314f25e8da0c2e67
---
 torch/csrc/autograd/engine.cpp                        | 11 ++++++-----
 .../csrc/distributed/autograd/engine/dist_engine.cpp  |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp
index 252a74b4c07c7..de2078d2d6432 100644
--- a/torch/csrc/autograd/engine.cpp
+++ b/torch/csrc/autograd/engine.cpp
@@ -407,7 +407,12 @@ auto Engine::thread_main(const std::shared_ptr<GraphTask>& graph_task) -> void {
       }
 
       if (task.fn_ && !local_graph_task->has_error_.load()) {
+        // Set the ThreadLocalState before calling the function.
+        // NB: The ThreadLocalStateGuard doesn't set the grad_mode because GraphTask
+        // always saves ThreadLocalState without grad_mode.
+        at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_);
         AutoGradMode grad_mode(local_graph_task->grad_mode_);
+
         try {
           // The guard sets the thread_local current_graph_task on construction
           // and restores it on exit. The current_graph_task variable helps
@@ -575,6 +580,7 @@ void GraphTask::exec_post_processing() {
     // NB: The ThreadLocalStateGuard doesn't set the grad_mode because GraphTask
     // always saves ThreadLocalState without grad_mode.
     at::ThreadLocalStateGuard tls_guard(this->thread_locals_);
+    AutoGradMode grad_mode(this->grad_mode_);
 
     // WARNING: Don't use a range-for loop here because more callbacks may be
     // added in between callback calls, so iterators may become invalidated.
@@ -764,11 +770,6 @@ void Engine::evaluate_function(
     Node* func,
     InputBuffer& inputs,
     const std::shared_ptr<ReadyQueue>& cpu_ready_queue) {
-  // Set the ThreadLocalState before calling the function.
-  // NB: The ThreadLocalStateGuard doesn't set the grad_mode because GraphTask
-  // always saves ThreadLocalState without grad_mode.
-  at::ThreadLocalStateGuard tls_guard(graph_task->thread_locals_);
-
   // The InputBuffer::adds that supplied incoming grads took pains to
   // ensure they're safe to consume in the context of the present
   // func's stream (if applicable). So we guard onto that stream
diff --git a/torch/csrc/distributed/autograd/engine/dist_engine.cpp b/torch/csrc/distributed/autograd/engine/dist_engine.cpp
index 76f2eaebe5f77..4a3b3fff2e20b 100644
--- a/torch/csrc/distributed/autograd/engine/dist_engine.cpp
+++ b/torch/csrc/distributed/autograd/engine/dist_engine.cpp
@@ -359,6 +359,7 @@ void DistEngine::execute_graph_task_until_ready_queue_empty(
         continue;
       }
       if (task.fn_ && !local_graph_task->has_error_.load()) {
+        at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_);
         AutoGradMode grad_mode(local_graph_task->grad_mode_);
         try {
           GraphTaskGuard guard(local_graph_task);

From 8a22d4fa5c5953bcb83293b5349b5f79ae08f193 Mon Sep 17 00:00:00 2001
From: Aayush Prakash <aayushp@fb.com>
Date: Wed, 25 Aug 2021 11:11:08 -0700
Subject: [PATCH 216/530] [Reland] Replacing the p.data acccess in utils with
 tensor.set_ . Passes both test_post_localSGD_optimizer_pari and
 test_periodic_model_averager tests (#63895)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63895

When updating the model parameter, updating `parameter.data` is no longer recommended, because this `data` field will be deprecated in the future.

The replacement is `tensor.set_`.
ghstack-source-id: 136593433

Test Plan:
buck test mode/dev-nosan //caffe2/test/distributed:distributed_nccl_spawn -- test_periodic_model_averager
buck test mode/dev-nosan //caffe2/test/distributed:distributed_nccl_spawn -- test_post_localSGD_optimizer_parity

Reviewed By: SciPioneer

Differential Revision: D30526178

fbshipit-source-id: a1ac0ec3665d8623edd5bf94f01c1132daff5c00
---
 torch/distributed/algorithms/model_averaging/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/distributed/algorithms/model_averaging/utils.py b/torch/distributed/algorithms/model_averaging/utils.py
index 44ee422b9e92d..ce1fb65401ad2 100644
--- a/torch/distributed/algorithms/model_averaging/utils.py
+++ b/torch/distributed/algorithms/model_averaging/utils.py
@@ -29,5 +29,6 @@ def average_parameters(
 
     offset = 0
     for p in params_it2:
-        p.data = flat_params[offset : offset + p.numel()].view_as(p)
+        with torch.no_grad():
+            p.set_(flat_params[offset : offset + p.numel()].view_as(p).type_as(p))  # type: ignore[call-overload]
         offset += p.numel()

From a2399a76e18b31747019ead8f80e2a4eb53e8223 Mon Sep 17 00:00:00 2001
From: Raghavan Raman <raghavanr@fb.com>
Date: Wed, 25 Aug 2021 11:12:57 -0700
Subject: [PATCH 217/530] [Static Runtime] Moved NNC operator definitions to
 separate files. (#63838)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63838

Refactored NNC operator definitions code into separate files.

Made `TEWrapper` a class with a fixed set of methods and added separate definitions for them based on `TORCH_ENABLE_LLVM` to keep the same functionality as before.

Test Plan: Build and ran Static Runtime tests.

Reviewed By: hlu1

Differential Revision: D30405467

fbshipit-source-id: 606ef852bb820d5e23a0f8af1bf5dc122e90bceb
---
 tools/build_variables.bzl                    |   1 +
 torch/csrc/jit/runtime/static/ops.cpp        | 193 +------------------
 torch/csrc/jit/runtime/static/te_wrapper.cpp | 184 ++++++++++++++++++
 torch/csrc/jit/runtime/static/te_wrapper.h   |  33 ++++
 4 files changed, 219 insertions(+), 192 deletions(-)
 create mode 100644 torch/csrc/jit/runtime/static/te_wrapper.cpp
 create mode 100644 torch/csrc/jit/runtime/static/te_wrapper.h

diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 157c30663ce1c..0d888ea8a4cb0 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -329,6 +329,7 @@ core_sources_full = core_sources_full_mobile + [
     "torch/csrc/jit/runtime/static/native_ops.cpp",
     "torch/csrc/jit/runtime/static/ops.cpp",
     "torch/csrc/jit/runtime/static/passes.cpp",
+    "torch/csrc/jit/runtime/static/te_wrapper.cpp",
     "torch/csrc/jit/tensorexpr/external_functions.cpp",
     "torch/csrc/jit/tensorexpr/external_functions_codegen.cpp",
 ]
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index e6af641083fc2..d9fb9bad080f0 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -15,6 +15,7 @@
 #include <ATen/native/quantized/cpu/qembeddingbag.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/runtime/static/te_wrapper.h>
 #include <torch/csrc/jit/runtime/vararg_functions.h>
 #include <torch/csrc/jit/tensorexpr/ir.h>
 #include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
@@ -493,198 +494,6 @@ REGISTER_OPERATOR_FUNCTOR(aten::leaky_relu, aten_leaky_relu, [](Node* n) -> SROp
   };
 });
 
-namespace {
-
-// Use the width of an AVX-512 vector by default; this happens to work OK
-// for AVX2 as well. Some ops benefit from using multiple AVX ports, in
-// which case they are vectorized by twice this constant.  An exception is
-// logit, since it contains FP divide, which is single-ported.
-static constexpr int kVectorWidth = 16;
-
-#ifdef TORCH_ENABLE_LLVM
-
-struct TEWrapper {
-  std::unique_ptr<tensorexpr::LLVMCodeGen> cg;
-  TEWrapper() = default;
-  void update(std::unique_ptr<tensorexpr::LLVMCodeGen>&& cg_) {
-    cg = std::move(cg_);
-  }
-
-  void call(const std::vector<void*>& args) {
-    cg->call_raw(args);
-  }
-
-  inline bool supports(const at::Tensor& t) {
-    return t.is_contiguous() && t.dtype().Match<float>();
-  }
-};
-
-void optimizePointwise(
-    tensorexpr::LoopNest* ln,
-    tensorexpr::Tensor target,
-    int width) {
-  using namespace torch::jit::tensorexpr;
-  std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
-  ForPtr inner, tail;
-  TORCH_CHECK(loops.size() > 0, "No loops created for pointwise op");
-  ln->splitWithTail(loops[0], width, &inner, &tail);
-  ln->vectorize(inner);
-}
-
-std::shared_ptr<TEWrapper> wrapTECompute(
-    std::shared_ptr<TEWrapper> wrap,
-    tensorexpr::Placeholder& in,
-    tensorexpr::Tensor out,
-    tensorexpr::VarHandle& dim,
-    int width = kVectorWidth) {
-  using namespace torch::jit::tensorexpr;
-  LoopNest ln({out});
-  optimizePointwise(&ln, out, width);
-  ln.prepareForCodegen();
-  StmtPtr s = ln.root_stmt();
-  s = tensorexpr::IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(out);
-  args.emplace_back(in);
-  args.emplace_back(dim);
-  auto cg = std::make_unique<LLVMCodeGen>(s, args);
-  wrap->update(std::move(cg));
-  return wrap;
-};
-
-#else
-
-struct TEWrapper {
-  TEWrapper() = default;
-  template <typename... Ts>
-  void operator()(const Ts&... ts) {
-    DCHECK(0 && "Invalid call");
-  }
-  void call(const std::vector<void*>& args) {
-    DCHECK(0 && "Invalid call");
-  }
-
-  inline bool supports(const at::Tensor& t) {
-    return false;
-  }
-};
-
-std::shared_ptr<TEWrapper> wrapTECompute(
-    std::shared_ptr<TEWrapper> wrap,
-    tensorexpr::Placeholder& in,
-    tensorexpr::Tensor out,
-    tensorexpr::VarHandle& dim,
-    int width = kVectorWidth) {
-  return wrap;
-};
-
-#endif
-
-std::mutex& getNNCCacheMutex() {
-  static std::mutex nncCacheMutex;
-  return nncCacheMutex;
-}
-
-std::unordered_map<NodeKind, std::shared_ptr<TEWrapper>>& getNNCCache() {
-  static std::unordered_map<NodeKind, std::shared_ptr<TEWrapper>> nncCache;
-  return nncCache;
-}
-
-std::shared_ptr<TEWrapper> lookupNNCCache(NodeKind kind) {
-  std::lock_guard<std::mutex> lock(getNNCCacheMutex());
-  auto it = getNNCCache().find(kind);
-  if (it != getNNCCache().end()) {
-    return it->second;
-  }
-  return nullptr;
-}
-
-void updateNNCCache(NodeKind kind, std::shared_ptr<TEWrapper> code) {
-  std::lock_guard<std::mutex> lock(getNNCCacheMutex());
-  getNNCCache()[kind] = code;
-}
-
-} // namespace
-
-std::shared_ptr<TEWrapper> createLogit(c10::optional<float> clamp) {
-  using namespace torch::jit::tensorexpr;
-  // TODO: Use NNC cache for this op.
-  auto wrap = std::make_shared<TEWrapper>();
-  auto N = VarHandle("N", kInt);
-  Placeholder A("A", kFloat, {N});
-  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
-    auto A_elem = [&]() {
-      if (!clamp) {
-        return A.load(i);
-      } else {
-        auto elem = A.load(i);
-        auto min = FloatImm::make(*clamp);
-        auto max = FloatImm::make(1.0f - *clamp);
-        elem = CompareSelect::make(elem, min, min, elem, kLT);
-        return CompareSelect::make(elem, max, max, elem, kGT);
-      }
-    }();
-    return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem));
-  });
-  return wrapTECompute(wrap, A, B, N);
-}
-
-std::shared_ptr<TEWrapper> createRelu() {
-  using namespace torch::jit::tensorexpr;
-  auto wrap = lookupNNCCache(aten::relu);
-  if (wrap) {
-    return wrap;
-  }
-  wrap = std::make_shared<TEWrapper>();
-  auto N = VarHandle("N", kInt);
-  Placeholder A("A", kFloat, {N});
-  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
-    auto zero = FloatImm::make(0.f);
-    auto a = A.load(i);
-    return ifThenElse(a < zero, zero, a);
-  });
-  wrap = wrapTECompute(wrap, A, B, N);
-  updateNNCCache(aten::relu, wrap);
-  return wrap;
-}
-
-std::shared_ptr<TEWrapper> createTanh() {
-  using namespace torch::jit::tensorexpr;
-  auto wrap = lookupNNCCache(aten::tanh);
-  if (wrap) {
-    return wrap;
-  }
-  wrap = std::make_shared<TEWrapper>();
-  auto N = VarHandle("N", kInt);
-  Placeholder A("A", kFloat, {N});
-  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
-    auto a = A.load(i);
-    return fast_tanh(a);
-  });
-  wrap = wrapTECompute(wrap, A, B, N);
-  updateNNCCache(aten::tanh, wrap);
-  return wrap;
-}
-
-std::shared_ptr<TEWrapper> createSigmoid() {
-  using namespace torch::jit::tensorexpr;
-  auto wrap = lookupNNCCache(aten::sigmoid);
-  if (wrap) {
-    return wrap;
-  }
-  wrap = std::make_shared<TEWrapper>();
-  auto N = VarHandle("N", kInt);
-  Placeholder A("A", kFloat, {N});
-  Tensor B =
-      Compute("B", {N}, [&](const VarHandle& i) { return sigmoid(A.load(i)); });
-  // NNC uses sleef for vectorizing sigmoid, which comes in an 8-wide flavor
-  // (Sleef_expf8).
-  constexpr int kSleefWidth = 8;
-  wrap = wrapTECompute(wrap, A, B, N, kSleefWidth);
-  updateNNCCache(aten::sigmoid, wrap);
-  return wrap;
-}
-
 REGISTER_OPERATOR_FUNCTOR(aten::relu, aten_relu, [](Node* n) -> SROperator {
   if (!n->matches(torch::schema("aten::relu(Tensor self) -> Tensor"))) {
     LogAndDumpSchema(n);
diff --git a/torch/csrc/jit/runtime/static/te_wrapper.cpp b/torch/csrc/jit/runtime/static/te_wrapper.cpp
new file mode 100644
index 0000000000000..413d44348a5c0
--- /dev/null
+++ b/torch/csrc/jit/runtime/static/te_wrapper.cpp
@@ -0,0 +1,184 @@
+#include <torch/csrc/jit/runtime/static/te_wrapper.h>
+
+#include <ATen/CPUFunctions.h>
+#include <torch/csrc/jit/ir/ir.h>
+
+namespace torch {
+namespace jit {
+
+using namespace torch::jit::tensorexpr;
+
+// Use the width of an AVX-512 vector by default; this happens to work OK for
+// AVX2 as well. Some ops benefit from using multiple AVX ports, in which case
+// they are vectorized by twice this constant.  An exception is logit, since it
+// contains FP divide, which is single-ported.
+static constexpr int kVectorWidth = 16;
+
+#ifdef TORCH_ENABLE_LLVM
+
+void TEWrapper::update(std::unique_ptr<LLVMCodeGen>&& cg_) {
+  cg = std::move(cg_);
+}
+
+void TEWrapper::call(const std::vector<void*>& args) {
+  cg->call_raw(args);
+}
+
+bool TEWrapper::supports(const at::Tensor& t) {
+  return t.is_contiguous() && t.dtype().Match<float>();
+}
+
+void optimizePointwise(LoopNest* ln, Tensor target, int width) {
+  std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
+  ForPtr inner, tail;
+  TORCH_CHECK(loops.size() > 0, "No loops created for pointwise op");
+  ln->splitWithTail(loops[0], width, &inner, &tail);
+  ln->vectorize(inner);
+}
+
+std::shared_ptr<TEWrapper> wrapTECompute(
+    std::shared_ptr<TEWrapper> wrap,
+    Placeholder& in,
+    Tensor out,
+    VarHandle& dim,
+    int width = kVectorWidth) {
+  LoopNest ln({out});
+  optimizePointwise(&ln, out, width);
+  ln.prepareForCodegen();
+  StmtPtr s = ln.root_stmt();
+  s = IRSimplifier::simplify(s);
+  std::vector<CodeGen::BufferArg> args;
+  args.emplace_back(out);
+  args.emplace_back(in);
+  args.emplace_back(dim);
+  auto cg = std::make_unique<LLVMCodeGen>(s, args);
+  wrap->update(std::move(cg));
+  return wrap;
+};
+
+#else
+
+void TEWrapper::call(const std::vector<void*>& args) {
+  DCHECK(0 && "Invalid call");
+}
+
+bool TEWrapper::supports(const at::Tensor& t) {
+  return false;
+}
+
+std::shared_ptr<TEWrapper> wrapTECompute(
+    std::shared_ptr<TEWrapper> wrap,
+    Placeholder& in,
+    Tensor out,
+    VarHandle& dim,
+    int width = kVectorWidth) {
+  return wrap;
+};
+
+#endif
+
+namespace {
+
+std::mutex& getNNCCacheMutex() {
+  static std::mutex nncCacheMutex;
+  return nncCacheMutex;
+}
+
+std::unordered_map<NodeKind, std::shared_ptr<TEWrapper>>& getNNCCache() {
+  static std::unordered_map<NodeKind, std::shared_ptr<TEWrapper>> nncCache;
+  return nncCache;
+}
+
+std::shared_ptr<TEWrapper> lookupNNCCache(NodeKind kind) {
+  std::lock_guard<std::mutex> lock(getNNCCacheMutex());
+  auto it = getNNCCache().find(kind);
+  if (it != getNNCCache().end()) {
+    return it->second;
+  }
+  return nullptr;
+}
+
+void updateNNCCache(NodeKind kind, std::shared_ptr<TEWrapper> code) {
+  std::lock_guard<std::mutex> lock(getNNCCacheMutex());
+  getNNCCache()[kind] = code;
+}
+
+} // namespace
+
+std::shared_ptr<TEWrapper> createLogit(c10::optional<float> clamp) {
+  // TODO: Use NNC cache for this op.
+  auto wrap = std::make_shared<TEWrapper>();
+  auto N = VarHandle("N", kInt);
+  Placeholder A("A", kFloat, {N});
+  Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
+    auto A_elem = [&]() {
+      if (!clamp) {
+        return A.load(i);
+      } else {
+        auto elem = A.load(i);
+        auto min = FloatImm::make(*clamp);
+        auto max = FloatImm::make(1.0f - *clamp);
+        elem = CompareSelect::make(elem, min, min, elem, kLT);
+        return CompareSelect::make(elem, max, max, elem, kGT);
+      }
+    }();
+    return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem));
+  });
+  return wrapTECompute(wrap, A, B, N);
+}
+
+std::shared_ptr<TEWrapper> createRelu() {
+  auto wrap = lookupNNCCache(aten::relu);
+  if (wrap) {
+    return wrap;
+  }
+  wrap = std::make_shared<TEWrapper>();
+  auto N = VarHandle("N", kInt);
+  Placeholder A("A", kFloat, {N});
+  Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
+    auto zero = FloatImm::make(0.f);
+    auto a = A.load(i);
+    return ifThenElse(a < zero, zero, a);
+  });
+  wrap = wrapTECompute(wrap, A, B, N);
+  updateNNCCache(aten::relu, wrap);
+  return wrap;
+}
+
+std::shared_ptr<TEWrapper> createTanh() {
+  auto wrap = lookupNNCCache(aten::tanh);
+  if (wrap) {
+    return wrap;
+  }
+  wrap = std::make_shared<TEWrapper>();
+  auto N = VarHandle("N", kInt);
+  Placeholder A("A", kFloat, {N});
+  Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
+    auto a = A.load(i);
+    return fast_tanh(a);
+  });
+  wrap = wrapTECompute(wrap, A, B, N);
+  updateNNCCache(aten::tanh, wrap);
+  return wrap;
+}
+
+std::shared_ptr<TEWrapper> createSigmoid() {
+  auto wrap = lookupNNCCache(aten::sigmoid);
+  if (wrap) {
+    return wrap;
+  }
+  wrap = std::make_shared<TEWrapper>();
+  auto N = VarHandle("N", kInt);
+  Placeholder A("A", kFloat, {N});
+  Tensor B =
+      Compute("B", {N}, [&](const VarHandle& i) { return sigmoid(A.load(i)); });
+  // NNC uses sleef for vectorizing sigmoid, which comes in an 8-wide flavor
+  // (Sleef_expf8).
+  constexpr int kSleefWidth = 8;
+  wrap = wrapTECompute(wrap, A, B, N, kSleefWidth);
+  updateNNCCache(aten::sigmoid, wrap);
+  return wrap;
+}
+
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/runtime/static/te_wrapper.h b/torch/csrc/jit/runtime/static/te_wrapper.h
new file mode 100644
index 0000000000000..776602dc3edec
--- /dev/null
+++ b/torch/csrc/jit/runtime/static/te_wrapper.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <torch/csrc/jit/tensorexpr/codegen.h>
+#include <torch/csrc/jit/tensorexpr/ir.h>
+#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
+#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
+#include <torch/csrc/jit/tensorexpr/loopnest.h>
+
+namespace torch {
+namespace jit {
+
+class TEWrapper {
+ public:
+  TEWrapper() = default;
+  void call(const std::vector<void*>& args);
+  bool supports(const at::Tensor& t);
+#ifdef TORCH_ENABLE_LLVM
+  void update(std::unique_ptr<tensorexpr::LLVMCodeGen>&& cg_);
+#endif
+
+ private:
+#ifdef TORCH_ENABLE_LLVM
+  std::unique_ptr<tensorexpr::LLVMCodeGen> cg;
+#endif
+};
+
+std::shared_ptr<TEWrapper> createLogit(c10::optional<float> clamp);
+std::shared_ptr<TEWrapper> createRelu();
+std::shared_ptr<TEWrapper> createTanh();
+std::shared_ptr<TEWrapper> createSigmoid();
+
+} // namespace jit
+} // namespace torch

From dde07cad6f029c2727487544b851dfd74945efb1 Mon Sep 17 00:00:00 2001
From: Raghavan Raman <raghavanr@fb.com>
Date: Wed, 25 Aug 2021 11:12:57 -0700
Subject: [PATCH 218/530] [Static Runtime] Added a variable for clamp in the
 NNC code for Logit. (#63839)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63839

Replaced the use of a constant for clamp in the NNC code for Logit
with a variable. This makes it easier to enable caching for Logit.

There is no performance difference with this change, as shown in the micro-benchmarks below.

```
Logit NNC Benchmark	Time (ns)
	           const-clamp	var-clamp
logit_nnc_sleef/64	550	543
logit_nnc_sleef/512	3514	3517
logit_nnc_sleef/8192	85537	82900
logit_nnc_sleef/32768	347635	337016
logit_nnc_fast/64	173	167
logit_nnc_fast/512	829	866
logit_nnc_fast/8192	13286	13069
logit_nnc_fast/32768	51116	53429
logit_nnc_vml/64	146	164
logit_nnc_vml/512	773	783
logit_nnc_vml/8192	11556	11563
logit_nnc_vml/32768	44815	46720
```

Test Plan: SR unit tests and the inline_cvr model.

Reviewed By: bertmaher

Differential Revision: D30405466

fbshipit-source-id: adb891fdae5746439931ce5f43165291fec08f52
---
 torch/csrc/jit/runtime/static/ops.cpp        |  8 ++--
 torch/csrc/jit/runtime/static/te_wrapper.cpp | 41 ++++++++------------
 torch/csrc/jit/runtime/static/te_wrapper.h   |  2 +-
 3 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index d9fb9bad080f0..140fdf188a951 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -579,8 +579,9 @@ REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator {
         ? c10::make_optional<float>(static_cast<float>(clamp_d.value()))
         : c10::nullopt;
   }
-  auto te = clamp ? createLogit(clamp) : nullptr;
-  return [te](ProcessedNode* p_node) {
+  auto te = clamp ? createLogit() : nullptr;
+  float clamp_value = clamp ? *clamp : 0.0f;
+  return [te, clamp_value](ProcessedNode* p_node) {
     const auto& in0_t = p_node->Input(0).toTensor();
     if (p_node->Output(0).isNone()) {
       p_node->Output(0) = create_empty_from(in0_t);
@@ -594,7 +595,8 @@ REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator {
     } else {
       at::native::resize_(out_t, in0_t.sizes(), c10::nullopt);
       int64_t nn = in0_t.numel();
-      te->call({out_t.data_ptr(), in0_t.data_ptr(), &nn});
+      float c = clamp_value;
+      te->call({out_t.data_ptr(), in0_t.data_ptr(), &nn, &c});
     }
   };
 });
diff --git a/torch/csrc/jit/runtime/static/te_wrapper.cpp b/torch/csrc/jit/runtime/static/te_wrapper.cpp
index 413d44348a5c0..9c3cbe9ac5941 100644
--- a/torch/csrc/jit/runtime/static/te_wrapper.cpp
+++ b/torch/csrc/jit/runtime/static/te_wrapper.cpp
@@ -38,23 +38,19 @@ void optimizePointwise(LoopNest* ln, Tensor target, int width) {
 
 std::shared_ptr<TEWrapper> wrapTECompute(
     std::shared_ptr<TEWrapper> wrap,
-    Placeholder& in,
     Tensor out,
-    VarHandle& dim,
+    std::vector<CodeGen::BufferArg> args,
     int width = kVectorWidth) {
   LoopNest ln({out});
   optimizePointwise(&ln, out, width);
   ln.prepareForCodegen();
   StmtPtr s = ln.root_stmt();
   s = IRSimplifier::simplify(s);
-  std::vector<CodeGen::BufferArg> args;
-  args.emplace_back(out);
-  args.emplace_back(in);
-  args.emplace_back(dim);
+  args.insert(args.begin(), out);
   auto cg = std::make_unique<LLVMCodeGen>(s, args);
   wrap->update(std::move(cg));
   return wrap;
-};
+}
 
 #else
 
@@ -68,12 +64,11 @@ bool TEWrapper::supports(const at::Tensor& t) {
 
 std::shared_ptr<TEWrapper> wrapTECompute(
     std::shared_ptr<TEWrapper> wrap,
-    Placeholder& in,
     Tensor out,
-    VarHandle& dim,
+    std::vector<CodeGen::BufferArg> args,
     int width = kVectorWidth) {
   return wrap;
-};
+}
 
 #endif
 
@@ -105,26 +100,24 @@ void updateNNCCache(NodeKind kind, std::shared_ptr<TEWrapper> code) {
 
 } // namespace
 
-std::shared_ptr<TEWrapper> createLogit(c10::optional<float> clamp) {
+std::shared_ptr<TEWrapper> createLogit() {
   // TODO: Use NNC cache for this op.
   auto wrap = std::make_shared<TEWrapper>();
   auto N = VarHandle("N", kInt);
+  auto C = VarHandle("C", kFloat);
   Placeholder A("A", kFloat, {N});
   Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
     auto A_elem = [&]() {
-      if (!clamp) {
-        return A.load(i);
-      } else {
-        auto elem = A.load(i);
-        auto min = FloatImm::make(*clamp);
-        auto max = FloatImm::make(1.0f - *clamp);
-        elem = CompareSelect::make(elem, min, min, elem, kLT);
-        return CompareSelect::make(elem, max, max, elem, kGT);
-      }
+      auto elem = A.load(i);
+      auto one = FloatImm::make(1.0f);
+      const auto& min = C;
+      auto max = one - C;
+      elem = CompareSelect::make(elem, min, min, elem, kLT);
+      return CompareSelect::make(elem, max, max, elem, kGT);
     }();
     return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem));
   });
-  return wrapTECompute(wrap, A, B, N);
+  return wrapTECompute(wrap, B, {A, N, C});
 }
 
 std::shared_ptr<TEWrapper> createRelu() {
@@ -140,7 +133,7 @@ std::shared_ptr<TEWrapper> createRelu() {
     auto a = A.load(i);
     return ifThenElse(a < zero, zero, a);
   });
-  wrap = wrapTECompute(wrap, A, B, N);
+  wrap = wrapTECompute(wrap, B, {A, N});
   updateNNCCache(aten::relu, wrap);
   return wrap;
 }
@@ -157,7 +150,7 @@ std::shared_ptr<TEWrapper> createTanh() {
     auto a = A.load(i);
     return fast_tanh(a);
   });
-  wrap = wrapTECompute(wrap, A, B, N);
+  wrap = wrapTECompute(wrap, B, {A, N});
   updateNNCCache(aten::tanh, wrap);
   return wrap;
 }
@@ -175,7 +168,7 @@ std::shared_ptr<TEWrapper> createSigmoid() {
   // NNC uses sleef for vectorizing sigmoid, which comes in an 8-wide flavor
   // (Sleef_expf8).
   constexpr int kSleefWidth = 8;
-  wrap = wrapTECompute(wrap, A, B, N, kSleefWidth);
+  wrap = wrapTECompute(wrap, B, {A, N}, kSleefWidth);
   updateNNCCache(aten::sigmoid, wrap);
   return wrap;
 }
diff --git a/torch/csrc/jit/runtime/static/te_wrapper.h b/torch/csrc/jit/runtime/static/te_wrapper.h
index 776602dc3edec..0a5f3d8532990 100644
--- a/torch/csrc/jit/runtime/static/te_wrapper.h
+++ b/torch/csrc/jit/runtime/static/te_wrapper.h
@@ -24,7 +24,7 @@ class TEWrapper {
 #endif
 };
 
-std::shared_ptr<TEWrapper> createLogit(c10::optional<float> clamp);
+std::shared_ptr<TEWrapper> createLogit();
 std::shared_ptr<TEWrapper> createRelu();
 std::shared_ptr<TEWrapper> createTanh();
 std::shared_ptr<TEWrapper> createSigmoid();

From 64d605bab82792bd1f89cb896302e59313466884 Mon Sep 17 00:00:00 2001
From: Raghavan Raman <raghavanr@fb.com>
Date: Wed, 25 Aug 2021 11:12:57 -0700
Subject: [PATCH 219/530] [Static Runtime] Added caching for the NNC code
 generated for Logit. (#63840)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63840

Added NNC generated code for Logit to the cache.

```
Logit NNC Benchmark	Time (ns)
	            w/o cache	w/ cache
logit_nnc_sleef/64	543	536
logit_nnc_sleef/512	3517	3465
logit_nnc_sleef/8192	88483	85881
logit_nnc_sleef/32768	337016	323090
logit_nnc_fast/64	167	163
logit_nnc_fast/512	866	817
logit_nnc_fast/8192	13069	12801
logit_nnc_fast/32768	53429	52530
logit_nnc_vml/64	164	151
logit_nnc_vml/512	783	769
logit_nnc_vml/8192	11563	11674
logit_nnc_vml/32768	46720	46452
```

Test Plan: Unit tests and inline_cvr model.

Reviewed By: hlu1

Differential Revision: D30405424

fbshipit-source-id: 938b1b74758e2612ae151bac890c5f8ebbc42d50
---
 torch/csrc/jit/runtime/static/te_wrapper.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/jit/runtime/static/te_wrapper.cpp b/torch/csrc/jit/runtime/static/te_wrapper.cpp
index 9c3cbe9ac5941..d8b494c9d4a23 100644
--- a/torch/csrc/jit/runtime/static/te_wrapper.cpp
+++ b/torch/csrc/jit/runtime/static/te_wrapper.cpp
@@ -101,8 +101,11 @@ void updateNNCCache(NodeKind kind, std::shared_ptr<TEWrapper> code) {
 } // namespace
 
 std::shared_ptr<TEWrapper> createLogit() {
-  // TODO: Use NNC cache for this op.
-  auto wrap = std::make_shared<TEWrapper>();
+  auto wrap = lookupNNCCache(aten::logit);
+  if (wrap) {
+    return wrap;
+  }
+  wrap = std::make_shared<TEWrapper>();
   auto N = VarHandle("N", kInt);
   auto C = VarHandle("C", kFloat);
   Placeholder A("A", kFloat, {N});
@@ -117,7 +120,9 @@ std::shared_ptr<TEWrapper> createLogit() {
     }();
     return log_vml(A_elem / (FloatImm::make(1.0f) - A_elem));
   });
-  return wrapTECompute(wrap, B, {A, N, C});
+  wrap = wrapTECompute(wrap, B, {A, N, C});
+  updateNNCCache(aten::logit, wrap);
+  return wrap;
 }
 
 std::shared_ptr<TEWrapper> createRelu() {

From 67d8e7b659b19e1ee68208b28bfa7dba73375dbc Mon Sep 17 00:00:00 2001
From: driazati <driazati@users.noreply.github.com>
Date: Wed, 25 Aug 2021 11:19:49 -0700
Subject: [PATCH 220/530] Reformat run_test.py (#63808)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63808

`black run_test.py`

Test Plan: Imported from OSS

Reviewed By: seemethere

Differential Revision: D30497437

Pulled By: driazati

fbshipit-source-id: 41b29b73f41fa4bb15fce5eaa69f8efe614e02f7
---
 test/run_test.py | 1209 +++++++++++++++++++++++++---------------------
 1 file changed, 650 insertions(+), 559 deletions(-)

diff --git a/test/run_test.py b/test/run_test.py
index f3b7bf72bbbba..ecc93fe03aa30 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -14,7 +14,13 @@
 
 import torch
 from torch.utils import cpp_extension
-from torch.testing._internal.common_utils import FILE_SCHEMA, IS_IN_CI, TEST_WITH_ROCM, shell, set_cwd
+from torch.testing._internal.common_utils import (
+    FILE_SCHEMA,
+    IS_IN_CI,
+    TEST_WITH_ROCM,
+    shell,
+    set_cwd,
+)
 import torch.distributed as dist
 from typing import Dict, Optional, List
 
@@ -29,243 +35,245 @@
         get_reordered_tests,
         get_test_case_configs,
     )
+
     HAVE_TEST_SELECTION_TOOLS = True
 except ImportError:
     HAVE_TEST_SELECTION_TOOLS = False
-    print("Unable to import test_selections from tools/testing. Running without test selection stats...")
+    print(
+        "Unable to import test_selections from tools/testing. Running without test selection stats..."
+    )
 
 
 TESTS = [
-    'test_import_time',
-    'test_public_bindings',
-    'test_type_hints',
-    'test_ao_sparsity',
-    'test_autograd',
-    'benchmark_utils/test_benchmark_utils',
-    'test_binary_ufuncs',
-    'test_buffer_protocol',
-    'test_bundled_inputs',
-    'test_complex',
-    'test_cpp_api_parity',
-    'test_cpp_extensions_aot_no_ninja',
-    'test_cpp_extensions_aot_ninja',
-    'test_cpp_extensions_jit',
-    'distributed/test_c10d_common',
-    'distributed/test_c10d_gloo',
-    'distributed/test_c10d_nccl',
-    'distributed/test_jit_c10d',
-    'distributed/test_c10d_spawn_gloo',
-    'distributed/test_c10d_spawn_nccl',
-    'distributed/test_store',
-    'distributed/test_pg_wrapper',
-    'distributed/algorithms/test_join',
-    'test_cuda',
-    'test_jit_cuda_fuser',
-    'test_cuda_primary_ctx',
-    'test_dataloader',
-    'test_datapipe',
-    'distributed/test_data_parallel',
-    'distributed/test_distributed_spawn',
-    'distributions/test_constraints',
-    'distributions/test_distributions',
-    'test_dispatch',
-    'test_foreach',
-    'test_indexing',
-    'test_jit',
-    'test_linalg',
-    'test_logging',
-    'test_mkldnn',
-    'test_model_dump',
-    'test_module_init',
-    'test_modules',
-    'test_multiprocessing',
-    'test_multiprocessing_spawn',
-    'distributed/test_nccl',
-    'test_native_functions',
-    'test_numba_integration',
-    'test_nn',
-    'test_ops',
-    'test_optim',
-    'test_functional_optim',
-    'test_pytree',
-    'test_mobile_optimizer',
-    'test_set_default_mobile_cpu_allocator',
-    'test_xnnpack_integration',
-    'test_vulkan',
-    'test_sparse',
-    'test_sparse_csr',
-    'test_quantization',
-    'test_pruning_op',
-    'test_spectral_ops',
-    'test_serialization',
-    'test_shape_ops',
-    'test_show_pickle',
-    'test_sort_and_select',
-    'test_tensor_creation_ops',
-    'test_testing',
-    'test_torch',
-    'test_type_info',
-    'test_unary_ufuncs',
-    'test_utils',
-    'test_view_ops',
-    'test_vmap',
-    'test_namedtuple_return_api',
-    'test_numpy_interop',
-    'test_jit_profiling',
-    'test_jit_legacy',
-    'test_jit_fuser_legacy',
-    'test_tensorboard',
-    'test_namedtensor',
-    'test_reductions',
-    'test_type_promotion',
-    'test_jit_disabled',
-    'test_function_schema',
-    'test_overrides',
-    'test_jit_fuser_te',
-    'test_tensorexpr',
-    'test_tensorexpr_pybind',
-    'test_openmp',
-    'test_profiler',
+    "test_import_time",
+    "test_public_bindings",
+    "test_type_hints",
+    "test_ao_sparsity",
+    "test_autograd",
+    "benchmark_utils/test_benchmark_utils",
+    "test_binary_ufuncs",
+    "test_buffer_protocol",
+    "test_bundled_inputs",
+    "test_complex",
+    "test_cpp_api_parity",
+    "test_cpp_extensions_aot_no_ninja",
+    "test_cpp_extensions_aot_ninja",
+    "test_cpp_extensions_jit",
+    "distributed/test_c10d_common",
+    "distributed/test_c10d_gloo",
+    "distributed/test_c10d_nccl",
+    "distributed/test_jit_c10d",
+    "distributed/test_c10d_spawn_gloo",
+    "distributed/test_c10d_spawn_nccl",
+    "distributed/test_store",
+    "distributed/test_pg_wrapper",
+    "distributed/algorithms/test_join",
+    "test_cuda",
+    "test_jit_cuda_fuser",
+    "test_cuda_primary_ctx",
+    "test_dataloader",
+    "test_datapipe",
+    "distributed/test_data_parallel",
+    "distributed/test_distributed_spawn",
+    "distributions/test_constraints",
+    "distributions/test_distributions",
+    "test_dispatch",
+    "test_foreach",
+    "test_indexing",
+    "test_jit",
+    "test_linalg",
+    "test_logging",
+    "test_mkldnn",
+    "test_model_dump",
+    "test_module_init",
+    "test_modules",
+    "test_multiprocessing",
+    "test_multiprocessing_spawn",
+    "distributed/test_nccl",
+    "test_native_functions",
+    "test_numba_integration",
+    "test_nn",
+    "test_ops",
+    "test_optim",
+    "test_functional_optim",
+    "test_pytree",
+    "test_mobile_optimizer",
+    "test_set_default_mobile_cpu_allocator",
+    "test_xnnpack_integration",
+    "test_vulkan",
+    "test_sparse",
+    "test_sparse_csr",
+    "test_quantization",
+    "test_pruning_op",
+    "test_spectral_ops",
+    "test_serialization",
+    "test_shape_ops",
+    "test_show_pickle",
+    "test_sort_and_select",
+    "test_tensor_creation_ops",
+    "test_testing",
+    "test_torch",
+    "test_type_info",
+    "test_unary_ufuncs",
+    "test_utils",
+    "test_view_ops",
+    "test_vmap",
+    "test_namedtuple_return_api",
+    "test_numpy_interop",
+    "test_jit_profiling",
+    "test_jit_legacy",
+    "test_jit_fuser_legacy",
+    "test_tensorboard",
+    "test_namedtensor",
+    "test_reductions",
+    "test_type_promotion",
+    "test_jit_disabled",
+    "test_function_schema",
+    "test_overrides",
+    "test_jit_fuser_te",
+    "test_tensorexpr",
+    "test_tensorexpr_pybind",
+    "test_openmp",
+    "test_profiler",
     "distributed/test_launcher",
-    'distributed/nn/jit/test_instantiator',
-    'distributed/rpc/test_faulty_agent',
-    'distributed/rpc/test_tensorpipe_agent',
-    'distributed/rpc/cuda/test_tensorpipe_agent',
-    'test_determination',
-    'test_futures',
-    'test_fx',
-    'test_fx_experimental',
-    'test_functional_autograd_benchmark',
-    'test_package',
-    'test_license',
-    'distributed/pipeline/sync/skip/test_api',
-    'distributed/pipeline/sync/skip/test_gpipe',
-    'distributed/pipeline/sync/skip/test_inspect_skip_layout',
-    'distributed/pipeline/sync/skip/test_leak',
-    'distributed/pipeline/sync/skip/test_portal',
-    'distributed/pipeline/sync/skip/test_stash_pop',
-    'distributed/pipeline/sync/skip/test_tracker',
-    'distributed/pipeline/sync/skip/test_verify_skippables',
-    'distributed/pipeline/sync/test_balance',
-    'distributed/pipeline/sync/test_bugs',
-    'distributed/pipeline/sync/test_checkpoint',
-    'distributed/pipeline/sync/test_copy',
-    'distributed/pipeline/sync/test_deferred_batch_norm',
-    'distributed/pipeline/sync/test_dependency',
-    'distributed/pipeline/sync/test_inplace',
-    'distributed/pipeline/sync/test_microbatch',
-    'distributed/pipeline/sync/test_phony',
-    'distributed/pipeline/sync/test_pipe',
-    'distributed/pipeline/sync/test_pipeline',
-    'distributed/pipeline/sync/test_stream',
-    'distributed/pipeline/sync/test_transparency',
-    'distributed/pipeline/sync/test_worker',
-    'distributed/optim/test_zero_redundancy_optimizer',
-    'distributed/elastic/timer/api_test',
-    'distributed/elastic/timer/local_timer_example',
-    'distributed/elastic/timer/local_timer_test',
-    'distributed/elastic/events/lib_test',
-    'distributed/elastic/metrics/api_test',
-    'distributed/elastic/utils/logging_test',
-    'distributed/elastic/utils/util_test',
-    'distributed/elastic/utils/distributed_test',
-    'distributed/elastic/multiprocessing/api_test',
-    'distributed/_sharding_spec/test_sharding_spec',
-    'distributed/_sharded_tensor/test_sharded_tensor',
+    "distributed/nn/jit/test_instantiator",
+    "distributed/rpc/test_faulty_agent",
+    "distributed/rpc/test_tensorpipe_agent",
+    "distributed/rpc/cuda/test_tensorpipe_agent",
+    "test_determination",
+    "test_futures",
+    "test_fx",
+    "test_fx_experimental",
+    "test_functional_autograd_benchmark",
+    "test_package",
+    "test_license",
+    "distributed/pipeline/sync/skip/test_api",
+    "distributed/pipeline/sync/skip/test_gpipe",
+    "distributed/pipeline/sync/skip/test_inspect_skip_layout",
+    "distributed/pipeline/sync/skip/test_leak",
+    "distributed/pipeline/sync/skip/test_portal",
+    "distributed/pipeline/sync/skip/test_stash_pop",
+    "distributed/pipeline/sync/skip/test_tracker",
+    "distributed/pipeline/sync/skip/test_verify_skippables",
+    "distributed/pipeline/sync/test_balance",
+    "distributed/pipeline/sync/test_bugs",
+    "distributed/pipeline/sync/test_checkpoint",
+    "distributed/pipeline/sync/test_copy",
+    "distributed/pipeline/sync/test_deferred_batch_norm",
+    "distributed/pipeline/sync/test_dependency",
+    "distributed/pipeline/sync/test_inplace",
+    "distributed/pipeline/sync/test_microbatch",
+    "distributed/pipeline/sync/test_phony",
+    "distributed/pipeline/sync/test_pipe",
+    "distributed/pipeline/sync/test_pipeline",
+    "distributed/pipeline/sync/test_stream",
+    "distributed/pipeline/sync/test_transparency",
+    "distributed/pipeline/sync/test_worker",
+    "distributed/optim/test_zero_redundancy_optimizer",
+    "distributed/elastic/timer/api_test",
+    "distributed/elastic/timer/local_timer_example",
+    "distributed/elastic/timer/local_timer_test",
+    "distributed/elastic/events/lib_test",
+    "distributed/elastic/metrics/api_test",
+    "distributed/elastic/utils/logging_test",
+    "distributed/elastic/utils/util_test",
+    "distributed/elastic/utils/distributed_test",
+    "distributed/elastic/multiprocessing/api_test",
+    "distributed/_sharding_spec/test_sharding_spec",
+    "distributed/_sharded_tensor/test_sharded_tensor",
 ]
 
 # Tests need to be run with pytest.
 USE_PYTEST_LIST = [
-    'distributed/pipeline/sync/skip/test_api',
-    'distributed/pipeline/sync/skip/test_gpipe',
-    'distributed/pipeline/sync/skip/test_inspect_skip_layout',
-    'distributed/pipeline/sync/skip/test_leak',
-    'distributed/pipeline/sync/skip/test_portal',
-    'distributed/pipeline/sync/skip/test_stash_pop',
-    'distributed/pipeline/sync/skip/test_tracker',
-    'distributed/pipeline/sync/skip/test_verify_skippables',
-    'distributed/pipeline/sync/test_balance',
-    'distributed/pipeline/sync/test_bugs',
-    'distributed/pipeline/sync/test_checkpoint',
-    'distributed/pipeline/sync/test_copy',
-    'distributed/pipeline/sync/test_deferred_batch_norm',
-    'distributed/pipeline/sync/test_dependency',
-    'distributed/pipeline/sync/test_inplace',
-    'distributed/pipeline/sync/test_microbatch',
-    'distributed/pipeline/sync/test_phony',
-    'distributed/pipeline/sync/test_pipe',
-    'distributed/pipeline/sync/test_pipeline',
-    'distributed/pipeline/sync/test_stream',
-    'distributed/pipeline/sync/test_transparency',
-    'distributed/pipeline/sync/test_worker',
-    'distributions/test_constraints',
-    'distributions/test_transforms',
-    'distributions/test_utils',
-    'test_typing',
+    "distributed/pipeline/sync/skip/test_api",
+    "distributed/pipeline/sync/skip/test_gpipe",
+    "distributed/pipeline/sync/skip/test_inspect_skip_layout",
+    "distributed/pipeline/sync/skip/test_leak",
+    "distributed/pipeline/sync/skip/test_portal",
+    "distributed/pipeline/sync/skip/test_stash_pop",
+    "distributed/pipeline/sync/skip/test_tracker",
+    "distributed/pipeline/sync/skip/test_verify_skippables",
+    "distributed/pipeline/sync/test_balance",
+    "distributed/pipeline/sync/test_bugs",
+    "distributed/pipeline/sync/test_checkpoint",
+    "distributed/pipeline/sync/test_copy",
+    "distributed/pipeline/sync/test_deferred_batch_norm",
+    "distributed/pipeline/sync/test_dependency",
+    "distributed/pipeline/sync/test_inplace",
+    "distributed/pipeline/sync/test_microbatch",
+    "distributed/pipeline/sync/test_phony",
+    "distributed/pipeline/sync/test_pipe",
+    "distributed/pipeline/sync/test_pipeline",
+    "distributed/pipeline/sync/test_stream",
+    "distributed/pipeline/sync/test_transparency",
+    "distributed/pipeline/sync/test_worker",
+    "distributions/test_constraints",
+    "distributions/test_transforms",
+    "distributions/test_utils",
+    "test_typing",
     "distributed/elastic/events/lib_test",
     "distributed/elastic/agent/server/test/api_test",
 ]
 
 WINDOWS_BLOCKLIST = [
-    'distributed/nn/jit/test_instantiator',
-    'distributed/rpc/test_faulty_agent',
-    'distributed/rpc/test_tensorpipe_agent',
-    'distributed/rpc/cuda/test_tensorpipe_agent',
-    'distributed/pipeline/sync/skip/test_api',
-    'distributed/pipeline/sync/skip/test_gpipe',
-    'distributed/pipeline/sync/skip/test_inspect_skip_layout',
-    'distributed/pipeline/sync/skip/test_leak',
-    'distributed/pipeline/sync/skip/test_portal',
-    'distributed/pipeline/sync/skip/test_stash_pop',
-    'distributed/pipeline/sync/skip/test_tracker',
-    'distributed/pipeline/sync/skip/test_verify_skippables',
-    'distributed/pipeline/sync/test_balance',
-    'distributed/pipeline/sync/test_bugs',
-    'distributed/pipeline/sync/test_checkpoint',
-    'distributed/pipeline/sync/test_copy',
-    'distributed/pipeline/sync/test_deferred_batch_norm',
-    'distributed/pipeline/sync/test_dependency',
-    'distributed/pipeline/sync/test_inplace',
-    'distributed/pipeline/sync/test_microbatch',
-    'distributed/pipeline/sync/test_phony',
-    'distributed/pipeline/sync/test_pipe',
-    'distributed/pipeline/sync/test_pipeline',
-    'distributed/pipeline/sync/test_stream',
-    'distributed/pipeline/sync/test_transparency',
-    'distributed/pipeline/sync/test_worker',
+    "distributed/nn/jit/test_instantiator",
+    "distributed/rpc/test_faulty_agent",
+    "distributed/rpc/test_tensorpipe_agent",
+    "distributed/rpc/cuda/test_tensorpipe_agent",
+    "distributed/pipeline/sync/skip/test_api",
+    "distributed/pipeline/sync/skip/test_gpipe",
+    "distributed/pipeline/sync/skip/test_inspect_skip_layout",
+    "distributed/pipeline/sync/skip/test_leak",
+    "distributed/pipeline/sync/skip/test_portal",
+    "distributed/pipeline/sync/skip/test_stash_pop",
+    "distributed/pipeline/sync/skip/test_tracker",
+    "distributed/pipeline/sync/skip/test_verify_skippables",
+    "distributed/pipeline/sync/test_balance",
+    "distributed/pipeline/sync/test_bugs",
+    "distributed/pipeline/sync/test_checkpoint",
+    "distributed/pipeline/sync/test_copy",
+    "distributed/pipeline/sync/test_deferred_batch_norm",
+    "distributed/pipeline/sync/test_dependency",
+    "distributed/pipeline/sync/test_inplace",
+    "distributed/pipeline/sync/test_microbatch",
+    "distributed/pipeline/sync/test_phony",
+    "distributed/pipeline/sync/test_pipe",
+    "distributed/pipeline/sync/test_pipeline",
+    "distributed/pipeline/sync/test_stream",
+    "distributed/pipeline/sync/test_transparency",
+    "distributed/pipeline/sync/test_worker",
     "distributed/elastic/agent/server/test/api_test",
-    'distributed/elastic/multiprocessing/api_test',
-    'distributed/_sharded_tensor/test_sharded_tensor',
+    "distributed/elastic/multiprocessing/api_test",
+    "distributed/_sharded_tensor/test_sharded_tensor",
 ]
 
 ROCM_BLOCKLIST = [
-    'distributed/nn/jit/test_instantiator',
-    'distributed/rpc/test_faulty_agent',
-    'distributed/rpc/test_tensorpipe_agent',
-    'distributed/rpc/cuda/test_tensorpipe_agent',
-    'distributed/_sharded_tensor/test_sharded_tensor',
-    'test_determination',
-    'test_multiprocessing',
-    'test_jit_legacy',
-    'test_type_hints',
-    'test_openmp',
+    "distributed/nn/jit/test_instantiator",
+    "distributed/rpc/test_faulty_agent",
+    "distributed/rpc/test_tensorpipe_agent",
+    "distributed/rpc/cuda/test_tensorpipe_agent",
+    "distributed/_sharded_tensor/test_sharded_tensor",
+    "test_determination",
+    "test_multiprocessing",
+    "test_jit_legacy",
+    "test_type_hints",
+    "test_openmp",
 ]
 
 RUN_PARALLEL_BLOCKLIST = [
-    'test_cpp_extensions_jit',
-    'test_jit_disabled',
-    'test_mobile_optimizer',
-    'test_multiprocessing',
-    'test_multiprocessing_spawn',
-    'test_namedtuple_return_api',
-    'test_overrides',
-    'test_show_pickle',
-    'test_tensorexpr',
-    'test_cuda_primary_ctx',
-] + [test for test in TESTS if test.startswith('distributed/')]
-
-WINDOWS_COVERAGE_BLOCKLIST = [
-]
+    "test_cpp_extensions_jit",
+    "test_jit_disabled",
+    "test_mobile_optimizer",
+    "test_multiprocessing",
+    "test_multiprocessing_spawn",
+    "test_namedtuple_return_api",
+    "test_overrides",
+    "test_show_pickle",
+    "test_tensorexpr",
+    "test_cuda_primary_ctx",
+] + [test for test in TESTS if test.startswith("distributed/")]
+
+WINDOWS_COVERAGE_BLOCKLIST = []
 
 
 # These tests are slow enough that it's worth calculating whether the patch
@@ -273,76 +281,76 @@
 # run with --determine-from, we use another generated list based on this one and the
 # previous test stats.
 TARGET_DET_LIST = [
-    'distributions/test_distributions',
-    'test_nn',
-    'test_autograd',
-    'test_cpp_extensions_jit',
-    'test_jit_legacy',
-    'test_dataloader',
-    'test_overrides',
-    'test_linalg',
-    'test_jit',
-    'test_jit_profiling',
-    'test_torch',
-    'test_binary_ufuncs',
-    'test_numpy_interop',
-    'test_reductions',
-    'test_shape_ops',
-    'test_sort_and_select',
-    'test_testing',
-    'test_view_ops',
-    'distributed/nn/jit/test_instantiator',
-    'distributed/rpc/test_tensorpipe_agent',
-    'distributed/rpc/cuda/test_tensorpipe_agent',
-    'distributed/algorithms/ddp_comm_hooks/test_ddp_hooks',
-    'distributed/test_distributed_spawn',
-    'test_cuda',
-    'test_cuda_primary_ctx',
-    'test_cpp_extensions_aot_ninja',
-    'test_cpp_extensions_aot_no_ninja',
-    'test_serialization',
-    'test_optim',
-    'test_utils',
-    'test_multiprocessing',
-    'test_tensorboard',
-    'distributed/test_c10d_common',
-    'distributed/test_c10d_gloo',
-    'distributed/test_c10d_nccl',
-    'distributed/test_jit_c10d',
-    'distributed/test_c10d_spawn_gloo',
-    'distributed/test_c10d_spawn_nccl',
-    'distributed/test_store',
-    'distributed/test_pg_wrapper',
-    'test_quantization',
-    'test_pruning_op',
-    'test_determination',
-    'test_futures',
-    'distributed/pipeline/sync/skip/test_api',
-    'distributed/pipeline/sync/skip/test_gpipe',
-    'distributed/pipeline/sync/skip/test_inspect_skip_layout',
-    'distributed/pipeline/sync/skip/test_leak',
-    'distributed/pipeline/sync/skip/test_portal',
-    'distributed/pipeline/sync/skip/test_stash_pop',
-    'distributed/pipeline/sync/skip/test_tracker',
-    'distributed/pipeline/sync/skip/test_verify_skippables',
-    'distributed/pipeline/sync/test_balance',
-    'distributed/pipeline/sync/test_bugs',
-    'distributed/pipeline/sync/test_checkpoint',
-    'distributed/pipeline/sync/test_copy',
-    'distributed/pipeline/sync/test_deferred_batch_norm',
-    'distributed/pipeline/sync/test_dependency',
-    'distributed/pipeline/sync/test_inplace',
-    'distributed/pipeline/sync/test_microbatch',
-    'distributed/pipeline/sync/test_phony',
-    'distributed/pipeline/sync/test_pipe',
-    'distributed/pipeline/sync/test_pipeline',
-    'distributed/pipeline/sync/test_stream',
-    'distributed/pipeline/sync/test_transparency',
-    'distributed/pipeline/sync/test_worker',
+    "distributions/test_distributions",
+    "test_nn",
+    "test_autograd",
+    "test_cpp_extensions_jit",
+    "test_jit_legacy",
+    "test_dataloader",
+    "test_overrides",
+    "test_linalg",
+    "test_jit",
+    "test_jit_profiling",
+    "test_torch",
+    "test_binary_ufuncs",
+    "test_numpy_interop",
+    "test_reductions",
+    "test_shape_ops",
+    "test_sort_and_select",
+    "test_testing",
+    "test_view_ops",
+    "distributed/nn/jit/test_instantiator",
+    "distributed/rpc/test_tensorpipe_agent",
+    "distributed/rpc/cuda/test_tensorpipe_agent",
+    "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks",
+    "distributed/test_distributed_spawn",
+    "test_cuda",
+    "test_cuda_primary_ctx",
+    "test_cpp_extensions_aot_ninja",
+    "test_cpp_extensions_aot_no_ninja",
+    "test_serialization",
+    "test_optim",
+    "test_utils",
+    "test_multiprocessing",
+    "test_tensorboard",
+    "distributed/test_c10d_common",
+    "distributed/test_c10d_gloo",
+    "distributed/test_c10d_nccl",
+    "distributed/test_jit_c10d",
+    "distributed/test_c10d_spawn_gloo",
+    "distributed/test_c10d_spawn_nccl",
+    "distributed/test_store",
+    "distributed/test_pg_wrapper",
+    "test_quantization",
+    "test_pruning_op",
+    "test_determination",
+    "test_futures",
+    "distributed/pipeline/sync/skip/test_api",
+    "distributed/pipeline/sync/skip/test_gpipe",
+    "distributed/pipeline/sync/skip/test_inspect_skip_layout",
+    "distributed/pipeline/sync/skip/test_leak",
+    "distributed/pipeline/sync/skip/test_portal",
+    "distributed/pipeline/sync/skip/test_stash_pop",
+    "distributed/pipeline/sync/skip/test_tracker",
+    "distributed/pipeline/sync/skip/test_verify_skippables",
+    "distributed/pipeline/sync/test_balance",
+    "distributed/pipeline/sync/test_bugs",
+    "distributed/pipeline/sync/test_checkpoint",
+    "distributed/pipeline/sync/test_copy",
+    "distributed/pipeline/sync/test_deferred_batch_norm",
+    "distributed/pipeline/sync/test_dependency",
+    "distributed/pipeline/sync/test_inplace",
+    "distributed/pipeline/sync/test_microbatch",
+    "distributed/pipeline/sync/test_phony",
+    "distributed/pipeline/sync/test_pipe",
+    "distributed/pipeline/sync/test_pipeline",
+    "distributed/pipeline/sync/test_stream",
+    "distributed/pipeline/sync/test_transparency",
+    "distributed/pipeline/sync/test_worker",
 ]
 
 # the JSON file to store the S3 test stats
-TEST_TIMES_FILE = '.pytorch-test-times.json'
+TEST_TIMES_FILE = ".pytorch-test-times.json"
 
 # if a test file takes longer than 5 min, we add it to TARGET_DET_LIST
 SLOW_TEST_THRESHOLD = 300
@@ -353,28 +361,27 @@
 
 
 if dist.is_available():
-    DISTRIBUTED_TESTS_CONFIG['test'] = {
-        'WORLD_SIZE': '1'
-    }
+    DISTRIBUTED_TESTS_CONFIG["test"] = {"WORLD_SIZE": "1"}
     if not TEST_WITH_ROCM and dist.is_mpi_available():
-        DISTRIBUTED_TESTS_CONFIG['mpi'] = {
-            'WORLD_SIZE': '3',
-            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-mpi'
+        DISTRIBUTED_TESTS_CONFIG["mpi"] = {
+            "WORLD_SIZE": "3",
+            "TEST_REPORT_SOURCE_OVERRIDE": "dist-mpi",
         }
     if dist.is_nccl_available():
-        DISTRIBUTED_TESTS_CONFIG['nccl'] = {
-            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
-            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-nccl'
+        DISTRIBUTED_TESTS_CONFIG["nccl"] = {
+            "WORLD_SIZE": "2" if torch.cuda.device_count() == 2 else "3",
+            "TEST_REPORT_SOURCE_OVERRIDE": "dist-nccl",
         }
     if dist.is_gloo_available():
-        DISTRIBUTED_TESTS_CONFIG['gloo'] = {
-            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
-            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-gloo'
+        DISTRIBUTED_TESTS_CONFIG["gloo"] = {
+            "WORLD_SIZE": "2" if torch.cuda.device_count() == 2 else "3",
+            "TEST_REPORT_SOURCE_OVERRIDE": "dist-gloo",
         }
 
 # https://stackoverflow.com/questions/2549939/get-signal-names-from-numbers-in-python
-SIGNALS_TO_NAMES_DICT = {getattr(signal, n): n for n in dir(signal)
-                         if n.startswith('SIG') and '_' not in n}
+SIGNALS_TO_NAMES_DICT = {
+    getattr(signal, n): n for n in dir(signal) if n.startswith("SIG") and "_" not in n
+}
 
 CPP_EXTENSIONS_ERROR = """
 Ninja (https://ninja-build.org) is required for some of the C++ extensions
@@ -385,18 +392,20 @@
 
 PYTORCH_COLLECT_COVERAGE = bool(os.environ.get("PYTORCH_COLLECT_COVERAGE"))
 
-ENABLE_PR_HISTORY_REORDERING = bool(os.environ.get("ENABLE_PR_HISTORY_REORDERING", "0") == "1")
+ENABLE_PR_HISTORY_REORDERING = bool(
+    os.environ.get("ENABLE_PR_HISTORY_REORDERING", "0") == "1"
+)
 
 JIT_EXECUTOR_TESTS = [
-    'test_jit_cuda_fuser',
-    'test_jit_profiling',
-    'test_jit_legacy',
-    'test_jit_fuser_legacy',
+    "test_jit_cuda_fuser",
+    "test_jit_profiling",
+    "test_jit_legacy",
+    "test_jit_fuser_legacy",
 ]
 
 DISTRIBUTED_TESTS = [
-    'distributed/test_distributed_fork',
-    'distributed/test_distributed_spawn',
+    "distributed/test_distributed_fork",
+    "distributed/test_distributed_spawn",
 ]
 
 # Dictionary matching test modules (in TESTS) to lists of test cases (within that test_module) that would be run when
@@ -411,7 +420,7 @@
 
 # The file from which the SPECIFIED_TEST_CASES_DICT will be filled, a CSV of test cases that would be run when
 # options.run_specified_test_cases is enabled.
-SPECIFIED_TEST_CASES_FILE: str = '.pytorch_specified_test_cases.csv'
+SPECIFIED_TEST_CASES_FILE: str = ".pytorch_specified_test_cases.csv"
 
 
 def print_to_stderr(message):
@@ -421,15 +430,18 @@ def print_to_stderr(message):
 def get_test_case_args(test_module, using_pytest) -> List[str]:
     args = []
     # if test_module not specified or specified with '__all__' then run all tests
-    if test_module not in SPECIFIED_TEST_CASES_DICT or '__all__' in SPECIFIED_TEST_CASES_DICT[test_module]:
+    if (
+        test_module not in SPECIFIED_TEST_CASES_DICT
+        or "__all__" in SPECIFIED_TEST_CASES_DICT[test_module]
+    ):
         return args
 
     if using_pytest:
-        args.append('-k')
-        args.append(' or '.join(SPECIFIED_TEST_CASES_DICT[test_module]))
+        args.append("-k")
+        args.append(" or ".join(SPECIFIED_TEST_CASES_DICT[test_module]))
     else:
         for test in SPECIFIED_TEST_CASES_DICT[test_module]:
-            args.append('-k')
+            args.append("-k")
             args.append(test)
 
     return args
@@ -437,59 +449,70 @@ def get_test_case_args(test_module, using_pytest) -> List[str]:
 
 def get_executable_command(options, allow_pytest, disable_coverage=False):
     if options.coverage and not disable_coverage:
-        executable = ['coverage', 'run', '--parallel-mode', '--source=torch']
+        executable = ["coverage", "run", "--parallel-mode", "--source=torch"]
     else:
         executable = [sys.executable]
     if options.pytest:
         if allow_pytest:
-            executable += ['-m', 'pytest']
+            executable += ["-m", "pytest"]
         else:
-            print_to_stderr('Pytest cannot be used for this test. Falling back to unittest.')
+            print_to_stderr(
+                "Pytest cannot be used for this test. Falling back to unittest."
+            )
     return executable
 
 
-def run_test(test_module, test_directory, options, launcher_cmd=None, extra_unittest_args=None):
+def run_test(
+    test_module, test_directory, options, launcher_cmd=None, extra_unittest_args=None
+):
     unittest_args = options.additional_unittest_args.copy()
     if options.verbose:
         unittest_args.append(f'-{"v"*options.verbose}')  # in case of pytest
     if test_module in RUN_PARALLEL_BLOCKLIST:
-        unittest_args = [arg for arg in unittest_args if not arg.startswith('--run-parallel')]
+        unittest_args = [
+            arg for arg in unittest_args if not arg.startswith("--run-parallel")
+        ]
     if extra_unittest_args:
         assert isinstance(extra_unittest_args, list)
         unittest_args.extend(extra_unittest_args)
 
     # If using pytest, replace -f with equivalent -x
     if options.pytest:
-        unittest_args = [arg if arg != '-f' else '-x' for arg in unittest_args]
+        unittest_args = [arg if arg != "-f" else "-x" for arg in unittest_args]
     elif IS_IN_CI:
         # use the downloaded test cases configuration, not supported in pytest
-        unittest_args.extend(['--import-slow-tests', '--import-disabled-tests'])
+        unittest_args.extend(["--import-slow-tests", "--import-disabled-tests"])
 
     # Multiprocessing related tests cannot run with coverage.
     # Tracking issue: https://github.com/pytorch/pytorch/issues/50661
-    disable_coverage = sys.platform == 'win32' and test_module in WINDOWS_COVERAGE_BLOCKLIST
+    disable_coverage = (
+        sys.platform == "win32" and test_module in WINDOWS_COVERAGE_BLOCKLIST
+    )
 
     # Extra arguments are not supported with pytest
-    executable = get_executable_command(options, allow_pytest=not extra_unittest_args,
-                                        disable_coverage=disable_coverage)
+    executable = get_executable_command(
+        options, allow_pytest=not extra_unittest_args, disable_coverage=disable_coverage
+    )
 
     # TODO: move this logic into common_utils.py instead of passing in "-k" individually
     # The following logic for running specified tests will only run for non-distributed tests, as those are dispatched
     # to test_distributed and not run_test (this function)
     if options.run_specified_test_cases:
-        unittest_args.extend(get_test_case_args(test_module, 'pytest' in executable))
+        unittest_args.extend(get_test_case_args(test_module, "pytest" in executable))
 
     # Can't call `python -m unittest test_*` here because it doesn't run code
     # in `if __name__ == '__main__': `. So call `python test_*.py` instead.
-    argv = [test_module + '.py'] + unittest_args
+    argv = [test_module + ".py"] + unittest_args
 
     command = (launcher_cmd or []) + executable + argv
-    print_to_stderr('Executing {} ... [{}]'.format(command, datetime.now()))
+    print_to_stderr("Executing {} ... [{}]".format(command, datetime.now()))
     return shell(command, test_directory)
 
 
 def test_cuda_primary_ctx(test_module, test_directory, options):
-    return run_test(test_module, test_directory, options, extra_unittest_args=['--subprocess'])
+    return run_test(
+        test_module, test_directory, options, extra_unittest_args=["--subprocess"]
+    )
 
 
 def _test_cpp_extensions_aot(test_directory, options, use_ninja):
@@ -501,46 +524,52 @@ def _test_cpp_extensions_aot(test_directory, options, use_ninja):
             return 1
 
     # Wipe the build folder, if it exists already
-    cpp_extensions_test_dir = os.path.join(test_directory, 'cpp_extensions')
-    cpp_extensions_test_build_dir = os.path.join(cpp_extensions_test_dir, 'build')
+    cpp_extensions_test_dir = os.path.join(test_directory, "cpp_extensions")
+    cpp_extensions_test_build_dir = os.path.join(cpp_extensions_test_dir, "build")
     if os.path.exists(cpp_extensions_test_build_dir):
         shutil.rmtree(cpp_extensions_test_build_dir)
 
     # Build the test cpp extensions modules
     shell_env = os.environ.copy()
-    shell_env['USE_NINJA'] = str(1 if use_ninja else 0)
-    cmd = [sys.executable, 'setup.py', 'install', '--root', './install']
+    shell_env["USE_NINJA"] = str(1 if use_ninja else 0)
+    cmd = [sys.executable, "setup.py", "install", "--root", "./install"]
     return_code = shell(cmd, cwd=cpp_extensions_test_dir, env=shell_env)
     if return_code != 0:
         return return_code
-    if sys.platform != 'win32':
-        return_code = shell(cmd,
-                            cwd=os.path.join(cpp_extensions_test_dir, 'no_python_abi_suffix_test'),
-                            env=shell_env)
+    if sys.platform != "win32":
+        return_code = shell(
+            cmd,
+            cwd=os.path.join(cpp_extensions_test_dir, "no_python_abi_suffix_test"),
+            env=shell_env,
+        )
         if return_code != 0:
             return return_code
 
     # "install" the test modules and run tests
-    python_path = os.environ.get('PYTHONPATH', '')
+    python_path = os.environ.get("PYTHONPATH", "")
     from shutil import copyfile
-    test_module = 'test_cpp_extensions_aot' + ('_ninja' if use_ninja else '_no_ninja')
-    copyfile(test_directory + '/test_cpp_extensions_aot.py', test_directory + '/' + test_module + '.py')
+
+    test_module = "test_cpp_extensions_aot" + ("_ninja" if use_ninja else "_no_ninja")
+    copyfile(
+        test_directory + "/test_cpp_extensions_aot.py",
+        test_directory + "/" + test_module + ".py",
+    )
     try:
-        cpp_extensions = os.path.join(test_directory, 'cpp_extensions')
-        install_directory = ''
+        cpp_extensions = os.path.join(test_directory, "cpp_extensions")
+        install_directory = ""
         # install directory is the one that is named site-packages
-        for root, directories, _ in os.walk(os.path.join(cpp_extensions, 'install')):
+        for root, directories, _ in os.walk(os.path.join(cpp_extensions, "install")):
             for directory in directories:
-                if '-packages' in directory:
+                if "-packages" in directory:
                     install_directory = os.path.join(root, directory)
 
-        assert install_directory, 'install_directory must not be empty'
-        os.environ['PYTHONPATH'] = os.pathsep.join([install_directory, python_path])
+        assert install_directory, "install_directory must not be empty"
+        os.environ["PYTHONPATH"] = os.pathsep.join([install_directory, python_path])
         return run_test(test_module, test_directory, options)
     finally:
-        os.environ['PYTHONPATH'] = python_path
-        if os.path.exists(test_directory + '/' + test_module + '.py'):
-            os.remove(test_directory + '/' + test_module + '.py')
+        os.environ["PYTHONPATH"] = python_path
+        if os.path.exists(test_directory + "/" + test_module + ".py"):
+            os.remove(test_directory + "/" + test_module + ".py")
 
 
 def test_cpp_extensions_aot_ninja(test_module, test_directory, options):
@@ -553,53 +582,73 @@ def test_cpp_extensions_aot_no_ninja(test_module, test_directory, options):
 
 def test_distributed(test_module, test_directory, options):
     # MPI tests are broken with Python-3.9
-    mpi_available = subprocess.call('command -v mpiexec', shell=True) == 0 and sys.version_info < (3, 9)
+    mpi_available = subprocess.call(
+        "command -v mpiexec", shell=True
+    ) == 0 and sys.version_info < (3, 9)
     if options.verbose and not mpi_available:
-        print_to_stderr(
-            'MPI not available -- MPI backend tests will be skipped')
+        print_to_stderr("MPI not available -- MPI backend tests will be skipped")
     config = DISTRIBUTED_TESTS_CONFIG
     for backend, env_vars in config.items():
-        if sys.platform == 'win32' and backend != 'gloo':
+        if sys.platform == "win32" and backend != "gloo":
             continue
-        if backend == 'mpi' and not mpi_available:
+        if backend == "mpi" and not mpi_available:
             continue
         for with_init_file in {True, False}:
-            if sys.platform == 'win32' and not with_init_file:
+            if sys.platform == "win32" and not with_init_file:
                 continue
             tmp_dir = tempfile.mkdtemp()
             if options.verbose:
                 init_str = "with {} init_method"
                 with_init = init_str.format("file" if with_init_file else "env")
                 print_to_stderr(
-                    'Running distributed tests for the {} backend {}'.format(
-                        backend, with_init))
-            os.environ['TEMP_DIR'] = tmp_dir
-            os.environ['BACKEND'] = backend
-            os.environ['INIT_METHOD'] = 'env://'
+                    "Running distributed tests for the {} backend {}".format(
+                        backend, with_init
+                    )
+                )
+            os.environ["TEMP_DIR"] = tmp_dir
+            os.environ["BACKEND"] = backend
+            os.environ["INIT_METHOD"] = "env://"
             os.environ.update(env_vars)
             if with_init_file:
                 if test_module == "test_distributed_spawn":
-                    init_method = f'{FILE_SCHEMA}{tmp_dir}/'
+                    init_method = f"{FILE_SCHEMA}{tmp_dir}/"
                 else:
-                    init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file'
-                os.environ['INIT_METHOD'] = init_method
+                    init_method = f"{FILE_SCHEMA}{tmp_dir}/shared_init_file"
+                os.environ["INIT_METHOD"] = init_method
             try:
-                os.mkdir(os.path.join(tmp_dir, 'barrier'))
-                os.mkdir(os.path.join(tmp_dir, 'test_dir'))
-                if backend == 'mpi':
+                os.mkdir(os.path.join(tmp_dir, "barrier"))
+                os.mkdir(os.path.join(tmp_dir, "test_dir"))
+                if backend == "mpi":
                     # test mpiexec for --noprefix option
-                    with open(os.devnull, 'w') as devnull:
-                        allowrunasroot_opt = '--allow-run-as-root' if subprocess.call(
-                            'mpiexec --allow-run-as-root -n 1 bash -c ""', shell=True,
-                            stdout=devnull, stderr=subprocess.STDOUT) == 0 else ''
-                        noprefix_opt = '--noprefix' if subprocess.call(
-                            f'mpiexec {allowrunasroot_opt} -n 1 --noprefix bash -c ""', shell=True,
-                            stdout=devnull, stderr=subprocess.STDOUT) == 0 else ''
-
-                    mpiexec = ['mpiexec', '-n', '3', noprefix_opt, allowrunasroot_opt]
-
-                    return_code = run_test(test_module, test_directory, options,
-                                           launcher_cmd=mpiexec)
+                    with open(os.devnull, "w") as devnull:
+                        allowrunasroot_opt = (
+                            "--allow-run-as-root"
+                            if subprocess.call(
+                                'mpiexec --allow-run-as-root -n 1 bash -c ""',
+                                shell=True,
+                                stdout=devnull,
+                                stderr=subprocess.STDOUT,
+                            )
+                            == 0
+                            else ""
+                        )
+                        noprefix_opt = (
+                            "--noprefix"
+                            if subprocess.call(
+                                f'mpiexec {allowrunasroot_opt} -n 1 --noprefix bash -c ""',
+                                shell=True,
+                                stdout=devnull,
+                                stderr=subprocess.STDOUT,
+                            )
+                            == 0
+                            else ""
+                        )
+
+                    mpiexec = ["mpiexec", "-n", "3", noprefix_opt, allowrunasroot_opt]
+
+                    return_code = run_test(
+                        test_module, test_directory, options, launcher_cmd=mpiexec
+                    )
                 else:
                     return_code = run_test(test_module, test_directory, options)
                 if return_code != 0:
@@ -610,15 +659,15 @@ def test_distributed(test_module, test_directory, options):
 
 
 CUSTOM_HANDLERS = {
-    'test_cuda_primary_ctx': test_cuda_primary_ctx,
-    'test_cpp_extensions_aot_no_ninja': test_cpp_extensions_aot_no_ninja,
-    'test_cpp_extensions_aot_ninja': test_cpp_extensions_aot_ninja,
-    'distributed/test_distributed_spawn': test_distributed,
+    "test_cuda_primary_ctx": test_cuda_primary_ctx,
+    "test_cpp_extensions_aot_no_ninja": test_cpp_extensions_aot_no_ninja,
+    "test_cpp_extensions_aot_ninja": test_cpp_extensions_aot_ninja,
+    "distributed/test_distributed_spawn": test_distributed,
 }
 
 
 def parse_test_module(test):
-    return test.split('.')[0]
+    return test.split(".")[0]
 
 
 class TestChoices(list):
@@ -631,137 +680,152 @@ def __contains__(self, item):
 
 def parse_args():
     parser = argparse.ArgumentParser(
-        description='Run the PyTorch unit test suite',
-        epilog='where TESTS is any of: {}'.format(', '.join(TESTS)),
-        formatter_class=argparse.RawTextHelpFormatter)
+        description="Run the PyTorch unit test suite",
+        epilog="where TESTS is any of: {}".format(", ".join(TESTS)),
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
     parser.add_argument(
-        '-v',
-        '--verbose',
-        action='count',
+        "-v",
+        "--verbose",
+        action="count",
         default=0,
-        help='print verbose information and test-by-test results')
-    parser.add_argument(
-        '--jit',
-        '--jit',
-        action='store_true',
-        help='run all jit tests')
+        help="print verbose information and test-by-test results",
+    )
+    parser.add_argument("--jit", "--jit", action="store_true", help="run all jit tests")
     parser.add_argument(
-        '--distributed-tests',
-        '--distributed-tests',
-        action='store_true',
-        help='run all distributed tests')
+        "--distributed-tests",
+        "--distributed-tests",
+        action="store_true",
+        help="run all distributed tests",
+    )
     parser.add_argument(
-        '-pt', '--pytest', action='store_true',
-        help='If true, use `pytest` to execute the tests. E.g., this runs '
-             'TestTorch with pytest in verbose and coverage mode: '
-             'python run_test.py -vci torch -pt')
+        "-pt",
+        "--pytest",
+        action="store_true",
+        help="If true, use `pytest` to execute the tests. E.g., this runs "
+        "TestTorch with pytest in verbose and coverage mode: "
+        "python run_test.py -vci torch -pt",
+    )
     parser.add_argument(
-        '-c', '--coverage', action='store_true', help='enable coverage',
-        default=PYTORCH_COLLECT_COVERAGE)
+        "-c",
+        "--coverage",
+        action="store_true",
+        help="enable coverage",
+        default=PYTORCH_COLLECT_COVERAGE,
+    )
     parser.add_argument(
-        '-i',
-        '--include',
-        nargs='+',
+        "-i",
+        "--include",
+        nargs="+",
         choices=TestChoices(TESTS),
         default=TESTS,
-        metavar='TESTS',
-        help='select a set of tests to include (defaults to ALL tests).'
-             ' tests must be a part of the TESTS list defined in run_test.py')
+        metavar="TESTS",
+        help="select a set of tests to include (defaults to ALL tests)."
+        " tests must be a part of the TESTS list defined in run_test.py",
+    )
     parser.add_argument(
-        '-x',
-        '--exclude',
-        nargs='+',
+        "-x",
+        "--exclude",
+        nargs="+",
         choices=TESTS,
-        metavar='TESTS',
+        metavar="TESTS",
         default=[],
-        help='select a set of tests to exclude')
+        help="select a set of tests to exclude",
+    )
     parser.add_argument(
-        '-f',
-        '--first',
+        "-f",
+        "--first",
         choices=TESTS,
-        metavar='TESTS',
-        help='select the test to start from (excludes previous tests)')
+        metavar="TESTS",
+        help="select the test to start from (excludes previous tests)",
+    )
     parser.add_argument(
-        '-l',
-        '--last',
+        "-l",
+        "--last",
         choices=TESTS,
-        metavar='TESTS',
-        help='select the last test to run (excludes following tests)')
+        metavar="TESTS",
+        help="select the last test to run (excludes following tests)",
+    )
     parser.add_argument(
-        '--bring-to-front',
-        nargs='+',
+        "--bring-to-front",
+        nargs="+",
         choices=TestChoices(TESTS),
         default=[],
-        metavar='TESTS',
-        help='select a set of tests to run first. This can be used in situations'
-             ' where you want to run all tests, but care more about some set, '
-             'e.g. after making a change to a specific component')
+        metavar="TESTS",
+        help="select a set of tests to run first. This can be used in situations"
+        " where you want to run all tests, but care more about some set, "
+        "e.g. after making a change to a specific component",
+    )
     parser.add_argument(
-        '--ignore-win-blocklist',
-        action='store_true',
-        help='always run blocklisted windows tests')
+        "--ignore-win-blocklist",
+        action="store_true",
+        help="always run blocklisted windows tests",
+    )
     parser.add_argument(
-        '--determine-from',
-        help='File of affected source filenames to determine which tests to run.')
+        "--determine-from",
+        help="File of affected source filenames to determine which tests to run.",
+    )
     parser.add_argument(
-        '--continue-through-error',
-        action='store_true',
-        help='Runs the full test suite despite one of the tests failing',
-        default=strtobool(os.environ.get("CONTINUE_THROUGH_ERROR", "False")))
+        "--continue-through-error",
+        action="store_true",
+        help="Runs the full test suite despite one of the tests failing",
+        default=strtobool(os.environ.get("CONTINUE_THROUGH_ERROR", "False")),
+    )
     parser.add_argument(
-        'additional_unittest_args',
-        nargs='*',
-        help='additional arguments passed through to unittest, e.g., '
-             'python run_test.py -i sparse -- TestSparse.test_factory_size_check')
+        "additional_unittest_args",
+        nargs="*",
+        help="additional arguments passed through to unittest, e.g., "
+        "python run_test.py -i sparse -- TestSparse.test_factory_size_check",
+    )
     parser.add_argument(
-        '--export-past-test-times',
-        nargs='?',
+        "--export-past-test-times",
+        nargs="?",
         type=str,
         const=TEST_TIMES_FILE,
-        help='dumps test times from previous S3 stats into a file, format JSON',
+        help="dumps test times from previous S3 stats into a file, format JSON",
     )
     parser.add_argument(
-        '--shard',
+        "--shard",
         nargs=2,
         type=int,
-        help='runs a shard of the tests (taking into account other selections), e.g., '
-        '--shard 2 3 will break up the selected tests into 3 shards and run the tests '
-        'in the 2nd shard (the first number should not exceed the second)',
+        help="runs a shard of the tests (taking into account other selections), e.g., "
+        "--shard 2 3 will break up the selected tests into 3 shards and run the tests "
+        "in the 2nd shard (the first number should not exceed the second)",
     )
     parser.add_argument(
-        '--exclude-jit-executor',
-        action='store_true',
-        help='exclude tests that are run for a specific jit config'
+        "--exclude-jit-executor",
+        action="store_true",
+        help="exclude tests that are run for a specific jit config",
     )
     parser.add_argument(
-        '--exclude-distributed-tests',
-        action='store_true',
-        help='exclude distributed tests'
+        "--exclude-distributed-tests",
+        action="store_true",
+        help="exclude distributed tests",
     )
     parser.add_argument(
-        '--run-specified-test-cases',
-        nargs='?',
+        "--run-specified-test-cases",
+        nargs="?",
         type=str,
         const=SPECIFIED_TEST_CASES_FILE,
-        help='load specified test cases file dumped from previous OSS CI stats, format CSV. '
-        ' If all test cases should run for a <test_module> please add a single row: \n'
-        ' test_filename,test_case_name\n'
-        ' ...\n'
-        ' <test_module>,__all__\n'
-        ' ...\n'
-        'how we use the stats will be based on option "--use-specified-test-cases-by".'
+        help="load specified test cases file dumped from previous OSS CI stats, format CSV. "
+        " If all test cases should run for a <test_module> please add a single row: \n"
+        " test_filename,test_case_name\n"
+        " ...\n"
+        " <test_module>,__all__\n"
+        " ...\n"
+        'how we use the stats will be based on option "--use-specified-test-cases-by".',
     )
     parser.add_argument(
-        '--use-specified-test-cases-by',
+        "--use-specified-test-cases-by",
         type=str,
-        choices=['include', 'bring-to-front'],
-        default='include',
+        choices=["include", "bring-to-front"],
+        default="include",
         help='used together with option "--run-specified-test-cases". When specified test case '
-        'file is set, this option allows the user to control whether to only run the specified test '
-        'modules or to simply bring the specified modules to front and also run the remaining '
-        'modules. Note: regardless of this option, we will only run the specified test cases '
-        ' within a specified test module. For unspecified test modules with the bring-to-front '
-        'option, all test cases will be run, as one may expect.',
+        "file is set, this option allows the user to control whether to only run the specified test "
+        "modules or to simply bring the specified modules to front and also run the remaining "
+        "modules. Note: regardless of this option, we will only run the specified test cases "
+        " within a specified test module. For unspecified test modules with the bring-to-front "
+        "option, all test cases will be run, as one may expect.",
     )
     return parser.parse_args()
 
@@ -809,7 +873,7 @@ def exclude_tests(exclude_list, selected_tests, exclude_message=None):
         for test in tests_copy:
             if test.startswith(exclude_test):
                 if exclude_message is not None:
-                    print_to_stderr('Excluding {} {}'.format(test, exclude_message))
+                    print_to_stderr("Excluding {} {}".format(test, exclude_message))
                 selected_tests.remove(test)
     return selected_tests
 
@@ -817,9 +881,9 @@ def exclude_tests(exclude_list, selected_tests, exclude_message=None):
 def get_selected_tests(options):
     # First make sure run specific test cases options are processed.
     if options.run_specified_test_cases:
-        if options.use_specified_test_cases_by == 'include':
+        if options.use_specified_test_cases_by == "include":
             options.include = list(SPECIFIED_TEST_CASES_DICT.keys())
-        elif options.use_specified_test_cases_by == 'bring-to-front':
+        elif options.use_specified_test_cases_by == "bring-to-front":
             options.bring_to_front = list(SPECIFIED_TEST_CASES_DICT.keys())
 
     selected_tests = options.include
@@ -827,17 +891,20 @@ def get_selected_tests(options):
     # filter if there's JIT only and distributed only test options
     if options.jit:
         selected_tests = list(
-            filter(lambda test_name: "jit" in test_name, selected_tests))
+            filter(lambda test_name: "jit" in test_name, selected_tests)
+        )
 
     if options.distributed_tests:
         selected_tests = list(
-            filter(lambda test_name: test_name in DISTRIBUTED_TESTS, selected_tests))
+            filter(lambda test_name: test_name in DISTRIBUTED_TESTS, selected_tests)
+        )
 
     # process reordering
     if options.bring_to_front:
         to_front = set(options.bring_to_front)
-        selected_tests = options.bring_to_front + list(filter(lambda name: name not in to_front,
-                                                              selected_tests))
+        selected_tests = options.bring_to_front + list(
+            filter(lambda name: name not in to_front, selected_tests)
+        )
 
     if options.first:
         first_index = find_test_index(options.first, selected_tests)
@@ -845,7 +912,7 @@ def get_selected_tests(options):
 
     if options.last:
         last_index = find_test_index(options.last, selected_tests, find_last_index=True)
-        selected_tests = selected_tests[:last_index + 1]
+        selected_tests = selected_tests[: last_index + 1]
 
     # process exclusion
     if options.exclude_jit_executor:
@@ -856,30 +923,36 @@ def get_selected_tests(options):
 
     selected_tests = exclude_tests(options.exclude, selected_tests)
 
-    if sys.platform == 'win32' and not options.ignore_win_blocklist:
-        target_arch = os.environ.get('VSCMD_ARG_TGT_ARCH')
-        if target_arch != 'x64':
-            WINDOWS_BLOCKLIST.append('cpp_extensions_aot_no_ninja')
-            WINDOWS_BLOCKLIST.append('cpp_extensions_aot_ninja')
-            WINDOWS_BLOCKLIST.append('cpp_extensions_jit')
-            WINDOWS_BLOCKLIST.append('jit')
-            WINDOWS_BLOCKLIST.append('jit_fuser')
+    if sys.platform == "win32" and not options.ignore_win_blocklist:
+        target_arch = os.environ.get("VSCMD_ARG_TGT_ARCH")
+        if target_arch != "x64":
+            WINDOWS_BLOCKLIST.append("cpp_extensions_aot_no_ninja")
+            WINDOWS_BLOCKLIST.append("cpp_extensions_aot_ninja")
+            WINDOWS_BLOCKLIST.append("cpp_extensions_jit")
+            WINDOWS_BLOCKLIST.append("jit")
+            WINDOWS_BLOCKLIST.append("jit_fuser")
 
-        selected_tests = exclude_tests(WINDOWS_BLOCKLIST, selected_tests, 'on Windows')
+        selected_tests = exclude_tests(WINDOWS_BLOCKLIST, selected_tests, "on Windows")
 
     elif TEST_WITH_ROCM:
-        selected_tests = exclude_tests(ROCM_BLOCKLIST, selected_tests, 'on ROCm')
+        selected_tests = exclude_tests(ROCM_BLOCKLIST, selected_tests, "on ROCm")
 
     # sharding
     if options.shard:
         assert len(options.shard) == 2, "Unexpected shard format"
         assert min(options.shard) > 0, "Shards must be positive numbers"
         which_shard, num_shards = options.shard
-        assert which_shard <= num_shards, "Selected shard must be less than or equal to total number of shards"
-        assert num_shards <= len(selected_tests), f"Number of shards must be less than {len(selected_tests)}"
+        assert (
+            which_shard <= num_shards
+        ), "Selected shard must be less than or equal to total number of shards"
+        assert num_shards <= len(
+            selected_tests
+        ), f"Number of shards must be less than {len(selected_tests)}"
         # TODO: fix this to use test_times_filename, but currently this is not working
         # because setting the export arg immeidately halts the test execution.
-        selected_tests = get_shard_based_on_S3(which_shard, num_shards, selected_tests, TEST_TIMES_FILE)
+        selected_tests = get_shard_based_on_S3(
+            which_shard, num_shards, selected_tests, TEST_TIMES_FILE
+        )
 
     return selected_tests
 
@@ -896,27 +969,27 @@ def test_impact_of_file(filename):
         CI - CI configuration files
     """
     parts = filename.split(os.sep)
-    if parts[0] in ['.jenkins', '.circleci']:
-        return 'CI'
-    if parts[0] in ['docs', 'scripts', 'CODEOWNERS', 'README.md']:
-        return 'NONE'
-    elif parts[0] == 'torch':
-        if parts[-1].endswith('.py') or parts[-1].endswith('.pyi'):
-            return 'TORCH'
-    elif parts[0] == 'caffe2':
-        if parts[-1].endswith('.py') or parts[-1].endswith('.pyi'):
-            return 'CAFFE2'
-    elif parts[0] == 'test':
-        if parts[-1].endswith('.py') or parts[-1].endswith('.pyi'):
-            return 'TEST'
-
-    return 'UNKNOWN'
+    if parts[0] in [".jenkins", ".circleci"]:
+        return "CI"
+    if parts[0] in ["docs", "scripts", "CODEOWNERS", "README.md"]:
+        return "NONE"
+    elif parts[0] == "torch":
+        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
+            return "TORCH"
+    elif parts[0] == "caffe2":
+        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
+            return "CAFFE2"
+    elif parts[0] == "test":
+        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
+            return "TEST"
+
+    return "UNKNOWN"
 
 
 def log_test_reason(file_type, filename, test, options):
     if options.verbose:
         print_to_stderr(
-            'Determination found {} file {} -- running {}'.format(
+            "Determination found {} file {} -- running {}".format(
                 file_type,
                 filename,
                 test,
@@ -930,37 +1003,37 @@ def get_dep_modules(test):
         return _DEP_MODULES_CACHE[test]
 
     repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    test_location = os.path.join(repo_root, 'test', test + '.py')
+    test_location = os.path.join(repo_root, "test", test + ".py")
     finder = modulefinder.ModuleFinder(
         # Ideally exclude all third party modules, to speed up calculation.
         excludes=[
-            'scipy',
-            'numpy',
-            'numba',
-            'multiprocessing',
-            'sklearn',
-            'setuptools',
-            'hypothesis',
-            'llvmlite',
-            'joblib',
-            'email',
-            'importlib',
-            'unittest',
-            'urllib',
-            'json',
-            'collections',
+            "scipy",
+            "numpy",
+            "numba",
+            "multiprocessing",
+            "sklearn",
+            "setuptools",
+            "hypothesis",
+            "llvmlite",
+            "joblib",
+            "email",
+            "importlib",
+            "unittest",
+            "urllib",
+            "json",
+            "collections",
             # Modules below are excluded because they are hitting https://bugs.python.org/issue40350
             # Trigger AttributeError: 'NoneType' object has no attribute 'is_package'
-            'mpl_toolkits',
-            'google',
-            'onnx',
+            "mpl_toolkits",
+            "google",
+            "onnx",
             # Triggers RecursionError
-            'mypy'
+            "mypy",
         ],
     )
     # HACK: some platforms default to ascii, so we can't just run_script :(
-    with open(test_location, 'r', encoding='utf-8') as fp:
-        finder.load_module('__main__', fp, test_location, ('', 'r', 1))
+    with open(test_location, "r", encoding="utf-8") as fp:
+        finder.load_module("__main__", fp, test_location, ("", "r", 1))
 
     dep_modules = set(finder.modules.keys())
     _DEP_MODULES_CACHE[test] = dep_modules
@@ -972,45 +1045,44 @@ def determine_target(target_det_list, test, touched_files, options):
     # Some tests are faster to execute than to determine.
     if test not in target_det_list:
         if options.verbose:
-            print_to_stderr(f'Running {test} without determination')
+            print_to_stderr(f"Running {test} without determination")
         return True
     # HACK: "no_ninja" is not a real module
-    if test.endswith('_no_ninja'):
-        test = test[:(-1 * len('_no_ninja'))]
-    if test.endswith('_ninja'):
-        test = test[:(-1 * len('_ninja'))]
+    if test.endswith("_no_ninja"):
+        test = test[: (-1 * len("_no_ninja"))]
+    if test.endswith("_ninja"):
+        test = test[: (-1 * len("_ninja"))]
 
     dep_modules = get_dep_modules(test)
 
     for touched_file in touched_files:
         file_type = test_impact_of_file(touched_file)
-        if file_type == 'NONE':
+        if file_type == "NONE":
             continue
-        elif file_type == 'CI':
+        elif file_type == "CI":
             # Force all tests to run if any change is made to the CI
             # configurations.
             log_test_reason(file_type, touched_file, test, options)
             return True
-        elif file_type == 'UNKNOWN':
+        elif file_type == "UNKNOWN":
             # Assume uncategorized source files can affect every test.
             log_test_reason(file_type, touched_file, test, options)
             return True
-        elif file_type in ['TORCH', 'CAFFE2', 'TEST']:
+        elif file_type in ["TORCH", "CAFFE2", "TEST"]:
             parts = os.path.splitext(touched_file)[0].split(os.sep)
             touched_module = ".".join(parts)
             # test/ path does not have a "test." namespace
-            if touched_module.startswith('test.'):
-                touched_module = touched_module.split('test.')[1]
-            if (
-                touched_module in dep_modules
-                or touched_module == test.replace('/', '.')
+            if touched_module.startswith("test."):
+                touched_module = touched_module.split("test.")[1]
+            if touched_module in dep_modules or touched_module == test.replace(
+                "/", "."
             ):
                 log_test_reason(file_type, touched_file, test, options)
                 return True
 
     # If nothing has determined the test has run, don't run the test.
     if options.verbose:
-        print_to_stderr(f'Determination is skipping {test}')
+        print_to_stderr(f"Determination is skipping {test}")
 
     return False
 
@@ -1019,20 +1091,21 @@ def run_test_module(test: str, test_directory: str, options) -> Optional[str]:
     test_module = parse_test_module(test)
 
     # Printing the date here can help diagnose which tests are slow
-    print_to_stderr('Running {} ... [{}]'.format(test, datetime.now()))
+    print_to_stderr("Running {} ... [{}]".format(test, datetime.now()))
     handler = CUSTOM_HANDLERS.get(test_module, run_test)
     return_code = handler(test_module, test_directory, options)
     assert isinstance(return_code, int) and not isinstance(
-        return_code, bool), 'Return code should be an integer'
+        return_code, bool
+    ), "Return code should be an integer"
     if return_code == 0:
         return None
 
-    message = f'{test} failed!'
+    message = f"{test} failed!"
     if return_code < 0:
         # subprocess.Popen returns the child process' exit signal as
         # return code -N, where N is the signal number.
         signal_name = SIGNALS_TO_NAMES_DICT[-return_code]
-        message += f' Received signal: {signal_name}'
+        message += f" Received signal: {signal_name}"
     return message
 
 
@@ -1042,44 +1115,60 @@ def main():
     # TODO: move this export & download function in tools/ folder
     test_times_filename = options.export_past_test_times
     if test_times_filename:
-        print(f'Exporting past test times from S3 to {test_times_filename}, no tests will be run.')
+        print(
+            f"Exporting past test times from S3 to {test_times_filename}, no tests will be run."
+        )
         export_S3_test_times(test_times_filename)
         return
 
     specified_test_cases_filename = options.run_specified_test_cases
     if specified_test_cases_filename:
-        print(f'Loading specified test cases to run from {specified_test_cases_filename}.')
+        print(
+            f"Loading specified test cases to run from {specified_test_cases_filename}."
+        )
         global SPECIFIED_TEST_CASES_DICT
-        SPECIFIED_TEST_CASES_DICT = get_specified_test_cases(specified_test_cases_filename, TESTS)
+        SPECIFIED_TEST_CASES_DICT = get_specified_test_cases(
+            specified_test_cases_filename, TESTS
+        )
 
     test_directory = os.path.dirname(os.path.abspath(__file__))
     selected_tests = get_selected_tests(options)
 
     if options.verbose:
-        print_to_stderr('Selected tests: {}'.format(', '.join(selected_tests)))
+        print_to_stderr("Selected tests: {}".format(", ".join(selected_tests)))
 
     if options.coverage and not PYTORCH_COLLECT_COVERAGE:
-        shell(['coverage', 'erase'])
+        shell(["coverage", "erase"])
 
     if options.determine_from is not None and os.path.exists(options.determine_from):
-        slow_tests = get_slow_tests_based_on_S3(TESTS, TARGET_DET_LIST, SLOW_TEST_THRESHOLD)
-        print('Added the following tests to target_det tests as calculated based on S3:')
+        slow_tests = get_slow_tests_based_on_S3(
+            TESTS, TARGET_DET_LIST, SLOW_TEST_THRESHOLD
+        )
+        print(
+            "Added the following tests to target_det tests as calculated based on S3:"
+        )
         print(slow_tests)
-        with open(options.determine_from, 'r') as fh:
+        with open(options.determine_from, "r") as fh:
             touched_files = [
-                os.path.normpath(name.strip()) for name in fh.read().split('\n')
+                os.path.normpath(name.strip())
+                for name in fh.read().split("\n")
                 if len(name.strip()) > 0
             ]
         # HACK: Ensure the 'test' paths can be traversed by Modulefinder
-        sys.path.append('test')
+        sys.path.append("test")
         selected_tests = [
-            test for test in selected_tests
-            if determine_target(TARGET_DET_LIST + slow_tests, test, touched_files, options)
+            test
+            for test in selected_tests
+            if determine_target(
+                TARGET_DET_LIST + slow_tests, test, touched_files, options
+            )
         ]
-        sys.path.remove('test')
+        sys.path.remove("test")
 
     if IS_IN_CI:
-        selected_tests = get_reordered_tests(selected_tests, ENABLE_PR_HISTORY_REORDERING)
+        selected_tests = get_reordered_tests(
+            selected_tests, ENABLE_PR_HISTORY_REORDERING
+        )
         # downloading test cases configuration to local environment
         get_test_case_configs(dirpath=os.path.dirname(os.path.abspath(__file__)))
 
@@ -1101,6 +1190,7 @@ def main():
     finally:
         if options.coverage:
             from coverage import Coverage
+
             test_dir = os.path.dirname(os.path.abspath(__file__))
             with set_cwd(test_dir):
                 cov = Coverage()
@@ -1116,5 +1206,6 @@ def main():
             print_to_stderr(err)
         sys.exit(1)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()

From 3d4aabfc483f274817749c45870a32306b67bfd8 Mon Sep 17 00:00:00 2001
From: zhouzhuojie <zhouzhuojie@gmail.com>
Date: Wed, 25 Aug 2021 11:30:28 -0700
Subject: [PATCH 221/530] Fix ciflow/all label generation (#63954)

Summary:
the `ciflow/all` is automatically added but need to be added before we call `gen_root_job_condition`.

- fix the order of adding `ciflow/all`
- refactor all the string into global constants

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63954

Reviewed By: malfet

Differential Revision: D30545596

Pulled By: zhouzhuojie

fbshipit-source-id: 83ab668f0234488afb855a72e3ebd4503f7f1a78
---
 .github/scripts/generate_ci_workflows.py      | 68 +++++++++++--------
 ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml |  2 +-
 ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml |  2 +-
 ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml |  2 +-
 ...rated-linux-bionic-py3.8-gcc9-coverage.yml |  2 +-
 ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml |  2 +-
 ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml |  2 +-
 .../generated-linux-xenial-py3.6-gcc5.4.yml   |  2 +-
 ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml |  2 +-
 ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml |  2 +-
 ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml |  2 +-
 ...rated-periodic-win-vs2019-cuda11.1-py3.yml |  2 +-
 .../generated-win-vs2019-cpu-py3.yml          |  2 +-
 .../generated-win-vs2019-cuda10.1-py3.yml     |  2 +-
 .../generated-win-vs2019-cuda11.3-py3.yml     |  2 +-
 15 files changed, 53 insertions(+), 43 deletions(-)

diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index e24c2e5af3893..946d8da6a29ad 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -38,6 +38,18 @@
     LINUX_CPU_TEST_RUNNER,
 }
 
+LABEL_CIFLOW_ALL = "ciflow/all"
+LABEL_CIFLOW_BAZEL = "ciflow/bazel"
+LABEL_CIFLOW_COVERAGE = "ciflow/coverage"
+LABEL_CIFLOW_CPU = "ciflow/cpu"
+LABEL_CIFLOW_CUDA = "ciflow/cuda"
+LABEL_CIFLOW_DEFAULT = "ciflow/default"
+LABEL_CIFLOW_LIBTORCH = "ciflow/libtorch"
+LABEL_CIFLOW_LINUX = "ciflow/linux"
+LABEL_CIFLOW_SCHEDULED = "ciflow/scheduled"
+LABEL_CIFLOW_SLOW = "ciflow/slow"
+LABEL_CIFLOW_WIN = "ciflow/win"
+
 
 @dataclass
 class CIFlowConfig:
@@ -73,6 +85,7 @@ def __post_init__(self) -> None:
         if not self.enabled:
             self.reset_root_job()
             return
+        self.labels.add(LABEL_CIFLOW_ALL)
         self.gen_root_job_condition()
 
 
@@ -149,10 +162,6 @@ def __post_init__(self) -> None:
                 self.num_test_shards_on_pull_request = 1
             else:
                 self.num_test_shards_on_pull_request = self.num_test_shards
-
-        # Add ciflow/all to labels
-        self.ciflow_config.labels.add('ciflow/all')
-
         self.assert_valid()
 
     def assert_valid(self) -> None:
@@ -163,18 +172,19 @@ def assert_valid(self) -> None:
             assert self.test_runner_type in WINDOWS_RUNNERS, err_message
 
         if self.ciflow_config.enabled:
-            # make sure if ciflow/default is set, we then need to set trigger_action_only to False
-            assert self.ciflow_config.trigger_action_only != ('ciflow/default' in self.ciflow_config.labels)
+            # make sure if LABEL_CIFLOW_DEFAULT is set, we then need to set trigger_action_only to False
+            assert self.ciflow_config.trigger_action_only != (LABEL_CIFLOW_DEFAULT in self.ciflow_config.labels)
             assert self.on_pull_request
-            assert 'ciflow/all' in self.ciflow_config.labels
+            assert LABEL_CIFLOW_ALL in self.ciflow_config.labels
+            assert LABEL_CIFLOW_ALL in self.ciflow_config.root_job_condition
             if self.arch == 'linux':
-                assert 'ciflow/linux' in self.ciflow_config.labels
+                assert LABEL_CIFLOW_LINUX in self.ciflow_config.labels
             if self.arch == 'windows':
-                assert 'ciflow/win' in self.ciflow_config.labels
+                assert LABEL_CIFLOW_WIN in self.ciflow_config.labels
             if self.test_runner_type in CUDA_RUNNERS:
-                assert 'ciflow/cuda' in self.ciflow_config.labels
+                assert LABEL_CIFLOW_CUDA in self.ciflow_config.labels
             if self.test_runner_type in CPU_RUNNERS:
-                assert 'ciflow/cpu' in self.ciflow_config.labels
+                assert LABEL_CIFLOW_CPU in self.ciflow_config.labels
 
     def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         output_file_path = GITHUB_DIR / f"workflows/generated-{self.build_environment}.yml"
@@ -196,7 +206,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         num_test_shards=2,
         ciflow_config=CIFlowConfig(
             enabled=True,
-            labels={'ciflow/default', 'ciflow/cpu', 'ciflow/win'}
+            labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CPU, LABEL_CIFLOW_WIN}
         ),
     ),
     CIWorkflow(
@@ -209,7 +219,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         num_test_shards=2,
         ciflow_config=CIFlowConfig(
             enabled=True,
-            labels={'ciflow/default', 'ciflow/cuda', 'ciflow/win'}
+            labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
         ),
     ),
     CIWorkflow(
@@ -222,7 +232,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels={'ciflow/cuda', 'ciflow/win'}
+            labels={LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
         ),
     ),
     CIWorkflow(
@@ -236,7 +246,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels={'ciflow/scheduled', 'ciflow/win', 'ciflow/cuda'}
+            labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_WIN, LABEL_CIFLOW_CUDA}
         ),
     ),
 ]
@@ -252,7 +262,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         num_test_shards=2,
         ciflow_config=CIFlowConfig(
             enabled=True,
-            labels={'ciflow/default', 'ciflow/linux', 'ciflow/cpu'}
+            labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}
         ),
     ),
     # CIWorkflow(
@@ -301,7 +311,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels={'ciflow/slow', 'ciflow/linux', 'ciflow/cuda'}
+            labels={LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA}
         ),
     ),
     CIWorkflow(
@@ -319,7 +329,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels=set(['ciflow/slow', 'ciflow/linux', 'ciflow/cuda']),
+            labels=set([LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]),
         ),
     ),
     CIWorkflow(
@@ -332,7 +342,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels=set(['ciflow/libtorch', 'ciflow/linux', 'ciflow/cuda']),
+            labels=set([LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]),
         ),
     ),
     CIWorkflow(
@@ -344,7 +354,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         on_pull_request=True,
         ciflow_config=CIFlowConfig(
             enabled=True,
-            labels=set(['ciflow/default', 'ciflow/linux', 'ciflow/cuda']),
+            labels=set([LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]),
         ),
     ),
     CIWorkflow(
@@ -357,7 +367,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels=set(['ciflow/libtorch', 'ciflow/linux', 'ciflow/cuda']),
+            labels=set([LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]),
         ),
     ),
     CIWorkflow(
@@ -371,7 +381,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels={'ciflow/scheduled', 'ciflow/linux', 'ciflow/cuda'}
+            labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA}
         ),
     ),
     CIWorkflow(
@@ -385,7 +395,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         ciflow_config=CIFlowConfig(
             enabled=True,
             trigger_action_only=True,
-            labels={'ciflow/scheduled', 'ciflow/linux', 'ciflow/libtorch', 'ciflow/cuda'},
+            labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_CUDA},
         ),
     ),
     # CIWorkflow(
@@ -416,7 +426,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         num_test_shards=2,
         ciflow_config=CIFlowConfig(
             enabled=True,
-            labels={'ciflow/default', 'ciflow/coverage', 'ciflow/linux', 'ciflow/cpu'},
+            labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_COVERAGE, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU},
         ),
     ),
     # CIWorkflow(
@@ -485,7 +495,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         on_pull_request=True,
         ciflow_config=CIFlowConfig(
             enabled=True,
-            labels={'ciflow/default', 'ciflow/bazel', 'ciflow/cpu', 'ciflow/linux'},
+            labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BAZEL, LABEL_CIFLOW_CPU, LABEL_CIFLOW_LINUX},
         ),
     ),
 ]
@@ -517,8 +527,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
                 ciflow_ruleset.add_label_rule(workflow.ciflow_config.labels, workflow.build_environment)
             elif workflow.on_pull_request:
                 # If ciflow is disabled but still on_pull_request, we can denote
-                # it as a special label 'ciflow/default' in the ruleset, which will be later
-                # turned into an actual 'ciflow/default' label in the workflow.
-                # During the rollout phase, it has the same effect as 'ciflow/default'
-                ciflow_ruleset.add_label_rule({'ciflow/default'}, workflow.build_environment)
+                # it as a special label LABEL_CIFLOW_DEFAULT in the ruleset, which will be later
+                # turned into an actual LABEL_CIFLOW_DEFAULT label in the workflow.
+                # During the rollout phase, it has the same effect as LABEL_CIFLOW_DEFAULT
+                ciflow_ruleset.add_label_rule({LABEL_CIFLOW_DEFAULT}, workflow.build_environment)
     ciflow_ruleset.generate_json()
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
index f45ed052e3838..72a9c4effeee3 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -32,7 +32,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 95261026f3862..937a531c977e5 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -32,7 +32,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 61a817ea64bc1..5a4b6c6a56c78 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -32,7 +32,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index f07b8712b6ea1..1226715485f21 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -32,7 +32,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/coverage') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/coverage') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index cb8c6b55b1789..38321b1834b26 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -32,7 +32,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 3273cb0395437..2daf432ae76c8 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -32,7 +32,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index e3be43370a777..3551fe9845218 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -32,7 +32,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 1827249beae99..9e787e4ba3845 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -32,7 +32,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
index de0aa4bb3333c..09989ef516a7e 100644
--- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -30,7 +30,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 3f1b5b4a85f68..c87397849106e 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -30,7 +30,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
index 4bf74faae1843..2c673ccce0f43 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
@@ -35,7 +35,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index 53acdd8a961b7..54362c903f7d0 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -35,7 +35,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index a3447bc41f616..c8497bd3029ee 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -37,7 +37,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index 1b423008fe5fd..205758657d9b2 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -37,7 +37,7 @@ concurrency:
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/win')) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run

From 72995657681f6173413b5ee7c62bd91212d07e8d Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Wed, 25 Aug 2021 11:53:24 -0700
Subject: [PATCH 222/530] Update torch.distributed.run OMP_NUM_THREADS message
 to log.warning (#63953)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63953

Closes #61138

Test:
`python -m torch.distributed.run --nproc_per_node 2 test.py`
Still outputs message

`LOGLEVEL=ERROR python -m torch.distributed.run --nproc_per_node 2 test.py`
Does not output message anymore

cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D30542997

Pulled By: H-Huang

fbshipit-source-id: e7da30dcda51516abf4e56f1f510132e44397027
---
 torch/distributed/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/distributed/run.py b/torch/distributed/run.py
index 9fb88fa3a2c96..f21fc4e68808f 100644
--- a/torch/distributed/run.py
+++ b/torch/distributed/run.py
@@ -595,7 +595,7 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
     nproc_per_node = determine_local_world_size(args.nproc_per_node)
     if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
         omp_num_threads = 1
-        print(
+        log.warning(
             f"*****************************************\n"
             f"Setting OMP_NUM_THREADS environment variable for each process to be "
             f"{omp_num_threads} in default, to avoid your system being overloaded, "

From b0782f0f328321ab3ede798dfed3c7a143130e31 Mon Sep 17 00:00:00 2001
From: mingfeima <mingfei.ma@intel.com>
Date: Wed, 25 Aug 2021 11:53:52 -0700
Subject: [PATCH 223/530] add BFloat16 support for bernoulli and Dropout on CPU
 (#56372)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/56372

Test Plan: Imported from OSS

Reviewed By: heitorschueroff

Differential Revision: D28836792

Pulled By: VitalyFedyunin

fbshipit-source-id: ede951d172a59276e11383fd767778ab959b5a6b
---
 aten/src/ATen/native/cpu/DistributionTemplates.h | 6 +++---
 aten/src/ATen/native/cpu/UnaryOpsKernel.cpp      | 2 +-
 test/test_nn.py                                  | 2 +-
 test/test_torch.py                               | 1 +
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h
index 66bd31fa74d45..15b1916b9892c 100644
--- a/aten/src/ATen/native/cpu/DistributionTemplates.h
+++ b/aten/src/ATen/native/cpu/DistributionTemplates.h
@@ -308,7 +308,7 @@ struct ExponentialKernel {
 
 template<typename RNG>
 void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) {
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] {
     // See Note [Acquire lock when using random generators]
     std::lock_guard<std::mutex> lock(generator->mutex_);
     using self_t = scalar_t;
@@ -325,7 +325,7 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) {
         return static_cast<self_t>(bernoulli(generator));
       });
     } else {
-      AT_DISPATCH_FLOATING_TYPES(p_.scalar_type(), "bernoulli_tensor_cpu_p_", [&] {
+      AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p_.scalar_type(), "bernoulli_tensor_cpu_p_", [&] {
         using p_t = scalar_t;
         cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t {
           at::bernoulli_distribution<float> bernoulli(p_val);
@@ -338,7 +338,7 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) {
 
 template<typename RNG>
 void bernoulli_kernel(Tensor& self, double p, RNG generator) {
-  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
     // See Note [Acquire lock when using random generators]
     std::lock_guard<std::mutex> lock(generator->mutex_);
     auto iter = TensorIterator::borrowing_nullary_op(self);
diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
index 007e444d5cd33..f86f0a349dace 100644
--- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
@@ -488,7 +488,7 @@ void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional<Generator> ge
     int64_t n = self.numel();
     bool contig = self.is_contiguous();
 
-    AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
+    AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
       at::Tensor tmp_int_tensor;
       if (std::is_same<scalar_t, int>::value && contig) {
         tmp_int_tensor = self;
diff --git a/test/test_nn.py b/test/test_nn.py
index d577493fd531c..8c3541aca0716 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -12984,7 +12984,7 @@ def test_Dropout(self, device):
 
         self._test_dropout_stride_mean_preserve(nn.Dropout, device)
 
-        if self.device_type == 'cuda':
+        if self.device_type == 'cuda' or self.device_type == 'cpu':
             input = input.bfloat16()
             self._test_dropout(nn.Dropout, device, input)
 
diff --git a/test/test_torch.py b/test/test_torch.py
index d0f631a2eab52..15e36c83654db 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -4324,6 +4324,7 @@ def test_repeat_interleave(self, device):
             self.assertEqual(a_with_output.size(), torch.Size([3, 2]))
 
     @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False)))
+    @dtypesIfCPU(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=True)))
     @dtypesIfCUDA(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False)))
     def test_bernoulli_p(self, device, dtype):
         for trivial_p in ([0, 1], [1, 0, 1, 1, 0, 1]):

From 7edeead796abf374a713e7855f13b980d7a9c517 Mon Sep 17 00:00:00 2001
From: Yi Wang <wayi@fb.com>
Date: Wed, 25 Aug 2021 12:46:09 -0700
Subject: [PATCH 224/530] Add a comment on the potential implicit type
 up-casting (#63905)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63905

as title
ghstack-source-id: 136590703

Test Plan: N/A

Reviewed By: mrshenli

Differential Revision: D30527929

fbshipit-source-id: 69402bbfa87cfd8fc166ce313cde9736ee072589
---
 torch/distributed/algorithms/model_averaging/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/distributed/algorithms/model_averaging/utils.py b/torch/distributed/algorithms/model_averaging/utils.py
index ce1fb65401ad2..a2bbac2a25474 100644
--- a/torch/distributed/algorithms/model_averaging/utils.py
+++ b/torch/distributed/algorithms/model_averaging/utils.py
@@ -20,6 +20,9 @@ def average_parameters(
         return
 
     params_it1, params_it2 = itertools.tee(params)
+    # If the input parameters have different data types,
+    # packing these parameters will trigger an implicit type up-casting.
+    # The original parameter data types will be restored during the subsequent unpacking.
     flat_params = torch.cat([p.data.view(-1) for p in params_it1])
     flat_params /= dist.get_world_size(group_to_use)
     # Make sure the allreduce will not conflict with any other ongoing process group.

From ab5cf5a1eb17516dddf5162dc7ab3c670c997376 Mon Sep 17 00:00:00 2001
From: driazati <driazati@users.noreply.github.com>
Date: Wed, 25 Aug 2021 12:58:24 -0700
Subject: [PATCH 225/530] Move existing target determinator to tools (#63809)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63809

This moves out the modulefinder determinator to `tools/testing` since it is supposed to be CI-only. This also simplifies run_test.py a little bit.

Test Plan: Imported from OSS

Reviewed By: malfet, seemethere, janeyx99

Differential Revision: D30497438

Pulled By: driazati

fbshipit-source-id: 1d203037af5af6a20c1e7812da935e7cbb5cd82f
---
 test/run_test.py                           | 232 ++-------------------
 test/test_determination.py                 |   2 +-
 tools/testing/modulefinder_determinator.py | 224 ++++++++++++++++++++
 3 files changed, 241 insertions(+), 217 deletions(-)
 create mode 100644 tools/testing/modulefinder_determinator.py

diff --git a/test/run_test.py b/test/run_test.py
index ecc93fe03aa30..d3c661093a6e8 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -4,8 +4,8 @@
 import copy
 from datetime import datetime
 from distutils.util import strtobool
-import modulefinder
 import os
+import pathlib
 import shutil
 import signal
 import subprocess
@@ -24,9 +24,11 @@
 import torch.distributed as dist
 from typing import Dict, Optional, List
 
+REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent
+
 try:
     # using tools/ to optimize test run.
-    sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
+    sys.path.append(str(REPO_ROOT))
     from tools.testing.test_selections import (
         export_S3_test_times,
         get_shard_based_on_S3,
@@ -35,6 +37,10 @@
         get_reordered_tests,
         get_test_case_configs,
     )
+    from tools.testing.modulefinder_determinator import (
+        should_run_test,
+        TARGET_DET_LIST,
+    )
 
     HAVE_TEST_SELECTION_TOOLS = True
 except ImportError:
@@ -276,87 +282,12 @@
 WINDOWS_COVERAGE_BLOCKLIST = []
 
 
-# These tests are slow enough that it's worth calculating whether the patch
-# touched any related files first. This list was manually generated, but for every
-# run with --determine-from, we use another generated list based on this one and the
-# previous test stats.
-TARGET_DET_LIST = [
-    "distributions/test_distributions",
-    "test_nn",
-    "test_autograd",
-    "test_cpp_extensions_jit",
-    "test_jit_legacy",
-    "test_dataloader",
-    "test_overrides",
-    "test_linalg",
-    "test_jit",
-    "test_jit_profiling",
-    "test_torch",
-    "test_binary_ufuncs",
-    "test_numpy_interop",
-    "test_reductions",
-    "test_shape_ops",
-    "test_sort_and_select",
-    "test_testing",
-    "test_view_ops",
-    "distributed/nn/jit/test_instantiator",
-    "distributed/rpc/test_tensorpipe_agent",
-    "distributed/rpc/cuda/test_tensorpipe_agent",
-    "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks",
-    "distributed/test_distributed_spawn",
-    "test_cuda",
-    "test_cuda_primary_ctx",
-    "test_cpp_extensions_aot_ninja",
-    "test_cpp_extensions_aot_no_ninja",
-    "test_serialization",
-    "test_optim",
-    "test_utils",
-    "test_multiprocessing",
-    "test_tensorboard",
-    "distributed/test_c10d_common",
-    "distributed/test_c10d_gloo",
-    "distributed/test_c10d_nccl",
-    "distributed/test_jit_c10d",
-    "distributed/test_c10d_spawn_gloo",
-    "distributed/test_c10d_spawn_nccl",
-    "distributed/test_store",
-    "distributed/test_pg_wrapper",
-    "test_quantization",
-    "test_pruning_op",
-    "test_determination",
-    "test_futures",
-    "distributed/pipeline/sync/skip/test_api",
-    "distributed/pipeline/sync/skip/test_gpipe",
-    "distributed/pipeline/sync/skip/test_inspect_skip_layout",
-    "distributed/pipeline/sync/skip/test_leak",
-    "distributed/pipeline/sync/skip/test_portal",
-    "distributed/pipeline/sync/skip/test_stash_pop",
-    "distributed/pipeline/sync/skip/test_tracker",
-    "distributed/pipeline/sync/skip/test_verify_skippables",
-    "distributed/pipeline/sync/test_balance",
-    "distributed/pipeline/sync/test_bugs",
-    "distributed/pipeline/sync/test_checkpoint",
-    "distributed/pipeline/sync/test_copy",
-    "distributed/pipeline/sync/test_deferred_batch_norm",
-    "distributed/pipeline/sync/test_dependency",
-    "distributed/pipeline/sync/test_inplace",
-    "distributed/pipeline/sync/test_microbatch",
-    "distributed/pipeline/sync/test_phony",
-    "distributed/pipeline/sync/test_pipe",
-    "distributed/pipeline/sync/test_pipeline",
-    "distributed/pipeline/sync/test_stream",
-    "distributed/pipeline/sync/test_transparency",
-    "distributed/pipeline/sync/test_worker",
-]
-
 # the JSON file to store the S3 test stats
 TEST_TIMES_FILE = ".pytorch-test-times.json"
 
 # if a test file takes longer than 5 min, we add it to TARGET_DET_LIST
 SLOW_TEST_THRESHOLD = 300
 
-_DEP_MODULES_CACHE: Dict[str, set] = {}
-
 DISTRIBUTED_TESTS_CONFIG = {}
 
 
@@ -957,136 +888,6 @@ def get_selected_tests(options):
     return selected_tests
 
 
-def test_impact_of_file(filename):
-    """Determine what class of impact this file has on test runs.
-
-    Possible values:
-        TORCH - torch python code
-        CAFFE2 - caffe2 python code
-        TEST - torch test code
-        UNKNOWN - may affect all tests
-        NONE - known to have no effect on test outcome
-        CI - CI configuration files
-    """
-    parts = filename.split(os.sep)
-    if parts[0] in [".jenkins", ".circleci"]:
-        return "CI"
-    if parts[0] in ["docs", "scripts", "CODEOWNERS", "README.md"]:
-        return "NONE"
-    elif parts[0] == "torch":
-        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
-            return "TORCH"
-    elif parts[0] == "caffe2":
-        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
-            return "CAFFE2"
-    elif parts[0] == "test":
-        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
-            return "TEST"
-
-    return "UNKNOWN"
-
-
-def log_test_reason(file_type, filename, test, options):
-    if options.verbose:
-        print_to_stderr(
-            "Determination found {} file {} -- running {}".format(
-                file_type,
-                filename,
-                test,
-            )
-        )
-
-
-def get_dep_modules(test):
-    # Cache results in case of repetition
-    if test in _DEP_MODULES_CACHE:
-        return _DEP_MODULES_CACHE[test]
-
-    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    test_location = os.path.join(repo_root, "test", test + ".py")
-    finder = modulefinder.ModuleFinder(
-        # Ideally exclude all third party modules, to speed up calculation.
-        excludes=[
-            "scipy",
-            "numpy",
-            "numba",
-            "multiprocessing",
-            "sklearn",
-            "setuptools",
-            "hypothesis",
-            "llvmlite",
-            "joblib",
-            "email",
-            "importlib",
-            "unittest",
-            "urllib",
-            "json",
-            "collections",
-            # Modules below are excluded because they are hitting https://bugs.python.org/issue40350
-            # Trigger AttributeError: 'NoneType' object has no attribute 'is_package'
-            "mpl_toolkits",
-            "google",
-            "onnx",
-            # Triggers RecursionError
-            "mypy",
-        ],
-    )
-    # HACK: some platforms default to ascii, so we can't just run_script :(
-    with open(test_location, "r", encoding="utf-8") as fp:
-        finder.load_module("__main__", fp, test_location, ("", "r", 1))
-
-    dep_modules = set(finder.modules.keys())
-    _DEP_MODULES_CACHE[test] = dep_modules
-    return dep_modules
-
-
-def determine_target(target_det_list, test, touched_files, options):
-    test = parse_test_module(test)
-    # Some tests are faster to execute than to determine.
-    if test not in target_det_list:
-        if options.verbose:
-            print_to_stderr(f"Running {test} without determination")
-        return True
-    # HACK: "no_ninja" is not a real module
-    if test.endswith("_no_ninja"):
-        test = test[: (-1 * len("_no_ninja"))]
-    if test.endswith("_ninja"):
-        test = test[: (-1 * len("_ninja"))]
-
-    dep_modules = get_dep_modules(test)
-
-    for touched_file in touched_files:
-        file_type = test_impact_of_file(touched_file)
-        if file_type == "NONE":
-            continue
-        elif file_type == "CI":
-            # Force all tests to run if any change is made to the CI
-            # configurations.
-            log_test_reason(file_type, touched_file, test, options)
-            return True
-        elif file_type == "UNKNOWN":
-            # Assume uncategorized source files can affect every test.
-            log_test_reason(file_type, touched_file, test, options)
-            return True
-        elif file_type in ["TORCH", "CAFFE2", "TEST"]:
-            parts = os.path.splitext(touched_file)[0].split(os.sep)
-            touched_module = ".".join(parts)
-            # test/ path does not have a "test." namespace
-            if touched_module.startswith("test."):
-                touched_module = touched_module.split("test.")[1]
-            if touched_module in dep_modules or touched_module == test.replace(
-                "/", "."
-            ):
-                log_test_reason(file_type, touched_file, test, options)
-                return True
-
-    # If nothing has determined the test has run, don't run the test.
-    if options.verbose:
-        print_to_stderr(f"Determination is skipping {test}")
-
-    return False
-
-
 def run_test_module(test: str, test_directory: str, options) -> Optional[str]:
     test_module = parse_test_module(test)
 
@@ -1131,7 +932,7 @@ def main():
             specified_test_cases_filename, TESTS
         )
 
-    test_directory = os.path.dirname(os.path.abspath(__file__))
+    test_directory = str(REPO_ROOT / "test")
     selected_tests = get_selected_tests(options)
 
     if options.verbose:
@@ -1144,10 +945,10 @@ def main():
         slow_tests = get_slow_tests_based_on_S3(
             TESTS, TARGET_DET_LIST, SLOW_TEST_THRESHOLD
         )
-        print(
+        print_to_stderr(
             "Added the following tests to target_det tests as calculated based on S3:"
         )
-        print(slow_tests)
+        print_to_stderr(slow_tests)
         with open(options.determine_from, "r") as fh:
             touched_files = [
                 os.path.normpath(name.strip())
@@ -1155,22 +956,22 @@ def main():
                 if len(name.strip()) > 0
             ]
         # HACK: Ensure the 'test' paths can be traversed by Modulefinder
-        sys.path.append("test")
+        sys.path.append(test_directory)
         selected_tests = [
             test
             for test in selected_tests
-            if determine_target(
+            if should_run_test(
                 TARGET_DET_LIST + slow_tests, test, touched_files, options
             )
         ]
-        sys.path.remove("test")
+        sys.path.remove(test_directory)
 
     if IS_IN_CI:
         selected_tests = get_reordered_tests(
             selected_tests, ENABLE_PR_HISTORY_REORDERING
         )
         # downloading test cases configuration to local environment
-        get_test_case_configs(dirpath=os.path.dirname(os.path.abspath(__file__)))
+        get_test_case_configs(dirpath=test_directory)
 
     has_failed = False
     failure_messages = []
@@ -1191,8 +992,7 @@ def main():
         if options.coverage:
             from coverage import Coverage
 
-            test_dir = os.path.dirname(os.path.abspath(__file__))
-            with set_cwd(test_dir):
+            with set_cwd(test_directory):
                 cov = Coverage()
                 if PYTORCH_COLLECT_COVERAGE:
                     cov.load()
diff --git a/test/test_determination.py b/test/test_determination.py
index 6b7fcc0f0d242..277bbd2bc166c 100644
--- a/test/test_determination.py
+++ b/test/test_determination.py
@@ -30,7 +30,7 @@ def determined_tests(cls, changed_files):
         return [
             test
             for test in cls.TESTS
-            if run_test.determine_target(run_test.TARGET_DET_LIST, test, changed_files, DummyOptions())
+            if run_test.should_run_test(run_test.TARGET_DET_LIST, test, changed_files, DummyOptions())
         ]
 
     def test_config_change_only(self):
diff --git a/tools/testing/modulefinder_determinator.py b/tools/testing/modulefinder_determinator.py
new file mode 100644
index 0000000000000..8acd0ed9cc2f0
--- /dev/null
+++ b/tools/testing/modulefinder_determinator.py
@@ -0,0 +1,224 @@
+import os
+import modulefinder
+import sys
+import pathlib
+import warnings
+from typing import Dict, Any, List, Set
+
+REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent
+
+# These tests are slow enough that it's worth calculating whether the patch
+# touched any related files first. This list was manually generated, but for every
+# run with --determine-from, we use another generated list based on this one and the
+# previous test stats.
+TARGET_DET_LIST = [
+    "distributions/test_distributions",
+    "test_nn",
+    "test_autograd",
+    "test_cpp_extensions_jit",
+    "test_jit_legacy",
+    "test_dataloader",
+    "test_overrides",
+    "test_linalg",
+    "test_jit",
+    "test_jit_profiling",
+    "test_torch",
+    "test_binary_ufuncs",
+    "test_numpy_interop",
+    "test_reductions",
+    "test_shape_ops",
+    "test_sort_and_select",
+    "test_testing",
+    "test_view_ops",
+    "distributed/nn/jit/test_instantiator",
+    "distributed/rpc/test_tensorpipe_agent",
+    "distributed/rpc/cuda/test_tensorpipe_agent",
+    "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks",
+    "distributed/test_distributed_spawn",
+    "test_cuda",
+    "test_cuda_primary_ctx",
+    "test_cpp_extensions_aot_ninja",
+    "test_cpp_extensions_aot_no_ninja",
+    "test_serialization",
+    "test_optim",
+    "test_utils",
+    "test_multiprocessing",
+    "test_tensorboard",
+    "distributed/test_c10d_common",
+    "distributed/test_c10d_gloo",
+    "distributed/test_c10d_nccl",
+    "distributed/test_jit_c10d",
+    "distributed/test_c10d_spawn_gloo",
+    "distributed/test_c10d_spawn_nccl",
+    "distributed/test_store",
+    "distributed/test_pg_wrapper",
+    "test_quantization",
+    "test_pruning_op",
+    "test_determination",
+    "test_futures",
+    "distributed/pipeline/sync/skip/test_api",
+    "distributed/pipeline/sync/skip/test_gpipe",
+    "distributed/pipeline/sync/skip/test_inspect_skip_layout",
+    "distributed/pipeline/sync/skip/test_leak",
+    "distributed/pipeline/sync/skip/test_portal",
+    "distributed/pipeline/sync/skip/test_stash_pop",
+    "distributed/pipeline/sync/skip/test_tracker",
+    "distributed/pipeline/sync/skip/test_verify_skippables",
+    "distributed/pipeline/sync/test_balance",
+    "distributed/pipeline/sync/test_bugs",
+    "distributed/pipeline/sync/test_checkpoint",
+    "distributed/pipeline/sync/test_copy",
+    "distributed/pipeline/sync/test_deferred_batch_norm",
+    "distributed/pipeline/sync/test_dependency",
+    "distributed/pipeline/sync/test_inplace",
+    "distributed/pipeline/sync/test_microbatch",
+    "distributed/pipeline/sync/test_phony",
+    "distributed/pipeline/sync/test_pipe",
+    "distributed/pipeline/sync/test_pipeline",
+    "distributed/pipeline/sync/test_stream",
+    "distributed/pipeline/sync/test_transparency",
+    "distributed/pipeline/sync/test_worker",
+]
+
+_DEP_MODULES_CACHE: Dict[str, Set[str]] = {}
+
+
+def should_run_test(
+    target_det_list: List[str], test: str, touched_files: List[str], options: Any
+) -> bool:
+    test = parse_test_module(test)
+    # Some tests are faster to execute than to determine.
+    if test not in target_det_list:
+        if options.verbose:
+            print_to_stderr(f"Running {test} without determination")
+        return True
+    # HACK: "no_ninja" is not a real module
+    if test.endswith("_no_ninja"):
+        test = test[: (-1 * len("_no_ninja"))]
+    if test.endswith("_ninja"):
+        test = test[: (-1 * len("_ninja"))]
+
+    dep_modules = get_dep_modules(test)
+
+    for touched_file in touched_files:
+        file_type = test_impact_of_file(touched_file)
+        if file_type == "NONE":
+            continue
+        elif file_type == "CI":
+            # Force all tests to run if any change is made to the CI
+            # configurations.
+            log_test_reason(file_type, touched_file, test, options)
+            return True
+        elif file_type == "UNKNOWN":
+            # Assume uncategorized source files can affect every test.
+            log_test_reason(file_type, touched_file, test, options)
+            return True
+        elif file_type in ["TORCH", "CAFFE2", "TEST"]:
+            parts = os.path.splitext(touched_file)[0].split(os.sep)
+            touched_module = ".".join(parts)
+            # test/ path does not have a "test." namespace
+            if touched_module.startswith("test."):
+                touched_module = touched_module.split("test.")[1]
+            if touched_module in dep_modules or touched_module == test.replace(
+                "/", "."
+            ):
+                log_test_reason(file_type, touched_file, test, options)
+                return True
+
+    # If nothing has determined the test has run, don't run the test.
+    if options.verbose:
+        print_to_stderr(f"Determination is skipping {test}")
+
+    return False
+
+
+def test_impact_of_file(filename: str) -> str:
+    """Determine what class of impact this file has on test runs.
+
+    Possible values:
+        TORCH - torch python code
+        CAFFE2 - caffe2 python code
+        TEST - torch test code
+        UNKNOWN - may affect all tests
+        NONE - known to have no effect on test outcome
+        CI - CI configuration files
+    """
+    parts = filename.split(os.sep)
+    if parts[0] in [".jenkins", ".circleci"]:
+        return "CI"
+    if parts[0] in ["docs", "scripts", "CODEOWNERS", "README.md"]:
+        return "NONE"
+    elif parts[0] == "torch":
+        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
+            return "TORCH"
+    elif parts[0] == "caffe2":
+        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
+            return "CAFFE2"
+    elif parts[0] == "test":
+        if parts[-1].endswith(".py") or parts[-1].endswith(".pyi"):
+            return "TEST"
+
+    return "UNKNOWN"
+
+
+def log_test_reason(file_type: str, filename: str, test: str, options: Any) -> None:
+    if options.verbose:
+        print_to_stderr(
+            "Determination found {} file {} -- running {}".format(
+                file_type,
+                filename,
+                test,
+            )
+        )
+
+
+def get_dep_modules(test: str) -> Set[str]:
+    # Cache results in case of repetition
+    if test in _DEP_MODULES_CACHE:
+        return _DEP_MODULES_CACHE[test]
+
+    test_location = REPO_ROOT / "test" / f"{test}.py"
+
+    # HACK: some platforms default to ascii, so we can't just run_script :(
+    finder = modulefinder.ModuleFinder(
+        # Ideally exclude all third party modules, to speed up calculation.
+        excludes=[
+            "scipy",
+            "numpy",
+            "numba",
+            "multiprocessing",
+            "sklearn",
+            "setuptools",
+            "hypothesis",
+            "llvmlite",
+            "joblib",
+            "email",
+            "importlib",
+            "unittest",
+            "urllib",
+            "json",
+            "collections",
+            # Modules below are excluded because they are hitting https://bugs.python.org/issue40350
+            # Trigger AttributeError: 'NoneType' object has no attribute 'is_package'
+            "mpl_toolkits",
+            "google",
+            "onnx",
+            # Triggers RecursionError
+            "mypy",
+        ],
+    )
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        finder.run_script(str(test_location))
+    dep_modules = set(finder.modules.keys())
+    _DEP_MODULES_CACHE[test] = dep_modules
+    return dep_modules
+
+
+def parse_test_module(test: str) -> str:
+    return test.split(".")[0]
+
+
+def print_to_stderr(message: str) -> None:
+    print(message, file=sys.stderr)

From 5b548f6f64ebd7b2187cf7e79043eb7d2e92f2cf Mon Sep 17 00:00:00 2001
From: Priya Ramani <priyaramani@fb.com>
Date: Wed, 25 Aug 2021 13:08:12 -0700
Subject: [PATCH 226/530] Shape Propagation Pass: Fix AdaptiveAveragePooling2d
 (#63629)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63629

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30461727

Pulled By: priyaramani

fbshipit-source-id: 3873d1d636f79185680b82de06174d8de288c941
---
 test/jit/test_symbolic_shape_analysis.py      | 32 +++++++++++++------
 .../jit/runtime/symbolic_shape_registry.cpp   | 13 ++++++--
 2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/test/jit/test_symbolic_shape_analysis.py b/test/jit/test_symbolic_shape_analysis.py
index 33dc515c51145..6d4e33cda852f 100644
--- a/test/jit/test_symbolic_shape_analysis.py
+++ b/test/jit/test_symbolic_shape_analysis.py
@@ -3,7 +3,6 @@
 import operator
 
 from torch.testing import FileCheck
-from typing import List
 
 
 if __name__ == '__main__':
@@ -60,15 +59,6 @@ def prop_shapes_on_graph(inp0, inp1):
         self.assertEqual(output_shape[1], sym2)
         self.assertEqual(output_shape[2], sym3)
 
-    def test_sharing_of_list_len(self):
-        @torch.jit.script
-        def foo(x, out: List[int]):
-            return torch.nn.functional.adaptive_avg_pool2d(x, out)
-
-        self.run_pass("inline", foo.graph)
-        torch._C._jit_pass_propagate_shapes_on_graph(foo.graph)
-        FileCheck().check("Tensor(*, *)").check_same("adaptive_avg_pool2d").run(foo.graph)
-
     def test_shared_shape_graph(self):
         @torch.jit.script
         def foo(x, y):
@@ -165,3 +155,25 @@ def foo2(x, y):
             inputs[1].setType(inputs[1].type().with_sizes([5, 8, sym1]))
             torch._C._jit_pass_propagate_shapes_on_graph(graph)
             self.assertEqual(next(graph.outputs()).type().symbolic_sizes(), [5, 8, sym1])
+
+    def test_adaptive_avg_pool2d(self):
+        inps = [
+            [(1, 64, 8, 9), (5, 7)],
+            [(1, 64, 10, 9), (7)],
+            [(1, 64, 10, 9), (5, None)],
+            [(1, 8, 4, 3), (None, None)],
+            [(1, 8, 4, 3), (None, 5)],
+        ]
+
+        for inp in inps:
+            t = torch.randn(*inp[0])
+            out_size = torch.nn.functional.adaptive_avg_pool2d(t, inp[1]).size()
+
+            def foo(x):
+                return torch.nn.functional.adaptive_avg_pool2d(x, inp[1])
+
+            fn = torch.jit.trace(foo, (t,))
+            torch._C._jit_erase_non_input_shape_information(fn.graph)
+            torch._C._jit_pass_peephole(fn.graph)
+            torch._C._jit_pass_constant_propagation(fn.graph)
+            self.checkShapeAnalysis(out_size, fn.graph, assert_propagation=True)
diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp
index ffc2f44e16dac..d4471998d11e8 100644
--- a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp
+++ b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp
@@ -36,10 +36,17 @@ const std::string shape_compute_functions =
           return expandedSizes
 
         def adaptive_avg_pool2d(self: List[int], out: List[int]):
-          # TODO: return out directly, list len refiner would need to
-          # annotate the List Type with len directly in IR
           assert len(out) == 2
-          return [out[0], out[1]]
+          assert len(self) == 3 or len(self) == 4
+          for i in range (1, len(self)):
+            assert self[i] != 0
+
+          shape: List[int] = []
+          for i in range(0, len(self) -2):
+            shape.append(self[i])
+          for elem in out:
+            shape.append(elem)
+          return shape
 
         # TODO: maybe make it customary that extra arguments are unused ?
         # TODO: return self directly

From 52ebe7e14efc3e1ebb5cf974245a4d37f4441e9d Mon Sep 17 00:00:00 2001
From: "Rong Rong (AI Infra)" <rongr@fb.com>
Date: Wed, 25 Aug 2021 14:34:40 -0700
Subject: [PATCH 227/530] Back out "Temporary fix for remote gpu execution
 issue" (#63983)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63983

Test for fixes in D30545351. it should resolve the remote execution flag being populated incorrectly issue.

Test Plan: CI

Reviewed By: malfet, seemethere

Differential Revision: D30549443

fbshipit-source-id: b3895909f5cd654ba163b77950872b332fbad3fe
---
 torch/testing/_internal/common_device_type.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index b5d61273afd3e..8ec6e71d121ff 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -12,7 +12,7 @@
 import torch
 from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \
     skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \
-    IS_SANDCASTLE, IS_FBCODE, DeterministicGuard, TEST_SKIP_NOARCH
+    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, DeterministicGuard, TEST_SKIP_NOARCH
 from torch.testing._internal.common_cuda import _get_torch_cuda_version
 from torch.testing import \
     (get_all_dtypes)
@@ -469,9 +469,13 @@ def get_device_type_test_bases():
     test_bases: List[Any] = list()
 
     if IS_SANDCASTLE or IS_FBCODE:
-        # temporarily disable IS_REMOTE_GPU, see T99020845
-        test_bases.append(CPUTestBase)
-        test_bases.append(MetaTestBase)
+        if IS_REMOTE_GPU:
+            # Skip if sanitizer is enabled
+            if not TEST_WITH_ASAN and not TEST_WITH_TSAN and not TEST_WITH_UBSAN:
+                test_bases.append(CUDATestBase)
+        else:
+            test_bases.append(CPUTestBase)
+            test_bases.append(MetaTestBase)
     else:
         test_bases.append(CPUTestBase)
         if not TEST_SKIP_NOARCH:

From b5b9ce146f27624876d64034305c3c033bdfeaf5 Mon Sep 17 00:00:00 2001
From: John Clow <jclow@fb.com>
Date: Wed, 25 Aug 2021 14:49:06 -0700
Subject: [PATCH 228/530] Small fixes to the Contributing.txt (#63385)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63385

Correcting a mistake for the pytorch uninstall, and
adding an extra note for Darwin.

Test Plan: Imported from OSS

Reviewed By: janeyx99, heitorschueroff

Differential Revision: D30530234

fbshipit-source-id: e0f88a1725eeadabfb4b28c1da11e369ee878ab4
---
 CONTRIBUTING.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7cf3aecabd7c1..93de9b022ee6f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -87,7 +87,7 @@ lazy.)
 
 
 ```bash
-conda -y uninstall pytorch
+conda uninstall pytorch -y
 yes | pip uninstall torch
 ```
 
@@ -781,6 +781,8 @@ If you are editing a single file and rebuilding in a tight loop, the time spent
 linking will dominate. The system linker available in most Linux distributions
 (GNU `ld`) is quite slow. Use a faster linker, like [lld](https://lld.llvm.org/).
 
+People on Mac, follow [this guide](https://stackoverflow.com/questions/42730345/how-to-install-llvm-for-mac) instead.
+
 The easiest way to use `lld` this is download the
 [latest LLVM binaries](http://releases.llvm.org/download.html#8.0.0) and run:
 ```

From 730ce29bafcdf21cc7000dccbbed4c7af500cf27 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 25 Aug 2021 15:00:47 -0700
Subject: [PATCH 229/530] Add note on ifdefing based on CUDA_VERSION for ROCm
 path (#62850)

Summary:
CUDA_VERSION and HIP_VERSION follow very unrelated versioning schemes, so it does not make sense to use CUDA_VERSION to determine the ROCm path. This note explicitly addresses it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62850

Reviewed By: mruberry

Differential Revision: D30547562

Pulled By: malfet

fbshipit-source-id: 02990fa66a88466c2330ab85f446b25b78545150
---
 docs/source/notes/hip.rst | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/docs/source/notes/hip.rst b/docs/source/notes/hip.rst
index 20f99cb96c5b0..a9c94e2a4febb 100644
--- a/docs/source/notes/hip.rst
+++ b/docs/source/notes/hip.rst
@@ -119,6 +119,27 @@ torch.distributed backends
 
 Currently, only the "nccl" and "gloo" backends for torch.distributed are supported on ROCm.
 
+.. _cuda-api-to_hip-api-mappings:
+
+CUDA API to HIP API mappings in C++
+-----------------------------------
+
+Please refer: https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP_API_Guide.html
+
+NOTE: The CUDA_VERSION macro, cudaRuntimeGetVersion and cudaDriverGetVersion APIs do not
+semantically map to the same values as HIP_VERSION macro, hipRuntimeGetVersion and
+hipDriverGetVersion APIs. Please do not use them interchangeably when doing version checks.
+
+Eg: Instead of
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+If it is desired to not take the code path for ROCm/HIP:
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(USE_ROCM)
+If it is desired to take the code path for ROCm/HIP:
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11000) || defined(USE_ROCM)
+If it is desired to take the code path for ROCm/HIP only for specific HIP versions:
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11000) || (defined(USE_ROCM) && ROCM_VERSION >= 40300)
+
+
 Refer to CUDA Semantics doc
 ---------------------------
 

From 44ede71751440975e985944503d548e101a42a64 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Wed, 25 Aug 2021 15:05:14 -0700
Subject: [PATCH 230/530] Shard python_torch_functions.cpp (#62187)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62187

This file can take 3 minutes on its own to compile, and after
python_functions.cpp is the second limiting factor for compile time of
`libtorch_python` on a 32-core threadripper. This splits it into 3 files that
take around 1 minute each to compile.

Test Plan: Imported from OSS

Reviewed By: H-Huang

Differential Revision: D29962048

Pulled By: albanD

fbshipit-source-id: 99016d75912bff483fe21b130cef43a6882f8c0e
---
 BUILD.bazel                                   |   4 +-
 caffe2/CMakeLists.txt                         |   4 +-
 tools/autograd/gen_python_functions.py        |  60 +-
 .../templates/python_torch_functions.cpp      | 760 +---------------
 tools/build_variables.bzl                     |   9 +-
 tools/codegen/gen.py                          |   4 +-
 torch/csrc/autograd/python_torch_functions.h  |  25 +
 .../python_torch_functions_manual.cpp         | 826 ++++++++++++++++++
 8 files changed, 930 insertions(+), 762 deletions(-)
 create mode 100644 torch/csrc/autograd/python_torch_functions.h
 create mode 100644 torch/csrc/autograd/python_torch_functions_manual.cpp

diff --git a/BUILD.bazel b/BUILD.bazel
index afdd4699b160f..a5f20c2020181 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -224,7 +224,9 @@ libtorch_python_generated_sources = [
         "torch/csrc/autograd/generated/python_functions_3.cpp",
         "torch/csrc/autograd/generated/python_functions_4.cpp",
         "torch/csrc/autograd/generated/python_variable_methods.cpp",
-        "torch/csrc/autograd/generated/python_torch_functions.cpp",
+        "torch/csrc/autograd/generated/python_torch_functions_0.cpp",
+        "torch/csrc/autograd/generated/python_torch_functions_1.cpp",
+        "torch/csrc/autograd/generated/python_torch_functions_2.cpp",
         "torch/csrc/autograd/generated/python_nn_functions.cpp",
         "torch/csrc/autograd/generated/python_fft_functions.cpp",
         "torch/csrc/autograd/generated/python_linalg_functions.cpp",
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 67ab08f9b0fc5..1662a92268d37 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -397,7 +397,9 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_3.cpp"
     "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_4.cpp"
     "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods.cpp"
-    "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions.cpp"
+    "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_0.cpp"
+    "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_1.cpp"
+    "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_2.cpp"
     "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.cpp"
     "${TORCH_SRC_DIR}/csrc/autograd/generated/python_fft_functions.cpp"
     "${TORCH_SRC_DIR}/csrc/autograd/generated/python_linalg_functions.cpp"
diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py
index d1fb70c8abed3..f61d3d0c0709c 100644
--- a/tools/autograd/gen_python_functions.py
+++ b/tools/autograd/gen_python_functions.py
@@ -164,9 +164,12 @@ def gen(out: str, native_yaml_path: str, deprecated_yaml_path: str, template_pat
     create_python_bindings(
         fm, methods, is_py_variable_method, None, 'python_variable_methods.cpp', method=True)
 
+    # NOTE: num_shards here must be synced with gatherTorchFunctions in
+    #       torch/csrc/autograd/python_torch_functions_manual.cpp
     functions = load_signatures(native_functions, deprecated_yaml_path, method=False)
-    create_python_bindings(
-        fm, functions, is_py_torch_function, 'torch', 'python_torch_functions.cpp', method=False)
+    create_python_bindings_sharded(
+        fm, functions, is_py_torch_function, 'torch', 'python_torch_functions.cpp',
+        method=False, num_shards=3)
 
     create_python_bindings(
         fm, functions, is_py_nn_function, 'torch.nn', 'python_nn_functions.cpp', method=False)
@@ -180,6 +183,16 @@ def gen(out: str, native_yaml_path: str, deprecated_yaml_path: str, template_pat
     create_python_bindings(
         fm, functions, is_py_special_function, 'torch.special', 'python_special_functions.cpp', method=False)
 
+def group_filter_overloads(
+    pairs: Sequence[PythonSignatureNativeFunctionPair],
+    pred: Callable[[NativeFunction], bool]
+) -> Dict[BaseOperatorName, List[PythonSignatureNativeFunctionPair]]:
+    grouped: Dict[BaseOperatorName, List[PythonSignatureNativeFunctionPair]] = defaultdict(list)
+    for pair in pairs:
+        if pred(pair.function):
+            grouped[pair.function.func.name.name].append(pair)
+    return grouped
+
 def create_python_bindings(
     fm: FileManager,
     pairs: Sequence[PythonSignatureNativeFunctionPair],
@@ -194,10 +207,7 @@ def create_python_bindings(
     py_method_defs: List[str] = []
     py_forwards: List[str] = []
 
-    grouped: Dict[BaseOperatorName, List[PythonSignatureNativeFunctionPair]] = defaultdict(list)
-    for pair in pairs:
-        if pred(pair.function):
-            grouped[pair.function.func.name.name].append(pair)
+    grouped = group_filter_overloads(pairs, pred)
 
     for name in sorted(grouped.keys(), key=lambda x: str(x)):
         overloads = grouped[name]
@@ -212,6 +222,44 @@ def create_python_bindings(
         'py_method_defs': py_method_defs,
     })
 
+def create_python_bindings_sharded(
+    fm: FileManager,
+    pairs: Sequence[PythonSignatureNativeFunctionPair],
+    pred: Callable[[NativeFunction], bool],
+    module: Optional[str],
+    filename: str,
+    *,
+    method: bool,
+    num_shards: int
+) -> None:
+    """Generates Python bindings to ATen functions"""
+    grouped = group_filter_overloads(pairs, pred)
+
+    def key_func(kv: Tuple[BaseOperatorName, List[PythonSignatureNativeFunctionPair]]) -> str:
+        return str(kv[0])
+
+    def env_func(
+        kv: Tuple[BaseOperatorName, List[PythonSignatureNativeFunctionPair]]
+    ) -> Dict[str, List[str]]:
+        return {
+            'py_forwards': list(forward_decls(kv[0], kv[1], method=method)),
+            'py_methods': [method_impl(kv[0], module, kv[1], method=method)],
+            'py_method_defs': [method_def(kv[0], module, kv[1], method=method)],
+        }
+
+    fm.write_sharded(
+        filename,
+        grouped.items(),
+        base_env={
+            'generated_comment':
+            '@' + f'generated from {fm.template_dir}/{filename}',
+        },
+        key_fn=key_func,
+        env_callable=env_func,
+        num_shards=num_shards,
+        sharded_keys={'py_forwards', 'py_methods', 'py_method_defs'}
+    )
+
 def load_signatures(
     native_functions: List[NativeFunction],
     deprecated_yaml_path: str,
diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp
index 9e02036639516..b45b5f298716b 100644
--- a/tools/autograd/templates/python_torch_functions.cpp
+++ b/tools/autograd/templates/python_torch_functions.cpp
@@ -7,7 +7,6 @@
 // and also copied into 'torch' module.
 
 #include <Python.h>
-#include <pybind11/pybind11.h>
 
 // Undefine the copysign macro so that at::copysign works as intended with MSVC
 // https://github.com/python/cpython/blob/c60394c7fc9cc09b16e9675a3eeb5844b6d8523f/PC/pyconfig.h#L196
@@ -15,6 +14,7 @@
 #undef copysign
 #endif // _MSC_VER
 
+#include "torch/csrc/autograd/python_torch_functions.h"
 #include "torch/csrc/autograd/python_variable.h"
 #include "torch/csrc/autograd/utils/wrap_outputs.h"
 #include "torch/csrc/Dtype.h"
@@ -34,7 +34,6 @@
 
 #include <ATen/ATen.h>
 
-#include <fmt/format.h>
 #include <functional>
 #include <initializer_list>
 #include <stdexcept>
@@ -59,767 +58,28 @@ using at::ArrayRef;
 using torch::utils::check_out_type_matches;
 using namespace torch::autograd::utils;
 
-namespace torch { namespace autograd {
-
-static PyObject* THPVariableFunctionsModule = NULL;
-
-inline Tensor dispatch_arange(const Scalar& end, Tensor result) {
-  pybind11::gil_scoped_release no_gil;
-  return at::arange_out(result, end);
-}
-
-inline Tensor dispatch_arange(const Scalar& end, const TensorOptions& options) {
-  torch::utils::maybe_initialize_cuda(options);
-  pybind11::gil_scoped_release no_gil;
-  return torch::arange(end, options);
-}
-
-inline Tensor dispatch_arange(const Scalar& start, const Scalar& end, const Scalar& step, Tensor result) {
-  pybind11::gil_scoped_release no_gil;
-  return at::arange_out(result, start, end, step);
-}
-
-inline Tensor dispatch_arange(const Scalar& start, const Scalar& end, const Scalar& step, const TensorOptions& options) {
-  torch::utils::maybe_initialize_cuda(options);
-  pybind11::gil_scoped_release no_gil;
-  return torch::arange(start, end, step, options);
-}
-
-static PyObject * THPVariable_arange(PyObject* self, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  static PythonArgParser parser({
-    "arange(Scalar end, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
-    "arange(Scalar start, Scalar end, Scalar step=1, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
-  }, /*traceable=*/true);
-
-  ParsedArgs<9> parsed_args;
-  auto r = parser.parse(args, kwargs, parsed_args);
-
-  if(r.has_torch_function()) {
-    return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch");
-  }
-
-  if (r.idx == 0) {
-    if (r.isNone(1)) {
-      auto end = r.scalar(0);
-      // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
-      c10::optional<ScalarType> scalarType = r.scalartypeOptional(2);
-      const auto options = TensorOptions()
-          .dtype(scalarType)
-          .device(r.device(4))
-          .layout(r.layout(3))
-          .requires_grad(r.toBool(6))
-          .pinned_memory(r.toBool(5));
-      return wrap(dispatch_arange(end, options));
-    } else {
-      TORCH_CHECK(!r.toBool(5), " `pin_memory` and `out` parameters are incompatible");
-      check_out_type_matches(r.tensor(1), r.scalartype(2), r.isNone(2), r.layout(3),
-                             r.device(4), r.isNone(4));
-      return wrap(dispatch_arange(r.scalar(0), r.tensor(1)).set_requires_grad(r.toBool(6)));
-    }
-  } else if (r.idx == 1) {
-    if (r.isNone(3)) {
-      auto start = r.scalar(0);
-      auto end = r.scalar(1);
-      auto step = r.scalar(2);
-      // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
-      c10::optional<ScalarType> scalarType = r.scalartypeOptional(4);
-      const auto options = TensorOptions()
-          .dtype(scalarType)
-          .device(r.device(6))
-          .layout(r.layout(5))
-          .requires_grad(r.toBool(8))
-          .pinned_memory(r.toBool(7));
-      return wrap(dispatch_arange(start, end, step, options));
-    } else {
-      TORCH_CHECK(!r.toBool(7), " `pin_memory` and `out` parameters are incompatible");
-      check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), r.layout(5),
-                               r.device(6), r.isNone(6));
-      return wrap(dispatch_arange(r.scalar(0), r.scalar(1), r.scalar(2), r.tensor(3)).set_requires_grad(r.toBool(8)));
-    }
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-inline Tensor dispatch_range(const Scalar& start, const Scalar& end, const Scalar& step, Tensor result) {
-  pybind11::gil_scoped_release no_gil;
-  OptionalDeviceGuard device_guard(device_of(result));
-  return at::range_out(result, start, end, step);
-}
-
-inline Tensor dispatch_range(const Scalar& start, const Scalar& end, const Scalar& step, const TensorOptions& options) {
-  torch::utils::maybe_initialize_cuda(options);
-  pybind11::gil_scoped_release no_gil;
-  DeviceGuard device_guard(options.device());
-  return torch::range(start, end, step, options);
-}
-
-static PyObject * THPVariable_range(PyObject* self, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  static PythonArgParser parser({
-    "range(Scalar start, Scalar end, Scalar step=1, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)",
-  });
-
-  ParsedArgs<8> parsed_args;
-  auto r = parser.parse(args, kwargs, parsed_args);
-
-  if (r.idx == 0) {
-    auto ret = PyErr_WarnEx(
-        PyExc_UserWarning,
-        "torch.range is deprecated and will be removed in a future release "
-        "because its behavior is inconsistent with Python's range builtin. "
-        "Instead, use torch.arange, which produces values in [start, end).",
-        1);
-    if (ret != 0) throw python_error();
-    if (r.isNone(3)) {
-      const auto options = TensorOptions()
-          .dtype(r.scalartype(4))
-          .device(r.device(6))
-          .layout(r.layout(5))
-          .requires_grad(r.toBool(7));
-      return wrap(dispatch_range(r.scalar(0), r.scalar(1), r.scalar(2), options));
-    } else {
-      check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4),
-                             r.layout(5), r.device(6), r.isNone(6));
-      return wrap(dispatch_range(r.scalar(0), r.scalar(1), r.scalar(2), r.tensor(3)).set_requires_grad(r.toBool(7)));
-    }
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-inline Tensor dispatch_full(
-    IntArrayRef size,
-    const Scalar& fill_val,
-    const TensorOptions& options) {
-  torch::utils::maybe_initialize_cuda(options);
-  pybind11::gil_scoped_release no_gil;
-  return at::full(size, fill_val, options);
-}
-
-inline Tensor dispatch_full(
-    IntArrayRef size,
-    const Scalar& fill_val,
-    c10::optional<DimnameList> names,
-    const TensorOptions& options) {
-  torch::utils::maybe_initialize_cuda(options);
-  pybind11::gil_scoped_release no_gil;
-  return at::full(size, fill_val, names, options);
-}
-
-inline Tensor dispatch_full(
-    IntArrayRef size,
-    const Scalar& fill_val,
-    Tensor result) {
-  pybind11::gil_scoped_release no_gil;
-  return at::full_out(result, size, fill_val);
-}
-
-static PyObject * THPVariable_full(PyObject* self, PyObject* args, PyObject* kwargs) {
-  HANDLE_TH_ERRORS
-
-  static PythonArgParser parser({
-    "full(IntArrayRef size, Scalar fill_value, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
-    "full(IntArrayRef size, Scalar fill_value, *, DimnameList names=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
-  }, /*traceable=*/true);
-
-  // Acquires (common) arguments
-  ParsedArgs<8> parsed_args;
-  auto r = parser.parse(args, kwargs, parsed_args);
+// NOTE: See [Sharded File] comment in VariableType
 
-  if(r.has_torch_function()) {
-    return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch");
-  }
-
-  auto size = r.intlist(0);
-  auto fill_val = r.scalar(1);
-  const auto options = TensorOptions{}
-      .dtype(r.scalartypeOptional(3))
-      .layout(r.layout(4))
-      .device(r.device(5))
-      .pinned_memory(r.toBool(6));
-
-  if (r.idx == 0) {
-    // full
-    if (r.isNone(2)) {
-      return wrap(dispatch_full(size, fill_val, options).set_requires_grad(r.toBool(7)));
-    }
-
-    // full.out
-    // Validates out tensor and other kwargs
-    auto result = r.tensor(2);
-    TORCH_CHECK(!r.toBool(6), " `pin_memory` and `out` parameters are incompatible");
-    check_out_type_matches(result, r.scalartype(3), r.isNone(3), r.layout(4),
-                          r.device(5), r.isNone(5));
-
-    return wrap(dispatch_full(size, fill_val, result).set_requires_grad(r.toBool(7)));
-  } else if (r.idx == 1) {
-    // full.names
-    if (r.isNone(2)) {
-      return wrap(dispatch_full(size, fill_val, c10::nullopt, options).set_requires_grad(r.toBool(7)));
-    }
-
-    // Converts from c10::optional<std:vector...> to c10::optional<ArrayRef...>
-    auto raw_names = r.toDimnameListOptional(2);
-    c10::optional<DimnameList> names(*raw_names);
-    return wrap(dispatch_full(size, fill_val, names, options).set_requires_grad(r.toBool(7)));
-  }
-
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-inline Tensor dispatch_randint(int64_t high, IntArrayRef size, c10::optional<Generator> generator, Tensor result) {
-  pybind11::gil_scoped_release no_gil;
-  return at::randint_out(result, high, size, generator);
-}
-inline Tensor dispatch_randint(int64_t high, IntArrayRef size, c10::optional<Generator> generator, const TensorOptions & options) {
-  torch::utils::maybe_initialize_cuda(options);
-  pybind11::gil_scoped_release no_gil;
-  return torch::randint(high, size, generator, options);
-}
-inline Tensor dispatch_randint(int64_t high, IntArrayRef size, Tensor result) {
-  pybind11::gil_scoped_release no_gil;
-  return at::randint_out(result, high, size);
-}
-inline Tensor dispatch_randint(int64_t high, IntArrayRef size, const TensorOptions & options) {
-  torch::utils::maybe_initialize_cuda(options);
-  pybind11::gil_scoped_release no_gil;
-  return torch::randint(high, size, options);
-}
-inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, c10::optional<Generator> generator, Tensor result) {
-  pybind11::gil_scoped_release no_gil;
-  return at::randint_out(result, low, high, size, generator);
-}
-inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, c10::optional<Generator> generator, const TensorOptions & options) {
-  torch::utils::maybe_initialize_cuda(options);
-  pybind11::gil_scoped_release no_gil;
-  return torch::randint(low, high, size, generator, options);
-}
-inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, Tensor result) {
-  pybind11::gil_scoped_release no_gil;
-  return at::randint_out(result, low, high, size);
-}
-inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, const TensorOptions & options) {
-  torch::utils::maybe_initialize_cuda(options);
-  pybind11::gil_scoped_release no_gil;
-  return torch::randint(low, high, size, options);
-}
-
-static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  static PythonArgParser parser({
-    "randint(int64_t high, IntArrayRef size, *, Generator generator=None, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)",
-    "randint(int64_t low, int64_t high, IntArrayRef size, *, Generator generator=None, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)",
-  }, /*traceable=*/false);
-
-  ParsedArgs<9> parsed_args;
-  auto r = parser.parse(args, kwargs, parsed_args);
-
-  if(r.has_torch_function()) {
-    return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch");
-  }
-
-  if (r.idx == 0) {
-    if (r.isNone(3)) {
-      auto high = r.toInt64(0);
-      auto size = r.intlist(1);
-      auto generator = r.generator(2);
-      // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
-      auto dtype = r.scalartypeWithDefault(4, at::ScalarType::Long);
-      auto device = r.device(6);
-      const auto options = TensorOptions()
-          .dtype(dtype)
-          .device(device)
-          .layout(r.layout(5))
-          .requires_grad(r.toBool(7));
-      return wrap(dispatch_randint(high, size, generator, options));
-    } else {
-      check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4),
-                             r.layout(5), r.device(6), r.isNone(6));
-      return wrap(dispatch_randint(r.toInt64(0), r.intlist(1), r.generator(2), r.tensor(3)).set_requires_grad(r.toBool(7)));
-    }
-  } else if (r.idx == 1) {
-    if (r.isNone(4)) {
-      auto low = r.toInt64(0);
-      auto high = r.toInt64(1);
-      auto size = r.intlist(2);
-      auto generator = r.generator(3);
-      // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
-      auto dtype = r.scalartypeWithDefault(5, at::ScalarType::Long);
-      auto device = r.device(7);
-      const auto options = TensorOptions()
-          .dtype(dtype)
-          .device(device)
-          .layout(r.layout(6))
-          .requires_grad(r.toBool(8));
-      return wrap(dispatch_randint(low, high, size, generator, options));
-    } else {
-      check_out_type_matches(r.tensor(4), r.scalartype(5), r.isNone(5),
-                             r.layout(6), r.device(7), r.isNone(7));
-      return wrap(dispatch_randint(r.toInt64(0), r.toInt64(1), r.intlist(2), r.generator(3), r.tensor(4)).set_requires_grad(r.toBool(8)));
-    }
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-// implemented on python object to allow torch.as_tensor to be constructed with arbitrarily nested
-// python objects - list, tuple, np array, scalar, etc.
-static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  jit::tracer::warn("torch.as_tensor", jit::tracer::WARN_CONSTRUCTOR);
-  return THPVariable_Wrap(torch::utils::as_tensor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
-  END_HANDLE_TH_ERRORS
-}
-
-// implemented on python object here because PyObject currently not natively declarable
-// See: ATen/native/README.md for more context
-static PyObject * THPVariable_from_numpy(PyObject* module, PyObject* arg)
-{
-  HANDLE_TH_ERRORS
-  jit::tracer::warn("torch.from_numpy", jit::tracer::WARN_CONSTRUCTOR);
-  return THPVariable_Wrap(torch::utils::tensor_from_numpy(arg));
-  END_HANDLE_TH_ERRORS
-}
-
-static Tensor dispatch_nonzero(const Tensor & self) {
-  pybind11::gil_scoped_release no_gil;
-  OptionalDeviceGuard device_guard(device_of(self));
-  return self.nonzero();
-}
-
-static Tensor dispatch_nonzero(const Tensor & self, Tensor out) {
-  pybind11::gil_scoped_release no_gil;
-  OptionalDeviceGuard device_guard(device_of(self));
-  return at::nonzero_out(out, self);
-}
-
-static std::vector<Tensor> dispatch_nonzero_numpy(const Tensor & self) {
-  pybind11::gil_scoped_release no_gil;
-  OptionalDeviceGuard device_guard(device_of(self));
-  return self.nonzero_numpy();
-}
-
-static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* kwargs);
-
-static PyObject * THPVariable_sparse_csr_tensor(PyObject* self, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  jit::tracer::warn("torch.sparse_csr_tensor", jit::tracer::WARN_CONSTRUCTOR);
-  return THPVariable_Wrap(torch::utils::sparse_csr_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
-  END_HANDLE_TH_ERRORS
-}
-
-static PyObject * THPVariable__sparse_csr_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  jit::tracer::warn("torch._sparse_csr_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR);
-  return THPVariable_Wrap(torch::utils::_sparse_csr_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
-  END_HANDLE_TH_ERRORS
-}
-
-static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  jit::tracer::warn("torch.sparse_coo_tensor", jit::tracer::WARN_CONSTRUCTOR);
-  return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
-  END_HANDLE_TH_ERRORS
-}
-
-static PyObject * THPVariable__sparse_coo_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  jit::tracer::warn("torch._sparse_coo_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR);
-  return THPVariable_Wrap(torch::utils::_sparse_coo_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
-  END_HANDLE_TH_ERRORS
-}
-
-// implemented on python object to allow torch.tensor to be constructed with arbitrarily nested
-// python objects - list, tuple, np array, scalar, etc.
-static PyObject * THPVariable_tensor(PyObject* self, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  jit::tracer::warn("torch.tensor", jit::tracer::WARN_CONSTRUCTOR);
-  return THPVariable_Wrap(torch::utils::tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
-  END_HANDLE_TH_ERRORS
-}
-
-static PyObject * THPVariable_get_device(PyObject* self_, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  static PythonArgParser parser({
-    "get_device(Tensor input)",
-  }, /*traceable=*/false);
-
-  ParsedArgs<1> parsed_args;
-  auto r = parser.parse(args, kwargs, parsed_args);
-
-  if (r.idx == 0) {
-    return wrap(r.tensor(0).get_device());
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-static PyObject * THPVariable_frombuffer(PyObject* self_, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  static PythonArgParser parser({
-    "frombuffer(PyObject* buffer, *, ScalarType dtype, int64_t count=-1, int64_t offset=0, bool requires_grad=False)",
-  }, /*traceable=*/false);
-
-  PyObject* ret = nullptr;
-  ParsedArgs<5> parsed_args;
-  auto r = parser.parse(args, kwargs, parsed_args);
-
-  if (r.idx == 0) {
-    auto buffer = r.pyobject(0);
-    auto dtype = r.scalartype(1);
-    auto count = r.toInt64(2);
-    auto offset = r.toInt64(3);
-    auto requires_grad = r.toBool(4);
-
-    auto elsize = at::elementSize(dtype);
-    size_t actual_count = 0;
-    Py_buffer view;
-
-    TORCH_CHECK_VALUE(
-        PyObject_CheckBuffer(buffer) != 0,
-        "object does not implement Python buffer protocol.");
-
-    if (PyObject_GetBuffer(buffer, &view, PyBUF_WRITABLE) < 0) {
-      TORCH_CHECK(
-          PyObject_GetBuffer(buffer, &view, PyBUF_SIMPLE) >= 0,
-          "could not retrieve buffer from object");
-      TORCH_WARN_ONCE(
-          "The given buffer is not writable, and PyTorch does "
-          "not support non-writable tensors. This means you can write to the "
-          "underlying (supposedly non-writable) buffer using the tensor. "
-          "You may want to copy the buffer to protect its data or make it writable "
-          "before converting it to a tensor. This type of warning will be "
-          "suppressed for the rest of this program.");
-      PyErr_Clear();
-    }
-
-    Py_INCREF(view.obj);
-    THPObjectPtr obj(view.obj);
-
-    auto len = view.len;
-    auto buf = view.buf;
-    PyBuffer_Release(&view);
-
-    TORCH_CHECK_VALUE(
-        len > 0 && count != 0,
-        "both buffer length (", len, ") and count (", count, ") must not be 0");
-    TORCH_CHECK_VALUE(
-        offset >= 0 && offset < len,
-        "offset (", offset, " bytes) must be non-negative and no greater than "
-        "buffer length (", len, " bytes) minus 1");
-    TORCH_CHECK_VALUE(
-        count > 0 || (len - offset) % elsize == 0,
-        "buffer length (", len - offset, " bytes) after offset (", offset, " bytes) "
-        "must be a multiple of element size (", elsize, ")");
-
-    if (count < 0) {
-      actual_count = (len - offset) / elsize;
-    } else {
-      actual_count = static_cast<size_t>(count);
-    }
-
-    TORCH_CHECK_VALUE(
-        static_cast<size_t>(offset) + actual_count * elsize <= len,
-        "requested buffer length (", actual_count, " * ", elsize, " bytes) "
-        "after offset (", offset, " bytes) must not be greater than actual "
-        "buffer length (", len, " bytes)");
-
-    auto offset_buf = static_cast<char*>(buf) + offset;
-    auto options = TensorOptions()
-        .dtype(dtype)
-        .device(c10::kCPU);
-
-    auto tensor = at::for_blob(offset_buf, static_cast<int64_t>(actual_count))
-                      .options(options)
-                      .deleter([obj = obj.release()](void*) {
-                        pybind11::gil_scoped_acquire gil;
-                        Py_DECREF(obj);
-                      })
-                      .make_tensor();
-    tensor.set_requires_grad(requires_grad);
-    ret = wrap(tensor);
-  }
-
-  return ret;
-
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-static PyObject * THPVariable_numel(PyObject* self_, PyObject* args, PyObject* kwargs);
-
-// linspace
-static PyObject * THPVariable_linspace(PyObject* self_, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  static PythonArgParser parser({
-    "linspace(Scalar start, Scalar end, int64_t? steps=None, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
-  }, /*traceable=*/true);
-
-  ParsedArgs<9> parsed_args;
-  auto _r = parser.parse(nullptr, args, kwargs, parsed_args);
-  if(_r.has_torch_function()) {
-    return handle_torch_function(_r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch");
-  }
-  if (_r.isNone(3)) {
-    // aten::linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
-
-    // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
-    // This leads to problem in the operator argument checks,
-    // when either `start` or `end` is complex and dtype is None
-    const auto options = TensorOptions()
-        .dtype(_r.scalartypeOptional(4))
-        .device(_r.device(6))
-        .layout(_r.layoutOptional(5))
-        .requires_grad(_r.toBool(8))
-        .pinned_memory(_r.toBool(7));
-    torch::utils::maybe_initialize_cuda(options);
-
-    auto dispatch_linspace = [](Scalar start, Scalar end, c10::optional<int64_t> steps, TensorOptions options) -> Tensor {
-      pybind11::gil_scoped_release no_gil;
-      return torch::linspace(start, end, steps, options);
-    };
-    return wrap(dispatch_linspace(_r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), options));
-  } else {
-    // aten::linspace.out(Scalar start, Scalar end, int? steps=None, *, Tensor(a!) out) -> Tensor(a!)
-    check_out_type_matches(_r.tensor(3), _r.scalartype(4),
-                           _r.isNone(4), _r.layoutOptional(5),
-                           _r.device(6), _r.isNone(6));
-
-    auto dispatch_linspace_out = [](Tensor out, Scalar start, Scalar end, c10::optional<int64_t> steps) -> Tensor {
-      pybind11::gil_scoped_release no_gil;
-      return at::linspace_out(out, start, end, steps);
-    };
-    return wrap(dispatch_linspace_out(_r.tensor(3), _r.scalar(0), _r.scalar(1), _r.toInt64Optional(2)).set_requires_grad(_r.toBool(8)));
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-// logspace
-static PyObject * THPVariable_logspace(PyObject* self_, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  static PythonArgParser parser({
-    "logspace(Scalar start, Scalar end, int64_t? steps=None, double base=10.0, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
-  }, /*traceable=*/true);
-
-  ParsedArgs<10> parsed_args;
-  auto _r = parser.parse(nullptr, args, kwargs, parsed_args);
-  if(_r.has_torch_function()) {
-    return handle_torch_function(_r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch");
-  }
-  if (_r.isNone(4)) {
-    // aten::logspace(Scalar start, Scalar end, int? steps=None, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
-
-    // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
-    // This leads to problem in the operator argument checks,
-    // when either `start` or `end` is complex and dtype is None
-    const auto options = TensorOptions()
-        .dtype(_r.scalartypeOptional(5))
-        .device(_r.device(7))
-        .layout(_r.layoutOptional(6))
-        .requires_grad(_r.toBool(9))
-        .pinned_memory(_r.toBool(8));
-    torch::utils::maybe_initialize_cuda(options);
-
-    auto dispatch_logspace = [](Scalar start, Scalar end, c10::optional<int64_t> steps, double base, TensorOptions options) -> Tensor {
-      pybind11::gil_scoped_release no_gil;
-      return torch::logspace(start, end, steps, base, options);
-    };
-    return wrap(dispatch_logspace(_r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), _r.toDouble(3), options));
-  } else {
-    // aten::logspace.out(Scalar start, Scalar end, int? steps=None, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
-    check_out_type_matches(_r.tensor(4), _r.scalartype(5),
-                           _r.isNone(5), _r.layoutOptional(6),
-                           _r.device(7), _r.isNone(7));
-
-    auto dispatch_logspace_out = [](Tensor out, Scalar start, Scalar end, c10::optional<int64_t> steps, double base) -> Tensor {
-      pybind11::gil_scoped_release no_gil;
-      return at::logspace_out(out, start, end, steps, base);
-    };
-    return wrap(dispatch_logspace_out(_r.tensor(4), _r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), _r.toDouble(3)).set_requires_grad(_r.toBool(9)));
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
+namespace torch { namespace autograd {
 
 // generated forward declarations start here
 
 ${py_forwards}
 
-// Wrapper converts a raised TypeError into returning NotImplemented
-// Used to implement binary arithmetic operators
-template <PyObject* (*Func)(PyObject*, PyObject*, PyObject*)>
-static PyObject * TypeError_to_NotImplemented_(PyObject* self, PyObject* args, PyObject* kwargs) {
-  PyObject* ret = Func(self, args, kwargs);
-  if (!ret && PyErr_ExceptionMatches(PyExc_TypeError)) {
-    PyErr_Clear();
-    Py_INCREF(Py_NotImplemented);
-    ret = Py_NotImplemented;
-  }
-  return ret;
-}
-
-// XXX: ops that are bound here are not exposed to the C++ api nor the JIT.
-// Any new ops added here should be accompanied with a comment why they are not
-// being registered through native_functions.yaml, and be tagged cpp / JIT
-static PyMethodDef torch_functions[] = {
-  {"arange", castPyCFunctionWithKeywords(THPVariable_arange),
-    METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"as_tensor", castPyCFunctionWithKeywords(THPVariable_as_tensor),
-    METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"dsmm", castPyCFunctionWithKeywords(THPVariable_mm), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"from_numpy", THPVariable_from_numpy, METH_STATIC | METH_O, NULL},
-  {"frombuffer", castPyCFunctionWithKeywords(THPVariable_frombuffer), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"full", castPyCFunctionWithKeywords(THPVariable_full), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"hsmm", castPyCFunctionWithKeywords(THPVariable_hspmm), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"linspace", castPyCFunctionWithKeywords(THPVariable_linspace), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"logspace", castPyCFunctionWithKeywords(THPVariable_logspace), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"nonzero", castPyCFunctionWithKeywords(THPVariable_nonzero), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"randint", castPyCFunctionWithKeywords(THPVariable_randint), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"range", castPyCFunctionWithKeywords(THPVariable_range), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"saddmm", castPyCFunctionWithKeywords(THPVariable_sspaddmm), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"sparse_coo_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_coo_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"_sparse_coo_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_coo_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"_validate_sparse_coo_tensor_args", castPyCFunctionWithKeywords(THPVariable__validate_sparse_coo_tensor_args), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"sparse_csr_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_csr_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"_sparse_csr_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_csr_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"_validate_sparse_csr_tensor_args", castPyCFunctionWithKeywords(THPVariable__validate_sparse_csr_tensor_args), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"spmm", castPyCFunctionWithKeywords(THPVariable_mm), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"tensor", castPyCFunctionWithKeywords(THPVariable_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"get_device", castPyCFunctionWithKeywords(THPVariable_get_device), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
-  {"numel", castPyCFunctionWithKeywords(THPVariable_numel), METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
+static PyMethodDef torch_functions_shard[] = {
   ${py_method_defs}
-  {NULL}
 };
 
-static PyTypeObject THPVariableFunctions = {
-  PyVarObject_HEAD_INIT(NULL, 0)
-  "torch._C._VariableFunctionsClass",    /* tp_name */
-  0,                                     /* tp_basicsize */
-  0,                                     /* tp_itemsize */
-  0,                                     /* tp_dealloc */
-  0,                                     /* tp_vectorcall_offset */
-  0,                                     /* tp_getattr */
-  0,                                     /* tp_setattr */
-  0,                                     /* tp_reserved */
-  0,                                     /* tp_repr */
-  0,                                     /* tp_as_number */
-  0,                                     /* tp_as_sequence */
-  0,                                     /* tp_as_mapping */
-  0,                                     /* tp_hash  */
-  0,                                     /* tp_call */
-  0,                                     /* tp_str */
-  0,                                     /* tp_getattro */
-  0,                                     /* tp_setattro */
-  0,                                     /* tp_as_buffer */
-  Py_TPFLAGS_DEFAULT,                    /* tp_flags */
-  NULL,                                  /* tp_doc */
-  0,                                     /* tp_traverse */
-  0,                                     /* tp_clear */
-  0,                                     /* tp_richcompare */
-  0,                                     /* tp_weaklistoffset */
-  0,                                     /* tp_iter */
-  0,                                     /* tp_iternext */
-  torch_functions,                       /* tp_methods */
-  0,                                     /* tp_members */
-  0,                                     /* tp_getset */
-  0,                                     /* tp_base */
-  0,                                     /* tp_dict */
-  0,                                     /* tp_descr_get */
-  0,                                     /* tp_descr_set */
-  0,                                     /* tp_dictoffset */
-  0,                                     /* tp_init */
-  0,                                     /* tp_alloc */
-  0                                      /* tp_new */
-};
-
-void initTorchFunctions(PyObject* module) {
-  if (PyType_Ready(&THPVariableFunctions) < 0) {
-    throw python_error();
-  }
-  Py_INCREF(&THPVariableFunctions);
-
-  // Steals
-  Py_INCREF(&THPVariableFunctions);
-  if (PyModule_AddObject(module, "_VariableFunctionsClass", reinterpret_cast<PyObject*>(&THPVariableFunctions)) < 0) {
-    throw python_error();
-  }
-  // PyType_GenericNew returns a new reference
-  THPVariableFunctionsModule = PyType_GenericNew(&THPVariableFunctions, Py_None, Py_None);
-  // PyModule_AddObject steals a reference
-  if (PyModule_AddObject(module, "_VariableFunctions", THPVariableFunctionsModule) < 0) {
-    throw python_error();
-  }
+void gatherTorchFunctions${shard_id}(std::vector<PyMethodDef> &torch_functions) {
+  constexpr size_t num_functions = sizeof(torch_functions_shard) / sizeof(torch_functions_shard[0]);
+  torch_functions.insert(
+    torch_functions.end(),
+    torch_functions_shard,
+    torch_functions_shard + num_functions);
 }
 
 // generated methods start here
 
 ${py_methods}
 
-static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  static PythonArgParser parser({
-    "nonzero(Tensor input, *, bool as_tuple=False, Tensor out=None)",
-  });
-  ParsedArgs<3> parsed_args;
-  auto r = parser.parse(args, kwargs, parsed_args);
-
-  if(r.has_torch_function()){
-    return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch");
-  }
-
-  const auto as_tuple = r.toBool(1);
-  const auto has_out = !r.isNone(2);
-
-  if (as_tuple) {
-    TORCH_CHECK(!has_out, "nonzero does not support the out kwarg when as_tuple is True");
-    return wrap(dispatch_nonzero_numpy(r.tensor(0)));
-  }
-
-  if (has_out) {
-    return wrap(dispatch_nonzero(r.tensor(0), r.tensor(2)));
-  }
-
-  return wrap(dispatch_nonzero(r.tensor(0)));
-
-  END_HANDLE_TH_ERRORS
-}
-
-static PyObject * THPVariable_numel(PyObject* self_, PyObject* args, PyObject* kwargs)
-{
-  HANDLE_TH_ERRORS
-  static PythonArgParser parser({
-    "numel(Tensor input)",
-  }, /*traceable=*/false);
-
-  ParsedArgs<1> parsed_args;
-  auto r = parser.parse(args, kwargs, parsed_args);
-
-  if(r.has_torch_function()){
-    return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch");
-  }
-
-  if (r.idx == 0) {
-    return wrap(r.tensor(0).numel());
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
 }} // namespace torch::autograd
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 0d888ea8a4cb0..5f4cc0df522f5 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -23,7 +23,9 @@ GENERATED_CPP = [
     "autograd/generated/python_fft_functions.cpp",
     "autograd/generated/python_linalg_functions.cpp",
     "autograd/generated/python_special_functions.cpp",
-    "autograd/generated/python_torch_functions.cpp",
+    "autograd/generated/python_torch_functions_0.cpp",
+    "autograd/generated/python_torch_functions_1.cpp",
+    "autograd/generated/python_torch_functions_2.cpp",
     "autograd/generated/python_variable_methods.cpp",
 ]
 
@@ -664,6 +666,7 @@ libtorch_python_core_sources = [
     "torch/csrc/autograd/python_function.cpp",
     "torch/csrc/autograd/python_hook.cpp",
     "torch/csrc/autograd/python_legacy_variable.cpp",
+    "torch/csrc/autograd/python_torch_functions_manual.cpp",
     "torch/csrc/autograd/python_variable.cpp",
     "torch/csrc/autograd/python_variable_indexing.cpp",
     "torch/csrc/jit/backends/backend_init.cpp",
@@ -760,7 +763,9 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
         "autograd/generated/python_fft_functions.cpp",
         "autograd/generated/python_linalg_functions.cpp",
         "autograd/generated/python_special_functions.cpp",
-        "autograd/generated/python_torch_functions.cpp",
+        "autograd/generated/python_torch_functions_0.cpp",
+        "autograd/generated/python_torch_functions_1.cpp",
+        "autograd/generated/python_torch_functions_2.cpp",
         "autograd/generated/python_variable_methods.cpp",
     ]]
 
diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py
index 44bb3b4f87e1d..ffa4ed7a1c70e 100644
--- a/tools/codegen/gen.py
+++ b/tools/codegen/gen.py
@@ -1,5 +1,5 @@
 import os
-from typing import List, Dict, Optional, Tuple, Set, Callable, Any, Union, Sequence, TypeVar
+from typing import List, Dict, Optional, Tuple, Set, Callable, Any, Union, Sequence, TypeVar, Iterable
 from typing_extensions import Literal
 import yaml
 from collections import OrderedDict, defaultdict, namedtuple
@@ -858,7 +858,7 @@ def write(self, filename: str, env_callable: Callable[[], Union[str, Union[str,
     def write_sharded(
             self,
             filename: str,
-            items: List[T],
+            items: Iterable[T],
             *,
             key_fn: Callable[[T], str],
             env_callable: Callable[[T], Dict[str, List[str]]],
diff --git a/torch/csrc/autograd/python_torch_functions.h b/torch/csrc/autograd/python_torch_functions.h
new file mode 100644
index 0000000000000..58257794812ee
--- /dev/null
+++ b/torch/csrc/autograd/python_torch_functions.h
@@ -0,0 +1,25 @@
+#include <Python.h>
+
+#include <vector>
+
+
+namespace torch { namespace autograd {
+
+extern PyObject* THPVariableFunctionsModule;
+
+// Wrapper converts a raised TypeError into returning NotImplemented
+// Used to implement binary arithmetic operators
+template <PyObject* (*Func)(PyObject*, PyObject*, PyObject*)>
+inline PyObject * TypeError_to_NotImplemented_(PyObject* self, PyObject* args, PyObject* kwargs) {
+  PyObject* ret = Func(self, args, kwargs);
+  if (!ret && PyErr_ExceptionMatches(PyExc_TypeError)) {
+    PyErr_Clear();
+    Py_INCREF(Py_NotImplemented);
+    ret = Py_NotImplemented;
+  }
+  return ret;
+}
+
+void initTorchFunctions();
+
+}}  // namespace torch::autograd
diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp
new file mode 100644
index 0000000000000..a54d1017bcee8
--- /dev/null
+++ b/torch/csrc/autograd/python_torch_functions_manual.cpp
@@ -0,0 +1,826 @@
+#include <torch/csrc/autograd/python_torch_functions.h>
+#include <torch/csrc/autograd/python_variable.h>
+#include <torch/csrc/autograd/utils/wrap_outputs.h>
+#include <torch/csrc/Dtype.h>
+#include <torch/csrc/DynamicTypes.h>
+#include <torch/csrc/Exceptions.h>
+#include <torch/csrc/utils/out_types.h>
+#include <torch/csrc/utils/pybind.h>
+#include <torch/csrc/utils/pycfunction_helpers.h>
+#include <torch/csrc/utils/python_arg_parser.h>
+#include <torch/csrc/utils/tensor_layouts.h>
+#include <torch/csrc/utils/tensor_new.h>
+#include <torch/csrc/utils/tensor_numpy.h>
+#include <torch/csrc/jit/frontend/tracer.h>
+#include <torch/csrc/autograd/generated/variable_factories.h>
+#include <torch/csrc/utils/structseq.h>
+#include <torch/csrc/utils/cuda_lazy_init.h>
+
+#include <ATen/ATen.h>
+
+#include <fmt/format.h>
+#include <Python.h>
+#include <pybind11/pybind11.h>
+#include <vector>
+
+using at::Tensor;
+using at::Device;
+using at::Layout;
+using at::Scalar;
+using at::ScalarType;
+using at::Backend;
+using at::OptionalDeviceGuard;
+using at::DeviceGuard;
+using at::TensorOptions;
+using at::IntArrayRef;
+using at::Generator;
+using at::TensorList;
+using at::Dimname;
+using at::DimnameList;
+using at::ArrayRef;
+
+using torch::utils::check_out_type_matches;
+using namespace torch::autograd::utils;
+
+namespace torch { namespace autograd {
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+PyObject* THPVariableFunctionsModule = nullptr;
+
+
+inline Tensor dispatch_arange(const Scalar& end, Tensor result) {
+  pybind11::gil_scoped_release no_gil;
+  return at::arange_out(result, end);
+}
+
+inline Tensor dispatch_arange(const Scalar& end, const TensorOptions& options) {
+  torch::utils::maybe_initialize_cuda(options);
+  pybind11::gil_scoped_release no_gil;
+  return torch::arange(end, options);
+}
+
+inline Tensor dispatch_arange(const Scalar& start, const Scalar& end, const Scalar& step, Tensor result) {
+  pybind11::gil_scoped_release no_gil;
+  return at::arange_out(result, start, end, step);
+}
+
+inline Tensor dispatch_arange(const Scalar& start, const Scalar& end, const Scalar& step, const TensorOptions& options) {
+  torch::utils::maybe_initialize_cuda(options);
+  pybind11::gil_scoped_release no_gil;
+  return torch::arange(start, end, step, options);
+}
+
+static PyObject * THPVariable_arange(PyObject* self, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser({
+    "arange(Scalar end, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
+    "arange(Scalar start, Scalar end, Scalar step=1, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
+  }, /*traceable=*/true);
+
+  ParsedArgs<9> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+
+  if(r.has_torch_function()) {
+    return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch");
+  }
+
+  if (r.idx == 0) {
+    if (r.isNone(1)) {
+      auto end = r.scalar(0);
+      // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
+      c10::optional<ScalarType> scalarType = r.scalartypeOptional(2);
+      const auto options = TensorOptions()
+          .dtype(scalarType)
+          .device(r.device(4))
+          .layout(r.layout(3))
+          .requires_grad(r.toBool(6))
+          .pinned_memory(r.toBool(5));
+      return wrap(dispatch_arange(end, options));
+    } else {
+      TORCH_CHECK(!r.toBool(5), " `pin_memory` and `out` parameters are incompatible");
+      check_out_type_matches(r.tensor(1), r.scalartype(2), r.isNone(2), r.layout(3),
+                             r.device(4), r.isNone(4));
+      return wrap(dispatch_arange(r.scalar(0), r.tensor(1)).set_requires_grad(r.toBool(6)));
+    }
+  } else if (r.idx == 1) {
+    if (r.isNone(3)) {
+      auto start = r.scalar(0);
+      auto end = r.scalar(1);
+      auto step = r.scalar(2);
+      // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
+      c10::optional<ScalarType> scalarType = r.scalartypeOptional(4);
+      const auto options = TensorOptions()
+          .dtype(scalarType)
+          .device(r.device(6))
+          .layout(r.layout(5))
+          .requires_grad(r.toBool(8))
+          .pinned_memory(r.toBool(7));
+      return wrap(dispatch_arange(start, end, step, options));
+    } else {
+      TORCH_CHECK(!r.toBool(7), " `pin_memory` and `out` parameters are incompatible");
+      check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), r.layout(5),
+                               r.device(6), r.isNone(6));
+      return wrap(dispatch_arange(r.scalar(0), r.scalar(1), r.scalar(2), r.tensor(3)).set_requires_grad(r.toBool(8)));
+    }
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+inline Tensor dispatch_range(const Scalar& start, const Scalar& end, const Scalar& step, Tensor result) {
+  pybind11::gil_scoped_release no_gil;
+  OptionalDeviceGuard device_guard(device_of(result));
+  return at::range_out(result, start, end, step);
+}
+
+inline Tensor dispatch_range(const Scalar& start, const Scalar& end, const Scalar& step, const TensorOptions& options) {
+  torch::utils::maybe_initialize_cuda(options);
+  pybind11::gil_scoped_release no_gil;
+  DeviceGuard device_guard(options.device());
+  return torch::range(start, end, step, options);
+}
+
+static PyObject * THPVariable_range(PyObject* self, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser({
+    "range(Scalar start, Scalar end, Scalar step=1, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)",
+  });
+
+  ParsedArgs<8> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+
+  if (r.idx == 0) {
+    auto ret = PyErr_WarnEx(
+        PyExc_UserWarning,
+        "torch.range is deprecated and will be removed in a future release "
+        "because its behavior is inconsistent with Python's range builtin. "
+        "Instead, use torch.arange, which produces values in [start, end).",
+        1);
+    if (ret != 0) throw python_error();
+    if (r.isNone(3)) {
+      const auto options = TensorOptions()
+          .dtype(r.scalartype(4))
+          .device(r.device(6))
+          .layout(r.layout(5))
+          .requires_grad(r.toBool(7));
+      return wrap(dispatch_range(r.scalar(0), r.scalar(1), r.scalar(2), options));
+    } else {
+      check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4),
+                             r.layout(5), r.device(6), r.isNone(6));
+      return wrap(dispatch_range(r.scalar(0), r.scalar(1), r.scalar(2), r.tensor(3)).set_requires_grad(r.toBool(7)));
+    }
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+inline Tensor dispatch_full(
+    IntArrayRef size,
+    const Scalar& fill_val,
+    const TensorOptions& options) {
+  torch::utils::maybe_initialize_cuda(options);
+  pybind11::gil_scoped_release no_gil;
+  return at::full(size, fill_val, options);
+}
+
+inline Tensor dispatch_full(
+    IntArrayRef size,
+    const Scalar& fill_val,
+    c10::optional<DimnameList> names,
+    const TensorOptions& options) {
+  torch::utils::maybe_initialize_cuda(options);
+  pybind11::gil_scoped_release no_gil;
+  return at::full(size, fill_val, names, options);
+}
+
+inline Tensor dispatch_full(
+    IntArrayRef size,
+    const Scalar& fill_val,
+    Tensor result) {
+  pybind11::gil_scoped_release no_gil;
+  return at::full_out(result, size, fill_val);
+}
+
+static PyObject * THPVariable_full(PyObject* self, PyObject* args, PyObject* kwargs) {
+  HANDLE_TH_ERRORS
+
+  static PythonArgParser parser({
+    "full(IntArrayRef size, Scalar fill_value, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
+    "full(IntArrayRef size, Scalar fill_value, *, DimnameList names=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
+  }, /*traceable=*/true);
+
+  // Acquires (common) arguments
+  ParsedArgs<8> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+
+  if(r.has_torch_function()) {
+    return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch");
+  }
+
+  auto size = r.intlist(0);
+  auto fill_val = r.scalar(1);
+  const auto options = TensorOptions{}
+      .dtype(r.scalartypeOptional(3))
+      .layout(r.layout(4))
+      .device(r.device(5))
+      .pinned_memory(r.toBool(6));
+
+  if (r.idx == 0) {
+    // full
+    if (r.isNone(2)) {
+      return wrap(dispatch_full(size, fill_val, options).set_requires_grad(r.toBool(7)));
+    }
+
+    // full.out
+    // Validates out tensor and other kwargs
+    auto result = r.tensor(2);
+    TORCH_CHECK(!r.toBool(6), " `pin_memory` and `out` parameters are incompatible");
+    check_out_type_matches(result, r.scalartype(3), r.isNone(3), r.layout(4),
+                          r.device(5), r.isNone(5));
+
+    return wrap(dispatch_full(size, fill_val, result).set_requires_grad(r.toBool(7)));
+  } else if (r.idx == 1) {
+    // full.names
+    if (r.isNone(2)) {
+      return wrap(dispatch_full(size, fill_val, c10::nullopt, options).set_requires_grad(r.toBool(7)));
+    }
+
+    // Converts from c10::optional<std:vector...> to c10::optional<ArrayRef...>
+    auto raw_names = r.toDimnameListOptional(2);
+    c10::optional<DimnameList> names(*raw_names);
+    return wrap(dispatch_full(size, fill_val, names, options).set_requires_grad(r.toBool(7)));
+  }
+
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+inline Tensor dispatch_randint(int64_t high, IntArrayRef size, c10::optional<Generator> generator, Tensor result) {
+  pybind11::gil_scoped_release no_gil;
+  return at::randint_out(result, high, size, generator);
+}
+inline Tensor dispatch_randint(int64_t high, IntArrayRef size, c10::optional<Generator> generator, const TensorOptions & options) {
+  torch::utils::maybe_initialize_cuda(options);
+  pybind11::gil_scoped_release no_gil;
+  return torch::randint(high, size, generator, options);
+}
+inline Tensor dispatch_randint(int64_t high, IntArrayRef size, Tensor result) {
+  pybind11::gil_scoped_release no_gil;
+  return at::randint_out(result, high, size);
+}
+inline Tensor dispatch_randint(int64_t high, IntArrayRef size, const TensorOptions & options) {
+  torch::utils::maybe_initialize_cuda(options);
+  pybind11::gil_scoped_release no_gil;
+  return torch::randint(high, size, options);
+}
+inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, c10::optional<Generator> generator, Tensor result) {
+  pybind11::gil_scoped_release no_gil;
+  return at::randint_out(result, low, high, size, generator);
+}
+inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, c10::optional<Generator> generator, const TensorOptions & options) {
+  torch::utils::maybe_initialize_cuda(options);
+  pybind11::gil_scoped_release no_gil;
+  return torch::randint(low, high, size, generator, options);
+}
+inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, Tensor result) {
+  pybind11::gil_scoped_release no_gil;
+  return at::randint_out(result, low, high, size);
+}
+inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, const TensorOptions & options) {
+  torch::utils::maybe_initialize_cuda(options);
+  pybind11::gil_scoped_release no_gil;
+  return torch::randint(low, high, size, options);
+}
+
+static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser({
+    "randint(int64_t high, IntArrayRef size, *, Generator generator=None, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)",
+    "randint(int64_t low, int64_t high, IntArrayRef size, *, Generator generator=None, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)",
+  }, /*traceable=*/false);
+
+  ParsedArgs<9> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+
+  if(r.has_torch_function()) {
+    return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch");
+  }
+
+  if (r.idx == 0) {
+    if (r.isNone(3)) {
+      auto high = r.toInt64(0);
+      auto size = r.intlist(1);
+      auto generator = r.generator(2);
+      // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
+      auto dtype = r.scalartypeWithDefault(4, at::ScalarType::Long);
+      auto device = r.device(6);
+      const auto options = TensorOptions()
+          .dtype(dtype)
+          .device(device)
+          .layout(r.layout(5))
+          .requires_grad(r.toBool(7));
+      return wrap(dispatch_randint(high, size, generator, options));
+    } else {
+      check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4),
+                             r.layout(5), r.device(6), r.isNone(6));
+      return wrap(dispatch_randint(r.toInt64(0), r.intlist(1), r.generator(2), r.tensor(3)).set_requires_grad(r.toBool(7)));
+    }
+  } else if (r.idx == 1) {
+    if (r.isNone(4)) {
+      auto low = r.toInt64(0);
+      auto high = r.toInt64(1);
+      auto size = r.intlist(2);
+      auto generator = r.generator(3);
+      // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
+      auto dtype = r.scalartypeWithDefault(5, at::ScalarType::Long);
+      auto device = r.device(7);
+      const auto options = TensorOptions()
+          .dtype(dtype)
+          .device(device)
+          .layout(r.layout(6))
+          .requires_grad(r.toBool(8));
+      return wrap(dispatch_randint(low, high, size, generator, options));
+    } else {
+      check_out_type_matches(r.tensor(4), r.scalartype(5), r.isNone(5),
+                             r.layout(6), r.device(7), r.isNone(7));
+      return wrap(dispatch_randint(r.toInt64(0), r.toInt64(1), r.intlist(2), r.generator(3), r.tensor(4)).set_requires_grad(r.toBool(8)));
+    }
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+// implemented on python object to allow torch.as_tensor to be constructed with arbitrarily nested
+// python objects - list, tuple, np array, scalar, etc.
+static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  jit::tracer::warn("torch.as_tensor", jit::tracer::WARN_CONSTRUCTOR);
+  return THPVariable_Wrap(torch::utils::as_tensor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
+  END_HANDLE_TH_ERRORS
+}
+
+// implemented on python object here because PyObject currently not natively declarable
+// See: ATen/native/README.md for more context
+static PyObject * THPVariable_from_numpy(PyObject* module, PyObject* arg)
+{
+  HANDLE_TH_ERRORS
+  jit::tracer::warn("torch.from_numpy", jit::tracer::WARN_CONSTRUCTOR);
+  return THPVariable_Wrap(torch::utils::tensor_from_numpy(arg));
+  END_HANDLE_TH_ERRORS
+}
+
+static Tensor dispatch_nonzero(const Tensor & self) {
+  pybind11::gil_scoped_release no_gil;
+  OptionalDeviceGuard device_guard(device_of(self));
+  return self.nonzero();
+}
+
+static Tensor dispatch_nonzero(const Tensor & self, Tensor out) {
+  pybind11::gil_scoped_release no_gil;
+  OptionalDeviceGuard device_guard(device_of(self));
+  return at::nonzero_out(out, self);
+}
+
+static std::vector<Tensor> dispatch_nonzero_numpy(const Tensor & self) {
+  pybind11::gil_scoped_release no_gil;
+  OptionalDeviceGuard device_guard(device_of(self));
+  return self.nonzero_numpy();
+}
+
+static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* kwargs);
+
+static PyObject * THPVariable_sparse_csr_tensor(PyObject* self, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  jit::tracer::warn("torch.sparse_csr_tensor", jit::tracer::WARN_CONSTRUCTOR);
+  return THPVariable_Wrap(torch::utils::sparse_csr_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject * THPVariable__sparse_csr_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  jit::tracer::warn("torch._sparse_csr_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR);
+  return THPVariable_Wrap(torch::utils::_sparse_csr_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  jit::tracer::warn("torch.sparse_coo_tensor", jit::tracer::WARN_CONSTRUCTOR);
+  return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject * THPVariable__sparse_coo_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  jit::tracer::warn("torch._sparse_coo_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR);
+  return THPVariable_Wrap(torch::utils::_sparse_coo_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
+  END_HANDLE_TH_ERRORS
+}
+
+// implemented on python object to allow torch.tensor to be constructed with arbitrarily nested
+// python objects - list, tuple, np array, scalar, etc.
+static PyObject * THPVariable_tensor(PyObject* self, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  jit::tracer::warn("torch.tensor", jit::tracer::WARN_CONSTRUCTOR);
+  return THPVariable_Wrap(torch::utils::tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs));
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject * THPVariable_get_device(PyObject* self_, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser({
+    "get_device(Tensor input)",
+  }, /*traceable=*/false);
+
+  ParsedArgs<1> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+
+  if (r.idx == 0) {
+    return wrap(r.tensor(0).get_device());
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}static PyObject * THPVariable_frombuffer(PyObject* self_, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser({
+    "frombuffer(PyObject* buffer, *, ScalarType dtype, int64_t count=-1, int64_t offset=0, bool requires_grad=False)",
+  }, /*traceable=*/false);
+
+  PyObject* ret = nullptr;
+  ParsedArgs<5> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+
+  if (r.idx == 0) {
+    auto buffer = r.pyobject(0);
+    auto dtype = r.scalartype(1);
+    auto count = r.toInt64(2);
+    auto offset = r.toInt64(3);
+    auto requires_grad = r.toBool(4);
+
+    auto elsize = at::elementSize(dtype);
+    size_t actual_count = 0;
+    Py_buffer view;
+
+    TORCH_CHECK_VALUE(
+        PyObject_CheckBuffer(buffer) != 0,
+        "object does not implement Python buffer protocol.");
+
+    if (PyObject_GetBuffer(buffer, &view, PyBUF_WRITABLE) < 0) {
+      TORCH_CHECK(
+          PyObject_GetBuffer(buffer, &view, PyBUF_SIMPLE) >= 0,
+          "could not retrieve buffer from object");
+      TORCH_WARN_ONCE(
+          "The given buffer is not writable, and PyTorch does "
+          "not support non-writable tensors. This means you can write to the "
+          "underlying (supposedly non-writable) buffer using the tensor. "
+          "You may want to copy the buffer to protect its data or make it writable "
+          "before converting it to a tensor. This type of warning will be "
+          "suppressed for the rest of this program.");
+      PyErr_Clear();
+    }
+
+    Py_INCREF(view.obj);
+    THPObjectPtr obj(view.obj);
+
+    auto len = view.len;
+    auto buf = view.buf;
+    PyBuffer_Release(&view);
+
+    TORCH_CHECK_VALUE(
+        len > 0 && count != 0,
+        "both buffer length (", len, ") and count (", count, ") must not be 0");
+    TORCH_CHECK_VALUE(
+        offset >= 0 && offset < len,
+        "offset (", offset, " bytes) must be non-negative and no greater than "
+        "buffer length (", len, " bytes) minus 1");
+    TORCH_CHECK_VALUE(
+        count > 0 || (len - offset) % elsize == 0,
+        "buffer length (", len - offset, " bytes) after offset (", offset, " bytes) "
+        "must be a multiple of element size (", elsize, ")");
+
+    if (count < 0) {
+      actual_count = (len - offset) / elsize;
+    } else {
+      actual_count = static_cast<size_t>(count);
+    }
+
+    TORCH_CHECK_VALUE(
+        static_cast<size_t>(offset) + actual_count * elsize <= len,
+        "requested buffer length (", actual_count, " * ", elsize, " bytes) "
+        "after offset (", offset, " bytes) must not be greater than actual "
+        "buffer length (", len, " bytes)");
+
+    auto offset_buf = static_cast<char*>(buf) + offset;
+    auto options = TensorOptions()
+        .dtype(dtype)
+        .device(c10::kCPU);
+
+    auto tensor = at::for_blob(offset_buf, static_cast<int64_t>(actual_count))
+                      .options(options)
+                      .deleter([obj = obj.release()](void*) {
+                        pybind11::gil_scoped_acquire gil;
+                        Py_DECREF(obj);
+                      })
+                      .make_tensor();
+    tensor.set_requires_grad(requires_grad);
+    ret = wrap(tensor);
+  }
+
+  return ret;
+
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject * THPVariable_numel(PyObject* self_, PyObject* args, PyObject* kwargs);
+
+// linspace
+static PyObject * THPVariable_linspace(PyObject* self_, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser({
+    "linspace(Scalar start, Scalar end, int64_t? steps=None, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
+  }, /*traceable=*/true);
+
+  ParsedArgs<9> parsed_args;
+  auto _r = parser.parse(nullptr, args, kwargs, parsed_args);
+  if(_r.has_torch_function()) {
+    return handle_torch_function(_r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch");
+  }
+  if (_r.isNone(3)) {
+    // aten::linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+
+    // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
+    // This leads to problem in the operator argument checks,
+    // when either `start` or `end` is complex and dtype is None
+    const auto options = TensorOptions()
+        .dtype(_r.scalartypeOptional(4))
+        .device(_r.device(6))
+        .layout(_r.layoutOptional(5))
+        .requires_grad(_r.toBool(8))
+        .pinned_memory(_r.toBool(7));
+    torch::utils::maybe_initialize_cuda(options);
+
+    auto dispatch_linspace = [](Scalar start, Scalar end, c10::optional<int64_t> steps, TensorOptions options) -> Tensor {
+      pybind11::gil_scoped_release no_gil;
+      return torch::linspace(start, end, steps, options);
+    };
+    return wrap(dispatch_linspace(_r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), options));
+  } else {
+    // aten::linspace.out(Scalar start, Scalar end, int? steps=None, *, Tensor(a!) out) -> Tensor(a!)
+    check_out_type_matches(_r.tensor(3), _r.scalartype(4),
+                           _r.isNone(4), _r.layoutOptional(5),
+                           _r.device(6), _r.isNone(6));
+
+    auto dispatch_linspace_out = [](Tensor out, Scalar start, Scalar end, c10::optional<int64_t> steps) -> Tensor {
+      pybind11::gil_scoped_release no_gil;
+      return at::linspace_out(out, start, end, steps);
+    };
+    return wrap(dispatch_linspace_out(_r.tensor(3), _r.scalar(0), _r.scalar(1), _r.toInt64Optional(2)).set_requires_grad(_r.toBool(8)));
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+// logspace
+static PyObject * THPVariable_logspace(PyObject* self_, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser({
+    "logspace(Scalar start, Scalar end, int64_t? steps=None, double base=10.0, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)",
+  }, /*traceable=*/true);
+
+  ParsedArgs<10> parsed_args;
+  auto _r = parser.parse(nullptr, args, kwargs, parsed_args);
+  if(_r.has_torch_function()) {
+    return handle_torch_function(_r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch");
+  }
+  if (_r.isNone(4)) {
+    // aten::logspace(Scalar start, Scalar end, int? steps=None, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+
+    // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X)
+    // This leads to problem in the operator argument checks,
+    // when either `start` or `end` is complex and dtype is None
+    const auto options = TensorOptions()
+        .dtype(_r.scalartypeOptional(5))
+        .device(_r.device(7))
+        .layout(_r.layoutOptional(6))
+        .requires_grad(_r.toBool(9))
+        .pinned_memory(_r.toBool(8));
+    torch::utils::maybe_initialize_cuda(options);
+
+    auto dispatch_logspace = [](Scalar start, Scalar end, c10::optional<int64_t> steps, double base, TensorOptions options) -> Tensor {
+      pybind11::gil_scoped_release no_gil;
+      return torch::logspace(start, end, steps, base, options);
+    };
+    return wrap(dispatch_logspace(_r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), _r.toDouble(3), options));
+  } else {
+    // aten::logspace.out(Scalar start, Scalar end, int? steps=None, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+    check_out_type_matches(_r.tensor(4), _r.scalartype(5),
+                           _r.isNone(5), _r.layoutOptional(6),
+                           _r.device(7), _r.isNone(7));
+
+    auto dispatch_logspace_out = [](Tensor out, Scalar start, Scalar end, c10::optional<int64_t> steps, double base) -> Tensor {
+      pybind11::gil_scoped_release no_gil;
+      return at::logspace_out(out, start, end, steps, base);
+    };
+    return wrap(dispatch_logspace_out(_r.tensor(4), _r.scalar(0), _r.scalar(1), _r.toInt64Optional(2), _r.toDouble(3)).set_requires_grad(_r.toBool(9)));
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+// XXX: ops that are bound here are not exposed to the C++ api nor the JIT.
+// Any new ops added here should be accompanied with a comment why they are not
+// being registered through native_functions.yaml, and be tagged cpp / JIT
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
+static PyMethodDef torch_functions_manual[] = {
+  {"arange", castPyCFunctionWithKeywords(THPVariable_arange),
+    METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"as_tensor", castPyCFunctionWithKeywords(THPVariable_as_tensor),
+    METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"from_numpy", THPVariable_from_numpy, METH_STATIC | METH_O, nullptr},
+  {"frombuffer", castPyCFunctionWithKeywords(THPVariable_frombuffer), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"full", castPyCFunctionWithKeywords(THPVariable_full), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"linspace", castPyCFunctionWithKeywords(THPVariable_linspace), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"logspace", castPyCFunctionWithKeywords(THPVariable_logspace), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"nonzero", castPyCFunctionWithKeywords(THPVariable_nonzero), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"randint", castPyCFunctionWithKeywords(THPVariable_randint), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"range", castPyCFunctionWithKeywords(THPVariable_range), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"sparse_coo_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_coo_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"_sparse_coo_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_coo_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"sparse_csr_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_csr_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"_sparse_csr_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_csr_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"tensor", castPyCFunctionWithKeywords(THPVariable_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"get_device", castPyCFunctionWithKeywords(THPVariable_get_device), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+  {"numel", castPyCFunctionWithKeywords(THPVariable_numel), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr},
+};
+
+static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser({
+    "nonzero(Tensor input, *, bool as_tuple=False, Tensor out=None)",
+  });
+  ParsedArgs<3> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+
+  if(r.has_torch_function()){
+    return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch");
+  }
+
+  const auto as_tuple = r.toBool(1);
+  const auto has_out = !r.isNone(2);
+
+  if (as_tuple) {
+    TORCH_CHECK(!has_out, "nonzero does not support the out kwarg when as_tuple is True");
+    return wrap(dispatch_nonzero_numpy(r.tensor(0)));
+  }
+
+  if (has_out) {
+    return wrap(dispatch_nonzero(r.tensor(0), r.tensor(2)));
+  }
+
+  return wrap(dispatch_nonzero(r.tensor(0)));
+
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject * THPVariable_numel(PyObject* self_, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser({
+    "numel(Tensor input)",
+  }, /*traceable=*/false);
+
+  ParsedArgs<1> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+
+  if(r.has_torch_function()){
+    return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch");
+  }
+
+  if (r.idx == 0) {
+    return wrap(r.tensor(0).numel());
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+// Sharded function definitions
+void gatherTorchFunctions_0(std::vector<PyMethodDef> &torch_functions);
+void gatherTorchFunctions_1(std::vector<PyMethodDef> &torch_functions);
+void gatherTorchFunctions_2(std::vector<PyMethodDef> &torch_functions);
+
+void gatherTorchFunctions(std::vector<PyMethodDef> &torch_functions) {
+  constexpr size_t num_functions = sizeof(torch_functions_manual) / sizeof(torch_functions_manual[0]);
+  torch_functions.assign(torch_functions_manual,
+                         torch_functions_manual + num_functions);
+  // NOTE: Must be synced with num_shards in tools/autograd/gen_python_functions.py
+  gatherTorchFunctions_0(torch_functions);
+  gatherTorchFunctions_1(torch_functions);
+  gatherTorchFunctions_2(torch_functions);
+
+  static std::array<std::pair<const char *, const char *>, 4> aliases{{
+    // Canonical function, alias name
+    {"sspaddmm", "saddmm"},
+    {"mm", "spmm"},
+    {"mm", "dsmm"},
+    {"hspmm", "hsmm"}
+  }};
+
+  for (const auto& alias : aliases) {
+    auto it = std::find_if(torch_functions.begin(), torch_functions.end(),
+                          [&](const PyMethodDef& def) {
+                            return strcmp(def.ml_name, alias.first) == 0;
+                          });
+    TORCH_INTERNAL_ASSERT(
+        it != torch_functions.end(),
+        "Failed to create function alias from ", alias.first, " to ", alias.second);
+    PyMethodDef alias_def = *it;
+    alias_def.ml_name = alias.second;
+
+    torch_functions.push_back(alias_def);
+  }
+
+  torch_functions.push_back({nullptr});
+  torch_functions.shrink_to_fit();
+}
+
+static PyTypeObject THPVariableFunctions = {
+  PyVarObject_HEAD_INIT(nullptr, 0)
+  "torch._C._VariableFunctionsClass",    /* tp_name */
+  0,                                     /* tp_basicsize */
+  0,                                     /* tp_itemsize */
+  nullptr,                               /* tp_dealloc */
+  0,                                     /* tp_vectorcall_offset */
+  nullptr,                               /* tp_getattr */
+  nullptr,                               /* tp_setattr */
+  nullptr,                               /* tp_reserved */
+  nullptr,                               /* tp_repr */
+  nullptr,                               /* tp_as_number */
+  nullptr,                               /* tp_as_sequence */
+  nullptr,                               /* tp_as_mapping */
+  nullptr,                               /* tp_hash  */
+  nullptr,                               /* tp_call */
+  nullptr,                               /* tp_str */
+  nullptr,                               /* tp_getattro */
+  nullptr,                               /* tp_setattro */
+  nullptr,                               /* tp_as_buffer */
+  Py_TPFLAGS_DEFAULT,                    /* tp_flags */
+  nullptr,                               /* tp_doc */
+  nullptr,                               /* tp_traverse */
+  nullptr,                               /* tp_clear */
+  nullptr,                               /* tp_richcompare */
+  0,                                     /* tp_weaklistoffset */
+  nullptr,                               /* tp_iter */
+  nullptr,                               /* tp_iternext */
+  nullptr,                               /* tp_methods */
+  nullptr,                               /* tp_members */
+  nullptr,                               /* tp_getset */
+  nullptr,                               /* tp_base */
+  nullptr,                               /* tp_dict */
+  nullptr,                               /* tp_descr_get */
+  nullptr,                               /* tp_descr_set */
+  0,                                     /* tp_dictoffset */
+  nullptr,                               /* tp_init */
+  nullptr,                               /* tp_alloc */
+  nullptr                                /* tp_new */
+};
+
+void initTorchFunctions(PyObject *module) {
+  static std::vector<PyMethodDef> torch_functions;
+  gatherTorchFunctions(torch_functions);
+  THPVariableFunctions.tp_methods = torch_functions.data();
+
+  if (PyType_Ready(&THPVariableFunctions) < 0) {
+    throw python_error();
+  }
+  Py_INCREF(&THPVariableFunctions);
+
+  // Steals
+  Py_INCREF(&THPVariableFunctions);
+  if (PyModule_AddObject(module, "_VariableFunctionsClass",
+                         reinterpret_cast<PyObject*>(&THPVariableFunctions)) < 0) {
+    throw python_error();
+  }
+  // PyType_GenericNew returns a new reference
+  THPVariableFunctionsModule = PyType_GenericNew(&THPVariableFunctions, Py_None, Py_None);
+  // PyModule_AddObject steals a reference
+  if (PyModule_AddObject(module, "_VariableFunctions", THPVariableFunctionsModule) < 0) {
+    throw python_error();
+  }
+}
+
+}}  // namespace torch::autograd

From ba0e6a1e03d110b7a6b95de7c80651ec623e8135 Mon Sep 17 00:00:00 2001
From: John Clow <jclow@fb.com>
Date: Wed, 25 Aug 2021 15:27:37 -0700
Subject: [PATCH 231/530] [EASY] Update the clang-tidy error message (#63370)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63370

As shown by this CI run, the actual thing that is incorrect is the prompt.
https://github.com/pytorch/pytorch/actions/runs/1137298261

The CI runs the below command instead of the original command.
The original command errors out when importing another file on line 1.
Trying to fix the code to work with the original command causes the CI to error out.

We should actually ask the user to run
`python3 -m tools.linter.install.clang_tidy`

Test Plan: Imported from OSS

Reviewed By: janeyx99, heitorschueroff

Differential Revision: D30530216

Pulled By: Gamrix

fbshipit-source-id: 2a2b8d539dcc2839e4000c13e82c207fa89bfc9f
---
 tools/linter/clang_tidy/__main__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/linter/clang_tidy/__main__.py b/tools/linter/clang_tidy/__main__.py
index b99c1f5366848..1846916c26f3d 100644
--- a/tools/linter/clang_tidy/__main__.py
+++ b/tools/linter/clang_tidy/__main__.py
@@ -184,7 +184,8 @@ def main() -> None:
             f"Could not find '{options.clang_tidy_exe}'\n"
             + "We provide a custom build of clang-tidy that has additional checks.\n"
             + "You can install it by running:\n"
-            + "$ python3 tools/linter/install/clang_tidy.py"
+            + "$ python3 -m tools.linter.install.clang_tidy \n"
+            + "from the pytorch folder"
         )
         raise RuntimeError(msg)
 

From 1be1c901aabd3ddcf55af3ee869e611b7f3f43b6 Mon Sep 17 00:00:00 2001
From: David Riazati <driazati@users.noreply.github.com>
Date: Wed, 25 Aug 2021 15:54:31 -0700
Subject: [PATCH 232/530] Remove render_test_results job (#63877)

Summary:
This removes the `render_test_results` job we had before which had been causing some confusion among devs when it failed and isn't really necessary now that we can actually render test results on the PR HUD.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63877

Reviewed By: walterddr, janeyx99

Differential Revision: D30546705

Pulled By: driazati

fbshipit-source-id: 55fdafdb6f80924d941ffc15ee10787cb54f34a1
---
 .github/scripts/generate_ci_workflows.py      |  7 +-
 .github/templates/bazel_ci_workflow.yml.j2    | 68 +---------------
 .github/templates/common.yml.j2               | 25 ++++++
 .github/templates/linux_ci_workflow.yml.j2    | 80 ++-----------------
 .github/templates/windows_ci_workflow.yml.j2  | 71 +---------------
 ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 78 ++++--------------
 ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 78 ++++--------------
 ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 78 ++++--------------
 ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 78 ++++--------------
 .../generated-linux-xenial-py3.6-gcc5.4.yml   | 78 ++++--------------
 ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml | 67 +++-------------
 ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 78 ++++--------------
 ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 67 +++-------------
 .../generated-win-vs2019-cpu-py3.yml          | 67 +++-------------
 .../generated-win-vs2019-cuda10.1-py3.yml     | 67 +++-------------
 .../generated-win-vs2019-cuda11.3-py3.yml     | 67 +++-------------
 16 files changed, 173 insertions(+), 881 deletions(-)

diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index 946d8da6a29ad..f1b962521b18d 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -191,8 +191,10 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         with open(output_file_path, "w") as output_file:
             GENERATED = "generated"  # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file
             output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"])
-            output_file.write(workflow_template.render(asdict(self)))
-            output_file.write("\n")
+            content = workflow_template.render(asdict(self))
+            output_file.write(content)
+            if content[-1] != "\n":
+                output_file.write("\n")
         print(output_file_path)
 
 
@@ -504,6 +506,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
     jinja_env = jinja2.Environment(
         variable_start_string="!{{",
         loader=jinja2.FileSystemLoader(str(GITHUB_DIR.joinpath("templates"))),
+        undefined=jinja2.StrictUndefined,
     )
     template_and_workflows = [
         (jinja_env.get_template("linux_ci_workflow.yml.j2"), LINUX_WORKFLOWS),
diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2
index 016a11bc39277..d25ffe6d8a7e5 100644
--- a/.github/templates/bazel_ci_workflow.yml.j2
+++ b/.github/templates/bazel_ci_workflow.yml.j2
@@ -44,6 +44,7 @@ on:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      !{{ common.display_ec2_information() }}
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -85,9 +86,7 @@ on:
             -w /var/lib/jenkins/workspace \
             "${DOCKER_IMAGE}" \
             sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/build.sh'
-      - name: Parse ref
-        id: parse-ref
-        run: .github/scripts/parse_ref.py
+      !{{ common.parse_ref() }}
       - name: Display and upload binary build size statistics (Click Me)
         # temporary hack: set CIRCLE_* vars, until we update
         # tools/stats/print_test_stats.py to natively support GitHub Actions
@@ -156,71 +155,10 @@ on:
           if-no-files-found: error
           path:
             test-reports-*.zip
+      !{{ common.upload_test_statistics(build_environment) }}
       - name: Clean up docker images
         if: always()
         run: |
           # Prune all of the docker images
           docker system prune -af
 {%- endblock %}
-{% block render_test_results +%}
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [build-and-test, !{{ ciflow_config.root_job_name }}]
-    if: ${{ needs.build-and-test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports
-          path: .
-      - name: Unzip test reports
-        run: |
-          # Should preserve paths so reports should still be in test/test-reports
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
-      - name: Parse ref
-        id: parse-ref
-        run: .github/scripts/parse_ref.py
-      - name: Display and upload test statistics (Click Me)
-        # temporary hack: set CIRCLE_* vars, until we update
-        # tools/stats/print_test_stats.py to natively support GitHub Actions
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          JOB_BASE_NAME: !{{ build_environment }}-build-and-test
-          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
-          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
-          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
-        run: |
-          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
-{%- endblock %}
diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2
index 12108f1f95f46..bf72898d04c25 100644
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@@ -4,3 +4,28 @@
         run: |
           .github/scripts/display_ec2_information.sh
 {%- endmacro -%}
+
+{%- macro parse_ref() -%}
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+{%- endmacro -%}
+
+{%- macro upload_test_statistics(build_environment) -%}
+      - name: Display and upload test statistics (Click Me)
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: !{{ build_environment }}-test
+          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
+          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
+          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+{%- endmacro -%}
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 767760bf24d25..f63685295bbce 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -194,9 +194,7 @@ jobs:
             -w /var/lib/jenkins/workspace \
             "${DOCKER_IMAGE}" \
             sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
-      - name: Parse ref
-        id: parse-ref
-        run: .github/scripts/parse_ref.py
+      !{{ common.parse_ref() }}
       - name: Display and upload binary build size statistics (Click Me)
         # temporary hack: set CIRCLE_* vars, until we update
         # tools/stats/print_test_stats.py to natively support GitHub Actions
@@ -424,6 +422,8 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
+      !{{ common.parse_ref() }}
+      !{{ common.upload_test_statistics(build_environment) }}
       - name: Hold runner for 2 hours or until ssh sessions have drained
         # Always hold for active ssh sessions
         if: always()
@@ -437,76 +437,7 @@ jobs:
           docker system prune -af
 {% endblock %}
 {%- endif -%}
-{%- if not is_libtorch %}
-{% block render_test_results +%}
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, !{{ ciflow_config.root_job_name }}]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      !{{ common.display_ec2_information() }}
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          # Should preserve paths so reports should still be in test/test-reports
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
-      - name: Parse ref
-        id: parse-ref
-        run: .github/scripts/parse_ref.py
-      - name: Display and upload test statistics (Click Me)
-        # temporary hack: set CIRCLE_* vars, until we update
-        # tools/stats/print_test_stats.py to natively support GitHub Actions
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          JOB_BASE_NAME: !{{ build_environment }}-test
-          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
-          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
-          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
-        run: |
-          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
-{%- endblock %}
-{%- endif -%}
-  {%- if enable_doc_jobs %}
-
+{%- if enable_doc_jobs %}
   pytorch_python_doc_build:
     runs-on: linux.2xlarge
     needs: [calculate-docker-image, build, !{{ ciflow_config.root_job_name }}]
@@ -608,5 +539,4 @@ jobs:
         run: |
           # Prune all of the docker images
           docker system prune -af
-
-  {%- endif -%}
+{%- endif -%}
diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2
index 6756bf4720ac6..1be7b325306d5 100644
--- a/.github/templates/windows_ci_workflow.yml.j2
+++ b/.github/templates/windows_ci_workflow.yml.j2
@@ -275,78 +275,11 @@ jobs:
           path:
             pytorch-${{ github.run_id }}/test-reports-*.zip
       !{{ wait_and_kill_ssh() }}
+      !{{ common.parse_ref() }}
+      !{{ common.upload_test_statistics(build_environment) }}
       - name: Cleanup workspace
         if: always()
         shell: bash
         # Should remove the entirety of pytorch-${{ github.run_id }}
         run: |
           rm -rf ./*
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, !{{ ciflow_config.root_job_name }}]
-{%- if only_build_on_pull_request %}
-    if: ${{ github.event_name == 'push' && (needs.test.result != 'skipped' || failure()) }}
-{%- else %}
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-{%- endif %}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    # TODO: Make this into a composite step
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      !{{ common.display_ec2_information() }}
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
-      - name: Parse ref
-        id: parse-ref
-        run: .github/scripts/parse_ref.py
-      - name: Display and upload test statistics (Click Me)
-        # temporary hack: set CIRCLE_* vars, until we update
-        # tools/stats/print_test_stats.py to natively support GitHub Actions
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          JOB_BASE_NAME: !{{ build_environment }}-test
-          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
-          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
-          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
-        run: |
-          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 5a4b6c6a56c78..0b3dddd3930e5 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -395,70 +395,6 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-          # Prune all of the docker images
-          docker system prune -af
-
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, ciflow_should_run]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          # Should preserve paths so reports should still be in test/test-reports
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
@@ -473,5 +409,19 @@ jobs:
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 1226715485f21..624e9d0d92c5b 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -399,70 +399,6 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-          # Prune all of the docker images
-          docker system prune -af
-
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, ciflow_should_run]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          # Should preserve paths so reports should still be in test/test-reports
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
@@ -477,5 +413,19 @@ jobs:
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 38321b1834b26..99a9f1f778f9f 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -395,70 +395,6 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-          # Prune all of the docker images
-          docker system prune -af
-
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, ciflow_should_run]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          # Should preserve paths so reports should still be in test/test-reports
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
@@ -473,5 +409,19 @@ jobs:
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 2daf432ae76c8..be56b56ee715b 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -395,70 +395,6 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-          # Prune all of the docker images
-          docker system prune -af
-
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, ciflow_should_run]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          # Should preserve paths so reports should still be in test/test-reports
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
@@ -473,5 +409,19 @@ jobs:
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 3551fe9845218..c1b877c7f9c10 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -395,70 +395,6 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-          # Prune all of the docker images
-          docker system prune -af
-
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, ciflow_should_run]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          # Should preserve paths so reports should still be in test/test-reports
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
@@ -473,8 +409,22 @@ jobs:
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # Prune all of the docker images
+          docker system prune -af
 
   pytorch_python_doc_build:
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 9e787e4ba3845..7ca389635bf56 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -138,6 +138,10 @@ jobs:
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -250,69 +254,24 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Prune all of the docker images
-          docker system prune -af
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [build-and-test, ciflow_should_run]
-    if: ${{ needs.build-and-test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports
-          path: .
-      - name: Unzip test reports
-        run: |
-          # Should preserve paths so reports should still be in test/test-reports
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
-      - name: Parse ref
-        id: parse-ref
-        run: .github/scripts/parse_ref.py
       - name: Display and upload test statistics (Click Me)
         # temporary hack: set CIRCLE_* vars, until we update
         # tools/stats/print_test_stats.py to natively support GitHub Actions
         env:
           AWS_DEFAULT_REGION: us-east-1
           CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          JOB_BASE_NAME: linux-xenial-py3.6-gcc7-bazel-test-build-and-test
+          JOB_BASE_NAME: linux-xenial-py3.6-gcc7-bazel-test-test
           CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index c87397849106e..375c4b65aaf89 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -393,70 +393,6 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-          # Prune all of the docker images
-          docker system prune -af
-
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, ciflow_should_run]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          # Should preserve paths so reports should still be in test/test-reports
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
@@ -471,5 +407,19 @@ jobs:
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
index 2c673ccce0f43..ce4540b79cee7 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
@@ -249,64 +249,6 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-      - name: Cleanup workspace
-        if: always()
-        shell: bash
-        # Should remove the entirety of pytorch-${{ github.run_id }}
-        run: |
-          rm -rf ./*
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, ciflow_should_run]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    # TODO: Make this into a composite step
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
@@ -321,5 +263,14 @@ jobs:
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index 54362c903f7d0..d868d19d0fc2c 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -233,64 +233,6 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-      - name: Cleanup workspace
-        if: always()
-        shell: bash
-        # Should remove the entirety of pytorch-${{ github.run_id }}
-        run: |
-          rm -rf ./*
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, ciflow_should_run]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    # TODO: Make this into a composite step
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
@@ -305,5 +247,14 @@ jobs:
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index c8497bd3029ee..4d4550c9ce06b 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -251,64 +251,6 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-      - name: Cleanup workspace
-        if: always()
-        shell: bash
-        # Should remove the entirety of pytorch-${{ github.run_id }}
-        run: |
-          rm -rf ./*
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, ciflow_should_run]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    # TODO: Make this into a composite step
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
@@ -323,5 +265,14 @@ jobs:
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*
diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index 205758657d9b2..c5ae48a888938 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -251,64 +251,6 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-      - name: Cleanup workspace
-        if: always()
-        shell: bash
-        # Should remove the entirety of pytorch-${{ github.run_id }}
-        run: |
-          rm -rf ./*
-
-  # this is a separate step from test because the log files from test are too
-  # long: basically, GitHub tries to render all of the log files when you click
-  # through an action causing extreme slowdown on actions that contain too many
-  # logs (like test); we can always move it back to the other one, but it
-  # doesn't create the best experience
-  render_test_results:
-    needs: [generate-test-matrix, test, ciflow_should_run]
-    if: ${{ needs.test.result != 'skipped' || failure() }}
-    runs-on: linux.2xlarge
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.render-matrix) }}
-      fail-fast: false
-    # TODO: Make this into a composite step
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow tools/stats/print_test_stats.py to use Git commands
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - uses: actions/download-artifact@v2
-        name: Download PyTorch Test Reports
-        with:
-          name: test-reports-${{ matrix.config }}
-          path: .
-      - name: Unzip test reports
-        run: |
-          unzip -o 'test-reports-*.zip'
-      - name: Install dependencies
-        # boto3 version copied from .circleci/docker/common/install_conda.sh
-        run: |
-          pip3 install -r requirements.txt
-          pip3 install boto3==1.16.34 junitparser rich
-      - name: Output Test Results (Click Me)
-        run: |
-          python3 tools/render_junit.py test
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
@@ -323,5 +265,14 @@ jobs:
           CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
           CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
         run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Cleanup workspace
+        if: always()
+        shell: bash
+        # Should remove the entirety of pytorch-${{ github.run_id }}
+        run: |
+          rm -rf ./*

From 57d4c6cf424892888866ed98551f769cb5656623 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 25 Aug 2021 16:42:14 -0700
Subject: [PATCH 233/530] =?UTF-8?q?replace=20`self.assertTrue(torch.allclo?=
 =?UTF-8?q?se(..))`=20with=20`self.assertEqual(=E2=80=A6)`=20(#63637)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63565

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63637

Reviewed By: malfet

Differential Revision: D30541266

Pulled By: mruberry

fbshipit-source-id: ab461949782c6908a589ea098fcfcf5c3e081ee6
---
 test/custom_operator/test_custom_ops.py       | 10 ++--
 test/jit/test_freezing.py                     | 12 ++---
 test/jit/test_tracer.py                       |  4 +-
 test/package/test_directory_reader.py         |  2 +-
 test/package/test_model.py                    |  8 ++--
 test/package/test_package_fx.py               |  8 ++--
 test/package/test_package_script.py           | 16 +++----
 .../quantization/core/test_workflow_module.py |  6 +--
 test/quantization/core/test_workflow_ops.py   | 22 ++++-----
 test/quantization/fx/test_equalize_fx.py      | 10 ++--
 test/quantization/fx/test_numeric_suite_fx.py |  6 +--
 test/quantization/fx/test_quantize_fx.py      |  2 +-
 test/test_autograd.py                         |  6 +--
 test/test_bundled_images.py                   |  4 +-
 test/test_cuda.py                             | 14 +++---
 test/test_jit.py                              |  6 +--
 test/test_nn.py                               | 46 +++++++++----------
 test/test_overrides.py                        | 11 +++--
 test/test_spectral_ops.py                     |  2 +-
 torch/testing/_internal/common_jit.py         |  2 +-
 torch/testing/_internal/jit_utils.py          |  2 +-
 21 files changed, 101 insertions(+), 98 deletions(-)

diff --git a/test/custom_operator/test_custom_ops.py b/test/custom_operator/test_custom_ops.py
index 3937abde91476..356b4932d49ac 100644
--- a/test/custom_operator/test_custom_ops.py
+++ b/test/custom_operator/test_custom_ops.py
@@ -44,8 +44,8 @@ def test_calling_custom_op_with_autograd(self):
         output.sum().backward(go, False, True)
         grad = torch.ones(5, 5)
 
-        self.assertTrue(torch.allclose(x.grad, y + grad))
-        self.assertTrue(torch.allclose(y.grad, x + grad * 2))
+        self.assertEqual(x.grad, y + grad)
+        self.assertEqual(y.grad, x + grad * 2)
 
         # Test with optional arg.
         x.grad.zero_()
@@ -56,9 +56,9 @@ def test_calling_custom_op_with_autograd(self):
 
         go = torch.ones((), requires_grad=True)
         output.sum().backward(go, False, True)
-        self.assertTrue(torch.allclose(x.grad, y + grad))
-        self.assertTrue(torch.allclose(y.grad, x + grad * 2))
-        self.assertTrue(torch.allclose(z.grad, grad))
+        self.assertEqual(x.grad, y + grad)
+        self.assertEqual(y.grad, x + grad * 2)
+        self.assertEqual(z.grad, grad)
 
     def test_calling_custom_op_with_autograd_in_nograd_mode(self):
         with torch.no_grad():
diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py
index 8e07af06b70ea..e9317b11412a9 100644
--- a/test/jit/test_freezing.py
+++ b/test/jit/test_freezing.py
@@ -1877,7 +1877,7 @@ def forward(self, x):
             N, C, H, W, = 10, 3, 224, 224
             inp = torch.randn(N, C, H, W)
             self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph)
-            self.assertTrue(torch.allclose(model(inp), mod(inp)))
+            self.assertEqual(model(inp), mod(inp))
 
     @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
     def test_pool2d_batchnorm(self):
@@ -1901,7 +1901,7 @@ def test_pool2d_batchnorm(self):
                 self.run_pass('dce', mod.graph)
                 self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph)
                 FileCheck().check("aten::to_dense").check_next("return").run(mod.graph)
-                self.assertTrue(torch.allclose(sub_model(inp), mod(inp)))
+                self.assertEqual(sub_model(inp), mod(inp))
 
     @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
     def test_pool3d_batchnorm(self):
@@ -1925,7 +1925,7 @@ def test_pool3d_batchnorm(self):
                 self.run_pass('dce', mod.graph)
                 self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph)
                 FileCheck().check("aten::to_dense").check_next("return").run(mod.graph)
-                self.assertTrue(torch.allclose(sub_model(inp), mod(inp)))
+                self.assertEqual(sub_model(inp), mod(inp))
 
     @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
     @skipIfNoTorchVision
@@ -1964,7 +1964,7 @@ def forward(self, x):
                                 check_count("aten::to_dense", 1, exactly=True).run(mod.graph))
                         else:
                             FileCheck().check_count("aten::to_dense", 1, exactly=True).check("aten::layer_norm").run(mod.graph)
-                        self.assertTrue(torch.allclose(sub_model(param[2]), mod(param[2]), 1e-04, 1e-04))
+                        self.assertEqual(sub_model(param[2]), mod(param[2]), rtol=1e-04, atol=1e-04)
 
     @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
     @skipIfNoTorchVision
@@ -2003,7 +2003,7 @@ def forward(self, x):
                 inp = torch.randn(N, C, H, W)
                 self.run_pass("convert_frozen_ops_to_mkldnn", mod.graph)
                 FileCheck().check_count("aten::to_dense", 1, exactly=True).run(mod.graph)
-                self.assertTrue(torch.allclose(sub_model(inp), mod(inp)))
+                self.assertEqual(sub_model(inp), mod(inp))
 
     @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
     def test_hardswish_hardsigmoid(self):
@@ -2030,7 +2030,7 @@ def test_hardswish_hardsigmoid(self):
                         x = torch.rand(size)
                         # `inplace=False` is intentional, otherwise we modify the input
                         # and we aren't testing aten impls anyways
-                        self.assertTrue(torch.allclose(aten_op(x, inplace=False), m(x).to_dense()))
+                        self.assertEqual(aten_op(x, inplace=False), m(x).to_dense())
 
     @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
     def test_scalar_mul(self):
diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py
index 247072fb3e94d..1d95dc8d0d8a4 100644
--- a/test/jit/test_tracer.py
+++ b/test/jit/test_tracer.py
@@ -163,13 +163,13 @@ def forward(self, x, y):
         eager_out = mod(*test_inputs)
         traced_out = traced_func(*test_inputs)
         self.assertNotWarn(lambda: traced_func(*test_inputs), "Shouldn't throw slicing related warn here")
-        self.assertTrue(torch.allclose(eager_out, traced_out))
+        self.assertEqual(eager_out, traced_out)
 
         test_inputs = (torch.randint(0, 50, (50, 50)), torch.tensor(12))
         eager_out = mod(*test_inputs)
         traced_out = traced_func(*test_inputs)
         self.assertNotWarn(lambda: traced_func(*test_inputs), "Shouldn't throw slicing related warn here")
-        self.assertTrue(torch.allclose(eager_out, traced_out))
+        self.assertEqual(eager_out, traced_out)
 
 
     def test_typeas_trace_check(self):
diff --git a/test/package/test_directory_reader.py b/test/package/test_directory_reader.py
index 93968d6e1bf92..576a7f0c064cd 100644
--- a/test/package/test_directory_reader.py
+++ b/test/package/test_directory_reader.py
@@ -61,7 +61,7 @@ def test_loading_pickle(self):
             importer = PackageImporter(Path(temp_dir) / Path(filename).name)
             dir_mod = importer.load_pickle("model", "model.pkl")
             input = torch.rand(1, 3, 224, 224)
-            self.assertTrue(torch.allclose(dir_mod(input), resnet(input)))
+            self.assertEqual(dir_mod(input), resnet(input))
 
     def test_loading_module(self):
         """
diff --git a/test/package/test_model.py b/test/package/test_model.py
index f5e08b6bfa83c..dc67ff5d89d2e 100644
--- a/test/package/test_model.py
+++ b/test/package/test_model.py
@@ -49,7 +49,7 @@ def test_resnet(self):
         # test that it works
         input = torch.rand(1, 3, 224, 224)
         ref = resnet(input)
-        self.assertTrue(torch.allclose(r2(input), ref))
+        self.assertEqual(r2(input), ref)
 
         # functions exist also to get at the private modules in each package
         torchvision = i.import_module("torchvision")
@@ -81,7 +81,7 @@ def test_resnet(self):
 
         i2 = PackageImporter(f2)
         r3 = i2.load_pickle("model", "model.pkl")
-        self.assertTrue(torch.allclose(r3(input), ref))
+        self.assertEqual(r3(input), ref)
 
     @skipIfNoTorchVision
     def test_model_save(self):
@@ -159,7 +159,7 @@ def load():
             r = the_model(input)
             results.append(r)
 
-        self.assertTrue(torch.allclose(*results))
+        self.assertEqual(*results)
 
     @skipIfNoTorchVision
     def test_script_resnet(self):
@@ -188,7 +188,7 @@ def test_script_resnet(self):
         loaded = torch.jit.load(f2)
 
         input = torch.rand(1, 3, 224, 224)
-        self.assertTrue(torch.allclose((loaded(input)), resnet(input)))
+        self.assertEqual(loaded(input), resnet(input))
 
 
 if __name__ == "__main__":
diff --git a/test/package/test_package_fx.py b/test/package/test_package_fx.py
index 7f31014a8ec04..64d431c0a3e6b 100644
--- a/test/package/test_package_fx.py
+++ b/test/package/test_package_fx.py
@@ -36,7 +36,7 @@ def forward(self, x):
         pi = PackageImporter(f)
         loaded_traced = pi.load_pickle("model", "model.pkl")
         input = torch.rand(2, 3)
-        self.assertTrue(torch.allclose(loaded_traced(input), traced(input)))
+        self.assertEqual(loaded_traced(input), traced(input))
 
     def test_package_then_fx(self):
         from package_a.test_module import SimpleTest
@@ -52,7 +52,7 @@ def test_package_then_fx(self):
         loaded = pi.load_pickle("model", "model.pkl")
         traced = symbolic_trace(loaded)
         input = torch.rand(2, 3)
-        self.assertTrue(torch.allclose(loaded(input), traced(input)))
+        self.assertEqual(loaded(input), traced(input))
 
     def test_package_fx_package(self):
         from package_a.test_module import SimpleTest
@@ -87,7 +87,7 @@ def test_package_fx_package(self):
         loaded2 = pi2.load_pickle("model", "model.pkl")
 
         input = torch.rand(2, 3)
-        self.assertTrue(torch.allclose(loaded(input), loaded2(input)))
+        self.assertEqual(loaded(input), loaded2(input))
 
     def test_package_fx_with_imports(self):
         import package_a.subpackage
@@ -158,7 +158,7 @@ def __init__(self, root, graph, info):
         self.assertEqual(loaded_gm.info, "secret")
 
         input_x = torch.randn(3)
-        self.assertTrue(torch.allclose(loaded_gm(input_x), gm(input_x)))
+        self.assertEqual(loaded_gm(input_x), gm(input_x))
 
 
 if __name__ == "__main__":
diff --git a/test/package/test_package_script.py b/test/package/test_package_script.py
index 3bbaed0501ca1..ecacd79fb6bf7 100644
--- a/test/package/test_package_script.py
+++ b/test/package/test_package_script.py
@@ -51,7 +51,7 @@ def test_package_interface(self):
 
         input = torch.tensor(1)
 
-        self.assertTrue(torch.allclose(scripted(input), scripted_loaded(input)))
+        self.assertEqual(scripted(input), scripted_loaded(input))
 
     def test_different_package_interface(self):
         """Test a case where the interface defined in the package is
@@ -149,7 +149,7 @@ def __init__(self, x):
         input = torch.rand(2, 3)
         loaded_script_class = diff_fake.MyScriptClass(input)
         orig_script_class = fake.MyScriptClass(input)
-        self.assertTrue(torch.allclose(loaded_script_class.bar, orig_script_class.foo))
+        self.assertEqual(loaded_script_class.bar, orig_script_class.foo)
 
     def test_save_scriptmodule(self):
         """
@@ -506,7 +506,7 @@ def test_save_shared_tensors(self):
         self.assertTrue(len(file_structure.children[".data"].children) == 1)
 
         input = torch.rand(2, 3, 4)
-        self.assertTrue(torch.allclose(loaded_mod_1(input), mod1(input)))
+        self.assertEqual(loaded_mod_1(input), mod1(input))
 
     def test_load_shared_tensors(self):
         """
@@ -630,7 +630,7 @@ def test_saving_and_scripting_packaged_mod(self):
         loaded_mod = importer_0.load_pickle("model", "model.pkl")
 
         input = torch.rand(2, 3)
-        self.assertTrue(torch.allclose(loaded_mod(input), orig_mod(input)))
+        self.assertEqual(loaded_mod(input), orig_mod(input))
 
         scripted_mod = torch.jit.script(loaded_mod)
 
@@ -643,7 +643,7 @@ def test_saving_and_scripting_packaged_mod(self):
         importer_1 = PackageImporter(buffer_1)
         loaded_mod_scripted = importer_1.load_pickle("res", "scripted_mod.pkl")
 
-        self.assertTrue(torch.allclose(loaded_mod_scripted(input), orig_mod(input)))
+        self.assertEqual(loaded_mod_scripted(input), orig_mod(input))
 
     def test_mixing_packaged_and_inline_modules(self):
         """
@@ -680,7 +680,7 @@ def forward(self, input: str):
         loaded_imported = importer.load_pickle("model", "imported.pkl")
 
         input = torch.rand(2, 3)
-        self.assertTrue(torch.allclose(loaded_imported(input), imported_mod(input)))
+        self.assertEqual(loaded_imported(input), imported_mod(input))
         self.assertEqual(loaded_inline("input"), inline_mod("input"))
 
     @skipIfNoTorchVision
@@ -721,8 +721,8 @@ def a_non_torch_leaf(a, b):
         loaded_imported = importer.load_pickle("model", "imported.pkl")
 
         input = torch.rand(2, 3)
-        self.assertTrue(torch.allclose(loaded_imported(input), imported_mod(input)))
-        self.assertTrue(torch.allclose(loaded_inline(input), inline_mod(input)))
+        self.assertEqual(loaded_imported(input), imported_mod(input))
+        self.assertEqual(loaded_inline(input), inline_mod(input))
 
     def test_tensor_sharing_pickle(self):
         """Test that saving a ScriptModule and a separately saving a tensor
diff --git a/test/quantization/core/test_workflow_module.py b/test/quantization/core/test_workflow_module.py
index 2298653e878f8..b7782ecf9c1bd 100644
--- a/test/quantization/core/test_workflow_module.py
+++ b/test/quantization/core/test_workflow_module.py
@@ -205,11 +205,11 @@ def test_per_channel_observers(self, qdtype, qscheme, ch_axis, reduce_range):
             if reduce_range:
                 ref_scales = [s * 255 / 127 for s in ref_scales]
                 ref_zero_points = [math.floor(z / 2) for z in ref_zero_points]
-            self.assertTrue(torch.allclose(qparams[0], torch.tensor(ref_scales, dtype=qparams[0].dtype), atol=0.0001))
+            self.assertEqual(qparams[0], torch.tensor(ref_scales, dtype=qparams[0].dtype), rtol=1e-5, atol=0.0001)
             if qscheme == torch.per_channel_affine_float_qparams:
-                self.assertTrue(torch.allclose(qparams[1], torch.tensor(ref_zero_points, dtype=qparams[1].dtype), atol=1))
+                self.assertEqual(qparams[1], torch.tensor(ref_zero_points, dtype=qparams[1].dtype), rtol=1e-5, atol=1)
             else:
-                self.assertTrue(torch.allclose(qparams[1], torch.tensor(ref_zero_points, dtype=qparams[1].dtype)))
+                self.assertEqual(qparams[1], torch.tensor(ref_zero_points, dtype=qparams[1].dtype))
 
 
             # Test for serializability
diff --git a/test/quantization/core/test_workflow_ops.py b/test/quantization/core/test_workflow_ops.py
index 9fcf5ac138f3e..60cd04345be85 100644
--- a/test/quantization/core/test_workflow_ops.py
+++ b/test/quantization/core/test_workflow_ops.py
@@ -312,13 +312,13 @@ def test_forward_per_tensor_half_precision_numerics(self):
             X1 = torch.randn(5, 5).to(torch.float16)
             Y1 = torch.fake_quantize_per_tensor_affine(X1, scale, zero, mini, maxi)
             Y1r = _fake_quantize_per_tensor_affine_reference(X1, scale, zero, mini, maxi)
-            self.assertTrue(torch.allclose(Y1, Y1r, rtol=tolerance, atol=tolerance))
+            self.assertEqual(Y1, Y1r, rtol=tolerance, atol=tolerance)
 
         # to force overflow
         X2 = torch.tensor(2**15 + .01).to(torch.float16)
         Y2 = torch.fake_quantize_per_tensor_affine(X2, scale, zero, mini, maxi)
         Y2r = _fake_quantize_per_tensor_affine_reference(X2, scale, zero, mini, maxi)
-        self.assertTrue(torch.allclose(Y2, Y2r, rtol=tolerance, atol=tolerance))
+        self.assertEqual(Y2, Y2r, rtol=tolerance, atol=tolerance)
 
         scale = 10
 
@@ -326,7 +326,7 @@ def test_forward_per_tensor_half_precision_numerics(self):
         X3 = torch.tensor(2**-24).to(torch.float16)
         Y3 = torch.fake_quantize_per_tensor_affine(X3, scale, zero, mini, maxi)
         Y3r = _fake_quantize_per_tensor_affine_reference(X3, scale, zero, mini, maxi)
-        self.assertTrue(torch.allclose(Y3, Y3r, rtol=tolerance, atol=tolerance))
+        self.assertEqual(Y3, Y3r, rtol=tolerance, atol=tolerance)
 
     def _test_forward_per_tensor_cachemask_impl(self, device):
         float_types = (torch.float32, torch.float16, torch.float64)
@@ -347,7 +347,7 @@ def _test_forward_per_tensor_cachemask_impl(self, device):
                 X, scale, zero_point, quant_min, quant_max)
             Y_ref = _fake_quantize_per_tensor_affine_reference(
                 X, scale, zero_point, quant_min, quant_max).to(device)
-            self.assertTrue(torch.allclose(Y_test, Y_ref, rtol=tolerance, atol=tolerance))
+            self.assertEqual(Y_test, Y_ref, rtol=tolerance, atol=tolerance)
             self.assertTrue(Y_test.dtype == float_type)
 
     def test_forward_per_tensor_cachemask_cpu(self):
@@ -380,14 +380,14 @@ def _test_backward_per_tensor_cachemask_impl(self, device):
                 X, scale, zero_point, quant_min, quant_max)
             Y_ref = _fake_quantize_per_tensor_affine_reference(
                 X, scale, zero_point, quant_min, quant_max).to(device)
-            self.assertTrue(torch.allclose(Y_test, Y_ref, rtol=tolerance, atol=tolerance))
+            self.assertEqual(Y_test, Y_ref, rtol=tolerance, atol=tolerance)
 
             # backward pass
             dout = torch.rand_like(X, dtype=torch.float).to(device)
             dX = _fake_quantize_per_tensor_affine_grad_reference(
                 dout, X, scale, zero_point, quant_min, quant_max)
             Y_test.backward(dout)
-            self.assertTrue(torch.allclose(dX, X.grad))
+            self.assertEqual(dX, X.grad)
             self.assertTrue(X.grad.dtype == float_type)
 
     def test_backward_per_tensor_cachemask_cpu(self):
@@ -729,14 +729,14 @@ def test_forward_per_channel_half_precision_numerics(self):
             X1 = torch.randn(4, 5).to(torch.float16)
             Y1 = torch.fake_quantize_per_channel_affine(X1, scale, zero, axis, mini, maxi)
             Y1r = _fake_quantize_per_channel_affine_reference(X1, scale, zero, axis, mini, maxi)
-            self.assertTrue(torch.allclose(Y1, Y1r, rtol=tolerance, atol=tolerance))
+            self.assertEqual(Y1, Y1r, rtol=tolerance, atol=tolerance)
 
         # to force overflow
         X2 = torch.randn(4, 5).to(torch.float16)
         X2[0, 0] = 2**15 + .01
         Y2 = torch.fake_quantize_per_channel_affine(X2, scale, zero, axis, mini, maxi)
         Y2r = _fake_quantize_per_channel_affine_reference(X2, scale, zero, axis, mini, maxi)
-        self.assertTrue(torch.allclose(Y2, Y2r, rtol=tolerance, atol=tolerance))
+        self.assertEqual(Y2, Y2r, rtol=tolerance, atol=tolerance)
 
         scale = torch.zeros(5) + 10
 
@@ -745,7 +745,7 @@ def test_forward_per_channel_half_precision_numerics(self):
         X3[0, 0] = 2**-24
         Y3 = torch.fake_quantize_per_channel_affine(X3, scale, zero, axis, mini, maxi)
         Y3r = _fake_quantize_per_channel_affine_reference(X3, scale, zero, axis, mini, maxi)
-        self.assertTrue(torch.allclose(Y3, Y3r, rtol=tolerance, atol=tolerance))
+        self.assertEqual(Y3, Y3r, rtol=tolerance, atol=tolerance)
 
     def _test_learnable_forward_per_channel(self, X_base, device, scale_base, zero_point_base, axis):
         r"""Tests the forward path of the learnable FakeQuantizePerTensorAffine op.
@@ -1160,7 +1160,7 @@ def test_fused_obs_fake_quant_backward_op(self, device) -> None:
 
         dX = _fake_quantize_per_tensor_affine_grad_reference(
             dout, x, x_scale, x_zero_point, 0, 255)
-        self.assertTrue(torch.allclose(dX, x.grad))
+        self.assertEqual(dX, x.grad)
         self.assertTrue(x.grad.dtype == torch.float32)
 
     @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),)
@@ -1206,7 +1206,7 @@ def test_fused_backward_op_fake_quant_off(self, device) -> None:
 
         dX = _fake_quantize_per_tensor_affine_grad_reference(
             dout, x, x_scale, x_zero_point, 0, 255)
-        self.assertTrue(torch.allclose(dX, x.grad))
+        self.assertEqual(dX, x.grad)
         self.assertTrue(x.grad.dtype == torch.float32)
 
 if __name__ == '__main__':
diff --git a/test/quantization/fx/test_equalize_fx.py b/test/quantization/fx/test_equalize_fx.py
index 7c17d1296daac..a74b1744e7cc3 100644
--- a/test/quantization/fx/test_equalize_fx.py
+++ b/test/quantization/fx/test_equalize_fx.py
@@ -217,10 +217,10 @@ def test_input_weight_eq_observer(self, ndim, input_qdtype, input_qscheme, weigh
             ref_zero_points = -128 if weight_qdtype is torch.qint8 else 0
             ref_zero_points = ref_zero_points - np.round(ref_min_weights_scaled / ref_scales)
 
-        self.assertTrue(torch.allclose(weight_qparams[0], torch.tensor(
-            ref_scales, dtype=weight_qparams[0].dtype), atol=0.0001))
-        self.assertTrue(torch.allclose(weight_qparams[1], torch.tensor(
-            ref_zero_points, dtype=weight_qparams[1].dtype), atol=1))
+        self.assertEqual(weight_qparams[0], torch.tensor(
+            ref_scales, dtype=weight_qparams[0].dtype), rtol=1e-5, atol=0.0001)
+        self.assertEqual(weight_qparams[1], torch.tensor(
+            ref_zero_points, dtype=weight_qparams[1].dtype), rtol=1e-5, atol=1)
 
     def test_input_weight_equalization_prepare(self):
         """ Tests that graphs created after prepare_fx is as expected
@@ -783,7 +783,7 @@ def test_input_weight_equalization_results(self):
             prepared(x)
             equalized_and_quantized = convert_fx(prepared)  # Check if compile
             equalized_and_quantized_output = equalized_and_quantized(x)
-            self.assertTrue(torch.allclose(quantized_output, equalized_and_quantized_output, atol=0.1))
+            self.assertEqual(quantized_output, equalized_and_quantized_output, rtol=1e-5, atol=0.1)
 
     @skipIfNoFBGEMM
     def test_selective_equalization(self):
diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py
index 61062fba781e5..3e627f5e14419 100644
--- a/test/quantization/fx/test_numeric_suite_fx.py
+++ b/test/quantization/fx/test_numeric_suite_fx.py
@@ -1834,8 +1834,8 @@ def test_loggers_preserve_qat_numerics(self):
         mp_ns, mc_ns = add_loggers('fp32', mp, 'int8', mc, OutputLogger)
         ref_fp32_ns = mp_ns(datum)
         ref_int8_ns = mc_ns(datum)
-        self.assertTrue(torch.allclose(ref_fp32, ref_fp32_ns))
-        self.assertTrue(torch.allclose(ref_int8, ref_int8_ns))
+        self.assertEqual(ref_fp32, ref_fp32_ns)
+        self.assertEqual(ref_int8, ref_int8_ns)
 
     @skipIfNoFBGEMM
     def test_shadow_loggers_preserve_qat_numerics(self):
@@ -1852,7 +1852,7 @@ def test_shadow_loggers_preserve_qat_numerics(self):
 
         mc_shadows_mp = add_shadow_loggers('int8', mc, 'fp32', mp, OutputLogger)
         ref_shadow = mc_shadows_mp(datum)
-        self.assertTrue(torch.allclose(ref_fp32, ref_shadow))
+        self.assertEqual(ref_fp32, ref_shadow)
 
 class TestFXNumericSuiteCoreAPIsModels(FXNumericSuiteQuantizationTestCase):
     """
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 1bc6b610d1662..08474d2bc1d19 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -4668,7 +4668,7 @@ def _test_conv_transpose_impl(
             m2q = torch.quantization.convert(m2p)
             q_result2 = m2q(data)
             # verify results match
-            self.assertTrue(torch.allclose(q_result1, q_result2))
+            self.assertEqual(q_result1, q_result2)
 
     @unittest.skipUnless('qnnpack' in supported_qengines,
                          "This Pytorch Build has not been built with or does not support QNNPACK")
diff --git a/test/test_autograd.py b/test/test_autograd.py
index 126d9230fe687..4d416459c2af4 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -2801,11 +2801,11 @@ def test_var_mean_differentiable(self):
 
         r1 = var1 * var1 * mean1 * mean1
         r2 = var2 * var2 * mean2 * mean2
-        self.assertTrue(torch.allclose(r1, r2, rtol=0.01, atol=0.0))
+        self.assertEqual(r1, r2, rtol=0.01, atol=0.0)
 
         torch.autograd.backward(r1, grad)
         torch.autograd.backward(r2, grad)
-        self.assertTrue(torch.allclose(input1.grad, input2.grad, rtol=0.01, atol=0.0))
+        self.assertEqual(input1.grad, input2.grad, rtol=0.01, atol=0.0)
 
     @slowTest
     @skipIfNoLapack
@@ -5159,7 +5159,7 @@ def test_autograd_inplace_views_cross_dtype(self):
 
         # TODO: this is a bug!
         # once this is fixed, it should have the transpose removed:
-        # self.assertTrue(torch.allclose(non_inplace_grad, inplace_grad))
+        # self.assertEqual(non_inplace_grad, inplace_grad)
         self.assertEqual(non_inplace_grad.T, inplace_grad)
 
     def test_autograd_multiple_views_python(self):
diff --git a/test/test_bundled_images.py b/test/test_bundled_images.py
index 0c95ae39c582d..7efd40178a160 100644
--- a/test/test_bundled_images.py
+++ b/test/test_bundled_images.py
@@ -67,7 +67,7 @@ def forward(self, arg):
         self.assertEqual(len(inflated), 1)
         self.assertEqual(len(inflated[0]), 1)
         self.assertEqual(raw_data.shape, decoded_data.shape)
-        self.assertTrue(torch.allclose(raw_data, decoded_data, atol=0.1, rtol=1e-01))
+        self.assertEqual(raw_data, decoded_data, atol=0.1, rtol=1e-01)
 
         # Check if fb::image_decode_to_NCHW works as expected
         with open("caffe2/test/test_img/p1.jpg", "rb") as fp:
@@ -76,4 +76,4 @@ def forward(self, arg):
             byte_tensor = torch.tensor(list(fp.read())).byte()
             im2_tensor = torch.ops.fb.image_decode_to_NCHW(byte_tensor, weight, bias)
             self.assertEqual(raw_data.shape, im2_tensor.shape)
-            self.assertTrue(torch.allclose(raw_data, im2_tensor, atol=0.1, rtol=1e-01))
+            self.assertEqual(raw_data, im2_tensor, atol=0.1, rtol=1e-01)
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 55bab2ee4ebcd..e90cb1703c06e 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -2036,7 +2036,7 @@ def test_grad_scaling_unscale(self, dtype=torch.float):
             else:
                 self.assertEqual(found_inf, 0.0)
                 for grad in grads:
-                    self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7))
+                    self.assertEqual(grad, torch.ones_like(grad), rtol=1e-5, atol=1e-7)
 
         # When passing lists with mismatched dtypes to a raw
         # _amp_foreach_non_finite_check_and_unscale_ call,
@@ -2044,7 +2044,7 @@ def test_grad_scaling_unscale(self, dtype=torch.float):
         grads = [g.clone(), g.to(dtype=torch.float16)]
         torch._amp_foreach_non_finite_check_and_unscale_(grads, found_inf, inv_scale)
         for grad in grads:
-            self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7))
+            self.assertEqual(grad, torch.ones_like(grad), rtol=1e-5, atol=1e-7)
 
         # Passing lists with mismatched devices to a raw
         # _amp_foreach_non_finite_check_and_unscale_ call should raise errors.
@@ -2084,7 +2084,7 @@ def perfect_storm_grads(inject_inf):
                 # No inf was injected, ensures unscaling worked normally.
                 self.assertTrue(sum(v.item() for v in found_inf_per_device.values()) == 0)
                 for grad in grads:
-                    self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7))
+                    self.assertEqual(grad, torch.ones_like(grad), rtol=1e-5, atol=1e-7)
             else:
                 # inf was injected, ensures inf was found.
                 self.assertTrue(sum(v.item() for v in found_inf_per_device.values()) == 1)
@@ -2136,7 +2136,7 @@ def test_grad_scaling_unscale_sparse(self, device="cuda", dtype=torch.float):
         found_inf.zero_()
         found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf, False)[cur]
         self.assertEqual(found_inf, 0.0)
-        self.assertTrue(torch.allclose(p.grad.to_dense(), (s / 4).to_dense()))
+        self.assertEqual(p.grad.to_dense(), (s / 4).to_dense())
 
         v = torch.FloatTensor([16., 32., float('inf')])
         p.grad = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cuda", dtype=dtype)
@@ -2158,7 +2158,7 @@ def test_grad_scaling_unscale_sparse(self, device="cuda", dtype=torch.float):
         found_inf.zero_()
         found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf, True)[cur]
         self.assertEqual(found_inf, 0.0)
-        self.assertTrue(torch.allclose(p.grad.to_dense(), (s.half() / 4).to_dense()))
+        self.assertEqual(p.grad.to_dense(), (s.half() / 4).to_dense())
 
         # Creates fp16 sparse tensor with duplicated indices (uncoalesced).  The uncoalesced representation
         # does not overflow in fp16, but the coalesced representation would, because 64000 + 64000 > fp16 max.
@@ -2465,7 +2465,7 @@ def run(model0, model1, optimizer0, optimizer1, try_scaling_api):
 
             for c, s in zip(chain(mod_control0.parameters(), mod_control1.parameters()),
                             chain(mod_scaling0.parameters(), mod_scaling1.parameters())):
-                self.assertTrue(torch.allclose(c, s, atol=1e-7))
+                self.assertEqual(c, s, rtol=1e-5, atol=1e-7)
 
     @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
     def test_grad_scaling_multigpu(self):
@@ -2534,7 +2534,7 @@ def run(model0, model1, optimizer0, optimizer1, try_scaling_api):
 
             for c, s in zip(chain(mod_control0.parameters(), mod_control1.parameters()),
                             chain(mod_scaling0.parameters(), mod_scaling1.parameters())):
-                self.assertTrue(torch.allclose(c, s, atol=1e-7))
+                self.assertEqual(c, s, rtol=1e-5, atol=1e-7)
 
     def test_cublas_multiple_threads_same_device(self):
         # Note, these parameters should be very carefully tuned
diff --git a/test/test_jit.py b/test/test_jit.py
index 28de1722dde47..2595411c01848 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -14954,7 +14954,7 @@ def jit_multihead_attn_forward(query,                   # type: Tensor
                                                                   attn_mask=mask)[0]
         # print("rel. error: ")
         # print(jit_out / py_out - 1)
-        self.assertTrue(torch.allclose(jit_out, py_out, atol=5e-4, rtol=1e-4))
+        self.assertEqual(jit_out, py_out, atol=5e-4, rtol=1e-4)
 
     @unittest.skipIf(not RUN_CUDA, "no CUDA")
     def test_scriptmodule_multi_head_attn_cuda(self):
@@ -14990,7 +14990,7 @@ def forward(self, q, k, v):
                                                                   None, None, None, 0.0,
                                                                   model.mod.out_proj.weight,
                                                                   model.mod.out_proj.bias)[0]
-        self.assertTrue(torch.allclose(jit_out, py_out, atol=5e-4, rtol=1e-4))
+        self.assertEqual(jit_out, py_out, atol=5e-4, rtol=1e-4)
 
     @unittest.skipIf(not RUN_CUDA, "no CUDA")
     def test_scriptmodule_transformer_cuda(self):
@@ -15029,7 +15029,7 @@ def forward(self, q, k):
 
         # print(jit_out/py_out-1)
         # print(torch.allclose(jit_out, py_out, atol=5e-4, rtol=1e-4))
-        self.assertTrue(torch.allclose(jit_out, py_out, atol=5e-4, rtol=1e-4))
+        self.assertEqual(jit_out, py_out, atol=5e-4, rtol=1e-4)
 
     def test_list_python_op(self):
         def python_list_op(lst):
diff --git a/test/test_nn.py b/test/test_nn.py
index 8c3541aca0716..c6fe0b28b30ac 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -229,7 +229,7 @@ def test_doubletensor_avg_pool2d(self):
                 actual = torch.nn.functional.avg_pool2d(input[0], (i, j))
                 actual = actual.view(1, actual.numel())
                 expected = self._avg_pool2d(input, (i, j))
-                self.assertTrue(torch.allclose(actual, expected, rtol=0, atol=1e-5))
+                self.assertEqual(actual, expected, rtol=0, atol=1e-5)
 
     def test_avg_pool2d_with_zero_divisor(self):
         self.assertRaisesRegex(RuntimeError, "divisor must be not zero",
@@ -244,7 +244,7 @@ def test_doubletensor_avg_pool2d_with_divisor(self):
                     actual = F.avg_pool2d(input[0], (i, j), divisor_override=divisor)
                     actual = actual.view(1, actual.numel())
                     expected = self._sum_pool2d(input, (i, j)) / divisor
-                    self.assertTrue(torch.allclose(actual, expected, rtol=0, atol=1e-5))
+                    self.assertEqual(actual, expected, rtol=0, atol=1e-5)
 
     def test_doubletensor_avg_pool3d(self):
         h, w, d = 5, 6, 7
@@ -255,7 +255,7 @@ def test_doubletensor_avg_pool3d(self):
                     actual = torch.nn.functional.avg_pool3d(input.unsqueeze(0), (i, j, k))
                     actual = actual.view(1, actual.numel())
                     expected = self._avg_pool3d(input, (i, j, k))
-                    self.assertTrue(torch.allclose(actual, expected, rtol=0, atol=1e-5))
+                    self.assertEqual(actual, expected, rtol=0, atol=1e-5)
 
     def test_doubletensor_avg_pool3d_with_divisor(self):
         h, w, d = 6, 5, 7
@@ -267,7 +267,7 @@ def test_doubletensor_avg_pool3d_with_divisor(self):
                         actual = torch.nn.functional.avg_pool3d(input.unsqueeze(0), (i, j, k), divisor_override=divisor)
                         actual = actual.view(1, actual.numel())
                         expected = self._sum_pool3d(input, (i, j, k)) / divisor
-                        self.assertTrue(torch.allclose(actual, expected, rtol=0, atol=1e-5))
+                        self.assertEqual(actual, expected, rtol=0, atol=1e-5)
 
     def test_avg_pool3d_with_zero_divisor(self):
         self.assertRaisesRegex(RuntimeError, "divisor must be not zero",
@@ -2260,7 +2260,7 @@ def forward(self, x):
         self.assertNotIn("weight", model._parameters)
         # Result should be skew-symmetric
         A = model.weight
-        self.assertTrue(torch.allclose(A, -A.T))
+        self.assertEqual(A, -A.T)
         # Remove and check consistency
         parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
         self.assertFalse(hasattr(model, "parametrizations"))
@@ -2277,7 +2277,7 @@ def forward(self, x):
         self.assertNotIn("weight", model._parameters)
         # Result should be skew-symmetric
         A = model.weight
-        self.assertTrue(torch.allclose(A, -A.T))
+        self.assertEqual(A, -A.T)
         # Remove and check consistency
         parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
         self.assertFalse(hasattr(model, "parametrizations"))
@@ -2291,7 +2291,7 @@ def forward(self, x):
         # Result should be orthogonal
         X = model.weight
         Id = torch.eye(X.size(0), device=X.device)
-        self.assertTrue(torch.allclose(X.T @ X, Id))
+        self.assertEqual(X.T @ X, Id)
         # Structure tests
         self.assertTrue(hasattr(model, "parametrizations"))
         self.assertTrue(parametrize.is_parametrized(model))
@@ -2810,10 +2810,10 @@ def right_inverse(self, w):
         init_weight = model.weight.clone()
         parametrize.register_parametrization(model, "weight", RankOne())
         # Projecting a rank 1 matrix onto the matrices of rank one does not change the matrix
-        self.assertTrue(torch.allclose(init_weight, model.weight))
+        self.assertEqual(init_weight, model.weight)
         parametrize.register_parametrization(model, "weight", Double())
         # The matrix now is twice the initial matrix
-        self.assertTrue(torch.allclose(2.0 * init_weight, model.weight))
+        self.assertEqual(2.0 * init_weight, model.weight)
         # Multiplying by a scalar does not change the rank
         self.assertEqual(torch.linalg.matrix_rank(model.weight).item(), 1)
 
@@ -11276,7 +11276,7 @@ def test_layer_norm_grads_with_create_graph_flag(self):
         grads1 = torch.autograd.grad(layer_norm(x).sum(), x, create_graph=False)[0]
         grads2 = torch.autograd.grad(layer_norm(x).sum(), x, create_graph=True)[0]
 
-        self.assertTrue(torch.allclose(grads1, grads2, rtol, atol))
+        self.assertEqual(grads1, grads2, rtol=rtol, atol=atol)
 
         if TEST_CUDA:
             x = x.to('cuda')
@@ -11285,7 +11285,7 @@ def test_layer_norm_grads_with_create_graph_flag(self):
             grads1 = torch.autograd.grad(layer_norm(x).sum(), x, create_graph=False)[0]
             grads2 = torch.autograd.grad(layer_norm(x).sum(), x, create_graph=True)[0]
 
-            self.assertTrue(torch.allclose(grads1, grads2, rtol, atol))
+            self.assertEqual(grads1, grads2, rtol=rtol, atol=atol)
 
     def test_padding_list(self):
         # Padding can be a list, or tuple (regression test for gh-54452)
@@ -11793,7 +11793,7 @@ def test_add_relu(self):
         relu_res = torch.relu(add_res)
         add_relu_res = torch._VF._add_relu(a, b)
 
-        self.assertTrue(torch.allclose(add_relu_res, relu_res))
+        self.assertEqual(add_relu_res, relu_res)
 
     def test_add_relu_broadcasting(self):
         a = torch.rand((1, 32))
@@ -11802,7 +11802,7 @@ def test_add_relu_broadcasting(self):
         res = torch._VF._add_relu(a, b)
         broadcasted_res = torch._VF._add_relu(a, b_scalar)
 
-        self.assertTrue(torch.allclose(broadcasted_res, res))
+        self.assertEqual(broadcasted_res, res)
 
 
 def add_test(test, decorator=None):
@@ -14070,8 +14070,8 @@ def helper(n, c, h, w, kernel_size, stride=None,
 
             self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
             self.assertTrue(ref_out.is_contiguous())
-            self.assertTrue(torch.allclose(out, ref_out))
-            self.assertTrue(torch.allclose(input.grad, ref_input.grad))
+            self.assertEqual(out, ref_out)
+            self.assertEqual(input.grad, ref_input.grad)
 
         helper(4, 8, 8, 8, 3)
         helper(4, 8, 8, 8, 3, count_include_pad=False, padding=1)
@@ -14200,9 +14200,9 @@ def helper(n, c, h, w, kernel_size, stride=None):
             self.assertTrue(ref_out.is_contiguous())
             self.assertTrue(ind.is_contiguous(memory_format=torch.channels_last))
             self.assertTrue(ref_ind.is_contiguous())
-            self.assertTrue(torch.allclose(out, ref_out))
-            self.assertTrue(torch.allclose(ind, ref_ind))
-            self.assertTrue(torch.allclose(input.grad, ref_input.grad))
+            self.assertEqual(out, ref_out)
+            self.assertEqual(ind, ref_ind)
+            self.assertEqual(input.grad, ref_input.grad)
 
         helper(4, 8, 8, 8, 7)
         helper(200, 512, 28, 28, 2)
@@ -17180,7 +17180,7 @@ def test_maxpool3d_non_square_backward(self, device):
             shape = tuple(32 if i != dim else 256 for i in range(4))
             x = torch.randn(shape, device=device, requires_grad=True)
             F.max_pool3d(x, kernel_size=(1, 1, 1)).sum().backward()
-            self.assertTrue(torch.allclose(x.grad, torch.ones_like(x.grad)))
+            self.assertEqual(x.grad, torch.ones_like(x.grad))
 
     # Check that clip_grad_norm_ raises an error if the total norm of the
     # parameters' gradients is non-finite
@@ -17672,7 +17672,7 @@ def removable_hook_2(m, input):
 
         input = torch.randn(2, 2)
         output = module(input)
-        self.assertTrue(torch.allclose(torch.sigmoid(input), output))
+        self.assertEqual(torch.sigmoid(input), output)
 
         # make sure hook removal is successful
         self.assertFalse(handle.id in handle.hooks_dict_ref())
@@ -17707,7 +17707,7 @@ def removable_hook_2(m, input, output):
 
         input = torch.randn(2, 2)
         output = module(input)
-        self.assertTrue(torch.allclose(torch.sigmoid(input), output))
+        self.assertEqual(torch.sigmoid(input), output)
 
         # make sure hook removal is successful
         self.assertFalse(handle.id in handle.hooks_dict_ref())
@@ -18001,7 +18001,7 @@ def hook_function(module, input):
         module = TestModule()
         module.register_forward_pre_hook(hook_function)
         output = module(torch.zeros(2, 2))
-        self.assertTrue(torch.allclose(output, torch.ones(2, 2)))
+        self.assertEqual(output, torch.ones(2, 2))
 
     def test_lazy_forward_hook(self):
         """
@@ -18024,7 +18024,7 @@ def hook_function(module, input, output):
         module = TestModule()
         module.register_forward_hook(hook_function)
         output = module(torch.zeros(2, 2))
-        self.assertTrue(torch.allclose(output, torch.ones(2, 2)))
+        self.assertEqual(output, torch.ones(2, 2))
 
     @suppress_warnings
     def test_lazy_conv1d(self):
diff --git a/test/test_overrides.py b/test/test_overrides.py
index 41044376a40f7..a6252374364c2 100644
--- a/test/test_overrides.py
+++ b/test/test_overrides.py
@@ -762,6 +762,9 @@ def __bool__(self):
     def __int__(self):
         return self.__torch_function__(torch.Tensor.__int__, (Wrapper,), (self,))
 
+    def __len__(self):
+        return len(self._data)
+
 
 # unwrap inputs if necessary
 def unwrap(v):
@@ -782,15 +785,15 @@ class TestEinsumOverride(TestCase):
     def test_wrapper(self):
         x = Wrapper(torch.randn(5))
         y = Wrapper(torch.randn(4))
-        self.assertTrue(torch.allclose(torch.einsum('i,j->ij', x, y),
-                                       torch.ger(x, y)))
+        self.assertEqual(torch.einsum('i,j->ij', x, y)._data,
+                         torch.ger(x, y)._data)
 
         # in the old einsum interface, `operands` is a list
         a = Wrapper(torch.randn(2, 3))
         b = Wrapper(torch.randn(5, 3, 7))
         c = Wrapper(torch.randn(2, 7))
-        self.assertTrue(torch.allclose(torch.einsum('ik,jkl,il->ij', [a, b, c]),
-                                       torch.nn.functional.bilinear(a, c, b)))
+        self.assertEqual(torch.einsum('ik,jkl,il->ij', [a, b, c])._data,
+                         torch.nn.functional.bilinear(a, c, b)._data)
 
 class TestGradCheckOverride(TestCase):
     "Test that wrappers work with gradcheck."
diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py
index e7e4832ad5631..fdc8c01417fd1 100644
--- a/test/test_spectral_ops.py
+++ b/test/test_spectral_ops.py
@@ -658,7 +658,7 @@ def test_fftshift_frequencies(self, device, dtype):
 
             # Test fftshift sorts the fftfreq output
             shifted = torch.fft.fftshift(x)
-            self.assertTrue(torch.allclose(shifted, shifted.sort().values))
+            self.assertEqual(shifted, shifted.sort().values)
             self.assertEqual(sorted_fft_freqs, shifted)
 
             # And ifftshift is the inverse
diff --git a/torch/testing/_internal/common_jit.py b/torch/testing/_internal/common_jit.py
index 80cb4d0331889..3b62ced36f391 100644
--- a/torch/testing/_internal/common_jit.py
+++ b/torch/testing/_internal/common_jit.py
@@ -136,7 +136,7 @@ def get_recording_tensors(args):
         for g2, g2_test in zip(grads2, grads2_test):
             if g2 is None and g2_test is None:
                 continue
-            self.assertTrue(torch.allclose(g2, g2_test, atol=5e-4, rtol=1e-4))
+            self.assertEqual(g2, g2_test, atol=5e-4, rtol=1e-4)
 
 class JitCommonTestCase(TestCase):
     def createFunctionFromGraph(self, trace):
diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py
index 7f9fb976934d3..50d8dac23867b 100644
--- a/torch/testing/_internal/jit_utils.py
+++ b/torch/testing/_internal/jit_utils.py
@@ -594,7 +594,7 @@ def input_reduce(input, fn, acc):
             for g2, g2_ge in zip(grads2, grads2_ge):
                 if g2 is None and g2_ge is None:
                     continue
-                self.assertTrue(torch.allclose(g2, g2_ge, atol=8e-4, rtol=8e-4))
+                self.assertEqual(g2, g2_ge, atol=8e-4, rtol=8e-4)
 
         return ge
 

From 01b8162d00bfb0844a3f8a165d49907e51a16add Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Wed, 25 Aug 2021 17:50:48 -0700
Subject: [PATCH 234/530] Back out "Revert D30384746: [fx2trt] Add a test for
 quantized resnet18" (#63973)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63973

Original commit changeset: b93235323e22

Test Plan: buck run mode/opt -c python.package_style=inplace caffe2:fx2trt_quantized_resnet_test

Reviewed By: 842974287

Differential Revision: D30546036

fbshipit-source-id: 2c8302456f072d04da00cf9ad97aa8304bc5e43e
---
 .../fx2trt/converters/acc_ops_converters.py   |  15 +--
 .../fx2trt/example/quantized_resnet_test.py   | 117 ++++++++++++++++++
 2 files changed, 121 insertions(+), 11 deletions(-)
 create mode 100644 torch/fx/experimental/fx2trt/example/quantized_resnet_test.py

diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
index 566359bf2af0d..33a817d4ccdb5 100644
--- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
+++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
@@ -1300,15 +1300,11 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name):
     if q_zero_point != 0:
         raise RuntimeError(f"Only support zero_point == 0, get {q_zero_point}")
 
-    # temporarily set q_scale to 1 to make sure the q_scale is different
-    # for quantize and dequantize to avoid the error
-    # TODO: follow up with nvidia TensorRT team to repro and fix the problem
-    q_scale = 1
     scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([float(q_scale)], dtype=np.float32)))
     scale_layer.name = input_val.name + ".quant.scale"
     scale = scale_layer.get_output(0)
-    assert trt.__version__ > "8.0", "Explicit quantize op is only supported in "
-    "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
+    # assert trt.__version__ > "8.0", "Explicit quantize op is only supported in "
+    # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
     layer = network.add_quantize(input=input_val, scale=scale)
     layer.axis = 0
     layer.name = input_val.name + ".quant"
@@ -1316,9 +1312,6 @@ def acc_ops_quantize_per_tensor(network, target, args, kwargs, name):
 
 @tensorrt_converter(acc_ops.dequantize)
 def acc_ops_dequantize(network, target, args, kwargs, name):
-    """
-    Currently just a no-op.
-    """
     input_val = kwargs["input"]
 
     if not isinstance(input_val, trt.tensorrt.ITensor):
@@ -1339,8 +1332,8 @@ def acc_ops_dequantize(network, target, args, kwargs, name):
     scale_layer = network.add_constant((1,), trt.Weights(np.ascontiguousarray([q_scale], dtype=np.float32)))
     scale_layer.name = input_val.name + ".dequant.scale"
     scale = scale_layer.get_output(0)
-    assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in "
-    "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
+    # assert trt.__version__ > "8.0", "Explicit dequantize op is only supported in "
+    # "TensorRT 8.0 or above, current TensorRT version:" + trt.__version__
     layer = network.add_dequantize(input=input_val, scale=scale)
     layer.name = input_val.name + ".dequant"
     layer.axis = 0
diff --git a/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py b/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py
new file mode 100644
index 0000000000000..140f4fb50bd76
--- /dev/null
+++ b/torch/fx/experimental/fx2trt/example/quantized_resnet_test.py
@@ -0,0 +1,117 @@
+import torch.fx
+import torchvision.models as models
+from torch.fx.experimental.fx2trt.fx2trt import TRTInterpreter, InputTensorSpec, TRTModule
+from torch.quantization.quantize_fx import prepare_fx, convert_fx
+import torch.fx.experimental.fx_acc.acc_tracer as acc_tracer
+import copy
+from torch.fx.passes import shape_prop
+from torch.fx.experimental.normalize import NormalizeArgs
+
+rn18 = models.resnet18().eval()
+
+def build_fp16_trt(rn18):
+    rn18 = copy.deepcopy(rn18)
+    rn18 = acc_tracer.trace(rn18, [torch.randn(1, 3, 224, 224)])  # type: ignore[attr-defined]
+    interp = TRTInterpreter(rn18, [InputTensorSpec(torch.Size([3, 224, 224]), torch.float, has_batch_dim=False)])
+    engine, input_names, output_names = interp.run(fp16_mode=True)
+    return TRTModule(engine, input_names, output_names)
+
+@torch.no_grad()
+def build_int8_trt(rn18):
+    rn18 = copy.deepcopy(rn18)
+    data = torch.randn(1, 3, 224, 224)
+    # data = torch.randn(1, 64, 10, 10)
+    # TensorRT only supports symmetric quantization
+    qconfig = torch.quantization.QConfig(
+        activation=torch.quantization.observer.HistogramObserver.with_args(
+            qscheme=torch.per_tensor_symmetric, dtype=torch.qint8
+        ),
+        weight=torch.quantization.default_weight_observer
+    )
+    prepared = prepare_fx(rn18, {"": qconfig})
+    for _ in range(10):
+        prepared(data)
+    quantized_rn18 = convert_fx(prepared, is_reference=True)
+    print("quantized model:", quantized_rn18)
+
+    quantized_rn18 = acc_tracer.trace(quantized_rn18, [data])  # type: ignore[attr-defined]
+    interp = TRTInterpreter(quantized_rn18, [InputTensorSpec(data.shape[1:], torch.float, has_batch_dim=False)])
+    engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True)
+    return TRTModule(engine, input_names, output_names)
+
+@torch.no_grad()
+def build_int8_trt_implicit_quant(rn18):
+    rn18 = copy.deepcopy(rn18)
+    data = torch.randn(1, 3, 224, 224)
+    # Quantization
+    qconfig = torch.quantization.QConfig(
+        activation=torch.quantization.observer.HistogramObserver.with_args(
+            qscheme=torch.per_tensor_symmetric, reduce_range=True
+        ),
+        weight=torch.quantization.default_per_channel_weight_observer
+    )
+    prepared = prepare_fx(rn18, {"": qconfig})
+    for _ in range(10):
+        prepared(data)
+    quantized_rn18 = convert_fx(prepared, is_reference=True)
+
+    # Build trt int8 model
+    traced_rn18 = torch.fx.symbolic_trace(quantized_rn18)
+    shape_prop.ShapeProp(traced_rn18).propagate(data)
+    traced_rn18 = NormalizeArgs(traced_rn18).transform()
+    interp = TRTInterpreter(traced_rn18, InputTensorSpec.from_tensors([data]))
+    engine, input_names, output_names = interp.run(fp16_mode=False, int8_mode=True, strict_type_constraints=True)
+    trt_mod = TRTModule(engine, input_names, output_names)
+    return trt_mod
+
+class M(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 3, 3, padding=1)
+
+    def forward(self, x):
+        out = self.conv(x)
+        # out = torch.nn.functional.relu(out)
+        out += x
+        out += out
+        out = torch.nn.functional.relu(out)
+        return out
+
+# rn18 = M().eval()
+# rn18 = rn18.layer1
+int8_trt = build_int8_trt(rn18)
+implicit_int8_trt = build_int8_trt_implicit_quant(rn18)
+fp16_trt = build_fp16_trt(rn18)
+x = torch.randn(5, 3, 224, 224, device="cuda")
+rn18 = rn18.cuda()
+
+import time
+NITER = 100
+
+torch.cuda.synchronize()
+s = time.time()
+for _ in range(NITER):
+    fp16_trt(x)
+    torch.cuda.synchronize()
+print('trt fp16 time (ms/iter)', (time.time() - s) / NITER * 1000)
+
+torch.cuda.synchronize()
+s = time.time()
+for _ in range(NITER):
+    int8_trt(x)
+    torch.cuda.synchronize()
+print('trt int8 time (ms/iter)', (time.time() - s) / NITER * 1000)
+
+torch.cuda.synchronize()
+s = time.time()
+for _ in range(NITER):
+    implicit_int8_trt(x)
+    torch.cuda.synchronize()
+print('trt implicit int8 time (ms/iter)', (time.time() - s) / NITER * 1000)
+
+torch.cuda.synchronize()
+s = time.time()
+for _ in range(NITER):
+    rn18(x)
+    torch.cuda.synchronize()
+print('PyTorch time (ms/iter)', (time.time() - s) / NITER * 1000)

From f4bc28990f6edcaf6bfc9e9737a70ea0be514198 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Wed, 25 Aug 2021 18:17:10 -0700
Subject: [PATCH 235/530] Compute cuda reduction buffer size in elements
 (#63969)

Summary:
Resubmit of https://github.com/pytorch/pytorch/issues/63885

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63969

Reviewed By: mruberry

Differential Revision: D30549423

Pulled By: ngimel

fbshipit-source-id: b16d25030d44ced789c125a333d72b02a8f45067
---
 aten/src/ATen/native/cuda/Reduce.cuh | 3 ++-
 test/test_reductions.py              | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
index 8c423061a79f6..b4600454f467d 100644
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -919,10 +919,11 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
     // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter
     // when accumulation in output is not possible.
     if (!can_accumulate_in_output && !can_use_32bit_indexing) {
-      int64_t output_memory_size = 1;
+      int64_t output_memory_size = iter.element_size(0);
       for (int dim = 0; dim < iter.ndim(); dim++) {
         output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]);
       }
+      output_memory_size /= iter.element_size(0); //iter.strides is in bytes
       owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t),
                                                  sizeof(out_scalar_t),
                                                  (char*) iter.data_ptr(0),
diff --git a/test/test_reductions.py b/test/test_reductions.py
index 1497ed6ad419d..c1da0f0816c5a 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -1788,7 +1788,7 @@ def run_test(input_):
         run_test(torch.zeros(64, 61, dtype=dtype, device=device))
         run_test(torch.zeros(64, 1, dtype=dtype, device=device))
 
-    @slowTest
+    @onlyCUDA
     def test_argminmax_large_axis(self, device):
         # Regression test for gh-32863
         x = torch.zeros(2**31, device=device, dtype=torch.int8)

From 2ea2711501fd00c108c4b7cd87bc952bc9204cbb Mon Sep 17 00:00:00 2001
From: Zhengxu Chen <zhxchen17@fb.com>
Date: Wed, 25 Aug 2021 20:09:12 -0700
Subject: [PATCH 236/530] Make frozen symbol name customizable in torch deploy.
 (#63817)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63817

ghstack-source-id: 136699671

Test Plan: eyes

Reviewed By: wconstab

Differential Revision: D29571559

fbshipit-source-id: 8e3caa4932ef8d7c8559f264f0e9bb5474ad2237
---
 torch/csrc/deploy/interpreter/freeze.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/torch/csrc/deploy/interpreter/freeze.py b/torch/csrc/deploy/interpreter/freeze.py
index 24fa709cb01ac..31531746ed1b2 100644
--- a/torch/csrc/deploy/interpreter/freeze.py
+++ b/torch/csrc/deploy/interpreter/freeze.py
@@ -35,17 +35,13 @@
 
 """
 
-MAIN_PREFIX = """
+MAIN_PREFIX_TEMPLATE = """
 // Compiled standard library modules. These should be appended to the existing
 // `PyImport_FrozenModules` that ships with CPython.
-struct _frozen _PyImport_FrozenModules_torch[] = {
+struct _frozen {}[] = {{
 """
 
-FAKE_PREFIX = """
-// Compiled standard library modules. These should be appended to the existing
-// `PyImport_FrozenModules` that ships with CPython.
-struct _frozen _PyImport_FrozenModules[] = {
-"""
+FAKE_PREFIX = MAIN_PREFIX_TEMPLATE.format("_PyImport_FrozenModules")
 
 MAIN_SUFFIX = """\
     {0, 0, 0} /* sentinel */
@@ -133,7 +129,7 @@ def write_bytecode(self, install_root):
         for f in bytecode_files:
             f.close()
 
-    def write_main(self, install_root, oss):
+    def write_main(self, install_root, oss, symbol_name):
         """
         Write the `main.c` file containing a table enumerating all the
         frozen modules.
@@ -143,7 +139,7 @@ def write_main(self, install_root, oss):
             for m in self.frozen_modules:
                 outfp.write(f"extern unsigned char {m.c_name}[];\n")
 
-            outfp.write(MAIN_PREFIX)
+            outfp.write(MAIN_PREFIX_TEMPLATE.format(symbol_name))
             for m in self.frozen_modules:
                 outfp.write(f'\t{{"{m.module_name}", {m.c_name}, {m.size}}},\n')
             outfp.write(MAIN_SUFFIX)
@@ -246,6 +242,11 @@ def compile_file(self, path: Path, top_package_path: Path):
 parser.add_argument("--verbose", action="store_true", help="Print debug logs")
 parser.add_argument("--install_dir", help="Root directory for all output files")
 parser.add_argument("--oss", action="store_true", help="If it's OSS build, add a fake _PyImport_FrozenModules")
+parser.add_argument(
+    "--symbol_name",
+    help="The name of the frozen module array symbol to generate",
+    default="_PyImport_FrozenModules_torch",
+)
 
 args = parser.parse_args()
 
@@ -264,4 +265,4 @@ def compile_file(self, path: Path, top_package_path: Path):
         f.compile_path(path, path)
 
 f.write_bytecode(args.install_dir)
-f.write_main(args.install_dir, args.oss)
+f.write_main(args.install_dir, args.oss, args.symbol_name)

From 124ae597fb7a371b39ff771779442017f7817d6a Mon Sep 17 00:00:00 2001
From: Zafar Takhirov <zaf@fb.com>
Date: Wed, 25 Aug 2021 20:37:56 -0700
Subject: [PATCH 237/530] [quant] Fixing the conversion of the quantizable RNN
 (#63879)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63879

Quantizable RNN had a bug, where the `from_observed` was an instance method, instead of a class method. This caused the `tq.convert` to fail. This fixes the issue by making the `from_observed` a classmethod.

The tests were passing before because the unittests were not using the custom module path, but a conventional `from_float`, which is also supported.

Test Plan:
`buck test mode/dev //caffe2/test:quantization -- test_custom_module_lstm`

```
buck test mode/dev //caffe2/test:quantization -- test_custom_module_lstm
Parsing buck files: finished in 0.5 sec
Downloaded 0/2 artifacts, 0.00 bytes, 100.0% cache miss (for updated rules)
Building: finished in 9.2 sec (100%) 12622/12622 jobs, 2/12622 updated
  Total time: 9.7 sec
More details at https://www.internalfb.com/intern/buck/build/0d87b987-649f-4d06-b0e2-97b5077
Tpx test run coordinator for Facebook. See https://fburl.com/tpx for details.
Running with tpx session id: cb99305f-65c9-438b-a99f-a0a2a3089778
Trace available for this run at /tmp/tpx-20210824-115652.540356/trace.log
Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/5066549645030046
    ✓ ListingSuccess: caffe2/test:quantization - main (12.550)
    ✓ Pass: caffe2/test:quantization - test_custom_module_lstm (quantization.core.test_quantized_op.TestQuantizedOps) (174.867)
Summary
  Pass: 1
  ListingSuccess: 1
If you need help understanding your runs, please follow the wiki: https://fburl.com/posting_in_tpx_users
Finished test run: https://www.internalfb.com/intern/testinfra/testrun/5066549645030046
```

Reviewed By: jerryzh168, mtl67

Differential Revision: D30520473

fbshipit-source-id: bc5d0b5bb079fd146e2614dd42526fc7d4d4f3c6
---
 test/quantization/core/test_quantized_op.py | 6 +++++-
 torch/nn/quantizable/modules/rnn.py         | 5 +++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 6c94586d3101e..18212671aabaa 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -2414,6 +2414,9 @@ def test_custom_module_lstm(self):
         custom_module_config = {
             'float_to_observed_custom_module_class': {
                 torch.nn.LSTM: torch.nn.quantizable.LSTM
+            },
+            'observed_to_quantized_custom_module_class': {
+                torch.nn.quantizable.LSTM: torch.nn.quantizable.LSTM
             }
         }
 
@@ -2460,7 +2463,8 @@ def test_custom_module_lstm(self):
                 self.assertEqual(y_ref, y)
 
                 # Quantize
-                lstm_quantized = torch.quantization.convert(lstm_prepared)
+                lstm_quantized = torch.quantization.convert(
+                    lstm_prepared, convert_custom_config_dict=custom_module_config)
                 qy = lstm_quantized(qx)
 
                 snr = _snr(y, qy)
diff --git a/torch/nn/quantizable/modules/rnn.py b/torch/nn/quantizable/modules/rnn.py
index 32e813ce94eae..bdfd7788533b5 100644
--- a/torch/nn/quantizable/modules/rnn.py
+++ b/torch/nn/quantizable/modules/rnn.py
@@ -407,6 +407,7 @@ def from_float(cls, other, qconfig=None):
         observed = torch.quantization.prepare(observed, inplace=True)
         return observed
 
-    def from_observed(self, other):
-        return torch.quantization.convert(self, inplace=False,
+    @classmethod
+    def from_observed(cls, other):
+        return torch.quantization.convert(other, inplace=False,
                                           remove_qconfig=True)

From 80a61142e48f21e93e388359768e31aa687a9378 Mon Sep 17 00:00:00 2001
From: Zeina Migeed <migeedz@fb.com>
Date: Wed, 25 Aug 2021 20:42:14 -0700
Subject: [PATCH 238/530] inference for algebraic expressions (#63822)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63822

Infer algebraic expressions and add it to our symbolic inferencer. Works for conv2D and can be extended to other operations.

Test Plan: Imported from OSS

Reviewed By: jamesr66a

Differential Revision: D30518469

Pulled By: migeed-z

fbshipit-source-id: b92dfa40b2d834a535177da42b851701b8f7178c
---
 test/fx/test_gradual_type.py                  | 151 ++++++++++++------
 .../experimental/graph_gradual_typechecker.py | 114 ++++++++++++-
 torch/fx/experimental/unify_refinements.py    |  18 ++-
 3 files changed, 226 insertions(+), 57 deletions(-)

diff --git a/test/fx/test_gradual_type.py b/test/fx/test_gradual_type.py
index 203cf6b7e306e..37e8db1e5cf4b 100644
--- a/test/fx/test_gradual_type.py
+++ b/test/fx/test_gradual_type.py
@@ -9,7 +9,14 @@
 from torch.fx.experimental.rewriter import RewritingTracer
 from torch.fx import GraphModule
 from torch.fx.passes.shape_prop import ShapeProp
-from torch.fx.experimental.unification import Var
+
+try:
+    import sympy
+    HAS_SYMPY = True
+except ImportError:
+    HAS_SYMPY = False
+skipIfNoSympy = unittest.skipIf(not HAS_SYMPY, "no sympy")
+
 
 try:
     from torchvision.models import resnet50
@@ -19,13 +26,6 @@
     HAS_TORCHVISION = False
 skipIfNoTorchVision = unittest.skipIf(not HAS_TORCHVISION, "no torchvision")
 
-# try:
-#     from unification import Var
-#     HAS_UNIFICATION = True
-# except ImportError:
-#     HAS_UNIFICATION = False
-# skipIfNoUnification = unittest.skipIf(not HAS_UNIFICATION, "no unification")
-
 def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
     """3x3 convolution with padding"""
     return torch.nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
@@ -270,10 +270,9 @@ def forward(self, x: TensorType((1, 2, 3, 5))):
     def test_type_check_batch_norm_2D(self):
         class BasicBlock(torch.nn.Module):
 
-            def __init__(self, inplanes, planes, norm_layer=None):
+            def __init__(self, inplanes, planes):
                 super(BasicBlock, self).__init__()
-                if norm_layer is None:
-                    norm_layer = torch.nn.BatchNorm2d
+                norm_layer = torch.nn.BatchNorm2d
                 self.bn1 = norm_layer(planes)
 
             def forward(self, x: TensorType((2, 2, 5, 4))):
@@ -302,10 +301,9 @@ def forward(self, x: TensorType((2, 2, 5, 4))):
     def test_type_check_batch_norm_2D_false(self):
         class BasicBlock(torch.nn.Module):
 
-            def __init__(self, inplanes, planes, norm_layer=None):
+            def __init__(self, inplanes, planes):
                 super(BasicBlock, self).__init__()
-                if norm_layer is None:
-                    norm_layer = torch.nn.BatchNorm2d
+                norm_layer = torch.nn.BatchNorm2d
                 self.bn1 = norm_layer(planes)
 
             def forward(self, x: TensorType((2, 2, 5))):
@@ -325,10 +323,9 @@ def forward(self, x: TensorType((2, 2, 5))):
     def test_type_check_batch_norm_2D_broadcast(self):
         class BasicBlock(torch.nn.Module):
 
-            def __init__(self, inplanes, planes, norm_layer=None):
+            def __init__(self, inplanes, planes):
                 super(BasicBlock, self).__init__()
-                if norm_layer is None:
-                    norm_layer = torch.nn.BatchNorm2d
+                norm_layer = torch.nn.BatchNorm2d
                 self.bn1 = norm_layer(planes)
 
             def forward(self, x: Dyn):
@@ -363,10 +360,9 @@ def forward(self, x: Dyn):
 
     def test_type_check_conv2D(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self, inplanes, planes, stride=1, norm_layer=None):
+            def __init__(self, inplanes, planes, stride=1):
                 super(BasicBlock, self).__init__()
-                if norm_layer is None:
-                    norm_layer = torch.nn.BatchNorm2d
+                norm_layer = torch.nn.BatchNorm2d
                 self.conv1 = conv3x3(inplanes, planes, stride)
                 self.bn1 = norm_layer(planes)
 
@@ -394,10 +390,9 @@ def forward(self, x: Dyn):
 
     def test_type_check_conv2D_2(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self, inplanes, planes, stride=1, norm_layer=None):
+            def __init__(self, inplanes, planes, stride=1):
                 super(BasicBlock, self).__init__()
-                if norm_layer is None:
-                    norm_layer = torch.nn.BatchNorm2d
+                norm_layer = torch.nn.BatchNorm2d
                 self.conv1 = conv3x3(inplanes, planes, stride)
                 self.bn1 = norm_layer(planes)
 
@@ -434,7 +429,6 @@ def forward(self, x: TensorType((5, 2, 3, 4))):
         with self.assertRaises(TypeError):
             tc.type_check()
 
-
     def test_type_check_conv2D_2_fully_static(self):
         annotation_list = [(1, 2, 3, 5), (2, 5, 6, 9), (10, 15, 13, 14),
                            (10, Dyn, 13, 14), (Dyn, Dyn, Dyn, 3)]
@@ -522,16 +516,14 @@ def forward(self, x):
                     assert n.type == TensorType(output_types[i])
                     assert is_consistent(n.type, TensorType(b.size()))
 
-
     def test_typecheck_basicblock(self):
         class BasicBlock(torch.nn.Module):
             expansion = 1
 
             def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
-                         base_width=64, dilation=1, norm_layer=None):
+                         base_width=64, dilation=1):
                 super(BasicBlock, self).__init__()
-                if norm_layer is None:
-                    norm_layer = torch.nn.BatchNorm2d
+                norm_layer = torch.nn.BatchNorm2d
                 if groups != 1 or base_width != 64:
                     raise ValueError('BasicBlock only supports groups=1 and base_width=64')
                 if dilation > 1:
@@ -643,7 +635,6 @@ def forward(self, x: TensorType((1, Dyn, 3, 5, Dyn))):
             if n.op == 'output':
                 assert n.type == TensorType((1, Dyn, 5, Dyn))
 
-
     def test_type_check_flatten3(self):
         class M(torch.nn.Module):
             def forward(self, x: TensorType((2, 3, 4, 5))):
@@ -661,7 +652,6 @@ def forward(self, x: TensorType((2, 3, 4, 5))):
         c = r.constraints
         assert c == [Equality(2, 2)]
 
-
     def test_type_typechecl_maxpool2d_3dinput(self):
 
         class BasicBlock(torch.nn.Module):
@@ -770,7 +760,6 @@ def forward(self, x):
                     assert n.type == TensorType(output_types[i])
                     assert is_consistent(n.type, TensorType(b.size()))
 
-
     def test_flatten_fully_static(self):
         annotation_list = [Dyn, TensorType((2, 5, 6, 9)), TensorType((10, 15, 13, 14)),
                            TensorType((10, Dyn, 13, 14)), TensorType((Dyn, Dyn, Dyn, 10))]
@@ -816,6 +805,7 @@ def forward(self, x):
                 if n.op == 'output':
                     assert is_consistent(n.type, TensorType(b.size()))
 
+    @skipIfNoSympy
     @skipIfNoTorchVision
     def test_resnet50(self):
         gm_run = symbolic_trace(resnet50())
@@ -859,14 +849,13 @@ def test_resnet50(self):
             batch_sizes.add(n.type.__args__[0])
         assert (len(batch_sizes) == 1)
 
-
+    @skipIfNoSympy
     def test_type_check_batch_norm_symbolic(self):
         class BasicBlock(torch.nn.Module):
 
-            def __init__(self, inplanes, planes, norm_layer=None):
+            def __init__(self, inplanes, planes):
                 super(BasicBlock, self).__init__()
-                if norm_layer is None:
-                    norm_layer = torch.nn.BatchNorm2d
+                norm_layer = torch.nn.BatchNorm2d
                 self.bn1 = norm_layer(planes)
 
             def forward(self, x: Dyn):
@@ -884,15 +873,15 @@ def forward(self, x: Dyn):
 
         infer_symbolic_types(traced)
 
-
-        my_types = iter([TensorType[(2, 2, Var(7), 4)],
-                         TensorType[(2, 2, Var(7), 4)],
-                         TensorType[(2, 2, Var(7), 4)],
-                         TensorType[(2, 2, Var(7), 4)]])
+        my_types = iter([TensorType[(2, 2, sympy.symbols('~7'), 4)],
+                         TensorType[(2, 2, sympy.symbols('~7'), 4)],
+                         TensorType[(2, 2, sympy.symbols('~7'), 4)],
+                         TensorType[(2, 2, sympy.symbols('~7'), 4)]])
 
         for n in graph.nodes:
             assert n.type == next(my_types)
 
+    @skipIfNoSympy
     def test_symbolic_add_with_broadcast(self):
         class M(torch.nn.Module):
             def forward(self, x: TensorType((1, 2, 3, Dyn)), y: TensorType((2, 3, 4))):
@@ -911,16 +900,17 @@ def forward(self, x: TensorType((1, 2, 3, Dyn)), y: TensorType((2, 3, 4))):
 
         infer_symbolic_types(symbolic_traced)
 
-        expected_ph_types = [TensorType((1, 2, 3, Var(0))),
+        expected_ph_types = [TensorType((1, 2, 3, sympy.symbols('~0'))),
                              TensorType((2, 3, 4)),
-                             TensorType((1, 2, 3, Var(1))),
-                             TensorType((1, 2, 3, Var(1)))]
+                             TensorType((1, 2, 3, sympy.symbols('~1'))),
+                             TensorType((1, 2, 3, sympy.symbols('~1')))]
         expected_iter = iter(expected_ph_types)
 
+
         for n in symbolic_traced.graph.nodes:
             assert n.type == next(expected_iter)
 
-
+    @skipIfNoSympy
     def test_symbolic_add_with_broadcast_2(self):
         class M(torch.nn.Module):
             def forward(self, x: TensorType((1, 2)), y: TensorType((Dyn, 2))):
@@ -934,13 +924,80 @@ def forward(self, x: TensorType((1, 2)), y: TensorType((Dyn, 2))):
         r.refine()
 
         expected_ph_types = [TensorType((1, 2)),
-                             TensorType((Var(1), 2)),
-                             TensorType((Var(1), 2)),
-                             TensorType((Var(1), 2))]
+                             TensorType((sympy.symbols('~1'), 2)),
+                             TensorType((sympy.symbols('~1'), 2)),
+                             TensorType((sympy.symbols('~1'), 2))]
         expected_iter = iter(expected_ph_types)
 
         for n in symbolic_traced.graph.nodes:
             assert n.type == next(expected_iter)
 
+    @skipIfNoSympy
+    def test_type_check_conv2D_types(self):
+        class BasicBlock(torch.nn.Module):
+            def __init__(self, inplanes, planes, stride=1):
+                super(BasicBlock, self).__init__()
+                norm_layer = torch.nn.BatchNorm2d
+                self.conv1 = conv3x3(inplanes, planes, stride)
+                self.bn1 = norm_layer(planes)
+
+            def forward(self, x: Dyn):
+                identity = x
+                out: TensorType((2, 2, Dyn, 4)) = self.conv1(x)
+                out += identity
+                return out
+
+        B = BasicBlock(2, 2)
+        ast_rewriter = RewritingTracer()
+        graph = ast_rewriter.trace(B)
+        traced = GraphModule(ast_rewriter.root, graph, "gm")
+        tc = GraphTypeChecker({}, traced)
+        tc.type_check()
+        infer_symbolic_types(traced)
+
+        for n in traced.graph.nodes:
+            if n.op == 'call_module':
+                assert isinstance(n.type.__args__[2], sympy.floor)
+                assert isinstance(n.type.__args__[3], sympy.floor)
+
+    @skipIfNoSympy
+    def test_type_check_symbolic_inferenceconv2D_maxpool2d_flatten(self):
+
+        class BasicBlock(torch.nn.Module):
+            def __init__(self):
+                super(BasicBlock, self).__init__()
+
+                self.conv1 = torch.nn.Conv2d(3, 6, 5)
+                self.pool = torch.nn.MaxPool2d(2, 2)
+                self.conv2 = torch.nn.Conv2d(6, 16, 5)
+                self.fc1 = torch.nn.Linear(5, 120)
+                self.pool2 = torch.nn.AdaptiveAvgPool2d((6, 7))
+
+            def forward(self, x : TensorType((4, 3, Dyn, Dyn))):
+                out = self.conv1(x)
+                out = self.pool(out)
+                out = self.conv2(out)
+                out = self.pool(out)
+                out = self.fc1(out)
+                out = self.pool2(out)
+                out = torch.flatten(out, 1)
+                return out
+
+        B = BasicBlock()
+        ast_rewriter = RewritingTracer()
+        traced = symbolic_trace(B)
+        tc = GraphTypeChecker({}, traced)
+        tc.type_check()
+        infer_symbolic_types(traced)
+
+        for n in traced.graph.nodes:
+            if n.target == 'conv1':
+                assert n.type == TensorType((4, 6, sympy.floor((sympy.symbols('~0') - 4)),
+                                             sympy.floor((sympy.symbols('~1') - 4))))
+
+            elif n.target == 'conv2':
+                assert n.type == TensorType((4, 16, sympy.floor((sympy.symbols('~4') - 4)),
+                                             sympy.floor((sympy.symbols('~5') - 4))))
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py
index 6e05f918e810e..a54e52151f858 100644
--- a/torch/fx/experimental/graph_gradual_typechecker.py
+++ b/torch/fx/experimental/graph_gradual_typechecker.py
@@ -9,12 +9,18 @@
 from torch.fx.experimental.refinement_types import Equality
 import itertools
 
-
 from torch.fx.experimental.unification import Var  # type: ignore[attr-defined]
 
 
+try:
+    import sympy  # type: ignore[import]
+    HAS_SYMPY = True
+except ImportError:
+    HAS_SYMPY = False
+
 _INFERENCE_RULES: Dict[Target, Callable] = {}
 _REFINEMENT_RULES: Dict[Target, Callable] = {}
+_RULES: Dict[Target, Callable] = {}
 
 
 def expand_to_tensor_dim(t, n):
@@ -84,6 +90,13 @@ def register(fn):
         return fn
     return register
 
+def register_algebraic_expressions_inference_rule(call_target):
+    def register(fn):
+        if call_target in _RULES:
+            raise RuntimeError('Rule already registered for {call_target}!')
+        _RULES[call_target] = fn
+        return fn
+    return register
 
 @register_inference_rule(torch.add)
 @register_inference_rule(operator.add)
@@ -258,10 +271,12 @@ def calculate_out_dimension(d_in, module_instance, index):
     dilation = (module_instance.dilation, module_instance.dilation) \
         if isinstance(module_instance.dilation, int) else module_instance.dilation
 
+    DIMENSION_TYPES = (int, sympy.Symbol) if HAS_SYMPY else (int,)
+
     if d_in == Dyn:
         return Dyn
 
-    elif isinstance(d_in, int):
+    elif isinstance(d_in, DIMENSION_TYPES):
         n = d_in + 2 * padding[index] - \
             dilation[index] * \
             (kernel_size[index] - 1) - 1
@@ -269,7 +284,7 @@ def calculate_out_dimension(d_in, module_instance, index):
         return (n // stride[0]) + 1
 
     else:
-        raise TypeError(f'{d_in} in {module_instance} must be a number or Dyn')
+        raise TypeError(f'{d_in} in {module_instance} must be a number or Dyn. Received {type(d_in)}')
 
 
 def get_greatest_upper_bound(type1, type2):
@@ -552,8 +567,17 @@ def get_node_type(a):
 
 
 @register_refinement_rule(Conv2d)
+def conv_refinement_rule(n: Node):
+    res = []
+    assert isinstance(n.args[0], Node)
+    arg_type = n.args[0].type
+    if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType):
+        res = [Equality(arg_type.__args__[0], n.type.__args__[0])]
+        return res
+
+
 @register_refinement_rule(torch.nn.Linear)
-def first_one(n: Node):
+def linear_refinement_rule(n: Node):
     res = []
     assert isinstance(n.args[0], Node)
     arg_type = n.args[0].type
@@ -564,7 +588,6 @@ def first_one(n: Node):
 # todo needs review for addition. Is this constraint correct?
 @register_refinement_rule(BatchNorm2d)
 @register_refinement_rule(torch.nn.ReLU)
-@register_refinement_rule(torch.nn.AdaptiveAvgPool2d)
 def all_eq(n: Node):
     res = []
     assert isinstance(n.args[0], Node)
@@ -575,6 +598,18 @@ def all_eq(n: Node):
         res = [Equality(args1[i], args2[i]) for i in range(len(args1))]
     return res
 
+
+@register_refinement_rule(torch.nn.AdaptiveAvgPool2d)
+def first_two__eq(n: Node):
+    res = []
+    assert isinstance(n.args[0], Node)
+    arg_type = n.args[0].type
+    if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType):
+        args1 = arg_type.__args__
+        args2 = n.type.__args__
+        res = [Equality(args1[0], args2[0]), Equality(args1[1], args2[1])]
+    return res
+
 @register_refinement_rule(torch.add)
 @register_refinement_rule(operator.add)
 def add_eq(n: Node):
@@ -636,6 +671,20 @@ def flatten_refinement_rule(n: Node):
             eq_const.append(Equality(t1, t2))
     return eq_const
 
+
+@register_algebraic_expressions_inference_rule(Conv2d)
+def conv_rule(n: Node, module_instance):
+    assert isinstance(n.args[0], Node)
+    arg_type = n.args[0].type
+    if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType):
+        w_in = arg_type.__args__[3]
+        h_in = arg_type.__args__[2]
+        h_out = calculate_out_dimension(h_in, module_instance, 0)
+        w_out = calculate_out_dimension(w_in, module_instance, 1)
+        new_type = TensorType((n.type.__args__[0], n.type.__args__[1], h_out, w_out))
+        n.type = new_type
+        return new_type
+
 class Refine:
     """
     Symbolic shape inference.
@@ -658,6 +707,15 @@ def refine(self):
             self.refine_node(n)
         return True
 
+    def symbolic_relations(self):
+        """
+        Infers algebraic relations
+        """
+        graph = self.traced.graph
+        for n in graph.nodes:
+            self.infer_symbolic_relations(n)
+        return True
+
     def replace_dyn_with_fresh_var(self, typ):
         """
         Replace all unknown types with fresh type variables.
@@ -675,6 +733,26 @@ def replace_dyn_with_fresh_var(self, typ):
         else:
             return typ
 
+
+    def convert_to_sympy_symbols(self, typ):
+        """
+        Replace all unknown types with fresh type variables.
+        """
+        if HAS_SYMPY:
+            if isinstance(typ, Var):
+                return sympy.symbols(str(typ))
+            elif isinstance(typ, TensorType):
+                new_args = [self.convert_to_sympy_symbols(a) for a in typ.__args__]
+                return TensorType(tuple(new_args))
+            elif isinstance(typ, list):
+                return [self.convert_to_sympy_symbols(t) for t in typ]
+            elif isinstance(typ, tuple):
+                return (self.convert_to_sympy_symbols(t) for t in typ)
+            else:
+                return typ
+        else:
+            return typ
+
     def refine_node(self, n: Node):
         """
         Returns a list of equality constraints for
@@ -710,6 +788,32 @@ def get_node_type(a):
         else:
             pass
 
+    def infer_symbolic_relations(self, n: Node):
+        if HAS_SYMPY:
+            n.type = self.convert_to_sympy_symbols(n.type)
+            if n.op == 'call_function':
+                if n.target in _RULES:
+                    return _RULES[n.target](n)
+                else:
+                    pass
+
+            if n.op == 'call_module':
+                module_instance = self.traced.get_submodule(n.target)
+                if type(module_instance) in _RULES:
+                    return _RULES[type(module_instance)](n, module_instance)
+                else:
+                    pass
+
+            if n.op == 'output':
+                def get_node_type(a):
+                    return a.type
+                n.type = torch.fx.node.map_arg(n.args[0], get_node_type)
+                return n.type
+
+            else:
+                pass
+        else:
+            pass
 
 def get_parameter(traced, target: str):
     """
diff --git a/torch/fx/experimental/unify_refinements.py b/torch/fx/experimental/unify_refinements.py
index 5074377ebf2dc..532d2784fb49a 100644
--- a/torch/fx/experimental/unify_refinements.py
+++ b/torch/fx/experimental/unify_refinements.py
@@ -2,11 +2,10 @@
 from torch.fx.tensor_type import TensorType
 from torch.fx.experimental.unification import Var, unify  # type: ignore[attr-defined]
 
+
 def infer_symbolic_types_single_pass(traced):
     """
-    Generate constraints over types,
-    solve constraints with unification,
-    apply solution back to the types
+    Calls our symbolic inferencer once.
     """
     r = Refine(traced)
     r.refine()
@@ -20,8 +19,17 @@ def infer_symbolic_types(traced):
     to infer all the information such as the case
     for braodcasting.
     """
-    infer_symbolic_types_single_pass(traced)
-    infer_symbolic_types_single_pass(traced)
+    r = Refine(traced)
+    r.refine()
+    mgu = unify_eq(r.constraints)
+    substitute_all_types(traced.graph, mgu)
+
+    r = Refine(traced)
+    r.refine()
+    mgu = unify_eq(r.constraints)
+    substitute_all_types(traced.graph, mgu)
+
+    r.symbolic_relations()
 
 def convert_eq(list_of_eq):
     """

From 49c8fbc92f70d6d78e02e2b7944de59d9348db37 Mon Sep 17 00:00:00 2001
From: nikithamalgi <nikithamalgi@devvm146.prn0.facebook.com>
Date: Wed, 25 Aug 2021 21:47:50 -0700
Subject: [PATCH 239/530] Clean up related to type refinements (#62444)

Summary:
Creates a helper function to refine the types into a torchScript compatible format in the monkeytype config for profile directed typing

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62444

Reviewed By: malfet

Differential Revision: D30548159

Pulled By: nikithamalgifb

fbshipit-source-id: 7c09ce5f5e043d069313b87112837d7e226ade1f
---
 test/jit/test_pdt.py            | 41 ------------------
 torch/jit/_monkeytype_config.py | 74 ++++++++++++++++-----------------
 torch/jit/frontend.py           |  4 +-
 3 files changed, 37 insertions(+), 82 deletions(-)

diff --git a/test/jit/test_pdt.py b/test/jit/test_pdt.py
index b04a66e5dfcd9..57cd74faf432b 100644
--- a/test/jit/test_pdt.py
+++ b/test/jit/test_pdt.py
@@ -454,44 +454,3 @@ def test_none(a) -> Any:
 
         scripted_fn = torch.jit._script_pdt(test_none, example_inputs=[(None, ), (torch.Tensor(1), )])
         self.assertEqual(scripted_fn(torch.ones(1), ), test_none(torch.ones(1), ))
-
-        class TestForwardWithNoneType(torch.nn.Module):
-            def forward(self, a):
-                count = 0
-                for i, val in enumerate(a):
-                    if val is None:
-                        count += 1
-                return count
-
-        make_global(TestForwardWithNoneType)
-        pdt_model = TestForwardWithNoneType()
-
-        # Test List[Optional[float]] as input
-        scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[([None, ], ), ([2.9, ], )])
-        self.assertEqual(scripted_model([2.8, 6.7, 3.8, None, ]), pdt_model([2.8, 6.7, 3.8, None, ]))
-
-        # Test Tuple[Optional[int]] as input
-        scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[((5.1, ), ), ((None, ), ), ])
-        self.assertEqual(scripted_model((6.2, None, 10.6, 80.1, None, )), pdt_model((6.2, None, 10.6, 80.1, None, )))
-
-        # Test List[Optional[int]] as input
-        scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[([None, ], ), ([2, ], )])
-        self.assertEqual(scripted_model([2, None, 6, 8, ]), pdt_model([2, None, 6, 8, ]))
-
-        # Test Tuple[Optional[int]] as input
-        scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[((None, ), ), ((5, ), )])
-        self.assertEqual(scripted_model((2, None, 6, 8)), pdt_model((2, None, 6, 8, )))
-
-        # Test Tuple[Optional[float]] as input
-        scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[((None, ), ), ((5, ), )])
-        self.assertEqual(scripted_model((2, None, 6, 8)), pdt_model((2, None, 6, 8, )))
-
-        # Test Tuple[Optional[torch.Tensor]] as input
-        scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[(((torch.ones(1), ), (None, ), ), )])
-        self.assertEqual(scripted_model((torch.ones(1), torch.ones(1), None)),
-                         pdt_model((torch.ones(1), torch.ones(1), None)))
-
-        # Test List[Optional[torch.Tensor]] as input
-        scripted_model = torch.jit._script_pdt(pdt_model, example_inputs=[([None, ], ), ([torch.ones(1), ], )])
-        self.assertEqual(scripted_model([torch.ones(1), torch.ones(1), None]),
-                         pdt_model([torch.ones(1), torch.ones(1), None]))
diff --git a/torch/jit/_monkeytype_config.py b/torch/jit/_monkeytype_config.py
index b5a698eca7006..f0e4613e82fd1 100644
--- a/torch/jit/_monkeytype_config.py
+++ b/torch/jit/_monkeytype_config.py
@@ -1,7 +1,6 @@
 import inspect
 import typing
 import pathlib
-import torch
 from typing import Optional, Iterable, List, Dict
 from collections import defaultdict
 from types import CodeType
@@ -16,25 +15,38 @@
 except ImportError:
     _IS_MONKEYTYPE_INSTALLED = False
 
-def get_optional_of_element_type(types: str):
+def get_type(type):
+    """
+    Helper function which converts the given type to a torchScript acceptable format.
+    """
+    if isinstance(type, str):
+        return type
+    elif inspect.getmodule(type) == typing:
+        # If the type is a type imported from typing
+        # like Tuple, List, Dict then replace `typing.`
+        # with a null string. This needs to be done since
+        # typing.List is not accepted by TorchScript.
+        type_to_string = str(type)
+        return type_to_string.replace(type.__module__ + '.', '')
+    elif type.__module__.startswith('torch'):
+        # If the type is a subtype of torch module, then TorchScript expects a fully qualified name
+        # for the type which is obtained by combining the module name and type name.
+        return type.__module__ + '.' + type.__name__
+    else:
+        # For all other types use the name for the type.
+        return type.__name__
+
+def get_optional_of_element_type(types):
     """
     Helper function to extracts the type of the element to be annotated to Optional
     from the list of consolidated types and returns `Optional[element type]`.
-
     TODO: To remove this check once Union support lands.
     """
-    elements = types.split(",")
-    elem_type = elements[0] if 'NoneType' in elements[1] else elements[1]
-
-    # If the type is from typing module, then extract the element type
-    start = elem_type.find("[")
-    end = elem_type.rfind("]")
-    if start != -1 and end != -1:
-        return elem_type[:start + 1] + 'Optional[' + elem_type[start + 1: end] + ']]'
-
-    # Else return Optional[element type]
-    if elem_type == 'Tensor':
-        elem_type = 'torch.Tensor'
+    elem_type = types[1] if type(None) == types[0] else types[0]
+    elem_type = get_type(elem_type)
+
+    # Optional type is internally converted to Union[type, NoneType], which
+    # is not supported yet in TorchScript. Hence, representing the optional type as string.
     return 'Optional[' + elem_type + ']'
 
 def get_qualified_name(func):
@@ -88,30 +100,15 @@ def consolidate_types(self, qualified_name: str) -> Dict:
             # then consolidate the type to `Any` and replace the entry
             # by type `Any`.
             for arg, types in all_args.items():
-                _all_type = " "
-                for _type in types:
-                    # If the type is a type imported from typing
-                    # like Tuple, List, Dict then replace "typing."
-                    # with a null string.
-                    if inspect.getmodule(_type) == typing:
-                        _type_to_string = str(_type)
-                        _all_type += _type_to_string.replace('typing.', '') + ','
-                    elif _type is torch.nn.parameter.Parameter:
-                        # Check if the type is torch.nn.parameter.Parameter,
-                        # use the entire quaalified name `torch.nn.parameter.Parameter`
-                        # for type
-                        _all_type += 'torch.nn.parameter.Parameter' + ','
-                    else:
-                        _all_type += _type.__name__ + ','
-                _all_type = _all_type.lstrip(" ")  # Remove any trailing spaces
-
-                if len(types) == 2 and 'NoneType' in _all_type:
+                types = list(types)
+                type_length = len(types)
+                if type_length == 2 and type(None) in types:
                     # TODO: To remove this check once Union suppport in TorchScript lands.
-                    all_args[arg] = {get_optional_of_element_type(_all_type)}
-                elif len(types) > 1:
-                    all_args[arg] = {'Any'}
-                else:
-                    all_args[arg] = {_all_type[:-1]}
+                    all_args[arg] = get_optional_of_element_type(types)
+                elif type_length > 1:
+                    all_args[arg] = 'Any'
+                elif type_length == 1:
+                    all_args[arg] = get_type(types[0])
             return all_args
 
         def get_args_types(self, qualified_name: str) -> Dict:
@@ -157,7 +154,6 @@ def jit_code_filter(code: CodeType) -> bool:
     The custom CodeFilter is required while scripting a FX Traced forward calls.
     FX Traced forward calls have `code.co_filename` start with '<' which is used
     to exclude tracing of stdlib and site-packages in the default code filter.
-
     Since we need all forward calls to be traced, this custom code filter
     checks for code.co_name to be 'forward' and enables tracing for all such calls.
     The code filter is similar to default code filter for monkeytype and
diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py
index b0228b132980a..0928106f3ba49 100644
--- a/torch/jit/frontend.py
+++ b/torch/jit/frontend.py
@@ -337,9 +337,9 @@ def build_param_list(ctx, py_args, self_name, pdt_arg_types=None):
                 raise NotSupportedError(ctx_range, _vararg_kwarg_err)
 
     # List of Tuple of args and type as inferred by profile directed typing
-    arg_and_types = [(arg, next(iter(pdt_arg_types[arg.arg])) if pdt_arg_types and bool(pdt_arg_types[arg.arg]) else None)
+    arg_and_types = [(arg, pdt_arg_types[arg.arg] if pdt_arg_types and bool(pdt_arg_types[arg.arg]) else None)
                      for arg in py_args.args]
-    arg_and_types_kwonlyargs = [(arg, next(iter(pdt_arg_types[arg.arg])) if pdt_arg_types and bool(pdt_arg_types[arg.arg])
+    arg_and_types_kwonlyargs = [(arg, pdt_arg_types[arg.arg] if pdt_arg_types and bool(pdt_arg_types[arg.arg])
                                 else None) for arg in py_args.kwonlyargs]
 
     result = [build_param(ctx, arg, self_name, kwarg_only=False, pdt_arg_type=arg_type)

From b1154cc7741fa7ad4f075272347ff587ebf168f7 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 25 Aug 2021 22:04:44 -0700
Subject: [PATCH 240/530] enable equal_nan for complex values in isclose
 (#63571)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63571

Test Plan: Imported from OSS

Reviewed By: malfet, ngimel

Differential Revision: D30560127

Pulled By: mruberry

fbshipit-source-id: 8958121ca24e7c139d869607903aebbe87bc0740
---
 aten/src/ATen/native/TensorCompare.cpp |  6 ++----
 test/test_testing.py                   | 30 ++++++++++++++++++++------
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp
index 90a57d1d30c94..3f69cab48b090 100644
--- a/aten/src/ATen/native/TensorCompare.cpp
+++ b/aten/src/ATen/native/TensorCompare.cpp
@@ -108,8 +108,6 @@ bool allclose(const Tensor& self, const Tensor& other, double rtol, double atol,
 //  https://github.com/numpy/numpy/issues/15959 is resolved
 Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol, bool equal_nan) {
   TORCH_CHECK(self.scalar_type() == other.scalar_type(), self.scalar_type(), " did not match ", other.scalar_type());
-  TORCH_CHECK(!(self.is_complex() && equal_nan),
-    "isclose with equal_nan=True is not supported for complex inputs.");
   TORCH_CHECK(!(self.is_quantized() || other.is_quantized()),
     "isclose is not supported for quantized inputs.");
 
@@ -121,8 +119,8 @@ Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol
 
   // Computes equality closeness
   Tensor close = self == other;
-  if (equal_nan && self.is_floating_point()) {
-      close.__ior__((self != self).__iand__(other != other));
+  if (equal_nan && (self.is_floating_point() || self.is_complex())) {
+      close.__ior__(self.isnan().__iand__(other.isnan()));
   }
 
   // In case of zero tolerances the closeness inequality degenerates to an equality check.
diff --git a/test/test_testing.py b/test/test_testing.py
index d59290b36c27b..7e67569bb4799 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -335,8 +335,6 @@ def test_isclose_comparetensors_float(self, device, dtype):
 
         self._comparetensors_helper(tests, device, dtype, True)
 
-    # torch.close with equal_nan=True is not implemented for complex inputs
-    # see https://github.com/numpy/numpy/issues/15959
     # Note: compareTensor will compare the real and imaginary parts of a
     # complex tensors separately, unlike isclose.
     @dtypes(torch.complex64, torch.complex128)
@@ -416,13 +414,20 @@ def test_isclose_comparetensors_complex(self, device, dtype):
         # equal_nan = True tests
         tests = (
             (complex(1, 1), complex(1, float('nan')), False),
-            (complex(float('nan'), 1), complex(1, float('nan')), False),
+            (complex(1, 1), complex(float('nan'), 1), False),
             (complex(float('nan'), 1), complex(float('nan'), 1), True),
+            (complex(float('nan'), 1), complex(1, float('nan')), True),
+            (complex(float('nan'), float('nan')), complex(float('nan'), float('nan')), True),
         )
+        self._isclose_helper(tests, device, dtype, True)
 
-        with self.assertRaises(RuntimeError):
-            self._isclose_helper(tests, device, dtype, True)
-
+        tests = (
+            (complex(1, 1), complex(1, float('nan')), False),
+            (complex(1, 1), complex(float('nan'), 1), False),
+            (complex(float('nan'), 1), complex(float('nan'), 1), True),
+            (complex(float('nan'), 1), complex(1, float('nan')), False),
+            (complex(float('nan'), float('nan')), complex(float('nan'), float('nan')), True),
+        )
         self._comparetensors_helper(tests, device, dtype, True)
 
     # Tests that isclose with rtol or atol values less than zero throws a
@@ -449,6 +454,19 @@ def test_isclose_equality_shortcut(self):
 
         self.assertFalse(torch.isclose(a, b, rtol=0, atol=0))
 
+    @dtypes(torch.float16, torch.float32, torch.float64, torch.complex64, torch.complex128)
+    def test_isclose_nan_equality_shortcut(self, device, dtype):
+        if dtype.is_floating_point:
+            a = b = torch.nan
+        else:
+            a = complex(torch.nan, 0)
+            b = complex(0, torch.nan)
+
+        expected = True
+        tests = [(a, b, expected)]
+
+        self._isclose_helper(tests, device, dtype, equal_nan=True, rtol=0, atol=0)
+
     @dtypes(torch.bool, torch.long, torch.float, torch.cfloat)
     def test_make_tensor(self, device, dtype):
         def check(size, low, high, requires_grad, noncontiguous):

From b629ea4620c5707bfbf3640acb9c3c0f966c585d Mon Sep 17 00:00:00 2001
From: Joseph Spisak <spisakjo@gmail.com>
Date: Wed, 25 Aug 2021 22:49:22 -0700
Subject: [PATCH 241/530] Update persons_of_interest.rst (#63907)

Summary:
Fixes #{issue number}

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63907

Reviewed By: jspisak

Differential Revision: D30534972

Pulled By: dzhulgakov

fbshipit-source-id: ba726fc53e292a362c387cc8b5f7776ca2a2544c
---
 docs/source/community/persons_of_interest.rst | 158 ++++++++++++------
 1 file changed, 103 insertions(+), 55 deletions(-)

diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst
index c220ae80806e8..b1d4954a65768 100644
--- a/docs/source/community/persons_of_interest.rst
+++ b/docs/source/community/persons_of_interest.rst
@@ -4,50 +4,47 @@ PyTorch Governance | Persons of Interest
 General Maintainers
 -------------------
 
--  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
 -  Soumith Chintala (`soumith <https://github.com/soumith>`__)
 -  Edward Yang (`ezyang <https://github.com/ezyang>`__)
 -  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
 -  Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
--  (sunsetting) Sam Gross
-   (`colesbury <https://github.com/colesbury>`__)
+-  (emeritus) Sam Gross (`colesbury <https://github.com/colesbury>`__)
+-  (emeritus) Adam Paszke (`apaszke <https://github.com/apaszke>`__)
 
 Module-level maintainers
 ------------------------
 
-torch.*
-~~~~~~~
-
--  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
--  Soumith Chintala (`soumith <https://github.com/soumith>`__)
--  [linear algebra] Vishwak Srinivasan (`vishwakftw <https://github.com/vishwakftw>`__)
-
 torch.nn
 ~~~~~~~~
 
--  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
 -  Greg Chanan (`gchanan <https://github.com/gchanan>`__)
 -  Soumith Chintala (`soumith <https://github.com/soumith>`__)
--  Sam Gross (`colesbury <https://github.com/colesbury>`__)
+-  Joel Schlosser (`jbschlosser <https://github.com/jbschlosser>`__)
+-  (emeritus) Sam Gross (`colesbury <https://github.com/colesbury>`__)
+-  (emeritus) Adam Paszke (`apaszke <https://github.com/apaszke>`__)
 
 torch.optim
 ~~~~~~~~~~~
 
--  Vincent Quenneville-Belair (`vincentqb <https://github.com/vincentqb>`__)
 -  Soumith Chintala (`soumith <https://github.com/soumith>`__)
+-  Ilqar Ramazanli (`iramazanli <https://github.com/iramazanli>`__)
+-  (emeritus) Vincent Quenneville-Belair (`vincentqb <https://github.com/vincentqb>`__)
 
-Autograd Engine
-~~~~~~~~~~~~~~~
+torch.autograd
+~~~~~~~~~~~~~~
 
 -  Edward Yang (`ezyang <https://github.com/ezyang>`__)
 -  Alban Desmaison (`alband <https://github.com/alband>`__)
--  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
+-  (emeritus) Adam Paszke (`apaszke <https://github.com/apaszke>`__)
 
-JIT
-~~~
+JIT / TorchScript / FX
+~~~~~~~~~~~~~~~~~~~~~~
 
--  Zach Devito (`zdevito <https://github.com/zdevito>`__)
 -  Michael Suo (`suo <https://github.com/suo>`__)
+-  Yanan Cao (`gmagogsfm <https://github.com/gmagogsfm>`__)
+-  James Reed (`jamesr66a <https://github.com/jamesr66a>`__)
+-  (emeritus) Zach Devito (`zdevito <https://github.com/zdevito>`__)
+
 
 Distributions & RNG
 ~~~~~~~~~~~~~~~~~~~
@@ -60,39 +57,55 @@ Distributions & RNG
 Distributed
 ~~~~~~~~~~~
 
--  Pieter Noordhuis (`pietern <https://github.com/pietern>`__)
 -  Shen Li (`mrshenli <https://github.com/mrshenli>`__)
--  (proposed) Pritam Damania
-   (`pritamdamania87 <https://github.com/pritamdamania87>`__)
+-  Pritam Damania (`pritamdamania87 <https://github.com/pritamdamania87>`__)
+-  (emeritus) Pieter Noordhuis (`pietern <https://github.com/pietern>`__)
 
 Multiprocessing and DataLoaders
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 -  Vitaly Fedyunin (`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)
 -  Simon Wang (`SsnL <https://github.com/SsnL>`__)
--  Adam Paszke (`apaszke <https://github.com/apaszke>`__)
+-  (emeritus) Adam Paszke (`apaszke <https://github.com/apaszke>`__)
+
+torch.linalg / Linear Algebra
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+-  Mike Ruberry (`mruberry <https://github.com/mruberry>`__)
+-  Vishwak Srinivasan (`vishwakftw <https://github.com/vishwakftw>`__)
+-  Ivan Yashchuk (`IvanYashchuk <https://github.com/IvanYashchuk>`__)
+
+torch.fft
+~~~~~~~~~
+
+-  Mike Ruberry (`mruberry <https://github.com/mruberry>`__)
+-  Peter Bell (`peterbell10 <https://github.com/peterbell10>`__)
+
 
 CPU Performance / SIMD
 ~~~~~~~~~~~~~~~~~~~~~~
 
--  Xiaoqiang Zheng (`zheng-xq <https://github.com/zheng-xq>`__)
 -  Vitaly Fedyunin (`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)
--  Sam Gross (`colesbury <https://github.com/colesbury>`__)
--  (sunsetting) Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
--  [threading] Ilia Cherniavskii (`ilia-cher <https://github.com/ilia-cher>`__)
+-  (emeritus) Xiaoqiang Zheng (`zheng-xq <https://github.com/zheng-xq>`__)
+-  (emeritus) Sam Gross (`colesbury <https://github.com/colesbury>`__)
+-  (emeritus) Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
+-  (emeritus) Ilia Cherniavskii (`ilia-cher <https://github.com/ilia-cher>`__)
 
 CUDA
 ~~~~
 
 -  Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)
 -  Edward Yang (`ezyang <https://github.com/ezyang>`__)
--  Xiaoqiang Zheng (`zheng-xq <https://github.com/zheng-xq>`__)
+-  Piotr Bialecki (`ptrblck <https://github.com/ptrblck>`__)
+-  (emeritus) Xiaoqiang Zheng (`zheng-xq <https://github.com/zheng-xq>`__)
 
 MKLDNN
 ~~~~~~
 
--  Junjie Bai (`bddppq <https://github.com/bddppq>`__)
--  Yinghai Lu (`yinghai <https://github.com/yinghai>`__)
+-  Vitaly Fedyunin (`VitalyFedyunin <https://github.com/VitalyFedyunin>`__)
+-  Jianhui Li (`Jianhui-Li <https://github.com/Jianhui-Li>`__)
+-  (emeritus) Junjie Bai (`bddppq <https://github.com/bddppq>`__)
+-  (emeritus) Yinghai Lu (`yinghai <https://github.com/yinghai>`__)
 
 AMD/ROCm/HIP
 ~~~~~~~~~~~~
@@ -100,39 +113,66 @@ AMD/ROCm/HIP
 -  Peng Sun (`sunway513 <https://github.com/sunway513>`__)
 -  Jithun Nair (`jithunnair-amd <https://github.com/jithunnair-amd>`__)
 -  Jeff Daily (`jeffdaily <https://github.com/jeffdaily>`__)
+-  (emeritus) Junjie Bai (`bddppq <https://github.com/bddppq>`__)
 
 Build + CI
 ~~~~~~~~~~
 
--  Will Feng (`yf225 <https://github.com/yf225>`__)
--  Edward Yang (`ezyang <https://github.com/ezyang>`__)
--  Soumith Chintala (`soumith <https://github.com/soumith>`__)
--  Karl Ostmo (`kostmo <https://github.com/kostmo>`__)
--  Hong Xu (`xuhdev <https://github.com/xuhdev>`__)
+-  Nikita Shulga (`malfet <https://github.com/malfet>`__)
+-  Eli Uriegas (`seemethere <https://github.com/seemethere>`__)
+-  Zhuojie Zhou (`zhouzhuojie <https://github.com/zhouzhuojie>`__)
+-  (emeritus) Edward Yang (`ezyang <https://github.com/ezyang>`__)
+-  (emeritus) Karl Ostmo (`kostmo <https://github.com/kostmo>`__)
 
-Benchmarks
-~~~~~~~~~~
+Performance Tools
+~~~~~~~~~~~~~~~~~
 
--  Mingzhe Li (`mingzhe09088 <https://github.com/mingzhe09088>`__)
+-  Victor Bittorf (`bitfort <https://github.com/bitfort>`__)
+-  Gisle Dankel (`gdankel <https://github.com/gdankel>`__)
+-  Taylor Robie (`robieta <https://github.com/robieta>`__)
+-  Xu Zhao (`xuzhao9 <https://github.com/xuzhao9>`__)
+-  Geeta Chauhan (`chauhang <https://github.com/chauhang>`__)
+-  (emeritus) Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)
+-  (emeritus) Mingzhe Li (`mingzhe09088 <https://github.com/mingzhe09088>`__)
 
 C++ API
 ~~~~~~~
 
--  Will Feng (`yf225 <https://github.com/yf225>`__)
+-  Joel Schlosser (`jbschlosser <https://github.com/jbschlosser>`__)
+-  (emeritus) Will Feng (`yf225 <https://github.com/yf225>`__)
 
 C10 utils and operator dispatch
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
--  Sebastian Messmer (`smessmer <https://github.com/smessmer>`__)
+-  Brian Hirsh (`bdhirsh <https://github.com/bdhirsh>`__)
+-  Edward Yang (`ezyang <https://github.com/ezyang>`__)
 -  Dmytro Dzhulgakov (`dzhulgakov <https://github.com/dzhulgakov>`__)
+-  (emeritus) Sebastian Messmer (`smessmer <https://github.com/smessmer>`__)
 
 ONNX <-> PyTorch
 ~~~~~~~~~~~~~~~~
-
--  Lu Fang (`houseroad <https://github.com/houseroad>`__)
--  Lara Haidar (`lara-hdr <https://github.com/lara-hdr>`__)
--  Spandan Tiwari (`spandantiwari <https://github.com/spandantiwari>`__)
+-  Negin Raoof (`neginraoof <https://github.com/neginraoof>`__)
+-  Gary Miguel (`garymm <https://github.com/garymm>`__)
 -  Bowen Bao (`BowenBao <https://github.com/BowenBao>`__)
+-  (emeritus) Lu Fang (`houseroad <https://github.com/houseroad>`__)
+-  (emeritus) Lara Haidar (`lara-hdr <https://github.com/lara-hdr>`__)
+-  (emeritus) Spandan Tiwari (`spandantiwari <https://github.com/spandantiwari>`__)
+
+Mobile / Edge
+~~~~~~~~~~~~~
+-  David Reiss (`dreiss <https://github.com/dreiss>`__)
+-  Raziel Guevara (`raziel <https://github.com/raziel>`__)
+-  Linbin Yu (`linbinyu <https://github.com/linbinyu>`__)
+-  Ivan Kobzarev (`IvanKobzarev <https://github.com/IvanKobzarev>`__)
+-  Tao Xu (`xta0 <https://github.com/xta0>`__)
+
+Model Compression & Optimization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-  Raghuraman Krishnamoorthi (`raghuramank100 <https://github.com/raghuramank100>`__)
+-  Jerry Zhang (`jerryzh168 <https://github.com/jerryzh168>`__)
+-  Zafar Takhirov (`z-a-f <https://github.com/z-a-f>`__)
+-  Supriya Rao (`supriyar <https://github.com/supriyar>`__)
+
 
 Windows
 ~~~~~~~
@@ -152,31 +192,39 @@ Library-level maintainers
 XLA
 ~~~
 
--  Ailing Zhang (`ailzhang <https://github.com/ailzhang>`__)
+-  Jack Cao (`JackCaoG <https://github.com/JackCaoG>`__)
+-  Daniel Sohn (`jysohn23 <https://github.com/jysohn23>`__)
+-  Zach Cain (`zcain117 <https://github.com/zcain117>`__)
+-  Brian Hirsch (`bdhirsh <https://github.com/bdhirsh>`__)
 -  Gregory Chanan (`gchanan <https://github.com/gchanan>`__)
--  Davide Libenzi (`dlibenzi <https://github.com/dlibenzi>`__)
--  Alex Suhan (`asuhan <https://github.com/asuhan>`__)
+-  (emeritus) Ailing Zhang (`ailzhang <https://github.com/ailzhang>`__)
+-  (emeritus) Davide Libenzi (`dlibenzi <https://github.com/dlibenzi>`__)
+-  (emeritus) Alex Suhan (`asuhan <https://github.com/asuhan>`__)
 
 TorchServe
 ~~~~~~~~~~
 
-- Geeta Chauhan (`chauhang <https://github.com/chauhang>`__)
-- Manoj Rao (`mycpuorg <https://github.com/mycpuorg>`__)
-- Vamshi Dantu (`vdantu <https://github.com/vdantu>`__)
-- Dhanasekar Karuppasamy (`dhanainme <https://github.com/dhanainme>`__)
+-  Geeta Chauhan (`chauhang <https://github.com/chauhang>`__)
+-  Manoj Rao (`mycpuorg <https://github.com/mycpuorg>`__)
+-  Vamshi Dantu (`vdantu <https://github.com/vdantu>`__)
+-  Dhanasekar Karuppasamy (`dhanainme <https://github.com/dhanainme>`__)
 
 TorchVision
 ~~~~~~~~~~~
 
-- Francisco Massa (`fmassa <https://github.com/fmassa>`__)
+-  Francisco Massa (`fmassa <https://github.com/fmassa>`__)
+-  Vasilis Vryniotis (`datumbox <https://github.com/datumbox>`__)
 
 TorchText
 ~~~~~~~~~
 
-- Guanheng George Zhang (`zhangguanheng66 <https://github.com/zhangguanheng66>`__)
-- Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
+-  Parmeet Singh Bhatia (`parmeet <https://github.com/parmeet>`__)
+-  Steven Liu (`hudeven <https://github.com/hudeven>`__)
+-  (emeritus) Guanheng George Zhang (`zhangguanheng66 <https://github.com/zhangguanheng66>`__)
+-  (emeritus) Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
 
 TorchAudio
 ~~~~~~~~~~
 
-- Vincent QB (`vincentqb <https://github.com/vincentqb>`__)
+-  Moto Hira (`mthrok <https://github.com/mthrok>`__)
+-  (emeritus) Vincent QB (`vincentqb <https://github.com/vincentqb>`__)

From 9d95d485679392774532d4c79a73b9c11b665e1b Mon Sep 17 00:00:00 2001
From: Kiuk Chung <kiuk@fb.com>
Date: Wed, 25 Aug 2021 22:56:33 -0700
Subject: [PATCH 242/530] (torch.distributed) Add
 torch.distributed.is_torchelastic_launched() util method + make
 init_method=tcp:// compatible with torchelastic (#63910)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63910

Addresses the current issue that `init_method=tcp://` is not compatible with `torch.distributed.run` and `torch.distributed.launch`. When running with a training script that initializes the process group with `init_method=tcp://localhost:$port` as such:

```
$ python -u -m torch.distributed.run --max_restarts 0 --nproc_per_node 1 --nnodes 1 --master_addr $(hostname) --master_port 6000 ~/tmp/test.py
```

An `Address in use` error is raised since the training script tries to create a TCPStore on port 6000, which is already taken since the elastic agent is already running a TCPStore on that port.

For details see: https://github.com/pytorch/pytorch/issues/63874.

This change does a couple of things:

1. Adds `is_torchelastic_launched()` check function that users can use in the training scripts to see whether the script is launched via torchelastic.
1. Update the `torch.distributed` docs page to include the new `is_torchelastic_launched()` function.
1. Makes `init_method=tcp://` torchelastic compatible by modifying `_tcp_rendezvous_handler` in `torch.distributed.rendezvous` (this is NOT the elastic rendezvous, it is the old rendezvous module which is slotted for deprecation in future releases) to check `is_torchelastic_launched()` AND `torchelastic_use_agent_store()` and if so, only create TCPStore clients (no daemons, not even for rank 0).
1. Adds a bunch of unittests to cover the different code paths

NOTE: the issue mentions that we should fail-fast with an assertion on `init_method!=env://` when `is_torchelastic_launched()` is `True`. There are three registered init_methods in pytorch: env://, tcp://, file://. Since this diff makes tcp:// compatible with torchelastic and I've validated that file is compatible with torchelastic. There is no need to add assertions. I did update the docs to point out that env:// is the RECOMMENDED init_method. We should probably deprecate the other init_methods in the future but this is out of scope for this issue.

Test Plan: Unittests.

Reviewed By: cbalioglu

Differential Revision: D30529984

fbshipit-source-id: 267aea6d4dad73eb14a2680ac921f210ff547cc5
---
 docs/source/distributed.rst                   |   2 +
 .../launcher/bin/test_script_init_method.py   |  76 ++++++++++++
 .../test_script_is_torchelastic_launched.py   |  42 +++++++
 test/distributed/launcher/run_test.py         | 117 ++++++++++++++++++
 test/distributed/test_launcher.py             |   6 +-
 torch/_C/_distributed_c10d.pyi                |   3 +-
 torch/distributed/distributed_c10d.py         |  46 ++++---
 torch/distributed/launch.py                   |  10 +-
 torch/distributed/rendezvous.py               |  93 +++++++++-----
 torch/distributed/run.py                      |   1 +
 10 files changed, 342 insertions(+), 54 deletions(-)
 create mode 100755 test/distributed/launcher/bin/test_script_init_method.py
 create mode 100755 test/distributed/launcher/bin/test_script_is_torchelastic_launched.py

diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst
index 0f4e051bbf4db..c5cd727fa7ea0 100644
--- a/docs/source/distributed.rst
+++ b/docs/source/distributed.rst
@@ -180,6 +180,8 @@ joined.
 
 .. autofunction:: is_nccl_available
 
+.. autofunction:: is_torchelastic_launched
+
 --------------------------------------------------------------------------------
 
 Currently three initialization methods are supported:
diff --git a/test/distributed/launcher/bin/test_script_init_method.py b/test/distributed/launcher/bin/test_script_init_method.py
new file mode 100755
index 0000000000000..299839c40759b
--- /dev/null
+++ b/test/distributed/launcher/bin/test_script_init_method.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="test script")
+
+    parser.add_argument(
+        "--init_method",
+        type=str,
+        required=True,
+        help="init_method to pass to `dist.init_process_group()` (e.g. env://)",
+    )
+    parser.add_argument(
+        "--world_size",
+        type=int,
+        default=os.getenv("WORLD_SIZE", -1),
+        help="world_size to pass to `dist.init_process_group()`",
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=os.getenv("RANK", -1),
+        help="rank to pass to `dist.init_process_group()`",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    dist.init_process_group(
+        backend="gloo",
+        init_method=args.init_method,
+        world_size=args.world_size,
+        rank=args.rank,
+    )
+
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+
+    # one hot (by rank) tensor of size world_size
+    # example:
+    # rank 0, world_size 4 => [1, 0, 0, 0]
+    # rank 1, world_size 4 => [0, 1, 0, 0]
+    # ...
+    t = F.one_hot(torch.tensor(rank), num_classes=world_size)
+
+    # after all_reduce t = tensor.ones(size=world_size)
+    dist.all_reduce(t)
+
+    # adding all elements in t should equal world_size
+    derived_world_size = torch.sum(t).item()
+    if derived_world_size != world_size:
+        raise RuntimeError(
+            f"Wrong world size derived. Expected: {world_size}, Got: {derived_world_size}"
+        )
+
+    print("Done")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py b/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py
new file mode 100755
index 0000000000000..fa9729c757b64
--- /dev/null
+++ b/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+This is a test script that launches as part of the test cases in
+run_test.py, to validate the correctness of
+the method ``torch.distributed.is_torchelastic_launched()``. To do so,
+we run this script with and without torchelastic and validate that the
+boolean value written to the out_file is indeed what we expect (e.g.
+should be False when not launched with torchelastic, True when launched with)
+The script itself is not a test case hence no assertions are made in this script.
+
+see: - test/distributed/launcher/run_test.py#test_is_torchelastic_launched()
+     - test/distributed/launcher/run_test.py#test_is_not_torchelastic_launched()
+"""
+import argparse
+
+import torch.distributed as dist
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="test script")
+    parser.add_argument(
+        "--out_file",
+        help="file to write indicating whether this script was launched with torchelastic",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    with open(args.out_file, "w") as out:
+        out.write(f"{dist.is_torchelastic_launched()}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/distributed/launcher/run_test.py b/test/distributed/launcher/run_test.py
index 079fea792ed02..4ed824c036390 100644
--- a/test/distributed/launcher/run_test.py
+++ b/test/distributed/launcher/run_test.py
@@ -7,8 +7,10 @@
 # LICENSE file in the root directory of this source tree.
 import multiprocessing as mp
 import os
+import runpy
 import shutil
 import subprocess
+import sys
 import tempfile
 import unittest
 import uuid
@@ -21,6 +23,7 @@
 from torch.distributed.elastic.multiprocessing.errors import ChildFailedError
 from torch.distributed.elastic.rendezvous.etcd_server import EtcdServer
 from torch.distributed.elastic.utils import get_socket_with_port
+from torch.distributed.elastic.utils.distributed import get_free_port
 from torch.testing._internal.common_utils import (
     TEST_WITH_DEV_DBG_ASAN,
     sandcastle_skip_if,
@@ -475,3 +478,117 @@ def test_launch_shutdown(self, agent_mock_cls):
             param_mock.return_value = rdzv_handler_mock
             launch.main(args)
             rdzv_handler_mock.shutdown.assert_called_once()
+
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
+    def test_is_torchelastic_launched(self):
+        # launch test script with torchelastic and validate that
+        # torch.distributed.is_torchelastic_launched() returns True
+
+        out_file = f"{os.path.join(self.test_dir, 'out')}"
+
+        launch.main(
+            [
+                "--run_path",
+                "--nnodes=1",
+                "--nproc_per_node=1",
+                "--monitor_interval=1",
+                path("bin/test_script_is_torchelastic_launched.py"),
+                f"--out_file={out_file}",
+            ]
+        )
+
+        with open(out_file, "r") as fp:
+            is_torchelastic_launched = fp.readline()
+            self.assertEqual("True", is_torchelastic_launched)
+
+    def test_is_not_torchelastic_launched(self):
+        # launch test script without torchelastic and validate that
+        # torch.distributed.is_torchelastic_launched() returns False
+
+        out_file = f"{os.path.join(self.test_dir, 'out')}"
+
+        # need to run the script with runpy in the same interpreter
+        # as the test because otherwise (depending on the environment)
+        # it will not find torch as a dependency
+        with patch.object(
+            sys,
+            "argv",
+            [
+                path("bin/test_script_is_torchelastic_launched.py"),
+                f"--out_file={out_file}",
+            ],
+        ):
+            runpy.run_path(sys.argv[0], run_name="__main__")
+            with open(out_file, "r") as fp:
+                is_torchelastic_launched = fp.readline()
+                self.assertEqual("False", is_torchelastic_launched)
+
+    def test_init_method_tcp(self):
+        port = get_free_port()
+        with patch.object(
+            sys,
+            "argv",
+            [
+                path("bin/test_script_init_method.py"),
+                f"--init_method=tcp://localhost:{port}",
+                "--rank=0",
+                "--world_size=1",
+            ],
+        ):
+            runpy.run_path(sys.argv[0], run_name="__main__")
+            # nothing to validate, just make sure it runs
+
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
+    def test_init_method_tcp_with_torchelastic(self):
+        port = get_free_port()
+        launch.main(
+            [
+                "--run_path",
+                "--nnodes=1",
+                "--nproc_per_node=4",
+                "--master_addr=localhost",
+                f"--master_port={port}",
+                "--monitor_interval=1",
+                path("bin/test_script_init_method.py"),
+                f"--init_method=tcp://localhost:{port}",
+            ]
+        )
+        # nothing to validate, just make sure it runs
+
+    def test_init_method_env(self):
+        port = get_free_port()
+        with patch.dict(
+            os.environ,
+            {
+                "RANK": "0",
+                "WORLD_SIZE": "1",
+                "MASTER_ADDR": "localhost",
+                "MASTER_PORT": str(port),
+            },
+        ), patch.object(
+            sys,
+            "argv",
+            [
+                path("bin/test_script_init_method.py"),
+                "--init_method=env://",
+            ],
+        ):
+            runpy.run_path(sys.argv[0], run_name="__main__")
+            # nothing to validate, just make sure it runs
+
+    @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
+    def test_init_method_env_with_torchelastic(self):
+        port = get_free_port()
+        launch.main(
+            [
+                "--run_path",
+                "--nnodes=1",
+                "--nproc_per_node=4",
+                "--master_addr=localhost",
+                f"--master_port={port}",
+                "--monitor_interval=1",
+                path("bin/test_script_init_method.py"),
+                "--init_method=env://",
+            ]
+        )
+        # nothing to validate, just make sure it runs
diff --git a/test/distributed/test_launcher.py b/test/distributed/test_launcher.py
index 4565a266bc9ec..422c88b6bdee5 100644
--- a/test/distributed/test_launcher.py
+++ b/test/distributed/test_launcher.py
@@ -20,10 +20,14 @@
 def path(script):
     return os.path.join(os.path.dirname(__file__), script)
 
+
 if TEST_WITH_DEV_DBG_ASAN:
-    print("Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr)
+    print(
+        "Skip ASAN as torch + multiprocessing spawn have known issues", file=sys.stderr
+    )
     sys.exit(0)
 
+
 class TestDistributedLaunch(TestCase):
     def test_launch_user_script(self):
         nnodes = 1
diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi
index cfa9c7cc1a46c..50e7602bdd838 100644
--- a/torch/_C/_distributed_c10d.pyi
+++ b/torch/_C/_distributed_c10d.pyi
@@ -134,7 +134,8 @@ class TCPStore(Store):
         world_size: int = ...,
         is_master: bool = ...,
         timeout: timedelta = ...,
-        wait_for_workers: bool = ...
+        wait_for_workers: bool = ...,
+        multi_tenant: bool = ...
     ): ...
 
 class PrefixStore(Store):
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 1b1244d9e37d5..fac096e339e71 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -1,6 +1,7 @@
 import contextlib
 import io
 import logging
+import os
 import pickle
 import time
 import warnings
@@ -9,28 +10,31 @@
 
 import torch
 from torch._C._distributed_c10d import (
-    AllreduceOptions,
     AllreduceCoalescedOptions,
+    AllreduceOptions,
     AllToAllOptions,
     BarrierOptions,
     BroadcastOptions,
     GatherOptions,
     PrefixStore,
     ProcessGroup,
-    ReduceOptions,
     ReduceOp,
+    ReduceOptions,
     ReduceScatterOptions,
     ScatterOptions,
     Store,
+    _DistributedDebugLevel,
+    _get_debug_mode,
 )
-from torch._C._distributed_c10d import _get_debug_mode, _DistributedDebugLevel
 from torch._six import string_classes
 
+from .constants import default_pg_timeout
+from .rendezvous import register_rendezvous_handler, rendezvous  # noqa: F401
+
+
 # This module is wildcard imported from torch.distributed.
 # TODO: specify __all__
 
-from .constants import default_pg_timeout
-from .rendezvous import rendezvous, register_rendezvous_handler  # noqa: F401
 
 _MPI_AVAILABLE = True
 _NCCL_AVAILABLE = True
@@ -244,7 +248,9 @@ def _store_based_barrier(rank, store, timeout):
                 )
             )
 
-    logger.info(f"Rank {rank}: Completed store-based barrier for key:{store_key} with {world_size} nodes.")
+    logger.info(
+        f"Rank {rank}: Completed store-based barrier for key:{store_key} with {world_size} nodes."
+    )
 
 
 def _rank_not_in_group(group: ProcessGroup):
@@ -384,6 +390,18 @@ def is_initialized():
     return GroupMember.WORLD is not None
 
 
+def is_torchelastic_launched():
+    """
+    Checks whether this process was launched with ``torch.distributed.elastic``
+    (aka torchelastic). The existence of ``TORCHELASTIC_RUN_ID`` environment
+    variable is used as a proxy to determine whether the current process
+    was launched with torchelastic. This is a reasonable proxy since
+    ``TORCHELASTIC_RUN_ID`` maps to the rendezvous id which is always a
+    non-null value indicating the job id for peer discovery purposes..
+    """
+    return os.getenv("TORCHELASTIC_RUN_ID") is not None
+
+
 def _get_default_group():
     """
     Getting the default process group created by init_process_group
@@ -1778,8 +1796,8 @@ def broadcast_object_list(object_list, src=0, group=None, device=None):
     is_nccl_backend = group_backend == Backend.NCCL
     current_device = None
     if device is not None:
-        if is_nccl_backend and device.type != 'cuda':
-            raise ValueError('device type must be cuda for nccl backend')
+        if is_nccl_backend and device.type != "cuda":
+            raise ValueError("device type must be cuda for nccl backend")
         current_device = device
     else:
         current_device = torch.device("cpu")
@@ -2229,7 +2247,9 @@ def scatter(tensor, scatter_list=None, src=0, group=None, async_op=False):
 
     if _rank_not_in_group(group):
         return
-    scatter_list = [t if not t.is_complex() else torch.view_as_real(t) for t in scatter_list]
+    scatter_list = [
+        t if not t.is_complex() else torch.view_as_real(t) for t in scatter_list
+    ]
     tensor = tensor if not tensor.is_complex() else torch.view_as_real(tensor)
 
     my_rank = get_rank()
@@ -3026,9 +3046,7 @@ def new_subgroups(
         if rank in ranks_in_subgroup:
             cur_subgroup = subgroup
             logger.info(
-                "Rank {} is assigned to subgroup {}".format(
-                    rank, ranks_in_subgroup
-                )
+                "Rank {} is assigned to subgroup {}".format(rank, ranks_in_subgroup)
             )
 
     return cur_subgroup, subgroups
@@ -3139,8 +3157,6 @@ def new_subgroups_by_enumeration(
             rank_to_ranks_dict[rank] = ranks
             if my_rank == rank:
                 cur_subgroup = subgroup
-                logging.info(
-                    "Rank {} is assigned to subgroup {}".format(rank, ranks)
-                )
+                logging.info("Rank {} is assigned to subgroup {}".format(rank, ranks))
 
     return cur_subgroup, subgroups
diff --git a/torch/distributed/launch.py b/torch/distributed/launch.py
index 5fcb3eb44c126..4f29edd10d521 100644
--- a/torch/distributed/launch.py
+++ b/torch/distributed/launch.py
@@ -97,9 +97,9 @@
     >>>    # your code to run
 
 3. In your training program, you are supposed to call the following function
-at the beginning to start the distributed backend. You need to make sure that
-the init_method uses ``env://``, which is the only supported ``init_method``
-by this module.
+at the beginning to start the distributed backend. It is strongly recommended
+that ``init_method=env://``. Other init methods (e.g. ``tcp://``) may work,
+but ``env://`` is the one that is officially supported by this module.
 
 ::
 
@@ -147,6 +147,7 @@
 
 from torch.distributed.run import get_args_parser, run
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -181,7 +182,8 @@ def main(args=None):
         "If your script expects `--local_rank` argument to be set, please\n"
         "change it to read from `os.environ['LOCAL_RANK']` instead. See \n"
         "https://pytorch.org/docs/stable/distributed.html#launch-utility for \n"
-        "further instructions\n", FutureWarning
+        "further instructions\n",
+        FutureWarning,
     )
     args = parse_args(args)
     launch(args)
diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py
index 6a5b680e25011..6e430e273f951 100644
--- a/torch/distributed/rendezvous.py
+++ b/torch/distributed/rendezvous.py
@@ -1,17 +1,22 @@
 try:
     from urllib.parse import urlparse, urlunparse
 except ImportError:
-    raise ImportError("urllib cannot be found, urlparse from python2 is no longer supported.")
+    raise ImportError(
+        "urllib cannot be found, urlparse from python2 is no longer supported."
+    )
 
-import torch._six as six
 import numbers
 import os
 import sys
 from datetime import timedelta
-from typing import Optional, Dict, Union
-from torch.distributed import FileStore, TCPStore, PrefixStore
+from typing import Dict, Optional, Union
+
+import torch._six as six
+from torch.distributed import FileStore, PrefixStore, Store, TCPStore
+
 from .constants import default_pg_timeout
 
+
 _rendezvous_handlers = {}
 
 
@@ -73,7 +78,9 @@ def rendezvous(url: str, rank: int = -1, world_size: int = -1, **kwargs):
             query_dict["world_size"] = world_size
 
         result = result._replace(
-            query="{}".format("&".join(["{}={}".format(k, v) for k, v in query_dict.items()]))
+            query="{}".format(
+                "&".join(["{}={}".format(k, v) for k, v in query_dict.items()])
+            )
         )
         url = urlunparse(result)
 
@@ -92,8 +99,9 @@ def _error(msg):
 
     result = urlparse(url)
     path = result.path
-    if sys.platform == 'win32':
+    if sys.platform == "win32":
         import urllib.request
+
         full_path = result.netloc + result.path
         path = urllib.request.url2pathname(full_path)
         if path:
@@ -119,7 +127,41 @@ def _error(msg):
     raise RuntimeError("Unable to perform rerendezvous using file:// method")
 
 
-def _tcp_rendezvous_handler(url: str, timeout: timedelta = default_pg_timeout, **kwargs):
+def _torchelastic_use_agent_store() -> bool:
+    return os.environ.get("TORCHELASTIC_USE_AGENT_STORE", None) == str(True)
+
+
+def _create_c10d_store(hostname, port, rank, world_size, timeout) -> Store:
+    """
+    Smartly creates a c10d Store object on ``rank`` based on whether
+    we need to re-use agent store. The TCPStore server is assumed to be hosted
+    on ``hostname:port``.
+
+    If ``torchelastic_use_agent_store()`` is ``True``, then it is assumed that
+    the agent leader (node rank 0) hosts the TCPStore server (for which the
+    endpoint is specified by the given ``hostname:port``). Hence
+    ALL ranks will create and return a TCPStore client (e.g. ``start_daemon=False``).
+
+    If ``torchelastic_use_agent_store()`` is ``False``, then rank 0 will host
+    the TCPStore (with multi-tenancy) and it is assumed that rank 0's hostname
+    and port are correctly passed via ``hostname`` and ``port``. All
+    non-zero ranks will create and return a TCPStore client.
+    """
+
+    if _torchelastic_use_agent_store():
+        attempt = os.environ["TORCHELASTIC_RESTART_COUNT"]
+        tcp_store = TCPStore(hostname, port, world_size, False, timeout)
+        return PrefixStore(f"/worker/attempt_{attempt}", tcp_store)
+    else:
+        start_daemon = rank == 0
+        return TCPStore(
+            hostname, port, world_size, start_daemon, timeout, multi_tenant=True
+        )
+
+
+def _tcp_rendezvous_handler(
+    url: str, timeout: timedelta = default_pg_timeout, **kwargs
+):
     def _error(msg):
         return _rendezvous_error("tcp:// rendezvous: " + msg)
 
@@ -136,18 +178,19 @@ def _error(msg):
 
     rank = int(query["rank"])
     world_size = int(query["world_size"])
-    start_daemon = rank == 0
     assert result.hostname is not None
-    store = TCPStore(  # type: ignore[call-arg]
-        result.hostname, result.port, world_size, start_daemon, timeout, multi_tenant=True
-    )
+
+    store = _create_c10d_store(result.hostname, result.port, rank, world_size, timeout)
+
     yield (store, rank, world_size)
 
     # If this configuration is invalidated, there is nothing we can do about it
-    raise RuntimeError("Unable to perform rerendezvous using tcp:// method")
+    raise RuntimeError("Unable to perform re-rendezvous using tcp:// method")
 
 
-def _env_rendezvous_handler(url: str, timeout: timedelta = default_pg_timeout, **kwargs):
+def _env_rendezvous_handler(
+    url: str, timeout: timedelta = default_pg_timeout, **kwargs
+):
     def _error(msg):
         return _rendezvous_error("env:// rendezvous: " + msg)
 
@@ -183,29 +226,13 @@ def _get_env_or_raise(env_var: str) -> str:
     master_addr = _get_env_or_raise("MASTER_ADDR")
     master_port = int(_get_env_or_raise("MASTER_PORT"))
 
+    store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout)
 
-    use_torchelastic_store = os.environ.get("TORCHELASTIC_USE_AGENT_STORE", None)
-
-    if use_torchelastic_store == str(True):
-        attempt = os.environ["TORCHELASTIC_RESTART_COUNT"]
-        worker_process_prefix = f"/worker/attempt_{attempt}"
-        # When TORCHELASTIC_USE_AGENT_STORE is set up, the worker process is assumed
-        # to be invoked by the torchelastic agent. Torchelastic agent creates a tcp daemon thread
-        # on the GROUP_RANK=0, as a result all user worker processes should create store with: daemon=False
-        tcp_store = TCPStore(master_addr, master_port, world_size, False, timeout)
-        # Each if-else condition returns due to: https://github.com/python/mypy/issues/1191
-        yield (PrefixStore(worker_process_prefix, tcp_store), rank, world_size)
-    else:
-        # Start the TCP store daemon on the rank 0
-        start_daemon = rank == 0
-        store = TCPStore(  # type: ignore[call-arg]
-            master_addr, master_port, world_size, start_daemon, timeout, multi_tenant=True
-        )
-        # Each if-else condition returns due to: https://github.com/python/mypy/issues/1191
-        yield (store, rank, world_size)
+    yield (store, rank, world_size)
 
     # If this configuration is invalidated, there is nothing we can do about it
-    raise RuntimeError("Unable to perform rerendezvous using env:// method")
+    raise RuntimeError("Unable to perform re-rendezvous using env:// method")
+
 
 register_rendezvous_handler("tcp", _tcp_rendezvous_handler)
 register_rendezvous_handler("env", _env_rendezvous_handler)
diff --git a/torch/distributed/run.py b/torch/distributed/run.py
index f21fc4e68808f..d4428a0cde3c1 100644
--- a/torch/distributed/run.py
+++ b/torch/distributed/run.py
@@ -321,6 +321,7 @@ def train():
 from torch.distributed.elastic.utils.logging import get_logger
 from torch.distributed.launcher.api import LaunchConfig, elastic_launch
 
+
 log = get_logger()
 
 
From 3b284ab0243d22bb831a1685f47061b9612e1cb3 Mon Sep 17 00:00:00 2001
From: Marjan Fariborz <marjanf@fb.com>
Date: Wed, 25 Aug 2021 23:40:09 -0700
Subject: [PATCH 243/530] Adding BFP16 quantization/dequantization support to
 OSS (#63059)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63059

Supporting BFP16 quantization method to OSS. Currently only support CPU
ghstack-source-id: 136639528

Test Plan: Imported from OSS

Reviewed By: wanchaol

Differential Revision: D30194538

fbshipit-source-id: ac248567ad8028457c2a91b77ef2ce81709fce53
---
 .../quantization/test_quantization.py         |  65 +++++---
 tools/build_variables.bzl                     |   2 +
 torch/csrc/distributed/c10d/init.cpp          |  24 +++
 .../c10d/quantization/quantization.cpp        |  93 +++++++++++
 .../c10d/quantization/quantization.h          |  20 +++
 .../c10d/quantization/quantization_gpu.cu     | 148 ++++++++++++++++++
 .../c10d/quantization/quantization_gpu.h      |  20 +++
 .../c10d/quantization/quantization_utils.h    |  31 ++++
 .../algorithms/quantization/quantization.py   |  41 +++--
 9 files changed, 409 insertions(+), 35 deletions(-)
 create mode 100644 torch/csrc/distributed/c10d/quantization/quantization.cpp
 create mode 100644 torch/csrc/distributed/c10d/quantization/quantization.h
 create mode 100644 torch/csrc/distributed/c10d/quantization/quantization_gpu.cu
 create mode 100644 torch/csrc/distributed/c10d/quantization/quantization_gpu.h
 create mode 100644 torch/csrc/distributed/c10d/quantization/quantization_utils.h

diff --git a/test/distributed/algorithms/quantization/test_quantization.py b/test/distributed/algorithms/quantization/test_quantization.py
index 7872920f21141..505f805b2cc10 100644
--- a/test/distributed/algorithms/quantization/test_quantization.py
+++ b/test/distributed/algorithms/quantization/test_quantization.py
@@ -8,6 +8,7 @@
 from torch.testing._internal.common_distributed import (
     MultiProcessTestCase,
     requires_gloo,
+    skip_if_rocm,
     skip_if_lt_x_gpu,
     requires_nccl,
 )
@@ -26,9 +27,9 @@ def _build_tensor(size, value=None, dtype=torch.float, device_id=None):
     if value is None:
         value = size
     if device_id is None:
-        return torch.empty(size, size, size, dtype=dtype).fill_(value)
+        return torch.empty(size, dtype=dtype).fill_(value)
     else:
-        return torch.empty(size, size, size, dtype=dtype).fill_(value).cuda(device_id)
+        return torch.empty(size, dtype=dtype).fill_(value).cuda(device_id)
 if TEST_WITH_DEV_DBG_ASAN:
     print("Skip dev-asan as torch + multiprocessing spawn have known issues", file=sys.stderr)
     sys.exit(0)
@@ -38,7 +39,6 @@ def _build_tensor(size, value=None, dtype=torch.float, device_id=None):
     sys.exit(0)
 
 BACKEND = os.environ["BACKEND"]
-
 if BACKEND == "gloo" or BACKEND == "nccl":
     class DistQuantizationTests(MultiProcessTestCase):
 
@@ -60,7 +60,7 @@ def op_timeout_sec(self):
 
         @property
         def world_size(self):
-            return 2
+            return int(os.environ["WORLD_SIZE"])
 
         def _init_multigpu_helper(self):
             """Multigpu tests are designed to simulate the multi nodes with multi
@@ -69,7 +69,7 @@ def _init_multigpu_helper(self):
             divided to subsets, each process only uses a subset.
             """
             nGPUs = torch.cuda.device_count()
-            world_size = dist.get_world_size()
+            world_size = self.world_size
             visible_devices = range(nGPUs)
 
             if BACKEND == "nccl":
@@ -91,18 +91,29 @@ def _init_multigpu_helper(self):
         @requires_gloo()
         @sandcastle_skip_if(BACKEND != "gloo", "Only gloo backend supports all_gather_fp16")
         def test_all_gather_fp16(self):
-            store = dist.FileStore(self.file_name, int(self.world_size))
+            store = dist.FileStore(self.file_name, self.world_size)
             dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
             device = torch.device(f"cuda:{self.rank}")
             group = list(range(0, self.world_size))
             group_id = dist.group.WORLD
             self._test_all_gather(group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.FP16)
 
+        @requires_gloo()
+        @sandcastle_skip_if(BACKEND != "gloo", "Only gloo backend supports all_gather_fp16")
+        def test_all_gather_bfp16(self):
+            store = dist.FileStore(self.file_name, self.world_size)
+            dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='gloo')
+            device = torch.device(f"cuda:{self.rank}")
+            group = list(range(0, self.world_size))
+            group_id = dist.group.WORLD
+            self._test_all_gather(group, group_id, self.rank, dtype=torch.float32, qtype=DQuantType.BFP16)
+
         @requires_nccl()
         @sandcastle_skip_if(BACKEND != "nccl", "Only nccl backend supports all_to_all_fp16")
         @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
+        @skip_if_rocm
         def test_all_to_all_fp16(self):
-            store = dist.FileStore(self.file_name, int(self.world_size))
+            store = dist.FileStore(self.file_name, self.world_size)
             dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='nccl')
             device = torch.device(f"cuda:{self.rank}")
             group = list(range(0, self.world_size))
@@ -117,16 +128,34 @@ def test_all_to_all_fp16(self):
                 dtype=torch.float32,
                 qtype=DQuantType.FP16)
 
+        @requires_nccl()
+        @sandcastle_skip_if(BACKEND != "nccl", "Only nccl backend supports all_to_all_fp16")
+        @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
+        @skip_if_rocm
+        def test_all_to_all_bfp16(self):
+            store = dist.FileStore(self.file_name, self.world_size)
+            dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='nccl')
+            device = torch.device(f"cuda:{self.rank}")
+            group = list(range(0, self.world_size))
+            group_id = dist.new_group(range(self.world_size))
+            rank_to_GPU = self._init_multigpu_helper()
+            self._test_all_to_all(
+                group,
+                group_id,
+                self.rank,
+                cuda=True,
+                rank_to_GPU=rank_to_GPU,
+                dtype=torch.float32,
+                qtype=DQuantType.BFP16)
+
         def _test_all_gather(
                 self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float, qtype=None):
             for dest in group:
-                tensor = _build_tensor(dest + 1, rank, dtype=dtype)
-                tensors = [_build_tensor(dest + 1, -1, dtype=dtype) for i in group]
-                expected_tensors = [_build_tensor(dest + 1, i, dtype=dtype) for i in group]
-                if (qtype is not None):
-                    allgather = quant.auto_quantize(dist.all_gather, qtype, quant_loss=None)
-                else:
-                    allgather = dist.all_gather
+                tensor = _build_tensor([dest + 1, dest + 1], rank, dtype=dtype)
+                tensors = [_build_tensor([dest + 1, dest + 1], -1, dtype=dtype) for i in group]
+                expected_tensors = [
+                    _build_tensor([dest + 1, dest + 1], i, dtype=dtype) for i in group
+                ]
                 if cuda:
                     tensor = tensor.cuda(rank_to_GPU[rank][0])
                     tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors]
@@ -134,6 +163,7 @@ def _test_all_gather(
                     tensor_shapes = [torch.view_as_real(tensors[0]).shape]
                 else:
                     tensor_shapes = [tensors[0].shape]
+                allgather = quant.auto_quantize(dist.all_gather, qtype, quant_loss=None)
                 allgather(tensors, tensor, group=group_id, async_op=False)
 
                 for t1, t2 in zip(tensors, expected_tensors):
@@ -168,11 +198,8 @@ def _test_all_to_all(
                         t.cuda(rank_to_GPU[rank][0]) for t in expected_tensors
                     ]
                     out_tensors = [t.cuda(rank_to_GPU[rank][0]) for t in out_tensors]
-                if(qtype is not None):
-                    quantize_alltoall = quant.auto_quantize(dist.all_to_all, qtype, quant_loss=None)
-                    quantize_alltoall(out_tensors, in_tensors, group=group_id)
-                else:
-                    dist.all_to_all(out_tensors, in_tensors, group=group_id)
+                quantize_alltoall = quant.auto_quantize(dist.all_to_all, qtype, quant_loss=None)
+                quantize_alltoall(out_tensors, in_tensors, group=group_id)
                 for t1, t2 in zip(out_tensors, expected_tensors):
                     self.assertEqual(t1, t2)
 
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 5f4cc0df522f5..3f6225358ac97 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -551,6 +551,7 @@ libtorch_cuda_distributed_extra_sources = [
     "torch/csrc/distributed/c10d/NCCLUtils.cpp",
     "torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp",
     "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
+    "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
 ]
 
 libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources
@@ -737,6 +738,7 @@ libtorch_python_distributed_core_sources = [
     "torch/csrc/distributed/c10d/frontend.cpp",
     "torch/csrc/distributed/c10d/init.cpp",
     "torch/csrc/distributed/c10d/python_comm_hook.cpp",
+    "torch/csrc/distributed/c10d/quantization/quantization.cpp",
 ]
 
 libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 201f0c2dd64f4..6b52d3c058384 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -17,6 +17,7 @@
 
 #ifdef USE_C10D_NCCL
 #include <c10d/ProcessGroupNCCL.hpp>
+#include <torch/csrc/distributed/c10d/quantization/quantization_gpu.h>
 #endif
 
 #ifdef USE_C10D_MPI
@@ -31,8 +32,10 @@
 #include <c10d/frontend.hpp>
 #include <c10d/logger.hpp>
 #include <c10d/reducer.hpp>
+
 #include <torch/csrc/Exceptions.h>
 #include <torch/csrc/distributed/c10d/python_comm_hook.h>
+#include <torch/csrc/distributed/c10d/quantization/quantization.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/utils/object_ptr.h>
 #include <torch/csrc/utils/pybind.h>
@@ -1644,6 +1647,27 @@ PyMethodDef* python_functions() {
   return methods;
 }
 
+namespace quantization {
+TORCH_LIBRARY(q, m) {
+    m.def("_Bfloat16QuantizedToFloat(Tensor input) -> Tensor");
+    m.def("_FloatToBfloat16Quantized(Tensor input) -> Tensor");
+}
+    TORCH_LIBRARY_IMPL(q, CPU, m) {
+        m.impl("_Bfloat16QuantizedToFloat", _bfloat16_to_float_cpu);
+        m.impl("_FloatToBfloat16Quantized", _float_to_bfloat16_cpu);
+    }
+
+#ifdef USE_C10D_NCCL
+    #define DISPATCH_TO_CUDA(name, function) \
+        m.impl(name, torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(function)))
+    TORCH_LIBRARY_IMPL(q, CUDA, m) {
+        DISPATCH_TO_CUDA("_Bfloat16QuantizedToFloat", _bfloat16_to_float_cuda);
+        DISPATCH_TO_CUDA("_FloatToBfloat16Quantized", _float_to_bfloat16_cuda);
+    }
+#endif
+
+} // namespace quantization
+
 } // namespace c10d
 } // namespace distributed
 } // namespace torch
diff --git a/torch/csrc/distributed/c10d/quantization/quantization.cpp b/torch/csrc/distributed/c10d/quantization/quantization.cpp
new file mode 100644
index 0000000000000..b9682d73ed139
--- /dev/null
+++ b/torch/csrc/distributed/c10d/quantization/quantization.cpp
@@ -0,0 +1,93 @@
+#include <torch/csrc/distributed/c10d/quantization/quantization.h>
+#include <torch/csrc/distributed/c10d/quantization/quantization_utils.h>
+
+namespace torch {
+namespace distributed {
+namespace c10d {
+namespace quantization {
+
+void FloatToBFloat16Quantized_ref(
+    const float* const input,
+    const size_t nrows,
+    const size_t ncols,
+    uint16_t* const output){
+  for (const auto row : c10::irange(nrows)) {
+    const float* input_row = input + row * ncols;
+    uint16_t* output_row = output + row * ncols;
+
+    for (const auto col : c10::irange(ncols)) {
+      output_row[col] =
+          (*reinterpret_cast<const uint32_t*>(input_row + col) + (1 << 15)) >>
+          16;
+    }
+  }
+}
+
+void BFloat16QuantizedToFloat_ref(
+    const at::BFloat16* const input,
+    const size_t nrows,
+    const size_t ncols,
+    float* const output){
+  const int32_t output_columns = ncols;
+
+  for (const auto row : c10::irange(nrows)) {
+    const at::BFloat16* input_row = input + row * ncols;
+    float* output_row = output + row * output_columns;
+
+    for (const auto col : c10::irange(ncols)) {
+      uint32_t val_fp32 = static_cast<uint32_t>(
+                              reinterpret_cast<const uint16_t*>(input_row)[col])
+          << 16;
+      reinterpret_cast<uint32_t*>(output_row)[col] = val_fp32;
+    }
+  }
+}
+
+at::Tensor _float_to_bfloat16_cpu(const at::Tensor& input) {
+  TENSOR_ON_CPU(input);
+  // Currently it supports 2D inputs
+  TENSOR_NDIM_EQUALS(input, 2);
+
+  const auto input_sizes = input.sizes();
+  const int32_t nrows = input_sizes[0];
+  const int32_t ncols = input_sizes[1];
+  const int32_t output_columns = ncols;
+  auto output = at::empty(
+      {nrows, output_columns},
+      input.options().dtype(at::kHalf));
+
+  FloatToBFloat16Quantized_ref(
+      input.data_ptr<float>(),
+      nrows,
+      ncols,
+      reinterpret_cast<uint16_t*>(output.data_ptr<at::Half>()));
+
+  return output;
+}
+
+at::Tensor _bfloat16_to_float_cpu(const at::Tensor& input) {
+  TENSOR_ON_CPU(input);
+  // Currently it supports 2D inputs
+  TENSOR_NDIM_EQUALS(input, 2);
+
+  const auto input_sizes = input.sizes();
+  const int32_t nrows = input_sizes[0];
+  const int32_t ncols = input_sizes[1];
+  const int32_t output_columns = ncols;
+
+  auto output = at::empty(
+      {nrows, output_columns}, // 4 = sizeof(float)
+      input.options().dtype(at::kFloat)); //
+  BFloat16QuantizedToFloat_ref(
+      reinterpret_cast<at::BFloat16*>(input.data_ptr<at::Half>()),
+      nrows,
+      ncols,
+      output.data_ptr<float>());
+
+  return output;
+}
+
+} // namespace quantization
+} // namespace c10d
+} // namespace distributed
+} // namespace torch
diff --git a/torch/csrc/distributed/c10d/quantization/quantization.h b/torch/csrc/distributed/c10d/quantization/quantization.h
new file mode 100644
index 0000000000000..658fa754488d1
--- /dev/null
+++ b/torch/csrc/distributed/c10d/quantization/quantization.h
@@ -0,0 +1,20 @@
+// (c) Facebook, Inc. and its affiliates. Confidential and proprietary.
+
+#pragma once
+
+
+#include <ATen/ATen.h>
+#include <vector>
+
+namespace torch {
+namespace distributed {
+namespace c10d {
+namespace quantization {
+
+at::Tensor _float_to_bfloat16_cpu(const at::Tensor& input);
+at::Tensor _bfloat16_to_float_cpu(const at::Tensor& input);
+
+} // namespace quantization
+} // namespace c10d
+} // namespace distributed
+} // namespace torch
diff --git a/torch/csrc/distributed/c10d/quantization/quantization_gpu.cu b/torch/csrc/distributed/c10d/quantization/quantization_gpu.cu
new file mode 100644
index 0000000000000..5590e035b0683
--- /dev/null
+++ b/torch/csrc/distributed/c10d/quantization/quantization_gpu.cu
@@ -0,0 +1,148 @@
+#include <c10/cuda/CUDAGuard.h>
+#include <c10d/Utils.hpp>
+#include <torch/csrc/distributed/c10d/quantization/quantization_gpu.h>
+#include <torch/csrc/distributed/c10d/quantization/quantization_utils.h>
+
+// FP32 -> BF16 kernel
+__global__ inline void _float_to_bfloat16_cuda_kernel(
+    const float* __restrict__ input,
+    const int nrows,
+    const int ncols,
+    uint16_t* __restrict__ output) {
+  const int row_incre = blockDim.y * gridDim.y;
+  const int col_incre = blockDim.x * gridDim.x;
+  for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < nrows;
+       row += row_incre) {
+    const float* input_row = input + row * ncols;
+    uint16_t* output_row = output + row * ncols;
+    for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < ncols;
+         col += col_incre) {
+      // Add 2^15 and right shift 16 to do round-nearest
+      output_row[col] =
+          (*reinterpret_cast<const uint32_t*>(input_row + col) + (1 << 15)) >>
+          16;
+    }
+  }
+}
+
+// BF16 -> FP32 kernel
+__global__ inline void _bfloat16_to_float_cuda_kernel(
+    const uint16_t* __restrict__ input,
+    const int nrows,
+    const int ncols,
+    float* __restrict__ output) {
+  const int row_incre = blockDim.y * gridDim.y;
+  const int col_incre = blockDim.x * gridDim.x;
+  for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < nrows;
+       row += row_incre) {
+    for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < ncols;
+         col += col_incre) {
+      const uint16_t* input_row = input + row * ncols;
+      float* output_row = output + row * ncols;
+      uint32_t val_fp32 = static_cast<uint32_t>(
+                              reinterpret_cast<const uint16_t*>(input_row)[col])
+          << 16;
+      reinterpret_cast<uint32_t*>(output_row)[col] = val_fp32;
+    }
+  }
+}
+
+namespace torch {
+namespace distributed {
+namespace c10d {
+namespace quantization {
+
+at::Tensor _float_to_bfloat16_cuda(const at::Tensor& input) {
+  TENSOR_ON_CUDA_GPU(input);
+  // Currently it supports 2D inputs
+  TENSOR_NDIM_EQUALS(input, 2);
+
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(input.get_device());
+
+  const int nrows = input.size(0);
+  const int ncols = input.size(1);
+  const int output_columns = ncols;
+
+  auto output = at::empty(
+      {nrows, output_columns},
+      input.options().dtype(at::kHalf)); // at::kHalf
+
+  if (nrows == 0 || output_columns == 0) {
+    return output;
+  }
+
+  // TODO: replace Half by BFloat16, after BFloat16 is supported by Nvidia
+  // NCCL input.options().dtype(at::kBFloat16)); // at::kBFloat16
+
+  constexpr int threads_per_block = 256;
+  const int blockDim_x = std::min(output_columns, threads_per_block);
+  dim3 blockDim(blockDim_x, threads_per_block / blockDim_x);
+  const int gridDim_x = (output_columns + blockDim.x - 1) / blockDim.x;
+  const int gridDim_y = std::min((nrows + blockDim.y - 1) / blockDim.y, 65535u);
+  dim3 gridDim(gridDim_x, gridDim_y);
+
+  _float_to_bfloat16_cuda_kernel<<<
+      gridDim,
+      blockDim,
+      0,
+      at::cuda::getCurrentCUDAStream()>>>(
+      input.data_ptr<float>(),
+      nrows,
+      ncols,
+      // TODO: replace Half by BFloat16, after BFloat16 is supported by Nvidia
+      // NCCL
+      reinterpret_cast<uint16_t*>(output.data_ptr<at::Half>()));
+  //C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return output;
+}
+
+at::Tensor _bfloat16_to_float_cuda(const at::Tensor& input) {
+  TENSOR_ON_CUDA_GPU(input);
+  // Currently it supports 2D inputs
+  TENSOR_NDIM_EQUALS(input, 2);
+
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(input.get_device());
+
+  const int nrows = input.size(0);
+  const int ncols = input.size(1);
+  const int output_columns = ncols;
+
+  auto output = at::empty(
+      {nrows, output_columns}, // 4 = sizeof(float)
+      input.options().dtype(at::kFloat)); // at::kBytes for uint8_t
+
+  if (nrows == 0 || output_columns == 0) {
+    return output;
+  }
+
+  constexpr int threads_per_block = 256;
+
+  const int blockDim_x = std::min(output_columns, threads_per_block);
+  dim3 blockDim(blockDim_x, threads_per_block / blockDim_x);
+  const int gridDim_x = (output_columns + blockDim.x - 1) / blockDim.x;
+  const int gridDim_y = std::min((nrows + blockDim.y - 1) / blockDim.y, 65535u);
+  dim3 gridDim(gridDim_x, gridDim_y);
+
+  _bfloat16_to_float_cuda_kernel<<<
+      gridDim,
+      blockDim,
+      0,
+      at::cuda::getCurrentCUDAStream()>>>(
+      // TODO: replace Half by BFloat16, after BFloat16 is supported by Nvidia
+      // NCCL
+      reinterpret_cast<uint16_t*>(input.data_ptr<at::Half>()),
+      nrows,
+      ncols,
+      output.data_ptr<float>());
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return output;
+}
+
+} // namespace quantization
+} // namespace c10d
+} // namespace distributed
+} // namespace torch
diff --git a/torch/csrc/distributed/c10d/quantization/quantization_gpu.h b/torch/csrc/distributed/c10d/quantization/quantization_gpu.h
new file mode 100644
index 0000000000000..2a0c8f8f8d39c
--- /dev/null
+++ b/torch/csrc/distributed/c10d/quantization/quantization_gpu.h
@@ -0,0 +1,20 @@
+// (c) Facebook, Inc. and its affiliates. Confidential and proprietary.
+
+#pragma once
+
+
+#include <ATen/ATen.h>
+#include <vector>
+
+namespace torch {
+namespace distributed {
+namespace c10d {
+namespace quantization {
+
+at::Tensor _float_to_bfloat16_cuda(const at::Tensor& input);
+at::Tensor _bfloat16_to_float_cuda(const at::Tensor& input);
+
+} // namespace quantization
+} // namespace c10d
+} // namespace distributed
+} // namespace torch
diff --git a/torch/csrc/distributed/c10d/quantization/quantization_utils.h b/torch/csrc/distributed/c10d/quantization/quantization_utils.h
new file mode 100644
index 0000000000000..0467ba2769f5b
--- /dev/null
+++ b/torch/csrc/distributed/c10d/quantization/quantization_utils.h
@@ -0,0 +1,31 @@
+// (c) Facebook, Inc. and its affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <ATen/ATen.h>
+
+#include <typeinfo>
+
+inline std::string torch_tensor_device_name(const at::Tensor& ten) {
+  return c10::DeviceTypeName(ten.device().type());
+}
+
+#define TENSOR_NDIM_EQUALS(ten, dims)      \
+  TORCH_CHECK(                             \
+      (ten).ndimension() == (dims),        \
+      "Tensor '" #ten "' must have " #dims \
+      " dimension(s). "                    \
+      "Found ",                            \
+      (ten).ndimension())
+
+#define TENSOR_ON_CPU(x)                                      \
+  TORCH_CHECK(                                                \
+      !x.is_cuda(),                           \
+      #x " must be a CPU tensor; it is currently on device ", \
+      torch_tensor_device_name(x))
+
+#define TENSOR_ON_CUDA_GPU(x)                                  \
+  TORCH_CHECK(                                                 \
+      x.is_cuda(),                                             \
+      #x " must be a CUDA tensor; it is currently on device ", \
+      torch_tensor_device_name(x))
diff --git a/torch/distributed/algorithms/quantization/quantization.py b/torch/distributed/algorithms/quantization/quantization.py
index 724d6aa362487..d58c58cad09e2 100644
--- a/torch/distributed/algorithms/quantization/quantization.py
+++ b/torch/distributed/algorithms/quantization/quantization.py
@@ -10,7 +10,12 @@
 TORCH_HALF_MAX = torch.finfo(torch.float16).max
 
 class DQuantType(Enum):
-    FP16 = "fp16"
+    """
+    Different quantization methods for auto_quantize API are identified here.
+    auto_quantize API currently supports fp16 and bfp16 methods.
+    """
+    FP16 = "fp16",
+    BFP16 = "bfp16"
 
     def __str__(self) -> str:
         return self.value
@@ -26,6 +31,8 @@ def _quantize_tensor(tensor, qtype):
         )
     if (qtype == DQuantType.FP16):
         return _fp32_to_fp16_with_clamp(tensor)
+    elif (qtype == DQuantType.BFP16):
+        return torch.ops.q._FloatToBfloat16Quantized(tensor)
     else:
         raise RuntimeError(
             f'Quantization type {qtype} is not supported'
@@ -38,13 +45,8 @@ def _quantize_tensor_list(tensor_list, qtype):
         raise RuntimeError(
             f"_quantize_tensor_list expecting list of torch.Tensor as input but found {type(tensor_list)}"
         )
-    if (qtype == DQuantType.FP16):
-        quantized_tensor_list = [_quantize_tensor(t, qtype) for t in tensor_list]
-        return quantized_tensor_list
-    else:
-        raise RuntimeError(
-            f'Quantization type {qtype} is not supported'
-        )
+    quantized_tensor_list = [_quantize_tensor(t, qtype) for t in tensor_list]
+    return quantized_tensor_list
 
 def _dequantize_tensor(tensor, qtype, quant_loss=None):
     if not isinstance(tensor, torch.Tensor):
@@ -60,6 +62,13 @@ def _dequantize_tensor(tensor, qtype, quant_loss=None):
             return tensor.float()
         else:
             return tensor.float() / quant_loss
+    elif (qtype == DQuantType.BFP16):
+        if tensor.dtype != torch.float16:
+            raise RuntimeError(
+                f"tensor dtype is {tensor.dtype} while expected to be FP16."
+            )
+        else:
+            return torch.ops.q._Bfloat16QuantizedToFloat(tensor)
     else:
         raise RuntimeError(
             f'Quantization type {qtype} is not supported'
@@ -73,26 +82,26 @@ def _dequantize_tensor_list(tensor_list, qtype, quant_loss=None):
         raise RuntimeError(
             f"_dequantize_tensor_list expecting list of torch.Tensor as input but found {type(tensor_list)}"
         )
-    elif (qtype == DQuantType.FP16):
-        dequantized_tensor_list = [_dequantize_tensor(t, qtype) for t in tensor_list]
-        return dequantized_tensor_list
-    else:
-        raise RuntimeError(
-            f'Quantization type {qtype} is not supported'
-        )
+    dequantized_tensor_list = [_dequantize_tensor(t, qtype) for t in tensor_list]
+    return dequantized_tensor_list
 
 
 def auto_quantize(func, qtype, quant_loss=None):
     """
     This is a prototype API that automatically quantize the input tensors, choose the precision types, and
     pass other necessary arguments and then dequantizes the output.
+
     Currently it only supports:
-        . FP16 quantization method
+        . FP16 and BFP16 quantization method supported for gloo and nccl backends
         . all_gather, all_to_all collective ops
+
+    Note: BFP16 only supports 2D tensors.
+
     Args:
         func (callable): A function representing collective operations.
         qtype (QuantType): Quantization method
         quant_loss (float, optional): This can be used to improve accuracy in the dequantization.
+
     Returns:
         (callable): the same collective as func but enables automatic quantization/dequantization.
     """

From a6f767ed3d66b4a01e5b2edead8491dfbca517e6 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 25 Aug 2021 23:48:58 -0700
Subject: [PATCH 244/530] Fix issue re: DDP and create_graph=True (#63831)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63831

Closes https://github.com/pytorch/pytorch/issues/63812

`at::mul_out` is not supported when `grad` itself requires grad, which is useful for computing higher order derivatives.

In this case, fall back to a mul + copy instead of mul_out.
ghstack-source-id: 136614644

Test Plan: UT

Reviewed By: SciPioneer

Differential Revision: D30505573

fbshipit-source-id: 83532b6207b3d80116fcc4dff0e5520d73b3454f
---
 torch/csrc/distributed/c10d/reducer.cpp       | 22 ++++++++++++++++---
 .../_internal/distributed/distributed_test.py | 22 +++++++++++++++++++
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp
index d91f191602888..eafc70cc5e30f 100644
--- a/torch/csrc/distributed/c10d/reducer.cpp
+++ b/torch/csrc/distributed/c10d/reducer.cpp
@@ -377,9 +377,25 @@ void Reducer::mark_variable_ready_dense(size_t variable_index) {
         if (comm_hook_ == nullptr) {
           auto wrapped =
               at::native::wrapped_scalar_tensor(double(1.) / div_factor_);
-          // Divides while copying into the bucket view to save one scan over
-          // all the input parameters.
-          at::mul_out(bucket_view, grad, wrapped);
+          if (!grad.requires_grad()) {
+            // Divides while copying into the bucket view to save one scan over
+            // all the input parameters.
+            at::mul_out(bucket_view, grad, wrapped);
+          } else {
+            // If DDP is running with create_graph=True, gradients require_grad
+            // themselves in order to compute higher order derivatives. However,
+            // DDP will not sync up these gradients currently (see
+            // https://github.com/pytorch/pytorch/issues/63812).
+            LOG(WARNING)
+                << "Using DistributedDataParallel with create_graph=True "
+                << " is not well-supported. The higher-order gradient will "
+                << " not be synchronized across ranks, and backpropagation "
+                << " through all_reduce operations will not occur. If you require "
+                << " DDP to work with higher-order gradients for your use case, "
+                << " please ping https://github.com/pytorch/pytorch/issues/63929";
+            auto div_result = at::mul(grad, wrapped);
+            bucket_view.copy_(div_result);
+          }
         } else {
           bucket_view.copy_(grad);
         }
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index f4bc073a4317e..333458c5f8308 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -3760,6 +3760,28 @@ def test_DistributedDataParallel_requires_grad(self):
             )
             self._barrier()
 
+        @sandcastle_skip_if(
+            BACKEND != "nccl" and BACKEND != "gloo",
+            "Only NCCL and GLOO backend support DistributedDataParallel",
+        )
+        @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
+        def test_ddp_create_graph(self):
+            rank = self.rank
+            torch.cuda.set_device(rank)
+            net = torch.nn.parallel.DistributedDataParallel(
+                torch.nn.Linear(1, 1, bias=False).cuda(rank),
+                device_ids=[rank]
+            )
+            inp = torch.randn((2, 1), device=rank)
+            for _ in range(6):
+                loss = net(inp).sum()
+                # Verify DDP works with create_graph=True
+                loss.backward(create_graph=True)
+                # grad tensors should require grad.
+                self.assertTrue(
+                    all([param.requires_grad for param in net.parameters()])
+                )
+
         @sandcastle_skip_if(
             BACKEND != "nccl" and BACKEND != "gloo",
             "Only NCCL and GLOO backend support DistributedDataParallel",

From 5757d03145ac4d7a81822d35fc76af56ba7d39ab Mon Sep 17 00:00:00 2001
From: Kefei Lu <kefeilu@fb.com>
Date: Thu, 26 Aug 2021 00:51:53 -0700
Subject: [PATCH 245/530] Add logging for _MinimizerBase

Summary: Add logging so we know which nodes are currently being visited

Test Plan: lint & SC tests

Reviewed By: 842974287

Differential Revision: D30509865

fbshipit-source-id: 09e77e44c97c825242e0b24f90463b50f3ca19c6
---
 torch/fx/passes/net_min_base.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torch/fx/passes/net_min_base.py b/torch/fx/passes/net_min_base.py
index 2a093bea49a4c..b7a911e4bf3db 100644
--- a/torch/fx/passes/net_min_base.py
+++ b/torch/fx/passes/net_min_base.py
@@ -1,5 +1,6 @@
 import argparse
 from typing import Any, Callable, Tuple, Dict, Optional
+import logging
 
 import torch
 import torch.fx
@@ -17,6 +18,8 @@
     Names
 )
 
+_LOGGER = logging.getLogger(__name__)
+
 
 class FxNetMinimizerBadModuleError(Exception):
     """
@@ -403,6 +406,7 @@ def _sequential_traverse(self, nodes: NodeList) -> NodeSet:
         culprits: NodeSet = set()
 
         for node in nodes:
+            _LOGGER.info(f"Visit node: {node.name}")
             cur_nodes: NodeSet = {node}
 
             if node in self.fusions:

From 61d88cdd1c5fe7cf91b6ee0a71a250e3a6f61878 Mon Sep 17 00:00:00 2001
From: Michael Dagitses <mikeyd@fb.com>
Date: Thu, 26 Aug 2021 04:42:36 -0700
Subject: [PATCH 246/530] use `const auto&` as type for grad alias (#63949)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63949

This is an extension of the discussion in
https://github.com/pytorch/pytorch/pull/63040#discussion_r687793027.

Test Plan: Imported from OSS

Reviewed By: albanD

Differential Revision: D30546789

Pulled By: dagitses

fbshipit-source-id: 3046aff4f129d5492d73dfb67717a824e16ffee8
---
 tools/autograd/gen_autograd_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/autograd/gen_autograd_functions.py b/tools/autograd/gen_autograd_functions.py
index 7d852aded47a9..08136ab54bfcc 100644
--- a/tools/autograd/gen_autograd_functions.py
+++ b/tools/autograd/gen_autograd_functions.py
@@ -479,7 +479,7 @@ def save_var(var: SavedAttribute, is_output: bool) -> None:
     body: List[str] = []
 
     if uses_single_grad(info):
-        body.append('auto& grad = grads[0];')
+        body.append('const auto& grad = grads[0];')
 
     def emit_derivative(
         derivative: Derivative,

From c02eda8166068400a9e5d82343108cd8a524095c Mon Sep 17 00:00:00 2001
From: Luca Wehrstedt <lcw@fb.com>
Date: Thu, 26 Aug 2021 05:43:05 -0700
Subject: [PATCH 247/530] Update TensorPipe submodule

Summary: The bot failed to do it.

Test Plan: D30542677

Reviewed By: beauby

Differential Revision: D30573500

fbshipit-source-id: 50abd6fc415cead0a6b6d9290fa0e5f97d0e4989
---
 third_party/tensorpipe | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/tensorpipe b/third_party/tensorpipe
index e45b2338d0a31..1cd0ac3e4ce51 160000
--- a/third_party/tensorpipe
+++ b/third_party/tensorpipe
@@ -1 +1 @@
-Subproject commit e45b2338d0a31192a7e413f3fbbfa7fd90504a37
+Subproject commit 1cd0ac3e4ce5144ee4ea2545741182c76fba6cf2

From 774ae0851d98829b412e46dde85e716dad065a06 Mon Sep 17 00:00:00 2001
From: Heitor Schueroff <heitorschueroff@fb.com>
Date: Thu, 26 Aug 2021 06:05:28 -0700
Subject: [PATCH 248/530] [OpInfo] Added ReductionOpInfo subclass of OpInfo and
 ported sum test (#62737)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62737

ReductionOpInfo is a specialization of OpInfo for reduction operators. For now, it is designed to work with reductions that return a single tensor and that reduce all elements along one or more dimensions to a single value. In particular this excludes operators such as `max` and `min` that return multiple tensors and `quantile` that can return multiple values.

fixes https://github.com/pytorch/pytorch/issues/49746

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30406568

Pulled By: heitorschueroff

fbshipit-source-id: 218b1da1902f67bcf4c3681e2a0f0029a25d51f1
---
 test/test_ops.py                              |   6 +-
 test/test_reductions.py                       |  19 +-
 .../_internal/common_methods_invocations.py   | 255 +++++++++++++-----
 3 files changed, 208 insertions(+), 72 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 76a7b6a1485ca..a6baf8dbe699a 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -10,7 +10,7 @@
     (TestCase, is_iterable_of_tensors, run_tests, IS_SANDCASTLE, clone_input_helper, make_tensor,
      gradcheck, gradgradcheck, IS_IN_CI, suppress_warnings)
 from torch.testing._internal.common_methods_invocations import \
-    (op_db, _NOTHING, UnaryUfuncInfo, SpectralFuncInfo)
+    (op_db, _NOTHING, UnaryUfuncInfo, ReductionOpInfo, SpectralFuncInfo)
 from torch.testing._internal.common_device_type import \
     (deviceCountAtLeast, instantiate_device_type_tests, ops, onlyCUDA, onlyOnCPUAndCUDA, skipCUDAIfRocm, OpDTypes)
 from torch.testing._internal.common_jit import JitCommonTestCase, check_against_reference
@@ -27,8 +27,8 @@
 # Get names of all the operators which have ref in their entry in OpInfo (testing infra)
 #   except for Unary Ufuncs (separately implemented in test/test_unary_ufuncs.py)
 #   and Spectral Functions (separately implemented for only 1D as of now, in test/test_spectral_ops.py)
-_ref_test_ops = list(filter(lambda op: not isinstance(op, (UnaryUfuncInfo, SpectralFuncInfo)) and
-                     op.ref is not None and op.ref is not _NOTHING, op_db))
+_ref_test_ops = list(filter(lambda op: not isinstance(op, (UnaryUfuncInfo, ReductionOpInfo,
+                     SpectralFuncInfo)) and op.ref is not None and op.ref is not _NOTHING, op_db))
 
 
 # Tests that apply to all operators and aren't related to any particular
diff --git a/test/test_reductions.py b/test/test_reductions.py
index c1da0f0816c5a..e224eaec93648 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -14,7 +14,9 @@
     IS_WINDOWS, make_tensor)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU,
-    onlyOnCPUAndCUDA, onlyCUDA, largeTensorTest, precisionOverride)
+    onlyOnCPUAndCUDA, onlyCUDA, largeTensorTest, ops, precisionOverride)
+from torch.testing._internal.common_methods_invocations import (
+    ReductionOpInfo, reduction_ops)
 
 # TODO: replace with make_tensor
 def _generate_input(shape, dtype, device, with_extremal):
@@ -55,6 +57,21 @@ def _rand_shape(dim, min_size, max_size):
 
 class TestReductions(TestCase):
 
+    ###########################################################################
+    # ReductionOpInfo unit tests
+    ###########################################################################
+
+    @ops(reduction_ops, allowed_dtypes=[torch.float])
+    def test_dim_default(self, device, dtype, op: ReductionOpInfo):
+        """Tests that the default behavior is to reduce all dimensions."""
+        t = make_tensor((2, 3), device, dtype)
+        args, kwargs = next(op.generate_args_kwargs(t))
+        self.assertEqual(op(t, *args, **kwargs).ndim, 0)
+
+    ###########################################################################
+    # TODO: Legacy tests - port to ReductionOpInfo
+    ###########################################################################
+
     def test_var_unbiased(self, device):
         tensor = torch.randn(100, device=device)
         self.assertEqual(tensor.var(0), tensor.var(0, unbiased=True))
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index b725c4831d25f..3839b2ef82c17 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -13,7 +13,7 @@
 from torch._six import inf
 import collections.abc
 
-from typing import List, Sequence, Tuple, Union
+from typing import Any, Callable, List, Optional, Sequence, Tuple, Union
 
 from torch.testing import \
     (make_non_contiguous, floating_types, floating_types_and, complex_types,
@@ -43,6 +43,15 @@
     import scipy.special
 
 
+# Reasonable testing sizes for dimensions
+L = 20
+M = 10
+S = 5
+
+# Unique value to distinguish default from anything else
+_NOTHING = object()
+
+
 class DecorateInfo(object):
     """Describes which test, or type of tests, should be wrapped in the given
        decorators when testing an operator. Any test that matches all provided
@@ -92,6 +101,7 @@ def __init__(
                          device_type=device_type, dtypes=dtypes, active_if=active_if)
 
 
+
 class SampleInput(object):
     """Represents sample inputs to a function."""
 
@@ -185,6 +195,7 @@ def _np(t):
         sample_np_input, np_args, np_kwargs = to_numpy(self.input), to_numpy(self.args), to_numpy(self.kwargs)
         return (sample_np_input, np_args, np_kwargs)
 
+
 class AliasInfo(object):
     """Class holds alias information. For example, torch.abs ->
     torch.absolute, torch.Tensor.absolute, torch.Tensor.absolute_
@@ -200,9 +211,6 @@ def __call__(self, *args, **kwargs):
         return self.op(*args, **kwargs)
 
 
-_NOTHING = object()  # Unique value to distinguish default from anything else
-
-
 # Extension of getattr to support qualified names
 # e.g. _getattr_qual(torch, 'linalg.norm') -> torch.linalg.norm
 def _getattr_qual(obj, name, default=_NOTHING):
@@ -770,9 +778,164 @@ def default_test_dtypes(self, device_type):
                 else supported.intersection(self._default_test_dtypes))
 
 
-L = 20
-M = 10
-S = 5
+def _generate_reduction_inputs(device, dtype, requires_grad):
+    """Generates input tensors for testing reduction operators"""
+    yield make_tensor([], device, dtype, requires_grad=requires_grad)
+    yield make_tensor([2], device, dtype, requires_grad=requires_grad)
+    yield make_tensor([2, 3], device, dtype, requires_grad=requires_grad, noncontiguous=True)
+    yield make_tensor([3, 2, 1, 5], device, dtype, requires_grad=requires_grad)
+
+
+def _generate_reduction_kwargs(ndim, supports_multiple_dims=True):
+    """Generates a subset of all valid dim and keepdim kwargs given ndim that
+    is appropriate for testing reduction operators.
+    """
+
+    # Test default dim and keepdim
+    yield {}
+
+    # Test reducing inner and outer most dimensions
+    yield {'dim': 0, 'keepdim': True}
+    yield {'dim': -1, 'keepdim': False}
+
+    # Test reducing middle dimension
+    if ndim > 2:
+        yield {'dim': ndim // 2, 'keepdim': True}
+
+    if supports_multiple_dims:
+        # Test reducing all dimensions
+        yield {'dim': tuple(range(ndim)), 'keepdim': False}
+
+        # Test reducing both first and last dimensions
+        if ndim > 1:
+            yield {'dim': (0, -1), 'keepdim': True}
+
+        # Test reducing every other dimension starting with the second
+        if ndim > 3:
+            yield {'dim': tuple(range(1, ndim, 2)), 'keepdim': False}
+
+
+def sample_inputs_reduction(op_info, device, dtype, requires_grad, **kwargs):
+    """Sample inputs for reduction operators."""
+
+    # TODO(@heitorschueroff) Once all reduction operators are using
+    # ReductionOpInfo use op_info.supports_multiple_dims directly.
+    supports_multiple_dims: bool = kwargs.get('supports_multiple_dims', True)
+
+    # TODO(@heitorschueroff) Once all reduction operators are using ReductionOpInfo
+    # use op_info.genearte_args_kwargs directly.
+    generate_args_kwargs = kwargs.get('generate_args_kwargs', lambda *args, **kwargs: (yield tuple(), {}))
+
+    inputs: List[SampleInput] = []
+    for t in _generate_reduction_inputs(device, dtype, requires_grad):
+        for reduction_kwargs in _generate_reduction_kwargs(t.ndim, supports_multiple_dims):
+            for args, kwargs in generate_args_kwargs(t, **reduction_kwargs):
+                kwargs.update(reduction_kwargs)
+                inputs.append(SampleInput(t, args=args, kwargs=kwargs))
+
+    return inputs
+
+
+# NOTE [Reductions]:
+#
+# For testing purposes, we relax the definition of a reduction operator
+# as defined in the docstring below. We do this to capture operators with
+# a similar API so they can be tested automatically. However...
+#
+# Strictly speaking a reduction operator is an operator that can reduce an
+# array to a single scalar value and that can be computed from the partial
+# result of reducing subarrays. This usually means that the reduction operation
+# should be commutative and associative. This definition is important when it
+# comes to implementation as it determines how a reduction can be parallelized.
+#
+# For example, many summary statistics such as median, mode and quantile cannot
+# be computed from partial results because these are sorting and counting based
+# algorithms that need information that would be lost in the reduced value.
+class ReductionOpInfo(OpInfo):
+    """Reduction operator information.
+
+    An operator is a reduction operator if it reduces one or more dimensions of
+    the input tensor to a single value. Reduction operators must implement the
+    following signature:
+
+    - `op(input, *args, *, dim=None, keepdim=False, **kwargs) -> Tensor`
+
+    ReductionOpInfo tests that reduction operators implement a consistent API.
+    Optional features such as reducing over multiple dimensions are captured in
+    the optional keyword parameters of the ReductionOpInfo constructor.
+
+    If a reduction operator does not yet implement the full required API of
+    reduction operators, this should be documented by skipping the failing
+    tests rather than adding optional parameters to ReductionOpInfo.
+
+    NOTE
+    The API for reduction operators has not yet been finalized and some
+    requirements may change.
+
+    See tests in test/test_reductions.py
+    """
+
+    def __init__(
+        self, name, *,
+
+        # The identity value for the operator if it has one.
+        identity: Optional[Any] = None,
+
+        # The nan policy for the operator if it implements one.
+        # - propagate: NaN values are propagated to the output
+        # - omit: NaN values are discarded during the reduction
+        nan_policy: Optional[str] = None,
+
+        # Whether the operator supports reducing multiple dimensions.
+        supports_multiple_dims: bool = True,
+
+        # Whether the operator promotes integral to floating point dtypes.
+        promotes_int_to_float: bool = False,
+
+        # Whether the operator promotes all integral dtypes to int64.
+        promotes_int_to_int64: bool = False,
+
+        # If a specific dtype is given, then the operator always returns that
+        # dtype irrespective of the input dtype. If None, the operator returns
+        # the dtype according to the type promotion rules above.
+        result_dtype: Optional[torch.dtype] = None,
+
+        # ReductionOpInfo tests generate their own input, dim and keepdim
+        # arguments and call this function to generate tuples of extra args and
+        # kwargs to use when calling the op. This is required for operators that
+        # have other required parameters besides the input tensor.
+        generate_args_kwargs: Callable = lambda t, dim=None, keepdim=False: (yield tuple(), {}),
+
+        # Options from the OpInfo base class
+        **kwargs,
+    ):
+        assert nan_policy in (None, 'propagate', 'omit')
+
+        # These are mutually exclusive options
+        assert not (result_dtype and promotes_int_to_float)
+        assert not (result_dtype and promotes_int_to_int64)
+        assert not (promotes_int_to_float and promotes_int_to_int64)
+
+        # Default sample_inputs_func for ReductionOpInfo which augments sample
+        # inputs from sample_inputs_reduction with the args and kwargs from
+        # generate_args_kwargs. This is only used if sample_inputs_func is None.
+        def sample_inputs_func(*args, **kwargs):
+            kwargs['supports_multiple_dims'] = supports_multiple_dims
+            kwargs['generate_args_kwargs'] = generate_args_kwargs
+            return sample_inputs_reduction(*args, **kwargs)
+
+        # Override OpInfo defaults and call base class __init__
+        kwargs.setdefault('inplace_variant', None)
+        kwargs.setdefault('sample_inputs_func', sample_inputs_func)
+        super(ReductionOpInfo, self).__init__(name, **kwargs)
+
+        self.identity = identity
+        self.nan_policy = nan_policy
+        self.supports_multiple_dims = supports_multiple_dims
+        self.promotes_int_to_float = promotes_int_to_float
+        self.promotes_int_to_int64 = promotes_int_to_int64
+        self.result_dtype = result_dtype
+        self.generate_args_kwargs = generate_args_kwargs
 
 
 def sample_inputs_unary(op_info, device, dtype, requires_grad, **kwargs):
@@ -2452,56 +2615,6 @@ def sample_inputs_max_min_reduction_no_dim(op_info, device, dtype, requires_grad
                                           requires_grad=requires_grad),))
     return inputs
 
-# Generates input tensors for testing reduction ops
-def _generate_reduction_inputs(device, dtype, requires_grad):
-    yield make_tensor((), device, dtype, requires_grad=requires_grad)
-    yield make_tensor((2,), device, dtype, requires_grad=requires_grad)
-    yield make_tensor((2, 3), device, dtype, requires_grad=requires_grad, noncontiguous=True)
-    yield make_tensor((3, 2, 1, 2, 2), device, dtype, requires_grad=requires_grad)
-
-# Generates a subset of possible dim and keepdim kwargs for a tensor
-# with ndim dims appropriate for testing. If supports_multiple_dims
-# is True (default) then dim kwarg can be a list of dims.
-def _generate_reduction_kwargs(ndim, supports_multiple_dims=True):
-    for keepdim in [True, False]:
-        # Always test reducing inner and outer most dimensions
-        yield {'dim': 0, 'keepdim': keepdim}
-        yield {'dim': -1, 'keepdim': keepdim}
-
-        # Also reduce middle dimension
-        if ndim > 2:
-            yield {'dim': ndim // 2, 'keepdim': keepdim}
-
-        if supports_multiple_dims:
-            # Always test reducing all dims
-            yield {'dim': tuple(range(ndim)), 'keepdim': keepdim}
-
-            # Test reducing both first and last dimensions
-            if ndim > 1:
-                yield {'dim': (0, ndim - 1), 'keepdim': keepdim}
-
-            # Test reducing every other dimension starting with the second
-            if ndim > 3:
-                yield {'dim': tuple(range(1, ndim, 2)), 'keepdim': keepdim}
-
-# Wraps sample_inputs_reduction function to provide the additional supports_multiple_dims args
-def sample_inputs_reduction_wrapper(supports_multiple_dims):
-    # Generates sample inputs for reduction ops that contain the input tensor
-    # and dim and keepdim kwargs. If a reduction op needs to test additional
-    # args/kwargs then create a separate sample_inputs function
-    def fn(op_info, device, dtype, requires_grad):
-        inputs = []
-
-        for t in _generate_reduction_inputs(device, dtype, requires_grad):
-            # Add case without dim and keepdim kwargs
-            inputs.append(SampleInput(t))
-            for kwargs in _generate_reduction_kwargs(t.ndim, supports_multiple_dims):
-                inputs.append(SampleInput(t, kwargs=kwargs))
-
-        return inputs
-
-    return fn
-
 def sample_inputs_reduction_quantile(op_info, device, dtype, requires_grad):
     test_quantiles = (0.5, make_tensor((2,), device, dtype, low=0, high=1))
     test_interpolations = ['linear', 'midpoint']
@@ -2513,6 +2626,8 @@ def sample_inputs_reduction_quantile(op_info, device, dtype, requires_grad):
             inputs.append(SampleInput(t, args=(quantiles,)))
             for kwargs in _generate_reduction_kwargs(t.ndim, supports_multiple_dims=False):
                 # Interpolation kwarg for now is only supported when providing both dim and keepdim
+                kwargs.setdefault('dim', 0)
+                kwargs.setdefault('keepdim', False)
                 for interpolation in test_interpolations:
                     kwargs['interpolation'] = interpolation
                     inputs.append(SampleInput(t, args=(quantiles,), kwargs=kwargs))
@@ -6875,19 +6990,19 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            dtypesIfCUDA=all_types_and(torch.float16),
            # TODO: some signatures of median do support out
            supports_out=False,
-           sample_inputs_func=sample_inputs_reduction_wrapper(False)),
+           sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False)),
     OpInfo('nanmedian',
            dtypes=all_types(),
            dtypesIfCPU=all_types_and(torch.bfloat16),
            dtypesIfCUDA=all_types_and(torch.float16),
            # TODO: some signatures of nanmedian do support out
            supports_out=False,
-           sample_inputs_func=sample_inputs_reduction_wrapper(False)),
+           sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False)),
     OpInfo('var_mean',
            dtypes=floating_and_complex_types_and(torch.half),
            dtypesIfCPU=floating_and_complex_types_and(torch.half, torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16),
-           sample_inputs_func=sample_inputs_reduction_wrapper(False),
+           sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False),
            backward_dtypes=floating_types_and(torch.half),
            backward_dtypesIfCPU=floating_types_and(torch.half, torch.bfloat16),
            backward_dtypesIfCUDA=floating_types_and(torch.half),
@@ -6906,7 +7021,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            dtypes=floating_and_complex_types_and(torch.half),
            dtypesIfCPU=floating_and_complex_types_and(torch.half, torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16),
-           sample_inputs_func=sample_inputs_reduction_wrapper(False),
+           sample_inputs_func=partial(sample_inputs_reduction, supports_multiple_dims=False),
            backward_dtypes=floating_types_and(torch.half),
            backward_dtypesIfCPU=floating_types_and(torch.half, torch.bfloat16),
            backward_dtypesIfCUDA=floating_types_and(torch.half),
@@ -6981,21 +7096,16 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            supports_out=False,
            supports_forward_ad=True,
            sample_inputs_func=sample_inputs_max_min_reduction_no_dim,),
-    OpInfo('sum',
-           dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16, torch.bool),
-           supports_out=False,
-           supports_forward_ad=True,
-           sample_inputs_func=sample_inputs_reduction_wrapper(supports_multiple_dims=True)),
     OpInfo('nansum',
            dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
            supports_out=False,
-           sample_inputs_func=sample_inputs_reduction_wrapper(supports_multiple_dims=True)),
+           sample_inputs_func=sample_inputs_reduction),
     # TODO(@heitorschueroff) Add test for dtype kwarg
     OpInfo('mean',
            dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            assert_autodiffed=True,
            supports_forward_ad=True,
-           sample_inputs_func=sample_inputs_reduction_wrapper(supports_multiple_dims=True),
+           sample_inputs_func=sample_inputs_reduction,
            # Need to skip out test because one of the overload for mean does not support it
            # TODO(@heitorschueroff) fix this when implementing ReductionInfo
            skips=(SkipInfo('TestCommon', 'test_out'),)),
@@ -8843,6 +8953,14 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
             ),
         ),
     ),
+    ReductionOpInfo(
+        'sum',
+        identity=0,
+        supports_out=False,
+        supports_forward_ad=True,
+        promotes_int_to_int64=True,
+        dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+    ),
 ]
 
 # Common operator groupings
@@ -8851,6 +8969,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
 spectral_funcs = [op for op in op_db if isinstance(op, SpectralFuncInfo)]
 sparse_unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo) and op.supports_sparse is True]
 shape_funcs = [op for op in op_db if isinstance(op, ShapeFuncInfo)]
+reduction_ops = [op for op in op_db if isinstance(op, ReductionOpInfo)]
 
 # TODO: review porting these to make_tensor
 def index_variable(shape, max_indices, device=torch.device('cpu')):

From 10da1fc3f869075d698fbcda6e0b3ece739973d2 Mon Sep 17 00:00:00 2001
From: Edward Yang <ezyang@fb.com>
Date: Thu, 26 Aug 2021 06:58:12 -0700
Subject: [PATCH 249/530] Deify opmath_t into its own header, align with
 accscalar_t (#63986)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63986

Fixes #63985

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D30555996

Pulled By: ezyang

fbshipit-source-id: b6e4d56a5658ed028ffc105cc4b479faa6882b65
---
 aten/src/ATen/OpMathType.h                    | 16 +++++++++++++++
 aten/src/ATen/native/cuda/AmpKernels.cu       |  4 ++--
 .../ATen/native/cuda/ForeachBinaryOpList.cu   |  4 ++--
 .../ATen/native/cuda/ForeachBinaryOpScalar.cu |  4 ++--
 .../native/cuda/ForeachBinaryOpScalarList.cu  |  4 ++--
 aten/src/ATen/native/cuda/ForeachFunctors.cuh | 20 ++++++++-----------
 .../ATen/native/cuda/ForeachPointwiseOp.cu    | 10 +++++-----
 aten/src/ATen/native/cuda/ForeachUnaryOp.cu   |  4 ++--
 8 files changed, 39 insertions(+), 27 deletions(-)
 create mode 100644 aten/src/ATen/OpMathType.h

diff --git a/aten/src/ATen/OpMathType.h b/aten/src/ATen/OpMathType.h
new file mode 100644
index 0000000000000..b58d4779ac7a4
--- /dev/null
+++ b/aten/src/ATen/OpMathType.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
+
+namespace at {
+
+// For FP16 or BFloat16 inputs, ops should perform internal math in FP32.
+template<typename scalar_t> struct OpMathType { using type = scalar_t; };
+template<> struct OpMathType<at::Half> { using type = float; };
+template<> struct OpMathType<at::BFloat16> { using type = float; };
+
+template<typename T>
+using opmath_type = typename OpMathType<T>::type;
+
+} // namespace at
diff --git a/aten/src/ATen/native/cuda/AmpKernels.cu b/aten/src/ATen/native/cuda/AmpKernels.cu
index a5d8a643648e7..c89d8a09e8d1d 100644
--- a/aten/src/ATen/native/cuda/AmpKernels.cu
+++ b/aten/src/ATen/native/cuda/AmpKernels.cu
@@ -59,7 +59,7 @@ void _amp_non_finite_check_and_unscale_cuda_(Tensor& scaled_grad,
       auto* found_inf_ptr = found_inf.data_ptr<float>();
       auto* inv_scale_ptr = inv_scale.data_ptr<float>();
 
-      using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+      using opmath_t = at::opmath_type<scalar_t>;
 
       gpu_kernel(iter,
                  [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (scalar_t val_in) -> scalar_t {
@@ -154,7 +154,7 @@ void _amp_foreach_non_finite_check_and_unscale_cuda_(TensorList scaled_grads,
       auto* found_inf_ptr = found_inf.data_ptr<float>();
       auto* inv_scale_ptr = inv_scale.data_ptr<float>();
 
-      using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+      using opmath_t = at::opmath_type<scalar_t>;
 
       // multi_tensor_apply guards onto tensor_lists[0][0], no need to guard explicitly.
       multi_tensor_apply<1>(tensor_lists,
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
index 0277aee6f02b1..67a27ce116feb 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
@@ -18,7 +18,7 @@ std::vector<Tensor> foreach_tensor_list_op(TensorList tensors1, TensorList tenso
     tensor_lists.emplace_back(std::move(vec_res));
 
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda", [&]() {
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+        using opmath_t = at::opmath_type<scalar_t>;
         multi_tensor_apply<3>(tensor_lists,
                               BinaryOpListAlphaFunctor<scalar_t,
                                                        /* depth */ 3,
@@ -38,7 +38,7 @@ void foreach_tensor_list_op_(TensorList tensors1, TensorList tensors2, const Sca
     tensor_lists.emplace_back(tensors2.vec());
 
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda_", [&]() {
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+        using opmath_t = at::opmath_type<scalar_t>;
         multi_tensor_apply<2>(tensor_lists,
                               BinaryOpListAlphaFunctor<scalar_t,
                                                        /* depth */ 2,
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
index 37bd5db07eeed..59a04c9efab12 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
@@ -17,7 +17,7 @@ std::vector<Tensor> foreach_binary_op(TensorList tensors, const Scalar& scalar)
     tensor_lists.emplace_back(std::move(vec_res));
 
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda", [&]() {
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+        using opmath_t = at::opmath_type<scalar_t>;
         multi_tensor_apply<2>(tensor_lists,
                               BinaryOpScalarFunctor<scalar_t,
                                                     /* depth */ 2,
@@ -35,7 +35,7 @@ void foreach_binary_op_(TensorList tensors, const Scalar& scalar) {
     tensor_lists.emplace_back(tensors.vec());
 
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda_", [&]() {
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+        using opmath_t = at::opmath_type<scalar_t>;
         multi_tensor_apply<1>(tensor_lists,
                               BinaryOpScalarFunctor<scalar_t,
                                                     /* depth */ 1,
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
index 308553c8c2726..d91c06db30d52 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
@@ -18,7 +18,7 @@ std::vector<Tensor> foreach_binary_op(TensorList tensors, at::ArrayRef<Scalar> s
     tensor_lists.emplace_back(vec_res);
 
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBFloat16, kHalf, kBool, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() {
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+        using opmath_t = at::opmath_type<scalar_t>;
         multi_tensor_apply<2, opmath_t>(tensor_lists,
                                         scalars,
                                         BinaryOpScalarListFunctor<scalar_t,
@@ -37,7 +37,7 @@ void foreach_binary_op_(TensorList tensors, at::ArrayRef<Scalar> scalars) {
     tensor_lists.emplace_back(tensors.vec());
 
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBFloat16, kHalf, kBool, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() {
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+        using opmath_t = at::opmath_type<scalar_t>;
         multi_tensor_apply<1, opmath_t>(tensor_lists,
                                         scalars,
                                         BinaryOpScalarListFunctor<scalar_t,
diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh
index 51fe5c496bd91..8a16534cec3f8 100644
--- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh
+++ b/aten/src/ATen/native/cuda/ForeachFunctors.cuh
@@ -1,16 +1,12 @@
 #pragma once
 #include <ATen/native/ForeachUtils.h>
 #include <ATen/native/cuda/MultiTensorApply.cuh>
+#include <ATen/OpMathType.h>
 
 namespace at { namespace native {
 
 namespace {
 
-// For FP16 or BFloat16 inputs, ops should perform internal math in FP32.
-template<typename scalar_t> struct get_opmath_t { using opmath_t = scalar_t; };
-template<> struct get_opmath_t<at::Half> { using opmath_t = float; };
-template<> struct get_opmath_t<at::BFloat16> { using opmath_t = float; };
-
 // Initializes args and checks if all args are aligned
 template<int depth, typename T>
 __device__ bool init_args(
@@ -158,7 +154,7 @@ __device__ __forceinline__ void pointwise_op_scalar(
 //
 template<typename T, int depth, int r_args_depth, int res_arg_index>
 struct BinaryOpScalarFunctor {
-    using opmath_t = typename get_opmath_t<T>::opmath_t;
+    using opmath_t = at::opmath_type<T>;
     template<typename Op> __device__ __forceinline__ void operator() (
         int chunk_size,
         TensorListMetadata<depth>& tl,
@@ -179,7 +175,7 @@ struct BinaryOpScalarFunctor {
 
 template<typename T, int depth, int r_args_depth, int res_arg_index>
 struct BinaryOpScalarListFunctor {
-    using opmath_t = typename get_opmath_t<T>::opmath_t;
+    using opmath_t = at::opmath_type<T>;
     template<typename Op> __device__ __forceinline__ void operator() (
         int chunk_size,
         TensorListScalarListMetadata<opmath_t, depth>& tl,
@@ -200,7 +196,7 @@ struct BinaryOpScalarListFunctor {
 
 template<typename T, int depth, int r_args_depth, int res_arg_index>
 struct BinaryOpListAlphaFunctor {
-    using opmath_t = typename get_opmath_t<T>::opmath_t;
+    using opmath_t = at::opmath_type<T>;
     template<typename Op> __device__ __forceinline__ void operator() (
         int chunk_size,
         TensorListMetadata<depth>& tl,
@@ -287,7 +283,7 @@ struct ZeroFunctor {
 
 template<typename T, int depth, int r_args_depth, int res_arg_index>
 struct UnaryOpFunctor {
-    using opmath_t = typename get_opmath_t<T>::opmath_t;
+    using opmath_t = at::opmath_type<T>;
     template<typename Op> __device__ __forceinline__ void operator() (
         int chunk_size,
         TensorListMetadata<depth>& tl,
@@ -333,7 +329,7 @@ struct UnaryOpFunctor {
 
 template<typename T, int depth, int r_args_depth, int res_arg_index>
 struct PointwiseOpScalarFunctor {
-    using opmath_t = typename get_opmath_t<T>::opmath_t;
+    using opmath_t = at::opmath_type<T>;
     template<typename Op> __device__ __forceinline__ void operator() (
         int chunk_size,
         TensorListMetadata<depth>& tl,
@@ -354,7 +350,7 @@ struct PointwiseOpScalarFunctor {
 
 template<typename T, int depth, int r_args_depth, int res_arg_index>
 struct PointwiseOpScalarListFunctor {
-    using opmath_t = typename get_opmath_t<T>::opmath_t;
+    using opmath_t = at::opmath_type<T>;
     template<typename Op> __device__ __forceinline__ void operator() (
         int chunk_size,
         TensorListScalarListMetadata<opmath_t, depth>& tl,
@@ -375,7 +371,7 @@ struct PointwiseOpScalarListFunctor {
 
 template<typename T, int depth>
 struct PointwiseOpListFunctor {
-    using opmath_t = typename get_opmath_t<T>::opmath_t;
+    using opmath_t = at::opmath_type<T>;
     template<typename Op> __device__ __forceinline__ void operator() (
         int chunk_size,
         TensorListMetadata<depth>& tl,
diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
index 977425984e99e..9440b87caedac 100644
--- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
@@ -20,7 +20,7 @@ std::vector<Tensor> foreach_pointwise_op(TensorList input, TensorList tensors1,
     tensor_lists.emplace_back(std::move(vec_res));
 
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, input[0].scalar_type(), "foreach_pointwise_op_cuda", [&]() {
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+        using opmath_t = at::opmath_type<scalar_t>;
         multi_tensor_apply<4>(tensor_lists,
                               PointwiseOpScalarFunctor<scalar_t,
                                                        /* depth */ 4,
@@ -41,7 +41,7 @@ void foreach_pointwise_op_(TensorList input, TensorList tensors1, TensorList ten
     tensor_lists.emplace_back(tensors2.vec());
 
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, input[0].scalar_type(), "foreach_pointwise_op__cuda", [&]() {
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+        using opmath_t = at::opmath_type<scalar_t>;
         multi_tensor_apply<3>(tensor_lists,
                               PointwiseOpScalarFunctor<scalar_t,
                                                        /* depth */ 3,
@@ -61,7 +61,7 @@ void foreach_pointwise_op_(TensorList input, TensorList tensors1, TensorList ten
     tensor_lists.emplace_back(tensors2.vec());
 
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, input[0].scalar_type(), "foreach_pointwise_op__cuda", [&]() {
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+        using opmath_t = at::opmath_type<scalar_t>;
         multi_tensor_apply<3, opmath_t>(tensor_lists,
                                         scalars,
                                         PointwiseOpScalarListFunctor<scalar_t,
@@ -88,7 +88,7 @@ std::vector<Tensor> foreach_pointwise_op(TensorList input, TensorList tensors1,
     tensor_lists.emplace_back(std::move(vec_res));
 
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, input[0].scalar_type(), "foreach_pointwise_op_cuda", [&]() {
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;
+        using opmath_t = at::opmath_type<scalar_t>;
         multi_tensor_apply<4, opmath_t>(tensor_lists,
                                         scalars,
                                         PointwiseOpScalarListFunctor<scalar_t,
@@ -172,7 +172,7 @@ std::vector<Tensor> foreach_tensor_##NAME##_cuda(TensorList tensors1, TensorList
     tensor_lists.emplace_back(std::move(vec_res));                                                         \
                                                                                                            \
     AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, tensors1[0].scalar_type(), "foreach_maximum_minimum_op_cuda", [&]() { \
-        using opmath_t = get_opmath_t<scalar_t>::opmath_t;                                                 \
+        using opmath_t = at::opmath_type<scalar_t>;                                                 \
         auto op = []  GPU_LAMBDA (opmath_t a, opmath_t b) -> opmath_t {                                    \
             opmath_t c = a OP b ? a : b;                                                                   \
             if (_isnan(a)) {                                                                               \
diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu
index 8d606824d2cc6..fd7a12b9dfac6 100644
--- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu
@@ -15,7 +15,7 @@ template <typename scalar_t, template<class> class Op> std::vector<Tensor> forea
     tensor_lists.emplace_back(tensors.vec());
     tensor_lists.emplace_back(std::move(vec_res));
 
-    using opmath_t = typename get_opmath_t<scalar_t>::opmath_t;
+    using opmath_t = typename at::opmath_type<scalar_t>;
     multi_tensor_apply<2>(tensor_lists,
                           UnaryOpFunctor<scalar_t,
                                          /* depth */ 2,
@@ -29,7 +29,7 @@ template <typename scalar_t, template<class> class Op> std::vector<Tensor> forea
 template <typename scalar_t, template<class> class Op> void foreach_unary_op_(TensorList tensors) {
     std::vector<std::vector<at::Tensor>> tensor_lists;
     tensor_lists.emplace_back(tensors.vec());
-    using opmath_t = typename get_opmath_t<scalar_t>::opmath_t;
+    using opmath_t = typename at::opmath_type<scalar_t>;
     multi_tensor_apply<1>(tensor_lists,
                           UnaryOpFunctor<scalar_t,
                                          /* depth */ 1,

From 950f7c023706db6db64a37b7b3c8b760679f7d3f Mon Sep 17 00:00:00 2001
From: Heitor Schueroff <heitorschueroff@fb.com>
Date: Thu, 26 Aug 2021 07:17:24 -0700
Subject: [PATCH 250/530] Added API tests to ReductionOpInfo and ported
 amax/amin/nansum tests (#62899)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62899

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D30408816

Pulled By: heitorschueroff

fbshipit-source-id: 6cb0aa7fa7edba93549ef873baa2fb8a003bd91d
---
 test/test_reductions.py                       | 242 +++++++++++++++++-
 .../_internal/common_methods_invocations.py   | 241 +++++++++++------
 2 files changed, 397 insertions(+), 86 deletions(-)

diff --git a/test/test_reductions.py b/test/test_reductions.py
index e224eaec93648..e716336e4afe7 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -2,18 +2,20 @@
 import numpy as np
 
 import math
-from typing import Dict, List
+from typing import Dict, List, Sequence
 import random
 from functools import partial
 from itertools import product, combinations, permutations
 import warnings
 
 from torch._six import inf, nan
+from torch.testing import (
+    integral_types_and, floating_and_complex_types_and)
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict,
     IS_WINDOWS, make_tensor)
 from torch.testing._internal.common_device_type import (
-    instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU,
+    OpDTypes, instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU,
     onlyOnCPUAndCUDA, onlyCUDA, largeTensorTest, ops, precisionOverride)
 from torch.testing._internal.common_methods_invocations import (
     ReductionOpInfo, reduction_ops)
@@ -55,18 +57,244 @@ def _rand_shape(dim, min_size, max_size):
         shape.append(random.randint(min_size, max_size))
     return tuple(shape)
 
+def _reduced_shape(shape, dim=None, keepdim=False):
+    """Computes the expected reduced shape given dim and keepdim
+
+    Args:
+        shape: The shape to reduce
+        dim : The dimensions to reduce
+        keepdim: If true, reduced dimensions have size 1 in the reduced shape,
+            otherwise they are removed from the reduced shape.
+
+    Returns:
+        The reduced shape
+    """
+    if dim is None:
+        return [1] * len(shape) if keepdim else []
+
+    # Wrap negative dims
+    dim = dim if isinstance(dim, Sequence) else [dim]
+    dim = set(i if i >= 0 else len(shape) + i for i in dim)
+
+    result = []
+    for i, size in enumerate(shape):
+        if i not in dim:
+            result.append(size)
+        elif keepdim:
+            result.append(1)
+
+    return result
+
 class TestReductions(TestCase):
 
     ###########################################################################
     # ReductionOpInfo unit tests
     ###########################################################################
 
-    @ops(reduction_ops, allowed_dtypes=[torch.float])
-    def test_dim_default(self, device, dtype, op: ReductionOpInfo):
-        """Tests that the default behavior is to reduce all dimensions."""
-        t = make_tensor((2, 3), device, dtype)
+    def _test_dim_keepdim(self, op: ReductionOpInfo, device, *, ndim, **dim_keepdim):
+        """Tests output shape for input with ndim and dim and keepdim kwargs"""
+        shape = torch.randint(2, 5, (ndim,)).tolist()
+        t = make_tensor(shape, device, torch.float)
+        args, kwargs = next(op.generate_args_kwargs(t, **dim_keepdim))
+        result = op(t, *args, **dim_keepdim, **kwargs)
+        expected_shape = _reduced_shape(shape, **dim_keepdim)
+        self.assertEqual(result.shape, expected_shape, f"""
+        expected output shape to be {expected_shape} but got {list(result.shape)}
+        for input shape {shape} and {dim_keepdim}
+        """)
+
+    # TODO(@heitorschueroff) combine cases with and without keepdim once
+    # there's support for a @parametrize decorator.
+
+    @ops(reduction_ops, dtypes=OpDTypes.none)
+    def test_dim_default(self, device, op: ReductionOpInfo):
+        """Tests that the default dim reduces all dimensions."""
+        for ndim in range(3):
+            self._test_dim_keepdim(op, device, ndim=ndim)
+
+    @ops(reduction_ops, dtypes=OpDTypes.none)
+    def test_dim_default_keepdim(self, device, op: ReductionOpInfo):
+        """Tests that the default dim, when keepdim=True, reduces all dimensions to size 1."""
+        for ndim in range(3):
+            self._test_dim_keepdim(op, device, ndim=ndim, keepdim=True)
+
+    @ops(reduction_ops, dtypes=OpDTypes.none)
+    def test_dim_none(self, device, op: ReductionOpInfo):
+        """Tests that dim=None reduces all dimensions."""
+        for ndim in range(3):
+            self._test_dim_keepdim(op, device, ndim=ndim, dim=None)
+
+    @ops(reduction_ops, dtypes=OpDTypes.none)
+    def test_dim_none_keepdim(self, device, op: ReductionOpInfo):
+        """Tests that dim=None, when keepdim=True, reduces all dimensions to size 1."""
+        for ndim in range(3):
+            self._test_dim_keepdim(op, device, ndim=ndim, dim=None, keepdim=True)
+
+    @ops(reduction_ops, dtypes=OpDTypes.none)
+    def test_dim_single(self, device, op: ReductionOpInfo):
+        """Tests that dim=i reduces dimension i."""
+        self._test_dim_keepdim(op, device, ndim=0, dim=0)
+        self._test_dim_keepdim(op, device, ndim=1, dim=0)
+        self._test_dim_keepdim(op, device, ndim=2, dim=-1)
+        self._test_dim_keepdim(op, device, ndim=3, dim=1)
+
+    @ops(reduction_ops, dtypes=OpDTypes.none)
+    def test_dim_single_keepdim(self, device, op: ReductionOpInfo):
+        """Tests that dim=i, when keepdim=True, reduces dimension i to size 1."""
+        self._test_dim_keepdim(op, device, ndim=0, dim=0, keepdim=True)
+        self._test_dim_keepdim(op, device, ndim=1, dim=0, keepdim=True)
+        self._test_dim_keepdim(op, device, ndim=2, dim=-1, keepdim=True)
+        self._test_dim_keepdim(op, device, ndim=3, dim=1, keepdim=True)
+
+    @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none)
+    def test_dim_empty(self, device, op: ReductionOpInfo):
+        """Tests that dim=[] is a no-op"""
+        self._test_dim_keepdim(op, device, ndim=0, dim=[])
+        self._test_dim_keepdim(op, device, ndim=2, dim=[])
+
+    @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none)
+    def test_dim_empty_keepdim(self, device, op: ReductionOpInfo):
+        """Tests that dim=[], when keepdim=True, is a no-op"""
+        self._test_dim_keepdim(op, device, ndim=0, dim=[], keepdim=True)
+        self._test_dim_keepdim(op, device, ndim=2, dim=[], keepdim=True)
+
+    @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none)
+    def test_dim_multi(self, device, op: ReductionOpInfo):
+        """Tests that dim=[i, j, ...] reduces dimensions i, j, ...."""
+        self._test_dim_keepdim(op, device, ndim=1, dim=[0])
+        self._test_dim_keepdim(op, device, ndim=3, dim=[0, 2])
+
+    @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none)
+    def test_dim_multi_keepdim(self, device, op: ReductionOpInfo):
+        """Tests that dim=[i, j, ...], when keepdim=True, reduces dimensions i, j, .... to size 1."""
+        self._test_dim_keepdim(op, device, ndim=1, dim=[0], keepdim=True)
+        self._test_dim_keepdim(op, device, ndim=3, dim=[0, 2], keepdim=True)
+
+    @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none)
+    def test_dim_multi_unsorted(self, device, op: ReductionOpInfo):
+        """Tests that operator correctly handles unsorted dim list."""
+        self._test_dim_keepdim(op, device, ndim=4, dim=[3, 0, 2])
+
+    @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none)
+    def test_dim_multi_unsorted_keepdim(self, device, op: ReductionOpInfo):
+        """Tests that operator correctly handles unsorted dim list when keepdim=True."""
+        self._test_dim_keepdim(op, device, ndim=4, dim=[3, 0, 2], keepdim=True)
+
+    @ops(filter(lambda op: op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none)
+    def test_dim_multi_duplicate(self, device, op: ReductionOpInfo):
+        """Tests that an error is raised if dim has duplicate entries."""
+        with self.assertRaises(RuntimeError):
+            self._test_dim_keepdim(op, device, ndim=3, dim=[0, 1, 1, 2])
+
+    @ops(filter(lambda op: not op.supports_multiple_dims, reduction_ops), dtypes=OpDTypes.none)
+    def test_dim_multi_unsupported(self, device, op: ReductionOpInfo):
+        """Tests that ops claiming to not support multi dim actually don't."""
+        with self.assertRaises(TypeError):
+            self._test_dim_keepdim(op, device, ndim=3, dim=[0, 2])
+
+    @ops(reduction_ops, dtypes=OpDTypes.none)
+    def test_dim_offbounds(self, device, op: ReductionOpInfo):
+        """Tests that passing an off-bounds dim throws"""
+        with self.assertRaises(IndexError):
+            self._test_dim_keepdim(op, device, ndim=2, dim=2)
+
+    @ops(reduction_ops, dtypes=OpDTypes.none)
+    def test_dim_ndim_limit(self, device, op: ReductionOpInfo):
+        """Tests that an exception is raised when reducing a tensor with more
+        than 64 dims along some specific dimensions. dim=None is ok"""
+        t = make_tensor([1] * 65, device, torch.float)
+        with self.assertRaisesRegex(RuntimeError, "only tensors with up to 64 dims are supported"):
+            op(t, dim=0)
+
+    @ops(filter(lambda op: op.identity is not None, reduction_ops), dtypes=OpDTypes.supported)
+    def test_identity(self, device, dtype, op: ReductionOpInfo):
+        """Tests that the identity value is an identity for the operator"""
+        t = make_tensor((10,), device, dtype)
+        t[1::2] = op.identity
+        args, kwargs = next(op.generate_args_kwargs(t))
+        result = op(t[::2], *args, **kwargs)
+        result_with_identity = op(t, *args, **kwargs)
+        self.assertEqual(result, result_with_identity, """
+        Adding identity value to the input tensor should not change the result.
+        """)
+
+    # TODO(@heitorschueroff) Update these to use the nan_policy kwarg once
+    # it is added to reduction operators.
+
+    @ops(filter(lambda op: op.nan_policy == 'propagate', reduction_ops), dtypes=OpDTypes.supported,
+         allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16))
+    def test_nan_policy_propagate(self, device, dtype, op: ReductionOpInfo):
+        """Tests that nan is propagated to the output by default"""
+        t = make_tensor((5,), device, dtype)
+        t[2] = torch.nan
         args, kwargs = next(op.generate_args_kwargs(t))
-        self.assertEqual(op(t, *args, **kwargs).ndim, 0)
+        result = op(t, *args, **kwargs)
+        self.assertTrue(result.isnan())
+
+    @ops(filter(lambda op: op.nan_policy == 'omit', reduction_ops), dtypes=OpDTypes.supported,
+         allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16))
+    def test_nan_policy_omit(self, device, dtype, op: ReductionOpInfo):
+        """Tests that NaN values do not affect the result."""
+        t = make_tensor((10,), device, dtype)
+        t[1::2] = torch.nan
+        args, kwargs = next(op.generate_args_kwargs(t))
+        result = op(t[::2], *args, **kwargs)
+        result_with_nan = op(t, *args, **kwargs)
+        self.assertEqual(result, result_with_nan)
+
+    @ops(reduction_ops, dtypes=OpDTypes.supported)
+    def test_result_dtype(self, device, dtype, op: ReductionOpInfo):
+        """Tests that the result has the correct dtype"""
+        t = make_tensor((5,), device, dtype)
+        args, kwargs = next(op.generate_args_kwargs(t))
+        result: torch.Tensor = op(t, *args, **kwargs)
+        is_integral = dtype in integral_types_and(torch.bool)
+        if op.promotes_int_to_float and is_integral:
+            self.assertTrue(torch.is_floating_point(result.dtype))
+        elif op.promotes_int_to_int64 and is_integral:
+            self.assertEqual(result.dtype, torch.int64)
+        elif op.result_dtype is not None:
+            self.assertEqual(result.dtype, op.result_dtype)
+        else:
+            self.assertEqual(result.dtype, dtype)
+
+    @ops(reduction_ops, dtypes=OpDTypes.none)
+    def test_empty_tensor_empty_slice(self, device, op: ReductionOpInfo):
+        """Tests for consistent behavior when reducing over an empty slice.
+
+        The rules for reducing over an empty slice are as follows:
+            - Return the identity value if the operator has one
+            - Otherwise, return NaN if the operator promotes integral dtype to
+              floating point dtypes.
+            - Otherwise, raise an error
+
+        See discussion here https://github.com/pytorch/pytorch/issues/61901
+        """
+        t = make_tensor((0, 2, 3), device, torch.float)
+        for dim in [0] + [[0, 2]] if op.supports_multiple_dims else []:
+            args, kwargs = next(op.generate_args_kwargs(t, dim=dim))
+            if op.identity is not None:
+                # Reducing along empty slice should return identity
+                result = op(t, *args, dim=dim, **kwargs)
+                self.assertEqual(result, torch.full_like(result, op.identity))
+            elif op.promotes_int_to_float:
+                # Reducing along empty slice should return NaN
+                result = op(t, *args, dim=dim, **kwargs)
+                self.assertEqual(result, torch.full_like(result, torch.nan))
+            else:
+                # Reducing along empty slice should raise an error
+                with self.assertRaises(IndexError):
+                    op(t, *args, dim=dim, **kwargs)
+
+    @ops(reduction_ops, dtypes=OpDTypes.none)
+    def test_empty_tensor_nonempty_slice(self, device, op: ReductionOpInfo):
+        """Tests that reducing a nonempty slice of an empty tensor returns an
+        empty tensor with the dimensions reduced."""
+        t = make_tensor((0, 2, 3), device, torch.float)
+        for dim in [1] + [[1, 2]] if op.supports_multiple_dims else []:
+            args, kwargs = next(op.generate_args_kwargs(t, dim=dim))
+            result = op(t, *args, dim=dim, **kwargs)
+            self.assertEqual(result.shape, _reduced_shape(t.shape, dim))
 
     ###########################################################################
     # TODO: Legacy tests - port to ReductionOpInfo
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 3839b2ef82c17..4331c92d56599 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -2170,28 +2170,6 @@ def sample_inputs_take_along_dim(op_info, device, dtype, requires_grad, **kwargs
             )
 
 
-def sample_inputs_amax_amin(op_info, device, dtype, requires_grad, **kwargs):
-    # Ordered as (input shape, kwargs)
-    test_cases: Tuple[tuple, dict] = (  # type: ignore[assignment]
-        ((S, S, S), {}),
-        ((S, S, S), {'dim': 1}),
-        ((S, S, S), {'dim': (1, 2,)}),
-        ((S, S, S), {'dim': 1, 'keepdim': True}),
-        ((), {'dim': 0}),
-        ((), {}),
-        ((), {'dim': 0, 'keepdim': True}),
-    )
-
-    samples: List[SampleInput] = []
-    for shape, kwargs in test_cases:
-        samples.append(SampleInput(
-            make_tensor(shape, device, dtype, requires_grad=requires_grad),
-            kwargs=kwargs))
-
-    return samples
-
-# TODO (@heitorschueroff) Once aminmax supports multiple dims this should
-# be combined with the above test.
 def sample_inputs_aminmax(op_info, device, dtype, requires_grad, **kwargs):
     test_cases: Tuple[tuple, dict] = (  # type: ignore[assignment]
         ((S, S, S), {}),
@@ -2210,33 +2188,6 @@ def sample_inputs_aminmax(op_info, device, dtype, requires_grad, **kwargs):
 
     return samples
 
-def sample_inputs_argmax_argmin(op_info, device, dtype, requires_grad, **kwargs):
-    test_cases = (
-        ((2, 2, 2), ()),
-        ((2, 2, 2), (0,)),
-        ((2, 2, 2), (1,)),
-        ((2, 2, 2), (2,)),
-        ((2, 2, 2), (2, True,)),
-        ((2, 2, 2), (None,)),
-        ((), (0,)),
-        ((), ()),
-        ((), (None, True,)),
-        ((1,), ()),
-        ((1,), (0,)),
-        ((1,), (0, True)),
-        ((2,), ()),
-        ((2,), (0,)),
-        ((2,), (0, True)),
-        ((2, 2, 3), ()),
-        ((2, 2, 3), (0,)),
-        ((2, 2, 3), (1,)),
-        ((2, 2, 3), (None, True)),
-    )
-    return tuple(SampleInput((make_tensor(size, device, dtype,
-                                          requires_grad=requires_grad)),
-                             args=args)
-                 for size, args in test_cases)
-
 def sample_inputs_diff(op_info, device, dtype, requires_grad, **kwargs):
     test_cases = (
         ((1,), 0, None, None),
@@ -2634,6 +2585,14 @@ def sample_inputs_reduction_quantile(op_info, device, dtype, requires_grad):
 
     return inputs
 
+def sample_inputs_reduction_count_nonzero(*args, **kwargs):
+    """Sample inputs for count_nonzero"""
+    samples: List[SampleInput] = sample_inputs_reduction(*args, **kwargs)
+    # count_nonzero does not support keepdim yet
+    for sample in samples:
+        sample.kwargs.pop('keepdim', None)
+    return samples
+
 def sample_inputs_leaky_relu(op_info, device, dtype, requires_grad):
     N = 10
     tensors = [SampleInput(make_tensor((N, N), device=device, dtype=dtype,
@@ -5823,22 +5782,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                # TODO: update sample inputs with for_inplace_variant kwarg to support this test
                SkipInfo('TestCommon', 'test_variant_consistency_eager'),),
            sample_inputs_func=sample_inputs_addcmul_addcdiv),
-    OpInfo('amax',
-           ref=lambda a, dim=None, keepdim=False, **kwargs: np.amax(a, axis=dim, keepdims=keepdim, **kwargs),
-           dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
-           sample_inputs_func=sample_inputs_amax_amin,),
-    OpInfo('amin',
-           ref=lambda a, dim=None, keepdim=False, **kwargs: np.amin(a, axis=dim, keepdims=keepdim, **kwargs),
-           dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
-           sample_inputs_func=sample_inputs_amax_amin),
-    OpInfo('argmax',
-           dtypes=all_types_and(torch.float16, torch.bfloat16),
-           supports_autograd=False,
-           sample_inputs_func=sample_inputs_argmax_argmin,),
-    OpInfo('argmin',
-           dtypes=all_types_and(torch.float16, torch.bfloat16),
-           supports_autograd=False,
-           sample_inputs_func=sample_inputs_argmax_argmin,),
     UnaryUfuncInfo('asin',
                    aliases=('arcsin', ),
                    ref=np.arcsin,
@@ -7096,10 +7039,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            supports_out=False,
            supports_forward_ad=True,
            sample_inputs_func=sample_inputs_max_min_reduction_no_dim,),
-    OpInfo('nansum',
-           dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
-           supports_out=False,
-           sample_inputs_func=sample_inputs_reduction),
     # TODO(@heitorschueroff) Add test for dtype kwarg
     OpInfo('mean',
            dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
@@ -7458,16 +7397,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            supports_forward_ad=True,
            skips=(
                SkipInfo('TestMathBits', 'test_conj_view', device_type='cuda'),),),
-    OpInfo('prod',
-           dtypes=all_types_and_complex_and(torch.bool),
-           dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
-           skips=(
-               # prod does not support the (Tensor, *, out) overload
-               SkipInfo('TestCommon', 'test_out',
-                        dtypes=[torch.float32]),
-           ),
-           sample_inputs_func=sample_inputs_prod,
-           gradcheck_nondet_tol=GRADCHECK_NONDET_TOL),
     OpInfo('qr',
            op=torch.qr,
            dtypes=floating_and_complex_types(),
@@ -8953,13 +8882,167 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
             ),
         ),
     ),
+    ReductionOpInfo(
+        'all',
+        identity=True,
+        supports_multiple_dims=False,
+        supports_out=False,
+        supports_autograd=False,
+        result_dtype=torch.bool,
+        dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        skips=(
+            # FIXME: does not support passing keepdim without dim
+            SkipInfo('TestReductions', 'test_dim_default_keepdim'),
+            # FIXME: does not support dim=None
+            SkipInfo('TestReductions', 'test_dim_none'),
+            SkipInfo('TestReductions', 'test_dim_none_keepdim'),
+            # FIXME: uint8 input returns uint8 instead of bool
+            SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]),
+        ),
+    ),
+    ReductionOpInfo(
+        'any',
+        identity=False,
+        supports_multiple_dims=False,
+        supports_out=False,
+        supports_autograd=False,
+        result_dtype=torch.bool,
+        dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        skips=(
+            # FIXME: does not support passing keepdim without dim
+            SkipInfo('TestReductions', 'test_dim_default_keepdim'),
+            # FIXME: does not support dim=None
+            SkipInfo('TestReductions', 'test_dim_none'),
+            SkipInfo('TestReductions', 'test_dim_none_keepdim'),
+            # FIXME: uint8 input returns uint8 instead of bool
+            SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]),
+        ),
+    ),
+    ReductionOpInfo(
+        'amax',
+        nan_policy='propagate',
+        dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
+        ref=lambda a, dim=None, keepdim=False, **kwargs: np.amax(a, axis=dim, keepdims=keepdim, **kwargs),
+        skips=(
+            # FIXME: sum reduces all dimensions when dim=[]
+            SkipInfo('TestReductions', 'test_dim_empty'),
+            SkipInfo('TestReductions', 'test_dim_empty_keepdim'),
+        ),
+    ),
+    ReductionOpInfo(
+        'amin',
+        nan_policy='propagate',
+        dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
+        ref=lambda a, dim=None, keepdim=False, **kwargs: np.amin(a, axis=dim, keepdims=keepdim, **kwargs),
+        skips=(
+            # FIXME: sum reduces all dimensions when dim=[]
+            SkipInfo('TestReductions', 'test_dim_empty'),
+            SkipInfo('TestReductions', 'test_dim_empty_keepdim'),
+        ),
+    ),
+    ReductionOpInfo(
+        'argmax',
+        supports_multiple_dims=False,
+        supports_autograd=False,
+        result_dtype=torch.int64,
+        dtypes=all_types_and(torch.float16, torch.bfloat16),
+        skips=(
+            # FIXME: keepdim parameter is ignored when dim=None
+            SkipInfo('TestReductions', 'test_dim_default_keepdim'),
+            SkipInfo('TestReductions', 'test_dim_none_keepdim'),
+        ),
+    ),
+    ReductionOpInfo(
+        'argmin',
+        supports_multiple_dims=False,
+        supports_autograd=False,
+        result_dtype=torch.int64,
+        dtypes=all_types_and(torch.float16, torch.bfloat16),
+        skips=(
+            # FIXME: keepdim parameter is ignored when dim=None
+            SkipInfo('TestReductions', 'test_dim_default_keepdim'),
+            SkipInfo('TestReductions', 'test_dim_none_keepdim'),
+        ),
+    ),
+    ReductionOpInfo(
+        'count_nonzero',
+        identity=0,
+        supports_out=False,
+        supports_autograd=False,
+        result_dtype=torch.int64,
+        dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        sample_inputs_func=sample_inputs_reduction_count_nonzero,
+        skips=(
+            # FIXME: count_nonzero does not accept keepdim kwarg
+            SkipInfo('TestReductions', 'test_dim_default_keepdim'),
+            SkipInfo('TestReductions', 'test_dim_none_keepdim'),
+            SkipInfo('TestReductions', 'test_dim_single_keepdim'),
+            SkipInfo('TestReductions', 'test_dim_empty_keepdim'),
+            SkipInfo('TestReductions', 'test_dim_multi_keepdim'),
+            SkipInfo('TestReductions', 'test_dim_multi_unsorted_keepdim'),
+            SkipInfo('TestReductions', 'test_dim_offbounds_keepdim'),
+            # FIXME: dim=[] reduces all dimensions
+            SkipInfo('TestReductions', 'test_dim_empty'),
+        ),
+    ),
+    ReductionOpInfo(
+        'prod',
+        identity=1,
+        nan_policy='propagate',
+        supports_multiple_dims=False,
+        supports_out=False,
+        promotes_int_to_int64=True,
+        gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
+        dtypes=all_types_and_complex_and(torch.bool),
+        dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        sample_inputs_func=sample_inputs_prod,
+        skips=(
+            # FIXME: prod does not support passing keepdim without passing dim
+            SkipInfo('TestReductions', 'test_dim_default_keepdim'),
+            # FIXME: prod reduces all dimensions when dim=[]
+            SkipInfo('TestReductions', 'test_dim_empty'),
+            SkipInfo('TestReductions', 'test_dim_empty_keepdim'),
+            # FIXME: prod does not support passing None to dim
+            SkipInfo('TestReductions', 'test_dim_none'),
+            SkipInfo('TestReductions', 'test_dim_none_keepdim'),
+        ),
+    ),
     ReductionOpInfo(
         'sum',
         identity=0,
+        nan_policy='propagate',
         supports_out=False,
         supports_forward_ad=True,
         promotes_int_to_int64=True,
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        skips=(
+            # FIXME: sum does not support passing keepdim without passing dim
+            SkipInfo('TestReductions', 'test_dim_default_keepdim'),
+            # FIXME: sum reduces all dimensions when dim=[]
+            SkipInfo('TestReductions', 'test_dim_empty'),
+            SkipInfo('TestReductions', 'test_dim_empty_keepdim'),
+            # FIXME: sum does not support passing None to dim
+            SkipInfo('TestReductions', 'test_dim_none'),
+            SkipInfo('TestReductions', 'test_dim_none_keepdim'),
+        ),
+    ),
+    ReductionOpInfo(
+        'nansum',
+        identity=0,
+        nan_policy='omit',
+        supports_out=False,
+        promotes_int_to_int64=True,
+        dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16),
+        skips=(
+            # FIXME: nansum does not support passing keepdim without passing dim
+            SkipInfo('TestReductions', 'test_dim_default_keepdim'),
+            # FIXME: nansum reduces all dimensions when dim=[]
+            SkipInfo('TestReductions', 'test_dim_empty'),
+            SkipInfo('TestReductions', 'test_dim_empty_keepdim'),
+            # FIXME: nansum does not support passing None to dim
+            SkipInfo('TestReductions', 'test_dim_none'),
+            SkipInfo('TestReductions', 'test_dim_none_keepdim'),
+        ),
     ),
 ]
 

From 733755f72ca15feef8deeb512925639ef15f92d7 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Thu, 26 Aug 2021 07:48:20 -0700
Subject: [PATCH 251/530] remove special grad_mode tls handling (#63116)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63116

This PR removes the special flag to disable grad mode tracking on the ThreadLocalState and replaces it with an explicit setter that users can use.
This allows to reduce complexity of ThreadLocalState.

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D30388098

Pulled By: albanD

fbshipit-source-id: 85641b3d711179fb78ff6a41ed077548dc821a2f
---
 aten/src/ATen/ThreadLocalState.cpp            | 26 +++++--------------
 aten/src/ATen/ThreadLocalState.h              | 14 +++++-----
 torch/csrc/autograd/engine.cpp                |  2 --
 torch/csrc/autograd/engine.h                  | 13 +++++-----
 .../autograd/engine/dist_engine.cpp           |  1 -
 5 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp
index fc4b8fa9c27ec..98c2519e045ce 100644
--- a/aten/src/ATen/ThreadLocalState.cpp
+++ b/aten/src/ATen/ThreadLocalState.cpp
@@ -9,40 +9,26 @@
 
 namespace at {
 
-ThreadLocalState::ThreadLocalState(bool keep_grad_mode)
+ThreadLocalState::ThreadLocalState()
     : dispatch_key_(c10::impl::tls_local_dispatch_key_set()),
       debug_info_(c10::ThreadLocalDebugInfo::current()),
       autograd_tls_(c10::AutogradState::get_tls_state()) {
   rf_tls_ = at::get_record_function_tls_();
   saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks();
 
-#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  keep_grad_mode_ = keep_grad_mode;
-#endif
   bumped_record_all_functions_ = at::checkRecordAllFunctions();
 }
 
+void ThreadLocalState::set_grad_mode(bool enabled) {
+  autograd_tls_.set_grad_mode(enabled);
+}
+
 /* static */
 void ThreadLocalState::setThreadLocalState(
     const ThreadLocalState& state) {
   // Note that setting the InferenceMode TLS in this function is ONLY ok because we always
   // restore the dispatch key set TLS at the same time.
-#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  if (state.keep_grad_mode_) {
-    c10::AutogradState::set_tls_state(state.autograd_tls_);
-  } else {
-    auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(),
-                                        /* inference_mode */ state.autograd_tls_.get_inference_mode());
-    c10::AutogradState::set_tls_state(new_state);
-  }
-#else
-  // The mobile build explicitly ignore grad_mode but fails if we propagate
-  // its value across threads or set it to a fixed value.
-  // So we have to make sure the grad_mode value is not changed here.
-  auto new_state = c10::AutogradState(/* grad_mode */ c10::AutogradState::get_tls_state().get_grad_mode(),
-                                      /* inference_mode */ state.autograd_tls_.get_inference_mode());
-  c10::AutogradState::set_tls_state(new_state);
-#endif
+  c10::AutogradState::set_tls_state(state.autograd_tls_);
 
   at::set_record_function_tls_(state.rf_tls_);
 
diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h
index 4942399cbd6d7..41146912819b4 100644
--- a/aten/src/ATen/ThreadLocalState.h
+++ b/aten/src/ATen/ThreadLocalState.h
@@ -16,10 +16,12 @@ class TORCH_API ThreadLocalState {
  public:
   // Saves the thread local variables' values and
   // returns them as a ThreadLocalState
-  // keep_grad_mode - whether grad mode has to be preserved
-  //  (e.g. not preserved when passing from forward pass into
-  //   the autograd engine, autograd engine takes care of grad mode)
-  ThreadLocalState(bool keep_grad_mode = true);
+  ThreadLocalState();
+
+  // set_grad_mode - force the value of the grad mode TLS in
+  //  the current state object. This is used for example in the
+  //  autograd engine.
+  void set_grad_mode(bool enabled);
 
   // Sets thread local variables in the current thread,
   // according to the thread boundary specified
@@ -35,10 +37,8 @@ class TORCH_API ThreadLocalState {
   // RecordFunction TLS
   RecordFunctionTLS rf_tls_;
 
+  // TLS for AutogradModes
   AutogradState autograd_tls_;
-#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  bool keep_grad_mode_ = true;
-#endif
 
   // TLS for saved tensors default hooks
   std::pair<PyObject*, PyObject*> saved_tensors_default_hooks_;
diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp
index de2078d2d6432..acd7971aad6a7 100644
--- a/torch/csrc/autograd/engine.cpp
+++ b/torch/csrc/autograd/engine.cpp
@@ -411,7 +411,6 @@ auto Engine::thread_main(const std::shared_ptr<GraphTask>& graph_task) -> void {
         // NB: The ThreadLocalStateGuard doesn't set the grad_mode because GraphTask
         // always saves ThreadLocalState without grad_mode.
         at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_);
-        AutoGradMode grad_mode(local_graph_task->grad_mode_);
 
         try {
           // The guard sets the thread_local current_graph_task on construction
@@ -580,7 +579,6 @@ void GraphTask::exec_post_processing() {
     // NB: The ThreadLocalStateGuard doesn't set the grad_mode because GraphTask
     // always saves ThreadLocalState without grad_mode.
     at::ThreadLocalStateGuard tls_guard(this->thread_locals_);
-    AutoGradMode grad_mode(this->grad_mode_);
 
     // WARNING: Don't use a range-for loop here because more callbacks may be
     // added in between callback calls, so iterators may become invalidated.
diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h
index 17318473bcfcd..dd465f96c350e 100644
--- a/torch/csrc/autograd/engine.h
+++ b/torch/csrc/autograd/engine.h
@@ -53,9 +53,8 @@ struct GraphTask: std::enable_shared_from_this<GraphTask> {
   // true, it signals all threads to stop executing.
   std::atomic_bool has_error_{false};
   std::atomic_bool future_completed_{false};
-  // It is safe to read grad_mode_ and keep_graph_ without synchronization
+  // It is safe to read keep_graph_ without synchronization
   bool keep_graph_;
-  bool grad_mode_;
 
   // To protect reads/writes to not_ready_, dependencies_, captured_vars_,
   // has_error_, future_result_, cpu_ready_queue_, and leaf_streams.
@@ -110,8 +109,9 @@ struct GraphTask: std::enable_shared_from_this<GraphTask> {
   // out of the GraphTask and are no longer valid.
   std::vector<Variable> captured_vars_;
 
-  at::ThreadLocalState thread_locals_ =
-      at::ThreadLocalState(/* keep_grad_mode */ false);
+  // Note: this field is not ready to be used until the proper `thread_locals_.set_grad_mode()`
+  // call in the constructor.
+  at::ThreadLocalState thread_locals_ = at::ThreadLocalState();
 
   std::unordered_set<c10::Stream> leaf_streams;
 
@@ -180,12 +180,13 @@ struct GraphTask: std::enable_shared_from_this<GraphTask> {
       std::shared_ptr<ReadyQueue> cpu_ready_queue,
       bool exit_on_error = false)
       : keep_graph_(keep_graph),
-        grad_mode_(grad_mode),
         owner_(NO_DEVICE),
         reentrant_depth_(reentrant_depth),
         exit_on_error_(exit_on_error),
         cpu_ready_queue_(std::move(cpu_ready_queue)),
-        future_result_(c10::make_intrusive<at::ivalue::Future>(c10::ListType::create(c10::TensorType::get()))) {}
+        future_result_(c10::make_intrusive<at::ivalue::Future>(c10::ListType::create(c10::TensorType::get()))) {
+    thread_locals_.set_grad_mode(grad_mode);
+        }
  private:
   // run GraphTask post processing
   void exec_post_processing();
diff --git a/torch/csrc/distributed/autograd/engine/dist_engine.cpp b/torch/csrc/distributed/autograd/engine/dist_engine.cpp
index 4a3b3fff2e20b..e6522c33280a9 100644
--- a/torch/csrc/distributed/autograd/engine/dist_engine.cpp
+++ b/torch/csrc/distributed/autograd/engine/dist_engine.cpp
@@ -360,7 +360,6 @@ void DistEngine::execute_graph_task_until_ready_queue_empty(
       }
       if (task.fn_ && !local_graph_task->has_error_.load()) {
         at::ThreadLocalStateGuard tls_guard(local_graph_task->thread_locals_);
-        AutoGradMode grad_mode(local_graph_task->grad_mode_);
         try {
           GraphTaskGuard guard(local_graph_task);
           engine_.evaluate_function(

From 7ccc4b5cc8c9a256bca151776444bf94bc28e5f6 Mon Sep 17 00:00:00 2001
From: "Rong Rong (AI Infra)" <rongr@fb.com>
Date: Thu, 26 Aug 2021 08:00:48 -0700
Subject: [PATCH 252/530] [CI] move distributed test into its own CI job
 (#62896)

Summary:
Moving distributed to its own job.

- [x] ensure there should be a distributed test job for every default test job matrix (on GHA)
- [x] ensure that circleci jobs works for distributed as well
- [x] waiting for test distributed to have its own run_test.py launch options, see https://github.com/pytorch/pytorch/issues/63147

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62896

Reviewed By: seemethere

Differential Revision: D30230856

Pulled By: walterddr

fbshipit-source-id: 0cad620f6cd9e56c727c105458d76539a5ae976f
---
 .../cimodel/data/pytorch_build_definitions.py | 21 ++++++++
 .circleci/config.yml                          | 49 +++++++++++++++++++
 .github/scripts/generate_ci_workflows.py      |  5 ++
 .../scripts/generate_pytorch_test_matrix.py   |  2 +
 .github/templates/linux_ci_workflow.yml.j2    |  1 +
 ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml |  1 +
 ...rated-linux-bionic-py3.8-gcc9-coverage.yml |  1 +
 ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml |  1 +
 ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml |  1 +
 .../generated-linux-xenial-py3.6-gcc5.4.yml   |  1 +
 ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml |  1 +
 .jenkins/pytorch/test.sh                      | 10 +++-
 12 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py
index bdc977270c22e..d7b20158759d0 100644
--- a/.circleci/cimodel/data/pytorch_build_definitions.py
+++ b/.circleci/cimodel/data/pytorch_build_definitions.py
@@ -415,6 +415,27 @@ def instantiate_configs(only_slow_gradcheck):
             )
             c.dependent_tests.append(bc_breaking_check)
 
+        if (
+            compiler_name != "clang"
+            and not rocm_version
+            and not is_libtorch
+            and not is_vulkan
+            and not is_pure_torch
+            and not is_noarch
+            and not is_slow_gradcheck
+            and not only_slow_gradcheck
+        ):
+            distributed_test = Conf(
+                c.gen_build_name("") + "distributed",
+                [],
+                is_xla=False,
+                restrict_phases=["test"],
+                is_libtorch=False,
+                is_important=True,
+                parent_build=c,
+            )
+            c.dependent_tests.append(distributed_test)
+
         config_list.append(c)
 
     return config_list
diff --git a/.circleci/config.yml b/.circleci/config.yml
index cb3e148e2e162..1bb32b5cc0a3d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7159,6 +7159,13 @@ workflows:
           build_environment: "pytorch-linux-backward-compatibility-check-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
           resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_linux_xenial_py3_6_gcc5_4_distributed_test
+          requires:
+            - pytorch_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-pytorch_linux_xenial_py3_6_gcc5_4_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
           requires:
@@ -7184,6 +7191,13 @@ workflows:
           build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
           resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed_test
+          requires:
+            - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
           requires:
@@ -7209,6 +7223,13 @@ workflows:
           build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
           resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed_test
+          requires:
+            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build
           requires:
@@ -7246,6 +7267,13 @@ workflows:
           build_environment: "pytorch-linux-xenial-py3.6-gcc7-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7"
           resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_linux_xenial_py3_6_gcc7_distributed_test
+          requires:
+            - pytorch_linux_xenial_py3_6_gcc7_build
+          build_environment: "pytorch-linux-pytorch_linux_xenial_py3_6_gcc7_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_linux_xenial_py3_clang7_asan_build
           requires:
@@ -7380,6 +7408,13 @@ workflows:
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
           use_cuda_docker_runtime: "1"
           resource_class: gpu.medium
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_distributed_test
+          requires:
+            - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
+          build_environment: "pytorch-linux-pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build
           requires:
@@ -7402,6 +7437,13 @@ workflows:
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7"
           use_cuda_docker_runtime: "1"
           resource_class: gpu.medium
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_distributed_test
+          requires:
+            - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build
+          build_environment: "pytorch-linux-pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_linux_bionic_py3_6_clang9_noarch_build
           requires:
@@ -7463,6 +7505,13 @@ workflows:
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7"
           use_cuda_docker_runtime: "1"
           resource_class: gpu.medium
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_distributed_test
+          requires:
+            - pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build
+          build_environment: "pytorch-linux-pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_linux_bionic_rocm3_9_py3_6_build
           requires:
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index f1b962521b18d..cd7065dbfaa47 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -138,10 +138,12 @@ class CIWorkflow:
     only_build_on_pull_request: bool = False
     only_run_smoke_tests_on_pull_request: bool = False
     num_test_shards_on_pull_request: int = -1
+    distributed_test: bool = True
 
     # The following variables will be set as environment variables,
     # so it's easier for both shell and Python scripts to consume it if false is represented as the empty string.
     enable_jit_legacy_test: YamlShellBool = "''"
+    enable_distributed_test: YamlShellBool = "''"
     enable_multigpu_test: YamlShellBool = "''"
     enable_nogpu_no_avx_test: YamlShellBool = "''"
     enable_nogpu_no_avx2_test: YamlShellBool = "''"
@@ -154,6 +156,9 @@ def __post_init__(self) -> None:
         if not self.on_pull_request:
             self.only_build_on_pull_request = False
 
+        if self.distributed_test:
+            self.enable_distributed_test = 1
+
         # If num_test_shards_on_pull_request is not user-defined, default to num_test_shards unless we are
         # only running smoke tests on the pull request.
         if self.num_test_shards_on_pull_request == -1:
diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py
index d8860a02a5c37..75df57cfa2f89 100755
--- a/.github/scripts/generate_pytorch_test_matrix.py
+++ b/.github/scripts/generate_pytorch_test_matrix.py
@@ -51,6 +51,8 @@ def main() -> None:
         configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE}
     if NOGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'):
         configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE}
+    if os.getenv('ENABLE_DISTRIBUTED_TEST'):
+        configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
     if os.getenv('ENABLE_SLOW_TEST'):
         configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
     matrix = {
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index f63685295bbce..d9af899b04b66 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -248,6 +248,7 @@ jobs:
     {%- endif %}
     env:
       TEST_RUNNER_TYPE: !{{ test_runner_type }}
+      ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }}
       ENABLE_JIT_LEGACY_TEST: !{{ enable_jit_legacy_test }}
       ENABLE_MULTIGPU_TEST: !{{ enable_multigpu_test }}
       ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }}
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 0b3dddd3930e5..769efcaa80a3b 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -224,6 +224,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: ''
       ENABLE_MULTIGPU_TEST: ''
       ENABLE_NOGPU_NO_AVX_TEST: ''
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 624e9d0d92c5b..ddd81c079df7c 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -224,6 +224,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.2xlarge
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: ''
       ENABLE_MULTIGPU_TEST: ''
       ENABLE_NOGPU_NO_AVX_TEST: ''
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 99a9f1f778f9f..5a888d0104174 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -224,6 +224,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: 1
       ENABLE_MULTIGPU_TEST: 1
       ENABLE_NOGPU_NO_AVX_TEST: 1
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index be56b56ee715b..25d74de36dc66 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -224,6 +224,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: ''
       ENABLE_MULTIGPU_TEST: ''
       ENABLE_NOGPU_NO_AVX_TEST: ''
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index c1b877c7f9c10..341f9e6da6e8c 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -224,6 +224,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.2xlarge
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: ''
       ENABLE_MULTIGPU_TEST: ''
       ENABLE_NOGPU_NO_AVX_TEST: ''
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 375c4b65aaf89..470fdaaad4230 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -222,6 +222,7 @@ jobs:
     needs: [ciflow_should_run]
     env:
       TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
+      ENABLE_DISTRIBUTED_TEST: 1
       ENABLE_JIT_LEGACY_TEST: ''
       ENABLE_MULTIGPU_TEST: ''
       ENABLE_NOGPU_NO_AVX_TEST: ''
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index e27ba3e0cd838..daa0da7eeca26 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -19,6 +19,11 @@ BUILD_DIR="build"
 BUILD_RENAMED_DIR="build_renamed"
 BUILD_BIN_DIR="$BUILD_DIR"/bin
 
+# GHA has test config defined for the test job, so we need to add them.
+if [[ -n "${TEST_CONFIG}" ]]; then
+    BUILD_ENVIRONMENT="${BUILD_ENVIRONMENT}-${TEST_CONFIG}"
+fi
+
 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 
@@ -522,6 +527,9 @@ elif [[ "${BUILD_ENVIRONMENT}" == *vulkan-linux* ]]; then
   test_vulkan
 elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
   test_bazel
+elif [[ "${BUILD_ENVIRONMENT}" == *distributed* ]]; then
+  test_distributed
+  test_rpc
 else
   install_torchvision
   install_monkeytype
@@ -532,9 +540,7 @@ else
   test_custom_script_ops
   test_custom_backend
   test_torch_function_benchmark
-  test_distributed
   test_benchmarks
-  test_rpc
   if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-py3.6-gcc7-test* || "${BUILD_ENVIRONMENT}" == *linux-xenial-py3.6-gcc5.4-test* ]]; then
     test_python_gloo_with_tls
   fi

From fbe7133b5842a55589e097c8b045ceb08cb346b2 Mon Sep 17 00:00:00 2001
From: Don Jang <djang@fb.com>
Date: Thu, 26 Aug 2021 08:08:53 -0700
Subject: [PATCH 253/530] [Static Runtime] Disable out variant of aten::clone
 (#63980)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63980

The out variant implementation of `aten::clone` causes a crash, which needs further investigation. This change disables it until the problem gets fixed.

Note that `inline_cvr` doesn't use `aten::clone` as of now, so no perf implication: https://www.internalfb.com/phabricator/paste/view/P446858755?lines=121

Test Plan: N/A

Reviewed By: hlu1

Differential Revision: D30544149

fbshipit-source-id: facb334d67473f622b36862fbdb2633358556fdf
---
 torch/csrc/jit/runtime/static/ops.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 140fdf188a951..4d34ed9388364 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -601,11 +601,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator {
   };
 });
 
+// TODO(T98923825): Uncomment this once the bug in this gets fixed.
+/*
 REGISTER_OPERATOR_FUNCTOR(aten::clone, aten_clone, [](Node* n) -> SROperator {
   if (!n->matches(torch::schema(
-          "aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor"))) {
-    LogAndDumpSchema(n);
-    return nullptr;
+          "aten::clone(Tensor self, *, MemoryFormat? memory_format=None) ->
+Tensor"))) { LogAndDumpSchema(n); return nullptr;
   }
   return [](ProcessedNode* p_node) {
     const auto& src = p_node->Input(0).toTensor();
@@ -631,6 +632,8 @@ REGISTER_OPERATOR_FUNCTOR(aten::clone, aten_clone, [](Node* n) -> SROperator {
     at::native::copy_(out_t, src, false);
   };
 });
+*/
+
 REGISTER_OPERATOR_FUNCTOR(
     quantized::embedding_bag_byte_rowwise_offsets,
     quantized_embedding_bag_byte_rowwise_offsets,

From 1354ee417a4abfaacd8333fb6316c6f58494c0d7 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@fb.com>
Date: Thu, 26 Aug 2021 09:27:47 -0700
Subject: [PATCH 254/530] run_test.py: add option to run only core tests
 (#63976)

Summary:
This is in response to a feature request from some folks in the core team to have a local command that would only run relevant "core" tests. The idea is to have a local smoke test option for developers to run locally before making a PR in order to verify their changes did not break core functionality. These smoke tests are not targeted to be short but rather relevant.

This PR enables that by allowing developers to run `python test/run_test.py --core` or `python test/run_test.py -core` in order to run the CORE_TEST_LIST, which is currently test_nn.py, test_torch.py, and test_ops.py.

I am not the best person to judge what should be considered "core", so please comment which tests should be included and/or excluded from the CORE_TEST_LIST!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63976

Test Plan:
```
(pytorch) janeyx@janeyx-mbp test % python run_test.py --core -v
Selected tests: test_nn, test_ops, test_torch
Running test_nn ... [2021-08-25 14:48:28.865078]
Executing ['/Users/janeyx/miniconda3/envs/pytorch/bin/python', 'test_nn.py', '-v'] ... [2021-08-25 14:48:28.865123]
test_to (__main__.PackedSequenceTest) ... ok
test_to_memory_format (__main__.PackedSequenceTest) ... ok
```

Reviewed By: walterddr

Differential Revision: D30575560

Pulled By: janeyx99

fbshipit-source-id: 3f151982c1e315e50e60cb0d818adaea34556a04
---
 test/run_test.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/test/run_test.py b/test/run_test.py
index d3c661093a6e8..dd95e13de8e36 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -281,6 +281,14 @@
 
 WINDOWS_COVERAGE_BLOCKLIST = []
 
+# A subset of our TEST list that validates PyTorch's ops, modules, and autograd function as expected
+CORE_TEST_LIST = [
+    "test_autograd",
+    "test_modules",
+    "test_nn",
+    "test_ops",
+    "test_torch"
+]
 
 # the JSON file to store the S3 test stats
 TEST_TIMES_FILE = ".pytorch-test-times.json"
@@ -629,6 +637,13 @@ def parse_args():
         action="store_true",
         help="run all distributed tests",
     )
+    parser.add_argument(
+        "-core",
+        "--core",
+        action="store_true",
+        help="Only run core tests, or tests that validate PyTorch's ops, modules,"
+        "and autograd. They are defined by CORE_TEST_LIST."
+    )
     parser.add_argument(
         "-pt",
         "--pytest",
@@ -830,6 +845,12 @@ def get_selected_tests(options):
             filter(lambda test_name: test_name in DISTRIBUTED_TESTS, selected_tests)
         )
 
+    # Filter to only run core tests when --core option is specified
+    if options.core:
+        selected_tests = list(
+            filter(lambda test_name: test_name in CORE_TEST_LIST, selected_tests)
+        )
+
     # process reordering
     if options.bring_to_front:
         to_front = set(options.bring_to_front)

From ba5f1b1076ed6fae4a46d7317204963a4cd53701 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Thu, 26 Aug 2021 09:41:58 -0700
Subject: [PATCH 255/530] [nnc] Fix dtype promotion involving scalars (#64002)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64002

Fixes https://github.com/pytorch/vision/issues/4315

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30566979

Pulled By: bertmaher

fbshipit-source-id: eaa98b9534a926be7fcd337d46c5a0acb3243179
---
 torch/csrc/jit/passes/tensorexpr_fuser.cpp |  2 +-
 torch/csrc/jit/tensorexpr/kernel.cpp       | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index 085291afbdcf8..833c338578616 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -135,7 +135,7 @@ const OperatorSet& supported_eltwise_set() {
       "aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor",
       // "aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor",
       // "aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor", TODO: requires 0-dim Tensor
-      "aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor",
+      // "aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor",
       "aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor",
       "aten::sigmoid(Tensor self) -> Tensor",
       "aten::relu(Tensor self) -> Tensor",
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index d53e857d75a48..c2726a0fafb01 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -454,6 +454,11 @@ std::vector<ExprHandle> computeIndicesToBroadcast(
   return bcast;
 }
 
+bool isScalar(ExprHandle e) {
+  auto n = e.node();
+  return n->isConstant() || to<Var>(n);
+}
+
 void promoteInputs(std::vector<ExprHandle>& inputs, const int typeConstraints) {
   if (inputs.empty()) {
     return;
@@ -462,7 +467,16 @@ void promoteInputs(std::vector<ExprHandle>& inputs, const int typeConstraints) {
   // Find the highest type among the inputs.
   ScalarType highType = inputs[0].dtype().scalar_type();
   for (auto input : inputs) {
-    highType = promoteTypes(highType, input.dtype().scalar_type());
+    auto inputType = input.dtype().scalar_type();
+    if (isScalar(input)) {
+      if (isIntegralType(highType, false) && isFloatingType(inputType)) {
+        highType = c10::get_default_dtype_as_scalartype();
+      } else if (highType == c10::kBool) {
+        highType = inputType;
+      }
+    } else {
+      highType = promoteTypes(highType, inputType);
+    }
   }
 
   if (!checkTypes(highType, typeConstraints)) {

From 6d31ba6ddcf5f839bd2d3f20a19712846d030d8b Mon Sep 17 00:00:00 2001
From: Raghavan Raman <raghavanr@fb.com>
Date: Thu, 26 Aug 2021 09:49:44 -0700
Subject: [PATCH 256/530] [nnc] Sanitized the names of constants in the input
 graph. (#63990)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63923

The input graph can contain constants whose names contain special characters. So, all names of constants in the input graph need to be sanitized.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63990

Reviewed By: ZolotukhinM

Differential Revision: D30558432

Pulled By: navahgar

fbshipit-source-id: de5b0c23d50ee8997f40f2c0fc605dda3719186f
---
 test/cpp/tensorexpr/test_kernel.cpp  | 37 ++++++++++++++++++++++++++++
 torch/csrc/jit/tensorexpr/kernel.cpp |  2 +-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp
index 8cdf2ef90df11..625fadb811710 100644
--- a/test/cpp/tensorexpr/test_kernel.cpp
+++ b/test/cpp/tensorexpr/test_kernel.cpp
@@ -1195,6 +1195,43 @@ TEST_F(Kernel, SanitizeNames_CUDA) {
   ASSERT_TRUE(at::allclose(o, ref));
 }
 
+TEST_F(Kernel, SanitizeConstants_CUDA) {
+  const auto graph_string = R"IR(
+        graph(%x : Float(16, 16, strides=[16, 1], device=cuda:0)):
+          %none : NoneType = prim::Constant()
+          %size : int = prim::Constant[value=16]()
+          %sizes : int[] = prim::ListConstruct(%size, %size)
+          %30 : Device = prim::Constant[value="cuda"]()
+          %y : Float(16, 16, strides=[16, 1], device=cuda:0) = aten::ones(%sizes, %none, %none, %30, %none)
+          %z : Float(16, 16, strides=[16, 1], device=cuda:0) = aten::mul(%x, %y)
+          return (%z))IR";
+  auto graph = std::make_shared<Graph>();
+  parseIR(graph_string, &*graph);
+  // IRParser doesn't support tensor constants, so we insert a call to
+  // aten::ones and then const-prop it
+  ConstantPropagation(graph);
+
+  // We set the name of the constant to include special characters that are
+  // not allowed. This should be fixed by the sanitizer in TensorExprKernel.
+  graph->nodes().front()->output()->setDebugName("illegal.name");
+
+  // Check if we have a constant node with illegal name in the graph.
+  auto const_node = graph->nodes().front();
+  ASSERT_EQ(const_node->kind(), prim::Constant);
+  ASSERT_NE(const_node->output()->debugName().find('.'), std::string::npos);
+
+  TensorExprKernel k(graph);
+
+  auto x = at::rand({16, 16}, TensorOptions(kCUDA).dtype(at::kFloat));
+  std::vector<at::Tensor> inputs = {x};
+  std::vector<IValue> stack = fmap<IValue>(inputs);
+  k.run(stack);
+  auto o = stack[0].toTensor();
+  auto y = at::ones({16, 16}, TensorOptions(kCUDA).dtype(at::kFloat));
+  auto ref = x * y;
+  ASSERT_TRUE(at::allclose(o, ref));
+}
+
 TEST_F(Kernel, ConstantTensors) {
   const auto graph_string = R"IR(
         graph(%x : Float(16, 16, strides=[16, 1], device=cpu)):
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index c2726a0fafb01..f72fbf7c18c37 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -2996,7 +2996,7 @@ void TensorExprKernel::bindConstant(const torch::jit::Value* v) {
   }
 
   BufPtr buf = alloc<Buf>(
-      "const_" + v->debugName(),
+      "const_" + sanitizeName(v->debugName()),
       ExprHandleVectorToExprVector(te_sizes),
       ToDtype(static_cast<ScalarType>(*tt->scalarType())));
 

From 0f6b524665378b18b8682f473267c80c6d5ca3df Mon Sep 17 00:00:00 2001
From: Cheng Chang <xcc@fb.com>
Date: Thu, 26 Aug 2021 09:52:42 -0700
Subject: [PATCH 257/530] [NNC] Add C++ codegen backend to NNC (#62869)

Summary:
Adds a C++ codegen backend to NNC to generate C++ for CPU instead of generating LLVM IR.
Tensors are represented as blobs of float. Vector operations are devectorized/unrolled.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62869

Test Plan:
https://github.com/pytorch/pytorch/tree/mvz-nnc-aot-prototype makes it able to AOT compile the whole MobileNetV3 model into binary code through LLVM codegen in NNC.

I forked that branch to https://github.com/cheng-chang/pytorch/tree/cc-aot-cpp, merged this PR into it, and modified `fancy_compile` to compile MobileNetV3 into C++ through

```
import torch

m = torch.jit.load('mobnet.pt')
m.eval()
f = torch.jit.freeze(m)
torch._C._fancy_compile(f.graph, [1, 3, 224, 224])
```

The generated C++ file `mobnet.cc` can be found at https://gist.github.com/cheng-chang/e2830cc6920b39204ebf368035b2bcec.

I manually compiled the generated C++ through `g++ -o mobnet -std=c++14 -L./build/lib -ltorch_cpu -ltorch mobnet.cc`, and it succeeded.

Reviewed By: ZolotukhinM

Differential Revision: D30149482

Pulled By: cheng-chang

fbshipit-source-id: e77b189f0353e37cd309423a48a513e668d07675
---
 test/cpp/tensorexpr/test_cpp_codegen.cpp   | 275 ++++++++++++--
 torch/csrc/jit/tensorexpr/codegen.h        |   4 +
 torch/csrc/jit/tensorexpr/cpp_codegen.cpp  | 404 +++++++++++++++++++--
 torch/csrc/jit/tensorexpr/cpp_codegen.h    |  84 ++++-
 torch/csrc/jit/tensorexpr/cpp_intrinsics.h |  36 ++
 torch/csrc/jit/tensorexpr/expr.h           |   8 +-
 torch/csrc/jit/tensorexpr/ir.h             |   4 +-
 torch/csrc/jit/tensorexpr/ir_printer.cpp   |  51 +--
 torch/csrc/jit/tensorexpr/ir_printer.h     |   3 +
 9 files changed, 782 insertions(+), 87 deletions(-)
 create mode 100644 torch/csrc/jit/tensorexpr/cpp_intrinsics.h

diff --git a/test/cpp/tensorexpr/test_cpp_codegen.cpp b/test/cpp/tensorexpr/test_cpp_codegen.cpp
index df9166b675859..d40caa126e572 100644
--- a/test/cpp/tensorexpr/test_cpp_codegen.cpp
+++ b/test/cpp/tensorexpr/test_cpp_codegen.cpp
@@ -1,9 +1,11 @@
 #include <gtest/gtest.h>
 
-#include <test/cpp/tensorexpr/test_base.h>
+#include "test/cpp/tensorexpr/test_base.h"
 
 #include <torch/csrc/jit/tensorexpr/cpp_codegen.h>
+#include <torch/csrc/jit/tensorexpr/fwd_decls.h>
 #include <torch/csrc/jit/tensorexpr/stmt.h>
+#include <torch/csrc/jit/tensorexpr/tensor.h>
 #include <torch/csrc/jit/testing/file_check.h>
 
 namespace torch {
@@ -11,44 +13,245 @@ namespace jit {
 
 using namespace torch::jit::tensorexpr;
 
-TEST(CppPrinter, AllocateOnStackThenFree) {
-  std::vector<ExprPtr> dims = {alloc<IntImm>(2), alloc<IntImm>(3)};
-  BufPtr buf = alloc<Buf>("x", dims, kInt);
-  AllocatePtr alloc_ = alloc<Allocate>(buf);
-  FreePtr free_ = alloc<Free>(buf);
-  BlockPtr block = Block::make({alloc_, free_});
-
-  std::stringstream ss;
-  CppPrinter printer(&ss);
-  printer.visit(block);
-  const std::string expected = R"(
-    # CHECK: {
-    # CHECK:   int x[6];
-    # CHECK: }
+#define STR_CHECK(node, expected) \
+  std::stringstream ss;           \
+  CppPrinter printer(&ss);        \
+  printer.visit(node);            \
+  ASSERT_EQ(ss.str(), expected)
+
+#define FILE_CHECK(node, pattern) \
+  std::stringstream ss;           \
+  CppPrinter printer(&ss);        \
+  printer.visit(node);            \
+  torch::jit::testing::FileCheck().run(pattern, ss.str())
+
+TEST(CppPrinter, IntImm) {
+  auto i = alloc<IntImm>(10);
+  STR_CHECK(i, "10");
+}
+
+TEST(CppPrinter, FloatImm) {
+  auto f = alloc<FloatImm>(10);
+  STR_CHECK(f, "10.f");
+}
+
+TEST(CppPrinter, FloatImm1) {
+  auto f = alloc<FloatImm>(10);
+  STR_CHECK(f, "10.f");
+}
+
+TEST(CppPrinter, DoubleImm) {
+  auto d = alloc<DoubleImm>(10);
+  STR_CHECK(d, "10.0");
+}
+
+TEST(CppPrinter, DoubleImm1) {
+  auto d = alloc<DoubleImm>(10.1);
+  STR_CHECK(d, "10.1");
+}
+
+TEST(CppPrinter, HalfImm) {
+  auto h = alloc<HalfImm>(10);
+  STR_CHECK(h, "10");
+}
+
+TEST(CppPrinter, Add) {
+  auto add = alloc<Add>(alloc<IntImm>(1), alloc<IntImm>(2));
+  STR_CHECK(add, "1 + 2");
+}
+
+TEST(CppPrinter, AddExpr1) {
+  auto add = alloc<Add>(
+      alloc<Add>(alloc<IntImm>(0), alloc<IntImm>(1)),
+      alloc<Sub>(alloc<IntImm>(2), alloc<IntImm>(3)));
+  STR_CHECK(add, "(0 + 1) + (2 - 3)");
+}
+
+TEST(CppPrinter, AddExpr2) {
+  auto add = alloc<Add>(
+      alloc<Mul>(alloc<IntImm>(0), alloc<IntImm>(1)),
+      alloc<Sub>(alloc<IntImm>(2), alloc<IntImm>(3)));
+  STR_CHECK(add, "0 * 1 + (2 - 3)");
+}
+
+TEST(CppPrinter, AddExpr3) {
+  auto add = alloc<Add>(
+      alloc<Add>(alloc<IntImm>(0), alloc<IntImm>(1)),
+      alloc<Div>(alloc<IntImm>(2), alloc<IntImm>(3)));
+  STR_CHECK(add, "(0 + 1) + 2 / 3");
+}
+
+TEST(CppPrinter, Mod) {
+  auto mod = alloc<Mod>(alloc<IntImm>(1), alloc<IntImm>(2));
+  STR_CHECK(mod, "1 % 2");
+}
+
+TEST(CppPrinter, ModFloat) {
+  auto mod = alloc<Mod>(alloc<FloatImm>(1), alloc<FloatImm>(2));
+  STR_CHECK(mod, "std::fmod(1.f, 2.f)");
+}
+
+TEST(CppPrinter, Max) {
+  auto max = alloc<Max>(alloc<IntImm>(1), alloc<IntImm>(2), false);
+  STR_CHECK(max, "std::max(1, 2)");
+}
+
+TEST(CppPrinter, MaxFloat) {
+  auto max = alloc<Max>(alloc<FloatImm>(1), alloc<FloatImm>(2), false);
+  STR_CHECK(max, "std::max(1.f, 2.f)");
+}
+
+TEST(CppPrinter, MaxHalf) {
+  auto max = alloc<Max>(alloc<HalfImm>(1), alloc<HalfImm>(2), false);
+  STR_CHECK(max, "(1 < 2) ? 2 : 1");
+}
+
+TEST(CppPrinter, And) {
+  auto v = alloc<And>(alloc<IntImm>(1), alloc<IntImm>(2));
+  STR_CHECK(v, "1 & 2");
+}
+
+TEST(CppPrinter, CompareSelect) {
+  auto cs = alloc<CompareSelect>(
+      alloc<IntImm>(1),
+      alloc<IntImm>(2),
+      alloc<FloatImm>(1),
+      alloc<FloatImm>(2),
+      CompareSelectOperation::kLE);
+  STR_CHECK(cs, "((1 <= 2) ? 1.f : 2.f)");
+}
+
+TEST(CppPrinter, IfThenElse) {
+  auto cond = alloc<Add>(alloc<IntImm>(1), alloc<IntImm>(2));
+  auto true_value = alloc<Sub>(alloc<IntImm>(0), alloc<IntImm>(1));
+  auto false_value = alloc<Mul>(alloc<IntImm>(2), alloc<IntImm>(3));
+  auto v = alloc<IfThenElse>(cond, true_value, false_value);
+  STR_CHECK(v, "((1 + 2) ? 0 - 1 : 2 * 3)");
+}
+
+TEST(CppPrinter, AllocateFree) {
+  BufHandle buf("x", {2, 3}, kInt);
+  AllocatePtr alloc = Allocate::make(buf);
+  FreePtr free = Free::make(buf);
+  BlockPtr block = Block::make({alloc, free});
+
+  const std::string pattern = R"(
+   # CHECK: {
+   # CHECK:   int* x = static_cast<int*>(malloc(24));
+   # CHECK:   free(x);
+   # CHECK: }
+  )";
+  FILE_CHECK(block, pattern);
+}
+
+TEST(CppPrinter, LoadStore) {
+  Placeholder a(BufHandle("A", {2, 3}, kInt));
+  Placeholder b(BufHandle("B", {3, 4}, kInt));
+  auto store = b.store({2, 2}, a.load(1, 1));
+  STR_CHECK(
+      store, "B[(0 + 2 * (1 * 4)) + 2 * 1] = A[(0 + 1 * (1 * 3)) + 1 * 1];\n");
+}
+
+TEST(CppPrinter, Var) {
+  auto var = alloc<Var>("x", kInt);
+  STR_CHECK(var, "x");
+}
+
+TEST(CppPrinter, Cast) {
+  auto cast = alloc<Cast>(kFloat, alloc<IntImm>(1));
+  STR_CHECK(cast, "static_cast<float>(1)");
+}
+
+TEST(CppPrinter, BitCast) {
+  auto cast = alloc<BitCast>(kInt, alloc<FloatImm>(20));
+  STR_CHECK(cast, "std::bitcast<float, int>(20.f)");
+}
+
+TEST(CppPrinter, Let) {
+  auto var = alloc<Var>("x", kFloat);
+  auto val = alloc<FloatImm>(2);
+  auto let = alloc<Let>(var, val);
+  STR_CHECK(let, "float x = 2.f;\n");
+}
+
+TEST(CppPrinter, For) {
+  constexpr int N = 1024;
+  Placeholder a(BufHandle("A", {N}, kInt));
+  Placeholder b(BufHandle("B", {N}, kInt));
+  Placeholder c(BufHandle("C", {N}, kInt));
+  VarHandle i("i", kInt);
+  auto f = For::make(i, 0, N, c.store({i}, Add::make(a.load(i), b.load(i))));
+  const std::string pattern = R"(
+   # CHECK: for (int i = 0; i < 1024; i++) {
+   # CHECK:   C[i] = (A[i]) + (B[i]);
+   # CHECK: }
   )";
-  torch::jit::testing::FileCheck().run(expected, ss.str());
-}
-
-TEST(CppPrinter, AllocateOnHeapThenFree) {
-  std::vector<ExprPtr> dims = {
-      alloc<IntImm>(20), alloc<IntImm>(50), alloc<IntImm>(3)};
-  BufPtr buf = alloc<Buf>("y", dims, kLong);
-  AllocatePtr alloc_ = alloc<Allocate>(buf);
-  FreePtr free_ = alloc<Free>(buf);
-  BlockPtr block = Block::make({alloc_, free_});
-
-  std::stringstream ss;
-  CppPrinter printer(&ss);
-  printer.visit(block);
-  // size(long) = 8;
-  // dim0 * dim1 * dim2 * size(long) = 24000.
-  const std::string expected = R"(
-    # CHECK: {
-    # CHECK:   int64_t* y = static_cast<int64_t*>(malloc(24000));
-    # CHECK:   free(y);
+  FILE_CHECK(f, pattern);
+}
+
+TEST(CppPrinter, Cond) {
+  Placeholder x(BufHandle("X", {1}, kInt));
+  auto cmp = CompareSelect::make(x.load(0), 10, CompareSelectOperation::kLT);
+  auto cond =
+      Cond::make(cmp, x.store({0}, x.load(0) + 1), x.store({0}, x.load(0) - 1));
+  const std::string pattern = R"(
+    # CHECK: if (((X[0] < 10) ? 1 : 0)) {
+    # CHECK:   X[0] = (X[0]) + 1;
+    # CHECK: } else {
+    # CHECK:   X[0] = (X[0]) - 1;
     # CHECK: }
   )";
-  torch::jit::testing::FileCheck().run(expected, ss.str());
+  FILE_CHECK(cond, pattern);
+}
+
+TEST(CppPrinter, Intrinsics) {
+  const std::unordered_set<IntrinsicsOp, std::hash<int>> unsupported_ops{
+      kRand, kSigmoid};
+  for (int i = 0; i < kMaxIntrinsicsOp; i++) {
+    IntrinsicsOp op = static_cast<IntrinsicsOp>(i);
+    if (unsupported_ops.count(op)) {
+      continue;
+    }
+
+    if (Intrinsics::OpArgCount(op) == 1) {
+      auto v = alloc<Intrinsics>(op, alloc<FloatImm>(2.0f));
+      STR_CHECK(v, "std::" + v->func_name() + "(2.f)");
+    } else {
+      auto v =
+          alloc<Intrinsics>(op, alloc<FloatImm>(1.0f), alloc<FloatImm>(2.0f));
+      STR_CHECK(v, "std::" + v->func_name() + "(1.f, 2.f)");
+    }
+  }
+}
+
+TEST(CppPrinter, ExternalCall) {
+  std::vector<ExprPtr> dims{alloc<IntImm>(2), alloc<IntImm>(2)};
+  auto output = alloc<Buf>("out", dims, kFloat);
+  auto buf_arg1 = alloc<Buf>("a", dims, kFloat);
+  auto buf_arg2 = alloc<Buf>("b", dims, kFloat);
+  auto scalar_arg = alloc<Add>(alloc<IntImm>(1), alloc<IntImm>(2));
+  std::vector<BufPtr> buf_args{buf_arg1, buf_arg2};
+  std::vector<ExprPtr> scalar_args{scalar_arg};
+  auto call =
+      alloc<ExternalCall>(output, "nnc_aten_matmul", buf_args, scalar_args);
+  const std::string pattern = R"(
+   # CHECK: {
+   # CHECK:   void* buf_ptrs[]{out, a, b};
+   # CHECK:   int64_t buf_ranks[]{2, 2, 2};
+   # CHECK:   int64_t buf_dims[]{2, 2, 2, 2, 2, 2};
+   # CHECK:   int8_t buf_dtypes[]{6, 6, 6};
+   # CHECK:   int64_t extra_args[]{1 + 2};
+   # CHECK:   nnc_aten_matmul(
+   # CHECK:       3,
+   # CHECK:       buf_ptrs,
+   # CHECK:       buf_ranks,
+   # CHECK:       buf_dims,
+   # CHECK:       buf_dtypes,
+   # CHECK:       1,
+   # CHECK:       extra_args);
+   # CHECK: }
+  )";
+  FILE_CHECK(call, pattern);
 }
 
 } // namespace jit
diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h
index d7cfe783fab8f..29255aac07df2 100644
--- a/torch/csrc/jit/tensorexpr/codegen.h
+++ b/torch/csrc/jit/tensorexpr/codegen.h
@@ -46,6 +46,10 @@ class TORCH_API CodeGen {
     stmt_ = stmt_->accept_mutator(mutator);
   }
 
+  void apply_visitor(IRVisitor* visitor) {
+    stmt_->accept(visitor);
+  }
+
   std::vector<BufferArg>& buffer_args() {
     return buffer_args_;
   }
diff --git a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp
index 39a5615a97545..20795e43dd57b 100644
--- a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp
@@ -1,44 +1,406 @@
+#include <algorithm>
+#include <type_traits>
+#include <vector>
+
 #include <torch/csrc/jit/tensorexpr/cpp_codegen.h>
+#include <torch/csrc/jit/tensorexpr/cpp_intrinsics.h>
+#include <torch/csrc/jit/tensorexpr/external_functions_registry.h>
+#include <torch/csrc/jit/tensorexpr/types.h>
 
 namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-void CppPrinter::visit(AllocatePtr alloc) {
-  constexpr size_t kAllocOnStackThresholdSize = 512;
+// Rewrites the variables' name according to valid C++ naming convention.
+// E.g. in Graph IR, variable name may contain '.', in C++, they are replaced
+// with '_'.
+class CppVarNameRewriter : public IRVisitor {
+ public:
+  void visit(VarPtr v) override {
+    constexpr char kDot = '.';
+    constexpr char kUnderscore = '_';
+    if (v->name_hint().find(kDot) == std::string::npos) {
+      return;
+    }
+    std::string name = v->name_hint();
+    std::replace(name.begin(), name.end(), kDot, kUnderscore);
+    v->set_name_hint(std::move(name));
+  }
+
+  void visit(BufPtr v) override {
+    v->base_handle()->accept(this);
+  }
+};
+
+static std::string declareExternalFunction(const std::string& func_name) {
+  return "void " + func_name +
+      "("
+      "int64_t bufs_num, "
+      "void** buf_data, "
+      "int64_t* buf_ranks, "
+      "int64_t* buf_dims, "
+      "int8_t* buf_dtypes, "
+      "int64_t args_num, "
+      "int64_t* extra_args);";
+}
+
+CppPrinter::CppPrinter(std::ostream* os) : IRPrinter(*os), lane_(0) {}
+
+CppPrinter::~CppPrinter() = default;
+
+void CppPrinter::printPrologue() {
+  os() << "#include <cassert>" << std::endl;
+  os() << "#include <cmath>" << std::endl;
+  os() << "#include <algorithm>" << std::endl;
+  os() << "#include <type_traits>" << std::endl;
+  os() << std::endl;
+
+  os() << "#define POS_INFINITY INFINITY" << std::endl;
+  os() << "#define NEG_INFINITY -INFINITY" << std::endl;
+  os() << std::endl;
+
+  os() << cpp_intrinsics_definition << std::endl;
+  os() << std::endl;
+
+  os() << "namespace torch {" << std::endl;
+  os() << "namespace jit {" << std::endl;
+  os() << "namespace tensorexpr {" << std::endl;
+  for (auto const& it : getNNCFunctionRegistry()) {
+    os() << declareExternalFunction(it.first) << std::endl;
+  }
+  os() << "} // namespace tensorexpr" << std::endl;
+  os() << "} // namespace jit" << std::endl;
+  os() << "} // namespace torch" << std::endl;
+  os() << std::endl;
+
+  os() << "using namespace torch::jit::tensorexpr;" << std::endl;
+  os() << std::endl;
+}
+
+template <typename T>
+inline typename std::enable_if<!std::is_floating_point<T>::value, void>::type
+visit_mod(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) {
+  os << *lhs << " % " << *rhs;
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_floating_point<T>::value, void>::type
+visit_mod(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) {
+  os << "std::fmod(" << *lhs << ", " << *rhs << ")";
+}
+
+template <typename T>
+inline typename std::enable_if<
+    std::is_floating_point<T>::value || std::is_integral<T>::value,
+    void>::type
+visit_max(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) {
+  os << "std::max(" << *lhs << ", " << *rhs << ")";
+}
 
-  size_t size = 1;
-  for (auto dim : alloc->dims()) {
-    IntImmPtr v = to<IntImm>(dim);
-    if (v) {
-      size *= v->value();
+template <typename T>
+inline typename std::enable_if<
+    !std::is_floating_point<T>::value && !std::is_integral<T>::value,
+    void>::type
+visit_max(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) {
+  os << "(" << *lhs << " < " << *rhs << ") ? " << *rhs << " : " << *lhs;
+}
+
+template <typename T>
+inline typename std::enable_if<
+    std::is_floating_point<T>::value || std::is_integral<T>::value,
+    void>::type
+visit_min(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) {
+  os << "std::min(" << *lhs << ", " << *rhs << ")";
+}
+
+template <typename T>
+inline typename std::enable_if<
+    !std::is_floating_point<T>::value && !std::is_integral<T>::value,
+    void>::type
+visit_min(std::ostream& os, const ExprPtr lhs, const ExprPtr rhs) {
+  os << *lhs << " < " << *rhs << " ? " << *lhs << " : " << *rhs;
+}
+
+template <typename T>
+void visit_binary_op(
+    std::ostream& os,
+    const ExprPtr lhs,
+    const ExprPtr rhs,
+    IRNodeType op_type) {
+  switch (op_type) {
+    case IRNodeType::kMod:
+      visit_mod<T>(os, lhs, rhs);
+      break;
+    case IRNodeType::kMax:
+      visit_max<T>(os, lhs, rhs);
+      break;
+    case IRNodeType::kMin:
+      visit_min<T>(os, lhs, rhs);
+      break;
+    default:
+      throw std::runtime_error("invalid op type");
+  }
+}
+
+template <typename Op>
+void dispatch_binary_op(std::ostream& os, const BinaryOpNode<Op>* v) {
+  switch (v->lhs()->dtype().scalar_type()) {
+#define TYPE_CASE(Type, Name)                                      \
+  case ScalarType::Name:                                           \
+    visit_binary_op<Type>(os, v->lhs(), v->rhs(), v->expr_type()); \
+    break;
+    AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE);
+#undef TYPE_CASE
+    default:
+      throw unsupported_dtype();
+  }
+}
+
+void CppPrinter::visit(RampPtr v) {
+  visit(alloc<Add>(v->base(), alloc<Mul>(alloc<IntImm>(lane_), v->stride())));
+}
+
+void CppPrinter::visit(BroadcastPtr v) {
+  v->value()->accept(this);
+}
+
+void CppPrinter::visit(ModPtr v) {
+  dispatch_binary_op(os(), v.get());
+}
+
+void CppPrinter::visit(MaxPtr v) {
+  dispatch_binary_op(os(), v.get());
+}
+
+void CppPrinter::visit(MinPtr v) {
+  dispatch_binary_op(os(), v.get());
+}
+
+void CppPrinter::visit(CompareSelectPtr v) {
+  os() << "((" << *v->lhs() << " "
+       << IRPrinter::to_string(v->compare_select_op()) << " " << *v->rhs()
+       << ") ? " << *v->ret_val1() << " : " << *v->ret_val2() << ")";
+}
+
+void CppPrinter::visit(IfThenElsePtr v) {
+  os() << "((" << *v->condition() << ") ? " << *v->true_value() << " : "
+       << *v->false_value() << ")";
+}
+
+void CppPrinter::visit(AllocatePtr v) {
+  size_t size = v->dtype().byte_size();
+  for (const auto& dim : v->dims()) {
+    IntImmPtr d = to<IntImm>(dim);
+    if (d) {
+      size *= d->value();
     } else {
       throw std::runtime_error("Only IntImm dimensions are supported for now");
     }
   }
 
   emitIndent();
-  if (size <= kAllocOnStackThresholdSize) {
-    os() << alloc->dtype().ToCppString() << " " << (*alloc->buffer_var()) << "["
-         << size << "];" << std::endl;
-  } else {
-    size *= alloc->dtype().byte_size();
-    os() << alloc->dtype().ToCppString() << "* " << (*alloc->buffer_var())
-         << " = static_cast<" << alloc->dtype().ToCppString() << "*>(malloc("
-         << size << "));" << std::endl;
-    allocated_on_heap_.insert(alloc->buffer_var());
+  os() << v->dtype().ToCppString() << "* " << (*v->buffer_var())
+       << " = static_cast<" << v->dtype().ToCppString() << "*>(malloc(" << size
+       << "));" << std::endl;
+}
+
+void CppPrinter::visit(FreePtr v) {
+  emitIndent();
+  os() << "free(" << *v->buffer_var() << ");" << std::endl;
+}
+
+void CppPrinter::visit(LoadPtr v) {
+  auto flat_idx = flatten_index(v->buf()->dims(), v->indices());
+  os() << *v->base_handle() << "[" << *flat_idx << "]";
+}
+
+void CppPrinter::visit(StorePtr v) {
+  auto flat_idx = flatten_index(v->buf()->dims(), v->indices());
+  const int lanes = v->value()->dtype().lanes();
+  for (int lane = 0; lane < lanes; lane++) {
+    lane_ = lane;
+    emitIndent();
+    os() << *v->base_handle() << "[" << *flat_idx << "] = " << *v->value()
+         << ";" << std::endl;
+  }
+}
+
+void CppPrinter::visit(CastPtr v) {
+  os() << "static_cast<" << v->dtype().ToCppString() << ">(" << *v->src_value()
+       << ")";
+}
+
+void CppPrinter::visit(BitCastPtr v) {
+  os() << "std::bitcast<" << v->src_value()->dtype().ToCppString() << ", "
+       << v->dtype().ToCppString() << ">(" << *v->src_value() << ")";
+}
+
+void CppPrinter::visit(IntrinsicsPtr v) {
+  if (v->op_type() == kRand || v->op_type() == kSigmoid) {
+    throw std::runtime_error("kRand and kSigmoid are not supported");
+  }
+
+  os() << "std::" << v->func_name() << "(";
+  for (int i = 0; i < v->nparams(); i++) {
+    if (i > 0) {
+      os() << ", ";
+    }
+    os() << *v->param(i);
   }
+  os() << ")";
 }
 
-void CppPrinter::visit(FreePtr free) {
-  VarPtr var = free->buffer_var();
-  if (allocated_on_heap_.count(var)) {
+void CppPrinter::visit(ExternalCallPtr v) {
+  // The generated code needs to link against functions defined
+  // in external_functions.cpp.
+
+  auto& func_registry = getNNCFunctionRegistry();
+  if (!func_registry.count(v->func_name())) {
+    throw unimplemented_lowering(v);
+  }
+
+  std::vector<BufPtr> bufs(v->buf_args());
+  bufs.insert(bufs.begin(), v->buf());
+  auto for_buf = [&](const std::function<void(const BufPtr)>& print_buf) {
+    for (size_t i = 0; i < bufs.size(); i++) {
+      if (i > 0) {
+        os() << ", ";
+      }
+      print_buf(bufs[i]);
+    }
+  };
+
+  emitIndent();
+  os() << "{" << std::endl;
+  indent_++;
+
+  emitIndent();
+  os() << "void* buf_ptrs[]{";
+  for_buf([&](const BufPtr b) { os() << *b->base_handle(); });
+  os() << "};" << std::endl;
+
+  emitIndent();
+  os() << "int64_t buf_ranks[]{";
+  for_buf([&](const BufPtr b) { os() << b->ndim(); });
+  os() << "};" << std::endl;
+
+  emitIndent();
+  os() << "int64_t buf_dims[]{";
+  for_buf([&](const BufPtr buf) {
+    for (size_t i = 0; i < buf->ndim(); i++) {
+      if (i > 0) {
+        os() << ", ";
+      }
+      os() << *buf->dim(i);
+    }
+  });
+  os() << "};" << std::endl;
+
+  emitIndent();
+  os() << "int8_t buf_dtypes[]{";
+  for_buf([&](const BufPtr buf) {
+    os() << static_cast<int>(buf->dtype().scalar_type());
+  });
+  os() << "};" << std::endl;
+
+  emitIndent();
+  os() << "int64_t extra_args[]{";
+  for (size_t i = 0; i < v->args().size(); i++) {
+    if (i > 0) {
+      os() << ", ";
+    }
+    os() << *v->args()[i];
+  }
+  os() << "};" << std::endl;
+
+  emitIndent();
+  os() << v->func_name() << "(" << std::endl;
+  emitIndent();
+  os() << "    " << bufs.size() << "," << std::endl;
+  emitIndent();
+  os() << "    buf_ptrs," << std::endl;
+  emitIndent();
+  os() << "    buf_ranks," << std::endl;
+  emitIndent();
+  os() << "    buf_dims," << std::endl;
+  emitIndent();
+  os() << "    buf_dtypes," << std::endl;
+  emitIndent();
+  os() << "    " << v->args().size() << "," << std::endl;
+  emitIndent();
+  os() << "    extra_args);" << std::endl;
+
+  indent_--;
+  emitIndent();
+  os() << "}" << std::endl;
+}
+
+void CppPrinter::visit(LetPtr v) {
+  if (v->dtype().lanes() == 1) {
     emitIndent();
-    os() << "free(" << name_manager()->get_unique_name(var) << ");"
-         << std::endl;
+    os() << v->dtype().ToCppString() << " " << *v->var() << " = " << *v->value()
+         << ";" << std::endl;
+  } else {
+    vector_vars_[v->var()] = v->value();
+  }
+}
+
+void CppPrinter::visit(VarPtr v) {
+  if (v->dtype().lanes() == 1) {
+    os() << name_manager()->get_unique_name(v);
+  } else {
+    os() << *vector_vars_.at(v);
   }
 }
 
+CppCodeGen::CppCodeGen(
+    StmtPtr stmt,
+    const std::vector<BufferArg>& buffer_args,
+    at::Device device,
+    const std::string& kernel_func_name)
+    : CodeGen(stmt, buffer_args, device, kernel_func_name) {
+  init();
+}
+
+void CppCodeGen::init() {
+  printer_ = std::make_unique<CppPrinter>(&oss_);
+  var_name_rewriter_ = std::make_unique<CppVarNameRewriter>();
+
+  apply_visitor(var_name_rewriter_.get());
+
+  printer_->printPrologue();
+  os() << "void " << kernel_func_name() << "(";
+  const std::vector<BufferArg> buffer_args = this->buffer_args();
+  for (size_t i = 0; i < buffer_args.size(); i++) {
+    if (i > 0) {
+      os() << ", ";
+    }
+    const BufferArg& buffer_arg = buffer_args[i];
+    const VarPtr var = buffer_arg.var();
+    Dtype dtype = buffer_arg.dtype();
+    os() << dtype.ToCppString() << (buffer_arg.isVar() ? " " : "* ") << *var;
+  }
+  os() << ")";
+  stmt()->accept(printer_.get());
+  os() << std::endl;
+}
+
+CppCodeGen::~CppCodeGen() = default;
+
+void CppCodeGen::call(const std::vector<CallArg>& args) {
+  // TODO: compile the generated C++ kernel into a library,
+  // and call the library here.
+  os() << "int main() {}" << std::endl;
+}
+
+void CppCodeGen::call_raw(const std::vector<void*>& args) {
+  // TODO: compile the generated C++ kernel into a library,
+  // and call the library here.
+  os() << "int main() {}" << std::endl;
+}
+
+RegisterCodeGen<CppCodeGen> cpp_codegen_reg("cpp_codegen");
+
 } // namespace tensorexpr
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/tensorexpr/cpp_codegen.h b/torch/csrc/jit/tensorexpr/cpp_codegen.h
index 1cf15658716e6..a6d583ed4efb7 100644
--- a/torch/csrc/jit/tensorexpr/cpp_codegen.h
+++ b/torch/csrc/jit/tensorexpr/cpp_codegen.h
@@ -1,24 +1,100 @@
 #pragma once
 
+#include <torch/csrc/jit/tensorexpr/codegen.h>
 #include <torch/csrc/jit/tensorexpr/ir_printer.h>
 
-#include <unordered_set>
-
 namespace torch {
 namespace jit {
 namespace tensorexpr {
 
+class CppVarNameRewriter;
+
 // Generates C++ code from the IR.
+//
+// Vector operations are unrolled.
+// For example:
+// C[Ramp(0, 1, 3)] = A[Ramp(0, 2, 3)] + B[Ramp(0, 3, 3)];
+// is unrolled into:
+// C[0] = A[0] + B[0];
+// C[1] = A[2] + B[3];
+// C[2] = A[4] + B[6];
 class TORCH_API CppPrinter : public IRPrinter {
  public:
-  explicit CppPrinter(std::ostream* os) : IRPrinter(*os) {}
+  explicit CppPrinter(std::ostream* os);
+  ~CppPrinter() override;
+
+  void printPrologue();
 
   using IRPrinter::visit;
+
+  // Binary expressions.
+  void visit(ModPtr) override;
+  void visit(MaxPtr) override;
+  void visit(MinPtr) override;
+
+  // Conditional expressions.
+  void visit(CompareSelectPtr) override;
+  void visit(IfThenElsePtr) override;
+
+  // Tensor operations.
   void visit(AllocatePtr) override;
   void visit(FreePtr) override;
+  void visit(LoadPtr) override;
+  void visit(StorePtr) override;
+
+  // Casts.
+  void visit(CastPtr) override;
+  void visit(BitCastPtr) override;
+
+  // Calls.
+  void visit(IntrinsicsPtr) override;
+  void visit(ExternalCallPtr) override;
+
+  // Vars.
+  void visit(LetPtr) override;
+  void visit(VarPtr) override;
+
+  // Vector data types.
+  void visit(RampPtr) override;
+  void visit(BroadcastPtr) override;
 
  private:
-  std::unordered_set<VarPtr> allocated_on_heap_;
+  int lane_;
+  std::unordered_map<VarPtr, ExprPtr> vector_vars_;
+};
+
+class TORCH_API CppCodeGen : public CodeGen {
+ public:
+  CppCodeGen(
+      StmtPtr stmt,
+      const std::vector<BufferArg>& buffer_args,
+      at::Device device = at::kCPU,
+      const std::string& kernel_func_name = "func");
+
+  ~CppCodeGen() override;
+
+  void call(const std::vector<CallArg>& args) override;
+  void call_raw(const std::vector<void*>& args) override;
+
+  template <typename... Ts>
+  void operator()(const Ts&... ts) {
+    call(std::vector<CallArg>({CallArg(ts)...}));
+  }
+
+  std::string getCodeText(const std::string& attr = "") override {
+    return oss_.str();
+  }
+
+ private:
+  void init();
+
+  std::ostream& os() {
+    return printer_->os();
+  }
+
+  std::ostringstream oss_;
+  std::unique_ptr<CppPrinter> printer_;
+  std::unique_ptr<CppVarNameRewriter> var_name_rewriter_;
 };
 
 } // namespace tensorexpr
diff --git a/torch/csrc/jit/tensorexpr/cpp_intrinsics.h b/torch/csrc/jit/tensorexpr/cpp_intrinsics.h
new file mode 100644
index 0000000000000..caeeed693ff38
--- /dev/null
+++ b/torch/csrc/jit/tensorexpr/cpp_intrinsics.h
@@ -0,0 +1,36 @@
+#pragma once
+
+namespace torch {
+namespace jit {
+namespace tensorexpr {
+
+constexpr auto cpp_intrinsics_definition = R"(
+namespace std {
+
+template <typename T,
+          typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
+T rsqrt(T v) {
+  return 1.0f / std::sqrt(v);
+}
+
+template <typename T,
+          typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
+T frac(T v) {
+  T intpart;
+  return std::modf(v, &intpart);
+}
+
+template <typename From, typename To>
+To bitcast(const From& v) {
+  assert(sizeof(To) == sizeof(From));
+  To res;
+  std::memcpy(&res, &v, sizeof(From));
+  return res;
+}
+
+} // namespace std
+)";
+
+} // namespace tensorexpr
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h
index 108236e2e17f8..a4f317f48e666 100644
--- a/torch/csrc/jit/tensorexpr/expr.h
+++ b/torch/csrc/jit/tensorexpr/expr.h
@@ -169,8 +169,12 @@ class TORCH_API Var : public ExprNode<Var> {
     return name_hint_;
   }
 
-  void set_name_hint(const std::string& name_hint) {
-    name_hint_ = name_hint;
+  void set_name_hint(const std::string& name) {
+    name_hint_ = name;
+  }
+
+  void set_name_hint(std::string&& name) {
+    name_hint_ = name;
   }
 
   Var(std::string name_hint, Dtype dtype)
diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h
index f9fc7dcfc4246..7fe1fd1a07abb 100644
--- a/torch/csrc/jit/tensorexpr/ir.h
+++ b/torch/csrc/jit/tensorexpr/ir.h
@@ -684,6 +684,7 @@ enum IntrinsicsOp {
   kFrac,
   kIsNan,
   kRand, // We need more discussions on this. Should we consider stateful?
+  kMaxIntrinsicsOp,
 };
 
 class TORCH_API Intrinsics : public ExprNode<Intrinsics> {
@@ -864,8 +865,9 @@ class TORCH_API Intrinsics : public ExprNode<Intrinsics> {
     params_ = std::move(params);
   }
 
- private:
   static int OpArgCount(IntrinsicsOp op_type);
+
+ private:
   static Dtype IntrinsicsDtype(IntrinsicsOp op_type, Dtype dt1);
   static Dtype IntrinsicsDtype(IntrinsicsOp op_type, Dtype dt1, Dtype dt2);
   static Dtype IntrinsicsDtype(
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp
index 27b56e2f58146..2e1fc6e6952a7 100644
--- a/torch/csrc/jit/tensorexpr/ir_printer.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp
@@ -25,6 +25,24 @@ void IRPrinter::print(Expr& expr) {
 void IRPrinter::print(Stmt& stmt) {
   stmt.accept(this);
 }
+std::string IRPrinter::to_string(CompareSelectOperation op) {
+  switch (op) {
+    case CompareSelectOperation::kEQ:
+      return "==";
+    case CompareSelectOperation::kNE:
+      return "!=";
+    case CompareSelectOperation::kGT:
+      return ">";
+    case CompareSelectOperation::kGE:
+      return ">=";
+    case CompareSelectOperation::kLT:
+      return "<";
+    case CompareSelectOperation::kLE:
+      return "<=";
+    default:
+      throw std::runtime_error("invalid compare select operator");
+  }
+}
 
 // TODO: change whether to include the parenthesis to the parent expression,
 // we need to look at the operator precedence to make the output simpler.
@@ -137,28 +155,8 @@ void IRPrinter::visit(CompareSelectPtr v) {
   if (lhs_prec >= self_prec) {
     os() << ")";
   }
-  switch (cmp_op) {
-    case CompareSelectOperation::kEQ:
-      os() << "==";
-      break;
-    case CompareSelectOperation::kNE:
-      os() << "!=";
-      break;
-    case CompareSelectOperation::kGT:
-      os() << ">";
-      break;
-    case CompareSelectOperation::kGE:
-      os() << ">=";
-      break;
-    case CompareSelectOperation::kLT:
-      os() << "<";
-      break;
-    case CompareSelectOperation::kLE:
-      os() << "<=";
-      break;
-    default:
-      throw std::runtime_error("invalid compare select operator");
-  }
+
+  os() << to_string(cmp_op);
 
   if (rhs_prec >= self_prec) {
     os() << "(";
@@ -230,6 +228,13 @@ void IRPrinter::visit(CastPtr v) {
   os() << ")";
 }
 
+void IRPrinter::visit(BitCastPtr v) {
+  auto dtype = v->dtype();
+  os() << "BitCast<" << dtype.ToCppString() << ">(";
+  v->src_value()->accept(this);
+  os() << ")";
+}
+
 void IRPrinter::visit(VarPtr v) {
   os() << name_manager_.get_unique_name(v);
 }
@@ -439,7 +444,7 @@ void IRPrinter::visit(FreePtr v) {
 void IRPrinter::visit(LetPtr v) {
   os() << dtypeToCppString(v->dtype()) << " " << *v->var();
   os() << " = " << *v->value();
-  os() << ";";
+  os() << ";" << std::endl;
 }
 
 void IRPrinter::visit(CondPtr v) {
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h
index 321d1efe55457..327119dcc74e6 100644
--- a/torch/csrc/jit/tensorexpr/ir_printer.h
+++ b/torch/csrc/jit/tensorexpr/ir_printer.h
@@ -37,6 +37,7 @@ class TORCH_API IRPrinter : public IRVisitor {
   AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT);
 #undef IMM_PRINT_VISIT
   void visit(CastPtr v) override;
+  void visit(BitCastPtr v) override;
   void visit(VarPtr v) override;
   void visit(RampPtr v) override;
   void visit(LoadPtr v) override;
@@ -83,6 +84,8 @@ class TORCH_API IRPrinter : public IRVisitor {
   };
 
  protected:
+  std::string to_string(CompareSelectOperation op);
+
   UniqueNameManager* name_manager() {
     return &name_manager_;
   }

From ad47fb88584ec4cc9ed6e5f01e1256d38020a1a3 Mon Sep 17 00:00:00 2001
From: Erjia Guan <erjia@fb.com>
Date: Thu, 26 Aug 2021 10:21:48 -0700
Subject: [PATCH 258/530] Rename IterableAsDataPipe to IterableWrapper (#63981)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63981

Rename `IterableAsDataPipe` to `IterableWrapper` based on our naming convention `Op-er`

Test Plan: Imported from OSS

Reviewed By: VitalyFedyunin

Differential Revision: D30554197

Pulled By: ejguan

fbshipit-source-id: c2eacb20df5645d83ca165d6a1591f7e4791990f
---
 test/test_dataloader.py                     | 4 ++--
 torch/utils/data/dataloader_experimental.py | 4 ++--
 torch/utils/data/datapipes/iter/__init__.py | 4 ++--
 torch/utils/data/datapipes/iter/utils.py    | 5 ++++-
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 01136b9e4bb07..65554632fd30f 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -26,7 +26,7 @@
 )
 from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL
 from torch.utils.data.dataset import random_split
-from torch.utils.data.datapipes.iter import IterableAsDataPipe
+from torch.utils.data.datapipes.iter import IterableWrapper
 from torch._utils import ExceptionWrapper
 from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS,
                                                   IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest,
@@ -1963,7 +1963,7 @@ def test_excessive_thread_creation_warning(self):
 class TestDataLoader2(TestCase):
     @skipIfNoDill
     def test_basics(self):
-        dp = IterableAsDataPipe(list(range(10)))
+        dp = IterableWrapper(list(range(10)))
         dl = DataLoader(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2)
         dl2 = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2)
         self.assertEquals(list(dl), list(dl2))
diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py
index 85028afd22124..ea085298bf00f 100644
--- a/torch/utils/data/dataloader_experimental.py
+++ b/torch/utils/data/dataloader_experimental.py
@@ -3,7 +3,7 @@
 
 import torch.utils.data.backward_compatibility
 from torch.utils.data import DataLoader, IterDataPipe
-from torch.utils.data.datapipes.iter import IterableAsDataPipe
+from torch.utils.data.datapipes.iter import IterableWrapper
 
 class DataLoader2:
     def __new__(cls,
@@ -69,7 +69,7 @@ def sharding_worker_init_fn(worker_init_fn, worker_id):
             else:
                 if collate_fn is None:
                     collate_fn = torch.utils.data._utils.collate.default_collate
-                datapipe = IterableAsDataPipe(data_loader).batch(
+                datapipe = IterableWrapper(data_loader).batch(
                     batch_size, drop_last=drop_last).map(collate_fn)
                 return datapipe
 
diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index 5af2ab661da40..f302fd3a2b7ea 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -43,7 +43,7 @@
     ZipArchiveReaderIterDataPipe as ZipArchiveReader,
 )
 from torch.utils.data.datapipes.iter.utils import (
-    IterableAsDataPipeIterDataPipe as IterableAsDataPipe,
+    IterableWrapperIterDataPipe as IterableWrapper,
 )
 
 __all__ = ['Batcher',
@@ -55,7 +55,7 @@
            'FileLoader',
            'Filter',
            'HttpReader',
-           'IterableAsDataPipe',
+           'IterableWrapper',
            'LineReader',
            'Mapper',
            'RoutedDecoder',
diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py
index ea241d9f2716c..ee04abc455fba 100644
--- a/torch/utils/data/datapipes/iter/utils.py
+++ b/torch/utils/data/datapipes/iter/utils.py
@@ -1,10 +1,13 @@
 from torch.utils.data import IterDataPipe
 
 
-class IterableAsDataPipeIterDataPipe(IterDataPipe):
+class IterableWrapperIterDataPipe(IterDataPipe):
     def __init__(self, iterable):
         self.iterable = iterable
 
     def __iter__(self):
         for data in self.iterable:
             yield data
+
+    def __len__(self):
+        return len(self.iterable)

From 48c57b9b2ef5a65dac5ef9ba2a15f742bb7d06e5 Mon Sep 17 00:00:00 2001
From: Luca Wehrstedt <lcw@fb.com>
Date: Thu, 26 Aug 2021 12:08:00 -0700
Subject: [PATCH 259/530] Leverage TensorPipe's automatic SHM address selection
 (#63028)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63028

TensorPipe until now required PyTorch to come up and provide a unique identifier to use as address for the UNIX domain socket used in the SHM transport. However the Linux kernel can automatically assign an available address (like it does with IP ports), and TensorPipe now supports it, so we can remove that useless PyTorch logic.

Test Plan: CI

Reviewed By: mrshenli

Differential Revision: D30220352

fbshipit-source-id: 78e8a6ef5916b2a72df26cdc9cd367b9d083e821
---
 .../csrc/distributed/rpc/tensorpipe_agent.cpp | 22 ++-----------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
index df42248639f94..8e7ad18c575f8 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -16,12 +16,6 @@
 #include <c10/core/StreamGuard.h>
 #include <c10/util/irange.h>
 
-#if TENSORPIPE_HAS_SHM_TRANSPORT
-// Needed for ::getpid(), which is used to create a unique address.
-#include <sys/types.h>
-#include <unistd.h>
-#endif
-
 namespace torch {
 namespace distributed {
 namespace rpc {
@@ -209,22 +203,10 @@ C10_REGISTER_CREATOR(TensorPipeTransportRegistry, uv, makeUvTransport);
 
 #if TENSORPIPE_HAS_SHM_TRANSPORT
 
-std::string createUniqueShmAddr() {
-  thread_local uint32_t threadLocalId = 0;
-  return c10::str(
-      "shm://tensorpipe_rpc_agent_",
-      std::this_thread::get_id(),
-      "_",
-      ::getpid(),
-      "_",
-      threadLocalId++);
-}
-
 std::unique_ptr<TransportRegistration> makeShmTransport() {
   auto context = tensorpipe::transport::shm::create();
-  std::string address = createUniqueShmAddr();
-  return std::make_unique<TransportRegistration>(TransportRegistration{
-      std::move(context), kShmTransportPriority, std::move(address)});
+  return std::make_unique<TransportRegistration>(
+      TransportRegistration{std::move(context), kShmTransportPriority, ""});
 }
 
 // The SHM implements connections using ringbuffers residing in anonymous shared

From c5cc185b6d556d7d91fc0b038a7a74529b66b737 Mon Sep 17 00:00:00 2001
From: Ansley Ussery <ansley@fb.com>
Date: Thu, 26 Aug 2021 12:14:32 -0700
Subject: [PATCH 260/530] Allow uncompiled strings as input to
 `checkScriptRaisesRegex` (#63901)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63901

cc gmagogsfm

Test Plan: Imported from OSS

Reviewed By: gmagogsfm

Differential Revision: D30579472

Pulled By: ansley

fbshipit-source-id: 59ee09c1f25278d4f6e51f626588251bd095c6ea
---
 test/jit/test_jit_utils.py           | 15 ++++++++
 torch/testing/_internal/jit_utils.py | 56 ++++++++++++++++++----------
 2 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/test/jit/test_jit_utils.py b/test/jit/test_jit_utils.py
index 11d974bfe64c4..b344f82e96ced 100644
--- a/test/jit/test_jit_utils.py
+++ b/test/jit/test_jit_utils.py
@@ -77,3 +77,18 @@ def fn_hybrid_args(x, /, y, *args, **kwargs):
         self.assertEqual(
             [],
             torch._jit_internal.get_callable_argument_names(fn_hybrid_args))
+
+    def test_checkscriptassertraisesregex(self):
+        def fn():
+            tup = (1, 2)
+            return tup[2]
+
+        self.checkScriptRaisesRegex(fn, (), Exception, "range", name="fn")
+
+        s = dedent("""
+        def fn():
+            tup = (1, 2)
+            return tup[2]
+        """)
+
+        self.checkScriptRaisesRegex(s, (), Exception, "range", name="fn")
diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py
index 50d8dac23867b..4c521a8e4d9d5 100644
--- a/torch/testing/_internal/jit_utils.py
+++ b/torch/testing/_internal/jit_utils.py
@@ -375,35 +375,53 @@ def assertRaisesRegexWithHighlight(self, exception, regex, highlight):
         return _AssertRaisesRegexWithHighlightContext(self, exception, regex, highlight)
 
     def checkScriptRaisesRegex(self, script, inputs, exception, regex,
-                               outputs=None, capture_output=False, profiling=ProfilingMode.PROFILING):
+                               name=None, outputs=None, capture_output=False,
+                               frames_up=1, profiling=ProfilingMode.PROFILING):
         """
         Checks that a given function will throw the correct exception,
-        when executed with normal python, the string frontend, and the AST frontend
+        when executed with normal python, the string frontend, and the
+        AST frontend. Logic taken from `checkScript` (see comments there
+        for details)
         """
-
         with enable_profiling_mode_for_profiling_tests():
-            # normal python
+            # Normal Python
             with self.assertRaisesRegex(exception, regex):
-                script(*inputs)
-            # string frontend
+                if isinstance(script, str):
+                    frame = self.get_frame_vars(frames_up)
+                    the_locals: Dict[str, Any] = {}
+                    execWrapper(script, glob=frame, loc=the_locals)
+                    frame.update(the_locals)
+
+                    python_fn = frame[name]
+                else:
+                    python_fn = script
+
+                python_fn(*inputs)
+
+            # String frontend
             with self.assertRaisesRegex(exception, regex):
-                source = textwrap.dedent(inspect.getsource(script))
-                cu = torch.jit.CompilationUnit(source)
-                ge = getattr(cu, script.__name__)
-                # profiling run
+                if isinstance(script, str):
+                    cu = torch.jit.CompilationUnit(script, _frames_up=frames_up)
+                    string_frontend = getattr(cu, name)
+                else:
+                    source = textwrap.dedent(inspect.getsource(script))
+                    cu = torch.jit.CompilationUnit(source, _frames_up=frames_up)
+                    string_frontend = getattr(cu, script.__name__)
+
                 with self.assertRaisesRegex(exception, regex):
-                    ge(*inputs)
+                    string_frontend(*inputs)
                 # optimized run
-                ge(*inputs)
-            # python AST frontend
-            with self.assertRaisesRegex(exception, regex):
-                ge = torch.jit.script(script)
-                # profiling run
+                string_frontend(*inputs)
+
+            # Python AST frontend
+            if not isinstance(script, str):
                 with self.assertRaisesRegex(exception, regex):
+                    ge = torch.jit.script(python_fn)
+                    # profiling run
+                    with self.assertRaisesRegex(exception, regex):
+                        ge(*inputs)
+                    # optimized run
                     ge(*inputs)
-                # optimized run
-                ge(*inputs)
-
 
     def checkBailouts(self, model, inputs, expected):
         state = model.get_debug_state()

From 95d0b3199b2e0eb0516e439c8aa1a94b62113e1e Mon Sep 17 00:00:00 2001
From: Meghan Lele <meghanl@fb.com>
Date: Thu, 26 Aug 2021 12:48:01 -0700
Subject: [PATCH 261/530] Back out "[ONNX] Fix an issue that optimizations
 might adjust graph inputs unexpectedly. (#61280)" (#64004)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64004

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63904

Fixes T98808160

Test Plan: T98808160

Reviewed By: msaroufim

Differential Revision: D30527450

fbshipit-source-id: 6262901a78ca929cecda1cf740893139aa26f1b4
---
 .../expect/TestOperators.test_prelu.expect    | 28 +++++-----
 ...ors.test_retain_param_name_disabled.expect | 52 +++++--------------
 torch/_C/__init__.pyi.in                      |  2 +-
 torch/csrc/jit/passes/onnx/eval_peephole.cpp  | 21 ++------
 torch/csrc/jit/passes/onnx/eval_peephole.h    |  3 +-
 torch/csrc/jit/python/init.cpp                |  5 +-
 torch/onnx/__init__.py                        | 22 ++------
 torch/onnx/utils.py                           | 19 +++----
 8 files changed, 45 insertions(+), 107 deletions(-)

diff --git a/test/onnx/expect/TestOperators.test_prelu.expect b/test/onnx/expect/TestOperators.test_prelu.expect
index be0328e5c61b7..e19623cfd4460 100644
--- a/test/onnx/expect/TestOperators.test_prelu.expect
+++ b/test/onnx/expect/TestOperators.test_prelu.expect
@@ -2,30 +2,20 @@ ir_version: 6
 producer_name: "pytorch"
 producer_version: "CURRENT_VERSION"
 graph {
-  node {
-    input: "weight"
-    output: "2"
-    name: "Unsqueeze_0"
-    op_type: "Unsqueeze"
-    attribute {
-      name: "axes"
-      ints: 1
-      ints: 2
-      type: INTS
-    }
-  }
   node {
     input: "input"
-    input: "2"
+    input: "4"
     output: "3"
-    name: "PRelu_1"
+    name: "PRelu_0"
     op_type: "PRelu"
   }
   name: "torch-jit-export"
   initializer {
     dims: 2
+    dims: 1
+    dims: 1
     data_type: 1
-    name: "weight"
+    name: "4"
     raw_data: "\000\000\200>\000\000\200>"
   }
   input {
@@ -51,7 +41,7 @@ graph {
     }
   }
   input {
-    name: "weight"
+    name: "4"
     type {
       tensor_type {
         elem_type: 1
@@ -59,6 +49,12 @@ graph {
           dim {
             dim_value: 2
           }
+          dim {
+            dim_value: 1
+          }
+          dim {
+            dim_value: 1
+          }
         }
       }
     }
diff --git a/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect b/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect
index aa9499e27ac49..5eeaa875feb0c 100644
--- a/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect
+++ b/test/onnx/expect/TestOperators.test_retain_param_name_disabled.expect
@@ -2,57 +2,33 @@ ir_version: 6
 producer_name: "pytorch"
 producer_version: "CURRENT_VERSION"
 graph {
-  node {
-    input: "1"
-    output: "3"
-    name: "Transpose_0"
-    op_type: "Transpose"
-    attribute {
-      name: "perm"
-      ints: 1
-      ints: 0
-      type: INTS
-    }
-  }
   node {
     input: "input.1"
-    input: "3"
+    input: "7"
     output: "4"
-    name: "MatMul_1"
+    name: "MatMul_0"
     op_type: "MatMul"
   }
-  node {
-    input: "2"
-    output: "5"
-    name: "Transpose_2"
-    op_type: "Transpose"
-    attribute {
-      name: "perm"
-      ints: 1
-      ints: 0
-      type: INTS
-    }
-  }
   node {
     input: "4"
-    input: "5"
+    input: "8"
     output: "6"
-    name: "MatMul_3"
+    name: "MatMul_1"
     op_type: "MatMul"
   }
   name: "torch-jit-export"
   initializer {
-    dims: 5
     dims: 4
+    dims: 5
     data_type: 1
-    name: "1"
+    name: "7"
     raw_data: "\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@\000\000\000@"
   }
   initializer {
-    dims: 6
     dims: 5
+    dims: 6
     data_type: 1
-    name: "2"
+    name: "8"
     raw_data: "\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@\000\000@@"
   }
   input {
@@ -72,32 +48,32 @@ graph {
     }
   }
   input {
-    name: "1"
+    name: "7"
     type {
       tensor_type {
         elem_type: 1
         shape {
           dim {
-            dim_value: 5
+            dim_value: 4
           }
           dim {
-            dim_value: 4
+            dim_value: 5
           }
         }
       }
     }
   }
   input {
-    name: "2"
+    name: "8"
     type {
       tensor_type {
         elem_type: 1
         shape {
           dim {
-            dim_value: 6
+            dim_value: 5
           }
           dim {
-            dim_value: 5
+            dim_value: 6
           }
         }
       }
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 0b6bb6b64e0a4..3629150d15090 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -326,7 +326,7 @@ def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ...
 def _jit_pass_onnx_fold_if(graph: Graph) -> None: ...
 def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ...
 def _jit_pass_inline_fork_wait(graph: Graph) -> None: ...
-def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue], isAllowedToAdjustGraphInputs: _bool) -> Dict[str, IValue]: ...
+def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ...
 def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ...
 def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ...
 def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ...
diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.cpp b/torch/csrc/jit/passes/onnx/eval_peephole.cpp
index 4bad9367af444..05afb69ef0f23 100644
--- a/torch/csrc/jit/passes/onnx/eval_peephole.cpp
+++ b/torch/csrc/jit/passes/onnx/eval_peephole.cpp
@@ -141,27 +141,14 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) {
   }
 }
 
-void EvalPeepholeONNX(
-    Block* b,
-    ParamMap& paramsDict,
-    bool isAllowedToAdjustGraphInputs) {
+void EvalPeepholeONNX(Block* b, ParamMap& paramsDict) {
   auto valsToParamsMap = buildValueToParamsMap(b, paramsDict);
-
-  // Optimizations like fusing Conv and BatchNorm ops may adjust the graph
-  // inputs. If the graph inputs are not allowed to be adjusted, for example
-  // export_params is False, such optimizations will be skipped.
-  if (isAllowedToAdjustGraphInputs) {
-    fuseConvBatchNorm(b, valsToParamsMap);
-  }
-
+  fuseConvBatchNorm(b, valsToParamsMap);
   buildParamsMapFromValueToParamsMap(valsToParamsMap, paramsDict);
 }
 
-void EvalPeepholeONNX(
-    std::shared_ptr<Graph>& g,
-    ParamMap& paramsDict,
-    bool isAllowedToAdjustGraphInputs) {
-  EvalPeepholeONNX(g->block(), paramsDict, isAllowedToAdjustGraphInputs);
+void EvalPeepholeONNX(std::shared_ptr<Graph>& g, ParamMap& paramsDict) {
+  EvalPeepholeONNX(g->block(), paramsDict);
   GRAPH_DUMP("After EvalPeepholeONNX:", g);
 }
 
diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.h b/torch/csrc/jit/passes/onnx/eval_peephole.h
index d953f2c2e5bda..6f8961d08fd5e 100644
--- a/torch/csrc/jit/passes/onnx/eval_peephole.h
+++ b/torch/csrc/jit/passes/onnx/eval_peephole.h
@@ -9,8 +9,7 @@ namespace jit {
 
 void EvalPeepholeONNX(
     std::shared_ptr<Graph>& g,
-    std::map<std::string, IValue>& paramDict,
-    bool isAllowedToAdjustGraphInputs);
+    std::map<std::string, IValue>& paramDict);
 
 } // namespace jit
 
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 645fea2274fb2..7e43e511c786f 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -203,9 +203,8 @@ void initJITBindings(PyObject* module) {
       .def(
           "_jit_pass_onnx_eval_peephole",
           [](std::shared_ptr<Graph>& graph,
-             std::map<std::string, IValue>& paramsDict,
-             bool isAllowedToAdjustGraphInputs) {
-            EvalPeepholeONNX(graph, paramsDict, isAllowedToAdjustGraphInputs);
+             std::map<std::string, IValue>& paramsDict) {
+            EvalPeepholeONNX(graph, paramsDict);
             return paramsDict;
           },
           pybind11::return_value_policy::move)
diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py
index e058acce1947d..b726b2b55e8b6 100644
--- a/torch/onnx/__init__.py
+++ b/torch/onnx/__init__.py
@@ -103,17 +103,11 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM
         export_params (bool, default True): if True, all parameters will
             be exported. Set this to False if you want to export an untrained model.
             In this case, the exported model will first take all of its parameters
-            as arguments, with the ordering as specified by ``model.state_dict().values()``.
-            This helps in stripping parameters from the model which is useful for training.
-            Besides, if this is False, any optimization that may adjust graph inputs will
-            be skipped - for example, Conv and BatchNorm fusion.
+            as arguments, with the ordering as specified by ``model.state_dict().values()``
         verbose (bool, default False): if True, prints a description of the
             model being exported to stdout.
         training (enum, default TrainingMode.EVAL):
-            * ``TrainingMode.EVAL``: export the model in inference mode. In this case, optimizations
-              (e.g., fusing Conv and BatchNorm ops) may adjust graph inputs by modifying model params
-              and model param names. Such adjustment could be skipped by setting export_params = False
-              or keep_initializers_as_inputs = True.
+            * ``TrainingMode.EVAL``: export the model in inference mode.
             * ``TrainingMode.PRESERVE``: export the model in inference mode if model.training is
               False and in training mode if model.training is True.
             * ``TrainingMode.TRAINING``: export the model in training mode. Disables optimizations
@@ -190,8 +184,6 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM
         do_constant_folding (bool, default False): Apply the constant-folding optimization.
             Constant-folding will replace some of the ops that have all constant inputs
             with pre-computed constant nodes.
-            Since this optimization adjusts model initializers, it will be disabled if
-            export_params = False or keep_initializers_as_inputs = True.
         example_outputs (T or a tuple of T, where T is Tensor or convertible to Tensor, default None):
             Must be provided when exporting a ScriptModule or ScriptFunction, ignored otherwise.
             Used to determine the type and shape of the outputs without tracing the execution of
@@ -273,13 +265,9 @@ def forward(self, x):
 
         keep_initializers_as_inputs (bool, default None): If True, all the
             initializers (typically corresponding to parameters) in the
-            exported graph will also be added as inputs to the graph.
-
-            If False, then initializers are not added as inputs to the graph, and only
-            the non-parameter inputs are added as inputs. Meanwhile, the optimization
-            that might adjust graph inputs will be skipped (e.g., fusing Conv and
-            BatchNorm ops), even when the user export this model in inference mode.
-
+            exported graph will also be added as inputs to the graph. If False,
+            then initializers are not added as inputs to the graph, and only
+            the non-parameter inputs are added as inputs.
             This may allow for better optimizations (e.g. constant folding) by
             backends/runtimes.
 
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index 7860e38034028..41ba20f3ad102 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -439,8 +439,7 @@ def _model_to_graph(model, args, verbose=False,
                     example_outputs=None,
                     _retain_param_name=False, do_constant_folding=True,
                     _disable_torch_constant_prop=False, fixed_batch_size=False,
-                    training=None, dynamic_axes=None, export_params=True,
-                    keep_initializers_as_inputs=False):
+                    training=None, dynamic_axes=None):
     r"""Converts model into an ONNX graph.
 
     Returns:
@@ -499,12 +498,10 @@ def _model_to_graph(model, args, verbose=False,
 
     params_dict = _get_named_param_dict(graph, params)
 
-    allow_adjust_graph_inputs = (export_params and not keep_initializers_as_inputs)
-    if (training is None or training == TrainingMode.EVAL):
-        params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict, allow_adjust_graph_inputs)
+    if training is None or training == TrainingMode.EVAL:
+        params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict)
 
-    if do_constant_folding and allow_adjust_graph_inputs and \
-            _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions:
+    if do_constant_folding and _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions:
         params_dict = torch._C._jit_pass_onnx_constant_fold(graph, params_dict,
                                                             _export_onnx_opset_version)
         torch._C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph)
@@ -572,9 +569,7 @@ def _export_to_pretty_string(model, args, f, export_params=True, verbose=False,
                                                         output_names, operator_export_type,
                                                         example_outputs, _retain_param_name,
                                                         val_do_constant_folding, fixed_batch_size=fixed_batch_size,
-                                                        training=training,
-                                                        export_params=export_params,
-                                                        keep_initializers_as_inputs=val_keep_init_as_ip)
+                                                        training=training)
 
         return graph._pretty_print_onnx(params_dict, opset_version, False,
                                         operator_export_type, google_printer,
@@ -690,9 +685,7 @@ def _export(model, args, f, export_params=True, verbose=False, training=None,
                                 val_do_constant_folding,
                                 fixed_batch_size=fixed_batch_size,
                                 training=training,
-                                dynamic_axes=dynamic_axes,
-                                export_params=export_params,
-                                keep_initializers_as_inputs=val_keep_init_as_ip)
+                                dynamic_axes=dynamic_axes)
 
             # TODO: Don't allocate a in-memory string for the protobuf
             defer_weight_export = export_type is not ExportTypes.PROTOBUF_FILE

From cbfec02007775d96139d8a1b9d9f8a44fcede31c Mon Sep 17 00:00:00 2001
From: Don Jang <djang@fb.com>
Date: Thu, 26 Aug 2021 12:58:05 -0700
Subject: [PATCH 262/530] [Static Runtime] Add native op for aten::expand_as
 (#64024)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64024

`aten::expand_as` creates a view of the input tensor. This change adds its native op implementation for the static runtime.

Test Plan: - Added `StaticRuntime.IndividualOps_ExpandAs`

Reviewed By: hlu1

Differential Revision: D30546851

fbshipit-source-id: e53483048af890bc41b6192a1ab0c5ba0ee2bdc0
---
 benchmarks/static_runtime/test_scripts.h         |  6 ++++++
 benchmarks/static_runtime/test_static_runtime.cc | 11 +++++++++++
 torch/csrc/jit/runtime/static/native_ops.cpp     | 16 ++++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 90f93b20c94c0..ecdd491462f62 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -349,6 +349,12 @@ const std::string embedding_bag_max_last_offset = R"JIT(
       return torch.embedding_bag(a, b, c, False, 2, False, None, True)
 )JIT";
 
+const auto expand_as_script = R"JIT(
+  def forward(self, input: Tensor, other:Tensor):
+      a = input.expand_as(other)
+      return a.clone()
+)JIT";
+
 const auto sign_tensor = R"JIT(
   def forward(self, input: Tensor):
       return torch.sign(input).clone()
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index f6ec677bbb7bc..4441b7d043db9 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -610,6 +610,17 @@ TEST(StaticRuntime, IndividualOps_Detach) {
   testStaticRuntime(detach_script_1, args, args2);
 }
 
+TEST(StaticRuntime, IndividualOps_ExpandAs) {
+  auto a = at::randn({3,1});
+  auto b = at::randn({3,2});
+  auto c = at::randn({4,1});
+  auto d = at::randn({4,2});
+  std::vector<IValue> args{a, b};
+  std::vector<IValue> args2{c, d};
+  testStaticRuntime(expand_as_script, args);
+  testStaticRuntime(expand_as_script, args, args2);
+}
+
 TEST(StaticRuntime, IndividualOps_Full) {
   auto dtype = at::ScalarType::Int;
   auto cpu = at::Device(DeviceType::CPU);
diff --git a/torch/csrc/jit/runtime/static/native_ops.cpp b/torch/csrc/jit/runtime/static/native_ops.cpp
index 61a6554a3c5cc..7a1558dd70a00 100644
--- a/torch/csrc/jit/runtime/static/native_ops.cpp
+++ b/torch/csrc/jit/runtime/static/native_ops.cpp
@@ -370,6 +370,22 @@ REGISTER_NATIVE_OPERATOR_FUNCTOR(
       };
     });
 
+REGISTER_NATIVE_OPERATOR_FUNCTOR(
+    aten::expand_as,
+    aten_expand_as,
+    [](Node* n) -> SROperator {
+      if (!n->matches(torch::schema(
+              "aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)"))) {
+        LogAndDumpSchema(n);
+        return nullptr;
+      }
+      return [](ProcessedNode* p_node) {
+        const auto& self = p_node->Input(0).toTensor();
+        const auto& other = p_node->Input(1).toTensor();
+        p_node->Output(0) = self.expand(other.sizes());
+      };
+    });
+
 REGISTER_NATIVE_OPERATOR_FUNCTOR(
     prim::isinstance,
     prim_isinstance,

From 7cfbc85821e8928db570a0730437b96484ac7b60 Mon Sep 17 00:00:00 2001
From: Shiyan Deng <dsy842974287@fb.com>
Date: Thu, 26 Aug 2021 13:06:46 -0700
Subject: [PATCH 263/530] [fx_acc] [fx2trt] add acc op mapper for argmin and
 converter for topk (#63823)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63823

Add mapper for `torch.argmin` which maps it to `acc_ops.flatten` (optional) + `acc_ops.topk` + `acc_ops.getitem` + `acc_ops.squeeze` (optional). This diff doesn't allow mapping if `dim=None && keepdim=True` in `torch.argmin`.

Add fx2trt converter for `acc_ops.topk`.

Test Plan:
buck test mode/opt glow/fb/fx/oss_acc_tracer:test_acc_tracer -- test_argmin
buck run mode/opt caffe2/torch/fb/fx2trt:test_topk

Reviewed By: jfix71

Differential Revision: D30501771

fbshipit-source-id: 0babc45e69bac5e61ff0b9b4dfb98940398e3e57
---
 .../fx2trt/converters/acc_ops_converters.py   | 24 +++++++++
 torch/fx/experimental/fx2trt/fx2trt.py        |  4 +-
 torch/fx/experimental/fx_acc/acc_ops.py       | 51 +++++++++++++++++++
 3 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
index 33a817d4ccdb5..ba370b2b067d4 100644
--- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
+++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
@@ -717,6 +717,7 @@ def acc_ops_squeeze(network, target, args, kwargs, name):
     # dim, which is a very rare case. For now we just claim not supporting dim=None.
     assert dim is not None, "We don't support dim=None right now."
 
+    dim = dim % (len(input_val.shape) + (1 if network.has_implicit_batch_dimension else 0))
     if network.has_implicit_batch_dimension:
         assert dim != 0, "We don't support squeeze batch dim when it's implicit."
         dim -= 1
@@ -796,6 +797,29 @@ def acc_ops_unsqueeze(network, target, args, kwargs, name):
     layer.name = name
     return layer.get_output(0)
 
+@tensorrt_converter(acc_ops.topk)
+def acc_ops_topk(network, target, args, kwargs, name):
+    input_val = kwargs["input"]
+
+    if not isinstance(input_val, trt.tensorrt.ITensor):
+        raise RuntimeError(f"topk received input {input_val} that is not part "
+                           "of the TensorRT region!")
+
+    if kwargs["sorted"] and kwargs["k"] != 1:
+        raise RuntimeError("Currently we don't support sorted=True in topk.")
+
+    if not network.has_implicit_batch_dimension and len(input_val.shape) <= 1:
+        raise RuntimeError("At least 2 dimensions are required for input to topk.")
+
+    num_dims = len(input_val.shape) + (1 if network.has_implicit_batch_dimension else 0)
+    k = kwargs["k"]
+    dim = (kwargs["dim"] if kwargs["dim"] else -1) % num_dims
+    operation = trt.TopKOperation.MAX if kwargs["largest"] else trt.TopKOperation.MIN
+    layer = network.add_topk(
+        input_val, operation, k, get_axes_for_reduce_op(dim, network.has_implicit_batch_dimension)
+    )
+    layer.name = name
+    return (layer.get_output(0), layer.get_output(1))
 
 @tensorrt_converter(acc_ops.adaptive_avg_pool2d)
 def acc_ops_adaptive_avg_pool2d(network, target, args, kwargs, name):
diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py
index ede99fd6f1700..72497a7d2aafc 100644
--- a/torch/fx/experimental/fx2trt/fx2trt.py
+++ b/torch/fx/experimental/fx2trt/fx2trt.py
@@ -415,8 +415,6 @@ def output(self, target, args, kwargs):
             name = f"output{i}"
             output.name = name
             self.network.mark_output(output)
-            if self.fp16_mode:
+            if self.fp16_mode and output.dtype == trt.float32:
                 output.dtype = trt.float16
-            else:
-                output.dtype = trt.float32
             self._output_names.append(name)
diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py
index 95fffaa479c9e..692ca6304910f 100644
--- a/torch/fx/experimental/fx_acc/acc_ops.py
+++ b/torch/fx/experimental/fx_acc/acc_ops.py
@@ -705,6 +705,57 @@ def batch_norm(
 def layer_norm(*, input, normalized_shape, weight, bias, eps):
     return nn.functional.layer_norm(**locals())
 
+def argmin_max_mapper_impl(node: torch.fx.Node, largest: bool) -> torch.fx.Node:
+    """
+    Map torch.argmin or torch.argmax to acc_ops.flatten (depend on dim) + acc_ops.topk
+    + acc_ops.getitem + acc_ops.squeeze (depends on keepdim).
+    """
+    input_node = node.kwargs["input"]
+    dim = node.kwargs["dim"]
+    keepdim = node.kwargs["keepdim"]
+
+    if dim is None and keepdim:
+        raise RuntimeError("We currently don't support argmin/argmax with dim=None and keepdim=True")
+
+    with node.graph.inserting_before(node):
+        if dim is None:
+            flatten_kwargs = {"input": node.kwargs["input"], "start_dim": 0, "end_dim": -1}
+            flatten_node = node.graph.call_function(flatten, kwargs=flatten_kwargs)
+            flatten_node.meta["type"] = torch.Tensor
+            input_node = flatten_node
+            dim = -1
+
+        topk_kwargs = {"input": input_node, "k": 1, "dim": dim, "largest": largest, "sorted": False}
+        topk_node = node.graph.call_function(topk, kwargs=topk_kwargs)
+        # It's actually more like NamedTuple but tuple here should be fine.
+        topk_node.meta["type"] = tuple
+
+        getitem_kwargs = {"input": topk_node, "idx": 1}
+        getitem_node = node.graph.call_function(getitem, kwargs=getitem_kwargs)
+        getitem_node.meta["type"] = torch.Tensor
+        output_node = getitem_node
+
+        if not keepdim:
+            squeeze_kwargs = {"input": getitem_node, "dim": dim}
+            output_node = node.graph.call_function(squeeze, kwargs=squeeze_kwargs)
+
+        output_node.meta = node.meta.copy()
+        return output_node
+
+@register_custom_acc_mapper_fn(
+    op_and_target=("call_function", torch.argmin),
+    arg_replacement_tuples=[
+        ("input", "input"),
+        ("dim", "dim"),
+        ("keepdim", "keepdim"),
+    ],
+)
+def torch_argmin_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node:
+    """
+    Map torch.argmin to acc_ops.flatten (depend on dim) + acc_ops.topk + acc_ops.getitem
+    + acc_ops.squeeze (depends on keepdim).
+    """
+    return argmin_max_mapper_impl(node, largest=False)
 
 @register_custom_acc_mapper_fn(
     op_and_target=("call_method", "split"),

From 5a12cb611f40c2277587b955cc851b47ec60c18d Mon Sep 17 00:00:00 2001
From: Ilqar Ramazanli <iramazanli@fb.com>
Date: Thu, 26 Aug 2021 13:29:03 -0700
Subject: [PATCH 264/530] To add Chained Scheduler to the list of PyTorch
 schedulers. (#63491)

Summary:
In this PR we are introducing ChainedScheduler which initially proposed in the discussion https://github.com/pytorch/pytorch/pull/26423#discussion_r329976246 .

The idea is to provide a user friendly chaining method for schedulers, especially for the cases many of them are involved and we want to have a clean and easy to read interface for schedulers. This method will be even more crucial once CompositeSchedulers and Schedulers for different type of parameters are involved.

The immediate application of Chained Scheduler is expected to happen in TorchVision Library to combine WarmUpLR and  MultiStepLR https://github.com/pytorch/vision/blob/master/references/video_classification/scheduler.py#L5 . However, it can be expected that in many other use cases also this method could be applied.

### Example
The usage is as simple as below:

```python
sched=ChainedScheduler([ExponentialLR(self.opt, gamma=0.9),
                        WarmUpLR(self.opt, warmup_factor=0.2, warmup_iters=4, warmup_method="constant"),
                        StepLR(self.opt, gamma=0.1, step_size=3)])
```

Then calling
```python
sched.step()
```
would trigger step function for all three schedulers consecutively

Partially resolves https://github.com/pytorch/vision/issues/4281

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63491

Reviewed By: datumbox, mruberry

Differential Revision: D30576180

Pulled By: iramazanli

fbshipit-source-id: b43f0749f55faab25079641b7d91c21a891a87e4
---
 test/test_optim.py          | 40 ++++++++++++++++++++++++++++++++++++-
 torch/optim/lr_scheduler.py | 38 +++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/test/test_optim.py b/test/test_optim.py
index 01ec43bbea883..fe282ef33b4de 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -13,7 +13,7 @@
 from torch import sparse
 from torch.optim.lr_scheduler import LambdaLR, MultiplicativeLR, StepLR, \
     MultiStepLR, WarmUpLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, \
-    _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR
+    _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR, ChainedScheduler
 from torch.optim.swa_utils import AveragedModel, SWALR, update_bn
 from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests, \
     skipIfRocm
@@ -1253,6 +1253,44 @@ def test_reduce_lr_on_plateau8(self):
                                       threshold=0.1, patience=5, cooldown=5)
         self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
 
+    def test_chained_lr1(self):
+        epochs = 10
+        schedulers = [None] * 1
+        targets = [[0.05] * 3 + [0.005] * 3 + [0.0005] * 3 + [0.00005] * 3]
+        schedulers[0] = StepLR(self.opt, gamma=0.1, step_size=3)
+        scheduler = ChainedScheduler(schedulers)
+        self._test([scheduler], targets, epochs)
+
+    def test_chained_lr2(self):
+        epochs = 10
+        schedulers = [None] * 1
+        targets = [[0.02, 0.03, 0.04] + [0.05] * 9]
+        schedulers[0] = WarmUpLR(self.opt, warmup_factor=0.4, warmup_iters=3, warmup_method="linear")
+        scheduler = ChainedScheduler(schedulers)
+        self._test([scheduler], targets, epochs)
+
+    def test_chained_lr3(self):
+        epochs = 10
+        schedulers = [None] * 2
+        targets = [[0.02, 0.03, 0.04, 0.05] + [0.005] * 4 + [0.0005] * 3 + [0.00005] * 3]
+        schedulers[0] = WarmUpLR(self.opt, warmup_factor=0.4, warmup_iters=3, warmup_method="linear")
+        schedulers[1] = MultiStepLR(self.opt, milestones=[4, 8, 10], gamma=0.1)
+        scheduler = ChainedScheduler(schedulers)
+        self._test([scheduler], targets, epochs)
+
+    def test_chained_lr4(self):
+        epochs = 9
+        schedulers = [None] * 3
+        targets = [[0.05 * 0.2 * 0.9 ** x for x in range(3)]
+                   + [0.05 * 0.2 * 0.9 ** 3 * 0.1]
+                   + [0.05 * 0.9 ** x * 0.1 for x in range(4, 6)]
+                   + [0.05 * 0.9 ** x * 0.01 for x in range(6, 9)]]
+        schedulers[0] = ExponentialLR(self.opt, gamma=0.9)
+        schedulers[1] = WarmUpLR(self.opt, warmup_factor=0.2, warmup_iters=4, warmup_method="constant")
+        schedulers[2] = StepLR(self.opt, gamma=0.1, step_size=3)
+        scheduler = ChainedScheduler(schedulers)
+        self._test([scheduler], targets, epochs)
+
     def test_compound_step_and_multistep_lr(self):
         epochs = 10
         schedulers = [None] * 2
diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py
index 657a35ad681b0..761a4041668d6 100644
--- a/torch/optim/lr_scheduler.py
+++ b/torch/optim/lr_scheduler.py
@@ -603,6 +603,44 @@ def _get_closed_form_lr(self):
                 for base_lr in self.base_lrs]
 
 
+class ChainedScheduler(_LRScheduler):
+    """Chains list of learning rate schedulers. It takes a list of chainable learning
+    rate schedulers and performs consecutive step() functions belong to them by just
+    one call.
+
+    Args:
+        schedulers (list): List of chained schedulers.
+
+    Example:
+        >>> # Assuming optimizer uses lr = 1. for all groups
+        >>> # lr = 0.09     if epoch == 0
+        >>> # lr = 0.081    if epoch == 1
+        >>> # lr = 0.729    if epoch == 2
+        >>> # lr = 0.6561   if epoch == 3
+        >>> # lr = 0.59049  if epoch >= 4
+        >>> scheduler1 = WarmUpLR(self.opt, warmup_factor=0.1, warmup_iters=2, warmup_method="constant")
+        >>> scheduler2 = ExponentialLR(self.opt, gamma=0.9)
+        >>> scheduler = ChainedScheduler([scheduler1, scheduler2])
+        >>> for epoch in range(100):
+        >>>     train(...)
+        >>>     validate(...)
+        >>>     scheduler.step()
+    """
+
+    def __init__(self, schedulers):
+        for scheduler_idx in range(1, len(schedulers)):
+            if (schedulers[scheduler_idx].optimizer != schedulers[0].optimizer):
+                raise ValueError(
+                    "ChainedScheduler expects all schedulers to belong to the same optimizer, but "
+                    "got schedulers at index {} and {} to be different".format(0, scheduler_idx)
+                )
+        self.schedulers = list(schedulers)
+
+    def step(self):
+        for scheduler in self.schedulers:
+            scheduler.step()
+
+
 class ReduceLROnPlateau(object):
     """Reduce learning rate when a metric has stopped improving.
     Models often benefit from reducing the learning rate by a factor

From d8d8e4902a9ce3426e84817b936699f85a5f698e Mon Sep 17 00:00:00 2001
From: Can Balioglu <balioglu@fb.com>
Date: Thu, 26 Aug 2021 13:55:08 -0700
Subject: [PATCH 265/530] [torch/elastic] Pretty print the failure message
 captured by @record (#64036)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64036

This PR slightly revises the implementation of the internal `_format_failure()` method in order to pretty print the error message captured in a subprocess by the `record` annotation.

With this PR a failure log is formatted as below:

```
Root Cause:
[0]:
  time: 2021-08-26_17:12:07
  rank: 0 (local_rank: 0)
  exitcode: 1 (pid: 8045)
  error_file: /tmp/torchelastic_6cj9eppm/6d9d844a-6ce4-4838-93ed-1639a9525b00_rec9kuv3/attempt_0/0/error.json
  msg:
    {
      "message": "ValueError: Test",
      "extraInfo": {
        "py_callstack": [
          "  File \"/data/home/balioglu/fail.py\", line 7, in <module>\n    main()\n",
          "  File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 373, in wrapper\n    error_handler.record_exception(e)\n",
          "  File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/error_handler.py\", line 86, in record_exception\n    _write_error(e, self._get_error_file_path())\n",
          "  File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/error_handler.py\", line 26, in _write_error\n    \"py_callstack\": traceback.format_stack(),\n"
        ],
        "timestamp": "1629997927"
      }
    }
```

in contrast to the old formatting:

```
Root Cause:
[0]:
  time: 2021-08-26_17:15:50
  rank: 0 (local_rank: 0)
  exitcode: 1 (pid: 9417)
  error_file: /tmp/torchelastic_22pwarnq/19f22638-848c-4b8f-8379-677f34fc44e7_u43o9vs7/attempt_0/0/error.json
  msg: "{'message': 'ValueError: Test', 'extraInfo': {'py_callstack': 'Traceback (most recent call last):\n  File "/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 351, in wrapper\n    return f(*args, **kwargs)\n  File "/data/home/balioglu/fail.py", line 5, in main\n    raise ValueError("BALIOGLU")\nValueError: BALIOGLU\n', 'timestamp': '1629998150'}}"
```
ghstack-source-id: 136761768

Test Plan: Run the existing unit tests.

Reviewed By: kiukchung

Differential Revision: D30579025

fbshipit-source-id: 37df0b7c7ec9b620355766122986c2c77e8495ae
---
 .../elastic/multiprocessing/errors/__init__.py  | 17 +++++++++++++++--
 .../multiprocessing/errors/error_handler.py     |  2 +-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/torch/distributed/elastic/multiprocessing/errors/__init__.py b/torch/distributed/elastic/multiprocessing/errors/__init__.py
index 7746dbace9af5..ab0e0f3b7c874 100644
--- a/torch/distributed/elastic/multiprocessing/errors/__init__.py
+++ b/torch/distributed/elastic/multiprocessing/errors/__init__.py
@@ -165,7 +165,7 @@ def timestamp_isoformat(self):
   rank: ${rank} (local_rank: ${local_rank})
   exitcode: ${exitcode} (pid: ${pid})
   error_file: ${error_file}
-  msg: \"${message}\""""
+  msg: ${message}"""
 
 # extra new lines before and after are intentional
 _MSG_FORMAT_TEMPLATE = """
@@ -258,6 +258,19 @@ def format_msg(self, boarder_delim="*", section_delim="="):
     def _format_failure(
         self, idx: int, rank: int, failure: ProcessFailure
     ) -> Tuple[str, int]:
+        if isinstance(failure.message, str):
+            msg = '"' + failure.message + '"'
+        else:
+            try:
+                dmp = json.dumps(failure.message, indent=2)
+            except ValueError:
+                msg = failure.message
+            else:
+                msg = os.linesep
+                # Indent by 4 chars.
+                for l in dmp.splitlines():
+                    msg += f"    {l}{os.linesep}"
+
         fmt = Template(_FAILURE_FORMAT_TEMPLATE).substitute(
             idx=idx,
             time=failure.timestamp_isoformat(),
@@ -266,7 +279,7 @@ def _format_failure(
             exitcode=failure.exitcode,
             pid=failure.pid,
             error_file=failure.error_file,
-            message=failure.message,
+            message=msg,
         )
         width = 0
         for line in fmt.split("\n"):
diff --git a/torch/distributed/elastic/multiprocessing/errors/error_handler.py b/torch/distributed/elastic/multiprocessing/errors/error_handler.py
index 74586e9fd8523..2974355fae88c 100644
--- a/torch/distributed/elastic/multiprocessing/errors/error_handler.py
+++ b/torch/distributed/elastic/multiprocessing/errors/error_handler.py
@@ -23,7 +23,7 @@ def _write_error(e: BaseException, error_file: Optional[str]):
         "message": {
             "message": f"{type(e).__name__}: {e}",
             "extraInfo": {
-                "py_callstack": traceback.format_exc(),
+                "py_callstack": traceback.format_stack(),
                 "timestamp": str(int(time.time())),
             },
         }

From aeec177833cb20e8c6177ef8dbcf02ddc37c8a32 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Thu, 26 Aug 2021 14:09:10 -0700
Subject: [PATCH 266/530] [JIT] UseVariadicOp takes list_idx parameter (#63915)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63915

Previously, this function only worked for variadic op substitutions of the form `op(list, args) -> variadic_op(list_1, ..., list_n, args)`. This change allows for transformations of the form `op(args_0, list, args_1) -> variadic_op(args_0, list_1, ..., list_n, args_1)`.

Test Plan:
`buck test caffe2/test/cpp/jit:jit -- Stack Concat`

(tests exercising `list_idx != 0` will be added further up in this diff stack)

Reviewed By: navahgar

Differential Revision: D30529729

fbshipit-source-id: 568080679c3b40bdaedee56bef2e8a5ce7985d2f
---
 torch/csrc/jit/passes/variadic_ops.cpp | 47 +++++++++++++++++++-------
 torch/csrc/jit/passes/variadic_ops.h   | 12 +++++++
 2 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/torch/csrc/jit/passes/variadic_ops.cpp b/torch/csrc/jit/passes/variadic_ops.cpp
index 6f4d23cec7b66..a827d3a2371d8 100644
--- a/torch/csrc/jit/passes/variadic_ops.cpp
+++ b/torch/csrc/jit/passes/variadic_ops.cpp
@@ -14,8 +14,12 @@ class VariadicUpdater {
   explicit VariadicUpdater(
       std::shared_ptr<Graph> graph,
       NodeKind op,
-      NodeKind variadic_op)
-      : graph_(std::move(graph)), op_(op), variadic_op_(variadic_op) {}
+      NodeKind variadic_op,
+      size_t list_idx = 0)
+      : graph_(std::move(graph)),
+        op_(op),
+        variadic_op_(variadic_op),
+        list_idx_(list_idx) {}
 
   bool run() {
     collectOpNodes(graph_->block());
@@ -39,21 +43,34 @@ class VariadicUpdater {
   }
 
   bool replaceWithVariadicOp(Node* op_node) {
-    if (op_node->input(0)->node()->kind() != prim::ListConstruct) {
+    const size_t num_inputs = op_node->inputs().size();
+    TORCH_CHECK(list_idx_ < num_inputs);
+    if (op_node->input(list_idx_)->node()->kind() != prim::ListConstruct) {
       return false;
     }
-    auto list = op_node->input(0)->node();
+    auto list = op_node->input(list_idx_)->node();
+    const size_t list_len = list->inputs().size();
+
     // We do not transform ops whose list input can not be moved to the
     // position before op. This in turn implies that there is some mutation
     // of the input list before op.
     if (!getOrCreateAliasDb()->couldMoveBeforeTopologically(list, op_node)) {
       return false;
     }
-    std::vector<Value*> inputs = list->inputs().vec();
-    // Add non-list inputs
-    for (size_t i = 1; i < op_node->inputs().size(); ++i) {
-      inputs.push_back(op_node->input(i));
-    }
+
+    // Construct new inputs
+    std::vector<Value*> inputs;
+    inputs.reserve(num_inputs + list_len - 1);
+    inputs.insert(
+        inputs.end(),
+        op_node->inputs().begin(),
+        op_node->inputs().begin() + list_idx_);
+    inputs.insert(inputs.end(), list->inputs().begin(), list->inputs().end());
+    inputs.insert(
+        inputs.end(),
+        op_node->inputs().begin() + list_idx_ + 1,
+        op_node->inputs().end());
+
     auto var_op_node = op_node->owningGraph()->create(variadic_op_, inputs);
     GRAPH_UPDATE("Adding\n", *var_op_node);
     var_op_node->insertBefore(op_node);
@@ -82,6 +99,8 @@ class VariadicUpdater {
 
   NodeKind op_;
   NodeKind variadic_op_;
+
+  size_t list_idx_;
 };
 
 } // namespace
@@ -89,10 +108,11 @@ class VariadicUpdater {
 bool UseVariadicOp(
     const std::shared_ptr<Graph>& graph,
     NodeKind op,
-    NodeKind variadic_op) {
+    NodeKind variadic_op,
+    size_t list_idx) {
   const std::string pass_name = std::string("variadic ") + op.toQualString();
   GRAPH_DUMP("Before " + pass_name, graph);
-  bool changed = VariadicUpdater(graph, op, variadic_op).run();
+  bool changed = VariadicUpdater(graph, op, variadic_op, list_idx).run();
   if (changed) {
     GRAPH_DUMP("After " + pass_name, graph);
   }
@@ -102,13 +122,14 @@ bool UseVariadicOp(
 bool RemoveListMutationAndUseVariadicOp(
     const std::shared_ptr<Graph>& graph,
     NodeKind op,
-    NodeKind variadic_op) {
+    NodeKind variadic_op,
+    size_t list_idx) {
   bool changed_in_last_iter = true;
   bool changed = false;
   while (changed_in_last_iter) {
     changed_in_last_iter = RemoveListMutation(graph);
     changed_in_last_iter =
-        UseVariadicOp(graph, op, variadic_op) || changed_in_last_iter;
+        UseVariadicOp(graph, op, variadic_op, list_idx) || changed_in_last_iter;
     changed = changed || changed_in_last_iter;
   }
   return changed;
diff --git a/torch/csrc/jit/passes/variadic_ops.h b/torch/csrc/jit/passes/variadic_ops.h
index 20cc6648dddb4..e5f6a680c5039 100644
--- a/torch/csrc/jit/passes/variadic_ops.h
+++ b/torch/csrc/jit/passes/variadic_ops.h
@@ -19,5 +19,17 @@ TORCH_API bool UseVariadicStack(const std::shared_ptr<Graph>& graph);
 TORCH_API bool RemoveListMutationAndUseVariadicStack(
     const std::shared_ptr<Graph>& graph);
 
+TORCH_API bool UseVariadicOp(
+    const std::shared_ptr<Graph>& graph,
+    NodeKind op,
+    NodeKind variadic_op,
+    size_t list_idx = 0);
+
+TORCH_API bool RemoveListMutationAndUseVariadicOp(
+    const std::shared_ptr<Graph>& graph,
+    NodeKind op,
+    NodeKind variadic_op,
+    size_t list_idx = 0);
+
 } // namespace jit
 } // namespace torch

From 7861dba7f697f91a19d7fa137a2e15799959e2ca Mon Sep 17 00:00:00 2001
From: Facebook Community Bot <facebook-github-bot@users.noreply.github.com>
Date: Thu, 26 Aug 2021 15:18:37 -0700
Subject: [PATCH 267/530] Automated submodule update: FBGEMM (#62879)

Summary:
This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).

New submodule commit: https://github.com/pytorch/FBGEMM/commit/ce5470385723b0262b47250d6af05f1b734e4509

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62879

Test Plan: Ensure that CI jobs succeed on GitHub before landing.

Reviewed By: jspark1105

Differential Revision: D30154801

fbshipit-source-id: b2ce185da6f6cadf5128f82b15097d9e13e9e6a0
---
 third_party/fbgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/fbgemm b/third_party/fbgemm
index 10ec0d3388579..d4902e94367b9 160000
--- a/third_party/fbgemm
+++ b/third_party/fbgemm
@@ -1 +1 @@
-Subproject commit 10ec0d33885795e6b4cc9a17896ee3f25b48fa8e
+Subproject commit d4902e94367b9f074cadd29d7dc5ef6b0c69c6c1

From 085278f8b141579c5d5481a8fb96c7dfa830b262 Mon Sep 17 00:00:00 2001
From: MengeTM <34686199+MengeTM@users.noreply.github.com>
Date: Thu, 26 Aug 2021 15:32:06 -0700
Subject: [PATCH 268/530] Derivatives of relu (#63027) (#63089)

Summary:
Optimization of relu and leaky_relu derivatives for reduction of VRAM needed for the backward-passes

Fixes https://github.com/pytorch/pytorch/issues/63027

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63089

Reviewed By: iramazanli

Differential Revision: D30582049

Pulled By: albanD

fbshipit-source-id: a9481fe8c10cbfe2db485e28ce80cabfef501eb8
---
 tools/autograd/derivatives.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 49e574a1651ba..641471ebc8f06 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1604,10 +1604,6 @@
   self: soft_margin_loss_backward(grad, self, target, reduction)
 
 - name: relu(Tensor self) -> Tensor
-  self: threshold_backward(grad, self, 0)
-
-# NB: `output` instead of `self` saves memory. It avoids saving a copy of self.
-- name: relu_(Tensor(a!) self) -> Tensor(a!)
   self: threshold_backward(grad, result, 0)
 
 - name: silu(Tensor self) -> Tensor

From 49b782b2cb09c80a8c476287509a4f566cc597d2 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@fb.com>
Date: Thu, 26 Aug 2021 15:42:00 -0700
Subject: [PATCH 269/530] Add shard number to print_test_stats.py upload name
 (#64055)

Summary:
Now that the render test results job is gone, each shard on GHA is uploading a JSON test stats report. To ensure differentiation, this PR includes the shard number in the report name.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64055

Reviewed By: iramazanli

Differential Revision: D30586869

Pulled By: janeyx99

fbshipit-source-id: fd19f347131deec51486bb0795e4e13ac19bc71a
---
 tools/stats/print_test_stats.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/stats/print_test_stats.py b/tools/stats/print_test_stats.py
index 71df463b14516..1f4c33e8feb43 100755
--- a/tools/stats/print_test_stats.py
+++ b/tools/stats/print_test_stats.py
@@ -781,14 +781,16 @@ def assemble_s3_object(
 
 def send_report_to_s3(head_report: Version2Report) -> None:
     job = os.getenv('JOB_BASE_NAME', os.environ.get('CIRCLE_JOB'))
+    # SHARD_NUMBER is specific to GHA jobs, as the shard number would be included in CIRCLE_JOB already
+    shard = os.environ.get('SHARD_NUMBER', '')
     sha1 = os.environ.get('CIRCLE_SHA1')
     branch = os.environ.get('CIRCLE_BRANCH', '')
     now = datetime.datetime.utcnow().isoformat()
     if branch not in ['master', 'nightly'] and not branch.startswith("release/"):
         pr = os.environ.get('CIRCLE_PR_NUMBER', 'unknown')
-        key = f'pr_test_time/{pr}/{sha1}/{job}/{now}Z.json.bz2'  # Z meaning UTC
+        key = f'pr_test_time/{pr}/{sha1}/{job}{shard}/{now}Z.json.bz2'  # Z meaning UTC
     else:
-        key = f'test_time/{sha1}/{job}/{now}Z.json.bz2'  # Z meaning UTC
+        key = f'test_time/{sha1}/{job}{shard}/{now}Z.json.bz2'  # Z meaning UTC
     obj = get_S3_object_from_bucket('ossci-metrics', key)
     # use bz2 because the results are smaller than gzip, and the
     # compression time penalty we pay is only about half a second for

From 49353e319cd6537509f75463f41ddf4ac70e3cb6 Mon Sep 17 00:00:00 2001
From: Bo Wang <bowangbj@fb.com>
Date: Thu, 26 Aug 2021 16:00:16 -0700
Subject: [PATCH 270/530] More sharded_tensor creation ops:
 harded_tensor.zeros, sharded_tensor.full, sharded_tensor.rand (#63732)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63732

Test Plan:
$ python test/distributed/_sharded_tensor/test_sharded_tensor.py  --v

$ python test/distributed/_sharded_tensor/test_sharded_tensor.py TestCreateTensorFromParams --v
$ python test/distributed/_sharded_tensor/test_sharded_tensor.py TestShardedTensorChunked --v

Imported from OSS

Differential Revision:
D30472621
D30472621

Reviewed By: pritamdamania87

Pulled By: bowangbj

fbshipit-source-id: fd8ebf9b815fdc292ad1aad521f9f4f454163d0e
---
 .../_sharded_tensor/test_sharded_tensor.py    | 197 +++++++++++++++++-
 torch/distributed/_sharded_tensor/__init__.py | 183 +++++++++++++++-
 torch/distributed/_sharded_tensor/api.py      |  42 +++-
 3 files changed, 403 insertions(+), 19 deletions(-)

diff --git a/test/distributed/_sharded_tensor/test_sharded_tensor.py b/test/distributed/_sharded_tensor/test_sharded_tensor.py
index 6c03d9fdf631c..718b594c831ee 100644
--- a/test/distributed/_sharded_tensor/test_sharded_tensor.py
+++ b/test/distributed/_sharded_tensor/test_sharded_tensor.py
@@ -126,8 +126,9 @@ def wrapper(self):
 class TestCreateTensorFromParams(TestCase):
     @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
     def test_empty(self):
+        expected_dtype = torch.double
         tensor_properties = TensorProperties(
-            dtype=torch.double,
+            dtype=expected_dtype,
             layout=torch.strided,
             requires_grad=False,
             pin_memory=False,
@@ -138,14 +139,15 @@ def test_empty(self):
         local_tensor = _create_tensor_from_params(
             5, 10, local_device=local_device, tensor_init_params=tensor_init_params)
         self.assertEqual(local_device, local_tensor.device)
-        self.assertEqual(torch.double, local_tensor.dtype)
+        self.assertEqual(expected_dtype, local_tensor.dtype)
         self.assertEqual(torch.strided, local_tensor.layout)
         self.assertEqual(False, local_tensor.requires_grad)
 
     @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
     def test_ones(self):
+        expected_dtype = torch.double
         tensor_properties = TensorProperties(
-            dtype=torch.double,
+            dtype=expected_dtype,
             layout=torch.strided,
             requires_grad=False,
             pin_memory=False,
@@ -153,9 +155,98 @@ def test_ones(self):
         tensor_init_params = TensorInitParams(
             create_op=CreateOp.ONES, tensor_properties=tensor_properties)
         local_device = torch.device('cuda:0')
+        h, w = 5, 10
         local_tensor = _create_tensor_from_params(
-            5, 10, local_device=local_device, tensor_init_params=tensor_init_params)
-        expected_tensor = torch.ones(5, 10, device=local_device, dtype=torch.double)
+            h, w, local_device=local_device, tensor_init_params=tensor_init_params)
+        expected_tensor = torch.ones(h, w, device=local_device, dtype=expected_dtype)
+        self.assertEqual(expected_tensor, local_tensor)
+
+    @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
+    def test_zeros(self):
+        expected_dtype = torch.int32
+        tensor_properties = TensorProperties(
+            dtype=expected_dtype,
+            layout=torch.strided,
+            requires_grad=False,
+            pin_memory=False,
+            memory_format=torch.contiguous_format,
+        )
+        tensor_init_params = TensorInitParams(create_op=CreateOp.ZEROS, tensor_properties=tensor_properties, )
+        local_device = torch.device('cuda:0')
+        h, w = 5, 10
+        local_tensor = _create_tensor_from_params(
+            h, w, local_device=local_device, tensor_init_params=tensor_init_params)
+        expected_tensor = torch.zeros(h, w, device=local_device, dtype=expected_dtype)
+        self.assertEqual(expected_tensor, local_tensor)
+
+    @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
+    def test_rand(self):
+        expected_dtype = torch.double
+        tensor_properties = TensorProperties(
+            dtype=expected_dtype,
+            layout=torch.strided,
+            requires_grad=False,
+            pin_memory=False,
+            memory_format=torch.contiguous_format,
+        )
+        tensor_init_params = TensorInitParams(create_op=CreateOp.RAND, tensor_properties=tensor_properties, )
+        local_device = torch.device('cuda:0')
+        h, w = 5, 10
+        seed = 13
+        torch.cuda.manual_seed(seed)
+        local_tensor = _create_tensor_from_params(
+            h, w, local_device=local_device, tensor_init_params=tensor_init_params)
+        # reset seed to ensure same random numbers are generated
+        torch.cuda.manual_seed(seed)
+        expected_tensor = torch.rand(h, w, device=local_device, dtype=expected_dtype)
+        self.assertEqual(expected_tensor, local_tensor)
+
+    @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
+    def test_full_with_dtype_inferred(self):
+        fill_value = 23.5
+        tensor_properties = TensorProperties(
+            # tensor's dtype can be inferred from fill_value
+            dtype=None,
+            layout=torch.strided,
+            requires_grad=False,
+            pin_memory=False,
+            memory_format=torch.contiguous_format,
+        )
+        tensor_init_params = TensorInitParams(
+            create_op=CreateOp.FULL,
+            fill_value=fill_value,
+            tensor_properties=tensor_properties, )
+        local_device = torch.device('cuda:0')
+        h, w = 5, 10
+        local_tensor = _create_tensor_from_params(
+            h, w, local_device=local_device, tensor_init_params=tensor_init_params)
+        # local_tensor.dtype is inferred from fill_value (float32).
+        self.assertEqual(torch.float32, local_tensor.dtype)
+        expected_tensor = torch.full((h, w), fill_value=fill_value, device=local_device)
+        self.assertEqual(expected_tensor, local_tensor)
+
+    @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
+    def test_full_with_dtype_overridden(self):
+        fill_value = 23.5
+        tensor_properties = TensorProperties(
+            # tensor's dtype can be inferred from fill_value
+            dtype=torch.double,
+            layout=torch.strided,
+            requires_grad=False,
+            pin_memory=False,
+            memory_format=torch.contiguous_format,
+        )
+        tensor_init_params = TensorInitParams(
+            create_op=CreateOp.FULL,
+            fill_value=fill_value,
+            tensor_properties=tensor_properties, )
+        local_device = torch.device('cuda:0')
+        h, w = 5, 10
+        local_tensor = _create_tensor_from_params(
+            h, w, local_device=local_device, tensor_init_params=tensor_init_params)
+        # local_tensor.dtype is overridden.
+        self.assertEqual(torch.double, local_tensor.dtype)
+        expected_tensor = torch.full((h, w), fill_value=fill_value, device=local_device, dtype=torch.double)
         self.assertEqual(expected_tensor, local_tensor)
 
 class TestShardedTensorChunked(ShardedTensorTestBase, MultiProcessTestCase):
@@ -292,6 +383,102 @@ def test_create_sharded_tensor_with_ones(self):
         self.assertEqual((expected_h, w), local_shard.size())
         self.assertEqual(local_shard, torch.ones(expected_h, w))
 
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    @requires_nccl()
+    def test_create_sharded_tensor_with_zeros(self):
+        """ Test _sharded_tensor.zeros(...) """
+
+        spec = ChunkShardingSpec(
+            dim=0,
+            placements=[
+                "rank:0/cuda:0",
+                "rank:1/cuda:1",
+                "rank:2/cuda:2",
+                "rank:3/cuda:3",
+            ],
+        )
+        h, w = 10, 20
+        sharded_tensor = _sharded_tensor.zeros(spec, h, w)
+
+        # Validate local shard is initialized with torch.zeros
+        local_shards = sharded_tensor.local_shards()
+        self.assertEqual(1, len(local_shards))
+        local_shard = local_shards[0].tensor
+        self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.device)
+        # The split: for rank!=3 ceil(h/4)=3  for rank=3 1
+        expected_h = 1 if self.rank == 3 else math.ceil(h / 4)
+        self.assertEqual((expected_h, w), local_shard.size())
+        self.assertEqual(local_shard, torch.zeros(expected_h, w))
+
+
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    @requires_nccl()
+    def test_create_sharded_tensor_with_rand(self):
+        """ Test _sharded_tensor.rand(...) """
+
+        spec = ChunkShardingSpec(
+            dim=0,
+            placements=[
+                "rank:0/cuda:0",
+                "rank:1/cuda:1",
+                "rank:2/cuda:2",
+                "rank:3/cuda:3",
+            ],
+        )
+        h, w = 8, 2
+        seed = 1234
+
+        expected_h = 2
+        expected_device = torch.device(f"cuda:{self.rank}")
+        dtype = torch.double
+        torch.manual_seed(seed)
+        expected = torch.rand(expected_h, w, device=expected_device, dtype=dtype)
+        # reset seed to ensure the same random numbers are generated
+        torch.manual_seed(seed)
+        sharded_tensor = _sharded_tensor.rand(spec, h, w, dtype=dtype)
+
+        # Validate local shard is initialized with torch.rand
+        local_shards = sharded_tensor.local_shards()
+        self.assertEqual(1, len(local_shards))
+        local_shard = local_shards[0].tensor
+        self.assertEqual(expected_device, local_shard.device)
+        self.assertEqual((expected_h, w), local_shard.size())
+        self.assertEqual(expected, local_shard)
+
+
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    @requires_nccl()
+    def test_create_sharded_tensor_with_full(self):
+        """ Test _sharded_tensor.full(...) """
+
+        spec = ChunkShardingSpec(
+            dim=0,
+            placements=[
+                "rank:0/cuda:0",
+                "rank:1/cuda:1",
+                "rank:2/cuda:2",
+                "rank:3/cuda:3",
+            ],
+        )
+        h, w = 10, 20
+        fill_value = 1234
+        sharded_tensor = _sharded_tensor.full(spec, size=(h, w), fill_value=fill_value, dtype=torch.int32)
+
+        # Validate local shard is initialized with torch.full
+        local_shards = sharded_tensor.local_shards()
+        self.assertEqual(1, len(local_shards))
+        local_shard = local_shards[0].tensor
+        self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.device)
+        # The split: for rank!=3 ceil(h/4)=3  for rank=3 1
+        expected_h = 1 if self.rank == 3 else math.ceil(h / 4)
+        self.assertEqual((expected_h, w), local_shard.size())
+        self.assertEqual(local_shard,
+                         torch.full(size=(expected_h, w), fill_value=fill_value, dtype=torch.int32))
+
+
     @with_comms
     @skip_if_lt_x_gpu(4)
     @requires_nccl()
diff --git a/torch/distributed/_sharded_tensor/__init__.py b/torch/distributed/_sharded_tensor/__init__.py
index 4cbdded8ba1c4..4f8646d54268c 100644
--- a/torch/distributed/_sharded_tensor/__init__.py
+++ b/torch/distributed/_sharded_tensor/__init__.py
@@ -1,7 +1,5 @@
-from typing import List
+# coding=utf-8
 
-import torch
-from torch.distributed._sharding_spec import ShardingSpec
 from .api import (
     CreateOp,
     Shard,
@@ -11,6 +9,9 @@
     TensorProperties,
     load_with_process_group,
 )
+from torch.distributed._sharding_spec import ShardingSpec
+from typing import List
+import torch
 
 
 def empty(sharding_spec: ShardingSpec,
@@ -23,7 +24,8 @@ def empty(sharding_spec: ShardingSpec,
           process_group=None,
           init_rrefs=False):
     """
-    Creates an empty :class:`ShardedTensor`. Needs to be called on all ranks in an SPMD fashion.
+    Returns a :class:`ShardedTensor` filled with uninitialized data.
+        Needs to be called on all ranks in an SPMD fashion.
 
     Args:
         sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification
@@ -74,7 +76,8 @@ def ones(sharding_spec: ShardingSpec,
          process_group=None,
          init_rrefs=False):
     """
-    Creates a ones :class:`ShardedTensor`. Needs to be called on all ranks in an SPMD fashion.
+    Returns a :class:`ShardedTensor` with the scalar value 1.
+        Needs to be called on all ranks in an SPMD fashion.
 
     Args:
         sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification
@@ -113,10 +116,172 @@ def ones(sharding_spec: ShardingSpec,
         init_rrefs=init_rrefs,
     )
 
-def init_from_local_shards(local_shards: List[Shard],
-                           sharded_tensor_metadata: ShardedTensorMetadata,
-                           process_group=None,
-                           init_rrefs=False):
+
+def rand(sharding_spec: ShardingSpec,
+         *size,
+         dtype=None,
+         layout=torch.strided,
+         requires_grad=False,
+         pin_memory=False,
+         memory_format=torch.contiguous_format,
+         process_group=None,
+         init_rrefs=False):
+    """
+    Returns a :class:`ShardedTensor` filled with random numbers from a uniform distribution on the
+        interval :math:`[0, 1)`. Needs to be called on all ranks in an SPMD fashion.
+
+    Args:
+        sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...): a sequence of integers defining the shape of the output
+            tensor. Can be a variable number of arguments or a collection like a list or tuple.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    Returns:
+        A :class:`ShardedTensor` object on each rank
+    """
+    tensor_properties = TensorProperties(
+        dtype=dtype, layout=layout, requires_grad=requires_grad,
+        pin_memory=pin_memory, memory_format=memory_format
+    )
+    tensor_init_params = TensorInitParams(create_op=CreateOp.RAND, tensor_properties=tensor_properties, )
+    return ShardedTensor(
+        sharding_spec,
+        *size,
+        tensor_init_params=tensor_init_params,
+        process_group=process_group,
+        init_rrefs=init_rrefs,
+    )
+
+
+def zeros(sharding_spec: ShardingSpec,
+          *size,
+          dtype=None,
+          layout=torch.strided,
+          requires_grad=False,
+          pin_memory=False,
+          memory_format=torch.contiguous_format,
+          process_group=None,
+          init_rrefs=False):
+    """
+    Returns a :class:`ShardedTensor` filled with the scalar value 0.
+        Needs to be called on all ranks in an SPMD fashion.
+
+    Args:
+        sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...): a sequence of integers defining the shape of the output
+            tensor. Can be a variable number of arguments or a collection like a list or tuple.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    Returns:
+        A :class:`ShardedTensor` object on each rank
+    """
+    tensor_properties = TensorProperties(
+        dtype=dtype, layout=layout, requires_grad=requires_grad,
+        pin_memory=pin_memory, memory_format=memory_format,
+    )
+    tensor_init_params = TensorInitParams(create_op=CreateOp.ZEROS, tensor_properties=tensor_properties, )
+    return ShardedTensor(
+        sharding_spec,
+        *size,
+        tensor_init_params=tensor_init_params,
+        process_group=process_group,
+        init_rrefs=init_rrefs,
+    )
+
+
+def full(sharding_spec: ShardingSpec,
+         size,
+         fill_value=torch.types.Number,
+         dtype=None,
+         layout=torch.strided,
+         requires_grad=False,
+         pin_memory=False,
+         memory_format=torch.contiguous_format,
+         process_group=None,
+         init_rrefs=False):
+    """
+    Creates a :class:`ShardedTensor` filled with fill_value. The tensor’s dtype
+        is inferred from fill_value. If dtype is specified, it will override the
+        inferred type from fill_value. Needs to be called on all ranks in an SPMD fashion.
+
+    Args:
+        sharding_spec (:class:`torch.distributed._sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...):  a list, tuple, or `torch.Size` of integers defining the shape of the
+            output tensor.
+        fill_value (Scalar) – the value to fill the output tensor with.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_tensor_type`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    Returns:
+        A :class:`ShardedTensor` object on each rank
+    """
+    tensor_properties = TensorProperties(
+        dtype=dtype, layout=layout, requires_grad=requires_grad,
+        pin_memory=pin_memory, memory_format=memory_format,
+    )
+    tensor_init_params = TensorInitParams(
+        create_op=CreateOp.FULL, fill_value=fill_value, tensor_properties=tensor_properties)
+    return ShardedTensor(
+        sharding_spec,
+        *size,
+        tensor_init_params=tensor_init_params,
+        process_group=process_group,
+        init_rrefs=init_rrefs,
+    )
+
+
+def init_from_local_shards(
+        local_shards: List[Shard],
+        sharded_tensor_metadata: ShardedTensorMetadata,
+        process_group=None,
+        init_rrefs=False):
     """
     Creates an :class:`ShardedTensor` from local shards and the global metadata.
     Needs to be called on all ranks in an SPMD fashion.
diff --git a/torch/distributed/_sharded_tensor/api.py b/torch/distributed/_sharded_tensor/api.py
index ae1a3a9f38844..3b7476dc25bcf 100644
--- a/torch/distributed/_sharded_tensor/api.py
+++ b/torch/distributed/_sharded_tensor/api.py
@@ -22,7 +22,7 @@
     check_tensor,
     validate_non_overlapping_shards_metadata
 )
-
+from torch.types import Number
 
 # Tracking for sharded tensor objects.
 _sharded_tensor_lock = threading.Lock()
@@ -143,17 +143,28 @@ def _register_remote_shards(sharded_tensor_id: int, rrefs: List[rpc.RRef[Shard]]
 
 class CreateOp(Enum):
     EMPTY = 0
-    ONES = 1
+    FULL = 1
+    ONES = 2
+    RAND = 3
+    ZEROS = 4
 
 
 @dataclass
 class TensorInitParams(object):
     """ Container for list of common params to create new local tensor. """
 
-    __slots__ = ['create_op', 'tensor_properties']
-
     create_op: CreateOp
-    tensor_properties: TensorProperties
+
+    # needed when create_op is FULL
+    # default set to False (not None) since None is incompatible with Number.
+    fill_value: Number = field(default=False)
+
+    tensor_properties: TensorProperties = field(
+        default=TensorProperties(dtype=torch.get_default_dtype(),
+                                 layout=torch.strided,
+                                 requires_grad=False,
+                                 memory_format=torch.contiguous_format,
+                                 pin_memory=False))
 
 
 class ShardedTensor(object):
@@ -684,5 +695,26 @@ def _create_tensor_from_params(*size, local_device, tensor_init_params: TensorIn
                            device=local_device, requires_grad=requires_grad,
                            # NB: memory_format param is not accepted by torch.ones
                            memory_format=memory_format, pin_memory=pin_memory,)
+    elif tensor_init_params.create_op == CreateOp.ZEROS:
+        return torch.zeros(*size,
+                           dtype=dtype,
+                           layout=layout,
+                           device=local_device,
+                           pin_memory=pin_memory,
+                           requires_grad=requires_grad,)
+    elif tensor_init_params.create_op == CreateOp.RAND:
+        return torch.rand(*size,
+                          dtype=dtype,
+                          layout=layout,
+                          device=local_device,
+                          pin_memory=pin_memory,
+                          requires_grad=requires_grad,)
+    elif tensor_init_params.create_op == CreateOp.FULL:
+        return torch.full(size=size,
+                          fill_value=tensor_init_params.fill_value,
+                          layout=layout,
+                          dtype=dtype,
+                          requires_grad=requires_grad,
+                          device=local_device, )
     else:
         raise ValueError(f'Unsupported create_op: {tensor_init_params.create_op}')

From 92a154aa29186afea961a6fe491721229543535f Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Thu, 26 Aug 2021 16:00:21 -0700
Subject: [PATCH 271/530] Move variabletype functions around (#63330)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63330

 - This is in preparation for templated/boxed autograd-not-implemented fallback
 - Make sure VariableTypeUtils does not depend on generated code
 - Lift `isFwGradDefined` into `autograd/functions/utils.cpp` so it's available to mobile builds
 - Removes `using namespace at` from VariableTypeUtils, previously we needed this for Templated version, but now its not strictly necessary but still a good change to avoid name conflicts if this header is included elsewhere in the future.

Test Plan: Imported from OSS

Reviewed By: heitorschueroff

Differential Revision: D30518573

Pulled By: soulitzer

fbshipit-source-id: a0fb904baafc9713de609fffec4b813f6cfcc000
---
 tools/autograd/templates/VariableType.cpp  |  1 +
 torch/csrc/autograd/FunctionsManual.cpp    |  6 +--
 torch/csrc/autograd/FunctionsManual.h      |  1 -
 torch/csrc/autograd/VariableTypeManual.cpp |  5 ++-
 torch/csrc/autograd/VariableTypeUtils.h    | 50 ++++++++++------------
 torch/csrc/autograd/functions/utils.h      |  5 +++
 6 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp
index 1ff3604ec21ea..605a700fb1a47 100644
--- a/tools/autograd/templates/VariableType.cpp
+++ b/tools/autograd/templates/VariableType.cpp
@@ -1,4 +1,5 @@
 #include "torch/csrc/autograd/VariableTypeUtils.h"
+#include "torch/csrc/autograd/generated/VariableType.h"
 #include "torch/csrc/autograd/FunctionsManual.h"
 
 #include <ATen/RedispatchFunctions.h>
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 86639c13ea678..95170f073fc38 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -1,5 +1,7 @@
 #include <torch/csrc/autograd/FunctionsManual.h>
 #include <torch/csrc/autograd/variable.h>
+#include <torch/csrc/autograd/functions/utils.h>
+
 
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
@@ -44,10 +46,6 @@ bool isDefined(const c10::optional<Tensor>& t) {
   return t.has_value() && t->defined();
 }
 
-bool isFwGradDefined(const c10::optional<Tensor>& t) {
-  return t.has_value() && t->defined() && t->_fw_grad(/*level */ 0).defined();
-}
-
 Tensor toNonOptTensor(const c10::optional<Tensor>& t) {
   return t.has_value() ? *t : Tensor();
 }
diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h
index d397f55d15189..31a972e3f3280 100644
--- a/torch/csrc/autograd/FunctionsManual.h
+++ b/torch/csrc/autograd/FunctionsManual.h
@@ -31,7 +31,6 @@ struct IndexRangeGenerator {
     size_t i = 0;
 };
 
-bool isFwGradDefined(const c10::optional<Tensor>& t);
 Tensor toNonOptFwGrad(const c10::optional<Tensor>& t);
 Tensor toNonOptPrimal(const c10::optional<Tensor>& t);
 Tensor toNonOptTensor(const c10::optional<Tensor>& t);
diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp
index f409daa9b83d6..25f05fc110177 100644
--- a/torch/csrc/autograd/VariableTypeManual.cpp
+++ b/torch/csrc/autograd/VariableTypeManual.cpp
@@ -2,6 +2,7 @@
 #include <c10/core/ScalarType.h>
 #include <torch/csrc/autograd/VariableTypeUtils.h>
 #include <torch/csrc/autograd/FunctionsManual.h>
+#include <torch/csrc/autograd/functions/utils.h>
 #include <torch/csrc/utils/memory.h>
 #include <torch/csrc/autograd/autograd.h>
 #include <ATen/TracerMode.h>
@@ -100,7 +101,7 @@ Tensor _fw_primal(c10::DispatchKeySet ks, const Tensor & self, int64_t level) {
   if (grad_fn) {
       set_history(flatten_tensor_args( result ), grad_fn);
   }
-  if (generated::details::isFwGradDefined(self)) {
+  if (isFwGradDefined(self)) {
     // Modified from original codegen
     // We explicitly want to ignore the forward grad at the given level
     TORCH_CHECK(level == 0, "Invalid level given to _fw_primal");
@@ -131,7 +132,7 @@ Tensor & copy_(c10::DispatchKeySet ks, Tensor & self, const Tensor & src, bool n
   rebase_history(self , std::move(grad_fn));
 
   if (isDifferentiableType(self.scalar_type()) &&
-      (generated::details::isFwGradDefined(self) || generated::details::isFwGradDefined(src))) {
+      (isFwGradDefined(self) || isFwGradDefined(src))) {
     auto self_fw_grad = generated::details::toNonOptFwGrad(self);
     auto src_fw_grad = generated::details::toNonOptFwGrad(src);
     Tensor new_fw_grad;
diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h
index bde2dc46352da..977e9e4cecd5c 100644
--- a/torch/csrc/autograd/VariableTypeUtils.h
+++ b/torch/csrc/autograd/VariableTypeUtils.h
@@ -1,14 +1,12 @@
 #pragma once
 
 #include <c10/util/irange.h>
-#include <torch/csrc/autograd/generated/VariableType.h>
 
 #include <torch/csrc/autograd/variable.h>
 #include <torch/csrc/autograd/function.h>
 #include <torch/csrc/autograd/edge.h>
 #include <torch/csrc/autograd/grad_mode.h>
 #include <torch/csrc/autograd/saved_variable.h>
-#include <torch/csrc/autograd/generated/Functions.h>
 #include <torch/csrc/autograd/functions/tensor.h>
 #include <torch/csrc/autograd/functions/basic_ops.h>
 #include <torch/csrc/jit/frontend/tracer.h>
@@ -35,9 +33,6 @@
 #endif
 #endif
 
-using namespace at;
-using namespace torch::autograd::generated;
-
 namespace torch { namespace autograd {
 
 // The requires_grad argument is used to know if the inplace operation needs
@@ -47,7 +42,7 @@ namespace torch { namespace autograd {
 // a = torch.rand(2)
 // b = torch.rand(2, requires_grad=True)
 // a.copy_(b)
-inline void check_inplace(const Tensor& tensor, bool requires_grad) {
+inline void check_inplace(const at::Tensor& tensor, bool requires_grad) {
   if (requires_grad && GradMode::is_enabled()) {
     auto diff_view_meta = impl::get_view_autograd_meta(tensor);
     if (diff_view_meta && diff_view_meta->has_bw_view()) {
@@ -65,7 +60,7 @@ inline void check_inplace(const Tensor& tensor, bool requires_grad) {
   }
 }
 
-inline void check_inplace(const TensorList tensors, bool requires_grad) {
+inline void check_inplace(const at::TensorList tensors, bool requires_grad) {
   for (const auto& tensor : tensors) {
     check_inplace(tensor, requires_grad);
   }
@@ -77,14 +72,14 @@ inline void throw_error_out_requires_grad(const char* name) {
       "but one of the arguments requires grad.");
 }
 
-inline void throw_error_for_complex_autograd(const Tensor& tensor, const char* name) {
+inline void throw_error_for_complex_autograd(const at::Tensor& tensor, const char* name) {
   if (tensor.requires_grad()) {
     TORCH_CHECK(!tensor.is_complex(), name,
                 " does not support automatic differentiation for outputs with complex dtype.");
   }
 }
 
-inline void throw_error_for_complex_autograd(const TensorList& tensorlist, const char* name) {
+inline void throw_error_for_complex_autograd(const at::TensorList& tensorlist, const char* name) {
   for (const auto& tensor: tensorlist) {
     throw_error_for_complex_autograd(tensor, name);
   }
@@ -114,7 +109,7 @@ inline void rebase_history(std::vector<Variable>&& vars, std::shared_ptr<Node> g
   }
 }
 
-inline void increment_version(const Tensor & t) {
+inline void increment_version(const at::Tensor & t) {
   impl::bump_version(t);
 }
 
@@ -138,8 +133,8 @@ template<typename... Args> inline variable_list flatten_tensor_args(Args&&... ar
 }
 
 // See NOTE [ Autograd View Variables ] for details.
-inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_differentiable,
-        bool is_fw_differentiable, std::function<Tensor(const Tensor&)> view_func=nullptr,
+inline at::Tensor as_view(const at::Tensor & base, const at::Tensor & tensor, bool is_bw_differentiable,
+        bool is_fw_differentiable, std::function<at::Tensor(const at::Tensor&)> view_func=nullptr,
         CreationMeta creation_meta=CreationMeta::DEFAULT, bool allow_tensor_metadata_change=true) {
   // Note [View of inference tensor]
   // For inference tensor this code can only be hit outside InferenceMode
@@ -202,7 +197,7 @@ inline Tensor as_view(const Tensor & base, const Tensor & tensor, bool is_bw_dif
 }
 
 // See NOTE [ Autograd View Variables ] for details.
-inline std::vector<Tensor> as_view(const Tensor & base, std::vector<Tensor>& tensors, bool is_bw_differentiable,
+inline std::vector<at::Tensor> as_view(const at::Tensor & base, std::vector<at::Tensor>& tensors, bool is_bw_differentiable,
                                    bool is_fw_differentiable, CreationMeta creation_meta=CreationMeta::DEFAULT) {
   // See Note [View of inference tensor]
   if (base.is_inference()) return tensors;
@@ -228,7 +223,7 @@ inline std::vector<Tensor> as_view(const Tensor & base, std::vector<Tensor>& ten
       new_shared_info = ViewInfo(base, /* view_func */ nullptr);
     }
 
-    for(Tensor &tensor : tensors) {
+    for(at::Tensor &tensor : tensors) {
       if (is_fw_differentiable || is_bw_differentiable) {
         tensor = make_variable_differentiable_view(tensor, new_shared_info, c10::nullopt, /*shared_view_info*/ true, creation_meta);
       } else {
@@ -282,7 +277,7 @@ inline std::vector<Tensor> as_view(const Tensor & base, std::vector<Tensor>& ten
     creation_meta = propagate_creation_meta(diff_view_meta->get_creation_meta(), creation_meta);
   }
 
-  for(Tensor &tensor : tensors) {
+  for(at::Tensor &tensor : tensors) {
     if (is_fw_differentiable || is_bw_differentiable) {
       tensor = make_variable_differentiable_view(tensor, new_bw_info, new_fw_info, /*shared_view_info*/ false, creation_meta);
     } else {
@@ -292,20 +287,20 @@ inline std::vector<Tensor> as_view(const Tensor & base, std::vector<Tensor>& ten
   return tensors;
 }
 
-inline void check_no_requires_grad(const Tensor& tensor, const char* name,
+inline void check_no_requires_grad(const at::Tensor& tensor, const char* name,
                                    const char* fn_name="", bool check_grad_mode=true) {
   TORCH_CHECK(!(tensor.defined() && tensor.requires_grad()) || !(check_grad_mode && GradMode::is_enabled()),
               "The function '", fn_name, "' is not differentiable with respect to argument '", name,
               "'. This input cannot have requires_grad True.");
 }
 
-inline void check_no_requires_grad(const c10::optional<Tensor>& tensor, const char* name, const char* fn_name="") {
+inline void check_no_requires_grad(const c10::optional<at::Tensor>& tensor, const char* name, const char* fn_name="") {
   if (tensor.has_value()) {
     check_no_requires_grad(*tensor, name, fn_name);
   }
 }
 
-inline void check_no_requires_grad(TensorList tensors, const char* name, const char* fn_name="") {
+inline void check_no_requires_grad(at::TensorList tensors, const char* name, const char* fn_name="") {
   // GradMode check is expensive, so check it only once for TensorLists
   if (!GradMode::is_enabled()) {
     return;
@@ -315,12 +310,12 @@ inline void check_no_requires_grad(TensorList tensors, const char* name, const c
   }
 }
 
-inline void check_no_requires_grad(const c10::List<c10::optional<Tensor>>& tensors, const char* name, const char* fn_name="") {
+inline void check_no_requires_grad(const c10::List<c10::optional<at::Tensor>>& tensors, const char* name, const char* fn_name="") {
   // GradMode check is expensive, so check it only once for TensorLists
   if (!GradMode::is_enabled()) {
     return;
   }
-  for (c10::optional<Tensor> tensor : tensors) {
+  for (c10::optional<at::Tensor> tensor : tensors) {
     if (tensor.has_value()) {
       check_no_requires_grad(*tensor, name, fn_name, /*check_grad_mode*/ false);
     }
@@ -328,23 +323,23 @@ inline void check_no_requires_grad(const c10::List<c10::optional<Tensor>>& tenso
 }
 
 // Assumed that saved tensor lists are never inplace outputs
-inline std::vector<SavedVariable> make_saved_variable_list(TensorList tensors) {
-  return fmap(tensors, [](const Tensor& tensor) -> SavedVariable {
+inline std::vector<SavedVariable> make_saved_variable_list(at::TensorList tensors) {
+  return fmap(tensors, [](const at::Tensor& tensor) -> SavedVariable {
       return SavedVariable{tensor, false /* is output */}; });
 }
 
 // Assumed that saved tensor lists are never inplace outputs
 inline std::vector<SavedVariable> make_saved_variable_list(const c10::List<c10::optional<at::Tensor>>& tensors) {
-  return fmap(tensors, [](const c10::optional<Tensor>& tensor) -> SavedVariable {
+  return fmap(tensors, [](const c10::optional<at::Tensor>& tensor) -> SavedVariable {
     if (tensor.has_value()) {
       return SavedVariable{*tensor, false /* is output */};
     } else {
-      return SavedVariable{Tensor(), false /* is output */};
+      return SavedVariable{at::Tensor(), false /* is output */};
     }
   });
 }
 
-inline std::vector<std::vector<int64_t>> to_args_sizes(TensorList tensors) {
+inline std::vector<std::vector<int64_t>> to_args_sizes(at::TensorList tensors) {
   std::vector<std::vector<int64_t>> args_sizes(tensors.size());
   for (const auto i : c10::irange(tensors.size())) {
     args_sizes[i] = tensors[i].sizes().vec();
@@ -352,11 +347,12 @@ inline std::vector<std::vector<int64_t>> to_args_sizes(TensorList tensors) {
   return args_sizes;
 }
 
-inline std::vector<ScalarType> to_args_scalartypes(TensorList tensors) {
-  std::vector<ScalarType> args_scalartypes(tensors.size());
+inline std::vector<c10::ScalarType> to_args_scalartypes(at::TensorList tensors) {
+  std::vector<c10::ScalarType> args_scalartypes(tensors.size());
   for (const auto i : c10::irange(tensors.size())) {
     args_scalartypes[i] = tensors[i].scalar_type();
   }
   return args_scalartypes;
 }
+
 }} // namespace torch::autograd
diff --git a/torch/csrc/autograd/functions/utils.h b/torch/csrc/autograd/functions/utils.h
index 90811e2a30a37..331db5d32cb79 100644
--- a/torch/csrc/autograd/functions/utils.h
+++ b/torch/csrc/autograd/functions/utils.h
@@ -86,4 +86,9 @@ inline void set_history(
     set_history(variable, grad_fn);
   }
 }
+
+inline bool isFwGradDefined(const c10::optional<at::Tensor>& t) {
+  return t.has_value() && t->defined() && t->_fw_grad(/*level */ 0).defined();
+}
+
 }}

From dfa35ab3e710848353aa1d313c5d9127ed2ef745 Mon Sep 17 00:00:00 2001
From: Shijun Kong <shijunk@fb.com>
Date: Thu, 26 Aug 2021 16:06:17 -0700
Subject: [PATCH 272/530] [pytorch][quant][oss] Support 2-bit embedding_bag op
 "embedding_bag_2bit_rowwise_offsets" (#63658)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63658

Support 2-bit embedding_bag op "embedding_bag_2bit_rowwise_offsets"

Reviewed By: jingsh, supriyar

Differential Revision: D30454994

fbshipit-source-id: 7aa7bfe405c2ffff639d5658a35181036e162dc9
---
 .../native/quantized/cpu/qembeddingbag.cpp    | 127 +++++++++++++++---
 aten/src/ATen/native/quantized/library.cpp    |   1 +
 test/quantization/core/test_quantized_op.py   |  30 +++++
 3 files changed, 142 insertions(+), 16 deletions(-)

diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
index 7adf05a1782ce..6aae3ba02ae09 100644
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
@@ -141,9 +141,10 @@ at::Tensor& embedding_lookup_fallback_impl(
 }
 
 template <typename IndexType, typename OffsetType>
-at::Tensor& embedding_bag_4bit_impl(
+at::Tensor& embedding_bag_nbit_impl(
     at::Tensor& output,
     const at::Tensor& weight,
+    const int bit_width,
     const at::Tensor& indices,
     const at::Tensor& offsets,
     bool pruned_weights,
@@ -174,8 +175,9 @@ at::Tensor& embedding_bag_4bit_impl(
 
   const auto weight_sizes = weight.sizes();
   const int64_t weight_size = weight_sizes[1];
+  int NUM_ELEM_PER_BYTE = 8 / bit_width;
   const int64_t D =
-      (weight_size - 4) * 2; // NB: 2-byte fp16 scale and 2-byte zero_offset
+      (weight_size - 2 * sizeof(at::Half)) * NUM_ELEM_PER_BYTE; // NB: 2-byte fp16 scale and 2-byte zero_offset
   const int64_t M = offsets.sizes()[0];
 
   int64_t output_size = M - 1;
@@ -211,7 +213,7 @@ at::Tensor& embedding_bag_4bit_impl(
   if (!pruned_weights || fallback_to_no_sparse) {
     // Generate the fbgemm kernel
     auto kernel = fbgemm::GenerateEmbeddingSpMDMNBit<IndexType, OffsetType>(
-        /*bit rate=*/4,
+        /*bit rate=*/bit_width,
         /*block size=*/block_size,
         /*has weights=*/per_sample_weights_.has_value(),
         /*normalize_by_lengths=*/false,
@@ -234,11 +236,13 @@ at::Tensor& embedding_bag_4bit_impl(
 
     TORCH_CHECK(
         success,
-        "FBGEMM GenerateEmbeddingSpMDMNBit kernel failed for 4-bit input");
+        "FBGEMM GenerateEmbeddingSpMDMNBit kernel failed for ",
+        bit_width,
+        "-bit input");
   } else {
     auto kernel =
         fbgemm::GenerateEmbeddingSpMDMNBitRowWiseSparse<IndexType, OffsetType>(
-            /*bit rate=*/4,
+            /*bit rate=*/bit_width,
             /*block_size=*/block_size,
             /*has weights=*/per_sample_weights_.has_value(),
             /*normalize_by_lengths=*/false,
@@ -260,11 +264,14 @@ at::Tensor& embedding_bag_4bit_impl(
         /*compressed_indices_table=*/compressed_indices_mapping_data);
     TORCH_CHECK(
         success,
-        "FBGEMM GenerateEmbeddingSpMDMNBitRowWiseSparse kernel failed for 4-bit input");
+        "FBGEMM GenerateEmbeddingSpMDMNBitRowWiseSparse kernel failed for ",
+        bit_width,
+        "-bit input");
   }
   return output;
 #else
-  return embedding_lookup_fallback_impl<IndexType, OffsetType, 4, 2>(
+  if (bit_width == 4) {
+    return embedding_lookup_fallback_impl<IndexType, OffsetType, 4, 2>(
       weight,
       indices,
       offsets,
@@ -275,6 +282,19 @@ at::Tensor& embedding_bag_4bit_impl(
       output_size,
       include_last_offset,
       (pruned_weights && !fallback_to_no_sparse));
+  }
+  // bit_width == 2
+  return embedding_lookup_fallback_impl<IndexType, OffsetType, 2, 4>(
+    weight,
+    indices,
+    offsets,
+    per_sample_weights_,
+    compressed_indices_mapping,
+    output,
+    D,
+    output_size,
+    include_last_offset,
+    (pruned_weights && !fallback_to_no_sparse));
 #endif
 }
 
@@ -519,9 +539,10 @@ at::Tensor& embedding_bag_byte_helper(
       is_embedding_op);
 }
 
-at::Tensor& embedding_bag_4bit_helper(
+at::Tensor& _embedding_bag_nbit_helper(
     at::Tensor& output,
     const at::Tensor& weight,
+    const int bit_width,
     const at::Tensor& indices,
     const c10::optional<at::Tensor>& offsets_in,
     bool pruned_weights,
@@ -529,6 +550,10 @@ at::Tensor& embedding_bag_4bit_helper(
     const c10::optional<at::Tensor>& compressed_indices_mapping,
     bool include_last_offset) {
   c10::MaybeOwned<at::Tensor> offsets;
+  TORCH_CHECK(
+      bit_width == 4 || bit_width == 2,
+      "qembedding/qembedding_bag operator supports bit_width 2 or 4, got ",
+      bit_width);
   TORCH_CHECK(
       indices.dim() == 1 || indices.dim() == 2,
       "qembedding/qembedding_bag operator supports 1 or 2d indices, got ",
@@ -539,14 +564,14 @@ at::Tensor& embedding_bag_4bit_helper(
   if (indices.dim() == 2) {
     TORCH_CHECK(
         !offsets_in.has_value(),
-        "embedding_bag_4bit operator: input is 2D, then offsets has to be None, as input is treated is a mini-batch of fixed length sequences.");
+        "embedding_bag_4bit/embedding_bag_2bit operator: input is 2D, then offsets has to be None, as input is treated is a mini-batch of fixed length sequences.");
 
     offsets = c10::MaybeOwned<at::Tensor>::owned(at::arange(
         0, indices.numel(), indices.sizes()[1], indices.scalar_type()));
   } else {
     TORCH_CHECK(
         offsets_in.has_value(),
-        "embedding_bag_4bit operator expects offsets to be set for 1D indices.");
+        "embedding_bag_4bit/embedding_bag_2bit operator expects offsets to be set for 1D indices.");
     offsets = c10::MaybeOwned<at::Tensor>::borrowed(offsets_in.value());
   }
 
@@ -568,9 +593,10 @@ at::Tensor& embedding_bag_4bit_helper(
   // Using helper function to support different type combination without the
   // need to cast, which can be additional performance overhead
   if (indices.scalar_type() == at::kInt && offsets->scalar_type() == at::kInt) {
-    return embedding_bag_4bit_impl<int, int>(
+    return embedding_bag_nbit_impl<int, int>(
         output,
         weight,
+        bit_width,
         indices,
         *offsets,
         pruned_weights,
@@ -579,9 +605,10 @@ at::Tensor& embedding_bag_4bit_helper(
         include_last_offset);
   } else if (
       indices.scalar_type() == at::kInt && offsets->scalar_type() == at::kLong) {
-    return embedding_bag_4bit_impl<int, int64_t>(
+    return embedding_bag_nbit_impl<int, int64_t>(
         output,
         weight,
+        bit_width,
         indices,
         *offsets,
         pruned_weights,
@@ -590,9 +617,10 @@ at::Tensor& embedding_bag_4bit_helper(
         include_last_offset);
   } else if (
       indices.scalar_type() == at::kLong && offsets->scalar_type() == at::kInt) {
-    return embedding_bag_4bit_impl<int64_t, int>(
+    return embedding_bag_nbit_impl<int64_t, int>(
         output,
         weight,
+        bit_width,
         indices,
         *offsets,
         pruned_weights,
@@ -600,9 +628,10 @@ at::Tensor& embedding_bag_4bit_helper(
         compressed_indices_mapping,
         include_last_offset);
   }
-  return embedding_bag_4bit_impl<int64_t, int64_t>(
+  return embedding_bag_nbit_impl<int64_t, int64_t>(
       output,
       weight,
+      bit_width,
       indices,
       *offsets,
       pruned_weights,
@@ -650,9 +679,10 @@ at::Tensor PackedEmbeddingBagWeight::embeddingbag_4bit(
   }
 
   auto output = at::empty({0}, packed_w.options().dtype(at::kFloat));
-  return embedding_bag_4bit_helper(
+  return _embedding_bag_nbit_helper(
     output,
     packed_w,
+    4,
     indices,
     offsets_in,
     pruned_weights,
@@ -709,9 +739,44 @@ Tensor& embedding_bag_4bit_rowwise_offsets_out(
         per_sample_weights_.value().scalar_type(),
         " instead")
   }
-  return embedding_bag_4bit_helper(
+  return _embedding_bag_nbit_helper(
+      output,
+      weight,
+      4,
+      indices,
+      offsets_in,
+      pruned_weights,
+      per_sample_weights_.has_value()
+          ? per_sample_weights_.value().to(at::kFloat)
+          : per_sample_weights_,
+      compressed_indices_mapping,
+      include_last_offset);
+}
+
+Tensor& embedding_bag_2bit_rowwise_offsets_out(
+    Tensor& output,
+    const Tensor& weight,
+    const Tensor& indices,
+    const c10::optional<Tensor>& offsets_in,
+    const bool /* scale_grad_by_freq */,
+    const int64_t /* mode */,
+    bool pruned_weights,
+    const c10::optional<Tensor>& per_sample_weights_,
+    const c10::optional<Tensor>& compressed_indices_mapping,
+    bool include_last_offset) {
+
+  if (per_sample_weights_.has_value()) {
+    TORCH_CHECK(
+        (per_sample_weights_.value().scalar_type() == at::kFloat ||
+         per_sample_weights_.value().scalar_type() == at::kHalf),
+        "Expect fp32 or fp16 weights, but found",
+        per_sample_weights_.value().scalar_type(),
+        " instead")
+  }
+  return _embedding_bag_nbit_helper(
       output,
       weight,
+      2,
       indices,
       offsets_in,
       pruned_weights,
@@ -784,6 +849,33 @@ Tensor embedding_bag_4bit_rowwise_offsets(
   return output;
 }
 
+Tensor embedding_bag_2bit_rowwise_offsets(
+    const Tensor& weight,
+    const Tensor& indices,
+    const c10::optional<Tensor>& offsets_in,
+    const bool /* scale_grad_by_freq */,
+    const int64_t /* mode */,
+    bool pruned_weights,
+    const c10::optional<Tensor>& per_sample_weights_,
+    const c10::optional<Tensor>& compressed_indices_mapping,
+    bool include_last_offset) {
+
+  auto output = create_empty_from(weight, at::kFloat);
+  embedding_bag_2bit_rowwise_offsets_out(
+    output,
+    weight,
+    indices,
+    offsets_in,
+    false, // unused scale_grad_by_freq
+    0, // unused mode
+    pruned_weights,
+    per_sample_weights_,
+    compressed_indices_mapping,
+    include_last_offset
+  );
+  return output;
+}
+
 template <int bit_rate>
 class QEmbeddingBag final {
  public:
@@ -869,6 +961,9 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_rowwise_offsets"),
       embedding_bag_4bit_rowwise_offsets);
+  m.impl(
+      TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_rowwise_offsets"),
+      embedding_bag_2bit_rowwise_offsets);
 }
 } // namespace
 } // namespace native
diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp
index 7cdb5cb35817a..8ead74f326ff2 100644
--- a/aten/src/ATen/native/quantized/library.cpp
+++ b/aten/src/ATen/native/quantized/library.cpp
@@ -128,6 +128,7 @@ TORCH_LIBRARY(quantized, m) {
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, bool pruned_weights=False) -> Tensor"));
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 18212671aabaa..9243fe2440173 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -3318,6 +3318,9 @@ def embedding_bag_rowwise_offsets_run(
         if bit_rate == 4:
             pt_op = torch.ops.quantized.embedding_bag_4bit_rowwise_offsets
             pt_prepack_op = torch.ops.quantized.embedding_bag_4bit_prepack
+        elif bit_rate == 2:
+            pt_op = torch.ops.quantized.embedding_bag_2bit_rowwise_offsets
+            pt_prepack_op = torch.ops.quantized.embedding_bag_2bit_prepack
 
         weights = torch.from_numpy((np.random.random_sample((
             num_embeddings, embedding_dim)) + 1).astype(np.float32))
@@ -3483,6 +3486,33 @@ def test_embedding_bag_4bit(self, num_embeddings,
                                                sparsity=sparsity,
                                                atol=0.1, rtol=1e-2)
 
+    """ Tests the correctness of the embedding_bag_2bit quantized operator """
+    @given(num_embeddings=st.integers(10, 100),
+           embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0),
+           num_offsets=st.integers(1, 20),
+           use_32bit_indices=st.booleans(),
+           use_32bit_offsets=st.booleans(),
+           enable_per_sample_weights=st.booleans(),
+           include_last_offset=st.booleans(),
+           fallback_to_no_sparse=st.booleans(),
+           sparsity=st.sampled_from([0.0, 0.5, 0.7]))
+    def test_embedding_bag_2bit(self, num_embeddings,
+                                embedding_dim, num_offsets,
+                                use_32bit_indices,
+                                use_32bit_offsets,
+                                enable_per_sample_weights,
+                                include_last_offset,
+                                fallback_to_no_sparse,
+                                sparsity):
+        self.embedding_bag_rowwise_offsets_run(2, num_embeddings,
+                                               embedding_dim, num_offsets,
+                                               use_32bit_indices, use_32bit_offsets,
+                                               enable_per_sample_weights,
+                                               include_last_offset,
+                                               fallback_to_no_sparse,
+                                               sparsity=sparsity,
+                                               atol=1.0, rtol=1e-1)
+
     """ Tests the correctness of the quantized embedding lookup operator """
     @given(num_embeddings=st.integers(10, 100),
            embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0))

From 0bd8d0951dcb4063c0f7552a7404bd7f0e7b6e6f Mon Sep 17 00:00:00 2001
From: Don Jang <djang@fb.com>
Date: Thu, 26 Aug 2021 16:28:35 -0700
Subject: [PATCH 273/530] [Static Runtime] Remove unnecessary fb::equally_split
 nodes (#64022)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64022

Test Plan: - Added unittest `StaticRuntime.RemoveEquallySplitListUnpack`.

Reviewed By: hlu1

Differential Revision: D30472189

fbshipit-source-id: 36040b0146f4be9d0d0fda293f7205f43aad0b87
---
 torch/csrc/jit/runtime/static/passes.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp
index c8e1107199528..1133e3924c32a 100644
--- a/torch/csrc/jit/runtime/static/passes.cpp
+++ b/torch/csrc/jit/runtime/static/passes.cpp
@@ -412,6 +412,7 @@ void ReplaceWithCopy(
 // c10::AliasAnalysisKind::PURE_FUNCTION to make alias analysis work.
 void FuseListUnpack(std::shared_ptr<torch::jit::Graph>& graph) {
   auto nodes = graph->nodes();
+  std::vector<Node*> equally_splits_to_remove;
   for (auto it = nodes.begin(); it != nodes.end(); ++it) {
     Node* node = *it;
     const char* node_qual_string = node->kind().toQualString();
@@ -445,8 +446,22 @@ void FuseListUnpack(std::shared_ptr<torch::jit::Graph>& graph) {
       it_next.destroyCurrent(); // remove list_unpack
 
       node->eraseOutput(0);
+
+      if (strcmp(node_qual_string, "fb::equally_split") == 0 &&
+          node->outputs().size() == 1) {
+        // This captures a case of `y = fb::equally_split(x, 1, _)` where y
+        // becomes just an alias of x.
+        // If this case is found, replace y with x to avoid executing this op.
+        equally_splits_to_remove.push_back(node);
+      }
     }
   }
+
+  for (Node* node : equally_splits_to_remove) {
+    node->output(0)->replaceAllUsesWith(node->input(0));
+    node->destroy();
+  }
+
 #ifndef NDEBUG
   graph->lint();
   AliasDb db2(graph);

From ed573a8e08fadaa611d568294df14c0a96dc4a81 Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@fb.com>
Date: Thu, 26 Aug 2021 16:49:13 -0700
Subject: [PATCH 274/530] Enable test_api IMethodTest in OSS (#63345)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63345

This diff did the following few things to enable the tests:
1. Exposed IMethod as TORCH_API.
2. Linked torch_deploy to test_api if USE_DEPLOY == 1.
3. Generated torch::deploy examples when building torch_deploy library.

Test Plan: ./build/bin/test_api --gtest_filter=IMethodTest.*

Reviewed By: ngimel

Differential Revision: D30346257

Pulled By: alanwaketan

fbshipit-source-id: 932ae7d45790dfb6e00c51893933a054a0fad86d
---
 .jenkins/pytorch/test.sh               |  5 ++-
 test/cpp/api/CMakeLists.txt            |  8 +++++
 test/cpp/api/imethod.cpp               | 44 ++++++++++++++++----------
 torch/csrc/api/include/torch/imethod.h |  2 +-
 4 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index daa0da7eeca26..4eb1b35253c91 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -253,6 +253,7 @@ test_libtorch() {
     ln -sf "$TORCH_LIB_DIR"/libbackend_with_compiler.so "$TORCH_BIN_DIR"
     ln -sf "$TORCH_LIB_DIR"/libjitbackend_test.so "$TORCH_BIN_DIR"
     ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
+    ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
     ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
     ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
 
@@ -275,7 +276,8 @@ test_libtorch() {
     python test/cpp/jit/tests_setup.py shutdown
     # Wait for background download to finish
     wait
-    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" "$TORCH_BIN_DIR"/test_api --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
+    # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy.
+    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
     "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
     "$TORCH_BIN_DIR"/test_mobile_nnc --gtest_output=xml:$TEST_REPORTS_DIR/test_mobile_nnc.xml
     if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-py3* ]]; then
@@ -488,6 +490,7 @@ test_torch_deploy() {
   ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
   ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
   "$TORCH_BIN_DIR"/test_deploy
+  "$TORCH_BIN_DIR"/test_api --gtest_filter='IMethodTest.*'
   assert_git_not_dirty
 }
 
diff --git a/test/cpp/api/CMakeLists.txt b/test/cpp/api/CMakeLists.txt
index 9bd9d6780fe7d..fc21afaef6a8a 100644
--- a/test/cpp/api/CMakeLists.txt
+++ b/test/cpp/api/CMakeLists.txt
@@ -41,6 +41,10 @@ set(TORCH_API_TEST_SOURCES
   ${TORCH_API_TEST_DIR}/grad_mode.cpp
 )
 
+if(USE_DEPLOY)
+  list(APPEND TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/imethod.cpp)
+endif()
+
 if(USE_CUDA)
   list(APPEND TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/parallel.cpp)
 endif()
@@ -59,6 +63,10 @@ if(USE_CUDA)
   target_compile_definitions(test_api PRIVATE "USE_CUDA")
 endif()
 
+if(USE_DEPLOY)
+  target_link_libraries(test_api PRIVATE torch_deploy)
+endif()
+
 # Workaround for https://github.com/pytorch/pytorch/issues/40941
 if(USE_OPENMP AND CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0.0))
   # Compiling transformer.cpp or pow_test.cpp with -O2+ and both -fuse-openmp and -faligned-newout any optimization
diff --git a/test/cpp/api/imethod.cpp b/test/cpp/api/imethod.cpp
index 3349d1b3a8a45..8673e55fb5629 100644
--- a/test/cpp/api/imethod.cpp
+++ b/test/cpp/api/imethod.cpp
@@ -8,30 +8,40 @@
 using namespace ::testing;
 using namespace caffe2;
 
-// TODO(T96218435): Enable the following tests in OSS.
+const char* simple = "torch/csrc/deploy/example/generated/simple";
+const char* simpleJit = "torch/csrc/deploy/example/generated/simple_jit";
+
+// TODO(jwtan): Try unifying cmake and buck for getting the path.
+const char* path(const char* envname, const char* path) {
+  const char* env = getenv(envname);
+  return env ? env : path;
+}
+
+// Run `python torch/csrc/deploy/example/generate_examples.py` before running the following tests.
+// TODO(jwtan): Figure out a way to automate the above step for development. (CI has it already.)
 TEST(IMethodTest, CallMethod) {
-  auto script_model = torch::jit::load(getenv("SIMPLE_JIT"));
-  auto script_method = script_model.get_method("forward");
+  auto scriptModel = torch::jit::load(path("SIMPLE_JIT", simpleJit));
+  auto scriptMethod = scriptModel.get_method("forward");
 
   torch::deploy::InterpreterManager manager(3);
-  torch::deploy::Package p = manager.load_package(getenv("SIMPLE"));
-  auto py_model = p.load_pickle("model", "model.pkl");
-  torch::deploy::PythonMethodWrapper py_method(py_model, "forward");
+  torch::deploy::Package package = manager.load_package(path("SIMPLE", simple));
+  auto pyModel = package.load_pickle("model", "model.pkl");
+  torch::deploy::PythonMethodWrapper pyMethod(pyModel, "forward");
 
   auto input = torch::ones({10, 20});
-  auto output_py = py_method({input});
-  auto output_script = script_method({input});
-  EXPECT_TRUE(output_py.isTensor());
-  EXPECT_TRUE(output_script.isTensor());
-  auto output_py_tensor = output_py.toTensor();
-  auto output_script_tensor = output_script.toTensor();
-
-  EXPECT_TRUE(output_py_tensor.equal(output_script_tensor));
-  EXPECT_EQ(output_py_tensor.numel(), 200);
+  auto outputPy = pyMethod({input});
+  auto outputScript = scriptMethod({input});
+  EXPECT_TRUE(outputPy.isTensor());
+  EXPECT_TRUE(outputScript.isTensor());
+  auto outputPyTensor = outputPy.toTensor();
+  auto outputScriptTensor = outputScript.toTensor();
+
+  EXPECT_TRUE(outputPyTensor.equal(outputScriptTensor));
+  EXPECT_EQ(outputPyTensor.numel(), 200);
 }
 
 TEST(IMethodTest, GetArgumentNames) {
-  auto scriptModel = torch::jit::load(getenv("SIMPLE_JIT"));
+  auto scriptModel = torch::jit::load(path("SIMPLE_JIT", simpleJit));
   auto scriptMethod = scriptModel.get_method("forward");
 
   auto& scriptNames = scriptMethod.getArgumentNames();
@@ -39,7 +49,7 @@ TEST(IMethodTest, GetArgumentNames) {
   EXPECT_STREQ(scriptNames[0].c_str(), "input");
 
   torch::deploy::InterpreterManager manager(3);
-  torch::deploy::Package package = manager.load_package(getenv("SIMPLE"));
+  torch::deploy::Package package = manager.load_package(path("SIMPLE", simple));
   auto pyModel = package.load_pickle("model", "model.pkl");
   torch::deploy::PythonMethodWrapper pyMethod(pyModel, "forward");
 
diff --git a/torch/csrc/api/include/torch/imethod.h b/torch/csrc/api/include/torch/imethod.h
index dfabf50ce7191..af010785a8016 100644
--- a/torch/csrc/api/include/torch/imethod.h
+++ b/torch/csrc/api/include/torch/imethod.h
@@ -4,7 +4,7 @@
 
 namespace torch {
 
-class IMethod {
+class TORCH_API IMethod {
   /*
   IMethod provides a portable interface for torch methods, whether
   they are backed by torchscript or python/deploy.

From 81764d1153c607e324390dcac107ea0970ba668c Mon Sep 17 00:00:00 2001
From: Michael Dagitses <mikeyd@fb.com>
Date: Thu, 26 Aug 2021 17:26:52 -0700
Subject: [PATCH 275/530] document that `torch.triangular_solve` has optional
 out= parameter (#63253)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63253

Fixes #57955

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D30312134

Pulled By: dagitses

fbshipit-source-id: 4f484620f5754f4324a99bbac1ff783c64cee6b8
---
 torch/_torch_docs.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index a4f3bdaef7df7..bbb8d981ab8a2 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -9679,7 +9679,7 @@ def merge_dicts(*dicts):
 
 add_docstr(torch.triangular_solve,
            r"""
-triangular_solve(b, A, upper=True, transpose=False, unitriangular=False) -> (Tensor, Tensor)
+triangular_solve(b, A, upper=True, transpose=False, unitriangular=False, *, out=None) -> (Tensor, Tensor)
 
 Solves a system of equations with a triangular coefficient matrix :math:`A`
 and multiple right-hand sides :math:`b`.
@@ -9706,6 +9706,10 @@ def merge_dicts(*dicts):
         If True, the diagonal elements of :math:`A` are assumed to be
         1 and not referenced from :math:`A`. Default: ``False``.
 
+Keyword args:
+    out ((Tensor, Tensor), optional): tuple of two tensors to write
+        the output to. Ignored if `None`. Default: `None`.
+
 Returns:
     A namedtuple `(solution, cloned_coefficient)` where `cloned_coefficient`
     is a clone of :math:`A` and `solution` is the solution :math:`X` to :math:`AX = b`

From 0c9dce90ed6a12d81b0e769b76d6b0c282326823 Mon Sep 17 00:00:00 2001
From: Paul Johnson <johnsonpaul@fb.com>
Date: Thu, 26 Aug 2021 17:28:35 -0700
Subject: [PATCH 276/530] [pytorch] add per_sample_weights support for
 embedding_bag_4bit_rowwise_offsets (#63605)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63605

Reviewed By: houseroad

Differential Revision: D30434664

fbshipit-source-id: eb4cbae3c705f9dec5c073a56f0f23daee353bc1
---
 .../native/quantized/cuda/embedding_bag.cu    | 51 +++++++++++++------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/native/quantized/cuda/embedding_bag.cu b/aten/src/ATen/native/quantized/cuda/embedding_bag.cu
index 6d44ce0f41873..55b0b0d4f36d0 100644
--- a/aten/src/ATen/native/quantized/cuda/embedding_bag.cu
+++ b/aten/src/ATen/native/quantized/cuda/embedding_bag.cu
@@ -56,15 +56,15 @@ dequantize_intx(uint32_t packedVals, float2 scale_bias, uint8_t offset_bits) {
 
 template <uint8_t bits_per_dim>
 __forceinline__ __device__ void
-accumulate_packed_intx(float4* acc, uint32_t packedVals, float2 scale_bias) {
+accumulate_packed_intx(float4* acc, uint32_t packedVals, float2 scale_bias, float sample_weight) {
   constexpr uint8_t dims_per_byte = 8 / bits_per_dim;
   for (uint8_t i = 0; i < dims_per_byte; i++) {
     float4 res = dequantize_intx<bits_per_dim>(packedVals, scale_bias, 4 * bits_per_dim * i /* offset_bits */);
     // Accumulate in float32.
-    acc[i].x += res.x;
-    acc[i].y += res.y;
-    acc[i].z += res.z;
-    acc[i].w += res.w;
+    acc[i].x += (res.x * sample_weight);
+    acc[i].y += (res.y * sample_weight);
+    acc[i].z += (res.z * sample_weight);
+    acc[i].w += (res.w * sample_weight);
   }
 }
 
@@ -77,7 +77,7 @@ __global__ void embedding_bag_nbits_rowwise_offsets_kernel(
     const PackedTensorAccessor32<index_t, 1, RestrictPtrTraits> indices,
     const PackedTensorAccessor32<index_t, 1, RestrictPtrTraits> offsets,
     const bool /* pruned_weights */,
-    const c10::optional<Tensor>& per_sample_weights_,
+    const PackedTensorAccessor32<float, 1, RestrictPtrTraits> per_sample_weights_,
     const c10::optional<Tensor>& compressed_indices_mapping,
     const bool include_last_offset,
     PackedTensorAccessor32<float, 2, RestrictPtrTraits> output) {
@@ -96,6 +96,8 @@ __global__ void embedding_bag_nbits_rowwise_offsets_kernel(
 
   const int32_t D_bytes = weight.size(1);
 
+  bool use_per_sample = per_sample_weights_.size(0) > 0;
+
   int64_t indices_start = offsets[t * B + b];
   int64_t indices_end;
   if (include_last_offset) {
@@ -124,6 +126,7 @@ __global__ void embedding_bag_nbits_rowwise_offsets_kernel(
     }
     for (int32_t l = indices_start; l < indices_end; ++l) {
       int64_t idx = indices[l];
+      float sample_weight = use_per_sample ? per_sample_weights_[l] : 1.0f;
       const uint8_t* __restrict__ row = &weights[idx * D_bytes];
       float2 scale_bias;
       if (fp32_scale_bias) {
@@ -138,7 +141,7 @@ __global__ void embedding_bag_nbits_rowwise_offsets_kernel(
 
       uint32_t v0 = reinterpret_cast<const uint32_t*>(&row[byte_offset])[0];
 
-      accumulate_packed_intx<bits_per_dim>(accumulator, v0, scale_bias);
+      accumulate_packed_intx<bits_per_dim>(accumulator, v0, scale_bias, sample_weight);
     }
 
 
@@ -204,9 +207,11 @@ at::Tensor& embedding_bag_byte_impl(
   const int D = weight_sizes[1] - 8; // NB: -8 to account for scale and bias
   const int64_t M = offsets.sizes()[0];
   TORCH_CHECK(D % 4 == 0);
-  TORCH_CHECK(
-      !per_sample_weights_.has_value(),
-      "Per sample weights not yet implemented for embedding_bag_byte_rowwise_offsets_cuda");
+  if(per_sample_weights_.has_value()) {
+      TORCH_CHECK(per_sample_weights_.value().scalar_type() == at::kFloat,
+              "Per sample weights expected scalar type ", at::kFloat, " but got ",
+              per_sample_weights_.value().scalar_type());
+  }
   TORCH_CHECK(
       !compressed_indices_mapping.has_value(),
       "Compressed indices mapping not yet implemented for embedding_bag_byte_rowwise_offsets_cuda");
@@ -215,6 +220,13 @@ at::Tensor& embedding_bag_byte_impl(
 
   int64_t output_size = include_last_offset ? M - 1 : M;
 
+  at::Tensor sample_weights;
+  if (per_sample_weights_.has_value()) {
+      sample_weights = per_sample_weights_.value();
+  } else {
+      sample_weights = create_empty_from(output, kFloat);
+  }
+
   const std::vector<int64_t> shape = {output_size, D};
   at::native::resize_(output, shape, c10::nullopt);
   AT_DISPATCH_INDEX_TYPES(
@@ -228,7 +240,7 @@ at::Tensor& embedding_bag_byte_impl(
             indices.packed_accessor32<index_t, 1, RestrictPtrTraits>(),
             offsets.packed_accessor32<index_t, 1, RestrictPtrTraits>(),
             false /* pruned_weights */,
-            per_sample_weights_,
+            sample_weights.packed_accessor32<float, 1, RestrictPtrTraits>(),
             compressed_indices_mapping,
             include_last_offset,
             output.packed_accessor32<float, 2, RestrictPtrTraits>());
@@ -377,9 +389,11 @@ at::Tensor& embedding_bag_4bit_impl(
   const int D = 2*(weight_sizes[1] - 4); // NB: -4 to account for scale and bias @fp16
   const int64_t M = offsets.sizes()[0];
   TORCH_CHECK(D % 8 == 0);
-  TORCH_CHECK(
-      !per_sample_weights_.has_value(),
-      "Per sample weights not yet implemented for embedding_bag_byte_rowwise_offsets_cuda");
+  if(per_sample_weights_.has_value()) {
+      TORCH_CHECK(per_sample_weights_.value().scalar_type() == at::kFloat,
+              "Per sample weights expected scalar type ", at::kFloat, " but got ",
+              per_sample_weights_.value().scalar_type());
+  }
   TORCH_CHECK(
       !compressed_indices_mapping.has_value(),
       "Compressed indices mapping not yet implemented for embedding_bag_byte_rowwise_offsets_cuda");
@@ -388,6 +402,13 @@ at::Tensor& embedding_bag_4bit_impl(
 
   int64_t output_size = include_last_offset ? M - 1 : M;
 
+  at::Tensor sample_weights;
+  if (per_sample_weights_.has_value()) {
+      sample_weights = per_sample_weights_.value();
+  } else {
+      sample_weights = create_empty_from(output, kFloat);
+  }
+
   const std::vector<int64_t> shape = {output_size, D};
   at::native::resize_(output, shape, c10::nullopt);
   AT_DISPATCH_INDEX_TYPES(
@@ -401,7 +422,7 @@ at::Tensor& embedding_bag_4bit_impl(
             indices.packed_accessor32<index_t, 1, RestrictPtrTraits>(),
             offsets.packed_accessor32<index_t, 1, RestrictPtrTraits>(),
             false /* pruned_weights */,
-            per_sample_weights_,
+            sample_weights.packed_accessor32<float, 1, RestrictPtrTraits>(),
             compressed_indices_mapping,
             include_last_offset,
             output.packed_accessor32<float, 2, RestrictPtrTraits>());

From a5f35ac7cd12c32227fdcd42979dc9c6aea7ba07 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@fb.com>
Date: Thu, 26 Aug 2021 17:36:56 -0700
Subject: [PATCH 277/530] Run through failures on trunk (#64063)

Summary:
This PR runs all the tests on trunk instead of stopping on first failure.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64063

Reviewed By: malfet, seemethere

Differential Revision: D30592020

Pulled By: janeyx99

fbshipit-source-id: 318b225cdf918a98f73e752d1cc0227d9227f36c
---
 .github/templates/bazel_ci_workflow.yml.j2                      | 2 ++
 .github/templates/linux_ci_workflow.yml.j2                      | 2 ++
 .github/templates/windows_ci_workflow.yml.j2                    | 1 +
 .../workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml    | 2 ++
 .../workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml    | 2 ++
 .../workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml    | 2 ++
 .../workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml    | 2 ++
 .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml       | 2 ++
 .../workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml  | 2 ++
 .../generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml     | 2 ++
 .../workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml    | 1 +
 .github/workflows/generated-win-vs2019-cpu-py3.yml              | 1 +
 .github/workflows/generated-win-vs2019-cuda10.1-py3.yml         | 1 +
 .github/workflows/generated-win-vs2019-cuda11.3-py3.yml         | 1 +
 14 files changed, 23 insertions(+)

diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2
index d25ffe6d8a7e5..f4e0034a0f5d6 100644
--- a/.github/templates/bazel_ci_workflow.yml.j2
+++ b/.github/templates/bazel_ci_workflow.yml.j2
@@ -29,6 +29,7 @@ on:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: !{{ build_environment }}-build-and-test
       NUM_TEST_SHARDS: !{{ num_test_shards }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
       - name: Log in to ECR
         run: |
@@ -121,6 +122,7 @@ on:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e CONTINUE_THROUGH_ERROR \
             -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index d9af899b04b66..52c0a09a9e1c5 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -286,6 +286,7 @@ jobs:
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
       - name: Log in to ECR
         run: |
@@ -372,6 +373,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2
index 1be7b325306d5..84a30bda92a36 100644
--- a/.github/templates/windows_ci_workflow.yml.j2
+++ b/.github/templates/windows_ci_workflow.yml.j2
@@ -195,6 +195,7 @@ jobs:
       https_proxy: "!{{ squid_proxy }}"
       RUN_SMOKE_TESTS_ONLY_ON_PR: !{{ only_run_smoke_tests_on_pull_request }}
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     needs: [build, generate-test-matrix, !{{ ciflow_config.root_job_name }}]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 769efcaa80a3b..c51f8f047e986 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -262,6 +262,7 @@ jobs:
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
       - name: Log in to ECR
         run: |
@@ -351,6 +352,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index ddd81c079df7c..014b1d1162d07 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -262,6 +262,7 @@ jobs:
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
       - name: Log in to ECR
         run: |
@@ -351,6 +352,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 5a888d0104174..76b973eebce24 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -262,6 +262,7 @@ jobs:
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
       - name: Log in to ECR
         run: |
@@ -351,6 +352,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 25d74de36dc66..8114bd541fdb3 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -262,6 +262,7 @@ jobs:
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
       - name: Log in to ECR
         run: |
@@ -351,6 +352,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 341f9e6da6e8c..b5f062c53cb05 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -262,6 +262,7 @@ jobs:
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
       - name: Log in to ECR
         run: |
@@ -351,6 +352,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 7ca389635bf56..71a9bf76dac22 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -123,6 +123,7 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: linux-xenial-py3.6-gcc7-bazel-test-build-and-test
       NUM_TEST_SHARDS: 1
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
       - name: Log in to ECR
         run: |
@@ -220,6 +221,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e CONTINUE_THROUGH_ERROR \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 470fdaaad4230..7b947790902ec 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -260,6 +260,7 @@ jobs:
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
       - name: Log in to ECR
         run: |
@@ -349,6 +350,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
index ce4540b79cee7..6c87f40accd64 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
@@ -158,6 +158,7 @@ jobs:
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       RUN_SMOKE_TESTS_ONLY_ON_PR: False
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     needs: [build, generate-test-matrix, ciflow_should_run]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index d868d19d0fc2c..30f328ae71fdd 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -150,6 +150,7 @@ jobs:
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       RUN_SMOKE_TESTS_ONLY_ON_PR: False
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     needs: [build, generate-test-matrix, ciflow_should_run]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index 4d4550c9ce06b..72dd21dce3899 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -160,6 +160,7 @@ jobs:
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       RUN_SMOKE_TESTS_ONLY_ON_PR: True
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     needs: [build, generate-test-matrix, ciflow_should_run]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index c5ae48a888938..eb6e02fb5c2f3 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -160,6 +160,7 @@ jobs:
       https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128"
       RUN_SMOKE_TESTS_ONLY_ON_PR: False
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     needs: [build, generate-test-matrix, ciflow_should_run]
     strategy:
       matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}

From 0e8c3c51d9a2a0d364f2707d4131ab12229dc826 Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Thu, 26 Aug 2021 17:59:59 -0700
Subject: [PATCH 278/530] port glu to use structured kernel approach (#61800)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61800

resubmitting because the [last one](https://github.com/pytorch/pytorch/pull/61433) was unrecoverable due to making changes incorrectly in the stack

Test Plan: Imported from OSS

Reviewed By: iramazanli

Differential Revision: D29812492

Pulled By: makslevental

fbshipit-source-id: c3dfeacd1e00a526e24fbaab02dad48069d690ef
---
 aten/src/ATen/native/Activation.h          |  2 +-
 aten/src/ATen/native/GatedLinearUnit.cpp   | 32 +++++++++++-----------
 aten/src/ATen/native/cpu/Activation.cpp    |  2 +-
 aten/src/ATen/native/cuda/Activation.cu    |  2 +-
 aten/src/ATen/native/native_functions.yaml |  6 ++--
 5 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/aten/src/ATen/native/Activation.h b/aten/src/ATen/native/Activation.h
index 01782fae1de3f..f0c6d82af2b29 100644
--- a/aten/src/ATen/native/Activation.h
+++ b/aten/src/ATen/native/Activation.h
@@ -51,7 +51,7 @@ DECLARE_DISPATCH(softshrink_fn, softshrink_stub);
 DECLARE_DISPATCH(shrink_backward_fn, shrink_backward_stub);
 DECLARE_DISPATCH(leaky_relu_fn, leaky_relu_stub);
 DECLARE_DISPATCH(leaky_relu_backward_fn, leaky_relu_backward_stub);
-DECLARE_DISPATCH(activation_fn, glu_stub);
+DECLARE_DISPATCH(structured_activation_fn, glu_stub);
 DECLARE_DISPATCH(activation_backward_fn, glu_backward_stub);
 DECLARE_DISPATCH(structured_activation_fn, silu_stub);
 DECLARE_DISPATCH(structured_activation_backward_fn, silu_backward_stub);
diff --git a/aten/src/ATen/native/GatedLinearUnit.cpp b/aten/src/ATen/native/GatedLinearUnit.cpp
index a0e2c16ed645f..c585caa71a011 100644
--- a/aten/src/ATen/native/GatedLinearUnit.cpp
+++ b/aten/src/ATen/native/GatedLinearUnit.cpp
@@ -3,12 +3,11 @@
 #include <ATen/native/Activation.h>
 
 namespace at {
-namespace native {
-
-DEFINE_DISPATCH(glu_stub);
-DEFINE_DISPATCH(glu_backward_stub);
 
-Tensor& glu_out(const Tensor& self, int64_t dim, Tensor &result) {
+namespace meta {
+TORCH_META_FUNC(glu) (
+    const Tensor& self, int64_t dim
+) {
   // this can't pass anyway because a 0-dimensional tensor has "size" 1, which
   // can't be evenly halved, but give a nicer error message here.
   TORCH_CHECK(self.dim() > 0, "glu does not support 0-dimensional tensors");
@@ -16,23 +15,24 @@ Tensor& glu_out(const Tensor& self, int64_t dim, Tensor &result) {
   const int64_t nIn = self.size(wrap_dim);
   TORCH_CHECK(nIn % 2 == 0, "Halving dimension must be even, but dimension ",
               wrap_dim, " is size ", nIn);
+
   // size output to half of input
   const int64_t selfSize = nIn / 2;
-  auto newSizes = self.sizes().vec();
-  newSizes[wrap_dim] = selfSize;
-  result.resize_(newSizes);
-  // half tensor
   Tensor firstHalf = self.narrow(wrap_dim, 0, selfSize);
   Tensor secondHalf = self.narrow(wrap_dim, selfSize, selfSize);
-
-  auto iter = TensorIterator::borrowing_binary_op(result, firstHalf, secondHalf);
-  glu_stub(iter.device_type(), iter);
-  return result;
+  build_borrowing_binary_op(maybe_get_output(), firstHalf, secondHalf);
 }
+} // namespace meta
+
+namespace native {
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+DEFINE_DISPATCH(glu_stub);
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+DEFINE_DISPATCH(glu_backward_stub);
 
-Tensor glu(const Tensor& self, int64_t dim) {
-  auto result = at::empty({0}, self.options());
-  return at::glu_out(result, self, dim);
+TORCH_IMPL_FUNC(glu_out) (const Tensor& self, int64_t dim, const Tensor& out) {
+  glu_stub(device_type(), *this);
 }
 
 Tensor& glu_backward_cpu_out(const Tensor& grad_output, const Tensor& input,
diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
index fc5cc0d1924fb..34b54719fe502 100644
--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@@ -519,7 +519,7 @@ void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, con
   });
 }
 
-void glu_kernel(TensorIterator& iter) {
+void glu_kernel(TensorIteratorBase& iter) {
   AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_cpu", [&] {
     using Vec = Vectorized<scalar_t>;
     const scalar_t one_val(1);
diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu
index 1229149d76aee..7c8783028a5ac 100644
--- a/aten/src/ATen/native/cuda/Activation.cu
+++ b/aten/src/ATen/native/cuda/Activation.cu
@@ -28,7 +28,7 @@ namespace native {
 // -----------------------------------
 // glu forward
 // -----------------------------------
-void glu_kernel(TensorIterator& iter) {
+void glu_kernel(TensorIteratorBase& iter) {
   AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "glu_cuda", [&]() {
     using acc_t = at::acc_type<scalar_t, true>;
     gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 4f7d7e66a7d5e..224d850c8004c 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -8460,14 +8460,16 @@
     CompositeExplicitAutograd: elu_
 
 - func: glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  structured_inherits: TensorIteratorBase
   python_module: nn
   dispatch:
     CPU, CUDA: glu_out
 
 - func: glu(Tensor self, int dim=-1) -> Tensor
+  structured_delegate: glu.out
+  device_check: NoCheck   # TensorIterator
   python_module: nn
-  dispatch:
-    CPU, CUDA: glu
 
 - func: glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn

From 510d2ece81c18ba3f9bbcbc509effe281a77206c Mon Sep 17 00:00:00 2001
From: nikithamalgi <nikithamalgi@devvm146.prn0.facebook.com>
Date: Thu, 26 Aug 2021 18:54:51 -0700
Subject: [PATCH 279/530] Merge script and _script_pdt API (#62420)

Summary:
Merge `torch.jit.script` and `torch.jit._script_pdt` API. This PR merges profile directed typing with script api

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62420

Reviewed By: iramazanli

Differential Revision: D30579015

Pulled By: nikithamalgifb

fbshipit-source-id: 99ba6839d235d61b2dd0144b466b2063a53ccece
---
 test/jit/test_pdt.py  |  80 ++++++++++++-------------
 torch/jit/__init__.py |   1 -
 torch/jit/_script.py  | 136 ++++++++++++++++++++++++++----------------
 3 files changed, 124 insertions(+), 93 deletions(-)

diff --git a/test/jit/test_pdt.py b/test/jit/test_pdt.py
index 57cd74faf432b..468eb2787814b 100644
--- a/test/jit/test_pdt.py
+++ b/test/jit/test_pdt.py
@@ -40,7 +40,7 @@ def forward(self, x) -> Any:
         make_global(TestPDTModel)
         pdt_model = TestPDTModel()
         inp: List[Tuple[Any, ...]] = [(20, ), (2.7, ), (False, ), ]
-        scripted_pdt_model = torch.jit._script_pdt(pdt_model, example_inputs={pdt_model: inp})
+        scripted_pdt_model = torch.jit.script(pdt_model, example_inputs={pdt_model: inp})
         self.assertEqual(scripted_pdt_model(50), pdt_model(50))
         self.assertEqual(scripted_pdt_model(1.8), pdt_model(1.8))
         self.assertTrue(scripted_pdt_model(True), pdt_model(True))
@@ -67,7 +67,7 @@ def forward(self, x):
         inner_pdt_model = NestedPDTInner()
         wrapped_pdt_model = NestedModulePDTWrapper(inner_pdt_model)
         inp: List[Tuple[Any, ...]] = [(20, ), (False, )]
-        scripted_pdt_model = torch.jit._script_pdt(wrapped_pdt_model, example_inputs={wrapped_pdt_model: inp})
+        scripted_pdt_model = torch.jit.script(wrapped_pdt_model, example_inputs={wrapped_pdt_model: inp})
         self.assertEqual(scripted_pdt_model(30), wrapped_pdt_model(30))
         self.assertEqual(scripted_pdt_model(1.9), wrapped_pdt_model(1.9))
         self.assertTrue(scripted_pdt_model(True), wrapped_pdt_model(True))
@@ -95,8 +95,8 @@ def forward(self, x):
         outer_pdt_model = NestedModulePDTOuter(inner_pdt_model)
         inner_input: List[Tuple[Any, ...]] = [(10, 10), (1.9, 20), ]
         outer_input: List[Tuple[Any, ...]] = [(20, ), (False, )]
-        scripted_pdt_model = torch.jit._script_pdt(outer_pdt_model, example_inputs={inner_pdt_model: inner_input,
-                                                   outer_pdt_model: outer_input, })
+        scripted_pdt_model = torch.jit.script(outer_pdt_model, example_inputs={inner_pdt_model: inner_input,
+                                              outer_pdt_model: outer_input, })
         self.assertEqual(scripted_pdt_model(30), outer_pdt_model(30))
         self.assertEqual(scripted_pdt_model(1.9), outer_pdt_model(1.9))
         self.assertTrue(scripted_pdt_model(True), outer_pdt_model(True))
@@ -119,7 +119,7 @@ def fun(self, x):
         make_global(NestedFunctionInForward)
         pdt_model = NestedFunctionInForward()
         inp: List[Tuple[Any, ...]] = [(-1, ), (False, )]
-        scripted_pdt_model = torch.jit._script_pdt(pdt_model, example_inputs={pdt_model: inp})
+        scripted_pdt_model = torch.jit.script(pdt_model, example_inputs={pdt_model: inp})
         self.assertEqual(scripted_pdt_model(30), pdt_model(30))
         self.assertEqual(scripted_pdt_model(True), pdt_model(True))
 
@@ -142,7 +142,7 @@ def fn(self, x, y) -> Any:
         make_global(TestModelWithExport)
         pdt_model = TestModelWithExport()
         inp: List[Tuple[Any, ...]] = [(20, 10, ), (2.7, 8.9, ), ]
-        scripted_pdt_model = torch.jit._script_pdt(pdt_model, example_inputs={pdt_model.fn: inp})
+        scripted_pdt_model = torch.jit.script(pdt_model, example_inputs={pdt_model.fn: inp})
         self.assertEqual(scripted_pdt_model.fn(10, 90), pdt_model.fn(10, 90))
         self.assertEqual(scripted_pdt_model.fn(1.8, 2.2), pdt_model.fn(1.8, 2.2))
         self.assertTrue(scripted_pdt_model.fn(torch.ones(1), 2), pdt_model.fn(torch.ones(1), 2))
@@ -155,7 +155,7 @@ def test_sum(self, a):
         make_global(PDTModel)
         pdt_model = PDTModel()
         inp: List[Tuple[Any, ...]] = [([10, 20, ], ), ]
-        scripted_pdt_model = torch.jit._script_pdt(PDTModel, example_inputs={pdt_model.test_sum: inp})
+        scripted_pdt_model = torch.jit.script(PDTModel, example_inputs={pdt_model.test_sum: inp})
         script_model = scripted_pdt_model()
         self.assertEqual(script_model.test_sum([10, 20, 30, ], ), pdt_model.test_sum([10, 20, 30, ], ))
 
@@ -174,8 +174,8 @@ def test_substring(self, a, b):
         pdt_model = PDTModelWithManyMethods()
         list_inp: List[Tuple[Any, ...]] = [([1.2, 2.3, ], ), ]
         str_inp: List[Tuple[Any, ...]] = [("abc", "b", ), ]
-        scripted_pdt_model = torch.jit._script_pdt(PDTModelWithManyMethods, example_inputs={pdt_model.test_list_to_dict: list_inp,
-                                                   pdt_model.test_substring: str_inp})
+        scripted_pdt_model = torch.jit.script(PDTModelWithManyMethods, example_inputs={pdt_model.test_list_to_dict: list_inp,
+                                              pdt_model.test_substring: str_inp})
         script_model = scripted_pdt_model()
         self.assertEqual(script_model.test_list_to_dict([1.1, 2.2, 3.3, ], ), pdt_model.test_list_to_dict([1.1, 2.2, 3.3, ], ))
         self.assertEqual(script_model.test_substring("helloworld", "world", ), pdt_model.test_substring("helloworld", "world", ))
@@ -195,8 +195,8 @@ def test_find(self, a, b):
         pdt_model_two = PDTModelTwo()
         dict_inp: List[Tuple[Any, ...]] = [({1.2: True, 2.3: False, }, 1.2), ]
         list_inp: List[Tuple[Any, ...]] = [(["abc", "b", ], "c"), ]
-        scripted_pdt_model_one = torch.jit._script_pdt(PDTModelOne, example_inputs={pdt_model_one.test_find: dict_inp})
-        scripted_pdt_model_two = torch.jit._script_pdt(PDTModelTwo, example_inputs={pdt_model_two.test_find: list_inp})
+        scripted_pdt_model_one = torch.jit.script(PDTModelOne, example_inputs={pdt_model_one.test_find: dict_inp})
+        scripted_pdt_model_two = torch.jit.script(PDTModelTwo, example_inputs={pdt_model_two.test_find: list_inp})
 
         script_model_one, script_model_two = scripted_pdt_model_one(), scripted_pdt_model_two()
         self.assertEqual(script_model_one.test_find({1.1: True, 2.2: True, 3.3: False, }, 4.4),
@@ -209,28 +209,28 @@ def test_sum(a, b):
             return a + b
 
         make_global(test_sum)
-        scripted_fn_add = torch.jit._script_pdt(test_sum, example_inputs=[(3, 4)])
+        scripted_fn_add = torch.jit.script(test_sum, example_inputs=[(3, 4)])
         self.assertEqual(scripted_fn_add(10, 2), test_sum(10, 2))
 
         def test_sub(a, b):
             return a - b
 
         make_global(test_sub)
-        scripted_fn_sub = torch.jit._script_pdt(test_sub, example_inputs=[(3.9, 4.10)])
+        scripted_fn_sub = torch.jit.script(test_sub, example_inputs=[(3.9, 4.10)])
         self.assertEqual(scripted_fn_sub(6.5, 2.9), test_sub(6.5, 2.9))
 
         def test_mul(a, b):
             return a * b
 
         make_global(test_mul)
-        scripted_fn_mul = torch.jit._script_pdt(test_mul, example_inputs=[(-10, 9)])
+        scripted_fn_mul = torch.jit.script(test_mul, example_inputs=[(-10, 9)])
         self.assertEqual(scripted_fn_mul(-1, 3), test_mul(-1, 3))
 
         def test_args_complex(real, img):
             return torch.complex(real, img)
 
         make_global(test_args_complex)
-        scripted_fn_complex = torch.jit._script_pdt(test_args_complex, example_inputs=[(torch.rand(3, 4), torch.rand(3, 4))])
+        scripted_fn_complex = torch.jit.script(test_args_complex, example_inputs=[(torch.rand(3, 4), torch.rand(3, 4))])
         arg1, arg2 = torch.rand(3, 4), torch.rand(3, 4)
         self.assertEqual(scripted_fn_complex(arg1, arg2), test_args_complex(arg1, arg2))
 
@@ -241,7 +241,7 @@ def test_bool(a):
                 return 0
 
         make_global(test_bool)
-        scripted_fn_bool = torch.jit._script_pdt(test_bool, example_inputs=[(True,)])
+        scripted_fn_bool = torch.jit.script(test_bool, example_inputs=[(True,)])
         self.assertEqual(scripted_fn_bool(True), test_bool(True))
 
         def test_str(a):
@@ -251,7 +251,7 @@ def test_str(a):
                 return True
 
         make_global(test_str)
-        scripted_fn_str = torch.jit._script_pdt(test_str, example_inputs=[("",)])
+        scripted_fn_str = torch.jit.script(test_str, example_inputs=[("",)])
         self.assertEqual(scripted_fn_str("abc"), test_str("abc"))
 
     def test_pdt_list_and_tuple(self):
@@ -260,24 +260,24 @@ def test_list_and_tuple(a):
 
         make_global(test_list_and_tuple)
 
-        scripted_fn_float_list_input = torch.jit._script_pdt(test_list_and_tuple, example_inputs=[([4.9, 8.9],)])
+        scripted_fn_float_list_input = torch.jit.script(test_list_and_tuple, example_inputs=[([4.9, 8.9],)])
         self.assertEqual(scripted_fn_float_list_input([11.9, 7.6]), test_list_and_tuple([11.9, 7.6]))
 
-        scripted_fn_bool_list_input = torch.jit._script_pdt(test_list_and_tuple, example_inputs=[([True, False, True],)])
+        scripted_fn_bool_list_input = torch.jit.script(test_list_and_tuple, example_inputs=[([True, False, True],)])
         self.assertEqual(scripted_fn_bool_list_input([True, True, True]), test_list_and_tuple([True, True, True]))
 
-        scripted_fn_int_list_input = torch.jit._script_pdt(test_list_and_tuple, example_inputs=[([3, 4, 5], )])
+        scripted_fn_int_list_input = torch.jit.script(test_list_and_tuple, example_inputs=[([3, 4, 5], )])
         self.assertEqual(scripted_fn_int_list_input([1, 2, 3]), test_list_and_tuple([1, 2, 3]))
 
-        scripted_fn_float_tuple_input = torch.jit._script_pdt(test_list_and_tuple, example_inputs=[((4.9, 8.9),)])
+        scripted_fn_float_tuple_input = torch.jit.script(test_list_and_tuple, example_inputs=[((4.9, 8.9),)])
         self.assertEqual(scripted_fn_float_tuple_input((11.9, 7.6)), test_list_and_tuple((11.9, 7.6)))
 
-        scripted_fn_bool_tuple_input = torch.jit._script_pdt(test_list_and_tuple,
-                                                             example_inputs=[((True, False, True),)])
+        scripted_fn_bool_tuple_input = torch.jit.script(test_list_and_tuple,
+                                                        example_inputs=[((True, False, True),)])
         self.assertEqual(scripted_fn_bool_tuple_input((True, True, True)),
                          test_list_and_tuple((True, True, True)))
 
-        scripted_fn_int_tuple_input = torch.jit._script_pdt(test_list_and_tuple, example_inputs=[((3, 4, 5), )])
+        scripted_fn_int_tuple_input = torch.jit.script(test_list_and_tuple, example_inputs=[((3, 4, 5), )])
         self.assertEqual(scripted_fn_int_tuple_input((1, 2, 3)), test_list_and_tuple((1, 2, 3)))
 
     def test_nested_list_and_tuple(self):
@@ -295,22 +295,22 @@ def test_nested_tuple(inp):
         make_global(test_nested_list, test_nested_tuple)
 
         list_inp = [[1, 2, 3, ], [5, 6, 7, ]]
-        scripted_fn = torch.jit._script_pdt(test_nested_list, example_inputs=[(list_inp, ), ])
+        scripted_fn = torch.jit.script(test_nested_list, example_inputs=[(list_inp, ), ])
         inp = [[0, 4, 7, ], [8, 11, ], [6, -1, -20, ]]
         self.assertEqual(scripted_fn(inp, ), test_nested_list(inp, ))
 
         list_inp = ([1, 2, 3, ], [5, 6, 7, ])
-        scripted_fn = torch.jit._script_pdt(test_nested_list, example_inputs=[(list_inp, ), ])
+        scripted_fn = torch.jit.script(test_nested_list, example_inputs=[(list_inp, ), ])
         inp = ([0, 4, 7, ], [8, 11, ], [6, -1, -20, ])
         self.assertEqual(scripted_fn(inp, ), test_nested_list(inp, ))
 
         tup_inp = [(1.0, 2.6, 3.7, ), (5.7, 6.1, 1.7, )]
-        scripted_fn = torch.jit._script_pdt(test_nested_tuple, example_inputs=[(tup_inp, ), ])
+        scripted_fn = torch.jit.script(test_nested_tuple, example_inputs=[(tup_inp, ), ])
         inp = [(1.0, 4.1, 7.4, ), (4.8, 1.1, -1.2, ), (6.3, -1.3, -2.0, )]
         self.assertEqual(scripted_fn(inp, ), test_nested_tuple(inp, ))
 
         tup_inp = ((True, False, True, ), (False, False, False, ))
-        scripted_fn = torch.jit._script_pdt(test_nested_tuple, example_inputs=[(tup_inp, ), ])
+        scripted_fn = torch.jit.script(test_nested_tuple, example_inputs=[(tup_inp, ), ])
         inp = ((True, True, True, ), (False, False, True, ))
         self.assertEqual(scripted_fn(inp, ), test_nested_tuple(inp, ))
 
@@ -324,11 +324,11 @@ def test_dict_int_list(a):
         make_global(test_dict, test_dict_int_list)
 
         str_bool_inp = {'foo' : True, 'bar': False}
-        scripted_fn = torch.jit._script_pdt(test_dict, example_inputs=[(str_bool_inp,)])
+        scripted_fn = torch.jit.script(test_dict, example_inputs=[(str_bool_inp,)])
         self.assertEqual(scripted_fn({'foo' : False, 'bar': True}, ), test_dict({'foo' : False, 'bar': True}, ))
 
         str_list_inp = {0 : [True, False], 1: [False, True]}
-        scripted_fn = torch.jit._script_pdt(test_dict_int_list, example_inputs=[(str_list_inp,)])
+        scripted_fn = torch.jit.script(test_dict_int_list, example_inputs=[(str_list_inp,)])
         self.assertEqual(scripted_fn({0 : [False, False], 1: [True, True]}, ),
                          test_dict_int_list({0 : [False, False], 1: [True, True]}, ))
 
@@ -349,14 +349,14 @@ def test_multiple_type_refinement(a):
 
         make_global(test_multiple_types, test_multiple_type_refinement)
 
-        scripted_fn = torch.jit._script_pdt(test_multiple_types, example_inputs=[(1,), ("abc", ), (8.9,), ([3, 4, 5], )])
+        scripted_fn = torch.jit.script(test_multiple_types, example_inputs=[(1,), ("abc", ), (8.9,), ([3, 4, 5], )])
         self.assertEqual(scripted_fn(10), test_multiple_types(10))
         self.assertEqual(scripted_fn("def"), test_multiple_types("def"))
         self.assertEqual(scripted_fn(7.89999), test_multiple_types(7.89999))
         self.assertEqual(scripted_fn([10, 11, 14]), test_multiple_types([10, 11, 14]))
 
-        scripted_fn = torch.jit._script_pdt(test_multiple_type_refinement, example_inputs=[(1,), ("abc", ), (8.9,),
-                                            ([3, 4, 5],), (True, ), ({"a": True}, ), ])
+        scripted_fn = torch.jit.script(test_multiple_type_refinement, example_inputs=[(1,), ("abc", ), (8.9,),
+                                       ([3, 4, 5],), (True, ), ({"a": True}, ), ])
         self.assertEqual(scripted_fn(10), test_multiple_type_refinement(10))
         self.assertEqual(scripted_fn("def"), test_multiple_type_refinement("def"))
         self.assertEqual(scripted_fn(7.89999), test_multiple_type_refinement(7.89999))
@@ -381,7 +381,7 @@ def test_model(a, m):
         make_global(UserDefinedClass, test_model)
 
         user_class = UserDefinedClass()
-        scripted_fn = torch.jit._script_pdt(test_model, example_inputs=[(10, user_class, ), (10.9, user_class, ), ])
+        scripted_fn = torch.jit.script(test_model, example_inputs=[(10, user_class, ), (10.9, user_class, ), ])
         self.assertEqual(scripted_fn(100, user_class, ), test_model(100, user_class))
         self.assertEqual(scripted_fn(1.9, user_class, ), test_model(1.9, user_class))
 
@@ -403,7 +403,7 @@ def test_model_with_args(a, m):
         make_global(ClassWithArgs, test_model_with_args)
 
         user_class = ClassWithArgs(False)
-        scripted_fn = torch.jit._script_pdt(test_model_with_args, example_inputs=[(10, user_class, ), (10.9, user_class, ), ])
+        scripted_fn = torch.jit.script(test_model_with_args, example_inputs=[(10, user_class, ), (10.9, user_class, ), ])
         self.assertEqual(scripted_fn(100, ClassWithArgs(True), ), test_model_with_args(100, ClassWithArgs(True)))
 
     def test_nn_parameter_as_arg(self):
@@ -420,7 +420,7 @@ def forward(self, y):
 
         make_global(TestNNParameter)
         pdt_model = TestNNParameter()
-        scripted_fn = torch.jit._script_pdt(pdt_model, example_inputs={pdt_model: [(10, ), ], })
+        scripted_fn = torch.jit.script(pdt_model, example_inputs={pdt_model: [(10, ), ], })
         self.assertEqual(scripted_fn(20), pdt_model(20))
 
     def test_fx_tracing_with_typing(self):
@@ -434,7 +434,7 @@ def forward(self, a) -> FXModelOutput:
 
         make_global(FXModel, FXModelOutput)
         pdt_model = FXModel()
-        scripted_fn = torch.jit._script_pdt(pdt_model, example_inputs={pdt_model: [([10, 20, ], ), ], })
+        scripted_fn = torch.jit.script(pdt_model, example_inputs={pdt_model: [([10, 20, ], ), ], })
         self.assertEqual(scripted_fn([20]), pdt_model([20]))
 
     def test_nonetype_as_optional_of_type(self):
@@ -446,11 +446,11 @@ def test_none(a) -> Any:
 
         make_global(test_none)
 
-        scripted_fn = torch.jit._script_pdt(test_none, example_inputs=[(None, ), (10.6, )])
+        scripted_fn = torch.jit.script(test_none, example_inputs=[(None, ), (10.6, )])
         self.assertEqual(scripted_fn(30.9, ), test_none(30.9, ))
 
-        scripted_fn = torch.jit._script_pdt(test_none, example_inputs=[(None, ), (10, )])
+        scripted_fn = torch.jit.script(test_none, example_inputs=[(None, ), (10, )])
         self.assertEqual(scripted_fn(2, ), test_none(2, ))
 
-        scripted_fn = torch.jit._script_pdt(test_none, example_inputs=[(None, ), (torch.Tensor(1), )])
+        scripted_fn = torch.jit.script(test_none, example_inputs=[(None, ), (torch.Tensor(1), )])
         self.assertEqual(scripted_fn(torch.ones(1), ), test_none(torch.ones(1), ))
diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py
index c9fd886c7336d..f7fa58bd36434 100644
--- a/torch/jit/__init__.py
+++ b/torch/jit/__init__.py
@@ -20,7 +20,6 @@
 )
 from torch.jit._script import (
     script,
-    _script_pdt,
     Attribute,
     ScriptModule,
     script_method,
diff --git a/torch/jit/_script.py b/torch/jit/_script.py
index 3d173ae27bd01..09801bab938a7 100644
--- a/torch/jit/_script.py
+++ b/torch/jit/_script.py
@@ -984,57 +984,6 @@ def call_prepare_scriptable_func(obj):
     memo: Dict[int, torch.nn.Module] = {}
     return call_prepare_scriptable_func_impl(obj, memo)
 
-
-def _script_pdt(obj, optimize=None, _frames_up=0, _rcb=None,
-                example_inputs: Union[List[Tuple], Dict[Callable, List[Tuple]], None] = None):
-    # This is a private API, intended for internal use only. Usage of this API is only for experimental
-    # purposes only and is highly discouraged.
-    global type_trace_db
-    if not _enabled:
-        return obj
-
-    if optimize is not None:
-        warnings.warn(
-            "`optimize` is deprecated and has no effect. Use `with torch.jit.optimized_execution() instead"
-        )
-
-    # No-op for modules and functions that are already scripted
-    if isinstance(obj, ScriptModule):
-        return obj
-    if isinstance(obj, ScriptFunction):
-        return obj
-
-    if example_inputs:
-        # If MonkeyType is installed, enable profile directed type annotation
-        # Check if example_inputs are defined and generate call traces
-        # for the method by running eager mode version of the method with
-        # the provide example inputs. This logs all the traces in type_trace_db
-        type_trace_db = JitTypeTraceStore()
-        if monkeytype_trace:
-            monkeytype_config = JitTypeTraceConfig(type_trace_db)
-            with monkeytype_trace(monkeytype_config):
-                if isinstance(example_inputs, Dict):
-                    # If the obj is an nn.Module or a class, then each method is
-                    # executed with the arguments provided in the example inputs.
-                    # example inputs here will be of type Dict(class.method, (arguments))
-                    # This is used to infer type annotations for those methods
-                    # which are not called directly under the hood of monkeytype.
-                    for module, example_input in example_inputs.items():
-                        for example in example_input:
-                            module(*example)
-                elif isinstance(example_inputs, List):
-                    for examples in example_inputs:
-                        obj(*examples)
-                else:
-                    warnings.warn("Error: Unable to infer types. Please format the inputs to type `List[Tuple]`"
-                                  " or `Dict[Callable, List[Tuple]]` to be run with MonkeyType.")
-        else:
-            warnings.warn("Warning: monkeytype is not installed. Please install https://github.com/Instagram/MonkeyType "
-                          "to enable Profile-Directed Typing in TorchScript. Refer to "
-                          "https://github.com/Instagram/MonkeyType/blob/master/README.rst to install MonkeyType. ")
-    return script(obj, optimize, _frames_up, _rcb)
-
-
 def create_script_dict(obj):
     """
     Create a ``torch._C.ScriptDict`` instance with the data from ``obj``.
@@ -1065,7 +1014,8 @@ def create_script_list(obj, type_hint=None):
     return torch._C.ScriptList(obj)  # type: ignore[attr-defined]
 
 
-def script(obj, optimize=None, _frames_up=0, _rcb=None):
+def script(obj, optimize=None, _frames_up=0, _rcb=None,
+           example_inputs: Union[List[Tuple], Dict[Callable, List[Tuple]], None] = None):
     r"""
     Scripting a function or ``nn.Module`` will inspect the source code, compile
     it as TorchScript code using the TorchScript compiler, and return a :class:`ScriptModule` or
@@ -1083,6 +1033,8 @@ def script(obj, optimize=None, _frames_up=0, _rcb=None):
     Args:
         obj (callable, class, or ``nn.Module``):  The ``nn.Module``, function, class type,
                                                   dictionary, or list to compile.
+        example_inputs (Union[List[Tuple], Dict[Callable, List[Tuple]], None]): Provide example inputs
+            to annotate the arguments for a function or ``nn.Module``.
 
     Returns:
         If ``obj`` is ``nn.Module``, ``script`` returns
@@ -1124,6 +1076,34 @@ def foo(x, y):
 
             ...
 
+    ****Scripting a function using example_inputs**
+        Example inputs can be used to annotate a function arguments.
+
+        Example (annotating a function before scripting):
+
+        .. testcode::
+
+            import torch
+
+            def test_sum(a, b):
+                return a + b
+
+            # Annotate the arguments to be int
+            scripted_fn = torch.jit.script(test_sum, example_inputs=[(3, 4)])
+
+            print(type(scripted_fn))  # torch.jit.ScriptFunction
+
+            # See the compiled graph as Python code
+            print(scripted_fn.code)
+
+            # Call the function using the TorchScript interpreter
+            scripted_fn(20, 100)
+
+        .. testoutput::
+            :hide:
+
+            ...
+
     **Scripting an nn.Module**
         Scripting an ``nn.Module`` by default will compile the ``forward`` method and recursively
         compile any methods, submodules, and functions called by ``forward``. If a ``nn.Module`` only uses
@@ -1210,7 +1190,30 @@ def forward(self, input):
             scripted_module = torch.jit.script(MyModule())
             print(scripted_module.some_entry_point(torch.randn(2, 2)))
             print(scripted_module(torch.randn(2, 2)))
+
+        Example ( Annotating forward of nn.Module using example_inputs)::
+
+            import torch
+            import torch.nn as nn
+            from typing import NamedTuple
+
+            class MyModule(NamedTuple):
+            result: List[int]
+
+            class TestNNModule(torch.nn.Module):
+                def forward(self, a) -> MyModule:
+                    result = MyModule(result=a)
+                    return result
+
+            pdt_model = TestNNModule()
+
+            # Runs the pdt_model in eager model with the inputs provided and annotates the arguments of forward
+            scripted_model = torch.jit.script(pdt_model, example_inputs={pdt_model: [([10, 20, ], ), ], })
+
+            # Run the scripted_model with actual inputs
+            print(scripted_model([20]))
     """
+    global type_trace_db
     if not _enabled:
         return obj
 
@@ -1227,6 +1230,35 @@ def forward(self, input):
     if isinstance(obj, ScriptFunction):
         return obj
 
+    if example_inputs:
+        # If MonkeyType is installed, enable profile directed type annotation
+        # Check if example_inputs are defined and generate call traces
+        # for the method by running eager mode version of the method with
+        # the provide example inputs. This logs all the traces in type_trace_db
+        type_trace_db = JitTypeTraceStore()
+        if monkeytype_trace:
+            monkeytype_config = JitTypeTraceConfig(type_trace_db)
+            with monkeytype_trace(monkeytype_config):
+                if isinstance(example_inputs, Dict):
+                    # If the obj is an nn.Module or a class, then each method is
+                    # executed with the arguments provided in the example inputs.
+                    # example inputs here will be of type Dict(class.method, (arguments))
+                    # This is used to infer type annotations for those methods
+                    # which are not called directly under the hood of monkeytype.
+                    for module, example_input in example_inputs.items():
+                        for example in example_input:
+                            module(*example)
+                elif isinstance(example_inputs, List):
+                    for examples in example_inputs:
+                        obj(*examples)
+                else:
+                    raise ValueError("Error: Unable to infer types. Please format the inputs to type `List[Tuple]`"
+                                     " or `Dict[Callable, List[Tuple]]` to be run with MonkeyType.")
+        else:
+            warnings.warn("Warning: monkeytype is not installed. Please install https://github.com/Instagram/MonkeyType "
+                          "to enable Profile-Directed Typing in TorchScript. Refer to "
+                          "https://github.com/Instagram/MonkeyType/blob/master/README.rst to install MonkeyType. ")
+
     if isinstance(obj, torch.nn.Module):
         obj = call_prepare_scriptable_func(obj)
         return torch.jit._recursive.create_script_module(

From 65e6194aeb3269a182cfe2c05c122159da12770f Mon Sep 17 00:00:00 2001
From: Can Balioglu <cbalioglu@users.noreply.github.com>
Date: Thu, 26 Aug 2021 20:16:10 -0700
Subject: [PATCH 280/530] Introduce the torchrun entrypoint (#64049)

Summary:
This PR introduces a new `torchrun` entrypoint that simply "points" to `python -m torch.distributed.run`. It is shorter and less error-prone to type and gives a nicer syntax than a rather cryptic `python -m ...` command line. Along with the new entrypoint the documentation is also updated and places where `torch.distributed.run` are mentioned are replaced with `torchrun`.

cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64049

Reviewed By: cbalioglu

Differential Revision: D30584041

Pulled By: kiukchung

fbshipit-source-id: d99db3b5d12e7bf9676bab70e680d4b88031ae2d
---
 docs/source/elastic/quickstart.rst   | 34 ++++++++++-----------
 docs/source/elastic/run.rst          |  2 +-
 docs/source/elastic/train_script.rst |  2 +-
 setup.py                             |  1 +
 torch/distributed/launch.py          |  6 ++--
 torch/distributed/run.py             | 44 ++++++++++++++--------------
 6 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/docs/source/elastic/quickstart.rst b/docs/source/elastic/quickstart.rst
index 1d22426d06a8b..f7c1ebf7dd0de 100644
--- a/docs/source/elastic/quickstart.rst
+++ b/docs/source/elastic/quickstart.rst
@@ -5,13 +5,13 @@ To launch a **fault-tolerant** job, run the following on all nodes.
 
 .. code-block:: bash
 
-    python -m torch.distributed.run
-            --nnodes=NUM_NODES
-            --nproc_per_node=TRAINERS_PER_NODE
-            --rdzv_id=JOB_ID
-            --rdzv_backend=c10d
-            --rdzv_endpoint=HOST_NODE_ADDR
-            YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
+    torchrun
+       --nnodes=NUM_NODES
+       --nproc_per_node=TRAINERS_PER_NODE
+       --rdzv_id=JOB_ID
+       --rdzv_backend=c10d
+       --rdzv_endpoint=HOST_NODE_ADDR
+       YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 
 To launch an **elastic** job, run the following on at least ``MIN_SIZE`` nodes
@@ -19,13 +19,13 @@ and at most ``MAX_SIZE`` nodes.
 
 .. code-block:: bash
 
-    python -m torch.distributed.run
-            --nnodes=MIN_SIZE:MAX_SIZE
-            --nproc_per_node=TRAINERS_PER_NODE
-            --rdzv_id=JOB_ID
-            --rdzv_backend=c10d
-            --rdzv_endpoint=HOST_NODE_ADDR
-            YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
+    torchrun
+        --nnodes=MIN_SIZE:MAX_SIZE
+        --nproc_per_node=TRAINERS_PER_NODE
+        --rdzv_id=JOB_ID
+        --rdzv_backend=c10d
+        --rdzv_endpoint=HOST_NODE_ADDR
+        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 ``HOST_NODE_ADDR``, in form <host>[:<port>] (e.g. node1.example.com:29400),
 specifies the node and the port on which the C10d rendezvous backend should be
@@ -46,6 +46,6 @@ ideally you should pick a node that has a high bandwidth.
    Learn more about writing your distributed training script
    `here <train_script.html>`_.
 
-If ``torch.distributed.run`` does not meet your requirements you may use our
-APIs directly for more powerful customization. Start by taking a look at the
-`elastic agent <agent.html>`_ API).
+If ``torchrun`` does not meet your requirements you may use our APIs directly
+for more powerful customization. Start by taking a look at the
+`elastic agent <agent.html>`_ API.
diff --git a/docs/source/elastic/run.rst b/docs/source/elastic/run.rst
index fb870fae41f58..284fc7f755311 100644
--- a/docs/source/elastic/run.rst
+++ b/docs/source/elastic/run.rst
@@ -1,6 +1,6 @@
 .. _launcher-api:
 
-torch.distributed.run (Elastic Launch)
+torchrun (Elastic Launch)
 ======================================
 
 .. automodule:: torch.distributed.run
diff --git a/docs/source/elastic/train_script.rst b/docs/source/elastic/train_script.rst
index 263f2df659574..04225d79067a8 100644
--- a/docs/source/elastic/train_script.rst
+++ b/docs/source/elastic/train_script.rst
@@ -4,7 +4,7 @@ Train script
 -------------
 
 If your train script works with ``torch.distributed.launch`` it will continue
-working with ``torch.distributed.run`` with these differences:
+working with ``torchrun`` with these differences:
 
 1. No need to manually pass ``RANK``, ``WORLD_SIZE``,
    ``MASTER_ADDR``, and ``MASTER_PORT``.
diff --git a/setup.py b/setup.py
index a20098232af3c..6d9ed53dc66aa 100644
--- a/setup.py
+++ b/setup.py
@@ -854,6 +854,7 @@ def make_relative_rpath_args(path):
         'console_scripts': [
             'convert-caffe2-to-onnx = caffe2.python.onnx.bin.conversion:caffe2_to_onnx',
             'convert-onnx-to-caffe2 = caffe2.python.onnx.bin.conversion:onnx_to_caffe2',
+            'torchrun = torch.distributed.run:main',
         ]
     }
 
diff --git a/torch/distributed/launch.py b/torch/distributed/launch.py
index 4f29edd10d521..6173abb2c9ecf 100644
--- a/torch/distributed/launch.py
+++ b/torch/distributed/launch.py
@@ -4,7 +4,7 @@
 
 .. warning::
 
-    This module is going to be deprecated in favor of :ref:`torch.distributed.run <launcher-api>`.
+    This module is going to be deprecated in favor of :ref:`torchrun <launcher-api>`.
 
 The utility can be used for single-node distributed training, in which one or
 more processes per node will be spawned. The utility can be used for either
@@ -177,8 +177,8 @@ def launch(args):
 def main(args=None):
     warnings.warn(
         "The module torch.distributed.launch is deprecated\n"
-        "and will be removed in future. Use torch.distributed.run.\n"
-        "Note that --use_env is set by default in torch.distributed.run.\n"
+        "and will be removed in future. Use torchrun.\n"
+        "Note that --use_env is set by default in torchrun.\n"
         "If your script expects `--local_rank` argument to be set, please\n"
         "change it to read from `os.environ['LOCAL_RANK']` instead. See \n"
         "https://pytorch.org/docs/stable/distributed.html#launch-utility for \n"
diff --git a/torch/distributed/run.py b/torch/distributed/run.py
index d4428a0cde3c1..c6e84d6f65f4b 100644
--- a/torch/distributed/run.py
+++ b/torch/distributed/run.py
@@ -7,7 +7,7 @@
 # LICENSE file in the root directory of this source tree.
 
 """
-``torch.distributed.run`` provides a superset of the functionality as ``torch.distributed.launch``
+``torchrun`` provides a superset of the functionality as ``torch.distributed.launch``
 with the following additional functionalities:
 
 1. Worker failures are handled gracefully by restarting all workers.
@@ -18,33 +18,33 @@
 
 
-Transitioning from torch.distributed.launch to torch.distributed.run
+Transitioning from torch.distributed.launch to torchrun
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 
-``torch.distributed.run`` supports the same arguments as ``torch.distributed.launch`` **except**
+``torchrun`` supports the same arguments as ``torch.distributed.launch`` **except**
 for ``--use_env`` which is now deprecated. To migrate from ``torch.distributed.launch``
-to ``torch.distributed.run`` follow these steps:
+to ``torchrun`` follow these steps:
 
 1.  If your training script is already reading ``local_rank`` from the ``LOCAL_RANK`` environment variable.
     Then you need simply omit the ``--use_env`` flag, e.g.:
 
-    +--------------------------------------------------------------------+------------------------------------------------------+
-    |         ``torch.distributed.launch``                               |            ``torch.distributed.run``                 |
-    +====================================================================+======================================================+
-    |                                                                    |                                                      |
-    | .. code-block:: shell-session                                      | .. code-block:: shell-session                        |
-    |                                                                    |                                                      |
-    |    $ python -m torch.distributed.launch --use_env train_script.py  |    $ python -m torch.distributed.run train_script.py |
-    |                                                                    |                                                      |
-    +--------------------------------------------------------------------+------------------------------------------------------+
+    +--------------------------------------------------------------------+--------------------------------------------+
+    |         ``torch.distributed.launch``                               |                ``torchrun``                |
+    +====================================================================+============================================+
+    |                                                                    |                                            |
+    | .. code-block:: shell-session                                      | .. code-block:: shell-session              |
+    |                                                                    |                                            |
+    |    $ python -m torch.distributed.launch --use_env train_script.py  |    $ torchrun train_script.py              |
+    |                                                                    |                                            |
+    +--------------------------------------------------------------------+--------------------------------------------+
 
 2.  If your training script reads local rank from a ``--local_rank`` cmd argument.
     Change your training script to read from the ``LOCAL_RANK`` environment variable as
     demonstrated by the following code snippet:
 
     +-------------------------------------------------------+----------------------------------------------------+
-    |         ``torch.distributed.launch``                  |            ``torch.distributed.run``               |
+    |         ``torch.distributed.launch``                  |                    ``torchrun``                    |
     +=======================================================+====================================================+
     |                                                       |                                                    |
     | .. code-block:: python                                | .. code-block:: python                             |
@@ -59,12 +59,12 @@
     |                                                       |                                                    |
     +-------------------------------------------------------+----------------------------------------------------+
 
-The aformentioned changes suffice to migrate from ``torch.distributed.launch`` to ``torch.distributed.run``.
-To take advantage of new features such as elasticity, fault-tolerance, and error reporting of ``torch.distributed.run``
+The aformentioned changes suffice to migrate from ``torch.distributed.launch`` to ``torchrun``.
+To take advantage of new features such as elasticity, fault-tolerance, and error reporting of ``torchrun``
 please refer to:
 
-* :ref:`elastic_train_script` for more information on authoring training scripts that are ``torch.distributed.run`` compliant.
-* the rest of this page for more information on the features of ``torch.distributed.run``.
+* :ref:`elastic_train_script` for more information on authoring training scripts that are ``torchrun`` compliant.
+* the rest of this page for more information on the features of ``torchrun``.
 
 
@@ -75,7 +75,7 @@
 
 ::
 
-    >>> python -m torch.distributed.run
+    >>> torchrun
         --standalone
         --nnodes=1
         --nproc_per_node=$NUM_TRAINERS
@@ -85,7 +85,7 @@
 
 ::
 
-    >>> python -m torch.distributed.run
+    >>> torchrun
         --nnodes=$NUM_NODES
         --nproc_per_node=$NUM_TRAINERS
         --rdzv_id=$JOB_ID
@@ -104,7 +104,7 @@
 
 ::
 
-    >>> python -m torch.distributed.run
+    >>> torchrun
         --nnodes=1:4
         --nproc_per_node=$NUM_TRAINERS
         --rdzv_id=$JOB_ID
@@ -186,7 +186,7 @@
    of the worker is specified in the ``WorkerSpec``.
 
 5. ``LOCAL_WORLD_SIZE`` - The local world size (e.g. number of workers running locally); equals to
-   ``--nproc_per_node`` specified on ``torch.distributed.run``.
+   ``--nproc_per_node`` specified on ``torchrun``.
 
 6. ``WORLD_SIZE`` - The world size (total number of workers in the job).
 

From 63c90ec3bf6c9445a36199f65e0523a5e6532b0d Mon Sep 17 00:00:00 2001
From: Michael Suo <suo@fb.com>
Date: Thu, 26 Aug 2021 20:54:54 -0700
Subject: [PATCH 281/530] [torch/deploy] add torch.distributed to build
 (#63918)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63918

Previously we were building with `USE_DISTRIBUTED` off, because c10d was built as a separately library for historical reasons. Since then, lw has merged the c10d build into libtorch, so this is fairly easy to turn on.

Differential Revision:
D30492442

**NOTE FOR REVIEWERS**: This PR has internal Facebook specific changes or comments, please review them on [Phabricator](https://our.intern.facebook.com/intern/diff/D30492442/)!
D30492442
D30492442

Test Plan: added a unit test

Reviewed By: wconstab

Pulled By: suo

fbshipit-source-id: 843b8fcf349a72a7f6fcbd1fcc8961268690fb8c
---
 tools/build_variables.bzl                     |   5 +-
 torch/CMakeLists.txt                          |  99 +++++++----
 .../csrc/deploy/example/generate_examples.py  |   3 +
 torch/csrc/deploy/test_deploy.cpp             |  12 ++
 torch/csrc/deploy/test_deploy_gpu.cpp         |  12 ++
 torch/csrc/distributed/c10d/frontend.cpp      | 161 +++++-------------
 torch/csrc/distributed/c10d/frontend.hpp      |  24 ++-
 torch/csrc/distributed/c10d/frontend_cuda.cpp | 136 +++++++++++++++
 torch/csrc/distributed/c10d/frontend_cuda.hpp |  12 ++
 torch/csrc/distributed/c10d/init.cpp          |  28 +--
 .../distributed/rpc/request_callback_impl.cpp |   1 +
 11 files changed, 315 insertions(+), 178 deletions(-)
 create mode 100644 torch/csrc/distributed/c10d/frontend_cuda.cpp
 create mode 100644 torch/csrc/distributed/c10d/frontend_cuda.hpp

diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 3f6225358ac97..650830b3143f0 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -340,6 +340,7 @@ libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_tr
 
 # These files are the only ones that are supported on Windows.
 libtorch_distributed_base_sources = [
+    "torch/csrc/distributed/c10d/frontend.cpp",
     "torch/csrc/distributed/c10d/comm.cpp",
     "torch/csrc/distributed/c10d/default_comm_hooks.cpp",
     "torch/csrc/distributed/c10d/FileStore.cpp",
@@ -351,6 +352,7 @@ libtorch_distributed_base_sources = [
     "torch/csrc/distributed/c10d/ProcessGroupGloo.cpp",
     "torch/csrc/distributed/c10d/ProcessGroupMPI.cpp",
     "torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
+    "torch/csrc/distributed/c10d/quantization/quantization.cpp",
     "torch/csrc/distributed/c10d/reducer.cpp",
     "torch/csrc/distributed/c10d/sequence_num.cpp",
     "torch/csrc/distributed/c10d/Store.cpp",
@@ -548,6 +550,7 @@ libtorch_cuda_distributed_base_sources = [
 
 # These files are only supported on Linux (and others) but not on Windows.
 libtorch_cuda_distributed_extra_sources = [
+    "torch/csrc/distributed/c10d/frontend_cuda.cpp",
     "torch/csrc/distributed/c10d/NCCLUtils.cpp",
     "torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp",
     "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
@@ -735,10 +738,8 @@ libtorch_python_core_sources = [
 ]
 
 libtorch_python_distributed_core_sources = [
-    "torch/csrc/distributed/c10d/frontend.cpp",
     "torch/csrc/distributed/c10d/init.cpp",
     "torch/csrc/distributed/c10d/python_comm_hook.cpp",
-    "torch/csrc/distributed/c10d/quantization/quantization.cpp",
 ]
 
 libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 761605fadcce8..7c086855612ca 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -214,11 +214,78 @@ add_custom_command(
     WORKING_DIRECTORY
     "${TORCH_ROOT}"
 )
+if(USE_DISTRIBUTED)
+    if(WIN32)
+      append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
+    else()
+      append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
+    endif()
+    # Disable certain warnings for GCC-9.X
+    if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0))
+      set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
+      set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
+      set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
+    endif()
+    # NCCL is a private dependency of libtorch, but libtorch_python includes
+    # some private headers of libtorch, which in turn include NCCL. As a hacky
+    # alternative to making NCCL a public dependency of libtorch, we make it
+    # a private dependency of libtorch_python as well.
+    if(USE_NCCL)
+      list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
+    endif()
+    # Same for MPI.
+    if(USE_MPI)
+      list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES})
+    endif()
+    list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
+
+endif()
+
+if(USE_NCCL AND NOT WIN32)
+    list(APPEND TORCH_PYTHON_SRCS
+      ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp)
+    list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL)
+endif()
+
 
 # WARNING- any TORCH_PYTHON_COMPILE_DEFINITIONS above this line
 #          affect both torch_python and DEPLOY interpreter.
 if(USE_DEPLOY)
   add_library(torch_python_obj OBJECT ${TORCH_PYTHON_SRCS})
+  if(USE_DISTRIBUTED)
+    # Set c10d-related compile definitions. For a "normal" build of
+    # libtorch_python, these are set on libtorch as PUBLIC so they are
+    # automatically propagated when libtorch_python links against libtorch. But
+    # since in the deploy build we are intentionally *not* linking against
+    # libtorch, we need to set them manually here.
+    list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED)
+    if(USE_GLOO AND USE_C10D_GLOO)
+      list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D_GLOO)
+    endif()
+    if(USE_NCCL AND USE_C10D_NCCL)
+        list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D_NCCL)
+        # Put nccl headers on the include path. We are specifically only setting
+        # include dirs here instead of linking against __caffe2_nccl wholesale
+        # to ensure we aren't accidentally replicating the nccl lib.
+        target_include_directories(torch_python_obj PRIVATE $<TARGET_PROPERTY:__caffe2_nccl,INTERFACE_INCLUDE_DIRECTORIES>)
+    endif()
+    if(USE_MPI AND USE_C10D_MPI)
+      list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D_MPI)
+    endif()
+
+    # Pass USE_RPC in order to reduce use of
+    # #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
+    # need to be removed when RPC is supported
+    if(NOT WIN32)
+      target_compile_definitions(torch_cpu PUBLIC USE_RPC)
+    endif()
+    if(USE_TENSORPIPE)
+      list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_TENSORPIPE)
+    endif()
+
+    # Set c10d-related include directories as well.
+    target_include_directories(torch_python_obj PRIVATE $<BUILD_INTERFACE:${TORCH_SRC_DIR}/csrc/distributed>)
+  endif()
   target_compile_definitions(torch_python_obj PRIVATE "-DTHP_BUILD_MAIN_LIB -DUSE_DEPLOY")
 
   target_compile_definitions(torch_python_obj PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS})
@@ -268,38 +335,6 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   set_source_files_properties(${TORCH_SRC_DIR}/csrc/utils/throughput_benchmark.cpp PROPERTIES COMPILE_FLAGS -Wno-attributes)
 endif()
 
-if(USE_DISTRIBUTED)
-    if(WIN32)
-      append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
-    else()
-      append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
-    endif()
-    # Disable certain warnings for GCC-9.X
-    if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0))
-      set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
-      set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
-      set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
-    endif()
-    # NCCL is a private dependency of libtorch, but libtorch_python includes
-    # some private headers of libtorch, which in turn include NCCL. As a hacky
-    # alternative to making NCCL a public dependency of libtorch, we make it
-    # a private dependency of libtorch_python as well.
-    if(USE_NCCL)
-      list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
-    endif()
-    # Same for MPI.
-    if(USE_MPI)
-      list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES})
-    endif()
-    list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
-endif()
-
-if(USE_NCCL AND NOT WIN32)
-    list(APPEND TORCH_PYTHON_SRCS
-      ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp)
-    list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL)
-endif()
-
 add_library(torch_python SHARED ${TORCH_PYTHON_SRCS})
 if(HAVE_SOVERSION)
   set_target_properties(torch_python PROPERTIES
diff --git a/torch/csrc/deploy/example/generate_examples.py b/torch/csrc/deploy/example/generate_examples.py
index 65f244373d954..0f279d922157c 100644
--- a/torch/csrc/deploy/example/generate_examples.py
+++ b/torch/csrc/deploy/example/generate_examples.py
@@ -79,3 +79,6 @@ def save(name, model, model_jit=None, eg=None, featurestore_meta=None):
         e.save_pickle("fn", "fn.pkl", load_library)
 
     generate_fx_example()
+
+    with PackageExporter(p / "uses_distributed") as e:
+        e.save_source_string("uses_distributed", "import torch.distributed; assert torch.distributed.is_available()")
diff --git a/torch/csrc/deploy/test_deploy.cpp b/torch/csrc/deploy/test_deploy.cpp
index a004db1e0d232..53456cacca2ad 100644
--- a/torch/csrc/deploy/test_deploy.cpp
+++ b/torch/csrc/deploy/test_deploy.cpp
@@ -366,3 +366,15 @@ TEST(TorchpyTest, SharedLibraryLoad) {
   }
 }
 #endif
+
+TEST(TorchpyTest, UsesDistributed) {
+  const auto model_filename = path(
+      "USES_DISTRIBUTED",
+      "torch/csrc/deploy/example/generated/uses_distributed");
+  torch::deploy::InterpreterManager m(1);
+  torch::deploy::Package p = m.load_package(model_filename);
+  {
+    auto I = p.acquire_session();
+    I.self.attr("import_module")({"uses_distributed"});
+  }
+}
diff --git a/torch/csrc/deploy/test_deploy_gpu.cpp b/torch/csrc/deploy/test_deploy_gpu.cpp
index 8287d1683edca..4e990adcd9e89 100644
--- a/torch/csrc/deploy/test_deploy_gpu.cpp
+++ b/torch/csrc/deploy/test_deploy_gpu.cpp
@@ -53,3 +53,15 @@ TEST(TorchDeployGPUTest, SimpleModel) {
 
   ASSERT_TRUE(ref_output.allclose(output, 1e-03, 1e-05));
 }
+
+TEST(TorchDeployGPUTest, UsesDistributed) {
+  const auto model_filename = path(
+      "USES_DISTRIBUTED",
+      "torch/csrc/deploy/example/generated/uses_distributed");
+  torch::deploy::InterpreterManager m(1);
+  torch::deploy::Package p = m.load_package(model_filename);
+  {
+    auto I = p.acquire_session();
+    I.self.attr("import_module")({"uses_distributed"});
+  }
+}
diff --git a/torch/csrc/distributed/c10d/frontend.cpp b/torch/csrc/distributed/c10d/frontend.cpp
index b65cba79884af..e5b59f28982f6 100644
--- a/torch/csrc/distributed/c10d/frontend.cpp
+++ b/torch/csrc/distributed/c10d/frontend.cpp
@@ -3,10 +3,11 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/Functions.h>
 #include <c10/util/Exception.h>
-#include <c10d/PrefixStore.hpp>
 #include <c10d/FileStore.hpp>
 #include <c10d/TCPStore.hpp>
 #include <c10d/Utils.hpp>
+#include <torch/csrc/distributed/c10d/quantization/quantization.h>
+#include <torch/library.h>
 
 #include <chrono>
 #include <sstream>
@@ -17,10 +18,6 @@
 #include <c10d/ProcessGroupGloo.hpp>
 #endif
 
-#ifdef USE_C10D_NCCL
-#include <c10d/ProcessGroupNCCL.hpp>
-#endif
-
 #ifdef USE_C10D_MPI
 #include <c10d/ProcessGroupMPI.hpp>
 #endif
@@ -29,6 +26,20 @@ namespace c10d {
 
 namespace {
 
+// Constant initialization, so it is guaranteed to be initialized before
+// static initialization calls which may invoke registerNCCLProcessGroupProvider
+const NCCLProcessGroupProvider stubProvider;
+constexpr const NCCLProcessGroupProvider* defaultStubProviderAddr =
+    &stubProvider;
+inline const NCCLProcessGroupProvider*& getNCCLProcessGroupProviderAddress() {
+  static const NCCLProcessGroupProvider* stubs_ = defaultStubProviderAddr;
+  return stubs_;
+}
+
+const NCCLProcessGroupProvider* GetNCCLProcessGroupProvider() {
+  return getNCCLProcessGroupProviderAddress();
+}
+
 void maybePreprocessComplexTensor(at::Tensor& tensor) {
   if(!tensor.is_complex()) {
     return;
@@ -63,6 +74,11 @@ void assertReduceOpSupportsComplexTensor(ReduceOp op) {
 
 }  // namespace anonymous
 
+void registerNCCLProcessGroupProvider(NCCLProcessGroupProvider* provider) {
+  getNCCLProcessGroupProviderAddress() = provider;
+}
+
+
 std::string Backend::get(const std::string& backend_type) {
   return backend_type;
 }
@@ -207,17 +223,7 @@ c10::intrusive_ptr<ProcessGroup> DistributedC10d::newProcessGroupHelper(
           "Attempting to create GLOO-based process group while GLOO is either not enabled or built");
 #endif // USE_C10D_GLOO
     } else if (backend == "nccl") {
-#ifdef USE_C10D_NCCL
-      auto options = ProcessGroupNCCL::Options::create();
-
-      options->is_high_priority_stream = false;
-      options->timeout = timeout;
-      pg = c10::make_intrusive<ProcessGroupNCCL>(
-          prefix_store, rank, world_size, options);
-#else
-      AT_ERROR(
-          "Attempting to create NCCL-based process group while NCCL is either not enabled or built");
-#endif // USE_C10D_NCCL
+      pg = GetNCCLProcessGroupProvider()->get(prefix_store, rank, world_size, timeout);
     } else {
       // TODO: discuss to figure out how to extend this to third party backends?
       AT_ERROR("Unsupported backend type: ", backend);
@@ -1008,7 +1014,7 @@ void initCustomClassBindings() {
           .def(
               "broadcast",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 std::vector<at::Tensor> data) { return self->broadcast(data);
+                  std::vector<at::Tensor> data) { return self->broadcast(data);
           })
           */
           .def(
@@ -1045,14 +1051,14 @@ void initCustomClassBindings() {
           .def(
               "allreduce",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 at::Tensor& tensor,
-                 c10::intrusive_ptr<::c10d::ReduceOp> op) {
+                  at::Tensor& tensor,
+                  c10::intrusive_ptr<::c10d::ReduceOp> op) {
                       ::c10d::AllreduceOptions opts;
                       opts.reduceOp = *op;
                       std::vector<at::Tensor> tensors = {tensor};
                       return self->allreduce(tensors, opts);
-                 }
-           )
+                  }
+            )
           */
           // TODO: make AllreduceCoalescedOptions compatible with TorchBind to
           // provide the full API in python.
@@ -1098,8 +1104,8 @@ void initCustomClassBindings() {
           .def(
               "allgather",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 std::vector<at::Tensor> output,
-                 at::Tensor input) {
+                  std::vector<at::Tensor> output,
+                  at::Tensor input) {
                 std::vector<std::vector<at::Tensor>> outputs = {
                     std::move(output)};
                 std::vector<at::Tensor> inputs = {std::move(input)};
@@ -1121,8 +1127,8 @@ void initCustomClassBindings() {
           .def(
               "gather",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 std::vector<std::vector<at::Tensor>> output_tensors,
-                 std::vector<at::Tensor> input_tensors) {
+                  std::vector<std::vector<at::Tensor>> output_tensors,
+                  std::vector<at::Tensor> input_tensors) {
                 ::c10d::GatherOptions opts;
                 return self->gather(output_tensors, input_tensors, opts);
               })
@@ -1145,8 +1151,8 @@ void initCustomClassBindings() {
           .def(
               "scatter",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 std::vector<at::Tensor> outputTensors,
-                 std::vector<std::vector<at::Tensor>> inputTensors) {
+                  std::vector<at::Tensor> outputTensors,
+                  std::vector<std::vector<at::Tensor>> inputTensors) {
                 ::c10d::ScatterOptions opts;
                 self->scatter(outputTensors, inputTensors, opts);
               })
@@ -1169,8 +1175,8 @@ void initCustomClassBindings() {
           // TODO: Enable this method when TorchBind supports
           ReduceScatterOptions. .def( "reduce_scatter",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 std::vector<at::Tensor> outputTensors,
-                 std::vector<std::vector<at::Tensor>> inputTensors) {
+                  std::vector<at::Tensor> outputTensors,
+                  std::vector<std::vector<at::Tensor>> inputTensors) {
                 ::c10d::ReduceScatterOptions opts;
                 return self->reduce_scatter(outputTensors, inputTensors, opts);
               })
@@ -1241,95 +1247,6 @@ void initCustomClassBindings() {
                 return self->barrier(opts);
               });
 
-#ifdef USE_C10D_NCCL
-  // XXX: Ideally the Options of ProcessGroupNCCL should be
-  // bound using `def_readwrite` like in pybind11, but we
-  // didn't do that because: 1. no milisecond support yet
-  // 2. no def_readwrite or property support yet.
-  // TODO: make this binding the same as pybind11
-  static const auto ProcessGroupNCCLOptionsTorchBind =
-      torch::class_<::c10d::ProcessGroupNCCL::Options>(
-          "dist_c10d", "ProcessGroupNCCLOptions")
-          .def(torch::init([](int64_t timeout, bool isHighPriorityStream) {
-            auto opTimeout = std::chrono::milliseconds(timeout);
-            auto opts =
-                ::c10d::ProcessGroupNCCL::Options::create(isHighPriorityStream);
-            opts->timeout = opTimeout;
-            return opts;
-          }));
-
-  static const auto ProcessGroupNCCLTorchBind =
-      torch::class_<::c10d::ProcessGroupNCCL>("dist_c10d", "ProcessGroupNCCL")
-          .def_pickle(
-              [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) {
-                auto base_process_group =
-                    ::c10::static_intrusive_pointer_cast<::c10d::ProcessGroup>(self);
-                auto name =
-                    ::c10d::DistributedC10d::get()->getNameOfProcessGroup(self);
-                return std::vector<std::string>{name};
-              },
-              [](std::vector<std::string> state) {
-                TORCH_CHECK(
-                    state.size() == 1,
-                    "Expecting exactly 1 state when restoring ProcessGroupNCCL, got: ",
-                    state.size());
-                const auto& process_group_name = state.front();
-                auto base_process_group =
-                    ::c10d::DistributedC10d::get()->getProcessGroupByName(
-                        process_group_name);
-                TORCH_CHECK(
-                    base_process_group.defined(),
-                    "Needed process group not found, ",
-                    "please create a process group with name: ",
-                    process_group_name);
-                c10::intrusive_ptr<::c10d::ProcessGroupNCCL>
-                    process_group_nccl = ::c10::dynamic_intrusive_pointer_cast<
-                        ::c10d::ProcessGroupNCCL>(base_process_group);
-                TORCH_CHECK(
-                    process_group_nccl.defined(),
-                    "Process group ",
-                    process_group_name,
-                    " isn't configured for NCCL backend");
-                return process_group_nccl;
-              })
-          .def(torch::init(
-              [](const c10::intrusive_ptr<::c10d::Store>& store,
-                 int64_t rank,
-                 int64_t size,
-                 c10::intrusive_ptr<::c10d::ProcessGroupNCCL::Options> options,
-                 const std::string& name) {
-                auto pg = c10::make_intrusive<::c10d::ProcessGroupNCCL>(
-                    store, rank, size, options);
-                ::c10d::DistributedC10d::get()->registerProcessGroupName(
-                    pg, name);
-                return pg;
-              }))
-          .def(
-              "alltoall_base",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self,
-                 at::Tensor output,
-                 at::Tensor input,
-                 std::vector<int64_t> outputSplitSizes,
-                 std::vector<int64_t> inputSplitSizes) {
-                return self->alltoall_base(
-                    output,
-                    input,
-                    outputSplitSizes,
-                    inputSplitSizes,
-                    ::c10d::AllToAllOptions());
-              })
-          .def(
-              "size",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) {
-                return (int64_t)self->getSize();
-              })
-          .def(
-              "rank",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) {
-                return (int64_t)self->getRank();
-              });
-#endif
-
   static const auto DistributedC10dFrontendTorchBind =
       torch::class_<::c10d::DistributedC10d>("dist_c10d", "frontend")
           .def(torch::init([]() { return ::c10d::DistributedC10d::get(); }))
@@ -1344,4 +1261,12 @@ void initCustomClassBindings() {
               &::c10d::DistributedC10d::getNameOfProcessGroup);
 }
 
+TORCH_LIBRARY(q, m) {
+    m.def("_Bfloat16QuantizedToFloat(Tensor input) -> Tensor");
+    m.def("_FloatToBfloat16Quantized(Tensor input) -> Tensor");
+}
+TORCH_LIBRARY_IMPL(q, CPU, m) {
+    m.impl("_Bfloat16QuantizedToFloat", ::torch::distributed::c10d::quantization::_bfloat16_to_float_cpu);
+    m.impl("_FloatToBfloat16Quantized", ::torch::distributed::c10d::quantization::_float_to_bfloat16_cpu);
+}
 } // namespace c10d
diff --git a/torch/csrc/distributed/c10d/frontend.hpp b/torch/csrc/distributed/c10d/frontend.hpp
index c90cc077b2823..b39d8b7a444bf 100644
--- a/torch/csrc/distributed/c10d/frontend.hpp
+++ b/torch/csrc/distributed/c10d/frontend.hpp
@@ -2,6 +2,7 @@
 
 #include <ATen/ATen.h>
 #include <c10/util/Optional.h>
+#include <c10d/PrefixStore.hpp>
 #include <c10d/ProcessGroup.hpp>
 #include <c10d/Store.hpp>
 #include <c10d/Types.hpp>
@@ -259,7 +260,26 @@ class TORCH_PYTHON_API DistributedC10d : public torch::CustomClassHolder {
   int64_t group_count_;
 };
 
-// Must be called to initialize Torchbind bindings for c10d.
-void initCustomClassBindings();
+// This class exists as a way to allow us to split NCCL-specific code into a
+// different file. frontend_cuda.cpp will, if USE_C10D_NCCL is defined,
+// override this NCCLProcessGroupProvider with one that will actually do
+// something.
+struct TORCH_API NCCLProcessGroupProvider {
+  virtual c10::intrusive_ptr<ProcessGroup> get(
+      c10::intrusive_ptr<PrefixStore> /*prefix_store*/,
+      int64_t /*rank*/,
+      int64_t /*world_size*/,
+      std::chrono::milliseconds /*timeout*/) const {
+    AT_ERROR(
+        "Attempting to create NCCL-based process group while NCCL is either not enabled or built");
+  }
+
+  virtual ~NCCLProcessGroupProvider() = default;
+};
+
+TORCH_API void registerNCCLProcessGroupProvider(
+    NCCLProcessGroupProvider* provider);
+
+TORCH_API void initCustomClassBindings();
 
 } // namespace c10d
diff --git a/torch/csrc/distributed/c10d/frontend_cuda.cpp b/torch/csrc/distributed/c10d/frontend_cuda.cpp
new file mode 100644
index 0000000000000..1b42f13b3c8df
--- /dev/null
+++ b/torch/csrc/distributed/c10d/frontend_cuda.cpp
@@ -0,0 +1,136 @@
+#include <torch/csrc/distributed/c10d/frontend_cuda.hpp>
+
+#ifdef USE_C10D_NCCL
+
+#include <c10/util/Exception.h>
+#include <c10d/ProcessGroupNCCL.hpp>
+#include <torch/csrc/distributed/c10d/frontend.hpp>
+#include <torch/csrc/distributed/c10d/quantization/quantization_gpu.h>
+#include <torch/library.h>
+
+namespace c10d {
+
+void initCustomClassBindingsNccl() {
+  // XXX: Ideally the Options of ProcessGroupNCCL should be
+  // bound using `def_readwrite` like in pybind11, but we
+  // didn't do that because: 1. no milisecond support yet
+  // 2. no def_readwrite or property support yet.
+  // TODO: make this binding the same as pybind11
+  static const auto ProcessGroupNCCLOptionsTorchBind =
+      torch::class_<::c10d::ProcessGroupNCCL::Options>(
+          "dist_c10d", "ProcessGroupNCCLOptions")
+          .def(torch::init([](int64_t timeout, bool isHighPriorityStream) {
+            auto opTimeout = std::chrono::milliseconds(timeout);
+            auto opts =
+                ::c10d::ProcessGroupNCCL::Options::create(isHighPriorityStream);
+            opts->timeout = opTimeout;
+            return opts;
+          }));
+
+  static const auto ProcessGroupNCCLTorchBind =
+      torch::class_<::c10d::ProcessGroupNCCL>("dist_c10d", "ProcessGroupNCCL")
+          .def_pickle(
+              [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) {
+                auto base_process_group =
+                    ::c10::static_intrusive_pointer_cast<::c10d::ProcessGroup>(
+                        self);
+                auto name =
+                    ::c10d::DistributedC10d::get()->getNameOfProcessGroup(self);
+                return std::vector<std::string>{name};
+              },
+              [](std::vector<std::string> state) {
+                TORCH_CHECK(
+                    state.size() == 1,
+                    "Expecting exactly 1 state when restoring ProcessGroupNCCL, got: ",
+                    state.size());
+                const auto& process_group_name = state.front();
+                auto base_process_group =
+                    ::c10d::DistributedC10d::get()->getProcessGroupByName(
+                        process_group_name);
+                TORCH_CHECK(
+                    base_process_group.defined(),
+                    "Needed process group not found, ",
+                    "please create a process group with name: ",
+                    process_group_name);
+                c10::intrusive_ptr<::c10d::ProcessGroupNCCL>
+                    process_group_nccl = ::c10::dynamic_intrusive_pointer_cast<
+                        ::c10d::ProcessGroupNCCL>(base_process_group);
+                TORCH_CHECK(
+                    process_group_nccl.defined(),
+                    "Process group ",
+                    process_group_name,
+                    " isn't configured for NCCL backend");
+                return process_group_nccl;
+              })
+          .def(torch::init(
+              [](const c10::intrusive_ptr<::c10d::Store>& store,
+                 int64_t rank,
+                 int64_t size,
+                 c10::intrusive_ptr<::c10d::ProcessGroupNCCL::Options> options,
+                 const std::string& name) {
+                auto pg = c10::make_intrusive<::c10d::ProcessGroupNCCL>(
+                    store, rank, size, options);
+                ::c10d::DistributedC10d::get()->registerProcessGroupName(
+                    pg, name);
+                return pg;
+              }))
+          .def(
+              "alltoall_base",
+              [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self,
+                 at::Tensor output,
+                 at::Tensor input,
+                 std::vector<int64_t> outputSplitSizes,
+                 std::vector<int64_t> inputSplitSizes) {
+                return self->alltoall_base(
+                    output,
+                    input,
+                    outputSplitSizes,
+                    inputSplitSizes,
+                    ::c10d::AllToAllOptions());
+              })
+          .def(
+              "size",
+              [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) {
+                return (int64_t)self->getSize();
+              })
+          .def(
+              "rank",
+              [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) {
+                return (int64_t)self->getRank();
+              });
+}
+
+namespace {
+struct RealNCCLProcessGroupProvider : public NCCLProcessGroupProvider {
+  c10::intrusive_ptr<ProcessGroup> get(
+      c10::intrusive_ptr<PrefixStore> prefix_store,
+      int64_t rank,
+      int64_t world_size,
+      std::chrono::milliseconds timeout) const override {
+    auto options = ProcessGroupNCCL::Options::create();
+    options->is_high_priority_stream = false;
+    options->timeout = timeout;
+    return c10::make_intrusive<ProcessGroupNCCL>(
+        prefix_store, rank, world_size, options);
+  }
+};
+
+struct RegisterNCCLProcessGroupProvider {
+  RegisterNCCLProcessGroupProvider() {
+    static RealNCCLProcessGroupProvider provider;
+    registerNCCLProcessGroupProvider(&provider);
+  }
+};
+
+RegisterNCCLProcessGroupProvider reg;
+
+} // namespace
+#define DISPATCH_TO_CUDA(name, function) \
+    m.impl(name, torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(function)))
+TORCH_LIBRARY_IMPL(q, CUDA, m) {
+    DISPATCH_TO_CUDA("_Bfloat16QuantizedToFloat", ::torch::distributed::c10d::quantization::_bfloat16_to_float_cuda);
+    DISPATCH_TO_CUDA("_FloatToBfloat16Quantized", ::torch::distributed::c10d::quantization::_float_to_bfloat16_cuda);
+}
+} // namespace c10d
+
+#endif // USE_C10D_NCCL
diff --git a/torch/csrc/distributed/c10d/frontend_cuda.hpp b/torch/csrc/distributed/c10d/frontend_cuda.hpp
new file mode 100644
index 0000000000000..a790f2e847b0d
--- /dev/null
+++ b/torch/csrc/distributed/c10d/frontend_cuda.hpp
@@ -0,0 +1,12 @@
+#pragma once
+
+#ifdef USE_C10D_NCCL
+#include <c10/macros/Export.h>
+
+namespace c10d {
+
+TORCH_API void initCustomClassBindingsNccl();
+
+}
+
+#endif
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 6b52d3c058384..4bac0ca46edc4 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -17,7 +17,7 @@
 
 #ifdef USE_C10D_NCCL
 #include <c10d/ProcessGroupNCCL.hpp>
-#include <torch/csrc/distributed/c10d/quantization/quantization_gpu.h>
+#include <torch/csrc/distributed/c10d/frontend_cuda.hpp>
 #endif
 
 #ifdef USE_C10D_MPI
@@ -35,7 +35,6 @@
 
 #include <torch/csrc/Exceptions.h>
 #include <torch/csrc/distributed/c10d/python_comm_hook.h>
-#include <torch/csrc/distributed/c10d/quantization/quantization.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/utils/object_ptr.h>
 #include <torch/csrc/utils/pybind.h>
@@ -233,6 +232,9 @@ void _register_builtin_comm_hook(
 PyObject* c10d_init(PyObject* _unused, PyObject* noargs) {
   C10_LOG_API_USAGE_ONCE("c10d.python.import");
   ::c10d::initCustomClassBindings();
+#ifdef USE_C10D_NCCL
+  ::c10d::initCustomClassBindingsNccl();
+#endif
 
   auto c10d_module = THPObjectPtr(PyImport_ImportModule("torch.distributed"));
   if (!c10d_module) {
@@ -1646,28 +1648,6 @@ static PyMethodDef methods[] = { // NOLINT
 PyMethodDef* python_functions() {
   return methods;
 }
-
-namespace quantization {
-TORCH_LIBRARY(q, m) {
-    m.def("_Bfloat16QuantizedToFloat(Tensor input) -> Tensor");
-    m.def("_FloatToBfloat16Quantized(Tensor input) -> Tensor");
-}
-    TORCH_LIBRARY_IMPL(q, CPU, m) {
-        m.impl("_Bfloat16QuantizedToFloat", _bfloat16_to_float_cpu);
-        m.impl("_FloatToBfloat16Quantized", _float_to_bfloat16_cpu);
-    }
-
-#ifdef USE_C10D_NCCL
-    #define DISPATCH_TO_CUDA(name, function) \
-        m.impl(name, torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(function)))
-    TORCH_LIBRARY_IMPL(q, CUDA, m) {
-        DISPATCH_TO_CUDA("_Bfloat16QuantizedToFloat", _bfloat16_to_float_cuda);
-        DISPATCH_TO_CUDA("_FloatToBfloat16Quantized", _float_to_bfloat16_cuda);
-    }
-#endif
-
-} // namespace quantization
-
 } // namespace c10d
 } // namespace distributed
 } // namespace torch
diff --git a/torch/csrc/distributed/rpc/request_callback_impl.cpp b/torch/csrc/distributed/rpc/request_callback_impl.cpp
index 7001209be9851..5fbe63ede321c 100644
--- a/torch/csrc/distributed/rpc/request_callback_impl.cpp
+++ b/torch/csrc/distributed/rpc/request_callback_impl.cpp
@@ -16,6 +16,7 @@
 #include <torch/csrc/distributed/autograd/rpc_messages/rref_backward_resp.h>
 #include <torch/csrc/distributed/autograd/utils.h>
 #include <torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h>
+#include <torch/csrc/distributed/rpc/py_rref.h>
 #include <torch/csrc/distributed/rpc/python_call.h>
 #include <torch/csrc/distributed/rpc/python_remote_call.h>
 #include <torch/csrc/distributed/rpc/python_resp.h>

From c7027f19efbb2f7b274c9e5fc0e87fe4b084e6ae Mon Sep 17 00:00:00 2001
From: Supriya Rao <supriyar@fb.com>
Date: Thu, 26 Aug 2021 21:05:56 -0700
Subject: [PATCH 282/530] [quant][fx] Add support for dynamic linear + relu
 fusion (INT8) (#63799)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63799

Add a new module that can be used for module swap with the nni.LinearReLU module in convert function.
Supports INT8 currently (since FP16 op doesn't have relu fusion yet).

Fixes #55393

Test Plan:
python test/test_quantization.py test_dynamic_fusion

Imported from OSS

Reviewed By: heitorschueroff

Differential Revision: D30502812

fbshipit-source-id: 3668e4f001a0626d469e17ac323acf582ee28a51
---
 .../eager/test_quantize_eager_ptq.py          | 18 +++++++
 test/quantization/fx/test_quantize_fx.py      | 54 ++++++++++++++++++-
 .../intrinsic/quantized/dynamic/__init__.py   |  1 +
 .../quantized/dynamic/modules/__init__.py     |  6 +++
 .../quantized/dynamic/modules/linear_relu.py  | 47 ++++++++++++++++
 torch/nn/quantized/dynamic/modules/linear.py  |  9 +++-
 .../quantization/fx/quantization_patterns.py  | 13 +++--
 torch/quantization/ns/mappings.py             |  3 ++
 torch/quantization/quantization_mappings.py   |  2 +
 .../testing/_internal/common_quantization.py  |  8 +++
 10 files changed, 154 insertions(+), 7 deletions(-)
 create mode 100644 torch/nn/intrinsic/quantized/dynamic/__init__.py
 create mode 100644 torch/nn/intrinsic/quantized/dynamic/modules/__init__.py
 create mode 100644 torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py

diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py
index 1824da514b733..10cbd928b2b36 100644
--- a/test/quantization/eager/test_quantize_eager_ptq.py
+++ b/test/quantization/eager/test_quantize_eager_ptq.py
@@ -42,6 +42,7 @@
     EmbeddingBagModule,
     EmbeddingModule,
     EmbeddingWithLinear,
+    LinearReluLinearModel,
 )
 
 # annotated models
@@ -995,6 +996,23 @@ def checkQuantized(model):
         model = quantize_dynamic(NestedModel().eval(), qconfig_dict)
         checkQuantized(model)
 
+    def test_linear_relu_fusion(self):
+        dtype = torch.qint8
+        model = LinearReluLinearModel().eval()
+        qconfig = default_dynamic_qconfig
+        qconfig_dict = {'' : qconfig}
+        torch.quantization.fuse_modules(model, [['fc1', 'relu']], inplace=True)
+        prepare_dynamic(model, qconfig_dict)
+        convert_dynamic(model)
+
+        def checkQuantized(model):
+            self.checkDynamicQuantizedLinearRelu(model.fc1, dtype)
+            self.checkDynamicQuantizedLinear(model.fc2, dtype)
+            self.checkScriptable(model, self.calib_data, check_save_load=True)
+            self.checkNoQconfig(model)
+
+        checkQuantized(model)
+
     @given(qconfig=st.sampled_from([per_channel_dynamic_qconfig, default_dynamic_qconfig]),
            dtype=st.sampled_from([torch.qint8, torch.float16]))
     def test_quantized_rnn(self, qconfig, dtype):
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 08474d2bc1d19..cdf2e7bea4328 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -6,6 +6,7 @@
 import torch.nn.quantized.dynamic as nnqd
 import torch.nn.intrinsic as nni
 import torch.nn.intrinsic.quantized as nniq
+import torch.nn.intrinsic.quantized.dynamic as nniqd
 import torch.multiprocessing as mp
 
 # graph mode quantization based on fx
@@ -2883,6 +2884,57 @@ def forward(self, x):
         self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence)
         self.checkGraphModuleNodes(m_ref, expected_node_occurrence=node_occurrence_ref)
 
+    @skipIfNoFBGEMM
+    def test_dynamic_with_fusion(self):
+        """
+        Tests that dynamic quantization APIs work with Linear + Relu fusion
+        """
+        class LinearRelu(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 5)
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                x = self.linear(x)
+                return self.relu(x)
+
+        class Linear(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.w = torch.ones(5, 5)
+                self.b = torch.zeros(5)
+
+            def forward(self, x):
+                return torch.nn.functional.linear(x, self.w, self.b)
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mods1 = torch.nn.Sequential(LinearRelu(), LinearRelu())
+                self.mods2 = Linear()
+                self.relu = F.relu
+
+            def forward(self, x):
+                x = self.mods1(x)
+                x = self.mods2(x)
+                x = self.relu(x)
+                return x
+
+        model = M().eval()
+        qconfig = {
+            "": default_dynamic_qconfig,
+        }
+        m = prepare_fx(model, qconfig)
+        m = convert_fx(m)
+        m(torch.rand(5, 5))
+        node_list = [
+            ns.call_module(nniqd.LinearReLU),
+            ns.call_module(nniqd.LinearReLU),
+            ns.call_function(torch.ops.quantized.linear_relu_dynamic),
+        ]
+        self.checkGraphModuleNodes(m, expected_node_list=node_list)
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     """Unit tests for individual ops
@@ -2956,7 +3008,7 @@ def forward(self, x):
         }
         quant_type_to_qlinear_relu_fun = {
             # we don't have linear_relu_dynamic
-            QuantType.DYNAMIC: ns.call_function(torch.ops.quantized.linear_dynamic),
+            QuantType.DYNAMIC: ns.call_function(torch.ops.quantized.linear_relu_dynamic),
             QuantType.STATIC: ns.call_function(torch.ops.quantized.linear_relu),
             QuantType.QAT: ns.call_function(torch.ops.quantized.linear_relu),
         }
diff --git a/torch/nn/intrinsic/quantized/dynamic/__init__.py b/torch/nn/intrinsic/quantized/dynamic/__init__.py
new file mode 100644
index 0000000000000..3d79bdbfe8320
--- /dev/null
+++ b/torch/nn/intrinsic/quantized/dynamic/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/torch/nn/intrinsic/quantized/dynamic/modules/__init__.py b/torch/nn/intrinsic/quantized/dynamic/modules/__init__.py
new file mode 100644
index 0000000000000..ce571862b4275
--- /dev/null
+++ b/torch/nn/intrinsic/quantized/dynamic/modules/__init__.py
@@ -0,0 +1,6 @@
+import torch
+from .linear_relu import LinearReLU
+
+__all__ = [
+    'LinearReLU',
+]
diff --git a/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py b/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py
new file mode 100644
index 0000000000000..04c4c954810fd
--- /dev/null
+++ b/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py
@@ -0,0 +1,47 @@
+import torch
+import torch.nn.quantized.dynamic as nnqd
+import torch.nn.intrinsic as nni
+
+class LinearReLU(nnqd.Linear):
+    r"""
+    A LinearReLU module fused from Linear and ReLU modules that can be used
+    for dynamic quantization.
+    Supports both, FP16 and INT8 quantization.
+
+    We adopt the same interface as :class:`torch.nn.quantized.dynamic.Linear`.
+
+    Attributes:
+        Same as torch.nn.quantized.dynamic.Linear
+
+    Examples::
+
+        >>> m = nn.intrinsic.quantized.dynamic.LinearReLU(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+    _FLOAT_MODULE = nni.LinearReLU  # type: ignore[assignment]
+
+    def __init__(self, in_features, out_features, bias=True, dtype=torch.qint8):
+        super().__init__(in_features, out_features, bias, dtype)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self._packed_params.dtype == torch.qint8:
+            # TODO check if we should set reduce_rage = True by default here
+            Y = torch.ops.quantized.linear_relu_dynamic(
+                x, self._packed_params._packed_params, reduce_range=True)
+        # TODO Support this in a later PR
+        # elif self._packed_params.dtype == torch.float16:
+        #     Y = torch.ops.quantized.linear_relu_dynamic_fp16(
+        #         x, self._packed_params._packed_params)
+        else:
+            raise RuntimeError('Unsupported dtype on dynamic quantized linear relu!')
+        return Y.to(x.dtype)
+
+    def _get_name(self):
+        return 'DynamicQuantizedLinearReLU'
+
+    @classmethod
+    def from_float(cls, mod):
+        return super(LinearReLU, cls).from_float(mod)
diff --git a/torch/nn/quantized/dynamic/modules/linear.py b/torch/nn/quantized/dynamic/modules/linear.py
index 07cfdfe2846cc..ee153b10d2466 100644
--- a/torch/nn/quantized/dynamic/modules/linear.py
+++ b/torch/nn/quantized/dynamic/modules/linear.py
@@ -1,5 +1,6 @@
 import torch
 import torch.nn.quantized as nnq
+import torch.nn.intrinsic as nni
 from torch.nn.quantized.modules.utils import _quantize_weight
 
 class Linear(nnq.Linear):
@@ -79,11 +80,15 @@ def from_float(cls, mod):
             mod (Module): a float module, either produced by torch.quantization
                           utilities or provided by the user
         """
-        float_modules = [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear]
+        float_modules = [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
+                         torch.nn.intrinsic.modules.fused.LinearReLU]
+
         assert type(mod) in float_modules, \
             'nn.quantized.dynamic.Linear.from_float only works for one of' + \
             str([float_mod.__name__ for float_mod in float_modules])
         assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        if type(mod) == nni.LinearReLU:
+            mod = mod[0]
         if mod.qconfig is not None and mod.qconfig.weight is not None:
             weight_observer = mod.qconfig.weight()
         else:
@@ -102,6 +107,6 @@ def from_float(cls, mod):
             qweight = mod.weight.float()
         else:
             raise RuntimeError('Unsupported dtype specified for dynamic quantized Linear!')
-        qlinear = Linear(mod.in_features, mod.out_features, dtype=dtype)
+        qlinear = cls(mod.in_features, mod.out_features, dtype=dtype)
         qlinear.set_weight_bias(qweight, mod.bias)
         return qlinear
diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index 09ca190a73668..b7c39ca92cead 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -1022,9 +1022,14 @@ def convert(self,
                 elif dtypes in [(torch.float32, torch.qint8, torch.quint8),
                                 (torch.float32, torch.float16, None)]:
                     # choose linear dynamic or linear dynamic fp16 op based on weight dtype
-                    qlinear_op = torch.ops.quantized.linear_dynamic \
-                        if weight_dtype == torch.qint8 \
-                        else torch.ops.quantized.linear_dynamic_fp16
+                    if weight_dtype == torch.qint8:
+                        if self.relu_node:
+                            qlinear_op = torch.ops.quantized.linear_relu_dynamic
+                        else:
+                            qlinear_op = torch.ops.quantized.linear_dynamic
+                    else:  # TODO add support for fp16 + relu fusion in a later PR
+                        qlinear_op = torch.ops.quantized.linear_dynamic_fp16
+
                     linear_input = load_arg(quantized=torch.float)(self.linear_node.args[0])
                     qlinear_args = (linear_input, packed_weight)  # type: ignore[assignment]
                     op_out = quantized_graph.create_node(
@@ -1033,7 +1038,7 @@ def convert(self,
                     # TODO: may need to change the key to Node regenerate the map in each transformation,
                     # since we might not be able to rely on the name
                     node_name_to_scope[op_out.name] = node_name_to_scope[self.linear_node.name]
-                    if self.relu_node:
+                    if self.relu_node and weight_dtype is not torch.qint8:
                         op_out = quantized_graph.create_node("call_function", torch.nn.functional.relu, (op_out,), {})
                     return op_out
                 else:
diff --git a/torch/quantization/ns/mappings.py b/torch/quantization/ns/mappings.py
index 399ddca22668e..e97d77119d00e 100644
--- a/torch/quantization/ns/mappings.py
+++ b/torch/quantization/ns/mappings.py
@@ -8,6 +8,7 @@
 import torch.nn.quantized as nnq
 import torch.nn.quantized.dynamic as nnqd
 import torch.nn.intrinsic.quantized as nniq
+import torch.nn.intrinsic.quantized.dynamic as nniqd
 import torch.nn.intrinsic.qat as nniqat
 import torch.nn.intrinsic as nni
 import torch.nn.qat as nnqat
@@ -70,6 +71,7 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]:
             nnq.Linear,
             nni.LinearReLU,
             nniq.LinearReLU,
+            nniqd.LinearReLU,
             nnqat.Linear,
             nnqd.Linear,
             nniqat.LinearReLU,
@@ -529,6 +531,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
         nniqat.ConvReLU2d,
         nniqat.ConvReLU3d,
         nniqat.LinearReLU,
+        nniqd.LinearReLU,
     ])
 
     MODS_IO_TYPE_INT8: Set[NSNodeTargetType] = set([
diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py
index 6179398b7398a..775d40bb23efa 100644
--- a/torch/quantization/quantization_mappings.py
+++ b/torch/quantization/quantization_mappings.py
@@ -6,6 +6,7 @@
 import torch.nn.functional as F
 import torch.nn.intrinsic as nni
 import torch.nn.intrinsic.quantized as nniq
+import torch.nn.intrinsic.quantized.dynamic as nniqd
 import torch.nn.intrinsic.quantized._reference as nniqr
 import torch.nn.intrinsic.qat as nniqat
 import torch.nn.quantized as nnq
@@ -122,6 +123,7 @@
     nn.GRU: nnqd.GRU,
     nn.LSTMCell: nnqd.LSTMCell,
     nn.RNNCell: nnqd.RNNCell,
+    nni.LinearReLU: nniqd.LinearReLU,
 }
 
 # Allowlist for propagating the qconfig
diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py
index 6b2d1dd13a33d..77512f7ef445a 100644
--- a/torch/testing/_internal/common_quantization.py
+++ b/torch/testing/_internal/common_quantization.py
@@ -5,6 +5,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import torch.nn.intrinsic.quantized.dynamic as nniqd
 import torch.nn.quantized as nnq
 import torch.nn.quantized.dynamic as nnqd
 from torch.nn.intrinsic import _FusedModule
@@ -422,6 +423,13 @@ def checkDynamicQuantizedLinear(self, mod, dtype):
         self.assertEqual(type(mod), nnqd.Linear)
         self.assertEqual(mod._packed_params.dtype, dtype)
 
+    def checkDynamicQuantizedLinearRelu(self, mod, dtype):
+        r"""Checks that mod has been swapped for an nnqd.Linear
+            module, the bias is float.
+        """
+        self.assertEqual(type(mod), nniqd.LinearReLU)
+        self.assertEqual(mod._packed_params.dtype, dtype)
+
     def check_eager_serialization(self, ref_model, loaded_model, x):
         # Check state dict serialization and torch.save APIs
         model_dict = ref_model.state_dict()

From 975f4ccad6fb7ca13c50ee628ec3fb3a77a64893 Mon Sep 17 00:00:00 2001
From: Supriya Rao <supriyar@fb.com>
Date: Thu, 26 Aug 2021 21:05:56 -0700
Subject: [PATCH 283/530] [quant] support linear_relu_dynamic for qnnpack
 backend (#63820)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63820

Adds support in the operator directly to call relu operator if relu fusion is enabled.
Once QNNPACK natively supports relu fusion in the linear_dynamic this can be removed

Test Plan:
python test/test_quantization.py TestDynamicQuantizedLinear.test_qlinear

Imported from OSS

Reviewed By: vkuzo

Differential Revision: D30502813

fbshipit-source-id: 3352ee5f73e482b6d1941f389d720a461b84ba23
---
 aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp | 6 ++++++
 test/quantization/core/test_quantized_op.py            | 1 -
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
index da64197fb4577..23c6158889db2 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
@@ -349,6 +349,12 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
   TORCH_INTERNAL_ASSERT(
       runStatus == pytorch_qnnp_status_success,
       "failed to run QNNPACK Linear operator");
+
+  // Call the relu operator here until qlinear dynamic in QNNPACK
+  // supports it natively.
+  if (ReluFused) {
+    output.relu_();
+  }
   return output;
 }
 
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 9243fe2440173..86fe3509ab4fb 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -2606,7 +2606,6 @@ class TestDynamicQuantizedLinear(TestCase):
     def test_qlinear(self, batch_size, input_channels, output_channels,
                      use_bias, use_relu, use_multi_dim_input, use_channelwise, reduce_range):
         if torch.backends.quantized.engine == 'qnnpack':
-            use_relu = False
             reduce_range = False
 
         qlinear_prepack = torch.ops.quantized.linear_prepack

From cec44aa574e06e8aa1096b62a7c6d7c4dda8a3f5 Mon Sep 17 00:00:00 2001
From: Supriya Rao <supriyar@fb.com>
Date: Thu, 26 Aug 2021 21:05:56 -0700
Subject: [PATCH 284/530] [quant] Add op support for linear_relu_dynamic_fp16
 (#63824)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63824

Add a fused operator implementation that will work with the quantization fusion APIs.
Once FBGEMM FP16 kernel supports relu fusion natively we can remove the addition from the PT operator.

Test Plan:
python test/test_quantization.py

Imported from OSS

Reviewed By: heitorschueroff

Differential Revision: D30503514

fbshipit-source-id: 6bf3bd53f47ffaa3f1d178eaad8cc980a7f5258a
---
 .../native/quantized/cpu/qlinear_dynamic.cpp  | 11 +++++--
 aten/src/ATen/native/quantized/library.cpp    |  1 +
 test/quantization/core/test_quantized_op.py   | 32 +++++++++++++++++++
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
index 23c6158889db2..3331a0387111c 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
@@ -451,8 +451,14 @@ class QLinearDynamicFp16 final {
     TORCH_CHECK(
         fbgemm::fbgemmSupportedCPU(), "Your CPU doesn't support FBGEMM.");
 
-    TORCH_INTERNAL_ASSERT(!ReluFused);
-    return packed_weight->apply_dynamic(std::move(input));
+    auto output = packed_weight->apply_dynamic(std::move(input));
+
+    // Call the relu operator here until fp16 linear dynamic in FBGEMM
+    // supports it natively.
+    if (ReluFused) {
+      output.relu_();
+    }
+    return output;
   }
 #else // USE_FBGEMM
   static at::Tensor run(
@@ -471,6 +477,7 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) {
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_dynamic"), TORCH_FN(QLinearDynamicInt8<false>::run));
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu_dynamic"), TORCH_FN(QLinearDynamicInt8<true>::run));
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_dynamic_fp16"), TORCH_FN(QLinearDynamicFp16<false>::run));
+  m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu_dynamic_fp16"), TORCH_FN(QLinearDynamicFp16<true>::run));
 }
 
 TORCH_LIBRARY_IMPL(_quantized, CPU, m) {
diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp
index 8ead74f326ff2..3dcf75b1ccb32 100644
--- a/aten/src/ATen/native/quantized/library.cpp
+++ b/aten/src/ATen/native/quantized/library.cpp
@@ -142,6 +142,7 @@ TORCH_LIBRARY(quantized, m) {
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"));
+  m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"));
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 86fe3509ab4fb..49b7c96847612 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -2782,6 +2782,38 @@ def test_qlinear_legacy(self, batch_size, input_channels, output_channels):
         self.assertEqual(Y_fp32, Y_fp32_ref,
                          msg="torch.ops.quantized.fbgemm_linear_dynamic results are off")
 
+    @skipIfNoFBGEMM
+    def test_qlinear_dynamic_fp16(self):
+
+        options = itertools.product(
+            (2, 4),         # batch_size
+            (4, 5, 12),     # input_channels
+            (4, 7, 8),      # output_channels
+            (True, False),  # use_bias
+            (True, False),  # use_relu
+        )
+        for batch_size, input_channels, output_channels, use_bias, use_relu in options:
+            qlinear_prepack = torch.ops.quantized.linear_prepack_fp16
+            if use_relu:
+                qlinear_dynamic = torch.ops.quantized.linear_relu_dynamic_fp16
+            else:
+                qlinear_dynamic = torch.ops.quantized.linear_dynamic_fp16
+
+            x = torch.randn(batch_size, input_channels)
+            w = torch.randn(output_channels, input_channels)
+            bias = torch.randn(output_channels) if use_bias else None
+
+            w_packed = qlinear_prepack(w, bias)
+            out = qlinear_dynamic(x, w_packed)
+
+            # qlinear_dynamic_fp16 uses FP32 activation tensors and FP16 weight tensors
+            # output is FP32
+            w_fp16 = w.to(torch.float16).to(torch.float32)
+            ref = F.linear(x, w_fp16, bias)
+            if use_relu:
+                ref.relu_()
+
+            self.assertEqual(out, ref)
 
 class TestDynamicQuantizedRNNOp(TestCase):
     """Tests the correctness of the dynamic quantized lstm/gru."""

From 294db0603fef315c8f6ac95e30f8ce6b5cce2b5a Mon Sep 17 00:00:00 2001
From: Supriya Rao <supriyar@fb.com>
Date: Thu, 26 Aug 2021 21:05:56 -0700
Subject: [PATCH 285/530] [quant] Add support for linear_relu fusion for FP16
 dynamic quant (#63826)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63826

Support the conversion of the intrinsic linearRelu module to the quantized dynamic LinearReLU module
Verify the support works for both linear module and functional linear fusion

Test Plan:
python test/test_quantization.py test_dynamic_with_fusion

Imported from OSS

Reviewed By: iramazanli

Differential Revision: D30503513

fbshipit-source-id: 70446797e9670dfef7341cba2047183d6f88b70f
---
 test/quantization/fx/test_quantize_fx.py      | 33 ++++++++++++-------
 .../quantized/dynamic/modules/linear_relu.py  |  7 ++--
 .../quantization/fx/quantization_patterns.py  |  9 ++---
 3 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index cdf2e7bea4328..762919eeb04ea 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -2922,18 +2922,24 @@ def forward(self, x):
                 return x
 
         model = M().eval()
-        qconfig = {
-            "": default_dynamic_qconfig,
+
+        dynamic_quantized_ops = {
+            float16_dynamic_qconfig: torch.ops.quantized.linear_relu_dynamic_fp16,
+            default_dynamic_qconfig: torch.ops.quantized.linear_relu_dynamic
         }
-        m = prepare_fx(model, qconfig)
-        m = convert_fx(m)
-        m(torch.rand(5, 5))
-        node_list = [
-            ns.call_module(nniqd.LinearReLU),
-            ns.call_module(nniqd.LinearReLU),
-            ns.call_function(torch.ops.quantized.linear_relu_dynamic),
-        ]
-        self.checkGraphModuleNodes(m, expected_node_list=node_list)
+        for config in [float16_dynamic_qconfig, default_dynamic_qconfig]:
+            qconfig = {
+                "": config
+            }
+            m = prepare_fx(model, qconfig)
+            m = convert_fx(m)
+            m(torch.rand(5, 5))
+            node_list = [
+                ns.call_module(nniqd.LinearReLU),
+                ns.call_module(nniqd.LinearReLU),
+                ns.call_function(dynamic_quantized_ops[config]),
+            ]
+            self.checkGraphModuleNodes(m, expected_node_list=node_list)
 
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
@@ -3089,7 +3095,10 @@ def forward(self, x):
             if is_reference:
                 qlinear_fun = ns.call_function(torch.nn.functional.linear)
             else:
-                qlinear_fun = ns.call_function(torch.ops.quantized.linear_dynamic_fp16)
+                if has_relu:
+                    qlinear_fun = ns.call_function(torch.ops.quantized.linear_relu_dynamic_fp16)
+                else:
+                    qlinear_fun = ns.call_function(torch.ops.quantized.linear_dynamic_fp16)
             prepare_node_occurrence = {
                 # weight
                 ns.call_module(torch.quantization.PlaceholderObserver): 1
diff --git a/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py b/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py
index 04c4c954810fd..c30b3109ef601 100644
--- a/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py
+++ b/torch/nn/intrinsic/quantized/dynamic/modules/linear_relu.py
@@ -31,10 +31,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             # TODO check if we should set reduce_rage = True by default here
             Y = torch.ops.quantized.linear_relu_dynamic(
                 x, self._packed_params._packed_params, reduce_range=True)
-        # TODO Support this in a later PR
-        # elif self._packed_params.dtype == torch.float16:
-        #     Y = torch.ops.quantized.linear_relu_dynamic_fp16(
-        #         x, self._packed_params._packed_params)
+        elif self._packed_params.dtype == torch.float16:
+            Y = torch.ops.quantized.linear_relu_dynamic_fp16(
+                x, self._packed_params._packed_params)
         else:
             raise RuntimeError('Unsupported dtype on dynamic quantized linear relu!')
         return Y.to(x.dtype)
diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index b7c39ca92cead..6362961ad8daa 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -1027,8 +1027,11 @@ def convert(self,
                             qlinear_op = torch.ops.quantized.linear_relu_dynamic
                         else:
                             qlinear_op = torch.ops.quantized.linear_dynamic
-                    else:  # TODO add support for fp16 + relu fusion in a later PR
-                        qlinear_op = torch.ops.quantized.linear_dynamic_fp16
+                    else:
+                        if self.relu_node:
+                            qlinear_op = torch.ops.quantized.linear_relu_dynamic_fp16
+                        else:
+                            qlinear_op = torch.ops.quantized.linear_dynamic_fp16
 
                     linear_input = load_arg(quantized=torch.float)(self.linear_node.args[0])
                     qlinear_args = (linear_input, packed_weight)  # type: ignore[assignment]
@@ -1038,8 +1041,6 @@ def convert(self,
                     # TODO: may need to change the key to Node regenerate the map in each transformation,
                     # since we might not be able to rely on the name
                     node_name_to_scope[op_out.name] = node_name_to_scope[self.linear_node.name]
-                    if self.relu_node and weight_dtype is not torch.qint8:
-                        op_out = quantized_graph.create_node("call_function", torch.nn.functional.relu, (op_out,), {})
                     return op_out
                 else:
                     assert dtypes == (torch.float16, torch.float16, None)

From 3f1c8094707f695cf0cf51c795b18093e0a3ab86 Mon Sep 17 00:00:00 2001
From: Ansha Yu <ansha@fb.com>
Date: Thu, 26 Aug 2021 23:17:42 -0700
Subject: [PATCH 286/530] [static runtime] port c2 argmin kernel (#63632)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63632

Local benchmarking with 1 input repeated 10k iter on 290331537_4 local net. Reduces argmin runtime by about 80% and and local net execution by about ~0.71-0.77ms.

Before:
```
I0826 17:25:53.972786 1104614 PyTorchPredictorBenchLib.cpp:313] PyTorch run finished. Milliseconds per iter: 7.37599. Iters per second: 135.57
```
```
Static runtime ms per iter: 8.22086. Iters per second: 121.642
Time per node type:
        4.13527 ms.    50.9157%. fb::sigrid_transforms_torch_bind (1 nodes, out variant)
       0.868506 ms.    10.6935%. aten::argmin (1 nodes, out variant)
...
```

After:
```
I0826 17:17:54.165174 1064079 PyTorchPredictorBenchLib.cpp:313] PyTorch run finished. Milliseconds per iter: 6.66724. Iters per second: 149.987
```
```
Static runtime ms per iter: 7.68172. Iters per second: 130.179
Time per node type:
         4.1452 ms.    54.0612%. fb::sigrid_transforms_torch_bind (1 nodes, out variant)
       0.656778 ms.    8.56562%. fb::quantized_linear (8 nodes)
       0.488229 ms.    6.36741%. static_runtime::to_copy (827 nodes, out variant)
       0.372678 ms.    4.86042%. aten::argmin (1 nodes, out variant)
...Time per node type:
        3.39387 ms.    53.5467%. fb::sigrid_transforms_torch_bind (1 nodes, out variant)
       0.636216 ms.    10.0379%. fb::quantized_linear (8 nodes, out variant)
       0.410535 ms.    6.47721%. fb::clip_ranges_to_gather_to_offsets (304 nodes, out variant)
       0.212721 ms.     3.3562%. fb::clip_ranges_gather_sigrid_hash_precompute_v3 (157 nodes, out variant)
       0.173736 ms.    2.74111%. aten::matmul (1 nodes, out variant)
       0.150514 ms.    2.37474%. aten::argmin (1 nodes, out variant)
```
P447422384

Test Plan:
Test with local replayer sending traffic to `ansha_perf_test_0819.test`, and compare outputs to jit interpreter.

Start compute tier:
```
RUN_UUID=ansha_perf_test_0819.test.storage JOB_EXPIRE_TIME=864000 MODEL_ID=290331537_4 PREDICTOR_TAG= PREDICTOR_VERSION=405 PREDICTOR_TYPE=CPU ADDITIONAL_FLAGS="--enable_disagg_file_split=true --enable_adx=false --load_remote_file_locally=true --pytorch_predictor_static_runtime_whitelist_by_id=290331537" GFLAGS_CONFIG_PATH=sigrid/predictor/gflags/predictor_gflags_ads_perf_cpu_pyper SMC_TIER_NAME=sigrid.predictor.perf.ansha_per_test_0819.test.storage CLUSTER=tsp_rva ENTITLEMENT_NAME=ads_ranking_infra_test_t6 PREDICTOR_LOCAL_DIRECTORY= ICET_CONFIG_PATH= NNPI_COMPILATION_CONFIG_FILE= NUM_TASKS=1 NNPI_NUM_WORKERS=0 tw job start /data/users/ansha/fbsource/fbcode/tupperware/config/admarket/sigrid/predictor/predictor_perf_canary.tw
```

Start nnpi tier:
```
RUN_UUID=ansha_perf_test_0819.test JOB_EXPIRE_TIME=247200 MODEL_ID=290331537_4 PREDICTOR_TAG= PREDICTOR_VERSION=343 PREDICTOR_TYPE=NNPI_TWSHARED ADDITIONAL_FLAGS="--torch_glow_min_fusion_group_size=30 --pytorch_storage_tier_replayer_sr_connection_options=overall_timeout:1000000,processing_timeout:1000000 --predictor_storage_smc_tier=sigrid.predictor.perf.ansha_perf_test_0819.test.storage --pytorch_predictor_static_runtime_whitelist_by_id=290331537" GFLAGS_CONFIG_PATH=sigrid/predictor/gflags/predictor_gflags_ads_perf_glow_nnpi_pyper_v1 SMC_TIER_NAME=sigrid.predictor.perf.ansha_perf_test_0819.test CLUSTER=tsp_rva ENTITLEMENT_NAME=ads_ranking_infra_test_t17 PREDICTOR_LOCAL_DIRECTORY= ICET_CONFIG_PATH= NNPI_COMPILATION_CONFIG_FILE= NUM_TASKS=1 NNPI_NUM_WORKERS=0 tw job start /data/users/ansha/fbsource/fbcode/tupperware/config/admarket/sigrid/predictor/predictor_perf_canary.tw
```

```buck test caffe2/benchmarks/static_runtime:static_runtime_cpptest -- StaticRuntime.IndividualOps_Argmin --print-passing-details```

Compared outputs to jit interpreter to check for no differences greater than 1e-3 (with nnc on) https://www.internalfb.com/intern/diff/view-version/136824794/

Reviewed By: hlu1

Differential Revision: D30445635

fbshipit-source-id: 048de8867ac72f764132295d1ebfa843cde2fa27
---
 torch/csrc/jit/runtime/static/ops.cpp | 94 ++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 4d34ed9388364..484c4b03ad64c 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -9,6 +9,7 @@
 #include <ATen/native/Fill.h>
 #include <ATen/native/IndexingUtils.h>
 #include <ATen/native/Resize.h>
+#include <ATen/native/SharedReduceOps.h>
 #include <ATen/native/TensorAdvancedIndexing.h>
 #include <ATen/native/layer_norm.h>
 #include <ATen/native/quantized/cpu/fbgemm_utils.h>
@@ -178,6 +179,94 @@ Tensor& linear_out(
   return output;
 }
 
+Tensor& c2_argmin_out(
+    Tensor& output,
+    const Tensor& input,
+    const int64_t dim,
+    const bool keepdim) {
+  const auto ndim = input.dim();
+  int64_t dim_ = maybe_wrap_dim(dim, ndim);
+  TORCH_CHECK(dim_ >= 0 && dim_ < ndim);
+
+  const auto in_dims = input.sizes();
+
+  c10::SmallVector<int64_t, 5> out_dims;
+  out_dims.reserve(ndim);
+  int prev_size = 1;
+  int next_size = 1;
+  for (int i = 0; i < dim_; ++i) {
+    out_dims.push_back(in_dims[i]);
+    prev_size *= in_dims[i];
+  }
+  if (keepdim) {
+    out_dims.push_back(1);
+  }
+  for (auto i = dim_ + 1; i < ndim; ++i) {
+    out_dims.push_back(in_dims[i]);
+    next_size *= in_dims[i];
+  }
+  at::native::resize_(output, out_dims, c10::nullopt);
+
+  const auto n = in_dims[dim_];
+
+  if (next_size == 1) {
+    AT_DISPATCH_ALL_TYPES_AND2(
+        kHalf, kBFloat16, input.scalar_type(), "argmin_input", [&]() {
+          const auto in_ptr = input.data_ptr<scalar_t>();
+          const auto out_ptr = output.data_ptr<int64_t>();
+          // input is a [prev_size, n] tensor.
+          // output is a [prev_size,] tensor.
+          // Thus, access is contiguous/coalesced.
+          for (int i = 0; i < prev_size; ++i) {
+            auto v = std::min_element(
+                in_ptr + i * n,
+                in_ptr + (i + 1) * n,
+                [](scalar_t a, scalar_t b) {
+                  // if a is nan, then a is *less* than b with LessOrNan
+                  // semantics
+                  if (at::_isnan(a)) {
+                    return true;
+                  }
+                  // if a is not nan and b is nan, then a is not less than b
+                  // with LessOrNan semantics otherwise, act normally. If `b` is
+                  // NaN then a < b will always return false, so this is
+                  // equivalent to the first snippet.
+                  return a < b;
+                });
+            out_ptr[i] = std::distance(in_ptr + i * n, v);
+          }
+        });
+  } else {
+    AT_DISPATCH_ALL_TYPES_AND2(
+        kHalf, kBFloat16, input.scalar_type(), "argmin_input", [&]() {
+          const auto less_or_nan = native::detail::LessOrNan<scalar_t>{};
+
+          const auto in_ptr = input.data_ptr<scalar_t>();
+          const auto out_ptr = output.data_ptr<int64_t>();
+
+          std::memset(out_ptr, 0, prev_size * next_size * sizeof(int64_t));
+
+          for (int i = 0; i < prev_size; ++i) {
+            const scalar_t* cur_in_ptr = in_ptr + i * n * next_size + next_size;
+            for (int k = 1; k < n; ++k) {
+              for (int j = 0; j < next_size; ++j) {
+                int64_t* cur_out_ptr = out_ptr + i * next_size + j;
+                if (less_or_nan(
+                        *cur_in_ptr,
+                        in_ptr
+                            [i * n * next_size + *cur_out_ptr * next_size + j],
+                        *cur_out_ptr,
+                        k)) {
+                  *cur_out_ptr = k;
+                }
+                ++cur_in_ptr;
+              }
+            }
+          }
+        });
+  }
+  return output;
+}
 } // namespace native
 } // namespace at
 
@@ -1209,6 +1298,10 @@ REGISTER_OPERATOR_FUNCTOR(aten::argmin, aten_argmin, [](Node* n) -> SROperator {
     } else {
       auto& out_t = p_node->Output(0).toTensor();
       fastResizeToZero(out_t);
+      if (in0_t.is_contiguous() && dim.has_value()) {
+        at::native::c2_argmin_out(out_t, in0_t, dim.value(), keepdim);
+        return;
+      }
       at::cpu::argmin_out(out_t, in0_t, dim, keepdim);
     }
   };
@@ -1533,6 +1626,5 @@ REGISTER_OPERATOR_FUNCTOR(
         }
       };
     });
-
 } // namespace jit
 } // namespace torch

From 3c3bba4169067a7340ff1d786a6b61282cf26820 Mon Sep 17 00:00:00 2001
From: Hao Lu <hlu@fb.com>
Date: Fri, 27 Aug 2021 01:39:14 -0700
Subject: [PATCH 287/530] [Static Runtime] Use F14FastMap/F14FastSet (#63999)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63999

Use folly::F14FastMap/F14FastSet instead of std::unordered_map/unordered_set in the Static Runtime code base. folly::F14FastMap/F14FastSet implements the same APIs as std::unordered_map/unordered_set but faster. For details see https://github.com/facebook/folly/blob/master/folly/container/F14.md

Reviewed By: d1jang

Differential Revision: D30566149

fbshipit-source-id: 20a7fa2519e4dde96fb3fc61ef6c92bf6d759383
---
 torch/csrc/jit/runtime/static/impl.cpp       | 83 ++++++++++----------
 torch/csrc/jit/runtime/static/impl.h         | 33 ++++++--
 torch/csrc/jit/runtime/static/ops.cpp        |  3 +-
 torch/csrc/jit/runtime/static/te_wrapper.cpp |  5 +-
 4 files changed, 72 insertions(+), 52 deletions(-)

diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index cb9342b364cc5..b3e1eb116dc7d 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -104,8 +104,8 @@ bool mayContainAlias(AliasDb& db, const Value* a, const Value* b) {
 
 bool mayContainAlias(
     AliasDb& db,
-    const std::unordered_set<const Value*>& a,
-    const std::unordered_set<const Value*>& b) {
+    const FastSet<const Value*>& a,
+    const FastSet<const Value*>& b) {
   std::vector<Value*> as;
   std::vector<Value*> bs;
   as.reserve(a.size());
@@ -122,11 +122,11 @@ bool mayContainAlias(
 }
 
 // Get set of all inputs/outputs/constants (always alive) and their aliases
-std::unordered_set<const Value*> GetAlwaysAliveValues(
+FastSet<const Value*> GetAlwaysAliveValues(
     const std::shared_ptr<torch::jit::Graph>& graph,
     AliasDb& db) {
   // a set of Values whose live-range exceed current inference
-  std::unordered_set<const Value*> always_alive;
+  FastSet<const Value*> always_alive;
 
   // mark inputs, constants, outputs as always_alive
   for (const auto* input : graph->inputs()) {
@@ -148,7 +148,7 @@ std::unordered_set<const Value*> GetAlwaysAliveValues(
     // constants are already in the always_alive set
     if (node->kind() != prim::Constant) {
       for (const auto* v : node->outputs()) {
-        if (mayContainAlias(db, ValueSet{v}, always_alive)) {
+        if (mayContainAlias(db, {v}, always_alive)) {
           always_alive.insert(v);
         }
       }
@@ -158,22 +158,22 @@ std::unordered_set<const Value*> GetAlwaysAliveValues(
 }
 
 //  Map each value to all values that are alive at the same time.
-using LivenessMap = std::unordered_map<const Value*, std::set<const Value*>>;
+using LivenessMap = FastMap<const Value*, std::set<const Value*>>;
 
 //  The algorithm does a traversal of the execution graph
 //  while keeping track of the live values.
 LivenessMap GetLivenessMap(
     const std::shared_ptr<torch::jit::Graph>& graph,
-    const std::unordered_set<const Value*>& always_alive,
+    const FastSet<const Value*>& always_alive,
     AliasDb& db) {
   // map a Value to a set of Values that overlap live-ranges with the Value's
-  std::unordered_map<const Value*, std::set<const Value*>> liveness_map;
+  FastMap<const Value*, std::set<const Value*>> liveness_map;
 
   // map Values to its creation order in graph (Note: only traverse top-level
   // nodes such that nodes under control-flows are represented by top-level
   // block nodes)
   std::vector<const Value*> values_in_creation_order;
-  std::unordered_map<const Value*, size_t> values_to_idx_in_creation_order;
+  FastMap<const Value*, size_t> values_to_idx_in_creation_order;
   for (const auto* node : graph->nodes()) {
     for (const auto* v : node->outputs()) {
       values_to_idx_in_creation_order[v] = values_in_creation_order.size();
@@ -184,10 +184,10 @@ LivenessMap GetLivenessMap(
   // presence of a Value in live_values_use_chain means the Value alive
   // Value mapped to set of Nodes that may use the Value (i.e., use-chain of
   // Value)
-  std::unordered_map<const Value*, std::set<const Node*>> live_values_use_chain;
+  FastMap<const Value*, std::set<const Node*>> live_values_use_chain;
   // Node mapped to set of Values that the Node may use (i.e., def-chain of node
   // inputs)
-  std::unordered_map<const Node*, std::set<const Value*>> live_nodes_def_chain;
+  FastMap<const Node*, std::set<const Value*>> live_nodes_def_chain;
 
   // add v to the current liveness_map
   std::function<void(const Value* v)> add_live_value_fn = [&](const Value* v) {
@@ -320,12 +320,12 @@ LivenessMap GetLivenessMap(
 std::pair<std::vector<const Value*>, std::vector<const Value*>>
 GetMemoryPlanningCandidates(const std::shared_ptr<torch::jit::Graph>& graph) {
   // for determinism
-  std::unordered_set<const Value*> seen_values;
+  FastSet<const Value*> seen_values;
   std::vector<const Value*> all_values;
-  std::unordered_set<const Value*> can_reuse;
+  FastSet<const Value*> can_reuse;
   // values used by unsupported ops (as either inputs or outputs)
   // these need to be removed from "can_reuse" after analyzing all nodes
-  std::unordered_set<const Value*> cannot_reuse;
+  FastSet<const Value*> cannot_reuse;
   for (auto* n : graph->nodes()) {
     bool can_reuse_inputs_outputs = canReuseInputsOutputs(n);
     for (const auto* v : n->inputs()) {
@@ -388,10 +388,9 @@ GetMemoryPlanningCandidates(const std::shared_ptr<torch::jit::Graph>& graph) {
 //
 // NB: This is a deterministic implementation, which makes it easier to tune
 // and debug.
-std::unordered_map<const Value*, std::vector<const Value*>>
-GenerateSameStorageValues(
+FastMap<const Value*, std::vector<const Value*>> GenerateSameStorageValues(
     const LivenessMap& alive_during,
-    const std::unordered_set<const Value*>& always_alive,
+    const FastSet<const Value*>& always_alive,
     const std::pair<std::vector<const Value*>, std::vector<const Value*>>&
         optimizable,
     AliasDb& db) {
@@ -399,8 +398,7 @@ GenerateSameStorageValues(
   const auto& all_values = optimizable.second;
 
   // map Value* to a set Value* that can share the same storage with it
-  std::unordered_map<const Value*, std::vector<const Value*>>
-      same_storage_values;
+  FastMap<const Value*, std::vector<const Value*>> same_storage_values;
 
   // make new_v and old_v map to the same storage (i.e., add to each other's
   // same_storage_values set)
@@ -589,9 +587,9 @@ StaticModule::StaticModule(
   }
 
   // map Value* to IValue (from inputs or prim::Constant) or null
-  std::unordered_map<Value*, IValue*> value_to_ivalue;
+  FastMap<Value*, IValue*> value_to_ivalue;
   // map Value* to its SSA definition IR
-  std::unordered_map<Value*, DefInfo> value_to_ssa_def;
+  FastMap<Value*, DefInfo> value_to_ssa_def;
 
   // N inputs map to the first N entries in storage
   for (const auto i : c10::irange(graph_->inputs().size())) {
@@ -1165,8 +1163,7 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
     TORCH_CHECK(inputs_[i].isNone(), "Input ", i, " was not cleaned up");
   }
 
-  std::unordered_set<const IValue*> output_ivalues(
-      outputs_.begin(), outputs_.end());
+  FastSet<const IValue*> output_ivalues(outputs_.begin(), outputs_.end());
   for (const auto n : c10::irange(nodes_.size())) {
     auto& pnode = nodes_[n];
     for (const auto i : c10::irange(pnode.outputs().size())) {
@@ -1202,13 +1199,13 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
 
 static void assign_storage_to_managed_tensors(
     StaticRuntime* runtime,
-    const std::unordered_set<const Value*>& managed_tensor_values,
-    const std::unordered_map<const Value*, std::vector<const Value*>>&
+    const FastSet<const Value*>& managed_tensor_values,
+    const FastMap<const Value*, std::vector<const Value*>>&
         value_to_same_storage_values,
     std::vector<std::pair<size_t, std::vector<at::Tensor*>>>& managed_tensors) {
   // map Value to index to managed_storage, where multiple values can
   // map to the same index (i.e., sharing the same storage)
-  std::unordered_map<const Value*, size_t> value_to_storage_idx;
+  FastMap<const Value*, size_t> value_to_storage_idx;
 
   // Snapshot of the current memory state
   for (auto& pnode : runtime->nodes()) {
@@ -1218,19 +1215,21 @@ static void assign_storage_to_managed_tensors(
       if (managed_tensor_values.count(val)) {
         TORCH_CHECK(ival.isTensor());
         at::Tensor* tensor = &ival.toTensor();
-
-        if (value_to_storage_idx.count(val)) {
-          managed_tensors[value_to_storage_idx[val]].second.emplace_back(
-              tensor);
+        auto f = value_to_storage_idx.find(val);
+        if (f != value_to_storage_idx.end()) {
+          auto storage_idx = f->second;
+          managed_tensors[storage_idx].second.emplace_back(tensor);
         } else {
           auto p =
               std::make_pair<size_t, std::vector<at::Tensor*>>(0, {tensor});
           managed_tensors.emplace_back(std::move(p));
           // first of a group, update the value_to_storage_idx map with the
           // index
-          if (value_to_same_storage_values.count(val)) {
+          auto f = value_to_same_storage_values.find(val);
+          if (f != value_to_same_storage_values.end()) {
             auto storage_idx = managed_tensors.size() - 1;
-            for (const auto* v : value_to_same_storage_values.at(val)) {
+            const auto& same_storage_values = f->second;
+            for (const auto* v : same_storage_values) {
               value_to_storage_idx[v] = storage_idx;
             }
           }
@@ -1242,14 +1241,14 @@ static void assign_storage_to_managed_tensors(
 
 MemoryPlanner::MemoryPlanner(
     StaticRuntime* runtime,
-    const std::unordered_map<const Value*, std::vector<const Value*>>&
+    const FastMap<const Value*, std::vector<const Value*>>&
         value_to_same_storage_values,
-    const std::unordered_set<const Value*>& external_values,
+    const FastSet<const Value*>& external_values,
     bool enable_out_variant,
     bool manage_graph_output_memory) {
   // collect register indices of outputs of ops with out variant
-  std::unordered_set<const Value*> managed_tensor_values;
-  std::unordered_set<const Value*> leaked_values;
+  FastSet<const Value*> managed_tensor_values;
+  FastSet<const Value*> leaked_values;
   if (enable_out_variant) {
     for (ProcessedNode& pnode : runtime->nodes()) {
       if (pnode.has_out_variant()) {
@@ -1260,7 +1259,7 @@ MemoryPlanner::MemoryPlanner(
           }
           // Types are stored in the underlying TorchScript IR
           const auto& type = out_v->type();
-          if (type->cast<TensorType>()) {
+          if (type->castRaw<TensorType>()) {
             managed_tensor_values.insert(out_v);
           } else if (isOptimizableContainerType(pnode.node())) {
             // We "leak" certain container types because their allocations take
@@ -1273,7 +1272,7 @@ MemoryPlanner::MemoryPlanner(
   }
 
   // collect unmanaged output ivalues
-  std::unordered_set<IValue*> unmanaged_ivalues;
+  FastSet<IValue*> unmanaged_ivalues;
   for (ProcessedNode& pnode : runtime->nodes()) {
     for (const auto i : c10::irange(pnode.outputs().size())) {
       // Types are stored in the underlying TorchScript IR
@@ -1295,9 +1294,11 @@ MemoryPlanner::MemoryPlanner(
   }
 
   // copy to unmanaged_ivalues_
-  for (IValue* out : unmanaged_ivalues) {
-    unmanaged_ivalues_.emplace_back(out);
-  }
+  unmanaged_ivalues_.reserve(unmanaged_ivalues.size());
+  unmanaged_ivalues_.insert(
+      unmanaged_ivalues_.begin(),
+      unmanaged_ivalues.begin(),
+      unmanaged_ivalues.end());
 
   if (enable_out_variant) {
     ::torch::jit::assign_storage_to_managed_tensors(
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
index b16cfefbc0b60..6cff047b4d2ce 100644
--- a/torch/csrc/jit/runtime/static/impl.h
+++ b/torch/csrc/jit/runtime/static/impl.h
@@ -9,9 +9,26 @@
 #include <torch/csrc/jit/passes/freeze_module.h>
 #include <torch/csrc/jit/passes/inliner.h>
 
+#ifdef FBCODE_CAFFE2
+#include <folly/container/F14Map.h>
+#include <folly/container/F14Set.h>
+#endif
+
 namespace torch {
 namespace jit {
 
+#ifdef FBCODE_CAFFE2
+template <typename Key, typename Value>
+using FastMap = folly::F14FastMap<Key, Value>;
+template <typename Key>
+using FastSet = folly::F14FastSet<Key>;
+#else
+template <typename Key, typename Value>
+using FastMap = std::unordered_map<Key, Value>;
+template <typename Key>
+using FastSet = std::unordered_set<Key>;
+#endif
+
 TORCH_API bool canEnableStaticRuntime(
     const std::shared_ptr<torch::jit::Graph>& graph);
 
@@ -127,7 +144,7 @@ class TORCH_API StaticModule {
   size_t num_inputs() const;
   size_t num_outputs() const;
 
-  const std::unordered_map<int, std::vector<DefInfo>>& index_map() const {
+  const FastMap<int, std::vector<DefInfo>>& index_map() const {
     return node_inputs_ssa_def_map_;
   }
 
@@ -147,12 +164,12 @@ class TORCH_API StaticModule {
     return schema_;
   }
 
-  const std::unordered_map<const Value*, std::vector<const Value*>>&
+  const FastMap<const Value*, std::vector<const Value*>>&
   values_share_same_storage() const {
     return value_to_same_storage_values_;
   }
 
-  const std::unordered_set<const Value*>& external_values() const {
+  const FastSet<const Value*>& external_values() const {
     return external_values_;
   }
 
@@ -178,14 +195,14 @@ class TORCH_API StaticModule {
   // a vector of ssa_defs corresponding to graph->outputs()
   std::vector<DefInfo> output_ssa_defs_;
   // map a node idx (in graph order) to a vector of ssa_defs for node inputs
-  std::unordered_map<int, std::vector<DefInfo>> node_inputs_ssa_def_map_;
+  FastMap<int, std::vector<DefInfo>> node_inputs_ssa_def_map_;
 
   // Bookkeeping for MemoryPlanner in StaticRuntime
   // values whose live-time exceeds that of running one inference (e.g., input,
   // output, prim::Constants, and their aliases)
-  std::unordered_set<const Value*> external_values_;
+  FastSet<const Value*> external_values_;
   // map a value to the set of values that may share the same storage with it
-  std::unordered_map<const Value*, std::vector<const Value*>>
+  FastMap<const Value*, std::vector<const Value*>>
       value_to_same_storage_values_;
 };
 
@@ -323,8 +340,8 @@ class MemoryPlanner {
  public:
   explicit MemoryPlanner(
       StaticRuntime* runtime,
-      const std::unordered_map<const Value*, std::vector<const Value*>>&,
-      const std::unordered_set<const Value*>& external_values,
+      const FastMap<const Value*, std::vector<const Value*>>&,
+      const FastSet<const Value*>& external_values,
       bool enable_out_variant,
       bool manage_graph_output_memory);
   // disable copying and moving
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 484c4b03ad64c..54c04566a6eaf 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -16,6 +16,7 @@
 #include <ATen/native/quantized/cpu/qembeddingbag.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/runtime/static/impl.h>
 #include <torch/csrc/jit/runtime/static/te_wrapper.h>
 #include <torch/csrc/jit/runtime/vararg_functions.h>
 #include <torch/csrc/jit/tensorexpr/ir.h>
@@ -288,7 +289,7 @@ bool disableUnsafeMathOp(const char* op_name) {
   // not guarantee bit exactness vs the jit interpreter. Note aten::relu is not
   // included even though it uses NNC because the results of relu should always
   // match.
-  static const std::unordered_set<std::string> fast_ops{
+  static const FastSet<std::string> fast_ops{
       "aten::add", "aten::tanh", "aten::sigmoid", "aten::logit"};
   return fast_ops.count(op_name) > 0;
 }
diff --git a/torch/csrc/jit/runtime/static/te_wrapper.cpp b/torch/csrc/jit/runtime/static/te_wrapper.cpp
index d8b494c9d4a23..acd1fb758da0a 100644
--- a/torch/csrc/jit/runtime/static/te_wrapper.cpp
+++ b/torch/csrc/jit/runtime/static/te_wrapper.cpp
@@ -2,6 +2,7 @@
 
 #include <ATen/CPUFunctions.h>
 #include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/runtime/static/impl.h>
 
 namespace torch {
 namespace jit {
@@ -79,8 +80,8 @@ std::mutex& getNNCCacheMutex() {
   return nncCacheMutex;
 }
 
-std::unordered_map<NodeKind, std::shared_ptr<TEWrapper>>& getNNCCache() {
-  static std::unordered_map<NodeKind, std::shared_ptr<TEWrapper>> nncCache;
+FastMap<NodeKind, std::shared_ptr<TEWrapper>>& getNNCCache() {
+  static FastMap<NodeKind, std::shared_ptr<TEWrapper>> nncCache;
   return nncCache;
 }
 

From c90b3cb1dabe712aa07e082b3735f1f2a9134c9b Mon Sep 17 00:00:00 2001
From: Don Jang <djang@fb.com>
Date: Fri, 27 Aug 2021 02:43:22 -0700
Subject: [PATCH 288/530] [Static Runtime] Manage temporary Tensors for
 aten::layer_norm (#64078)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64078

This change converts `aten::layer_norm -> output Tensor` to `static_runtime::layer_norm -> (output Tensor, temp1 Tensor, tmp2 Tensor)` to manage `tmp1` and `tmp2` Tensors by the static runtime.

Currently the out-variant of `aten::layer_norm` creates two temporary Tensors inside it:
```
    at::Tensor mean = create_empty_from({M}, *X);
    at::Tensor rstd = create_empty_from({M}, *X);
```
that the static runtime misses an opportunity to manage.

This change puts them into (unused) output Tensors of a new placeholder op `static_runtime::layer_norm` so that the static runtime can mange them since the static runtime as of now chooses to manage only output tensors.

Test Plan:
- Enhanced `StaticRuntime.LayerNorm` to ensure that `static_runtime::layer_norm` gets activated.

- Confirmed that the new op gets activated during testing:

```
V0825 12:51:50.017890 2265227 impl.cpp:1396] Switch to out variant for node: %8 : Tensor, %9 : Tensor, %10 : Tensor = static_runtime::layer_norm(%input.1, %normalized_shape.1, %4, %4, %5, %3)

```

Reviewed By: hlu1

Differential Revision: D30486475

fbshipit-source-id: 5121c44ab58c2d8a954aa0bbd9dfeb7468347a2d
---
 .../static_runtime/test_static_runtime.cc     |   7 ++
 torch/csrc/jit/runtime/static/impl.cpp        |   1 +
 torch/csrc/jit/runtime/static/ops.cpp         | 117 +++++++++++-------
 torch/csrc/jit/runtime/static/passes.cpp      |  33 +++++
 torch/csrc/jit/runtime/static/passes.h        |   3 +
 5 files changed, 113 insertions(+), 48 deletions(-)

diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 4441b7d043db9..0d4202464bc61 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -209,6 +209,13 @@ TEST(StaticRuntime, EmbeddingBag) {
 }
 
 TEST(StaticRuntime, LayerNorm) {
+#ifdef FBCODE_CAFFE2
+  script::Module module("module");
+  module.define(layer_norm_with_weights);
+  torch::jit::StaticModule smodule(module);
+  ASSERT_EQ(getNodeWithKind(smodule, "aten::layer_norm"), nullptr);
+  ASSERT_NE(getNodeWithKind(smodule, "static_runtime::layer_norm"), nullptr);
+#endif
   const auto a = torch::rand({1, 2, 2, 2});
   const auto b = torch::rand({3, 2, 2, 2});
   for (int normalized_size : {2, 3}) {
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index b3e1eb116dc7d..643842a74691c 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -74,6 +74,7 @@ void OptimizeGraph(
   if (opts.enable_out_variant) {
     FuseListUnpack(graph);
     ReplaceWithCopy(graph);
+    EnableStaticRuntimeLayerNorm(graph);
   }
 #endif
   ConstantPropagation(graph);
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 54c04566a6eaf..7e78b77246f3b 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1308,55 +1308,76 @@ REGISTER_OPERATOR_FUNCTOR(aten::argmin, aten_argmin, [](Node* n) -> SROperator {
   };
 });
 
-REGISTER_OPERATOR_FUNCTOR(aten::layer_norm, aten_layer_norm, [](Node* n) -> SROperator {
-  if (!n->matches(torch::schema(
-          "aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor"))) {
-    LogAndDumpSchema(n);
-    return nullptr;
-  }
-  return [](ProcessedNode* p_node) {
-    // ignore Input(5): `bool cudnn_enable=True`
-    const auto& input = p_node->Input(0).toTensor();
-    const auto normalized_shape = p_node->Input(1).toIntVector();
-    auto weight_opt = p_node->Input(2).toOptional<at::Tensor>();
-    auto bias_opt = p_node->Input(3).toOptional<at::Tensor>();
-    float eps = p_node->Input(4).toDouble();
-
-    c10::MaybeOwned<at::Tensor> weight_maybe_owned =
-        at::borrow_from_optional_tensor(weight_opt);
-    const at::Tensor& weight = *weight_maybe_owned;
-    c10::MaybeOwned<at::Tensor> bias_maybe_owned =
-        at::borrow_from_optional_tensor(bias_opt);
-    const at::Tensor& bias = *bias_maybe_owned;
-
-    auto M_N = at::native::_check_layer_norm_inputs(
-        input, normalized_shape, weight, bias);
-    auto M = M_N.first;
-    auto N = M_N.second;
-    auto X = input.expect_contiguous();
-    auto gamma = weight.expect_contiguous();
-    auto beta = bias.expect_contiguous();
-
-    if (p_node->Output(0).isNone()) {
-      p_node->Output(0) = at::native::empty_like(
-          *X,
-          c10::nullopt /* dtype */,
-          c10::nullopt /* layout */,
-          c10::nullopt /* device */,
-          c10::nullopt /* pin_memory */,
-          at::MemoryFormat::Contiguous);
-    } else {
-      at::native::resize_(
-          p_node->Output(0).toTensor(), X->sizes(), c10::nullopt);
-    }
-    at::Tensor& output = p_node->Output(0).toTensor();
-    at::Tensor mean = create_empty_from({M}, *X);
-    at::Tensor rstd = create_empty_from({M}, *X);
+REGISTER_OPERATOR_FUNCTOR(
+    static_runtime::layer_norm,
+    aten_layer_norm,
+    [](Node* n) -> SROperator {
+      if (!n->matches(torch::schema(
+              "static_runtime::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> (Tensor,Tensor,Tensor)"))) {
+        LogAndDumpSchema(n);
+        return nullptr;
+      }
+      return [](ProcessedNode* p_node) {
+        // ignore Input(5): `bool cudnn_enable=True`
+        const auto& input = p_node->Input(0).toTensor();
+        const auto normalized_shape = p_node->Input(1).toIntVector();
+        auto weight_opt = p_node->Input(2).toOptional<at::Tensor>();
+        auto bias_opt = p_node->Input(3).toOptional<at::Tensor>();
+        float eps = p_node->Input(4).toDouble();
+
+        c10::MaybeOwned<at::Tensor> weight_maybe_owned =
+            at::borrow_from_optional_tensor(weight_opt);
+        const at::Tensor& weight = *weight_maybe_owned;
+        c10::MaybeOwned<at::Tensor> bias_maybe_owned =
+            at::borrow_from_optional_tensor(bias_opt);
+        const at::Tensor& bias = *bias_maybe_owned;
+
+        auto M_N = at::native::_check_layer_norm_inputs(
+            input, normalized_shape, weight, bias);
+        auto M = M_N.first;
+        auto N = M_N.second;
+        auto X = input.expect_contiguous();
+        auto gamma = weight.expect_contiguous();
+        auto beta = bias.expect_contiguous();
 
-    at::native::layer_norm_cpu_out(
-        output, mean, rstd, input, normalized_shape, *gamma, *beta, eps, M, N);
-  };
-});
+        if (p_node->Output(0).isNone()) {
+          p_node->Output(0) = at::native::empty_like(
+              *X,
+              c10::nullopt /* dtype */,
+              c10::nullopt /* layout */,
+              c10::nullopt /* device */,
+              c10::nullopt /* pin_memory */,
+              at::MemoryFormat::Contiguous);
+        } else {
+          at::native::resize_(
+              p_node->Output(0).toTensor(), X->sizes(), c10::nullopt);
+        }
+        if (p_node->Output(1).isNone()) {
+          p_node->Output(1) = create_empty_from({M}, *X);
+        } else {
+          at::native::resize_(p_node->Output(1).toTensor(), {M}, c10::nullopt);
+        }
+        if (p_node->Output(2).isNone()) {
+          p_node->Output(2) = create_empty_from({M}, *X);
+        } else {
+          at::native::resize_(p_node->Output(2).toTensor(), {M}, c10::nullopt);
+        }
+        at::Tensor& output = p_node->Output(0).toTensor();
+        at::Tensor mean = p_node->Output(1).toTensor();
+        at::Tensor rstd = p_node->Output(2).toTensor();
+        at::native::layer_norm_cpu_out(
+            output,
+            mean,
+            rstd,
+            input,
+            normalized_shape,
+            *gamma,
+            *beta,
+            eps,
+            M,
+            N);
+      };
+    });
 
 REGISTER_OPERATOR_FUNCTOR(aten::norm, aten_norm, [](Node* n) -> SROperator {
   if (!n->matches(torch::schema(
diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp
index 1133e3924c32a..5099dc1ba6e2b 100644
--- a/torch/csrc/jit/runtime/static/passes.cpp
+++ b/torch/csrc/jit/runtime/static/passes.cpp
@@ -303,6 +303,9 @@ TORCH_LIBRARY_FRAGMENT(static_runtime, m) {
       "static_runtime::to_copy.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor");
   m.def(
       "static_runtime::to_copy.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor");
+  m.def(torch::schema(
+      "static_runtime::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> (Tensor, Tensor, Tensor)",
+      c10::AliasAnalysisKind::PURE_FUNCTION));
 }
 
 bool HasInplaceOp(std::shared_ptr<Graph>& graph, const AliasDb& alias_db) {
@@ -469,5 +472,35 @@ void FuseListUnpack(std::shared_ptr<torch::jit::Graph>& graph) {
 #endif
 }
 
+void EnableStaticRuntimeLayerNorm(std::shared_ptr<torch::jit::Graph>& graph) {
+  const c10::Symbol static_runtime_layer_norm_symbol =
+      c10::Symbol::fromQualString("static_runtime::layer_norm");
+  auto nodes = graph->nodes();
+  std::vector<std::pair<Node*, Node*>> replacement;
+  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+    Node* old_node = *it;
+    if (!old_node->matches(torch::schema(
+            "aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor"))) {
+      continue;
+    }
+    TORCH_CHECK(old_node->outputs().size() == 1);
+    auto* new_node = graph->create(
+        static_runtime_layer_norm_symbol,
+        /*layer_norm*/ 1 + /*mean*/ 1 + /*rst=*/1);
+    new_node->insertBefore(old_node);
+    for (auto* input : old_node->inputs()) {
+      new_node->addInput(input);
+    }
+    replacement.emplace_back(old_node, new_node);
+  }
+  for (const auto& p : replacement) {
+    auto* old_node = p.first;
+    auto* new_node = p.second;
+    new_node->output(0)->copyMetadata(old_node->output(0));
+    old_node->output(0)->replaceAllUsesWith(new_node->output(0));
+    old_node->destroy();
+  }
+}
+
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/runtime/static/passes.h b/torch/csrc/jit/runtime/static/passes.h
index 11ab4bdc7c46a..a42bc97f19618 100644
--- a/torch/csrc/jit/runtime/static/passes.h
+++ b/torch/csrc/jit/runtime/static/passes.h
@@ -13,6 +13,9 @@ TORCH_API void ReplaceWithCopy(
     std::shared_ptr<torch::jit::Graph>& graph,
     bool outputs_are_immutable = true);
 
+TORCH_API void EnableStaticRuntimeLayerNorm(
+    std::shared_ptr<torch::jit::Graph>& graph);
+
 TORCH_API bool HasInplaceOp(
     std::shared_ptr<Graph>& graph,
     const AliasDb& alias_db);

From f2c47cf4dbbdd0cafc1bd2118121c6eda3947f3f Mon Sep 17 00:00:00 2001
From: Harut Movsisyan <harutm@fb.com>
Date: Fri, 27 Aug 2021 03:03:32 -0700
Subject: [PATCH 289/530] [Static Runtime] Out version for fmod (#64046)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64046

Test Plan:
Confirm out variant is used:
```
> //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1

V0826 23:31:30.321382 193428 impl.cpp:1395] Switch to out variant for node: %4 : Tensor = aten::fmod(%a.1, %b.1)
```

Reviewed By: mikeiovine

Differential Revision: D30581228

fbshipit-source-id: dfab9a16ff8afd40b29338037769f938f154bf74
---
 benchmarks/static_runtime/test_scripts.h      | 10 +++++++
 .../static_runtime/test_static_runtime.cc     | 27 +++++++++++++++++++
 torch/csrc/jit/runtime/static/ops.cpp         | 25 +++++++++++++++++
 3 files changed, 62 insertions(+)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index ecdd491462f62..477b191b24156 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -762,3 +762,13 @@ const std::string quantize_script = R"IR(
       %1249: Tensor = aten::dequantize(%1254)
       return (%1249)
 )IR";
+
+const auto fmod_tensor = R"JIT(
+  def forward(self, a: Tensor, b: Tensor):
+      return torch.fmod(a, b).clone()
+)JIT";
+
+const auto fmod_scalar = R"JIT(
+  def forward(self, a: Tensor, b: int):
+      return torch.fmod(a, b).clone()
+)JIT";
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 0d4202464bc61..bd213c78dac1a 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1230,3 +1230,30 @@ TEST(StaticRuntime, IndividualOps_VarStack) {
 
   testStaticRuntime(var_stack_script, args1, args2);
 }
+
+TEST(StaticRuntime, IndividualOps_FmodTensor) {
+  // fmod tensor version
+  auto a = at::randn({2, 3});
+  auto b = at::randn({2, 3});
+  std::vector<IValue> args0{a, b};
+  testStaticRuntime(fmod_tensor, args0);
+
+  // check for dynamic shapes
+  auto c = at::randn({4, 3, 2});
+  auto d = at::randn({4, 3, 2});
+  std::vector<IValue> args1{c, d};
+  testStaticRuntime(fmod_tensor, args0, args1);
+}
+
+TEST(StaticRuntime, IndividualOps_FmodScalar) {
+  auto a = at::randn({2, 3});
+
+  // fmod scalar version
+  std::vector<IValue> args2{a, 3};
+  testStaticRuntime(fmod_scalar, args2);
+
+  // check for dynamic shapes
+  auto c = at::randn({4, 3, 2});
+  std::vector<IValue> args3{c, 4};
+  testStaticRuntime(fmod_scalar, args2, args3);
+}
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 7e78b77246f3b..36f796fb2f256 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1611,6 +1611,31 @@ REGISTER_OPERATOR_FUNCTOR(aten::linear, aten_linear, [](Node* n) -> SROperator {
   };
 });
 
+REGISTER_OPERATOR_FUNCTOR(aten::fmod, aten_fmod, [](Node* n) -> SROperator {
+  if (!n->matches(torch::schema(
+          "aten::fmod.Scalar(Tensor self, Scalar other) -> Tensor")) &&
+      !n->matches(torch::schema(
+          "aten::fmod.Tensor(Tensor self, Tensor other) -> Tensor"))) {
+    LogAndDumpSchema(n);
+    return nullptr;
+  }
+  return [](ProcessedNode* p_node) {
+    const auto& in0_t = p_node->Input(0).toTensor();
+    const auto& in1_t = p_node->Input(1).isTensor()
+        ? p_node->Input(1).toTensor()
+        : at::native::wrapped_scalar_tensor(p_node->Input(1).toScalar());
+
+    if (p_node->Output(0).isNone()) {
+      p_node->Output(0) = at::cpu::fmod(in0_t, in1_t);
+    } else {
+      auto& out_t = p_node->Output(0).toTensor();
+      fastResizeToZero(out_t);
+
+      at::cpu::fmod_out(out_t, in0_t, in1_t);
+    }
+  };
+});
+
 namespace {
 
 void check_cat_no_zero_dim(const std::vector<at::Tensor>& tensors) {

From ad8eddbd808a97ac518ffd5b51d2c925803a1a3f Mon Sep 17 00:00:00 2001
From: gmagogsfm <gmagogsfm@gmail.com>
Date: Fri, 27 Aug 2021 08:49:54 -0700
Subject: [PATCH 290/530] More robust check of whether a class is defined in
 torch (#64083)

Summary:
This would prevent bugs for classes that
1) Is defined in a module that happens to start with `torch`, say `torchvision`
2) Is defined in torch but with an import alias like `import torch as th`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64083

Reviewed By: soulitzer

Differential Revision: D30598369

Pulled By: gmagogsfm

fbshipit-source-id: 9d3a7135737b2339c9bd32195e4e69a9c07549d4
---
 torch/jit/_monkeytype_config.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/torch/jit/_monkeytype_config.py b/torch/jit/_monkeytype_config.py
index f0e4613e82fd1..9957541ff25d1 100644
--- a/torch/jit/_monkeytype_config.py
+++ b/torch/jit/_monkeytype_config.py
@@ -1,6 +1,10 @@
+
+import torch
+
 import inspect
 import typing
 import pathlib
+import sys
 from typing import Optional, Iterable, List, Dict
 from collections import defaultdict
 from types import CodeType
@@ -15,6 +19,18 @@
 except ImportError:
     _IS_MONKEYTYPE_INSTALLED = False
 
+# Checks whether a class is defind in `torch.*` modules
+def is_torch_native_class(cls):
+    if not hasattr(cls, '__module__'):
+        return False
+
+    parent_modules = cls.__module__.split('.')
+    if not parent_modules:
+        return False
+
+    root_module = sys.modules.get(parent_modules[0])
+    return root_module is torch
+
 def get_type(type):
     """
     Helper function which converts the given type to a torchScript acceptable format.
@@ -28,7 +44,7 @@ def get_type(type):
         # typing.List is not accepted by TorchScript.
         type_to_string = str(type)
         return type_to_string.replace(type.__module__ + '.', '')
-    elif type.__module__.startswith('torch'):
+    elif is_torch_native_class(type):
         # If the type is a subtype of torch module, then TorchScript expects a fully qualified name
         # for the type which is obtained by combining the module name and type name.
         return type.__module__ + '.' + type.__name__

From a43e7a51d7b4c89096510473becffc934644403f Mon Sep 17 00:00:00 2001
From: Aswin Murali <aswinmurali.co@gmail.com>
Date: Fri, 27 Aug 2021 09:02:22 -0700
Subject: [PATCH 291/530] Adds return type annotation for fork_rng function
 (#63724)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63723

Since it's a generator function the type annotation shall be `Generator`.
![image](https://user-images.githubusercontent.com/47299190/130318830-29ef9529-0daa-463c-90b2-1b11f63ade8a.png)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63724

Reviewed By: iramazanli

Differential Revision: D30543098

Pulled By: heitorschueroff

fbshipit-source-id: ebdd34749defe1e26c899146786a0357ab4b4b9b
---
 torch/random.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/random.py b/torch/random.py
index d774634478697..f5156bf48730d 100644
--- a/torch/random.py
+++ b/torch/random.py
@@ -1,4 +1,5 @@
 import contextlib
+from typing import Generator
 import warnings
 
 from torch._C import default_generator
@@ -65,7 +66,7 @@ def initial_seed() -> int:
 
 
 @contextlib.contextmanager
-def fork_rng(devices=None, enabled=True, _caller="fork_rng", _devices_kw="devices"):
+def fork_rng(devices=None, enabled=True, _caller="fork_rng", _devices_kw="devices") -> Generator:
     """
     Forks the RNG, so that when you return, the RNG is reset
     to the state that it was previously in.

From 22d38bd10d998edc033d268846eaa2dd395dcb55 Mon Sep 17 00:00:00 2001
From: Hanton Yang <hantonyang@fb.com>
Date: Fri, 27 Aug 2021 09:23:45 -0700
Subject: [PATCH 292/530] [OSS] Enable Metal in PyTorch MacOS nightly builds
 (#63718)

Summary:
Build on https://github.com/pytorch/pytorch/pull/63825

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63718

Test Plan:
1.Add `ci/binaries` label to PR, so the CI will build those nightly builds

2.Make sure the following CI jobs build with `USE_PYTORCH_METAL_EXPORT` option is `ON`:
```
ci/circleci: binary_macos_arm64_conda_3_8_cpu_nightly_build
ci/circleci: binary_macos_arm64_conda_3_9_cpu_nightly_build
ci/circleci: binary_macos_arm64_wheel_3_8_cpu_nightly_build
ci/circleci: binary_macos_arm64_wheel_3_9_cpu_nightly_build
ci/circleci: binary_macos_conda_3_6_cpu_nightly_build
ci/circleci: binary_macos_conda_3_7_cpu_nightly_build
ci/circleci: binary_macos_conda_3_8_cpu_nightly_build
ci/circleci: binary_macos_conda_3_9_cpu_nightly_build
ci/circleci: binary_macos_libtorch_3_7_cpu_nightly_build
ci/circleci: binary_macos_wheel_3_6_cpu_nightly_build
ci/circleci: binary_macos_wheel_3_7_cpu_nightly_build
ci/circleci: binary_macos_wheel_3_8_cpu_nightly_build
ci/circleci: binary_macos_wheel_3_9_cpu_nightly_build
```

3.Test `conda` and `wheel` builds locally on [HelloWorld-Metal](https://github.com/pytorch/ios-demo-app/tree/master/HelloWorld-Metal) demo with [(Prototype) Use iOS GPU in PyTorch](https://pytorch.org/tutorials/prototype/ios_gpu_workflow.html)

(1) conda
```
conda install https://15667941-65600975-gh.circle-artifacts.com/0/Users/distiller/project/final_pkgs/pytorch-1.10.0.dev20210826-py3.8_0.tar.bz2
```
(2) wheel
```
pip3 install https://15598647-65600975-gh.circle-artifacts.com/0/Users/distiller/project/final_pkgs/torch-1.10.0.dev20210824-cp38-none-macosx_10_9_x86_64.whl
```

Reviewed By: xta0

Differential Revision: D30593167

Pulled By: hanton

fbshipit-source-id: 471da204e94b29c11301c857c50501307a5f0785
---
 .circleci/scripts/binary_macos_build.sh |  3 +++
 CMakeLists.txt                          |  5 +++++
 aten/src/ATen/CMakeLists.txt            | 29 +++++++++++++++----------
 cmake/Summary.cmake                     |  1 +
 4 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/.circleci/scripts/binary_macos_build.sh b/.circleci/scripts/binary_macos_build.sh
index c402cdd008013..c5cdfa9f09080 100755
--- a/.circleci/scripts/binary_macos_build.sh
+++ b/.circleci/scripts/binary_macos_build.sh
@@ -14,6 +14,9 @@ chmod +x "$build_script"
 # Build
 cat >"$build_script" <<EOL
 export PATH="$workdir/miniconda/bin:$PATH"
+if [[ "$CIRCLE_BRANCH" == "nightly" ]]; then
+  export USE_PYTORCH_METAL_EXPORT=1
+fi
 if [[ "$PACKAGE_TYPE" == conda ]]; then
   "$workdir/builder/conda/build_pytorch.sh"
 else
diff --git a/CMakeLists.txt b/CMakeLists.txt
index db38d592c55b7..f5eed7207f107 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -214,6 +214,7 @@ option(USE_LMDB "Use LMDB" OFF)
 option(USE_MAGMA "Use MAGMA" ON)
 option(USE_METAL "Use Metal for Caffe2 iOS build" ON)
 option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
+option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
 cmake_dependent_option(
     USE_MLCOMPUTE "Use ML Compute for macOS build" ON
@@ -688,6 +689,10 @@ if(USE_PYTORCH_METAL)
   string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL")
 endif()
 
+if(USE_PYTORCH_METAL_EXPORT)
+  string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL_EXPORT")
+endif()
+
 if(USE_SOURCE_DEBUG_ON_MOBILE)
   string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
 endif()
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index a73f3e31ff894..114d970bf7ddc 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -167,13 +167,12 @@ else()
 endif()
 
 # Metal
-if(USE_PYTORCH_METAL)
-  if(APPLE)
-    set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${native_metal_srcs})
-  else()
-    # Add files needed from optimized_for_mobile
-    set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${metal_prepack_cpp})
-  endif()
+if(USE_PYTORCH_METAL_EXPORT)
+  # Add files needed from exporting metal models(optimized_for_mobile)
+  set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${metal_prepack_cpp})
+elseif(APPLE AND USE_PYTORCH_METAL)
+  # Compile Metal kernels
+  set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${native_metal_srcs})
 else()
   set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp})
 endif()
@@ -450,13 +449,21 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
 set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS})
 if(NOT INTERN_BUILD_MOBILE)
   list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${miopen_h})
+  # Metal
+  if(USE_PYTORCH_METAL_EXPORT)
+    # Add files needed from exporting metal models(optimized_for_mobile)
+    list(APPEND INSTALL_HEADERS ${metal_h} ${metal_prepack_h})
+  elseif(APPLE AND USE_PYTORCH_METAL)
+    # Needed by Metal kernels
+    list(APPEND INSTALL_HEADERS ${metal_h} ${native_metal_h})
+  else()
+    list(APPEND INSTALL_HEADERS ${metal_h})
+  endif()
 else()
-  if(USE_PYTORCH_METAL)
-    if(IOS)
+  if(IOS AND USE_PYTORCH_METAL)
       list(APPEND INSTALL_HEADERS ${metal_h} ${native_metal_h})
-    else()
+  else()
       list(APPEND INSTALL_HEADERS ${metal_h} ${metal_prepack_h})
-    endif()
   endif()
 endif()
 
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index afc63b18f5f07..99c41f24ab8c8 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -131,6 +131,7 @@ function(caffe2_print_configuration_summary)
   endif()
   message(STATUS "  USE_METAL             : ${USE_METAL}")
   message(STATUS "  USE_PYTORCH_METAL     : ${USE_PYTORCH_METAL}")
+  message(STATUS "  USE_PYTORCH_METAL_EXPORT     : ${USE_PYTORCH_METAL_EXPORT}")
   message(STATUS "  USE_FFTW              : ${USE_FFTW}")
   message(STATUS "  USE_MKL               : ${CAFFE2_USE_MKL}")
   message(STATUS "  USE_MKLDNN            : ${USE_MKLDNN}")

From f922b58b5fe420538f35d4c88953b664be85d7ee Mon Sep 17 00:00:00 2001
From: Sergei Vorobev <sergei.vorobev@getcruise.com>
Date: Fri, 27 Aug 2021 09:31:36 -0700
Subject: [PATCH 293/530] [bazel] GPU-support: add @local_config_cuda and @cuda
 (#63604)

Summary:
## Context

We take the first step at tackling the GPU-bazel support by adding bazel external workspaces `local_config_cuda` and `cuda`, where the first one has some hardcoded values and lists of files, and the second one provides a nicer, high-level wrapper that maps into the already expected by pytorch bazel targets that are guarded with `if_cuda` macro.

The prefix `local_config_` signifies the fact that we are breaking the bazel hermeticity philosophy by explicitly relaying on the CUDA installation that is present on the machine.

## Testing

Notice an important scenario that is unlocked by this change: compilation of cpp code that depends on cuda libraries (i.e. cuda.h and so on).

Before:
```
sergei.vorobev@cs-sv7xn77uoy-gpu-1628706590:~/src/pytorch4$ bazelisk build --define=cuda=true //:c10
ERROR: /home/sergei.vorobev/src/pytorch4/tools/config/BUILD:12:1: no such package 'tools/toolchain': BUILD file not found in any of the following directories. Add a BUILD file to a directory to mark it as a package.
 - /home/sergei.vorobev/src/pytorch4/tools/toolchain and referenced by '//tools/config:cuda_enabled_and_capable'
ERROR: While resolving configuration keys for //:c10: Analysis failed
ERROR: Analysis of target '//:c10' failed; build aborted: Analysis failed
INFO: Elapsed time: 0.259s
INFO: 0 processes.
FAILED: Build did NOT complete successfully (2 packages loaded, 2 targets configured)
```

After:
```
sergei.vorobev@cs-sv7xn77uoy-gpu-1628706590:~/src/pytorch4$ bazelisk build --define=cuda=true //:c10
INFO: Analyzed target //:c10 (6 packages loaded, 246 targets configured).
INFO: Found 1 target...
Target //:c10 up-to-date:
  bazel-bin/libc10.lo
  bazel-bin/libc10.so
INFO: Elapsed time: 0.617s, Critical Path: 0.04s
INFO: 0 processes.
INFO: Build completed successfully, 1 total action
```

The `//:c10` target is a good testing one for this, because it has such cases where the [glob is different](https://github.com/pytorch/pytorch/blob/075024b9a34904ec3ecdab3704c3bcaa329bdfea/BUILD.bazel#L76-L81), based on do we compile for CUDA or not.

## What is out of scope of this PR

This PR is a first in a series of providing the comprehensive GPU bazel build support. Namely, we don't tackle the [cu_library](https://github.com/pytorch/pytorch/blob/11a40ad915d4d3d8551588e303204810887fcf8d/tools/rules/cu.bzl#L2) implementation here. This would be a separate large chunk of work.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63604

Reviewed By: soulitzer

Differential Revision: D30442083

Pulled By: malfet

fbshipit-source-id: b2a8e4f7e5a25a69b960a82d9e36ba568eb64595
---
 .bazelrc                                      |   6 +-
 .github/scripts/generate_ci_workflows.py      |   2 +-
 ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml |   2 +-
 .jenkins/pytorch/build.sh                     |   4 +
 WORKSPACE                                     |  13 +-
 third_party/cuda.BUILD                        |  43 ++
 third_party/tensorflow_cuda_bazel_build/BUILD |   0
 .../tensorflow_cuda_bazel_build/README.md     |   5 +
 .../tensorflow_cuda_bazel_build/WORKSPACE     |   1 +
 .../tensorflow_cuda_bazel_build/cuda/BUILD    | 451 ++++++++++++++++++
 tools/config/BUILD                            |   1 -
 tools/rules/workspace.bzl                     |  25 +
 12 files changed, 548 insertions(+), 5 deletions(-)
 create mode 100644 third_party/cuda.BUILD
 create mode 100644 third_party/tensorflow_cuda_bazel_build/BUILD
 create mode 100644 third_party/tensorflow_cuda_bazel_build/README.md
 create mode 100644 third_party/tensorflow_cuda_bazel_build/WORKSPACE
 create mode 100755 third_party/tensorflow_cuda_bazel_build/cuda/BUILD

diff --git a/.bazelrc b/.bazelrc
index ecfe8fd0efcd0..310eb293389dc 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -3,7 +3,11 @@ build --copt=-I.
 build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
 
 # Configuration to disable tty features for environments like CI
-
 build:no-tty --curses no
 build:no-tty --progress_report_interval 10
 build:no-tty --show_progress_rate_limit 10
+
+# Configuration to build with GPU support
+build:gpu --define=cuda=true
+# define a separate build folder for faster switching between configs
+build:gpu --platform_suffix=-gpu
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index cd7065dbfaa47..f1819dbac589d 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -497,7 +497,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
     CIWorkflow(
         arch="linux",
         build_environment="linux-xenial-py3.6-gcc7-bazel-test",
-        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
         test_runner_type=LINUX_CPU_TEST_RUNNER,
         on_pull_request=True,
         ciflow_config=CIFlowConfig(
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 71a9bf76dac22..233144210dbcd 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -15,7 +15,7 @@ on:
 
 env:
   BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7-bazel-test
-  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index d7b66e7c9177e..085cf5152e6f1 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -224,7 +224,11 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
 
   get_bazel
 
+  # first build the whole torch for CPU-only
   tools/bazel build --config=no-tty :torch
+  # then build selected set of targets with GPU-support.
+  # TODO: eventually this should converge to building the whole :torch with GPU-support
+  tools/bazel build --config=no-tty --config=gpu :c10
 else
   # check that setup.py would fail with bad arguments
   echo "The next three invocations are expected to fail with invalid command error messages."
diff --git a/WORKSPACE b/WORKSPACE
index 6f5028d4d0912..9396a3451c360 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,7 +1,7 @@
 workspace(name = "pytorch")
 
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-load("//tools/rules:workspace.bzl", "new_patched_local_repository")
+load("//tools/rules:workspace.bzl", "new_patched_local_repository", "new_empty_repository")
 
 http_archive(
     name = "bazel_skylib",
@@ -170,3 +170,14 @@ protobuf_deps()
 load("@rules_python//python:repositories.bzl", "py_repositories")
 
 py_repositories()
+
+local_repository(
+    name = "local_config_cuda",
+    path = "third_party/tensorflow_cuda_bazel_build",
+)
+
+# Wrapper to expose local_config_cuda in an agnostic way
+new_empty_repository(
+    name = "cuda",
+    build_file = "//third_party:cuda.BUILD",
+)
diff --git a/third_party/cuda.BUILD b/third_party/cuda.BUILD
new file mode 100644
index 0000000000000..0c58b34a52e74
--- /dev/null
+++ b/third_party/cuda.BUILD
@@ -0,0 +1,43 @@
+"""
+Collect all the CUDA stuff from @local_config_cuda in a single target
+for convenience.
+"""
+
+cc_library(
+    name = "cuda",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cublas",
+        "@local_config_cuda//cuda:cuda_driver",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_cuda//cuda:cudart",
+        "@local_config_cuda//cuda:cufft",
+        "@local_config_cuda//cuda:curand",
+    ],
+)
+
+cc_library(
+    name = "cupti",
+    deps = [
+        "@local_config_cuda//cuda:cupti_headers",
+        "@local_config_cuda//cuda:cupti_link",
+    ],
+)
+
+[
+    alias(
+        name = lib,
+        actual = "@local_config_cuda//cuda:{}".format(lib),
+        visibility = ["//visibility:public"],
+    )
+    for lib in [
+        "cublas",
+        "cufft",
+        "cusolver",
+        "cusparse",
+        "curand",
+        "nvrtc",
+        "cuda_driver",
+        "nvToolsExt",
+    ]
+]
diff --git a/third_party/tensorflow_cuda_bazel_build/BUILD b/third_party/tensorflow_cuda_bazel_build/BUILD
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/third_party/tensorflow_cuda_bazel_build/README.md b/third_party/tensorflow_cuda_bazel_build/README.md
new file mode 100644
index 0000000000000..439e195d8e44e
--- /dev/null
+++ b/third_party/tensorflow_cuda_bazel_build/README.md
@@ -0,0 +1,5 @@
+# Config for CUDA
+
+This is a checked-in copy of the auto-generated config for building CUDA code with bazel. The content of this folder was generated from https://github.com/tensorflow/tensorflow `./configure` execution and then edited manually to fit the pytorch needs.
+
+The LICENSE for the TensorFlow project is APACHE 2. The full LICENSE file could be found here https://github.com/tensorflow/tensorflow/blob/master/LICENSE.
diff --git a/third_party/tensorflow_cuda_bazel_build/WORKSPACE b/third_party/tensorflow_cuda_bazel_build/WORKSPACE
new file mode 100644
index 0000000000000..59369ce679c14
--- /dev/null
+++ b/third_party/tensorflow_cuda_bazel_build/WORKSPACE
@@ -0,0 +1 @@
+workspace(name = "local_config_cuda")
diff --git a/third_party/tensorflow_cuda_bazel_build/cuda/BUILD b/third_party/tensorflow_cuda_bazel_build/cuda/BUILD
new file mode 100755
index 0000000000000..f7271af2750b8
--- /dev/null
+++ b/third_party/tensorflow_cuda_bazel_build/cuda/BUILD
@@ -0,0 +1,451 @@
+licenses([
+    "restricted",
+    "reciprocal",
+    "notice",
+])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "using_nvcc",
+    values = {
+        "define": "using_cuda_nvcc=true",
+    },
+)
+
+config_setting(
+    name = "using_clang",
+    values = {
+        "define": "using_cuda_clang=true",
+    },
+)
+
+# Equivalent to using_clang && -c opt.
+config_setting(
+    name = "using_clang_opt",
+    values = {
+        "define": "using_cuda_clang=true",
+        "compilation_mode": "opt",
+    },
+)
+
+config_setting(
+    name = "darwin",
+    values = {"cpu": "darwin"},
+)
+
+cc_library(
+    name = "cuda_headers",
+    hdrs = [
+        ":cuda-include",
+        ":cudnn-include",
+    ],
+    includes = [
+        ".",
+        "include",
+    ],
+)
+
+cc_library(
+    name = "cudnn_headers",
+    hdrs = [
+        ":cudnn-include",
+    ],
+    includes = [
+        ".",
+        "include",
+    ],
+)
+
+cc_library(
+    name = "cudart_static",
+    linkopts = [
+        "-L/usr/local/cuda/lib64",
+    ],
+)
+
+cc_library(
+    name = "cuda_driver",
+    linkopts = ["-lcuda"],
+    deps = [":linker_search_path"],
+)
+
+# Provides the RPATH for Nvidia-less sytems to be able to run binaries linked to libcuda.
+cc_library(
+    name = "driver_stub_runtime",
+    linkopts = [
+        "-Wl,-rpath,/usr/local/cuda/lib64/stubs",
+    ],
+    deps = [":cuda_driver"],
+)
+
+cc_library(
+    name = "linker_search_path",
+    linkopts = [
+        "-L/usr/local/cuda/lib64",
+        "-L/usr/local/cuda/lib64/stubs",
+        "-Wl,-rpath-link,/usr/local/cuda/lib64",
+        "-Wl,-rpath-link,/usr/local/cuda/lib64/stubs",
+    ],
+)
+
+[
+    cc_library(
+        name = libname,
+        linkopts = ["-l" + libname] + (["-lgomp"] if (libname == "cusolver") else []),
+        linkstatic = True,
+        deps = [":linker_search_path"],
+    )
+    for libname in [
+        "cublas",
+        "cudart",
+        "cudnn",
+        "cufft",
+        "curand",
+        "cusolver",
+        "cusparse",
+        "nvrtc",
+        "nvToolsExt",
+    ]
+]
+
+cc_library(
+    name = "cuda",
+    deps = [
+        ":cublas",
+        ":cuda_headers",
+        ":cudart",
+        ":cudnn",
+        ":cufft",
+        ":curand",
+        ":nvToolsExt",
+    ],
+)
+
+# NVIDIA Performance Primitives (http://docs.nvidia.com/cuda/npp/modules.html))
+# used by OpenCV
+cc_library(
+    name = "nppi",
+    linkopts = [
+        "-lnppc",
+        "-lnppial",
+        "-lnppicom",
+        "-lnppidei",
+        "-lnppif",
+        "-lnppig",
+        "-lnppim",
+        "-lnppist",
+        "-lnppitc",
+        "-lnpps",
+    ],
+    linkstatic = True,
+    deps = [":linker_search_path"],
+)
+
+# NVIDIA Management Library
+cc_library(
+    name = "nvml",
+    linkopts = [
+        "-lnvidia-ml",
+        "-Wl,-rpath,/usr/lib/nvidia-410",
+        "-Wl,-rpath,/usr/lib/nvidia-390",
+        "-Wl,-rpath,/usr/lib/nvidia-387",
+        "-Wl,-rpath,/usr/lib/nvidia-384",
+    ],
+    deps = [":linker_search_path"],
+)
+
+cc_library(
+    name = "cupti_headers",
+    hdrs = [
+        ":cuda-extras",
+    ],
+    includes = [
+        ".",
+        "extras/CUPTI/include/",
+    ],
+)
+
+# cupti .so exposed at linktime
+cc_library(
+    name = "cupti_link",
+    linkopts = [
+        "-L/usr/local/cuda/extras/CUPTI/lib64",
+        "-lcupti",
+    ],
+)
+
+cc_library(
+    name = "libdevice_root",
+    data = [":cuda-nvvm"],
+)
+
+CUDA_INCLUDES_FILES = [
+    "include/builtin_types.h",
+    "include/channel_descriptor.h",
+    "include/CL/cl_egl.h",
+    "include/CL/cl_ext.h",
+    "include/CL/cl_gl_ext.h",
+    "include/CL/cl_gl.h",
+    "include/CL/cl.h",
+    "include/CL/cl.hpp",
+    "include/CL/cl_platform.h",
+    "include/CL/opencl.h",
+    "include/common_functions.h",
+    "include/cooperative_groups.h",
+    "include/cooperative_groups_helpers.h",
+    "include/crt/common_functions.h",
+    "include/crt/device_double_functions.h",
+    "include/crt/device_double_functions.hpp",
+    "include/crt/device_functions.h",
+    "include/crt/device_functions.hpp",
+    "include/crt/func_macro.h",
+    "include/crt/host_config.h",
+    "include/crt/host_defines.h",
+    "include/crt/host_runtime.h",
+    "include/crt/math_functions.h",
+    "include/crt/math_functions.hpp",
+    "include/crt/mma.h",
+    "include/crt/mma.hpp",
+    "include/crt/nvfunctional",
+    "include/crt/sm_70_rt.h",
+    "include/crt/sm_70_rt.hpp",
+    "include/crt/storage_class.h",
+    # TODO: figure out why on a CI machine with CUDA 10.2 it's not present
+    # "include/cublas_api.h",
+    # "include/cublas.h",
+    # "include/cublas_v2.h",
+    # "include/cublasXt.h",
+    "include/cuComplex.h",
+    "include/cuda_device_runtime_api.h",
+    "include/cudaEGL.h",
+    "include/cuda_egl_interop.h",
+    "include/cuda_fp16.h",
+    "include/cuda_fp16.hpp",
+    "include/cudaGL.h",
+    "include/cuda_gl_interop.h",
+    "include/cuda.h",
+    "include/cudalibxt.h",
+    "include/cuda_occupancy.h",
+    "include/cuda_profiler_api.h",
+    "include/cudaProfiler.h",
+    "include/cudart_platform.h",
+    "include/cuda_runtime_api.h",
+    "include/cuda_runtime.h",
+    "include/cuda_surface_types.h",
+    "include/cuda_texture_types.h",
+    "include/cudaVDPAU.h",
+    "include/cuda_vdpau_interop.h",
+    "include/cufft.h",
+    "include/cufftw.h",
+    "include/cufftXt.h",
+    "include/curand_discrete2.h",
+    "include/curand_discrete.h",
+    "include/curand_globals.h",
+    "include/curand.h",
+    "include/curand_kernel.h",
+    "include/curand_lognormal.h",
+    "include/curand_mrg32k3a.h",
+    "include/curand_mtgp32dc_p_11213.h",
+    "include/curand_mtgp32.h",
+    "include/curand_mtgp32_host.h",
+    "include/curand_mtgp32_kernel.h",
+    "include/curand_normal.h",
+    "include/curand_normal_static.h",
+    "include/curand_philox4x32_x.h",
+    "include/curand_poisson.h",
+    "include/curand_precalc.h",
+    "include/curand_uniform.h",
+    "include/cusolver_common.h",
+    "include/cusolverDn.h",
+    "include/cusolverRf.h",
+    "include/cusolverSp.h",
+    "include/cusolverSp_LOWLEVEL_PREVIEW.h",
+    "include/cusparse.h",
+    "include/cusparse_v2.h",
+    "include/device_atomic_functions.h",
+    "include/device_atomic_functions.hpp",
+    "include/device_double_functions.h",
+    "include/device_functions.h",
+    "include/device_launch_parameters.h",
+    "include/device_types.h",
+    "include/driver_functions.h",
+    "include/driver_types.h",
+    "include/fatBinaryCtl.h",
+    "include/fatbinary.h",
+    "include/host_config.h",
+    "include/host_defines.h",
+    "include/library_types.h",
+    "include/math_constants.h",
+    "include/math_functions.h",
+    "include/mma.h",
+    "include/nppcore.h",
+    "include/nppdefs.h",
+    "include/npp.h",
+    "include/nppi_arithmetic_and_logical_operations.h",
+    "include/nppi_color_conversion.h",
+    "include/nppi_compression_functions.h",
+    "include/nppi_computer_vision.h",
+    "include/nppi_data_exchange_and_initialization.h",
+    "include/nppi_filtering_functions.h",
+    "include/nppi_geometry_transforms.h",
+    "include/nppi.h",
+    "include/nppi_linear_transforms.h",
+    "include/nppi_morphological_operations.h",
+    "include/nppi_statistics_functions.h",
+    "include/nppi_support_functions.h",
+    "include/nppi_threshold_and_compare_operations.h",
+    "include/npps_arithmetic_and_logical_operations.h",
+    "include/npps_conversion_functions.h",
+    "include/npps_filtering_functions.h",
+    "include/npps.h",
+    "include/npps_initialization.h",
+    "include/npps_statistics_functions.h",
+    "include/npps_support_functions.h",
+    # Note: CUDA 10.0 only
+    # "include/nppversion.h",
+    # TODO: figure out why on a CI machine with CUDA 10.2 it's not present
+    # "include/nvblas.h",
+    "include/nvfunctional",
+    "include/nvgraph.h",
+    "include/nvjpeg.h",
+    "include/nvml.h",
+    "include/nvrtc.h",
+    "include/nvToolsExtCuda.h",
+    "include/nvToolsExtCudaRt.h",
+    "include/nvToolsExt.h",
+    "include/nvToolsExtMeta.h",
+    "include/nvToolsExtSync.h",
+    "include/nvtx3/nvToolsExtCuda.h",
+    "include/nvtx3/nvToolsExtCudaRt.h",
+    "include/nvtx3/nvToolsExt.h",
+    "include/nvtx3/nvToolsExtOpenCL.h",
+    "include/nvtx3/nvToolsExtSync.h",
+    "include/nvtx3/nvtxDetail/nvtxImplCore.h",
+    "include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h",
+    "include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h",
+    "include/nvtx3/nvtxDetail/nvtxImpl.h",
+    "include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h",
+    "include/nvtx3/nvtxDetail/nvtxImplSync_v3.h",
+    "include/nvtx3/nvtxDetail/nvtxInitDecls.h",
+    "include/nvtx3/nvtxDetail/nvtxInitDefs.h",
+    "include/nvtx3/nvtxDetail/nvtxInit.h",
+    "include/nvtx3/nvtxDetail/nvtxLinkOnce.h",
+    "include/nvtx3/nvtxDetail/nvtxTypes.h",
+    "include/sm_20_atomic_functions.h",
+    "include/sm_20_atomic_functions.hpp",
+    "include/sm_20_intrinsics.h",
+    "include/sm_20_intrinsics.hpp",
+    "include/sm_30_intrinsics.h",
+    "include/sm_30_intrinsics.hpp",
+    "include/sm_32_atomic_functions.h",
+    "include/sm_32_atomic_functions.hpp",
+    "include/sm_32_intrinsics.h",
+    "include/sm_32_intrinsics.hpp",
+    "include/sm_35_atomic_functions.h",
+    "include/sm_35_intrinsics.h",
+    "include/sm_60_atomic_functions.h",
+    "include/sm_60_atomic_functions.hpp",
+    "include/sm_61_intrinsics.h",
+    "include/sm_61_intrinsics.hpp",
+    # CUDA 10.0 only
+    # "include/sobol_direction_vectors.h",
+    "include/surface_functions.h",
+    "include/surface_functions.hpp",
+    "include/surface_indirect_functions.h",
+    "include/surface_indirect_functions.hpp",
+    "include/surface_types.h",
+    "include/texture_fetch_functions.h",
+    "include/texture_fetch_functions.hpp",
+    "include/texture_indirect_functions.h",
+    "include/texture_indirect_functions.hpp",
+    "include/texture_types.h",
+    "include/vector_functions.h",
+    "include/vector_functions.hpp",
+    "include/vector_types.h",
+]
+
+genrule(
+    name = "cuda-include",
+    outs = CUDA_INCLUDES_FILES,
+    cmd = " && ".join([
+        "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p)
+        for p in CUDA_INCLUDES_FILES
+    ]),
+    local = True,
+    tags = ["no-cache"],
+)
+
+CUDA_NVVM_FILES = [
+    "nvvm/bin/cicc",
+    "nvvm/include/nvvm.h",
+    "nvvm/lib64/libnvvm.so",
+    "nvvm/lib64/libnvvm.so.3",
+    "nvvm/lib64/libnvvm.so.3.3.0",
+    "nvvm/libdevice/libdevice.10.bc",
+]
+
+genrule(
+    name = "cuda-nvvm",
+    outs = CUDA_NVVM_FILES,
+    cmd = " && ".join([
+        "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p)
+        for p in CUDA_NVVM_FILES
+    ]),
+    local = True,
+    tags = ["no-cache"],
+)
+
+CUDA_EXTRAS_FILES = [
+    "extras/CUPTI/include/cuda_stdint.h",
+    "extras/CUPTI/include/cupti.h",
+    "extras/CUPTI/include/cupti_activity.h",
+    "extras/CUPTI/include/cupti_callbacks.h",
+    "extras/CUPTI/include/cupti_driver_cbid.h",
+    "extras/CUPTI/include/cupti_events.h",
+    "extras/CUPTI/include/cupti_metrics.h",
+    "extras/CUPTI/include/cupti_nvtx_cbid.h",
+    "extras/CUPTI/include/cupti_result.h",
+    "extras/CUPTI/include/cupti_runtime_cbid.h",
+    "extras/CUPTI/include/cupti_version.h",
+    "extras/CUPTI/include/generated_cuda_gl_interop_meta.h",
+    "extras/CUPTI/include/generated_cuda_meta.h",
+    "extras/CUPTI/include/generated_cuda_runtime_api_meta.h",
+    "extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h",
+    "extras/CUPTI/include/generated_cudaGL_meta.h",
+    "extras/CUPTI/include/generated_cudaVDPAU_meta.h",
+    "extras/CUPTI/include/generated_nvtx_meta.h",
+    "extras/CUPTI/include/GL/gl.h",
+    "extras/CUPTI/include/GL/glew.h",
+    "extras/CUPTI/include/GL/glext.h",
+    "extras/CUPTI/include/GL/glu.h",
+    "extras/CUPTI/include/GL/glut.h",
+    "extras/CUPTI/include/GL/glx.h",
+    "extras/CUPTI/include/GL/glxext.h",
+    "extras/CUPTI/include/GL/wglew.h",
+    "extras/CUPTI/include/GL/wglext.h",
+    "extras/CUPTI/include/openacc/cupti_openacc.h",
+]
+
+genrule(
+    name = "cuda-extras",
+    outs = CUDA_EXTRAS_FILES,
+    cmd = " && ".join([
+        "ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p)
+        for p in CUDA_EXTRAS_FILES
+    ]),
+    local = True,
+    tags = ["no-cache"],
+)
+
+genrule(
+    name = "cudnn-include",
+    outs = [
+        "include/cudnn.h",
+    ],
+    cmd = """
+        ln -s /usr/include/cudnn.h $(@D)/cudnn.h""",
+    local = True,
+    tags = ["no-cache"],
+)
+
diff --git a/tools/config/BUILD b/tools/config/BUILD
index a8f9d0452fce8..ba13eda2bba7b 100644
--- a/tools/config/BUILD
+++ b/tools/config/BUILD
@@ -13,7 +13,6 @@ selects.config_setting_group(
     name = "cuda_enabled_and_capable",
     match_all = [
         ":cuda",
-        "//tools/toolchain:is_cuda_capable",
     ],
 )
 
diff --git a/tools/rules/workspace.bzl b/tools/rules/workspace.bzl
index 59e12e8d92d03..34317bec25f5d 100644
--- a/tools/rules/workspace.bzl
+++ b/tools/rules/workspace.bzl
@@ -27,3 +27,28 @@ pkg_tar(name = "content", srcs = glob(["**"]))
         path = path,
     )
     _patched_rule(name = name, **kwargs)
+
+def _new_empty_repository_impl(repo_ctx):
+    build_file = repo_ctx.attr.build_file
+    build_file_content = repo_ctx.attr.build_file_content
+    if not (bool(build_file) != bool(build_file_content)):
+        fail("Exactly one of 'build_file' or 'build_file_content' is required")
+
+    if build_file_content:
+        repo_ctx.file("BUILD", build_file_content)
+    elif build_file:
+        repo_ctx.template("BUILD", repo_ctx.attr.build_file, {})
+
+new_empty_repository = repository_rule(
+    attrs = {
+        "build_file": attr.label(allow_files = True),
+        "build_file_content": attr.string(),
+    },
+    implementation = _new_empty_repository_impl,
+)
+
+"""Create an empty repository with the supplied BUILD file.
+
+This is mostly useful to create wrappers for specific target that we want
+to be used with the '@' syntax.
+"""

From a9983ac09c9772fa426224b080090083b4c9607b Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Fri, 27 Aug 2021 09:37:10 -0700
Subject: [PATCH 294/530] Refactor structured set_output in
 Register{DispatchKey}.cpp (#62188)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62188

These parts of the `set_output` code are identical for all operators in the
kernel registration files. So, this moves them from being copied into every
class to two helper functions at the top of the file.

Test Plan: Imported from OSS

Reviewed By: soulitzer

Differential Revision: D29962045

Pulled By: albanD

fbshipit-source-id: 753b8aac755f3c91b77ffa2c30a89ac91a84b7c4
---
 .../ATen/templates/RegisterDispatchKey.cpp    |   2 +
 tools/codegen/dest/__init__.py                |   1 +
 tools/codegen/dest/register_dispatch_key.py   | 130 +++++++++++-------
 tools/codegen/gen.py                          |   1 +
 tools/codegen/gen_backend_stubs.py            |   1 +
 5 files changed, 84 insertions(+), 51 deletions(-)

diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp
index 1abc3ee391ae2..16caf5326c711 100644
--- a/aten/src/ATen/templates/RegisterDispatchKey.cpp
+++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp
@@ -43,6 +43,8 @@ namespace at {
 // at namespace already.
 namespace {
 
+${dispatch_helpers}
+
 ${dispatch_anonymous_definitions}
 
 TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) {
diff --git a/tools/codegen/dest/__init__.py b/tools/codegen/dest/__init__.py
index ab4bada277572..441e4426cf29e 100644
--- a/tools/codegen/dest/__init__.py
+++ b/tools/codegen/dest/__init__.py
@@ -1,2 +1,3 @@
 from .register_dispatch_key import RegisterDispatchKey as RegisterDispatchKey
+from .register_dispatch_key import gen_registration_helpers as gen_registration_helpers
 from .native_functions import compute_native_function_declaration as compute_native_function_declaration
diff --git a/tools/codegen/dest/register_dispatch_key.py b/tools/codegen/dest/register_dispatch_key.py
index a943f51ba5898..784ee56e765fb 100644
--- a/tools/codegen/dest/register_dispatch_key.py
+++ b/tools/codegen/dest/register_dispatch_key.py
@@ -23,6 +23,79 @@
 from tools.codegen.api.translate import translate
 from tools.codegen.selective_build.selector import SelectiveBuilder
 
+
+def gen_create_out_helper(backend_index: BackendIndex) -> List[str]:
+    if backend_index.dispatch_key == DispatchKey.Meta:
+        # TODO: dedupe this with below
+        core = """
+if (strides.empty()) {
+    return at::empty(sizes, options.device(at::kMeta));
+} else {
+    return at::empty_strided(sizes, strides, options.device(at::kMeta));
+}
+"""
+    else:
+        expanded_topts = "optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), " \
+            "options.device_opt(), options.pinned_memory_opt()"
+        empty_init = ""
+        if backend_index.dispatch_key == DispatchKey.CPU:
+            empty_impl = "at::native::empty_cpu"
+            empty_strided_impl = "at::native::empty_strided_cpu"
+        elif backend_index.dispatch_key == DispatchKey.CUDA:
+            empty_init = "globalContext().lazyInitCUDA();"
+            empty_impl = "at::native::empty_cuda"
+            empty_strided_impl = "at::native::empty_strided_cuda"
+        elif backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd:
+            empty_impl = "at::empty"
+            empty_strided_impl = "at::empty_strided"
+        else:
+            return []
+        core = f"""
+  {empty_init}
+  if (strides.empty()) {{
+      return {empty_impl}(sizes, {expanded_topts}, options.memory_format_opt());
+  }} else {{
+      // TODO: assert options.memory_format_opt() is nullopt (debug only?)
+      return {empty_strided_impl}(sizes, strides, {expanded_topts});
+  }}
+"""
+    return [f"""
+Tensor create_out(IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {{
+{core}
+}}
+"""]
+
+
+def gen_resize_out_helper(backend_index: BackendIndex) -> List[str]:
+    return ["""
+void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {
+  TORCH_CHECK(options.dtype() == out.dtype(),
+      "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead");
+  TORCH_CHECK(options.device() == out.device(),
+      "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead");
+  const bool resized = at::native::resize_output(out, sizes);
+  // Only restride if a resize occurred; otherwise we ignore the (advisory)
+  // strides from the meta function and directly use the output tensor's
+  // preexisting strides
+  if (resized) {
+    if (!strides.empty()) {
+      TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
+      at::native::as_strided_(out, sizes, strides);
+    } else if (options.memory_format_opt().has_value()) {
+      out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt());
+    }
+  }
+}
+"""]
+
+
+def gen_registration_helpers(backend_index: BackendIndex) -> List[str]:
+    return [
+        *gen_create_out_helper(backend_index),
+        *gen_resize_out_helper(backend_index)
+    ]
+
+
 # Generates Register{dispatch}.cpp (e.g., RegisterCPU.cpp).
 #
 #   - The primary function of this file is to register all of the
@@ -344,62 +417,17 @@ def gen_class_set_output_body(self, k: SchemaKind) -> str:
             maybe_set_guard_line = maybe_set_guard = ''
 
         if k is SchemaKind.functional:
-            if self.backend_index.dispatch_key == DispatchKey.Meta:
-                # TODO: dedupe this with below
-                return """
-if (strides.empty()) {
-    outputs_[output_idx] = at::empty(sizes, options.device(at::kMeta));
-} else {
-    outputs_[output_idx] = at::empty_strided(sizes, strides, options.device(at::kMeta));
-}
-"""
-            else:
-                expanded_topts = "optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), " \
-                    "options.device_opt(), options.pinned_memory_opt()"
-                empty_init = ""
-                if self.backend_index.dispatch_key == DispatchKey.CPU:
-                    empty_impl = "at::native::empty_cpu"
-                    empty_strided_impl = "at::native::empty_strided_cpu"
-                elif self.backend_index.dispatch_key == DispatchKey.CUDA:
-                    empty_init = "globalContext().lazyInitCUDA();"
-                    empty_impl = "at::native::empty_cuda"
-                    empty_strided_impl = "at::native::empty_strided_cuda"
-                elif self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd:
-                    empty_impl = "at::empty"
-                    empty_strided_impl = "at::empty_strided"
-                else:
-                    raise AssertionError("unsupported dispatch key")
-                return f"""{maybe_set_guard_line}
-{empty_init}
-if (strides.empty()) {{
-    outputs_[output_idx] = {empty_impl}(sizes, {expanded_topts}, options.memory_format_opt());
-}} else {{
-    // TODO: assert options.memory_format_opt() is nullopt (debug only?)
-    outputs_[output_idx] = {empty_strided_impl}(sizes, strides, {expanded_topts});
-}}
-"""
+            assert self.backend_index.dispatch_key in (
+                DispatchKey.Meta, DispatchKey.CPU, DispatchKey.CUDA,
+                DispatchKey.CompositeExplicitAutograd)
+            return f"""{maybe_set_guard_line}
+outputs_[output_idx] = create_out(sizes, strides, options);"""
         elif k is SchemaKind.inplace:
             return maybe_set_guard
         elif k is SchemaKind.out:
             return f"""{maybe_set_guard_line}
 const auto& out = outputs_[output_idx].get();
-TORCH_CHECK(options.dtype() == out.dtype(),
-    "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead");
-TORCH_CHECK(options.device() == out.device(),
-    "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead");
-bool resized = at::native::resize_output(outputs_[output_idx], sizes);
-// Only restride if a resize occurred; otherwise we ignore the (advisory)
-// strides from the meta function and directly use the output tensor's
-// preexisting strides
-if (resized) {{
-    if (!strides.empty()) {{
-        TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value());
-        at::native::as_strided_(outputs_[output_idx], sizes, strides);
-    }} else if (options.memory_format_opt().has_value()) {{
-        outputs_[output_idx].get().unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt());
-    }}
-}}
-"""
+resize_out(out, sizes, strides, options);"""
         else:
             assert_never(k)
 
diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py
index ffa4ed7a1c70e..203b5a99c356c 100644
--- a/tools/codegen/gen.py
+++ b/tools/codegen/gen.py
@@ -1100,6 +1100,7 @@ def make_file_manager(install_dir: str) -> FileManager:
             'namespaced_headers': f'#include <ATen/{dispatch_key}Functions.h>' if dispatch_key in functions_keys else '',
             'DispatchKey': dispatch_key,
             'dispatch_namespace': dispatch_key.lower(),
+            'dispatch_helpers': dest.gen_registration_helpers(backend_indices[dispatch_key]),
             'dispatch_namespaced_definitions': list(concatMap(
                 dest.RegisterDispatchKey(
                     backend_indices[dispatch_key],
diff --git a/tools/codegen/gen_backend_stubs.py b/tools/codegen/gen_backend_stubs.py
index 51f81c702e122..5fad11c343804 100644
--- a/tools/codegen/gen_backend_stubs.py
+++ b/tools/codegen/gen_backend_stubs.py
@@ -231,6 +231,7 @@ def make_file_manager(install_dir: str) -> FileManager:
                 'namespaced_headers': '',
                 'DispatchKey': dispatch_key,
                 'dispatch_namespace': dispatch_key.lower(),
+                'dispatch_helpers': dest.gen_registration_helpers(backend_indices[dispatch_key]),
                 'dispatch_namespaced_definitions': list(concatMap(
                     dest.RegisterDispatchKey(
                         backend_indices[dispatch_key],

From 3abb6060910186f38fc1eef2a2169639533f0c54 Mon Sep 17 00:00:00 2001
From: Jonathan Chang <ttjtftx@gmail.com>
Date: Fri, 27 Aug 2021 09:49:39 -0700
Subject: [PATCH 295/530] Add doc for nn.MultiMarginLoss (shape, example)
 (#63760)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63747

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63760

Reviewed By: malfet

Differential Revision: D30541581

Pulled By: jbschlosser

fbshipit-source-id: 99560641e614296645eb0e51999513f57dfcfa98
---
 torch/nn/modules/loss.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 03732b6d192d8..af1da83eeef5b 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1323,7 +1323,7 @@ class MultiMarginLoss(_WeightedLoss):
     The loss function then becomes:
 
     .. math::
-        \text{loss}(x, y) = \frac{\sum_i \max(0, w[y] * (\text{margin} - x[y] + x[i]))^p)}{\text{x.size}(0)}
+        \text{loss}(x, y) = \frac{\sum_i \max(0, w[y] * (\text{margin} - x[y] + x[i]))^p}{\text{x.size}(0)}
 
     Args:
         p (int, optional): Has a default value of :math:`1`. :math:`1` and :math:`2`
@@ -1347,6 +1347,20 @@ class MultiMarginLoss(_WeightedLoss):
             elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
             and :attr:`reduce` are in the process of being deprecated, and in the meantime,
             specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(N, C)` or :math:`(C)`, where :math:`N` is the batch size and :math:`C` is the number of classes.
+        - Target: :math:`(N)` or :math:`()`, where each value is :math:`0 \leq \text{targets}[i] \leq C-1`.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the target.
+
+    Examples::
+
+        >>> loss = nn.MultiMarginLoss()
+        >>> x = torch.tensor([[0.1, 0.2, 0.4, 0.8]])
+        >>> y = torch.tensor([3])
+        >>> loss(x, y)
+        >>> # 0.25 * ((1-(0.8-0.1)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))
+        tensor(0.3250)
     """
     __constants__ = ['p', 'margin', 'reduction']
     margin: float

From babd4499783abc699faf36f3a72a9fc491e0e572 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Fri, 27 Aug 2021 10:10:48 -0700
Subject: [PATCH 296/530] [JIT] Add aten::slice optimization (#63049)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63049

Given a graph produced from a function like this:
```
def foo():
    li = [1, 2, 3, 4, 5, 6]
    return li[0:2]
```
This pass produces a graph like this:
```
def foo():
    li = [1, 2]
    return li
```

These changes are mostly adapted from https://github.com/pytorch/pytorch/pull/62297/

Test Plan: `buck test //caffe2/test:jit -- TestPeephole`

Reviewed By: eellison

Differential Revision: D30231044

fbshipit-source-id: d12ee39f68289a574f533041a5adb38b2f000dd5
---
 test/jit/test_peephole.py                     | 74 +++++++++++++-
 .../csrc/jit/passes/peephole_list_idioms.cpp  | 97 +++++++++++++------
 torch/csrc/jit/passes/peephole_list_idioms.h  |  8 ++
 3 files changed, 148 insertions(+), 31 deletions(-)

diff --git a/test/jit/test_peephole.py b/test/jit/test_peephole.py
index 23de44807761c..ecb4a06dfe0b9 100644
--- a/test/jit/test_peephole.py
+++ b/test/jit/test_peephole.py
@@ -2,7 +2,7 @@
 from torch.testing._internal.jit_utils import JitTestCase, RUN_CUDA, _inline_everything
 from torch import nn
 from torch.testing import FileCheck
-from typing import List
+from typing import Callable, List
 
 import unittest
 
@@ -721,3 +721,75 @@ def foo():
         self.run_pass("peephole", foo.graph)
         FileCheck().check("DictConstruct").check("len").run(foo.graph)
         self.assertEqual(foo(), 1)
+
+    def test_peephole_slice_all_three_args(self):
+        def foo(x: int):
+            return [1, 2, x, 4, 5, 6, 7][-5:6:2]
+
+        graph = torch.jit.script(foo).graph
+        self.run_pass("peephole", graph)
+        FileCheck().check_not("aten::slice").run(graph)
+        self.checkScript(foo, (3, ))
+
+    def test_peephole_slice_one_empty_arg(self):
+        def check_helper(fn: Callable[[int], None]) -> None:
+            graph = torch.jit.script(fn).graph
+            self.run_pass("peephole", graph)
+            FileCheck().check_not("aten::slice").run(graph)
+            self.checkScript(fn, (3, ))
+
+        def foo(x: int):
+            return [1, 2, x, 4, 5, 6, 7][1::2]
+
+        check_helper(foo)
+
+        def foo(x: int):
+            return [1, 2, x, 4, 5, 6, 7][:5:3]
+
+        check_helper(foo)
+
+        def foo(x: int):
+            return [1, 2, x, 4, 5, 6, 7][0:4]
+
+        check_helper(foo)
+
+    def test_peephole_slice_two_empty_args(self):
+        def check_helper(fn: Callable[[int], None]) -> None:
+            graph = torch.jit.script(fn).graph
+            self.run_pass("peephole", graph)
+            FileCheck().check_not("aten::slice").run(graph)
+            self.checkScript(fn, (3, ))
+
+        def foo(x: int):
+            return [1, 2, x, 4, 5, 6, 7][::2]
+
+        check_helper(foo)
+
+        def foo(x: int):
+            return [1, 2, x, 4, 5, 6, 7][:5]
+
+        check_helper(foo)
+
+        def foo(x: int):
+            return [1, 2, x, 4, 5, 6, 7][1:]
+
+        check_helper(foo)
+
+    def test_peephole_slice_optimization_not_applied_list_modified(self):
+        @torch.jit.script
+        def foo():
+            li = [1, 2, 3, 4, 5, 6, 7]
+            li[0] = 0
+            return li[2:5]
+
+        self.run_pass("peephole", foo.graph)
+        FileCheck().check("aten::slice").run(foo.graph)
+
+    def test_peephole_slice_optimization_not_applied_non_const_args(self):
+        @torch.jit.script
+        def foo(x: int, y: int):
+            li = [1, 2, 3, 4, 5, 6, 7]
+            return li[x:y]
+
+        self.run_pass("peephole", foo.graph)
+        FileCheck().check("aten::slice").run(foo.graph)
diff --git a/torch/csrc/jit/passes/peephole_list_idioms.cpp b/torch/csrc/jit/passes/peephole_list_idioms.cpp
index f33f388259d20..ec3d249b8b1be 100644
--- a/torch/csrc/jit/passes/peephole_list_idioms.cpp
+++ b/torch/csrc/jit/passes/peephole_list_idioms.cpp
@@ -7,7 +7,9 @@
 #include <torch/csrc/jit/passes/peephole_list_idioms.h>
 #include <torch/csrc/jit/passes/value_refinement_utils.h>
 #include <torch/csrc/jit/runtime/graph_executor.h>
+#include <torch/csrc/jit/runtime/slice_indices_adjust.h>
 #include <torch/csrc/utils/memory.h>
+#include <limits>
 
 namespace torch {
 namespace jit {
@@ -57,7 +59,7 @@ struct ListLenRefiner {
       }
 
       auto first_input = n->input(0);
-      if (first_input->type()->cast<ListType>() &&
+      if (first_input->type()->castRaw<ListType>() &&
           !mutated_lists_.count(first_input)) {
         if (!li_with_len_use.count(first_input)) {
           li_with_len_use.insert(first_input);
@@ -172,7 +174,7 @@ struct PeepholeOptimizeListIdiomsImpl {
 
  private:
   void checkForMutatedList(Value* v) {
-    if (v->type()->cast<ListType>() && aliasDb_->hasWriters(v)) {
+    if (v->type()->castRaw<ListType>() && aliasDb_->hasWriters(v)) {
       mutated_lists_.insert(v);
     }
   }
@@ -191,6 +193,43 @@ struct PeepholeOptimizeListIdiomsImpl {
     }
   }
 
+  bool optimizeSlice(Node* slice_node, Node* list_construct_node) {
+    auto start_val = toIValue(slice_node->input(1));
+    auto end_val = toIValue(slice_node->input(2));
+    auto step_val = toIValue(slice_node->input(3));
+
+    // All args must be constant to apply this optimization.
+    if (start_val == c10::nullopt || end_val == c10::nullopt ||
+        step_val == c10::nullopt) {
+      return false;
+    }
+
+    int64_t start = start_val->isInt() ? start_val->to<int64_t>()
+                                       : std::numeric_limits<int64_t>::max();
+    int64_t end = end_val->isInt() ? end_val->to<int64_t>()
+                                   : std::numeric_limits<int64_t>::max();
+    int64_t step = step_val->isInt() ? step_val->to<int64_t>() : 1;
+
+    size_t list_size = list_construct_node->inputs().size();
+    size_t num_values = slice_indices_adjust(list_size, &start, &end, step);
+
+    WithInsertPoint guard(slice_node);
+    auto slice_list_construct =
+        graph_->insertNode(graph_->create(prim::ListConstruct));
+    slice_list_construct->output()->setType(slice_node->output()->type());
+    for (size_t i = start, j = 0; j < num_values; ++j) {
+      slice_list_construct->addInput(list_construct_node->input(i));
+      i += step;
+    }
+
+    slice_node->output()->replaceAllUsesWith(slice_list_construct->output());
+    if (mutated_lists_.count(slice_node->output())) {
+      mutated_lists_.insert(slice_list_construct->output());
+    }
+
+    return true;
+  }
+
   bool runBlock(Block* block) {
     bool changed = false;
     for (Node* node : block->nodes()) {
@@ -200,7 +239,7 @@ struct PeepholeOptimizeListIdiomsImpl {
 
       // only optimizing list ops
       if (node->inputs().size() == 0 ||
-          !node->input(0)->type()->cast<ListType>()) {
+          !node->input(0)->type()->castRaw<ListType>()) {
         continue;
       }
 
@@ -211,36 +250,33 @@ struct PeepholeOptimizeListIdiomsImpl {
         continue;
       }
 
+      auto list_creation_node = first_input->node();
+      if (list_creation_node->kind() != prim::ListConstruct) {
+        continue;
+      }
+
       if (node->kind() == aten::len) {
-        if (first_input->node()->kind() == prim::ListConstruct) {
-          WithInsertPoint guard(node);
-          node->output()->replaceAllUsesWith(graph_->insertConstant(
-              static_cast<int64_t>(first_input->node()->inputs().size())));
-          changed = true;
-        }
+        WithInsertPoint guard(node);
+        node->output()->replaceAllUsesWith(graph_->insertConstant(
+            static_cast<int64_t>(first_input->node()->inputs().size())));
+        changed = true;
       } else if (node->kind() == aten::__getitem__) {
-        auto list_creation_node = first_input->node();
-        if (list_creation_node->kind() == prim::ListConstruct) {
-          if (auto index = toIValue(node->input(1))) {
-            size_t list_size = list_creation_node->inputs().size();
-            if (auto norm_index = normalizeIndex(index->toInt(), list_size)) {
-              node->output()->replaceAllUsesWith(
-                  list_creation_node->input(*norm_index));
-              changed = true;
-            }
+        if (auto index = toIValue(node->input(1))) {
+          size_t list_size = list_creation_node->inputs().size();
+          if (auto norm_index = normalizeIndex(index->toInt(), list_size)) {
+            node->output()->replaceAllUsesWith(
+                list_creation_node->input(*norm_index));
+            changed = true;
           }
         }
       } else if (node->kind() == prim::ListUnpack) {
-        auto list_creation_node = first_input->node();
-        if (list_creation_node->kind() == prim::ListConstruct) {
-          // if sizes are unequal it's a runtime error
-          if (list_creation_node->inputs().size() != node->outputs().size()) {
-            continue;
-          }
-          for (size_t i = 0; i < node->outputs().size(); ++i) {
-            node->output(i)->replaceAllUsesWith(list_creation_node->input(i));
-            changed = true;
-          }
+        // if sizes are unequal it's a runtime error
+        if (list_creation_node->inputs().size() != node->outputs().size()) {
+          continue;
+        }
+        for (size_t i = 0; i < node->outputs().size(); ++i) {
+          node->output(i)->replaceAllUsesWith(list_creation_node->input(i));
+          changed = true;
         }
       } else if (node->kind() == aten::add) {
         if (node->inputs().size() != 2) {
@@ -251,8 +287,7 @@ struct PeepholeOptimizeListIdiomsImpl {
         if (mutated_lists_.count(second_input)) {
           continue;
         }
-        if (first_input->node()->kind() != prim::ListConstruct ||
-            second_input->node()->kind() != prim::ListConstruct) {
+        if (second_input->node()->kind() != prim::ListConstruct) {
           continue;
         }
         WithInsertPoint guard(node);
@@ -270,6 +305,8 @@ struct PeepholeOptimizeListIdiomsImpl {
           mutated_lists_.insert(list_construct->output());
         }
         changed = true;
+      } else if (node->kind() == aten::slice) {
+        changed |= optimizeSlice(node, first_input->node());
       }
     }
     return changed;
diff --git a/torch/csrc/jit/passes/peephole_list_idioms.h b/torch/csrc/jit/passes/peephole_list_idioms.h
index c8add4849d4ce..d20df9571db01 100644
--- a/torch/csrc/jit/passes/peephole_list_idioms.h
+++ b/torch/csrc/jit/passes/peephole_list_idioms.h
@@ -51,6 +51,14 @@ namespace jit {
 //
 // This is only applied to lists that are not modified.
 //
+// 5. Slice
+// Given a function like this:
+//     def foo():
+//         return [1, 2, 3, 4, 5][0:2]
+// This pass produces (after deadcode elimination):
+//     def foo():
+//         return [1, 2]
+//
 // Currently this is invoked as part of PeepholeOptimize
 // return true if graph is modified.
 // If `refine_list_len` is true will attempt to refine the len of lists through

From eca87f729d071d12ccb31dd2c958a989d8ac17af Mon Sep 17 00:00:00 2001
From: Heitor Schueroff <heitorschueroff@fb.com>
Date: Fri, 27 Aug 2021 10:16:02 -0700
Subject: [PATCH 297/530] Added reference tests to ReductionOpInfo (#62900)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62900

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D30408815

Pulled By: heitorschueroff

fbshipit-source-id: 6a1f82ac281920ff7405a42f46ccd796e60af9d6
---
 aten/src/ATen/native/cpu/ReduceOpsKernel.cpp  |  29 +--
 test/test_reductions.py                       | 111 ++++++++++-
 .../_internal/common_methods_invocations.py   | 179 +++++++++++++-----
 3 files changed, 258 insertions(+), 61 deletions(-)

diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
index 89d2fb21fb511..01ed54e56fc73 100644
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -163,24 +163,29 @@ static void std_var_kernel_impl(TensorIterator& iter, int64_t correction, bool t
 }
 
 static void prod_kernel_impl(TensorIterator& iter) {
-  // Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context]
+  // Workaround for the error: '*' in boolean context, suggest '&&' instead
+  // [-Werror=int-in-bool-context]
   if (iter.dtype() == ScalarType::Bool) {
     using scalar_t = bool;
     binary_kernel_reduce_vec(
-      iter,
-      [=](scalar_t a, scalar_t b) -> scalar_t { return a && b; },
-      [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return a && b; },
-      // NOLINTNEXTLINE(bugprone-argument-comment)
-      /*identity=*/1);
-  } else {
-    AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] {
-      binary_kernel_reduce_vec(
         iter,
-        [=](scalar_t a, scalar_t b) -> scalar_t { return a * b; },
-        [=](Vectorized <scalar_t> a, Vectorized <scalar_t> b) { return a * b; },
+        [=](scalar_t a, scalar_t b)
+            __ubsan_ignore_undefined__ -> scalar_t { return a && b; },
+        [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
+            __ubsan_ignore_undefined__ { return a && b; },
         // NOLINTNEXTLINE(bugprone-argument-comment)
         /*identity=*/1);
-      });
+  } else {
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] {
+      binary_kernel_reduce_vec(
+          iter,
+          [=](scalar_t a, scalar_t b)
+              __ubsan_ignore_undefined__ -> scalar_t { return a * b; },
+          [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
+              __ubsan_ignore_undefined__ { return a * b; },
+          // NOLINTNEXTLINE(bugprone-argument-comment)
+          /*identity=*/1);
+    });
   }
 }
 
diff --git a/test/test_reductions.py b/test/test_reductions.py
index e716336e4afe7..eed7f732051cd 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -10,7 +10,7 @@
 
 from torch._six import inf, nan
 from torch.testing import (
-    integral_types_and, floating_and_complex_types_and)
+    integral_types_and, floating_and_complex_types_and, get_all_dtypes)
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict,
     IS_WINDOWS, make_tensor)
@@ -296,6 +296,115 @@ def test_empty_tensor_nonempty_slice(self, device, op: ReductionOpInfo):
             result = op(t, *args, dim=dim, **kwargs)
             self.assertEqual(result.shape, _reduced_shape(t.shape, dim))
 
+    def _test_noncontiguous(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_kwargs):
+        """Helper method to test noncontiguous input tensors."""
+        assert not t.is_contiguous()
+
+        t_contig = t.contiguous()
+        for args, kwargs in op.generate_args_kwargs(t_contig, **reduction_kwargs):
+            kwargs.update(reduction_kwargs)
+            result = op(t, *args, **kwargs)
+            expected = op(t_contig, *args, **kwargs)
+            self.assertEqual(result, expected)
+
+    @ops(reduction_ops)
+    def test_noncontiguous_innermost(self, device, dtype, op: ReductionOpInfo):
+        """Tests reducing along noncontiguous innermost dimension."""
+        t = make_tensor((10, 10), device, dtype)
+        self._test_noncontiguous(op, t[:, ::2], dim=1)
+
+    @ops(reduction_ops)
+    def test_noncontiguous_outermost(self, device, dtype, op: ReductionOpInfo):
+        """Tests reducing along noncontiguous outermost dimension."""
+        t = make_tensor((10, 10), device, dtype)
+        self._test_noncontiguous(op, t[::2, :], dim=0)
+
+    @ops(reduction_ops)
+    def test_noncontiguous_all(self, device, dtype, op: ReductionOpInfo):
+        """Tests reducing all dimensions of a noncontiguous tensor."""
+        t = make_tensor((5, 5, 5), device, dtype)
+        self._test_noncontiguous(op, t[::2, ::3, 1:-1:2])
+
+    @ops(reduction_ops)
+    def test_noncontiguous_transposed(self, device, dtype, op: ReductionOpInfo):
+        """Tests reducing a transposed tensor."""
+        t = make_tensor((5, 5), device, dtype)
+        self._test_noncontiguous(op, t.T)
+
+    @ops(reduction_ops)
+    def test_noncontiguous_expanded(self, device, dtype, op: ReductionOpInfo):
+        """Tests reducing a tensor with expanded singleton dimensions."""
+        t = make_tensor((2, 3), device, dtype)
+        self._test_noncontiguous(op, t.unsqueeze(1).expand(-1, 5, -1))
+
+    # NumPy does not support BFloat16 so we don't test that against reference
+    # implementations. We also don't compare dtypes or test for different
+    # keepdim because we already have other tests covering those.
+    # The test_reference_testing in test_ops.py only uses the samples from
+    # sample_inputs_func which do not test as exhaustively as these tests.
+
+    def _test_ref(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_kwargs):
+        """Compares op against op.ref for the given input and reduction kwargs"""
+        for args, kwargs in op.generate_args_kwargs(t, **reduction_kwargs):
+            kwargs.update(reduction_kwargs)
+            result = op(t, *args, **kwargs)
+            expected = op.ref(t.detach().cpu().numpy(), *args, **kwargs)
+            self.assertEqual(result, expected, exact_dtype=False)
+
+    @ops(filter(lambda op: op.ref is not None, reduction_ops),
+         allowed_dtypes=get_all_dtypes(include_bfloat16=False))
+    def test_ref_scalar_input(self, device, dtype, op: ReductionOpInfo):
+        """Compares op against reference for scalar input tensors"""
+        self._test_ref(op, make_tensor([], device, dtype))
+
+    @ops(filter(lambda op: op.ref is not None, reduction_ops),
+         allowed_dtypes=get_all_dtypes(include_bfloat16=False))
+    def test_ref_small_input(self, device, dtype, op: ReductionOpInfo):
+        """Compares op against reference for small input tensors"""
+        t = make_tensor((5, 3, 4, 2), device, dtype, exclude_zero=True)
+        self._test_ref(op, t)
+        for dim in [0, 1, 3] + ([[0, 2], [1, 3]] if op.supports_multiple_dims else []):
+            self._test_ref(op, t, dim=dim)
+
+    @ops(filter(lambda op: op.ref is not None, reduction_ops),
+         allowed_dtypes=[torch.float32])
+    def test_ref_large_input_1D(self, device, dtype, op: ReductionOpInfo):
+        """Compares op against reference for a large 1D input tensor to check stability"""
+        self._test_ref(op, make_tensor((2 ** 20,), device, dtype, low=-1, high=2, exclude_zero=True))
+
+    @ops(filter(lambda op: op.ref is not None, reduction_ops),
+         allowed_dtypes=[torch.float32])
+    def test_ref_large_input_2D(self, device, dtype, op: ReductionOpInfo):
+        """Compares op against reference for a large 2D input tensor to test parallelism"""
+        t = make_tensor((32, 2 ** 16), device, dtype, low=-1, high=2, exclude_zero=True)
+        self._test_ref(op, t, dim=1)
+
+    @ops(filter(lambda op: op.ref is not None, reduction_ops),
+         allowed_dtypes=[torch.float32])
+    def test_ref_large_input_64bit_indexing(self, device, dtype, op: ReductionOpInfo):
+        """Compares op against reference for a very large input tensor that requires 64 bit indexing"""
+        self._test_ref(op, make_tensor((275000000,), device, dtype, low=-1, high=2, exclude_zero=True))
+
+    @ops(filter(lambda op: op.ref is not None, reduction_ops),
+         allowed_dtypes=get_all_dtypes(include_bfloat16=False))
+    def test_ref_duplicate_values(self, device, dtype, op: ReductionOpInfo):
+        """Compares op against reference for input tensors with duplicate values"""
+        t = make_tensor((8, 8), device, dtype, exclude_zero=True)
+        t[::2, ::2] = t[1::2, 1::2]
+        self._test_ref(op, t)
+        self._test_ref(op, t, dim=0)
+        self._test_ref(op, t, dim=1)
+
+    @ops(filter(lambda op: op.ref is not None, reduction_ops),
+         allowed_dtypes=[torch.float32, torch.complex64])
+    def test_ref_extremal_values(self, device, dtype, op: ReductionOpInfo):
+        """Compares op against reference for input tensors with extremal values"""
+        t = make_tensor((10,), device, dtype, exclude_zero=True)
+        extremals = [0, 1] + [nan, inf, -inf] if torch.is_floating_point(t) else []
+        for extremal in extremals:
+            t[5] = extremal
+            self._test_ref(op, t)
+
     ###########################################################################
     # TODO: Legacy tests - port to ReductionOpInfo
     ###########################################################################
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 4331c92d56599..2230808b5fd43 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -187,6 +187,8 @@ def _np(t):
                 return tuple(map(to_numpy, x))
             elif isinstance(x, dict):
                 return {k: to_numpy(v) for k, v in x.items()}
+            elif isinstance(x, torch.dtype):
+                return torch.empty(0, dtype=x).numpy().dtype
             elif isinstance(x, (numbers.Number, bool, str)):
                 return x
 
@@ -782,8 +784,8 @@ def _generate_reduction_inputs(device, dtype, requires_grad):
     """Generates input tensors for testing reduction operators"""
     yield make_tensor([], device, dtype, requires_grad=requires_grad)
     yield make_tensor([2], device, dtype, requires_grad=requires_grad)
-    yield make_tensor([2, 3], device, dtype, requires_grad=requires_grad, noncontiguous=True)
-    yield make_tensor([3, 2, 1, 5], device, dtype, requires_grad=requires_grad)
+    yield make_tensor([3, 5], device, dtype, requires_grad=requires_grad, noncontiguous=True)
+    yield make_tensor([3, 2, 1, 2], device, dtype, requires_grad=requires_grad)
 
 
 def _generate_reduction_kwargs(ndim, supports_multiple_dims=True):
@@ -927,6 +929,8 @@ def sample_inputs_func(*args, **kwargs):
         # Override OpInfo defaults and call base class __init__
         kwargs.setdefault('inplace_variant', None)
         kwargs.setdefault('sample_inputs_func', sample_inputs_func)
+        kwargs.setdefault('default_test_dtypes', (
+            torch.uint8, torch.int64, torch.float16, torch.bfloat16, torch.float32, torch.complex64))
         super(ReductionOpInfo, self).__init__(name, **kwargs)
 
         self.identity = identity
@@ -4080,38 +4084,6 @@ def generator():
 
     return list(generator())
 
-def sample_inputs_prod(op_info, device, dtype, requires_grad):
-    def make_arg(shape):
-        # shrink values to be in the interval [-1, +1] for better precision in gradgradcheck
-        return make_tensor(shape, device, dtype, low=-1, high=+1, requires_grad=requires_grad)
-
-    def prod_single_zero():
-        result = make_arg(2 * (S,))
-        with torch.no_grad():
-            result[0, 1] = 0
-        return result
-
-    # will not be needed once OpInfo tests support Iterables
-    def sample_generator():
-        for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad):
-            yield SampleInput(sample.input)  # only Tensor, ignore other inputs
-            yield sample
-            sample.kwargs['keepdim'] = True
-            yield sample
-        yield SampleInput(prod_single_zero())
-        yield SampleInput(make_arg((3, 3, 3)), args=(1,))
-        yield SampleInput(make_arg((3, 3, 3)), args=(1,), kwargs={'keepdim': True})
-
-        # test zero scalar tensor
-        zero = make_arg(())
-        with torch.no_grad():
-            zero.zero_()
-        yield SampleInput(zero)
-        yield SampleInput(zero, args=(0,))
-        yield SampleInput(zero, args=(0,), kwargs={'keepdim': True})
-
-    return list(sample_generator())
-
 
 def sample_inputs_nextafter(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -5521,6 +5493,53 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
     return op(input.triu() if upper else input.tril(), upper)
 
 
+def reference_reduction_numpy(f, supports_keepdims=True):
+    """Wraps a NumPy reduction operator.
+
+    The wrapper function will forward dim and keepdim kwargs to the wrapped
+    function as the NumPy equivalent axis and keepdims kwargs.
+
+    Args:
+        f: NumPy reduction operator to wrap
+        supports_keepdims (bool, optional): Whether the NumPy operator accepts
+            keepdims parameter. If it does not, the wrapper will manually unsqueeze
+            the reduced dimensions if it was called with keepdim=True. Defaults to True.
+
+    Returns:
+        Wrapped function
+    """
+    @wraps(f)
+    def wrapper(x: np.ndarray, *args, **kwargs):
+        # Copy keys into a set
+        keys = set(kwargs.keys())
+
+        dim = kwargs.pop('dim', None)
+        keepdim = kwargs.pop('keepdim', False)
+
+        if 'dim' in keys:
+            if x.ndim == 0:
+                # NumPy reductions don't accept dim=0 for scalar inputs
+                for i in dim if isinstance(dim, tuple) else (dim,):
+                    assert i in {0, -1}
+                kwargs['axis'] = None
+            else:
+                kwargs['axis'] = tuple(dim) if isinstance(dim, Sequence) else dim
+
+        if 'keepdim' in keys and supports_keepdims:
+            kwargs['keepdims'] = keepdim
+
+        result = f(x, *args, **kwargs)
+
+        # Unsqueeze reduced dimensions if NumPy does not support keepdims
+        if keepdim and not supports_keepdims and x.ndim > 0:
+            dim = list(range(x.ndim)) if dim is None else dim
+            result = np.expand_dims(result, dim)
+
+        return result
+
+    return wrapper
+
+
 # Operator database (sorted alphabetically)
 op_db: List[OpInfo] = [
     UnaryUfuncInfo('abs',
@@ -7039,15 +7058,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            supports_out=False,
            supports_forward_ad=True,
            sample_inputs_func=sample_inputs_max_min_reduction_no_dim,),
-    # TODO(@heitorschueroff) Add test for dtype kwarg
-    OpInfo('mean',
-           dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
-           assert_autodiffed=True,
-           supports_forward_ad=True,
-           sample_inputs_func=sample_inputs_reduction,
-           # Need to skip out test because one of the overload for mean does not support it
-           # TODO(@heitorschueroff) fix this when implementing ReductionInfo
-           skips=(SkipInfo('TestCommon', 'test_out'),)),
     OpInfo('quantile',
            dtypes=floating_types(),
            sample_inputs_func=sample_inputs_reduction_quantile),
@@ -8890,6 +8900,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
         supports_autograd=False,
         result_dtype=torch.bool,
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        ref=reference_reduction_numpy(np.all),
         skips=(
             # FIXME: does not support passing keepdim without dim
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -8897,7 +8908,8 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
             SkipInfo('TestReductions', 'test_dim_none'),
             SkipInfo('TestReductions', 'test_dim_none_keepdim'),
             # FIXME: uint8 input returns uint8 instead of bool
-            SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]),
+            SkipInfo('TestReductions', 'test_result_dtype',
+                     dtypes=[torch.uint8]),
         ),
     ),
     ReductionOpInfo(
@@ -8908,6 +8920,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
         supports_autograd=False,
         result_dtype=torch.bool,
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        ref=reference_reduction_numpy(np.any),
         skips=(
             # FIXME: does not support passing keepdim without dim
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -8915,14 +8928,15 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
             SkipInfo('TestReductions', 'test_dim_none'),
             SkipInfo('TestReductions', 'test_dim_none_keepdim'),
             # FIXME: uint8 input returns uint8 instead of bool
-            SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]),
+            SkipInfo('TestReductions', 'test_result_dtype',
+                     dtypes=[torch.uint8]),
         ),
     ),
     ReductionOpInfo(
         'amax',
         nan_policy='propagate',
         dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
-        ref=lambda a, dim=None, keepdim=False, **kwargs: np.amax(a, axis=dim, keepdims=keepdim, **kwargs),
+        ref=reference_reduction_numpy(np.amax),
         skips=(
             # FIXME: sum reduces all dimensions when dim=[]
             SkipInfo('TestReductions', 'test_dim_empty'),
@@ -8933,7 +8947,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
         'amin',
         nan_policy='propagate',
         dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
-        ref=lambda a, dim=None, keepdim=False, **kwargs: np.amin(a, axis=dim, keepdims=keepdim, **kwargs),
+        ref=reference_reduction_numpy(np.amin),
         skips=(
             # FIXME: sum reduces all dimensions when dim=[]
             SkipInfo('TestReductions', 'test_dim_empty'),
@@ -8946,6 +8960,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
         supports_autograd=False,
         result_dtype=torch.int64,
         dtypes=all_types_and(torch.float16, torch.bfloat16),
+        ref=reference_reduction_numpy(np.argmax, supports_keepdims=False),
         skips=(
             # FIXME: keepdim parameter is ignored when dim=None
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -8958,6 +8973,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
         supports_autograd=False,
         result_dtype=torch.int64,
         dtypes=all_types_and(torch.float16, torch.bfloat16),
+        ref=reference_reduction_numpy(np.argmin, supports_keepdims=False),
         skips=(
             # FIXME: keepdim parameter is ignored when dim=None
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -8972,6 +8988,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
         result_dtype=torch.int64,
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
         sample_inputs_func=sample_inputs_reduction_count_nonzero,
+        ref=reference_reduction_numpy(np.count_nonzero),
         skips=(
             # FIXME: count_nonzero does not accept keepdim kwarg
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -8985,6 +9002,35 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
             SkipInfo('TestReductions', 'test_dim_empty'),
         ),
     ),
+    ReductionOpInfo(
+        'mean',
+        nan_policy='propagate',
+        supports_out=False,
+        supports_forward_ad=True,
+        assert_autodiffed=True,
+        promotes_int_to_float=True,
+        dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
+        ref=reference_reduction_numpy(np.mean),
+        decorators=(
+            # FIXME: fix precision
+            DecorateInfo(toleranceOverride({
+                torch.float16: tol(atol=1e-05, rtol=1e-02),
+            }), 'TestReductions', 'test_noncontiguous_all'),
+            DecorateInfo(toleranceOverride({
+                torch.float16: tol(atol=1e-05, rtol=1e-02),
+            }), 'TestReductions', 'test_ref_small_input'),
+        ),
+        skips=(
+            # FIXME: prod does not support passing keepdim without passing dim
+            SkipInfo('TestReductions', 'test_dim_default_keepdim'),
+            # FIXME: prod reduces all dimensions when dim=[]
+            SkipInfo('TestReductions', 'test_dim_empty'),
+            SkipInfo('TestReductions', 'test_dim_empty_keepdim'),
+            # FIXME: prod does not support passing None to dim
+            SkipInfo('TestReductions', 'test_dim_none'),
+            SkipInfo('TestReductions', 'test_dim_none_keepdim'),
+        ),
+    ),
     ReductionOpInfo(
         'prod',
         identity=1,
@@ -8995,7 +9041,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
         gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
         dtypes=all_types_and_complex_and(torch.bool),
         dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
-        sample_inputs_func=sample_inputs_prod,
+        ref=reference_reduction_numpy(np.prod),
         skips=(
             # FIXME: prod does not support passing keepdim without passing dim
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -9005,6 +9051,11 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
             # FIXME: prod does not support passing None to dim
             SkipInfo('TestReductions', 'test_dim_none'),
             SkipInfo('TestReductions', 'test_dim_none_keepdim'),
+            # FIXME: improve precision, failing with nan != inf
+            SkipInfo('TestReductions', 'test_ref_small_input',
+                     dtypes=[torch.float16, torch.complex64]),
+            SkipInfo('TestReductions', 'test_ref_duplicate_values',
+                     dtypes=[torch.uint8, torch.float16, torch.complex64]),
         ),
     ),
     ReductionOpInfo(
@@ -9015,6 +9066,22 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
         supports_forward_ad=True,
         promotes_int_to_int64=True,
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        ref=reference_reduction_numpy(np.sum),
+        decorators=(
+            # FIXME: fix precision
+            DecorateInfo(toleranceOverride({
+                torch.float16: tol(atol=1e-05, rtol=1e-02),
+            }), 'TestReductions', 'test_noncontiguous_all'),
+            DecorateInfo(toleranceOverride({
+                torch.float16: tol(atol=1e-03, rtol=1e-02),
+            }), 'TestReductions', 'test_ref_small_input'),
+            DecorateInfo(toleranceOverride({
+                torch.float32: tol(atol=1e-03, rtol=1e-03),
+            }), 'TestReductions', 'test_ref_large_input_64bit_indexing'),
+            DecorateInfo(toleranceOverride({
+                torch.float16: tol(atol=1e-05, rtol=1e-02),
+            }), 'TestReductions', 'test_ref_duplicate_values'),
+        ),
         skips=(
             # FIXME: sum does not support passing keepdim without passing dim
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -9033,6 +9100,22 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
         supports_out=False,
         promotes_int_to_int64=True,
         dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16),
+        ref=reference_reduction_numpy(np.nansum),
+        decorators=(
+            # FIXME: fix precision
+            DecorateInfo(toleranceOverride({
+                torch.float16: tol(atol=1e-05, rtol=1e-02),
+            }), 'TestReductions', 'test_noncontiguous_all'),
+            DecorateInfo(toleranceOverride({
+                torch.float16: tol(atol=1e-03, rtol=1e-02),
+            }), 'TestReductions', 'test_ref_small_input'),
+            DecorateInfo(toleranceOverride({
+                torch.float32: tol(atol=1e-03, rtol=1e-03),
+            }), 'TestReductions', 'test_ref_large_input_64bit_indexing'),
+            DecorateInfo(toleranceOverride({
+                torch.float16: tol(atol=1e-05, rtol=1e-02),
+            }), 'TestReductions', 'test_ref_duplicate_values'),
+        ),
         skips=(
             # FIXME: nansum does not support passing keepdim without passing dim
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),

From 538c30a713a1ee2a3d654c3e1cdf9cc20b7d8c72 Mon Sep 17 00:00:00 2001
From: Pierluigi Taddei <pierluigi.taddei@fb.com>
Date: Fri, 27 Aug 2021 10:36:08 -0700
Subject: [PATCH 298/530] [caffe2] fixes to allow stricter compilation flag
 (#64016)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64016

In order to increase the strictness of the compilation for some target depending on caffe2 we need to fix some errors uncovered when rising such flags.

This change introduces the required override tokens for virtual destructors

Test Plan: CI. Moreover targets depending on caffe2  using clang strict warnings now compile

Reviewed By: kalman5

Differential Revision: D30541714

fbshipit-source-id: 564af31b4a9df3536d7d6f43ad29e1d0c7040551
---
 aten/src/ATen/CPUGeneratorImpl.h      | 2 +-
 aten/src/ATen/core/builtin_function.h | 2 +-
 c10/core/StorageImpl.h                | 2 +-
 caffe2/serialize/istream_adapter.h    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/CPUGeneratorImpl.h b/aten/src/ATen/CPUGeneratorImpl.h
index f8b43a04c73c0..69dbb8b8de973 100644
--- a/aten/src/ATen/CPUGeneratorImpl.h
+++ b/aten/src/ATen/CPUGeneratorImpl.h
@@ -10,7 +10,7 @@ namespace at {
 struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
   // Constructors
   CPUGeneratorImpl(uint64_t seed_in = default_rng_seed_val);
-  ~CPUGeneratorImpl() = default;
+  ~CPUGeneratorImpl() override = default;
 
   // CPUGeneratorImpl methods
   std::shared_ptr<CPUGeneratorImpl> clone() const;
diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h
index de30f9b7e179f..600c16bb6e5d4 100644
--- a/aten/src/ATen/core/builtin_function.h
+++ b/aten/src/ATen/core/builtin_function.h
@@ -123,7 +123,7 @@ struct BuiltinOpFunction : public Function {
     return *this;
   }
 
-  ~BuiltinOpFunction() {}
+  ~BuiltinOpFunction() override {}
 
  private:
   c10::QualifiedName name_;
diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h
index ff29b68dc4dad..bea717d7ee50f 100644
--- a/c10/core/StorageImpl.h
+++ b/c10/core/StorageImpl.h
@@ -68,7 +68,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
   StorageImpl() = delete;
   StorageImpl(StorageImpl&& other) = default;
   StorageImpl(const StorageImpl&) = delete;
-  ~StorageImpl() = default;
+  ~StorageImpl() override = default;
 
   void reset() {
     data_ptr_.clear();
diff --git a/caffe2/serialize/istream_adapter.h b/caffe2/serialize/istream_adapter.h
index 8960d5535c885..680c288a15f2e 100644
--- a/caffe2/serialize/istream_adapter.h
+++ b/caffe2/serialize/istream_adapter.h
@@ -16,7 +16,7 @@ class TORCH_API IStreamAdapter final : public ReadAdapterInterface {
   size_t size() const override;
   size_t read(uint64_t pos, void* buf, size_t n, const char* what = "")
       const override;
-  ~IStreamAdapter();
+  ~IStreamAdapter() override;
 
  private:
   std::istream* istream_;

From 6ab3a210983b7eee417e7cd92a8ad2677065e470 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Fri, 27 Aug 2021 10:42:24 -0700
Subject: [PATCH 299/530] fix resize bug (#61166)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
I think the original intention here is to only take effect in the case of align_corners (because output_size = 1 and the divisor will be 0), but it affects non-align_corners too. For example:

```python
input = torch.tensor(
        np.arange(1, 5, dtype=np.int32).reshape((1, 1, 2, 2)) )
m = torch.nn.Upsample(scale_factor=0.5, mode="bilinear")
of_out = m(input)
```

The result we expect should be [[[[2.5]]]]

but pytorch get [[[[1.0]]]] which is different from OpenCV  and PIL, this pr try to fixed it。

Pull Request resolved: https://github.com/pytorch/pytorch/pull/61166

Reviewed By: malfet

Differential Revision: D30543178

Pulled By: heitorschueroff

fbshipit-source-id: 21a4035483981986b0ae4a401ef0efbc565ccaf1
---
 aten/src/ATen/native/UpSample.h        | 16 ++++++++++------
 aten/src/ATen/native/cuda/UpSample.cuh | 15 ++++++++++-----
 test/test_nn.py                        |  7 +++++++
 3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h
index e50b053949d37..602abcebbe3a0 100644
--- a/aten/src/ATen/native/UpSample.h
+++ b/aten/src/ATen/native/UpSample.h
@@ -251,12 +251,16 @@ static inline scalar_t area_pixel_compute_scale(
     bool align_corners,
     const c10::optional<double> scale) {
   // see Note [area_pixel_compute_scale]
-  if (output_size > 1) {
-    return align_corners
-        ? static_cast<scalar_t>(input_size - 1) / (output_size - 1)
-        : compute_scales_value<scalar_t>(scale, input_size, output_size);
-  } else {
-    return scalar_t(0);
+  if(align_corners){
+    if(output_size > 1) {
+      return static_cast<scalar_t>(input_size - 1) / (output_size - 1);
+    }
+    else {
+      return static_cast<scalar_t>(0);
+    }
+  }
+  else{
+    return compute_scales_value<scalar_t>(scale, input_size, output_size);
   }
 }
 
diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh
index 71443e19755d5..c69a2597b74bb 100644
--- a/aten/src/ATen/native/cuda/UpSample.cuh
+++ b/aten/src/ATen/native/cuda/UpSample.cuh
@@ -94,11 +94,16 @@ __host__ __forceinline__ static accscalar_t area_pixel_compute_scale(
     int output_size,
     bool align_corners,
     const c10::optional<double> scale) {
-  if (output_size > 1) {
-    return align_corners ? (accscalar_t)(input_size - 1) / (output_size - 1)
-                         :  compute_scales_value<accscalar_t>(scale, input_size, output_size);
-  } else {
-    return static_cast<accscalar_t>(0);
+  if(align_corners) {
+    if(output_size > 1) {
+      return (accscalar_t)(input_size - 1) / (output_size - 1);
+    }
+    else {
+      return static_cast<accscalar_t>(0);
+    }
+  }
+  else{
+    return compute_scales_value<accscalar_t>(scale, input_size, output_size);
   }
 }
 
diff --git a/test/test_nn.py b/test/test_nn.py
index c6fe0b28b30ac..4e01c94d4c971 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -10475,6 +10475,13 @@ def test_upsamplingTrilinear3d_spatial_invariance(self):
             out_t_5 = m(in_t_9[:, :, :5, :5, :5])
         self.assertEqual(out_t_9[:, :, :15, :15, :15], out_t_5)
 
+    def test_upsampling_small_scale(self):
+        m = torch.nn.Upsample(scale_factor=0.5, mode="bilinear")
+        in_t = torch.arange(1, 5, dtype=torch.float64).reshape(1, 1, 2, 2)
+        out_t = m(in_t)
+        expected_out_t = torch.tensor([[[[2.5]]]])
+        self.assertEqual(expected_out_t, out_t)
+
     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     def test_interpolate_illegal_memory_access(self):
         in_s = 45

From 9f1f22b9bc04318789b9e79c237f93eecbdc1bfd Mon Sep 17 00:00:00 2001
From: Don Jang <djang@fb.com>
Date: Fri, 27 Aug 2021 10:42:50 -0700
Subject: [PATCH 300/530] [Static Runtime] Add out variant of
 quantized::embedding_bag_byte_prepack (#64081)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64081

This change add an out variant of `quantized::embedding_bag_byte_prepack`.

Test Plan:
- Added `ShapeInferenceTest.QEmbeddingBagByteUnpack`.

- Observed

```
V0824 13:38:49.723708 1322143 impl.cpp:1394] Switch to out variant for node: %2 : Tensor = quantized::embedding_bag_byte_prepack(%input)
```

Reviewed By: hlu1

Differential Revision: D30504216

fbshipit-source-id: 1d9d428e77a15bcc7da373d65e7ffabaf9c6caf2
---
 .../quantized/cpu/qembeddingbag_prepack.cpp   | 36 ++++++++++++-------
 .../quantized/cpu/qembeddingbag_prepack.h     | 11 ++++++
 benchmarks/static_runtime/test_scripts.h      |  8 +++++
 .../static_runtime/test_static_runtime.cc     |  8 +++++
 torch/csrc/jit/runtime/static/ops.cpp         | 23 ++++++++++++
 5 files changed, 73 insertions(+), 13 deletions(-)
 create mode 100644 aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h

diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
index 5d9abce940f58..614e274b5493d 100644
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
@@ -1,3 +1,5 @@
+#include <ATen/native/quantized/cpu/qembeddingbag_prepack.h>
+
 #include <c10/core/ScalarType.h>
 #include <ATen/ATen.h>
 #include <ATen/Parallel.h>
@@ -122,7 +124,6 @@ c10::intrusive_ptr<EmbeddingPackedParamsBase> PackedEmbeddingBagWeight::prepack(
 
 namespace at {
 namespace native {
-namespace {
 
 // Note - This is a temporary pack function for embedding bag which quantizes
 // and packs the float weight tensor. In the next step it will be replaced by a
@@ -184,7 +185,7 @@ namespace {
 //
 //        [[50.        , 60.00000035],
 //         [70.        , 80.00000035]]])
-Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
+Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
   // The "last" dimension of an N-Dimensioned batch of embedding bags is
   // quantization channel. E.g. for a 2D embedding bag, this has
   // [ row, col ] dimensions, for batched of embedding bags, dimensions might be
@@ -208,17 +209,12 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
   const int32_t embedding_cols = weight_sizes[cols_dim];
   // Add 8 bytes per column to store FP32 scale and zero_point per row.
   const int32_t output_columns = embedding_cols + 2 * sizeof(float);
-  Tensor weight_contig = weight.contiguous(weight.suggest_memory_format());
+  const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format());
 
   // Adjust output dimensions to account for FP32 scale and zero_points.
   std::vector<int64_t> output_shape = weight_sizes.vec();
   output_shape[cols_dim] = output_columns;
-
-  // Allocate output packed weights
-  auto output = at::empty(
-      output_shape,
-      weight_contig.options().dtype(at::kByte),
-      weight_contig.suggest_memory_format());
+  at::native::resize_(output, output_shape, c10::nullopt);
   auto* output_data = output.data_ptr<uint8_t>();
 
 #ifdef USE_FBGEMM
@@ -246,10 +242,9 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
   }
 
 #else
-  const auto float_weight = weight_contig.scalar_type() == at::ScalarType::Half
-    ? weight_contig.to(at::ScalarType::Float)
-    : weight_contig;
-  const auto weight_data = float_weight.data_ptr<float>();
+  const auto weight_data = weight_contig->scalar_type() == at::ScalarType::Half
+    ? weight_contig->to(at::ScalarType::Float).data_ptr<float>()
+    : weight_contig->data_ptr<float>();
   constexpr float kEpsilon = 1e-8f;
   for (auto row: c10::irange(embedding_rows)) {
     const float* input_row = weight_data + row * embedding_cols;
@@ -276,6 +271,21 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
   return output;
 }
 
+Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
+  const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format());
+  auto output = at::detail::empty_cpu(
+      {0},
+      at::kByte,
+      weight_contig->layout(),
+      weight_contig->device(),
+      c10::nullopt,
+      c10::nullopt);
+  qembeddingbag_byte_prepack_out(output, weight);
+  return output;
+}
+
+namespace {
+
 // TODO: Extend support to N-D batched embeddings, similar to qembeddingbag_byte_prepack
 Tensor _qembeddingbag_nbit_prepack_helper(
     const Tensor& weight,
diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h
new file mode 100644
index 0000000000000..c52cbae4f2c80
--- /dev/null
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h
@@ -0,0 +1,11 @@
+#include <ATen/ATen.h>
+
+namespace at {
+namespace native {
+
+Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight);
+
+Tensor qembeddingbag_byte_prepack(const Tensor& weight);
+
+} // namespace native
+} // namespace at
diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 477b191b24156..bcc975b79cf25 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -772,3 +772,11 @@ const auto fmod_scalar = R"JIT(
   def forward(self, a: Tensor, b: int):
       return torch.fmod(a, b).clone()
 )JIT";
+
+const std::string embedding_bag_byte_prepack_script = R"IR(
+  graph(%input: Tensor):
+      %none : None = prim::Constant()
+      %output: Tensor = quantized::embedding_bag_byte_prepack(%input)
+      %res: Tensor = aten::clone(%output, %none)
+      return (%res)
+)IR";
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index bd213c78dac1a..1e987a9fab58e 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1257,3 +1257,11 @@ TEST(StaticRuntime, IndividualOps_FmodScalar) {
   std::vector<IValue> args3{c, 4};
   testStaticRuntime(fmod_scalar, args2, args3);
 }
+
+TEST(StaticRuntime, QEmbeddingBagByteUnpack) {
+  auto a = torch::randn({8, 16}, at::ScalarType::Float);
+  auto b = torch::randn({8*2, 16*2}, at::ScalarType::Float);
+
+  testStaticRuntime(embedding_bag_byte_prepack_script, {a});
+  testStaticRuntime(embedding_bag_byte_prepack_script, {a},{b});
+}
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 36f796fb2f256..f171d2889f551 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -14,6 +14,7 @@
 #include <ATen/native/layer_norm.h>
 #include <ATen/native/quantized/cpu/fbgemm_utils.h>
 #include <ATen/native/quantized/cpu/qembeddingbag.h>
+#include <ATen/native/quantized/cpu/qembeddingbag_prepack.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/runtime/static/impl.h>
@@ -761,6 +762,7 @@ REGISTER_OPERATOR_FUNCTOR(
             include_last_offset);
       };
     });
+
 REGISTER_OPERATOR_FUNCTOR(
     quantized::embedding_bag_4bit_rowwise_offsets,
     embedding_bag_4bit_rowwise_offsets,
@@ -799,6 +801,27 @@ REGISTER_OPERATOR_FUNCTOR(
       };
     });
 
+REGISTER_OPERATOR_FUNCTOR(
+    quantized::embedding_bag_byte_prepack,
+    embedding_bag_byte_prepack,
+    [](Node* n) -> SROperator {
+      if (!n->matches(torch::schema(
+              "quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor"))) {
+        LogAndDumpSchema(n);
+        return nullptr;
+      }
+      return [](ProcessedNode* p_node) {
+        const auto& weight = p_node->Input(0).toTensor();
+        if (p_node->Output(0).isNone()) {
+          p_node->Output(0) = at::native::qembeddingbag_byte_prepack(weight);
+          return;
+        }
+        auto& out_t = p_node->Output(0).toTensor();
+        fastResizeToZero(out_t);
+        at::native::qembeddingbag_byte_prepack_out(out_t, weight);
+      };
+    });
+
 // The out variant takes precedence over native
 REGISTER_OPERATOR_FUNCTOR(aten::narrow_copy, aten_narrow_copy, [](Node* n) -> SROperator {
   if (!n->matches(torch::schema(

From 19c1b45f25af32fc6f6d6da315f0055ab7e30222 Mon Sep 17 00:00:00 2001
From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" <tmanlaibaatar@fb.com>
Date: Fri, 27 Aug 2021 11:18:52 -0700
Subject: [PATCH 301/530] Detect out argument in the schema (#62755)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62755

After this change, out argument can be checked by calling is_out()

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D30415256

Pulled By: tugsbayasgalan

fbshipit-source-id: b2e1fa46bab7c813aaede1f44149081ef2df566d
---
 aten/src/ATen/core/function_schema.h     | 10 ++++++++
 aten/src/ATen/core/function_schema_inl.h | 32 ++++++++++++++++++++----
 test/cpp/jit/test_misc.cpp               | 22 ++++++++++++++++
 test/test_function_schema.py             | 21 ++++++++++++++++
 4 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h
index 68e177a225d76..a7b514990185b 100644
--- a/aten/src/ATen/core/function_schema.h
+++ b/aten/src/ATen/core/function_schema.h
@@ -34,6 +34,9 @@ struct Argument {
         default_value_(std::move(default_value)),
         kwarg_only_(kwarg_only),
         alias_info_(std::move(alias_info)) {
+    // this is an softly-enforced invariant for out arguments.
+    bool is_alias = alias_info_.has_value() && alias_info_.value().isWrite();
+    is_out_ = kwarg_only_ && is_alias;
   }
   const std::string& name() const {
     return name_;
@@ -50,6 +53,11 @@ struct Argument {
   bool kwarg_only() const {
     return kwarg_only_;
   }
+
+  bool is_out() const {
+    return is_out_;
+  }
+
   const c10::optional<AliasInfo>& alias_info() const {
     return alias_info_;
   }
@@ -116,6 +124,8 @@ struct Argument {
   // is this only specifiable as a keyword argument?
   bool kwarg_only_;
   c10::optional<AliasInfo> alias_info_;
+  // marks if the argument is out variant of the schema
+  bool is_out_;
 };
 
 inline bool operator==(const Argument& lhs, const Argument& rhs) {
diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h
index 168ecb4f3dc17..6e26e8c14cdab 100644
--- a/aten/src/ATen/core/function_schema_inl.h
+++ b/aten/src/ATen/core/function_schema_inl.h
@@ -51,6 +51,16 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
   return out;
 }
 
+inline size_t findFirstOutArg(const std::vector<Argument>& args) {
+  // find the start of out args in the schema
+  for (size_t out_start_idx = 0; out_start_idx < args.size(); out_start_idx++) {
+    if (args.at(out_start_idx).is_out()) {
+      return out_start_idx;
+    }
+  }
+  return args.size();
+}
+
 inline bool Argument::isBackwardCompatibleWith(
       const Argument& old,
       std::ostream* why_not) const {
@@ -121,17 +131,20 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
     }
   }
 
-  // Make sure that all the old arguments have their corresponding backward
-  // compatible arguments in this schema.
-  for (size_t i = 0; i < old.arguments().size(); ++i) {
+  // we want to test both out and default args seperately
+  size_t old_out_start_idx = findFirstOutArg(old.arguments());
+  size_t new_out_start_idx = findFirstOutArg(arguments());
+
+  // make sure among the default args, they are backward compatible
+  for (size_t i = 0; i < old_out_start_idx; i++) {
     if (!arguments().at(i).isBackwardCompatibleWith(
           old.arguments().at(i), why_not)) {
       return false;
     }
   }
 
-  // Validate that all new arguments provided a default value.
-  for (size_t i = old.arguments().size(); i < arguments().size(); ++i) {
+  // // Validate that all new arguments provided has a default value
+  for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) {
     if (!arguments().at(i).default_value()) {
       if (why_not) {
         *why_not
@@ -144,6 +157,15 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
     }
   }
 
+  // now compare the out args
+  for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) {
+    if (!arguments()
+             .at(i - old_out_start_idx + new_out_start_idx)
+             .isBackwardCompatibleWith(old.arguments().at(i), why_not)) {
+      return false;
+    }
+  }
+
   return true;
 }
 
diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp
index 82f70fee1dd20..9f8a732f550f4 100644
--- a/test/cpp/jit/test_misc.cpp
+++ b/test/cpp/jit/test_misc.cpp
@@ -520,6 +520,28 @@ TEST(SchemaParserTest, NestedArrays) {
                                               .getElementType()));
 }
 
+TEST(SchemaParserTest, OutVariant) {
+  auto schema_with_out = parseSchema(
+      "at::foo(Tensor self, *, Tensor(a!) f, Tensor(b!) l) -> (Tensor(a!) f, Tensor(b!) l)");
+  ASSERT_TRUE(schema_with_out.arguments().at(1).is_out());
+  ASSERT_TRUE(schema_with_out.arguments().at(2).is_out());
+
+  auto schema_without_out =
+      parseSchema("at::foo(Tensor self, *, int scalar) -> (int)");
+
+  for (const auto& arg : schema_without_out.arguments()) {
+    ASSERT_TRUE(!arg.is_out());
+  }
+
+  auto schema_with_is_write = parseSchema(
+      "aten::ne_.Scalar(Tensor(a!) self, Scalar other) -> (Tensor(a!))");
+
+  for (const auto& arg : schema_with_is_write.arguments()) {
+    ASSERT_TRUE(!arg.is_out());
+  }
+}
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 TEST(SchemaParserTest, NamedReturns) {
   // named returns
   parseSchema("at::what(Tensor! i_will_be_written_to) -> ()");
diff --git a/test/test_function_schema.py b/test/test_function_schema.py
index 0451debebd196..7c7a0f77cb922 100644
--- a/test/test_function_schema.py
+++ b/test/test_function_schema.py
@@ -86,6 +86,27 @@ def test_backward_compatible_arguments(self):
         new_schema = parse_schema('any(Tensor self, *, Tensor b, int[] c) -> Tensor')
         self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
 
+    def test_backward_compatible_with_smart_serialization(self):
+        # cases where out arg is provided
+        old_schema = parse_schema('foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)')
+        new_schema_same_out = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)')
+        new_schema_wrong_default = parse_schema('foo(Tensor self, *, int b=1, int a, Tensor(a!) out) -> Tensor(a!)')
+        new_schema_more_out = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)')
+        new_schema_wrong_pos = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(b!) b, Tensor(a!) out) -> Tensor(a!)')
+        self.assertTrue(new_schema_same_out.is_backward_compatible_with(old_schema))
+        self.assertTrue(new_schema_more_out.is_backward_compatible_with(old_schema))
+        self.assertFalse(new_schema_wrong_default.is_backward_compatible_with(old_schema))
+        self.assertFalse(new_schema_wrong_pos.is_backward_compatible_with(old_schema))
+
+        # cases where out arg is not provided
+        old_schema_without_arg = parse_schema('foo(Tensor self, int a, int b=1) -> int')
+        new_schema_without_arg = parse_schema('foo(Tensor self, int a, int b=1, int c=2) -> int')
+        new_schema_without_arg_multiple_default = parse_schema('foo(Tensor self, int a, int b=1, int c=2, int d=3) -> int')
+        new_schema_without_arg_wrong_pos = parse_schema('foo(Tensor self, int a, int c=2, int b=1) -> int')
+        self.assertTrue(new_schema_without_arg.is_backward_compatible_with(old_schema_without_arg))
+        self.assertTrue(new_schema_without_arg_multiple_default.is_backward_compatible_with(old_schema_without_arg))
+        self.assertFalse(new_schema_without_arg_wrong_pos.is_backward_compatible_with(old_schema_without_arg))
+
     def test_string_optional_parameter_default_value(self):
         schema_a = parse_schema("example::op(str? order=\"NCHW\") -> (Tensor)")
         schema_b = parse_schema(str(schema_a))

From 196fd3ee7ae7d053803e4f03a37ddd00ee3fd433 Mon Sep 17 00:00:00 2001
From: Joel Schlosser <jbschlosser@fb.com>
Date: Fri, 27 Aug 2021 11:28:03 -0700
Subject: [PATCH 302/530] Modules note v2 (#63963)

Summary:
This PR expands the [note on modules](https://pytorch.org/docs/stable/notes/modules.html) with additional info for 1.10.

It adds the following:
* Examples of using hooks
* Examples of using apply()
* Examples for ParameterList / ParameterDict
* register_parameter() / register_buffer() usage
* Discussion of train() / eval() modes
* Distributed training overview / links
* TorchScript overview / links
* Quantization overview / links
* FX overview / links
* Parametrization overview / link to tutorial

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63963

Reviewed By: albanD

Differential Revision: D30606604

Pulled By: jbschlosser

fbshipit-source-id: c1030b19162bcb5fe7364bcdc981a2eb6d6e89b4
---
 docs/source/notes/modules.rst | 316 ++++++++++++++++++++++++++++++++--
 1 file changed, 303 insertions(+), 13 deletions(-)

diff --git a/docs/source/notes/modules.rst b/docs/source/notes/modules.rst
index 4eba02231b1ac..c1d978dc78115 100644
--- a/docs/source/notes/modules.rst
+++ b/docs/source/notes/modules.rst
@@ -117,7 +117,7 @@ multiple modules:
 
 Note that :class:`~torch.nn.Sequential` automatically feeds the output of the first ``MyLinear`` module as input
 into the :class:`~torch.nn.ReLU`, and the output of that as input into the second ``MyLinear`` module. As
-shown, it is limited to in-order chaining of modules.
+shown, it is limited to in-order chaining of modules with a single input and output.
 
 In general, it is recommended to define a custom module for anything beyond the simplest use cases, as this gives
 full flexibility on how submodules are used for a module's computation.
@@ -258,16 +258,32 @@ It's also easy to move all parameters to a different device or change their prec
    dynamic_net(torch.randn(5, device='cuda', dtype=torch.float64))
    : tensor([6.5166], device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
 
-These examples show how elaborate neural networks can be formed through module composition. To allow for
-quick and easy construction of neural networks with minimal boilerplate, PyTorch provides a large library of
-performant modules within the :mod:`torch.nn` namespace that perform computation commonly found within neural
-networks, including pooling, convolutions, loss functions, etc.
+More generally, an arbitrary function can be applied to a module and its submodules recursively by
+using the :func:`~torch.nn.Module.apply` function. For example, to apply custom initialization to parameters
+of a module and its submodules:
+
+.. code-block:: python
+
+   # Define a function to initialize Linear weights.
+   # Note that no_grad() is used here to avoid tracking this computation in the autograd graph.
+   @torch.no_grad()
+   def init_weights(m):
+     if isinstance(m, nn.Linear):
+       nn.init.xavier_normal_(m.weight)
+       m.bias.fill_(0.0)
+
+   # Apply the function recursively on the module and its submodules.
+   dynamic_net.apply(init_weights)
+
+These examples show how elaborate neural networks can be formed through module composition and conveniently
+manipulated. To allow for quick and easy construction of neural networks with minimal boilerplate, PyTorch
+provides a large library of performant modules within the :mod:`torch.nn` namespace that perform common neural
+network operations like pooling, convolutions, loss functions, etc.
 
 In the next section, we give a full example of training a neural network.
 
 For more information, check out:
 
-* Recursively :func:`~torch.nn.Module.apply` a function to a module and its submodules
 * Library of PyTorch-provided modules: `torch.nn <https://pytorch.org/docs/stable/nn.html>`_
 * Defining neural net modules: https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_module.html
 
@@ -295,6 +311,12 @@ Optimizers from :mod:`torch.optim`:
      loss.backward()
      optimizer.step()
 
+   # After training, switch the module to eval mode to do inference, compute performance metrics, etc.
+   # (see discussion below for a description of training and evaluation modes)
+   ...
+   net.eval()
+   ...
+
 In this simplified example, the network learns to simply output zero, as any non-zero output is "penalized" according
 to its absolute value by employing :func:`torch.abs` as a loss function. While this is not a very interesting task, the
 key parts of training are present:
@@ -321,6 +343,38 @@ value of ``l1``\ 's ``weight`` parameter shows that its values are now much clos
            [ 0.0030],
            [-0.0008]], requires_grad=True)
 
+Note that the above process is done entirely while the network module is in "training mode". Modules default to
+training mode and can be switched between training and evaluation modes using :func:`~torch.nn.Module.train` and
+:func:`~torch.nn.Module.eval`. They can behave differently depending on which mode they are in. For example, the
+:class:`~torch.nn.BatchNorm` module maintains a running mean and variance during training that are not updated
+when the module is in evaluation mode. In general, modules should be in training mode during training
+and only switched to evaluation mode for inference or evaluation. Below is an example of a custom module
+that behaves differently between the two modes:
+
+.. code-block:: python
+
+   class ModalModule(nn.Module):
+     def __init__(self):
+       super().__init__()
+
+     def forward(self, x):
+       if self.training:
+         # Add a constant only in training mode.
+         return x + 1.
+       else:
+         return x
+
+
+   m = ModalModule()
+   x = torch.randn(4)
+
+   print('training mode output: {}'.format(m(x)))
+   : tensor([1.6614, 1.2669, 1.0617, 1.6213, 0.5481])
+
+   m.eval()
+   print('evaluation mode output: {}'.format(m(x)))
+   : tensor([ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519])
+
 Training neural networks can often be tricky. For more information, check out:
 
 * Using Optimizers: https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_optim.html.
@@ -409,12 +463,127 @@ Both persistent and non-persistent buffers are affected by model-wide device / d
 Buffers of a module can be iterated over using :func:`~torch.nn.Module.buffers` or
 :func:`~torch.nn.Module.named_buffers`.
 
+.. code-block:: python
+
+   for buffer in m.named_buffers():
+     print(buffer)
+
+The following class demonstrates the various ways of registering parameters and buffers within a module:
+
+.. code-block:: python
+
+   class StatefulModule(nn.Module):
+     def __init__(self):
+       super().__init__()
+       # Setting a nn.Parameter as an attribute of the module automatically registers the tensor
+       # as a parameter of the module.
+       self.param1 = nn.Parameter(torch.randn(2))
+
+       # Alternative string-based way to register a parameter.
+       self.register_parameter('param2', nn.Parameter(torch.randn(3)))
+
+       # Reserves the "param3" attribute as a parameter, preventing it from being set to anything
+       # except a parameter. "None" entries like this will not be present in the module's state_dict.
+       self.register_parameter('param3', None)
+
+       # Registers a list of parameters.
+       self.param_list = nn.ParameterList([nn.Parameter(torch.randn(2)) for i in range(3)])
+
+       # Registers a dictionary of parameters.
+       self.param_dict = nn.ParameterDict({
+         'foo': nn.Parameter(torch.randn(3)),
+         'bar': nn.Parameter(torch.randn(4))
+       })
+
+       # Registers a persistent buffer (one that appears in the module's state_dict).
+       self.register_buffer('buffer1', torch.randn(4), persistent=True)
+
+       # Registers a non-persistent buffer (one that does not appear in the module's state_dict).
+       self.register_buffer('buffer2', torch.randn(5), persistent=False)
+
+       # Reserves the "buffer3" attribute as a buffer, preventing it from being set to anything
+       # except a buffer. "None" entries like this will not be present in the module's state_dict.
+       self.register_buffer('buffer3', None)
+
+       # Adding a submodule registers its parameters as parameters of the module.
+       self.linear = nn.Linear(2, 3)
+
+   m = StatefulModule()
+
+   # Save and load state_dict.
+   torch.save(m.state_dict(), 'state.pt')
+   m_loaded = StatefulModule()
+   m_loaded.load_state_dict(torch.load('state.pt'))
+
+   # Note that non-persistent buffer "buffer2" and reserved attributes "param3" and "buffer3" do
+   # not appear in the state_dict.
+   print(m_loaded.state_dict())
+   : OrderedDict([('param1', tensor([-0.0322,  0.9066])),
+                  ('param2', tensor([-0.4472,  0.1409,  0.4852])),
+                  ('buffer1', tensor([ 0.6949, -0.1944,  1.2911, -2.1044])),
+                  ('param_list.0', tensor([ 0.4202, -0.1953])),
+                  ('param_list.1', tensor([ 1.5299, -0.8747])),
+                  ('param_list.2', tensor([-1.6289,  1.4898])),
+                  ('param_dict.bar', tensor([-0.6434,  1.5187,  0.0346, -0.4077])),
+                  ('param_dict.foo', tensor([-0.0845, -1.4324,  0.7022])),
+                  ('linear.weight', tensor([[-0.3915, -0.6176],
+                                            [ 0.6062, -0.5992],
+                                            [ 0.4452, -0.2843]])),
+                  ('linear.bias', tensor([-0.3710, -0.0795, -0.3947]))])
+
 For more information, check out:
 
 * Saving and loading: https://pytorch.org/tutorials/beginner/saving_loading_models.html
 * Serialization semantics: https://pytorch.org/docs/master/notes/serialization.html
 * What is a state dict? https://pytorch.org/tutorials/recipes/recipes/what_is_state_dict.html
 
+Module Initialization
+---------------------
+
+By default, parameters and floating-point buffers for modules provided by :mod:`torch.nn` are initialized during
+module instantiation as 32-bit floating point values on the CPU using an initialization scheme determined to
+perform well historically for the module type. For certain use cases, it may be desired to initialize with a different
+dtype, device (e.g. GPU), or initialization technique.
+
+Examples:
+
+.. code-block:: python
+
+   # Initialize module directly onto GPU.
+   m = nn.Linear(5, 3, device='cuda')
+
+   # Initialize module with 16-bit floating point parameters.
+   m = nn.Linear(5, 3, dtype=torch.half)
+
+   # Skip default parameter initialization and perform custom (e.g. orthogonal) initialization.
+   m = torch.nn.utils.skip_init(nn.Linear, 5, 3)
+   nn.init.orthogonal_(m.weight)
+
+Note that the device and dtype options demonstrated above also apply to any floating-point buffers registered
+for the module:
+
+.. code-block:: python
+
+   m = nn.BatchNorm2d(3, dtype=torch.half)
+   print(m.running_mean)
+   : tensor([0., 0., 0.], dtype=torch.float16)
+
+While module writers can use any device or dtype to initialize parameters in their custom modules, good practice is
+to use ``dtype=torch.float`` and ``device='cpu'`` by default as well. Optionally, you can provide full flexibility
+in these areas for your custom module by conforming to the convention demonstrated above that all
+:mod:`torch.nn` modules follow:
+
+* Provide a ``device`` constructor kwarg that applies to any parameters / buffers registered by the module.
+* Provide a ``dtype`` constructor kwarg that applies to any parameters / floating-point buffers registered by
+  the module.
+* Only use initialization functions (i.e. functions from :mod:`torch.nn.init`) on parameters and buffers within the
+  module's constructor. Note that this is only required to use :func:`~torch.nn.utils.skip_init`; see
+  `this page <https://pytorch.org/tutorials/prototype/skip_param_init.html#updating-modules-to-support-skipping-initialization>`_ for an explanation.
+
+For more information, check out:
+
+* Skipping module parameter initialization: https://pytorch.org/tutorials/prototype/skip_param_init.html
+
 Module Hooks
 ------------
 
@@ -443,16 +612,137 @@ All hooks allow the user to return an updated value that will be used throughout
 Thus, these hooks can be used to either execute arbitrary code along the regular module forward/backward or
 modify some inputs/outputs without having to change the module's ``forward()`` function.
 
+Below is an example demonstrating usage of forward and backward hooks:
+
+.. code-block:: python
+
+   torch.manual_seed(1)
+
+   def forward_pre_hook(m, inputs):
+     # Allows for examination and modification of the input before the forward pass.
+     # Note that inputs are always wrapped in a tuple.
+     input = inputs[0]
+     return input + 1.
+
+   def forward_hook(m, inputs, output):
+     # Allows for examination of inputs / outputs and modification of the outputs
+     # after the forward pass. Note that inputs are always wrapped in a tuple while outputs
+     # are passed as-is.
+
+     # Residual computation a la ResNet.
+     return output + inputs[0]
+
+   def backward_hook(m, grad_inputs, grad_outputs):
+     # Allows for examination of grad_inputs / grad_outputs and modification of
+     # grad_inputs used in the rest of the backwards pass. Note that grad_inputs and
+     # grad_outputs are always wrapped in tuples.
+     new_grad_inputs = [torch.ones_like(gi) * 42. for gi in grad_inputs]
+     return new_grad_inputs
+
+   # Create sample module & input.
+   m = nn.Linear(3, 3)
+   x = torch.randn(2, 3, requires_grad=True)
+
+   # ==== Demonstrate forward hooks. ====
+   # Run input through module before and after adding hooks.
+   print('output with no forward hooks: {}'.format(m(x)))
+   : output with no forward hooks: tensor([[-0.5059, -0.8158,  0.2390],
+                                           [-0.0043,  0.4724, -0.1714]], grad_fn=<AddmmBackward>)
+
+   # Note that the modified input results in a different output.
+   forward_pre_hook_handle = m.register_forward_pre_hook(forward_pre_hook)
+   print('output with forward pre hook: {}'.format(m(x)))
+   : output with forward pre hook: tensor([[-0.5752, -0.7421,  0.4942],
+                                           [-0.0736,  0.5461,  0.0838]], grad_fn=<AddmmBackward>)
+
+   # Note the modified output.
+   forward_hook_handle = m.register_forward_hook(forward_hook)
+   print('output with both forward hooks: {}'.format(m(x)))
+   : output with both forward hooks: tensor([[-1.0980,  0.6396,  0.4666],
+                                             [ 0.3634,  0.6538,  1.0256]], grad_fn=<AddBackward0>)
+
+   # Remove hooks; note that the output here matches the output before adding hooks.
+   forward_pre_hook_handle.remove()
+   forward_hook_handle.remove()
+   print('output after removing forward hooks: {}'.format(m(x)))
+   : output after removing forward hooks: tensor([[-0.5059, -0.8158,  0.2390],
+                                                  [-0.0043,  0.4724, -0.1714]], grad_fn=<AddmmBackward>)
+
+   # ==== Demonstrate backward hooks. ====
+   m(x).sum().backward()
+   print('x.grad with no backwards hook: {}'.format(x.grad))
+   : x.grad with no backwards hook: tensor([[ 0.4497, -0.5046,  0.3146],
+                                            [ 0.4497, -0.5046,  0.3146]])
+
+   # Clear gradients before running backward pass again.
+   m.zero_grad()
+   x.grad.zero_()
+
+   m.register_full_backward_hook(backward_hook)
+   m(x).sum().backward()
+   print('x.grad with backwards hook: {}'.format(x.grad))
+   : x.grad with backwards hook: tensor([[42., 42., 42.],
+                                         [42., 42., 42.]])
+
 Advanced Features
 -----------------
 
 PyTorch also provides several more advanced features that are designed to work with modules. All these functionalities
-are "inherited" when writing a new module. In-depth discussion of these features can be found in the links below.
+are available for custom-written modules, with the small caveat that certain features may require modules to conform
+to particular constraints in order to be supported. In-depth discussion of these features and the corresponding
+requirements can be found in the links below.
 
-For more information, check out:
+Distributed Training
+********************
+
+Various methods for distributed training exist within PyTorch, both for scaling up training using multiple GPUs
+as well as training across multiple machines. Check out the
+`distributed training overview page <https://pytorch.org/tutorials/beginner/dist_overview.html>`_ for
+detailed information on how to utilize these.
+
+Profiling Performance
+*********************
+
+The `PyTorch Profiler <https://pytorch.org/tutorials/beginner/profiler.html>`_ can be useful for identifying
+performance bottlenecks within your models. It measures and outputs performance characteristics for
+both memory usage and time spent.
+
+Improving Performance with Quantization
+***************************************
+
+Applying quantization techniques to modules can improve performance and memory usage by utilizing lower
+bitwidths than floating-point precision. Check out the various PyTorch-provided mechanisms for quantization
+`here <https://pytorch.org/docs/stable/quantization.html>`_.
+
+Improving Memory Usage with Pruning
+***********************************
+
+Large deep learning models are often over-parametrized, resulting in high memory usage. To combat this, PyTorch
+provides mechanisms for model pruning, which can help reduce memory usage while maintaining task accuracy. The
+`Pruning tutorial <https://pytorch.org/tutorials/intermediate/pruning_tutorial.html>`_ describes how to utilize
+the pruning techniques PyTorch provides or define custom pruning techniques as necessary.
+
+Deploying with TorchScript
+**************************
+
+When deploying a model for use in production, the overhead of Python can be unacceptable due to its poor
+performance characteristics. For cases like this,
+`TorchScript <https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html>`_ provides a way to load
+and run an optimized model program from outside of Python, such as within a C++ program.
+
+Parametrizations
+****************
+
+For certain applications, it can be beneficial to constrain the parameter space during model training. For example,
+enforcing orthogonality of the learned parameters can improve convergence for RNNs. PyTorch provides a mechanism for
+applying `parametrizations <https://pytorch.org/tutorials/intermediate/parametrizations.html>`_ such as this, and
+further allows for custom constraints to be defined.
+
+Transforming Modules with FX
+****************************
 
-* Profiling: https://pytorch.org/tutorials/beginner/profiler.html
-* Pruning: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html
-* Quantization: https://pytorch.org/tutorials/recipes/quantization.html
-* Exporting modules to TorchScript (e.g. for usage from C++):
-  https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html
+The `FX <https://pytorch.org/docs/stable/fx.html>`_ component of PyTorch provides a flexible way to transform
+modules by operating directly on module computation graphs. This can be used to programmatically generate or
+manipulate modules for a broad array of use cases. To explore FX, check out these examples of using FX for
+`convolution + batch norm fusion <https://pytorch.org/tutorials/intermediate/fx_conv_bn_fuser.html>`_ and
+`CPU performance analysis <https://pytorch.org/tutorials/intermediate/fx_profiling_tutorial.html>`_.

From 101a6263309ae2f9e52947c7d02d630e1190b6c3 Mon Sep 17 00:00:00 2001
From: mrshenli <cs.shenli@gmail.com>
Date: Fri, 27 Aug 2021 11:28:31 -0700
Subject: [PATCH 303/530] Improve `distributed.get_rank()` API docstring
 (#63296)

Summary:
See discussion in https://pytorch.slack.com/archives/CBHSWPNM7/p1628792389008600

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63296

Reviewed By: cbalioglu

Differential Revision: D30332042

Pulled By: mrshenli

fbshipit-source-id: 3a642fda2e106fd35b67709ed2adb60e408854c2
---
 torch/distributed/distributed_c10d.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index fac096e339e71..302114e1c7bb6 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -800,7 +800,8 @@ def destroy_process_group(group=None):
 
 def get_rank(group=None):
     """
-    Returns the rank of current process group
+    Returns the rank of the current process in the provided ``group`` or the
+    default group if none was provided.
 
     Rank is a unique identifier assigned to each process within a distributed
     process group. They are always consecutive integers ranging from 0 to

From 6257f5b168782e026cc19788e171f06a8d962afb Mon Sep 17 00:00:00 2001
From: Karen Zhou <kazhou@fb.com>
Date: Fri, 27 Aug 2021 11:51:09 -0700
Subject: [PATCH 304/530] [pruner] add README to repo (#64099)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64099

adding readme to pruner in OSS
ghstack-source-id: 136867516

Test Plan: should not affect behavior

Reviewed By: z-a-f

Differential Revision: D30608045

fbshipit-source-id: 3e9899a853395b2e91e8a69a5d2ca5f3c2acc646
---
 .../ao/sparsity/experimental/pruner/README.md |  93 ++++++++++++++++++
 .../experimental/pruner/images/prune_1.png    | Bin 0 -> 170728 bytes
 .../experimental/pruner/images/prune_2.png    | Bin 0 -> 100490 bytes
 .../experimental/pruner/images/prune_3.png    | Bin 0 -> 87843 bytes
 .../experimental/pruner/images/prune_4.png    | Bin 0 -> 132748 bytes
 5 files changed, 93 insertions(+)
 create mode 100644 torch/ao/sparsity/experimental/pruner/README.md
 create mode 100644 torch/ao/sparsity/experimental/pruner/images/prune_1.png
 create mode 100644 torch/ao/sparsity/experimental/pruner/images/prune_2.png
 create mode 100644 torch/ao/sparsity/experimental/pruner/images/prune_3.png
 create mode 100644 torch/ao/sparsity/experimental/pruner/images/prune_4.png

diff --git a/torch/ao/sparsity/experimental/pruner/README.md b/torch/ao/sparsity/experimental/pruner/README.md
new file mode 100644
index 0000000000000..da0afb0bf3fb1
--- /dev/null
+++ b/torch/ao/sparsity/experimental/pruner/README.md
@@ -0,0 +1,93 @@
+# Intro
+
+The Base Pruner inherits from the Base Sparsifier.
+
+
+# Motivation
+
+Sparsifying weights allows us to skip some of the multiplications during the dot product (i.e. in the Linear layers), which ultimately translates into faster inference. With structured pruning, whole rows/columns of a tensor would be zeroed-out. This translates into model transformation (not just tensor transformation). Logically, the process of structured pruning is similar to removing some of the input/output channels in the layer completely.
+
+![prune logic](./images/prune_1.png)
+
+
+# Design Choices
+
+
+## Eager Mode
+
+**PruningParametrization:** After pruning, the shape of the weight changes (some of the output channels are pruned). That means the output of the current layer will have less output layers compared to the original. This means that the next layer should have less input channels.
+
+Consider an example below:
+
+![prune example](./images/prune_2.png)
+
+The dot product of the masked matrix A (weight) and matrix B (activation) produces the zeros at the sparse locations. However, if we remove the zeros, as in the example shown earlier, the result will change:
+
+![prune result](./images/prune_3.png)
+
+The resulting matrix is of different shape (2x2 vs. 4x2).
+
+**Forward Hook - ActivationReconstruction **(aka re-inserting zeros):  To reconstruct the activation with the original shape, we will undo the sparsification before pushing that activation to the next layer. We do this with a forward hook -- forward hooks are functions that are called on the activation after the computation is complete.
+
+![prune reconstruction](./images/prune_4.png)
+
+**Forward Hook - Bias**:
+
+If the layer has a bias, it must be added to the activation AFTER zeros have been re-inserted, i.e. after the `ActivationReconstruction` forward hook.
+
+The pruner prunes the entire channel by default (weight & corresponding bias), so indices of the bias corresponding to pruned indices will be zeroed out.
+
+
+
+# Eager Mode APIs & Code Snippets
+
+Supported modules: nn.Linear, nn.Conv2d, nn.BatchNorm2d*
+
+* when provided in `config` with corresponding Conv2d layer
+
+`BasePruner`:  base class with abstract method `update_mask` that computes the new pruner mask for all modules (see Write Your Own Pruner). The base pruner prunes the entire channel by default (weight & corresponding bias); if you don’t want the bias to be pruned, then set `also_prune_bias` to be False.
+
+`prepare`: registers the pruning parametrization (called `PruningParametrization`) to each module layer of the model; also adds forward hooks for bias support and re-inserting zeros to the output so the next layer received the correct size input.
+
+Note: for BatchNorm2d layers, the parametrization `ZeroesParametrization` is attached instead since its weight is 1d, so removing channels would affect the input dimension as well. `ZeroesParametrization` zeroes out channels rather than removing them like `PruningParametrization`. We need this when `also_prune_bias=True`, so BatchNorm2d channels get pruned with their corresponding Conv2d channels.
+
+
+```
+pruner = ImplementedPruner(defaults=None, also_prune_bias=True)
+pruner.prepare(model, config)
+```
+
+
+`step`: applies `update_mask` logic (i.e. prunes the weight matrix)
+
+
+```
+pruner.step()
+```
+
+
+`squash_mask`: applies the parametrization one last time to the weight matrix, and then removes the pruning parametrization from the model
+
+
+```
+pruner.squash_mask()
+```
+
+
+
+# Write Your Own Pruner
+
+To write a custom pruner, one could inherit from the `BasePruner` and implement some of the methods. For example, if implementing a pruner that computes the mask by randomly pruning ⅓ of channels:
+
+
+```
+class ImplementedPruner(BasePruner):
+   def update_mask(self, layer, **kwargs):
+       param = layer.parametrizations.weight[0]  # PruningParametrization
+       all_outputs = param.original_outputs
+       prune = random.sample(all_outputs, len(all_outputs) // 3)
+       param.pruned_outputs.update(prune)
+```
+
+
+It is the responsibility of the base class to call the `self.update_mask` when appropriate.
diff --git a/torch/ao/sparsity/experimental/pruner/images/prune_1.png b/torch/ao/sparsity/experimental/pruner/images/prune_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..f7f48759225729e932c79b7825d4c91cff6fd725
GIT binary patch
literal 170728
zcmeFYg<Di#zc);W#1H~QmvnbCAP6E#ry||m4AR}Hl#~iew;(OuDJb1AC^>Y)y9e)c
z-{*Rt-+BIl=Wxw9+r9VNd#!KBXGN*0DB?Y!cz}e2gs1dWUIPgU%^wK~)f@{0IMZ&l
zYm0=0YhxoPr=}z)N2liMWNBk>frRulDn%DlPqUvi=aq(%br9B5>=x`FCGQAcq9GRr
z$0ec>J&VPb?GCq4Z?BDE)p6IBAFoBKE{ts9jBeo})q3Yq^?{Y$=`l*<9><cz1#Bc&
z!0J|WZ>Yr!X>4mn>2pJ&Fp_t&ufDH5mfdq(ikM0KzCetChrI84)0AgiT*i?GWbVAY
z15oJ(Zj0Y0Z|APvX}hC87NbQnr|Kd{5rcqMnUUZpzD1n0NS`#E#F&hVyPgni_ZtTi
zRA{Vnk}MW=k+aWFJG}*u2A%UGnNuXN45G*!(M;AnIM+{9JQC>QCJNF+3H-$yiF(11
zgJf#4(|LHv%;qB2<vC{ffG-yF+}yV}<3!{(R@R?}$Bv-naEr{fP>*|f<HZ$3`RF?n
zk1pSGCx-C*r=;W+L$tr(IwSe4OkZT@+4>`+w0}{o(p1T4dm*V6KPAk_uU3q^aK$gB
zHGaIqupFi!Yme(v|44-SW!Y%o!w5I}@DlWF_K{8I$M#4Ek3ULs$~x5O3$q#S;*<@)
zbD6fi(a$Mes`xR5`b353)$pCdqxIqozTwB}rVqNbBFi3h_45(fJ>KCQRf^P@i)DT2
zVv1pJ9u2j))*<M{VIrhw_b;W>B&DgS<LQ43T90ucjcVYtZH5@V0~Zy8Jd;Pa6KG+<
zuSuUxE1^n0zDWNaWPsK&#;d(I?e_5zo9yre<1i>nceuOg$Gm8*lq3C6oZ1!_-Dx7(
zqr%PA{tIW9;`4*&HvR^m*j1W&Ge!7SNI<Q)@~HS&=$sz{xlo^@%*f3D(4WG-D^ln{
z2KC915}>fuS{f@nyNg6KB$lB+CuX2e1dZVQp(P1MDZJw^)P9{UV|iySdlyRuf4C}1
zfbO~y@Z^rO|HC(PIGSzO9egAt4zxBL_CoA-{{(f`i9&c%zC<V*If1hbS=kiK_YSOt
z%QCyB0HxbYQ0%nhwRfxs?MWo|iuiPBKIT{<ToL~m=j`(dTD}v(su(2PM_XS0r01-*
z&4I3fg(etN#<MS0z=d%+)JFzlFNyW$H1uV>xTA%(9>n>MD`bB!d^Uddu`HV$Yx~0{
zS}R*|>o@q-tadkkep*$QxW6(5L=W=<M!bHKwMlO&D%;b0B1wc$_l}sv=}{Y-^j5yv
zVepiX!qlC5d;3#pnX46&L)FK48v)6gEvQa*O|wnIPa7?+@?Dwq-RN0+o~-gVsz&~T
zQ`lDmOWy!{$s0U4+&xT!w9%w+a{gv;Cm@1u=xE9PXqg6-czkjq<UjGm4`um-*!-=-
zl6#%*vKfkW!6^48hdDYsb$K(vvgTu)JwZ9+2b@9FL%n!V<NyNPj<%-+I7G7S=}0A<
z!P^EHUjoUE@mVoG8<Xqd{R(<v%rT998hG;oUod!WS+E|7J7jyA+dyh}ndvvmzAR5L
z-J{TWUIMu;+$sesLXj>^Z268v>qj(7PktnE$_KM1lJ`mGKhzH@m8+xE?yKlCSr>K1
zw|U~o2ui|V$2-LCR--76c>h88xAP>XMV`7fgE-CbW0t(I-@fkHFXMGSHI2VK7JY-<
z5?}X`XWS9jm;y{J(3{0e8ynJXYNN-@+-YgTq(`n3_0t#+E^X93TdA_csM19)isKc`
zAxfm$aRF#CX)D+$V!zAo;O)8gHT_~(0Mi(D51L{yZICipu8-k4{X^mbtU2rmtb%1j
zTb5=PqNwMexW*~>y0uJ&UBq4ZTu@zjs-mW!`99xaPI?;tNKpfcEwzhr<88E(ol=uh
z?^E5UyZJQL7&Eau3ivDqNlfbv>->k@hw6tUhXSrVPcltk9OQLrpuMws_xYW}JB$IQ
zXA+;&isfoQW2g}q%}k0-y4vEMJUpQ}DGLt|4-2p8vBs-XNK$C%k&dgO00~L|aJRu1
z#CM>2A^Ym_oUGlag)c3iC;kW@8^58D%54obd;*aTkl7zMw-&c^6+(YsBg7$SCsbkG
z`s0(ezFk!<vo(oriVgKxWB$*g7vpu~FzchQRyMlUw7*)57RoxuTz*FV?4Qsttx?|0
z8gYA_Tps6`UM}O+x~H>1+?YTP?o4hsfs-GB=fDNv-8b@5ucXQi>kNw)X<A5Ixcy%H
zfo{xiiY`em6_4lA!4u*A7BK!a{xbn5UI-o^c?S&xRkiig;Y1EOX)vx3mM{UuJMfNB
zy(7Fo(E@ClZdN~EIRbAbbX+$UoVAV~SIE;;$d)?^zXWZ7xX2p=ep8!GPPdCLLK_cs
z)?#`Vn7=R+GHbl8WoCIU{vzW``<Iz7CC?Myd#3XZp$vCfmJG#bv>5-XnJzWZ5!-%L
zGB`gi`NP1ZwY#~yd8*?VL%(Txw_(gxs+KbC;PXN4)Zx_EI+i+lrIvcyuNJDl=w0c>
zluqg8R^1w#*}XNbY5Ot)YbXoinyMSCn?22;Uf}L?tbYA@LCEJ*OM+kg4aU_Veql&K
zNZ-efm1M@P-y5u+^UKrXvAjcG>E?PHm6<AnEOnYenza*hR$tt{9vN(kZ|ZiIg%{u%
zNna1aE4XvG8@T5T_8PYu<K45jXO}N~hTG%zbyhrwv-fEBq~}@|Z)zj*BF4<}dbw1?
zH(5u~bB1$_nwwk94TTJ;7j+i@EN-_rUH-o4y5Kw&+g66-9c1s${k|F>9pqVZ`o3gz
zt8y!L+jO0qP)SWsRprpNh7^Nj*p?btC!=HhW*OF2M|VVLsAe&iXFO;u3qM`9viI>6
z_7S<#j#Q51TvzMEjHk6?E?|3z6Nklx?SMUjr~Kd**(u#TgCs|@cr!<bfax1~%}4b7
z?5^JHFUqmgM$1+`<Bn;>+}8V9Uur#`Y%Ek9#~-i$9j^YnFey(bzLCs~nlY*<2EA&2
zI;HAt&3=u!_eS`=sJNqh_wRy(@&kd@(MP;Q%*;E_pt<1Wdcgz7<k!gzMPm9K-UhDP
zhY|0_-jT35n1v@~h`}Bgs~3ybO2@rR&8m|s^Rb_v{%JPX>mAQQ<7L*odtS6Q`hxd;
zqQpv{d|6a;6y^`?KWu-Ren5^)y;GX^a2<OtW4WaxzK7pw5OC@-{U*S`(#6WPz%VcG
z^UWbitmQ9}d-#Z**dtnO)nj#PCTS+K8nWI6+v)o=_@aLyQ*lIjRDyFnW&Np&3V*hs
zhepYy`b$oS%pDQ$lXY(WclJ$GNeO9Fo7?RPmoX0{x>chi(1XxP(4{n!3#ihI1+@83
zRs|z$V!lXm;PPBsh)ZWfTD-$AV#Ne#M$AX&`EfE*_)5I%Hg+y*=3*G4Jx4x{^k*Dh
z8KgCEz*;lY_$y3W`>#J;Bm_jC7d7hD=;`>n1>FU|z&Qw84daTLCo6k>Tr02U+B*69
zYw7Wk>Fw$fr0Le+)@><yh>1(NCDqYD!GKxc)69G9Fq^|&=V3Xt*>!SYR=ZT2sbsM%
zr~c9)LviMd(4<hbki*9K#))0oFYZ@Y-r3d*C(((~E!%NZJTLD|LX108;@T<ggfr~f
z7mjUephsm~gs*Hl9M}izh`y29`!#ZT&t(|-`WMYQer<8TN~2Duc92-}$|*f8FP^n%
zwzCm>Sys3=E#-K&A@$K`%Pz~dZpUJeYXS3=?X=}w=w^!YD(_T;7A~gdHg>suDHkip
zRQq)I>ukBW`s-JIwtm|yNfDWp*Od4-u<N;ag`=Km#!JOin$I-Z0blp^R;u=xZ8SFz
zO4kxHS%jCQ_+VYNZ}U{^vZBR3PTkM{5L-=K#b@J*&Ot}^=U0xTH@CT=uM1m#dmoJL
z=X|HNJ+LimS@$ctt=``V|3z4)Sb_Fh%wOnu>yO>M9a(v=fm2J8``WKz7m+zfY)7f@
z**;e%*})PmUNL)wdnxl%twyJ9=dv5LrvW<-N464&yvHluir5Kz(lChsh5YTh+tqc!
zae>9?P>!~gm(Suf;iAh$_cYlq&u1PViI}^|3%47NCSA<C2e;d%Q|aZWQ`J*`PNd(3
ze4Ey?dz)Z;^?R86M~i3vtbh1+W-i7z*D_Sua&t~pE}mZ++;-n8cTN>noLwGXUgk@Z
zSKMhLh2RYjp)C*NBmFKy;$3ct6W8$|-650r@ZVKjhM4^Ht}<#yDh$d#6v&xCYm=U~
zkERo`A5cB-G7fBP0twgl2`nsV83;m&7><umT7-}TCz0C7^o@-6!&_T<=8P-9AzQyM
z6%Sz#!y3iSluP-2I&ko08B;x5_50vKFAXar90_uIhk_N5h~^e8)e7Vu(H442mM>l)
zu>;3gNNC6uNa(;3GVql~ru_G@0x~NS>iu~XB%}x%B(#6sqXPUQ{t|&NV$9!P)Z}m^
zOyGY+z!&-f<-hJm^Z$VQU&pBCz%?XUO*thc;8)Yk)xyHj&DzO5Qa2k8oWOQ|s_%w`
zM9PHtA}eV;0!|`N*=Xsx>%Dj`Zsz2`ZDQ_ZYQYV4a7H`_NfIg!96DIIo6tcW>>b_2
zp;8R@cZdVWh|@d_boaNo+etC#y-=f*b8@wy6XxdS=4Ft6Ku1R>>1u8%t|71Rui?P|
zq!_H--JQjGc)Yy4xV;3pom{PW_{7A-czF4F`1!ejJGk7u9o<c!T#jyxe;@K+&ylxq
zGjp|ZcDHeIq(eN{#MH^dU5bGL@uGkK{hg-;)aE~La&-IGvVaBhAg=K6ar5&0``N%y
zNyJ%kH5;gfy}rDS13)w29nzx0qLTOb{~uTW^Tz)+Qtv+_`S^r*|NGGYcIp2=RNKwM
zRnEx)c&EGcfB5yU!T<f@zXnS3AeR2$Nb#4>_h$h@OFxk0`8U&~9~hKiQUiRXvXNKS
z0)7E0L;N9E1OKr8{RNJZeRGxeSSyi`WRR5PWwoHlJK31&kF<Z@f^w1NnOJ2X>V2xG
zqhnVJO?f1?S&yba{n}M0k%<Mh4@LPUY2u@f;Ob-(By1FR_0)XE2Mi1-!4Gmj-fntb
zFJ0LBl=w7SHCh!nEnP{rOw7IZ*j`*}(SE&I<~EwabY#lTK!@^=A8JKoQfvsd$d|1|
ztjNHB_@E(UL*8Tk<5-5Vk7$ps-G8?gzxwCvbSsK(GHCMu^bvm-7@CVut>>Zq&(|Ma
zTjT%Zd?IBch!WuwA5lvjqyA~Wk%6JK!^ngL|MdB-hC<{o?r!{(>z}VD!t=!bCptZo
z=}?WpFKuM{XF3x33u6A`LghcmVLv_pGZv)%?Vqp9bR@w3c_CQttN`(;#VfZ`|A8?8
zb)KG&Q~l#Y34-vM=vE5kFm)D){)rF-2&SU`Gt>T;ssE<S|1uR2+5gqle_N>kCpES0
zjs>23cinq?QV}pi?KN)_LGz}NtFjHnvmKpmI+Boa=Vm`3KrZX{dJ{7D-mctcqCoXq
zz}?l7$G*gp@6k*<CawL|?zh}w>8l-&{Q|rvfIIl0iS3Mcb2;_h(JI%kuH|ger>C|t
z=P(H5OIN-Gf?QV)i*06|Lhg<rCp*8s4ae}bTKCfUlrH#R?Rx%DVwn-1v(MXhrKN5m
zL?*Om0gU>r&DTktd!rtdN_4(-cVpxO%dpxxUrYFwlS3o18p&usl+GvIOXVhL&$}K*
z8~g68W#7-^C*P>#(R2_djrG-DQ-$AeIa*tr0PO~Zwf%~U9GaV#AU(_c;=TEP>s7gh
zzE16oZ!gDdFTy=c_G{<Tl8V+@6{(X+D68|Z(9|^lAg3Aa^)jYIs=j5>EQTLF!ckZ-
z0%q8r@zd#FUxtf=jEmky5gS8$3y-Mlp|G2dV#xuET%b0zt9F1M6Df2MDRY{*4GZah
z_cP0M__pw?Zt;y>rpGV+s|7<!#4}_Hh-JFO?yip<wkC^CM+0tazQ8-{oSF~9Z%)5`
z7wWoh$AV0rz%rd?L$Qf>qB+VdH`0t2M?}|fUpY2xY!Mww_!&$>2nm8t>4BN~^*rMu
zxpxrOgvgG2cLxap)83oeWP8^!Ty^QxZbN)^BjVe*(`68B$f(p=7FYdpu;;W%#LNSZ
z(lIBH-g|hIs&5A@H{cddwB?2AaPK^+Rgg?>&%WQav|MhL+;AT~2ESQPQnX*~iaPzF
z%!BvrTxn5GRr(^I+I>v+^mksu?3YYO+k!0D=qk={<4kST($BYYfQ3c@Zj<d`xzOgX
zIWA*qi<5Hk`|=`_)WvVndy|?H;@?c|+DA`+iQ6?7p~JGWd9j%ba>{iZ5$jcmqxgA=
z|K~nz8S6%aAJVH(DtqMmozBQ&L$DcL!Xl4qI`JjC_B(E`_Gt%QqA2qj5bWBO0ld6K
zZ1ODYy_X;3N0Fqy9d`0Jli2<Gb|t%JHJYO*Ik=n5NHU&nrNH91b`?TY1hZv0LTQ^H
zd$Y=n1*~2#cPnw$4-3z6ruyO?dwu$ots90|f?605@vs{o-w%nD*(7e8i^%kD_NmY4
zb;#W?q;zVo0tVTKpwK1c%C1a|Aja_0p_CE$TVsK+BhNZJgLDnOyM+OribVbP`%{c<
z|5TvDGYKFH!es*dxt!9g-oMVC6))&(*Y@Ec&o^Qf(e~Tp0>}(=roAbdk*^Dywxi&#
zlSS{NYb~PjyL(kUL3_z`y?i3Kuyl8Owq}BSFpUqncAm4VXxTBNz0%v3_`-+~yf!)@
z+#pAj+-3hunpQOE8wtNlcZtJ3#@!K#{a8@~^CJdrCjk`CXXiFc4Ifiu043rH(!quh
z$X<INbd&8ahk)Xj;=)K@vNp@>>RY6oFN`HuoMwyIHek7n11rdu?%2w`k91_{gQg69
zh5^-km|?W|W}8c5uYQfe4yR}eb9=WFnd$nW=~M)QKw}yxWe@$W0gb=)YWnjfbM+%K
z+gfitHKlT^^kGs0E3uifqD~4h6Wnls`t4&At%y-{C=QBrTnCMJw^xiDI3jzDit%5S
zd0Hk@U)mOZLEm}@pYYU_!y`o|{EdWCM(W>(qV)2@EJ5lQcf&eSX29*mZfVU(^t7|2
zs2X5@2!i;5fK;<bT|M(}fcH{i8JNzumV+Kj&*O2$x%K|HjFCtEEV=f1iV1aMu=G5e
z-W2$R;Jf0d2+y*`JZ8HR5nj2zy;V|CmlrQSaC_bhnYDsk>2Lq}6Q#933<sAr)y%5h
z-~0&BS{1d~=l4G8g8)`DU^_jR{LW_T<{T+e#u~TF?Urs&zwHX-Uc?fX*62U!P_21d
zOoK3ug}H5NSdgz6?Gbtq$mMuK?6mYF>}j9<mR&j8OSS9ox1@H;i2JOi0A7??mEN;G
z5bIqi(xSA}_nSJ0x*5yUKE?#_V5^GcfjmP0ggEL{w~9k^<sSziWXbq97H$gsXs>C1
zc35P|K7!ga{RXfE`^p5BJJo~2Oe*tZgcC`?nj3=vZa9K1PSo$$K_^es><1NZ0&F~(
zVQdp}=Phw_JmFCLQ^)R-_$3gky5B3qF<SVubCZTCV1Yofq-mF-lMDoUx;xT|=6!>E
zQ@<Sv+*J!$jiO=2rRe*R+8qno@rRNBg~gm@ro+y-BCQaAWh}Ws55Cl~+7&?M#bd)d
zjq`TRC^EJ`gq6^{*0UC#)M7%T$82GlB3YGTd_2L89m1oH-ilj$??oZr?ciTSW}@Hk
zZuZm8<lh!3+|DAq>D3ISd~cp@0alL60%MOthF`X4zSDzP<#>NdT6`UpGv~)#bi**F
zkI0pNL5oHHN}x7YKBaqT&Ue7Ax`ar(tajDQI$7H1x}WPjfPn3sS}(pVxQ$Bvnwzr=
zxEM6D2lVG#?XQ=Eqz2CI<gp@AGB)xQcbW3EsV<W6Hq_J1rEBI;+ErPH=7S#LZ9qKL
ziu6VU^Y-rA$eo(W459R?#>qCI1nktI(r=4z1U~nXZZ?jOe5Iel|B+NPdGe<qw=T`_
zO+TrgG4d<Flb<0Xrmxs{2fSIsXrV;gm=(DZwEol5*DD~!EZWcy34S|f*q?6^ByOuD
zPJ|2W$bG>0>)^h=M-tiGZ$66OZ9W?DoOAdV##{YU=haNZW)7vuRvC0);lT=~6ILo|
zrNJ%><U+~#Rlv=O!(PKCzS9?@yUVF@MO-m?u4bm5I7+J{z^Xj&aVyFA=60h5H^F5A
z95KRg_-$$CR`dA%QKmnYJQ@<{@xr8bPrO+H1ylky>sRV4N3l8gi5CtdVw?7x<sh56
zx0~3wE7#XYql$?qBszNZux%cIq{}UU2+e4!N?8B|DRHQ5#}KxiEbQwM$&Yh53ZeHU
zev9F<dbM=t4=pII>Y=C`dyLb4#gYSMkcT6NCzWj-^wNHF@?E2>>tl%N^2pC_guLre
zCE}{6UwtqK=t~Ex_2(3w>IoHzVV&0=+TKQW6J6@Bo_y#;q=Rr|owL8zVu%;b@%zmw
z@8rR(u?<;9v*)Z$0eQ38`~KpL4p0yJ>Q7)%mH-DonwS&hM!q}7B7!O|BfHO7q>+C@
zGlVQ%EuM80=H)_B=uwI$6$6>vq1jaa=PRc%M%Nt*2{hC$9|<PGIC^Fe8Nr8lnU0NY
z@*jak|CGM@qdMi4$i4Ky(b=*!o?7A;U}noA6RI~-Uxlmn`vPp4+TAv~YxFzLqs0r_
z1<X;#!3|qMmMJ1SNh2=(epT5u7~wrY6-Mpm(@mm_TXzR26a$G^Fu&V1V1{aRjr@eV
zVivrXT6R-8@jk;lS@RpBBPiTRK^m~%KrBeSGyBu6_?G+&7hpPOg9Gp2#eZceQbv&&
z{`vJx>(yRn19FD<e3<=uUs48-OW$>Pi^Yv+67vxhl)-~J00deYJAkpFfmhATj1=zI
zuf-4gvdO}FoM3~d`~kta4P<J^ArABPgY&4YnSk>bFse@SJBTqFlFU)J7g!GQ8jl~s
zJd5q~KPfS?0JKI0mZLEPM1$$xJMlwt{mVmVt(<Ibs61C)EYe8`=bX_SGawn~cG|Rb
zQyycD*IeRkx!8E71beIxGnIn{%MTgBTqrQf9#sPY^ClmD);A!6QiQS92#i>zTw?JW
zEhzK4sk5AyH6IRS_()-FyYDXZxXp$T^%{7tFx)n(z2l?V)%hLcBj0n9*2-DcMU=r4
z5$D!K;%Cy@G2Ox;K(-=47FvTRGS6GiXdt46kf0B^Xi$yM_t6VrSg4f)GY^~g8aa*%
z=IAet3u!$j)m`#Nyvo>ziTruz321!To`d^n23d|CcKAJ(Z@mFK=X~Zw<35A;OG-c6
zQKn|UA#UsQP|dXm!aZC@Zy>P64XOHZ9#-R1Xr9x~5+V@X<@NpQH8#-=V>}mcxZfY0
zy1mA2j3%zSX$Ci*;tX83IyX>D?EZos!%;fhOq|Ff_|uc@h^?*R`d0d2x?|2}cn7@}
zLuGsI@AuA~cp{+$y;<S4J~s~?bv+B^xUSn!32y<t$8{X~=?%+ZX@b4CTlM;~(>y5+
zINCir!23z`=(jvXxZUzcfh!?oLrN^!4-I)GqfWq_a;xYQxGNE-X8C~F`OSQa;il*b
z<r(jjWPbMX8&S{v6SCLuE;Q<(rV#&YsP~m1?~$LkA(mC{^*T%E;Ce6Q*5zXxhE)#y
zia(#gpI}XLL^V7j`q>?tq@HIT5G(Xbo`3~bermFntK;N)Gq(<<QHH_ST;B16o>7@9
z+$oNe!ZW4)-h`)t3~itvC9igB!lv-87hHO&GkSF&RFHs9=j%}BoUtI4d%L^Ck_s9_
z+LDu!@dl>@92M3K2y-gK1_W7b-Moc_f1N}bU7+=9kata-@ND6}B~?fds+6vf|NEcu
zmRJxh+{vS#75+laDSC}zXGGyI)Z2-}J}YtBWCG5knoaY|@l}^J*O5$LTTP^{Keqn3
z%MU9oO|T>ujg8S(5llVtlw)pbGU00fJ0x|(<dIX;Ze5jo%s#vF;dy)BgMBQ?5WQsi
z?<FK#CJ;ToyJd6^Ju+{j$>NB$NeP9YTjcShCNgtU3feH)5VM>&GV{fojB{^rKMaM~
z_Fgot#Y<SWI2s6S7N-cNVzqjYWC=Vm%cv-r1@l?f#%EOIJ>Oq~uEp&H0vL}9c1bmY
zuoaCm=N5Rn2|dw^bY6OxUaiALmiHi7-Aeq<dsMO_Lg?ztQBc+D&sc9Mstyn*9OZag
zx@su(a~<#5P)vIvgr-(>TjD-?!9Yg$a*TQU+&2F{lx&m1s;4)ZG79iR<7d(<>TQks
zFoMQe_Pkpv@zQR0B6fMiQDz3@4-die-^NIBXgT>sI6o5>l~LV&{vxP>kfhy7Dn|{?
z=EQ;zb`WMco9bC3#Up~hTL8H!@;1oCqIqGgae4q2f$`ZhG-;?iYn*=eLmPxa(d1y~
z=3vHGW40w_q{yFVzFFVB+zG#w5E)>2MG;iFAi)Xi_i8<&o->J{#{HEN=Kr?CJ4IV3
z>U+~pbxf6%tXm9!S9`Qs<tW#;d7@HDl^5nnjBVVd83{j6H_j(zKhm7dMh-nUFFJy1
zJG`wLjx%!uVY#Wd8<_JNF_4jOE$g{UHzGT67m|~oBNBj&-Vc0a{HDIqu!pgt7&E&D
zbV6sG(I0y9e=@%!VH;G^HH;ImS8`gMm>|~gs`o!9Vx#0Um5&u7FQ5MQdzw#_=_<m%
zE#`tBNId7{KvE9_IIeO>EKrC9?yyPs7ahl~&I~e&bnQn6!;~41D3He%1UO*K&A!-1
z=2jsdge8J1=0r_Eq#Ebej-HDZ-8^j?(|sZ1Q?d)u3}KgcmhGSS>5ms(jr=XKrYJ1H
zzo&iLVmc*zljeWn%EO35eUx?TW8aYet+lKEb1#inXKf9jD!K$f8JMKAROMcAyolPs
z^jFEfFC&KxTMY&zrQ=Q7=$*e-<LaEZN4CK|oN{?zN*2~262_#BNKah~tw=&u$jw)J
zTl!Vbmg65L*K}Q4j)S+8F9u*Z-1T%n#28(T>5^V!=YeD~^TyMn<hOR3+e3q5`;Izk
zZ^z0Tx7e>b$Xbo_Q&_Hu9NG#MB^lfjeQ0X4gl@>6yEw)7Q9L;nK;r}rI9O$S=;WJm
z^ZU-^;3@&8*=~r#sLq@v*VpwUD-uT!#+S_;AIHg-e&$RxGSC=x2hFEmzFsOba0@u5
zh!<0oRDo|M*qSn&sRGE~__o}f>V6!(xG@>IsP5*C6%$!_+spm^`mpc0uDdjUFL&$t
zYWk6L;F9%dWsh%p)`WA<GK+QtPg9I{)9I{TMAg1wz`=!;^ihUr1Odz8RSl1W&vFo^
zsmfcD(X^B>>viS9$5^iWao~NxaEopY?dM+VrhXryG*qPVHs_VYH5+LUzc85bY*l>s
zNSSVq?mSr*@8Gkw!H)}m@5A-_aC;z(f-Oe!-DPA(I!1al6P&Sc3ickRTe!5y95W6w
zI4t`*SLD}Y#E}aypDLDC*gFMeMJS(teBJt$8;vEQ<l$xuj|Hv?F8&_%w9&*#%Y7jx
zKJbpW@V`QXUP5Ox?o{uF^zqF%sza33P4)DPhqw5iMQN0}84#s}X=zRt1_8VvhY}Kz
zWHW5Msz5B`-J(#l3v{Z(u)C(Dlm;2%!3;eGh53i6)a_ww$$Ue2JF)5PoGE5X3hX<h
zcNKtd#;pTtT-LBfa23y0jX_AdyfRm<7Mc>Xqsk1<pz?2!^7?!qF83(p&iChrl)Y*!
z1%GLkAPk5wD`ub6G1YT(ep~Xt(rfIsdYBG5oi@veaj!DK)wyWs7p*d|su>m@l+<>I
zeUgJ2;u5ZBZdmW(>3GZ-{lvgqBSUkxWW${|cp7UFWv)1QCblFAOU6ki%#GPG+T%*O
zdlP;(6aXa<=tKV@I{JDjib;`E&Xu0g&4lY}cB++B+4Zp3T>7ZR$I${!6h<NI$N9=N
ziJ=MJfwOka=t<gZd=1X5CcT7p7}s6EGBK^7#8lD&G@Eldm!tf8WHzQY_O&ibZpla&
zXhU8r+$e5t&kfN+Z-0GjB~fPP0Sr?;O}?4SZeS6PMiP3(RNC%Az}7mtF4-kN*&_d{
z{fmRoRu?YMPL9vgTxbVM4KGfeTa&4|hF!|phB9n-j;^-;@-rDp741HBK}0WkSdUy*
zfp9TqS`>VQHs=mXsYl+t(n)IvvY;HS2VGk(35k(zkJUD9(brOqJ6q>jgKHXDR6eo>
z20h_?P<=&5Q%S9Wo!c2k*2`SUXNr1|#M?$J5K6yVeomCv_pr<5f^#?KW6vqtN1Bxm
z0>7tMI(Api0r3ZaXfWv0>7P;sTpyT(>ez|Q1?;)y#_U4wE=Cop5^VisWLy&>je}&u
zT{5@A#5_z`8T6bDrayOh!XNpGnRV4C+R~_(QHRl@EC6M-RK8q7aNnDEFqJL0OXa1E
z0<4VuK1&*g$D|~69{aTfWAh~}a$ozVaI9c#rqm3St0?oDCI*0HBw+2}nVnkD8a68q
z!I>n+@nJ1KhAFd&is#OwnX&1{&dhSUPI1#uk299nBh&2zI<Q$nS(WaN&_^cY5|t8t
z)8PRy)}_z;gaQkltfBNhv7>C2>$B~83@V*jE=s*CJVD<Hl@=yg+?+mi*y**tc(Na%
zdA@S3@WH_UV&iIc0H`o}3JiVwB)lzf?>mJMnLl&sB_f%ZK?~24A)@7l9i+CW(;pG#
zaP?!^c;sPlT+<;Dq?4fHbWW{ZO+F`6InB%VoMpphKBdPKreeoIL0N4tRL^7Z5XkUl
z$GI}WT|*9Rg9CNGQ}Raekh#*)2HiL_Znkd_4~C(tsw&tkUm@a&`m4h$yiFfTCjw^z
zOr_yH<>=xO;>+=Eg<~L(q+EOa<xv8T=CrRKftpt+jxmtN*yLKt`<wNIOv12yX|`G9
zSO?OseK)4fEgsKXE3RO+qm2)wfu1bCf*H03T=-AuHvdS!f~wNZ^0LG{nDal%T*3{{
zThPS&SXyn^g=dsQHpj~{kzPej<B0X#tT#Tc%J{j)iVs>8P!$pkRe|+}ilfjL+(fML
z_<e4Pj54_M{bLG^Z!aobj`HXsw+ij048~pL&+1IUT?tf^t1Ct))uTHtD&=tqfe&7U
zvb-jem`Hc!TDuI=Wtbv9{pc(ySxWH28gr&k)G*ING>rDLPcniKL`9JNXB$ZR$(~d0
z#?+A^Vq8l?z{znY7c2g)_Xwi#zJznP^~oQ%n^^)SS}atjLJBdx70k7i&n(~;PO>%*
zw4hHf50*dA=(ZlF@V+}x=y$?NFgGxHSZy8~D?k${b`)3<rfu4mSo@$ebK}Wmgh}`m
zp>jN&v!lM`p}&SjB)Md*k<I6nY8tX2-kyP%1T%Vu!F26e7vl;3TD3*TAsaF@5^VAe
z)f!A|52cH8n|lkNRJNTfXf2D{Q2js#Q>H>!9NIM=rS@uZ#41VPKxo{>VAR1#i&UHz
zoa~imhASB%NP#P{yh$*3CM(I1^b-5+s^m+UZL8zf8gw$OQpQvA!CJdmn-B`$u$fR;
zfx5rZ;+pJQRq3iPT@2->x;{=8vr1Nam0+CMuFH2zcxjQn2*b3Y&o0RnMM41SW2adR
z=^^z80ffwiQiMPHM|6O=(lkqdF+FwaiyMm5V^Ky{fr5*y4x{QddVAMhx|mnup0#Ta
z{iL%xKz=f!kD3@ZrB*>x13($MVAAbNWS%rJPzU5hG>+{ge^MgUga8?+MMGRjc-Nt;
z(NNc&s^0J*DXl<2GmM8Abm*InMe|uUf6k6plQ)z8=1{;4XH&r90a;Tk@L*swpu6Z0
zaKj(BUlkM@Cz?Rjmx@cuDSCK4Odh>8%oJWBe;XP<Ag=c#C_VUN+?i|!o&_3tmk_S@
zrNYZITlBp5V5Xt6$#3i9fn#eK?T?p(j4FIE8~~SMg0#w%a_0E-#BACW%&4HoQl+_U
zpf&sYTvkvfOV-?^pNv6AUDn<>hyM;toO(MCB>Q|*=kW)JoVI)g%f0+O&2oi4(Y;|x
zZV)4pqsf}5VU@jlJ_Kv6N2pG?J$_*%I+5u@O2z~G`x)D&e;x*hVj}aL_<p$=l7kU<
zfHr-`=ap|dYoei2S=(Lck3Zz`g2WP}1Wb2EPePbXIH@~hBAJYS4x9l6Hn@&F-Q!SI
z6nqbU(ytMtKMbr4ifDsCP7I(n>Jx!YO0G$W0iVCdE5b$i>RP+p;k11O)T-#_k^`vr
zdLwFRi|NIRi1cXe1daEaK!}&Zdo>zKmXk6z?jw5$i9h^&#YQZgnE{Um6RePe-ie*}
z2T0c)!IPX%+8NRLf=981#oN=q7-O}g%SIJkabnH^b*d7IB`|<l?|y3N+RxNE5n*=l
zby2|cNhWVj2{r^g0UdRwiWR%m{1u!Sm=x|DxQT@#Tia2czWzEm0>6X2)Hoz*r>L}Z
zEwrk`2RSK_<qq7TL^m)s+!{-`N-;+mnC16Y)}X;b!sog&pPlQqc>@V@WXyx-krN?_
zQO@l|qIDZ?T_~n~m0kEb%>=?3Z^6@D2c{}IhaqeziqS2ur!!V(T#)s~z}_sYlPw&r
z949gl`bWgZ)O=w~nYqOWWY35?>pq}nYrr5R|5o>@btu(}LJ+%8kmTrY=CP;$Di
zWg|oAbHrom4a0JxTi8*8mDb2ZrwH9r{Wx%Xm(OE4Bx!x&?O3iO4Sls?W;*d^7SvDf
zctYTAK<iF1RuIO+)1@4|SaRP9a0ZN$fWf+l!M!pBr4qD7(t$aYGvKEO&FNM7*m>oB
z=hjM$%|8%M+$kqo8jUok(oFDSEv;NS%W>D)$M5EyvhMBYCO)y^tGSg!VRwxD1ZJo~
zg^-m9i)Z!`Z9#=WN#UD>ss{(>MNHSm>ua(>Ns%j&y2|=E;diVF-k~eX%QFl-thj21
zP%%1d6SbP-2$yF|D>I?aA%r-0xj@EKm0tu|!Q{hkkK84YC)kX}HH3Kr*691y!ahSP
z5d-Qfc91Uz&W&Xb!`Djctc*F|9<L#<W_@_$s#NKLr(D*K>Uj{g@(@9YD$x2N2rhc-
zojpRfxH)qa0K*K%Izpvqq&+IGzOgumJTGs(+!9q?NkeJbym}&$p&H4-IcT~!E;3>R
z6I@y;>hy|z@_xenxtYNwvA^Dz-r7hzGV+P#OrEb{qgbJ_yxq0o#)r`01M*EI2k_I6
ziz}z+CXS4Hs|(>%pv550GmZNm!wLi4idB8zcKzSN02h5=P_SrwHM%@)Y;(5PqF8VP
z{$wKfsT?L?gqoN8+mbP3Sx_c^eAD91`VIPoj2y~W4U{w(c`Nze@##g8cWS=r;;mCW
zrw^0pp`5WKh>CtNS8Wc|PU>l~6v6FpSybacRAE8#UCRZ3YrKwBM|*A=fL3Z7<UMiP
zc|leliR}zn+}gZ~lwDbKy1-MQ7*Fcg@qu|*{3ZgYIchB=X~?P>%P=-5R^9KHzE$@*
z3HcJZ0ZLd!?M9+Asjm-y>q+Xbx8(|N$Bal<x-R~j&$^^4%0NL>5$y)93iDgWAZ}Jm
z|6Un}d6&C7*p!}!1>t-W?8jua6-KSxSkL_2nWz2}-PlDPJf|_4<-xA3vneG0^{);C
ziwjCq21EPJY%fFSfzhHaA`>t@nF~q+v*e}m10FOrey8?0ag!i2I<t;uh3QfqF!8F+
zN?Cg9pN(%8zJ&>s`@Pg&23fl125XWsG8W~|hJYrt#@p@}dDVx}q6q~bx!stxwv(j}
zWd<D=y)-(xZxv4}16{+G{IA)6dsnXrmcuF6s6rwr9dX@bC2_MqHm}f>!sTA^`RY}N
z)5-3nUAwYfh8PEa6n=C3yH#+;e3e1Zj)<LCi_lu*j?n*?QNRuFQv&knSSJh$R3>Z)
zXG8m7kT^6s0UQPw@YtH<C5T~GFg*XbYeDL7+)e!EhmjfDn(SjE%@rx-IQV5C_F$L>
zTp!om5FW%bGkFf_VHrGvzj-{?upErDS~DPqwb=|0q3f9Mr-jspILzsv2VoC}=%t1^
z2YyI~!qaYF<Vs6XiZ~iTiwL*Fyu+RQ&UO-}14RRu`%fbuVxbSIhti^HoEN+ymXIFA
zfkfina4~(t?~)E>neF&7XqYh9HLdk4H(oRR``0+(ABvK+UO>l~z;`9(!5}4}-_xde
zGSCi~tl6P{58eo*CT6WQv>qJ+`z>eqE0{qUHzg@-Vt^e`-Fc;7__pzFVW{Zv$%9{5
zl~5bloIyiC?VVfc3aS5cuZP#|#pb>bQjiG!RQLal8*MVGMGf8q;vJecZm80e7L^^K
z-EJAWF<uwbzP2#r?zhhJ4<(%mOz<UkDtQyJ+M@f~G~HUwBv#!LJ=njyV~3Ed(&osY
zeVeoc`fPE#1aeqIF?{_*M3Sgm`dd#7a8lo_xP&fi{d)YJH?&_Plr9J^lSQj%a=rwF
zaw~-HmwyTrv(kZ@yAZBzwZws6MGRp3yW&3P#!}uhhFA8yh-58dEniOts-;EzsPJ2M
z813w+x2&-xzerUq);<l|3-Btr>-9meRaZp29Ujodp{=@(yGds45~V+<qcS#L+c^58
z8qm1(*^pEB(I8bYZw_-&{3mavSA^?ekrB4pK(ZG}f$Coa$fehsX?H*I4ldV|CMG3J
zD|LxKo|hkcOy_;r&mxCwskHf!%-;BgFMBz$IV3~?{UG+a{|V<w2M!@J=J;ijB25tx
zfzcO@5&G3jyZE~-$bxP2-^rdUZL;TB4~aCZVn@DEcmgYpM(6>qP>{ap2cy`3XL^iS
z=ZCCr?H%*Zf<~9-d)+292GjG)ls|48;HHlC>6F~C$5GzCn3O`cT5=A{EVFU+Zk_$}
zW?l0HLL#d}YYOR<W-x`S3s*+BwGIPGUR60qn$b>!9eyKAlh(;W-v&jhDXHAmX-D%+
zAGP!t6BtIScO9D>qwg0_7lVXO(^mKX`9OiQNnTK+?H-|V&f>znT#v8!yM~D9&y7bZ
zV@|49j<*k&=F>%tV8*TTkG?DYS!}X4+v78J%zBbb`qG--ZJfEMt%ZI_>cvnPr(5T=
zWO`kLVsJyH88>fFjCA7x)s3s!&w048$h)$Ff?k7bezKKMhA<0uXU~tll6o$&0z(-U
zRNtyA%Y1OV(wlVCE~+JtIG=}IbDVKa+`Z-$AY&g+v+K?}JZBk~6iFPeahsvAD>c~*
zGCfYZ_+wh$5E0b?H|7?7-y6b5hG9x8ZRXe*Nx1nQ2Oq{)8jOANxdU3?XTEo1%DvTn
z@q3ZOCGP3m`duT`n7b6i%;=@%-mnEX(uN4zjSmcGlYT0`Am`110W{Jr(GK&vUKnbS
zy*ywLPQB!fTT=h4K9Q<<-iQD4ft*pbHBAjiIp2^b87Gd|`60gqEZyvr^cgPy8Id5!
z)u7(tR*r0KsJpJ-vcAon+ddPg3P$63F>KpvrNaBIUZp{e!r^3j!DSCN+VVa{a7B5x
zT`1SA^Jh!N&{|nWOcUNo0rm@Z5`<S<KCHQy96)%0zU}u0{88Lyv`DW`SDqil2xJP?
zA<f}r7E3eQnK?H4pAVQC^U#=r4NgM3%E}mtDyonng)X?u8beOskrovQz1eTta`vmu
zN^P(%s%oxgdrep@L^8^%ANS-T$tgXFnDTr0%XSX*R!$pJF>)FR^o7<@V4Flp;1{+H
zF$XRmB|{phXB&uzJ=Jc-0`QhSC>Zt;QzT{s#J-YMdaEf`?ZTW5j7(xQ5KgZUWRG>6
zwQ#bYz9dF7>|vnIoQ<f|OwNqZK#By8GmMwaf0!0AWE~W5c~sDXnMZ~kQkVHKoO`Yw
z<x*r_8h03&#k;n?RcPva8}Wu*xA`Spk3P~x-NaW+aYX8DK7EsO+>C0%yJzXR)%uR#
zPr=9-PXPR78%6nRx?7YGF81$O#fQop)A1HM&dW_&g*YJka_S-7p7NLNB{6_e!TG{x
zb}JCh-Pgrmef{I77HE@v@u`8=bimnymy;;3<PF8%XT#$N?~Wg$sg!Fca=BQhcpwr)
zYzMXl(w^}5_3<o!F&{)IU@>qkqJ{~cowp{AO@Tt5U??b4B$a&C?@%ot_|n{6$-v89
z1<^YV4${?6ZhxvnVH#juYK(R$hOtKa%AT+Tr0h~yQC&$rsL0!;@2ShxqqyOpeH;Kb
zJ|~2rcP4y(%(*f^`Qw<(?0v&y&P2ji&ZOm;pAW7)gW;NG0jL*LBUn=TURLEqjLgG7
zef@?sQ>l_nspv@wYpb1ajEBIeFlmOTM3OV9W`wvm?=)31Y^XT4x4P~G?(}A#HhP${
z6TT{{Zi^&SK7DF7QX6cCam8TBxXz0)<dJ%$P1+N+kstXemc^xp;43AY1(CnrZT~(2
zhyjYVY?zqgV<hy0+5t#I=uloDz*B5s?WlW?-{wr*R@o+j#6v1id$tvKP6wuw9H!aQ
z&{@I!hz!DBwMxXs9VNxB^);c471S^?`$`Z>7Lwkf>7!x7s1<wo@b@+3J7j<66exq?
z$YIjQv&_e;ek0L)p25!)Yi=sIvVk(^8$`0mjpY)nBu%5~FR|FE)}hLj!ii>tLeBV<
z4vGGk)*Bsqe8O#<kvyqAys9>K95-^l&a~*TO@q(hs63LHKzuyepcD`H!@EC_RVLZ;
zgW~ns0jkm5yCL;*zqQ^3&`QKbLVBmLfmgD1GY}?_K7bgM^jAjw?&R~e(K0ZjSb0gh
z65*C@yxt|oxJbogs(x~OEcic?W7#s-0u4=6;dmv?Wrfe_L5s%Ag~e(_HXtc`c22cY
zE4RFHEH6#&0`^KXa8=^mq$9@b(c|CawS>RXbLl3LyQaso=iiwQ-!B4z$Wnh+YK5D9
zxp_6*Vt>N;0f7r4{7tIL(S-(8<4rzL=yZ#IZc{7&?3`ZMSkebKm0%sq)qt%zrR+{_
zoMw9YNf0I_6(M^AO}=+;mc09%O+hw;nt&<VTH=GXwye#U?Y@d%Lj{5g^gDWET;$yu
znZEUUy*JS}DlV9~>kxZwrQf0C3=QTJ;JJt=Ga!_}(%CzBZ)$L!)d6;HG0~?9`f~x<
zxyY$9&4RNYra!BqO4!A!UM%d>0SMMgXih7eypdzmKo(RtF~4&xmUj!_wcbSl@R!5b
zu$O<^w!By<U@Fb|4@WY+ejOlHT|#DF*6_x6+UG&2;iMUUdY(ZpVeK)5;e0^$i8+8l
zX-}}InWFD8O_l=y!Ynw~I?fqY2ztoF@fW3N9traYcsoyNSRGMBiv<x5Xo!IHvitx9
zN&H2h#-j&+N(jmIghQ;fk_sNowNthFCO|_!ge!((O5-n5h3u1mcjSsppox?`GVZ2-
zx-6C=&u4nP+}~Ln1b&8<;jI&k*;9`h45~^Wa^zHhWa0froo6kSaFm}f-OJ2`n;(=Q
zbvjMpNx*TCvpT=o7>J}OEYpl0HP{8t?>Ors>@bT4%XC3ksf_pd{Ue`39leb&?SE?P
z;yKBmuB6Ak<Hv@WU&tYu%}|0E2?BomjJtLzToyXqqtw(<2DUbIP43MS0A6HS?#c2V
zTkR9v2#>nJ^E5AY;i1d)*=s1V+_%vl%}n}PqV14pC{i~RSJ_uCAI{Bxp6D@%2Q=SV
ztDiL#n&l)*RYz{rtklz!&NU!Y=~wo-Ke6QHSNEoKI6@kOF+?~3MP1l|<kkReVxSe@
zYN66KZ(5SztM}|dpU#tTUVbJ%#lAZEFT#3Fyo59D8a_8)M&{u{WUCB}DKVMtzogiM
zAt#Aq@jH%_n~Us8VyqCIqH>ixp#J-n=d~d0GbUZU{TuVFBSBbF|6q4$Uob2u(8Vth
zJ9gnM3;%Rd7~Z0RL@^23JcIIEpXP^_h9z}IEPf-t%&(S!-ao<7II5^3408DtCJ6GS
zA;0MxQLyB$UQ{<87H!WgePs?_pWQj|HI;tb5@(lyV-GWQg&BtNRoKNTY8f1Wjsh<u
zBudEqJXScWlfCCZwKhC~wPF)9cQw49)^eA<L*PM!z5uYE3HSWsd(fG@u_#u3LDa>B
z=@FR9MBbP)u{?cp{v@B!(KB<9`#S$Nxr0h>D6Kj)DHtk8aRdNPrO^P{4)%E=|I*8d
z)C}G`CM)TxT0#)ZYfB+CtI;IB%VW<Qsl<fOarO*LCsJz9Mk75r+iyDFeWP{0a5S*o
zyi>F-`yzhqS6Poa5$RLZgby%7FBAK>jlNM#sk~?F8But#Qhk*RzDL6ty)VNWc97pN
zc8j5?mAU2VNM#>Vn@Sx0Nb{$dB8>Kt4_j8bJbigYuotWD<W$k{&1A?8jjmiOme}-A
z<|6gUxrwyvt;&1Be#zfZthYn-AS#;NI56KQyY5aEG0aKO|A8+qMw8Tbn>RQ+KJLO3
zR=t#JEA*yYHEC#YBwN8m#SKX#3x9wff`^R7_&L?N^K>U|%h;&S$|$xjh{ADl!iFh|
zD2i-|D6gxedUU4~2NZLCuI1Q!M_zMWEM)oQ%JYX+l)#Yj^B_fkmCdWc)5`9Nh_joN
zuJrE<L*y#(7n}a3!&xdv4WzkUQrVy=pV_oGw!67?JeRMSS0Zld=iiJS5CmCQ@cjH*
zq**Wv#3MN9OP{3w+*kL(oIrIygsJf6Z*~8f>@onf5CkPtE^>hBC6gd0lbUAOm_1!;
zX`Y0D*N^$@^~)Zq0ftPP!FeSJhCI_roN_lRG-W`44fU;s)7dfM{NzG7#3;a^{R;#L
zfO*~{sj4ABm@b-7jUAi5Vbo0p+=n?MAde)tz+{()BqFvrce<|T?>bj9la<T|5MslU
zP!DKY73bRsq8xf|8nyi0V_ihe`=t_)TXXy=cKlmwqNfjxNU%Dt<}>T*BGw6|S$%_m
zzY)&Kb&MwBhTz}Z(@vES&uJiS%k*vBO*^L0&(-+^PLnQcJsDQ=F>Sv(@t@-XumAl3
z2yGfo(|;-<9{k$?rGDh??(dzN(Rfbo<9Eg*0I*CI4b)xcFKRm6w^rb`ne~Pl!0;NP
zSP=VT`T7Bci#cuG*Ca0fkq`hsw%vVq@Lc=8Q1cPMs93SGZiD|-s9}_W4*}?d1q5U=
zQ`E(j3Uejm;mp}mKmhdi>A4NLzcwd=fS(!EgjY7pzAaY2RS{>SQGq9-fVBK`X}s5y
z`g(NPp|-I=%exn`^QlU%Bp$KzNydwS&Sw+>jYr@tUju-xahsgNcnYC`@|M%t`l?j;
z1ht!N%RN-48XJND<-PVj=st~e^6dl=qTjob5YVo;52Kt)I{(AWheH2M6eymSW77Ka
z#)rUb0Fe9~09we7EC9kEi}Sze=XcREGXVP8*Rk=>m-82IfO=sFKo%Om<8hN1S>0k}
zO*2F*DHIS8`u5V0zkS@V(MTc}36NXpyw=^8?)T`<A@S`}LQyvW2v;*>nK|5dC^Y$m
z??0jE#^Bb;qR)PJ^%?+&_@Q_DWW_7$`8xoy72g72F=<nmScd`65VQvr$qJf!UbWoY
zuz1F{7bgH%!5jekl_d$Mb-XzFoMvbXz+=48-T>5b=`6>;CnSAPwf8lg@&w)qngfae
z#ysDzUB6~T5JU3;U}8?s)yjDsduRa&P#f`7U>Wh$FFo$@w~ZlfP5Z5OQc_ZNH>Y!R
z2>jx2!46RQ39vKZ4cZT?_)J^xYxmx3O&(zAz>^R5K;Z$AY46$@UA+f9r5Q3#mDTn6
zfgjsK#F|L4LEe{N0C~Q?2i~DWtaI}(0b-J=9O9^G=KrxlB>(pDHQMSe0z=h#`bfJ&
zRXO4{0^A%4;H3c;cYOpr;HLfE&Dl2u1Z?&>RU_ao#sDqgXZ(ES-1U*;pI3ooC|y4k
za5cA7hd{QD+l1x<aRrk`18C~47B*yo?`{&n1qZnjvSz~H@~hIgbP;+kcub*}f_#@z
zPs4b8+kqA+DPw0^Xs+upeXRnTcoMJz6)No$C4>eyMz#SYw^5|^`}!hBaz^hoY)s&6
z-tCo>lM@!CGwgB4sB_Yk3qYa93_y@_5j#f?=vLFs;;R4*Q`Z>X^$_Il`m~E!bvny!
z1VG(qvXowOqsYn2bc_-N{W1aGfG`ts%K$1f2EY~cI5r<BtcB;Y6vS{=Kkd?3R@!nV
z!G?4`G3*7T;<qJ=#Y6vdDbYXqZxK5Z_yo-lfjuh?I~i6vQJ5r$=YL@2C9N@SV*t2B
zKrqYW;a{^uN(cq;kPZMuss_MIPep!)1C5N))YqOf=iftfNo93w5zJE4Kv;*{que`i
zPEkRwe<RQrwUNbJ!Bou!<YZ1Eo{IeYE2i!uOF-&?3>Vni$Dg<rjz}TgIqvc9K;_TR
z2dMMlrM+=1)P3b3lqu}+-fx|emRy+g@SI%uSiBP8H5oC25f&Lqq06xXBfP0T>gU<3
zXMX@VZ4?4n2$(f!Yw<P%uNKfO!6mVw{#Wmi3C+;&MEh!$?7MZqc_(cwONqI%zEjHy
z!vw&%R65<y_e*$*ppW$gu>Hm?g39%&bEiOxy&+;}fJ0uKa57wTb|ybON@f#*y%94j
z%0Z~Jj4lzK&y<ETmj!@&^*jy$3cj@fR=fxBF3t5-G~BGf)VUAE%Yi#Z5L55egR8A}
z;P89}5Sc9D7%LGwZ=uM$0NN9nxpznLJ_D~jV&;TE!|ZR@QwR-ytPhwj#ym2`bl}Md
zY+@|*LCvVNj~)I?#WEuQt-$;j2toMJk2b)1J=P6aMN$7us4B0Oe~F6d3ff%2-Hq8F
zvfTUrg-i(mva=!4|9PLvF^svQbg4jB!pQ7_m~Wfd9~%7zfW#<7dtpPkXi=!|#p~IV
z52YQW7HO|7fvUHz(Wz3iCa_BheqP1^OSrFDd`(6u?GNpRe=9BTrqT5(+5H|LKxD{z
z85~iN1<wFru=8gp8&q2@?u1+l=i?Lv_rFHU=!em+UI55+X8#h)3=7h$HcV+_pbuT^
z<hT9m@Z4tHKn)>bcZz_}-&Kv>1KkLMgc$<AfjWR5dV)iBKQoXzAZIru(MY}d8k#Fi
zp(><}px%2~#LNMYP0Z(y*hsN<hO=Td_5aay7H&~>-`l4hN*V@18Vrz59Xb`HML@ch
zW)y~&?v|32kWxxOP^3F0ML=p~P(YZWV}N(h^ZmVl0N2IrbI#st-Ru6`+uWz|VH)4V
zljnUMRoB6{{T3|vbWbsEp8|g~^yoFqM)1X!r8hD`YCiG7&fG0*0Pd{VzC(oIi|q1X
z@WoOv`Q8KvbYTuD=u>dRVLN1B!YBDlOcE!ywl!|KI!^G5MwgD1j`5*6Q6zwJ3pfcr
zcH>?bz{_Ofbpi9tsSenswz&8onQlg);b7;QpMB>3s}77|buRk1V&X-hG%!Uzr`y92
z!?((KW5Q~7zlkj^aEKMs9iCWZn$_*0{lQubRV-enTJB3U*%)?8aSU^8{_KCJ3ca6J
zfbBW@<zsmc=(Oejo9<6wA@s<yUygb6LImr#z&U}nEn)m0B|uNzfmkIe(7HneN68Ye
zI_~`*tix8($p-zA>YKN`++)J|8MlA5Up9WbJlkseovJy1wpHU!F4XD)^>PWrdt*Ti
zIn<LUN`+z;mM>><hX#b_`uKRz(WfS0r@ql?vakAKUwqSlJOCBpaSldxb753+PVJ|o
z?l##dfKpL!h;*s6MZ@#eE_Ug*e3pD~Ny}q;=x{dUF}cGEou8)8AMMd<udN<D`syY4
z39Y;>lLdEGv_rlP1lv+nhsl-3i~pe;xnQCDvY%7^Z`-1!?vP$30iM0_&wzINpmUOK
zn<%rLswr4UDLGe&gasj#k|vIDel}yt*R>INBA`f!tc&MVcj2aJp%*>?z)ll@s^ohL
zX(^sP6Ax*L*cimw(H(O=BRd(aUH>)yxQ<GaJ1hTovKEIcEV%QREPdD0t;Lb{l0VGg
zjjHU!-!-I>jmH4|B6d_60leCN(`IE=q+S3xrTV3*@t9BoTZ^EKcg=e>%RCGJO0j*z
zm2m$Q_;)3;xZ=4RIPc?Z(41WIpv-HUe8eAt_wsLPrrLKboepdQC`xrr`)c5E!?!+u
zR@?MO$sj|}VlSp|BjK=x|JkF{!K>YnTD=Kq9#H5xG2_Rp#b*MCpB3NgwQl0#_Xe<_
zKm0n;mVkON?u-0h3z)S9;Ouv22X^-20vz{}B7wpoDG+%at9Y+|fXf?5`U^m;S@7<A
zHq57elfhPc?Uh#t{<S%v@-ukK;EX+>7sG-#FG+j^XN&LRUjuy3CmghZ^Y{M*o8^;!
z8u_Ibp8uZ(&_D{mF$N5YF9l{)@aDmGr91$RI~m)yD}Y@#Dqol3K?%aIfCNvvdQH!O
z;MoNnk`mGqYaD>XA#o(dIZCSidBOSOh6|V$CvE-F2|hFU3YHp}yjLSx;-U69$6*P(
zqlxW<bfCnT-D-Xh$_Vm07y6`19MIqmW@Z4~DkFX06sz!+7R_$)zg`=x*#iQ%*N$!@
zb~`C4E!JK0;fR)rEF6lNE&q#cR<7l<21+wj)bje9d!I%`e}X3`yK>a5m-}sRi2tXa
z`ptBJg1LgiE}NV#o7^&+<rTBQBFk-r3m{eM5`>Zl+fI6g9B{Q@fYg7Ee2?%l|4ci}
z)1jyJz@#RG<IBX3>xZCW2j5sSvW05?LtsXFdq>0u?RU~sfmL6yNCI8mMUxK<rtT=V
zTiN6^o@{Wr4H@|LD{$Hk0H`HoM3zhR7&UAbKVU|E<D~Peiw^2jT-VQ;r<OOdM$cRu
zx2wPI#5xoIh!-zw$>^BP93pW>HTW0xAR1Ui&(<(6*UpAptrJft=cGSJeG(GH+5u#2
zBtXV|c{km^+1^a&fA881k#T?3|F`11s^f(bD^Fx{kMLj-+c}9eL8+_WsBFhbd~Ei3
z0AKfO4HB=Q72)4)7ysO&Fp-|C7CP1kdQre`F5H2A9)P-&(XA25PQv{GxV`dIWfs6G
z(jBerqVoC;M@pIDg>2+0?y-#^hln<Ade3npjK&Q5ia&prS^$n?(c{E{#jp1${Kov#
zFG)M-W^o(yCmeZ4)U+G0@qkm@u>QT|F=S}b_Fi}a&S-x=Rd6Z3ZS(NzTdr>$ct1c<
zo9%D%@Fd%J-~gzc9n`pjh$#6L<bOL0icaAK!z*?39`?I>yrm^svhR1)i0!Ez+f%;%
zmmLubBB+4B&!7Xay5mwY*XPX)-fj@Np6{v}SR6^=YgV!in9sZPb1T@-?HBiOcy3p8
zcX7w3;RtFOqEVM~WeyZ*#pHhW%N^B?WO6$IUCH>}(;EZ}!$R%3|G=jN`cL%gNTz6H
z*2EEYWFa9WgRS5{LoT(&bDTTu2<&-JWkvH-A$06#-=n&a30uw&GDOMcN34CZ4ag7r
zIjV&h0K|EIC1ObRU**70Y87CcRg10gs>tlG`U6Q~Kl@YnxC>I81d4Aj)X&n&8*R{`
zd<njISa{s9;MWgstJhLUm1*2#{W95pA6v_i3T3F)$-BB|rk-`Sb|Ud7T*~siV@sa}
zJT+Ot0wC9c%}zRAn(B;|P2kWgObjmNdH25By2#YkPsZ*Hd)<cOlt6=<-G@?*dA>2%
z$cAHKkvh>%su!=59M58XllsR&IhE@h*{GCKUaoU`YBnBFLMgn?Y9)y5><vI&P0zRy
zyElR^m)H@8fE}4U@l0*=|Gt5~Yj+7uCtqsb#jwCrTTH|=(t)aqS%<y${1T*l?ZZ|@
zasAlA6?otR$3O}>H8Fkrq158}G_LtBoFSrLY%vcLtN5>~`MyrHC>$*yfdjDvcm0>Q
zhBsBe{{U1mFF~f!`vtZ~c0}st01g^0s8vVd_O4r^&Ek_JQm=6oa&+Nmw}2Za#e!e{
z6hQ_aS$z+{utY-cxArnV%j(yT!K<En|5z;P4h??v{=#1VKkKZeuhuT|=Re1}Tu(s(
z0(DIks={`+x$&r9EgE$A`THYJc6w>DLttv!L;|IG>8>C3*J}f{EEi82aFf<hYdZ97
z<}411bao>epB6m(Cz3f5Aq^LLH&BAo+>S5ovD+MEk~TQWSrU_aubp>e!C#9Hj$d-O
zxWRHhvx$~EyAWmD7auddm;QHbzh7v|AC={gR<W`sNJ$GxlKBEaVAF|~O_Bhjmwa$@
zGD)rbE`4QQ+Uut!a{eQw5M!;kUZV9Wm91Xw4foEc<n&$X!hhHTBQQ-90C(rP*m&|d
zT0xPDh<9IymfasZ%G^)=5QHPFTbQ2JzQiy0NE9t2#Rd%BDa-Q5X8C{3?uUFT>KIFh
z)4G{$;E42{JcjI_FGOsvf1=8shq!DH^C3;O9r)|xU7J@UCoE8}KKfN+ycGunT54L<
z^JM{xB5BhD@jqFOA|D@r@v32MA2jnqq3KiZ)&J=0=RALv?*5k%1Ay8f%zt_SP6lk^
zUWrsjkRk3KGsA++P6pZh{=EBAVDz~X8AiFY-KPi3v$Rit!lfXNdAxW#?$#=J*Ab7j
zb#3x*)9JpwH3-89&1VGq){mu%NX6HK%0ZHQj4VX`SH~zipyG+uINcdw{DhhuPnk76
z7CS~@GXVijzzy%<rt<VqyYk7EZgA8_2>H|1h_y%uVIH(_KA~#FKWnvG>(B?Eev|X5
zeqd&5y<uE`@H<V1M9k(|T`tA0K;P@YII@B%k~LBq|L8eww*klDd!Nk&RN<I%x7`m#
zXHjQjDAvHjUBM2!VzmUa9pSroJ>Ys$FeBq2ar>b+l^=jTZdqhwaR4L(Je^M3$RYRh
z6r5QbX_HzGfa44sBgDxNYG6ID79x6n>vnxywZwQuksL+8<A^dW^ZNZig~m<2tn1jw
z;0adyX9Q_<3$cZ0qI<!hYN;H%XAE}*25a6@Gap=CXD@%!N%g|d&NE<B?%=yJcb2$q
z1QS!|rip0_sMYPkW)~n}vVOzBb>xNTy{i84i{s@2Pua|VosSb!KA<1?$W>Ts@sX>_
z?|*f>=_9-_JX=aauJ-9TX=x}ORHoYmXG)})qB@q0URTi+Y_Wr?&7Adk(Pw<Gm5p3F
z!60o(Ra4%n-xPRY{A%XOC(z+Y)`$E0ZBUCwUz<cKY5@GFyY>sHc)5%Pil~P3p;Qza
z^NM_Y8CrE_vCT5}LL&^v3NgGE5L&y(=b%zCf8|TH(49%)3>^f_qV<fRXri5Fs2Ijz
z;BuYqN!_$<&9@^mMTna!@q6gp4n9p0Yw+r0Mu7~ZmEBXive-?Qo$)0A21UO+06eF(
zvXyN%)6a>@!vbD~omULGq%X8HDqa8&pVh+11Jogo)T<*(LfqY9fUZ50puvxXuE&$F
z5rz^LU_#~LNN}XRgy?2k2KhdwsW3f{+=C;Iz9c`?VwdjDnX?2BKHw5mxRK2f=<cA#
zK4L!0i_NE>fJAhkJC0)wx`8i}WA(%p*5d6ks@D3_y>T0G|J$~MbKkEYIWlu-PC3CF
zzr>!U$k{FqWzEE0qXnwZ8ATTrym7v{<nm9=f1OSksK*_24R8LhEmYRVQ^w~m8bJ1&
zZ!B1xC@RPRA%weBnhl0nOP>5@+G*UoPn2}4mAkqH_4aKMULOrs_&)jr$7}IOmOCr{
zF97t#jHtGXJ{QdrPEGTSwC?w(@U{`M?wE*rLE_WcMsc`t5Dd;rh|V2~b-?~}&N0L|
z3?h2BNJ1zr8et`KY-|rT6tD^qFQ1(c@~gKnb}r1fh{|xkJJ2(F{_)8v=mnB-oHyq*
z*TBlnzJSU;W<}4QXd00Msh@Uf+ZAQ`mG>pGY<qsclauzW4ggA4mF5X&BIA-E&^!Df
zG$YpYJ#lB$o;0b{XVe*xWHSfbycV@Wht^AKFEnFMRb(4I14`aDq;TM9M5mf^W^AG7
z21zUWsQXzma+)?s`tsoKl=8suMNTK5rK)R72trRcPfG6h%(g$BK$Xv(RG_A=zW@|3
z@ig4RP8yp3-@>ZlA&b)@*n0>vbZc`V+)hBLsmc~)4U1W4g_cFj9};ywCXwo6VN-zF
z?*eDvDPZ!LfLE%oTzn+8PAhP**;dAYHY(C3RNRB(tVf>#)~_+TQ}>9h7gAZmjv!n$
z>1$rWbcj})@_M3#kVBKje&Gc}_hlqC_o;u)PavaYVm2r=&A}<(M$N*1o^`2XN#kdw
zWl1Hy@|xOe^0@KR%T?h+XH|)O5_=yJ=e3aMVD?^cM?-3PzbhxvRhk|1r84)pab9eC
znezrRB?|rMibkKBwJI+R;!@&!YfiGY%Kp>vF(DuQWqr@fZ2OFuE%ke=QAoNEEJ^oO
z_^REbwI5m)2@*ZX@bmm<WkSE9Sq#Ac;+_UTTqsE4#2U)t4|4y3QrBqzyj;DuOPukD
z^S4zp&GI$w3e(Qv=Lr(CknV(a(`e=jbji236aA725cz0U7S{&7XiXgb^Q49=-a8}f
zJ7a4M&KE8$D565cgEnghDF;=O=|Ra~ZhBItKy;{E8E-?Y8;S7Sb{$ruvV;-ooVB3q
zu_Q|=g}huF&}S!ljWfqT0NU*t>g8FRlck)q%8B)}v77VZ9V!9r;mJc@K%R)W9V@&R
zGVGimmwtZz@Ji-*mDm|2p0DFO{&whydO0e~EVl;u-;Hst)}bo_do|SznXO(8h(d|h
z@9`}PG~;d-#K=Un3%IM!{r(cgWF#R}`r_XzoK(K?&mSP4gt+1Iv7V`0T(HsOyBRjO
zun$lU7G7T*A6qQ+>{k9dnP4dY^^Y@+(=&n*ABtRU^G#I+%kl3FO4qr%f$smrz_1xK
zirw7!$n?28P-C-eh+lwar9pprdome~4olH8Bb!(8*m&`3&n_59OV`H>@36i3oZ%#~
zm_{u^;z@}t;Aj&VEH|3Vkh$haL~7H9_Q!KXaWCmUgj9YAbNym+()o#mGpaN$zk<It
z^DZ+cc+3^Lw%(Vug5NdCi@LGgnfwwtgQ8y$U&Dl5`*E8}JmE7i$LoEQZNsMQ((pvz
z>^Xfh+XXuifQzWFMny3PhHC0rpK=h~>!Ly`721>Sr|()R<*7!vZZZVv=f!jX8(gmX
zE?KQC@R|GM7;;?Q<y@O#Wj@U&S9~ejcQT=0^Md>RI5=h^L&1mluJD7ccsiaG^CE|b
zSaR@_ukW63c7790?53oaVu0Fw6kv^G;kGzPZ~JlAMP2ua*`#yJs~?rh%P|UE9Rd9J
zm$Nc2Mtcp_`LEu?>kJ~;VL_9V_wpByn^#Q^e&^M#oOS3^QoP70TB!?#;&X>KXagcN
z!`)EKh=nF-;H8T04<N8SSo1Zc3mQPH!suIV+JLKf*6jW4kgvYZF9@k!u+ke2<SbAK
z^j=9-f5d^^E?a-UaXUQxBIp11&;^Lsl*LVdSXV!}F#@}|qHvEhi4cA0IU@r?)Sc!!
z#yI4T#hrlspbhl>D#_!l<3loe)`I;|k^ZUD+vOt2$dgV4_elbaRP}qvv+1&T*m!^X
z`gfj*Tv!hsys9CY!7J;CthhhQ%hc1jOr-uM56(n-3*E+^5w-9FSk%jazBjfs{BbM&
zxp9aY&uYUanfBnuW7?p~l!*<no&C}R6ECTxAeN&%7J4^ognqbQ_yb9))z;?cE9XXx
zn;UAsftoTZIcF7y2wfUNPL$zl@c4GWj7Q()^na#uWofAk3_3}q868=^qGHE)YXa6^
zs^-=H^YHz67r|V@!S6TQ7%vMT3l$(>U;((5ZuYTsjRzk8dU_knF~9Oqagl<6E=^yC
zxM2N3{b6$5R%Scae)Tfmn!9T}{iy{ri7*Ka1FHm%-qW5@#S>0rQIc0C<n4EyKQ<ac
z+aP2J8AHY~Qi~pW&h8r|5^NmXIc;BoKE&-ty75@g`YTxTlfY2sr6-^!)rpo#&$a3a
z=t;ZlYmJMHz$}dWEE2ACDxp1=m>7bNZ_otI<=g7B@5*T3#TLLZU@SoD8IIRCLe(VX
zY<)?y7zhT!(Y`~<m0YV)Thhn+cCy@z)FN#wW4WjYp`P4CG$$>yxopvBLfiRpjnPy&
zg1o6EFlD<>#1Z7<)U6O3UJ#-?B?p#0Pa7=zL$$2J+;88$ty&C^it0S_#pK_z!9u8-
zJodY&q(b)fFQC_038fs({7E}ZUc!avY^9uo-3UA`c&mw+jS*x6k9j(|MAVv}4=>kB
zY&B1(xm`AAck!O#)WV@=`o<JF4_IGJDH~B|D`+6jWZBL;qsx3C1~B^#lux0Qj;gEk
z@i?`RbeZ&W)^RSO2TJH_<J<vlBgmDi>v$QUxpBj_1J5pZ*w~3F#CRNJD7fGpsRwO6
zzdNRE5B^MadNePLVMGS*KRH=79ob`?{KSpr5y}_0oID+aaYIG^z8M_omY^YI>86(>
z^5vDIem+w*>e}>Cs(Bw5QsX4hMfHPs-o!9#%Z}kur^j&aY~He(O`_Fw4X8(cawow=
z4Kl5IUWI9?9NgTk1Q9Q=vvISM>pk(Hv~Rjg*Y_a@9KrIU<01tQ^TsaJ+d867!)rsS
zGh20yzYO^N&E5FiJ}b)61UCG^&+qnRk+HYPFrTPJZZ#h-TB*bgojt<@2L(qLgdN`i
z7VccQ_M&_TFGDr);<GNAaai?Dh>)2h;!jUNKY6VGy&(3APvydbVKhO~yV}5`k1jF=
z*wWfk1aFC1x6`#lDbp-v)6JRN60Sl375NKPz&Cc`zFnA>@jyX>c~9gZC9lwIYTBLE
zeE%Khmzt+E{dntY95dsVZXG9ex3&nSJ}yQ-5xY+ED_YS=f!)Kzf%3j5)Z-eef?rA!
z${iq|oD=T%wwH1eB%;xT$cxlB*9|PI%owPDThJBi$03)+!BFkRW{h2RyV6yQ-uipT
zH~j(~9vi5K>+M8Vv9CJ=ZXqAWN_Dn$JeJ+TjIKm@9)tQ>Hh$@48|L7xt(P{)5s4L1
zn{j&|<=@&&0=qmx(lj4cSzf$ANPnlwCvykmR+Ac*j_$y`71&?9)mwSP3of4?o0ZYD
zVp<<XTyh(7seN*Zq<A*sIN7!@9MaOem(@)r4-3bb{!pvV1ACeQ%wimp)E;uu=|ypL
z@2)SX3MeQr^^C@y@y>*uu9)Nj%jN7XtmK*uu@2F_paPRv^($Xa#sqJ@H&M4|vV=)~
z31HD*V>WJx_&1Z#?emOkUW#5f^D%xDkqNco+m9D84H&;9GU?+dPg;KQgQKsSxr?e_
z<+TE~FD((H+AuzzAUgJZ{xQqL;ju98YEDRxuf?K{$Jo9{j59eGOV=fMzgUl{7u7wM
zLym=C={f4H?uUSIv-J6gU!@(5Z6rG*BnUooH?026b%#%2;^flM1}drrxY^nRFk9oz
z#)my#yF)IO^Oq!yWy>ypFFc;T{)K$&;ma^~%=4-D_CUmODM`t*&oEi*OD|iGzJn9V
zx($fBi#|w6EaH8qnjd&9X>wcmrLdY1kS!F&?yhCIX|W+zLZY`69+|jhnGQy?aO5Lw
z%xYhTv4|2xdKnn%gwXX7Z|$|T>|o|NJXl7s6&ua}YAO3zhBvZy5h|P<Tupm6f}N~m
zVa%4&-1Tn2%I0|;UM-^T8=62Z1-$*nf~ht}Z&Fw;`XS23o!_lV$@h~_qbDLfA0)Hv
zIUJ=+de@@~`Jrj+4j?=ue!35<y&@B~!S|yihVE7*v{N!5s!&6X^)hKS?iP<p8Kl9i
z%44+R=?N$5nr5Ex!1=Sy-d)PjyCk={o|tI9)f9Lh!l`rvGnpE-0yf1P76yo0$=VlS
zsu=xpCweQl-|Q*%=P@vTrRJ$Lmv5GQ`3c<}(6<%k3xt^<$qZ3Zsnf^(RpA`Jv9pgo
zJ!v?vD~39YSr~yn=mny18^rppK6x_CLUakJEGPao+NImkhrnx?v~0iAJ73ztXpvhZ
zpyyb<cdjPD4a0Bw+*6f&PDykDQaZ$UO05zU)ID5xbew-ZK>t#RD0q|@&!V&dur(&C
z$JD{mIlb<B+@QwgZfN)jYTVD%SrLhEA5}n_740t@hOuhhSiVUtst=Z2(y~?MTc`~8
zS;TM5^us=kJ$+o((*I~0saY}-JU#0Efy8jbV@D-;v2<yXcoB<&oq(t+K~xCGDCNKR
z5~J=fih|}&Y47frQlg~vP$V8xiFzg}apEW`JJiyhK7S^jkss|j@`vKbUX_cj=&RiM
zHp70{Qc{baBQ5}{MR9J@n@O^yhuurC`Ez&+n%NBd7oN++);bF_uW2zBD{LY1AA5?r
zzCYz|I{K|?#Fn=Q0d60$p29jTE((kJbW1UXjJOYtQhru6Pa>n5UWr1lgTW&`D&JIO
zyrPSk6z(~`7&mDKbFZC6x$cl;oYr%~!)6z&Cb558T6DjE$IsG4`zVi^1&AfJSno(U
zu(3*=^|^=o+x0pMYN)rjsQKzX-5-$_sR{g9=F2Du-yNvzcS!t_Y_}8mqOD=1?eL>}
zuZ(U3p-YQgO9<nF@6dYJm%qn-EbcCDDsTPW=Rp3B8yl>td}uNN71`F}Sk@xUx^eHv
zLeD&sq4xw-Y5@CbFWw2X5v*3fGG7rrhKK|^88phrm-)lrl6Rnr)XaWlx3iYzFMFdx
z{q=$S!_`4BKFj~hTOld_-<FLuyeD0m={fSzqD<2xvSn**;qdg+PBU{bW<vUm6Kn4m
zu1kU#Iu;o(J2QjqD?tY+Qum{Ws|OVmTtsk;B`Mv0r-ETaOSb)2JY9QE?x#zIF*AA*
zCdGLF_uh9$GMAPt;1m636q3gd<b8{`eAT`Hw866DbeC!>l;H;r)9a$EORJtm?sC*C
zdZV3Ed$y=MW3QN0M`c$l#~hn<O8;0S_&*4;loraO0xp@_f9+S7B07^n7Z>*1(n&4A
z{?4+&bFsnt(Gh!I6<oLsPgu|IP+hmumc6XKvvZG`?!O_6(;fVFys9_xc%I(a_k#IL
z^;mKp{0tk+@7Z8#7PpVfc)e4!SD5j>cxF1bYjPDu7W#V`GAj-r$Li_HsO~J*Jo}3w
zgeNK|>rWQ1{Ue+*Vt~N`k<|o+XWU%57rMbTERAk;D$!-q(KVq}k0;6xJx#FUe)965
z2@yw~)Sb|1oY9+RpK+a5<7FXo#K6XE(*&jZfwwgpoysYD&*J$%WBmd&8`3PTUnj6Q
zx*y+yGPfN;v7#rI+;tcU&)hF46%lx~dIzSn-y(+*&vk{~vPq&|2y>NYTe@R*?d-uW
z3+BZ7&UKwN%n9eAmjo3=YSuQJ^v*cu?#+SS9|{Ochk_pJCbX@u(1vHtRS;idj6{1L
z!sxJBop<Re!l0wh$a>lB=;WbG)+fXiwK7SNM_(M15Ic%47g2W;Ka!cS5BV;Nt&+Lw
z?#y@qdwIXkjpE-lk7{jqhM~{eknVm3iBD(SV>m5Bg|&VVHzP;Y<gI$+{O`hKNbhxc
zBQR*@4V8~wBAbUNt5Qt6GOcCk{O+l$dXM~7-+ui^ty=2myJGDX3G}FB8?3mJ)&gjw
zUlQWqJG{Gk8y?pA)*0MGViq!?5c@dewI|`U)2ezli`(%D1QTs6{qRK5*y$;-&!t#2
zrgu?odx$v<UhhoxOsaPHEq-56bpZ}D`(QGws72VvI`R5tQ;!L!gBo|lS#yi`Zn|+-
zPwtwU4k#CzD|pog=jv>^tMQfiKg0+rkd$1;uX81iP_5BYvxMY0xmhqP$6Jnl^A=b<
zvE`x8RPNHb;GPp$ExY8t9-vuG4xP0-<dlAy#Dbh5Q-gZWQU?uiM87jKX+DCT{!Zfz
zprt5)7p4Cg8$xCtUU{UTr_Lsr%YiC3Vz72nC!PuCx}tuKQKrvb@pL+)Q!!m0_F~27
z10{sO2C|ck%H$;Po?lk)VR)1))_c$9oGYZ<XqQ<T^xBZW|Ml9q(8BQIM0EG#L)-+I
z8Xp3QJ5k=I$={PSZBAKL%X1-(hQWM6D5AWon*(A#S{>4vt^>pLyscw7tEoF;WKCju
zNc06v>tttF>725(SfC5hv@k{Y1`j$_t?O>}!jsK0PfmqGc(6QzkZJ@L{U&Al8(~Xb
ze>wNXJ&Y0cFKSm_Y=c9er|Z@MRQ@BU6#OGj-q>lw^0m%Ect)Wf7mp&2p%z4WsL#Wh
zh+cPQ<jg-}N!lP&dGai0Lh$;!XyCn$y4S_qUU0gMzsbAAjGuH!z21%-$gO?~7;EI^
zY-W*KlTz#NyW~IU*6yCt4pT=53_eA$J#l-wbwwNWVqyZP*#5%Bh?A<zk;)@jt)#uC
zMyNQ1U?j~ngUfcuKZxB-jrr90Xx$7o+WA+@<X>D-R>m&KU<U=dQY(yZ2*y+5Gn$~H
za>S$x5yc!>4%Xz!Y?cW%^20Xl-Z0H?=5h86zH)4)0&NgvZuA_NViRxD!=6#kk%e+U
z+gj~|V#Kr7g%>-x79X8dM42HMJmkQ{H}}~(8#W*PwQKKsJ*@sidZ`q8{p-*&3z26i
zbJ<8NxT%4p)U^d=_0oT&am2Prh5D!A)AhQq|E|n*&=%6`Zw%IavJ8I`eTsO_yYWae
zfa+PR?!)#3g;7zSYV(VK&rF`)x?*Sa==0Cu_=S!1i!W%S-&XxFmw$ebfGzFxjXz(g
z|0kYU=lep{e*^kjQTUW=N}rjGz1+GZ3ilxQ4Pe4j`)_vdnjmF1G2LAI91Eq(bAVt8
z>E|1n1qdo)1tl`M))sws;|^`mjd!Iuv-lTW&?;@>ON*1v^3Id~3%0txFsGwmJhM|T
zgS1-S-D;w8MxN@WqYbE91e)t+^G+VI!NlC;HW%Pl&pgu}MN+CjP<T@J?G`{ltF$~K
zk%L6ibdvI(^JccXUHY2ZV_x#Gn0<cWRbR6)A?u>LlSgwd9lUwZ16Xy9b$bj!7N}55
zi0d_XQh7q#%%iol4pzsuwZE@@kZW;5Y!;Ld`hcf)D|^pErGDS7`at_beJ!f?<{^fj
z-CebZRFvFCx601*v3iN3_`X#mCTEP+W{xI@G-!s119gAd_I8?E&bB6uf{T!m>h`4c
z-}!vvTZpxYP#@ARDjUkh7K;+!w*_e~I+6Eb&X`1ANhapM(asqb5EeEq+4>gFczqoy
zCe)4n#4V!sbXe}1V!sq{p)}`nkaXV`y7UONVvP=oI`IW8W?$HyCR_iYFcZ`mZIBB4
z8h)KlTcTf<K|zM!jPj!e;Np2nhi?yd^HP@xiraRFT;#;r-}EzTK-%&?%ih4)iU{<D
zgNa7qUXBJ_h<3m1DN6Gx*Mk_L^EzBuh$v%9X+PzY{j%2QnN6o#Qmkr6yrMiE@dA%V
zM>f}`I8#iz2w`gP8AWT8@09)Q6_QS==qYGYXNUS*ZZaT|B#L$~i)#e&vdQ&Okw(hd
z@5`4}dG-++C<3d4-UMP?09vKPFDGyIVN-V7ZM8S2qh-FZj|2HVzDlzXn1V+PAS@pS
zh$byDIV%HP-KXNf_M-NlSC0ErDho?WV}B<3Tlv-9n###Q9r;D!SOFa~oi{U8ZJaOE
zjB3sj#cMwfP@~rBTYR+RYy5YzgnFZ$6IeP>&qtB6%zvA`oAH17a8&c>%I7}FRW>V7
zLrC&BVR(B{edcx)T*X=BW0=6@G4rFo1S5D7%|1dTWiU$XN&ejj_4hn)^1*4_|Jg~K
z`u!~(mDN5oF58m&Sp#LMUT14vz$zxTc(RzhhZ$L7C0nTJO@5Y7eWUmqU*CI}v#f;@
zSsYLp@DEP55k&O_@T4)o)0g#jg0pH|wk!-&?u<l4S&bN@hnSj>V$x}xv05(EGYJv?
zwDspHElDzext`fouF1lE-#7~KY~5gfI2cL7PevibKA@$pX2O(<w8&U~#-y465A)=n
zf@V)QUqAJl$nGnWM>TZ!%Ge)nQoU&daV3B50#EfL=|$hwDDgUILP?M+WKY2-CI1cm
zuacJS8np6eHr*P@tr*f0zMJ55Ns_D?N|n=;%1M{fnrYHGe-dPQdR3HD3#4_!;In5s
z-=4ErzOZMTmT?<<SUfgLE)_5;2%TBXN_ET2;pxj@@rmvCjcxcAFiv^*P9)Hbe>GR^
zCV95y9TbQc<+#*8T-0q*kL6_Z`ZP0Is(%tv_w@)Q3pmAkD%`a`CfH=6Td-ZU)<|1=
zE7Bv(QjShTxt!(PNWLr>i}6F@pO;+d%QjxV-<NhZs6nSCrd+<im|C7j9b9!@_P$=Z
z@8NQ-49+f4Y<j-`OqtA4<=cyhx{c3$Vl5_mU##2ujg|j&M%!uykm{EtcqbJ!yUmEH
zSKVx<lk-W6K)hBSa;~ORg-JV}<ciOi_-OxdKzQA>&RtBatSkiJ!%dn&!i^(R^}LRn
zCApGo=F!q`UgXj^Q_p1;j68>Ohc5l=Vr6w^*i?z01EVrzP~o1biSx4dS>o~-&Zxa(
z9xFZdw7Kg?m|fJ#v!B6j>Q1&6U!Ph*G!|_+Cpobk(o}D1!V4bFpS9ygR42Pd%Pu1>
zy>TP8v;*Yw`Kq@lB=gJ9^z(pq6V%kuOhm!Kaq*uzLM8F8xE}L2w*IZbBtYs^?nz)^
z(`bld37j8Fx&)rG=WM^HV1kdLnXq$DzcS+q9-`MA7<HKD2NQ^-zAiR+A-A$W7PT9c
zX;Uiuy^COcYi+hk6Ex}Bj2T=0UfSeIIFLX>65QsY96GmFy@b;I)^&ojl95vuV1o#K
zlcb|EqlIMZ38`QnB#I=;APVrIkn2$?G|)ElOm|nLh4~0^4Yjo~nrR&l3I9Hgd*arY
zoDDX=MWtFWJ;-bB*APE!w9R4Z$M#H3Ksp1Z)hD1PjL4%t2yH=<$dU@a`@pva(VQwR
zECC2(kLo95%h;^<`?E+`?<+#jA*4XhW0`!dwvu~2Dw~H;^~fj^9g}^X`if%tq=J{M
zV>0vkLNPb@DT^yih_z{hNVLTgm?fsaFs%4Q`WhYhcL&82zQR1R9nG~nFA5}CT02@4
z6IpA0{iKoLK{KznOMhFd)iYm<C$nisNcWG<PXH9T7d*EYN;gu;fU>jcJr|cV`}*Qv
z=BDPkH@~BFhYY5&H{RFypH%F{<g2jb*1Zk9-D~%gc(ElvE8B51v1INQCe!p(8jnuS
z*;2ak-e9xhS9VGqrOhnbb=AwV@(!jC%zsiI9hT~*7`<)As}R3|jPk?w%dCkvAL|my
z9oRpq3%dO2SdgI5b9ulIm%7sM)kfW!efmkq{+(d^oj8eKJ<=TcWY23VqVRfJ2OQKm
zUd;N?tt5RA=}{;ba5Fs48T*ycH87@l<AtRplhEwHPl*M#bYJ`3sU6DSO73OHJ?qR7
z-y~Wt`86sKLyTcwq`QU7d`33=y47N1uh|{3lmW3|c+BQzKv0g~g3A2#M6tShMtqu%
zoQ#d?^jG&XH}%#8dSujTPnNJFWKT<rx4mv0(zdA+=A0;h?wR-)T{w10-$2EL;kiM%
zqh+2eF9=gLBU_`Q*%~DNI+7Ws0<82@DPB1gfjv6bZuOS-*7@K2fiBeR+W@l1RUH&;
zYCWn`+%AFGp5RQrpX)bh+>P0fI6K$e*ilpPo_Z{$3X6YPO>-*@o)jMryach{zqf8~
z`=4Dy0<)#Sn&_#{v!2JCqtZVp`n;qyAC$t`wYBdz{Wlg)!)v#W#Y|}xRf6kp3h5w)
zwV?Y`NI{2rhE3>rSO6-@I=W$Th6qotKUr0)13Jv2kO1h(K-Rc}ZzaOL5H;vSp&%<#
z3cv4)*HqQ65ro+kdSQo@%(vyex95aa0s|4y?22kY(WZNU$pjqBq5Y%uC#n~0pZP-Z
zzW6A4PpC4fn)?!Fot*@P$La6<q!S{LN)^m&JE{MiP2qR6aZ=)biva@yZ|QRP567mn
z@B@<Ly+j*1u^mNwP$;S-z4Yb*1{de<VmZ_I%}7jmF`H^fkHLl{)ZlX_*XlL=FnF?y
z3K&;25$%l>DEqp_Gw*)^-Zaq;8Xoa<wFvf2n=!5f+Ba`le!SF1XTlTCPFY2IGlSHq
z%|o3hBLm(E&If?-Xw_PaYWD4d8li9ME~>4E(iAi+pX+O;e-)s+CE5IWdJI*>R9+B4
zRXC1Kfe$km8uDP@m37;Hj1X{6rT^oc3O@m}_umf~LwB4RS{YQ|aB0cL4Dde^QtE_)
zQJw@5p1#8xrT+8n(#Tp_Ho|C$5=ej6<5E5FEBZ5^MJPu3To1+XAC04DrAi{oM<Z8P
zh8wyNkNdTPE6AzH>+Ny@bdf^I6_|OYc>(<d+U|uy@cVuOrAo|lmiEq<huxbY#NDs^
zIzH0tfh-P*P^!>jK$!?DTkij39r5oAujAwu&1Z?GPA3MyySQ)>4X2Z4N{VHR<;0ap
zGEFw<)p!WQIvMA&VAiRMqU?z_YD0JIf$*q58yibjXSzJt63neFhz$!Dh$73?lMtg>
zFrd2gLgw>csAF*#)vHzBdZ4QGHp8zQuosl2j#1X13YXD{4hD23xV6<%S!f_eS0y5a
zpM3nT-I_blrQ%Q)LMFfM6H}(lw@>-%H1%_{n~0<JoA&EZoHe-%rqoI06R4+}aGT8c
zQy{Cqyo6TaviK8l$e{G6j9on*tkoZzmD+705rSO?eFr145}2@O319C*#Km5uS8-QT
zKPE;T3GF}-apW4_C{|dFViaO->J}rV3L9GcX;F`JF2H;>h~q_!4V{DnyXTJMd4DR%
z29<~L!XXZj+XzVxeZAoPhi#P5G=x+wJq6gl;|yLcV1p7`H|F9eoGks;6^oUf?!KVf
zmd!D4AYAPye4xHiNgC8RkLNr&BN{32N?DrK#kx9`F9=*>Lg6k%G-&iK%Rt~<nSw>*
zD(;LdZK%}3rJuDK_>+o-a^3TL@51V1Hw*7Yze70qCND6n+!*_&mf<I2Dcj+M7(x`z
zQVYoP>8$#W?FqF8H8UbyK4y>cZtl$~WINW8Ce@qXY5c2PL#~YIqBJ}SVa^XW%QTim
z)%6rT4zSn?;I}c|b)3GpT4rH7uZ=U{f5F*CcA$=j`e1huAt2kyjq@?g_<XuDMr881
z^Iw3|DAHYpgk^G<7BBlHs>H@e&ngm?_Ad0`7ypOa(9@~Qw)=DC6Cl}k5eGXv2y&^;
z5+}vc-B~9M&!JlU8|gq)#7ma}@-&89xNeIo+8D4qF}-Rzu>+PcccKsPFQ(I)9^Uk{
z6(N{UT^~vq)VP=1hOF^tGJL#oonOj<JB`6)WRY&rNh-wKI^Er;O?wG$cC0HbT6s$Q
zS)=_Q*R0mRs>Pa=t0`|LMlKIw<^`yx^b<x=bL#Kec-`B|@^eXrt*Q%8cM7iL&*|PQ
zWHzhL8|v55!UBYQi50l<B=FnKby>8YT<i>I(R|N6-CnaX=nFgQ<|&2yq}<xHzf5+U
zMe1ggi=;eG+K;gn3*Ljz^hQBO{&`l8d<%CtS&Ar)41FNx$jrn<#GNIY{aD%(ASrp-
z1aWDUy_uJ)9xp>3zj)9E=r5EH%hqxg9&^PB*sx{the|T#IUi&OZG$-^jo~lZVFzip
z<tOAjT7dyNWbVgDF|ViXl92PNj7sxq^V7Q=QXr}F?*5sv{o#w`T*?39`o?%WE}|z}
ze0EMM9l<9-$g?7az01q3`x}$?cEGDL-N^I~xF5t`5sez+WrLab4@bN%Bl7qtkl=@{
z-HySl&Hr^5#$jBP4hsTN41pLgLV-{YBqqd%Y`DSrMb~jfZTvczfX49Nb&Gcp^y{JX
zKYJZTkbliZ=NmonBpK?-J1OhMvu}3fM9!%pPFH+8^wSI-UuZVAZznmXmLbQ?-V7zm
zo&EW8{L#qq5<cH4%5T%WBCCq!X-$@PTaVj?lq4`QXknH*szU#<+>58utb_5Jf%LZX
ziUm!HB=%-uN7%UuyFaW(nrUx8yUuQj7V(R?HXD|5<M6~9OWKv-NN3vyJe}c=jQuOd
zt&^vHEh^ir8Te~ARRZz?7UrVX-wFKVo=uFq&%C5O685s%v-M@0p43Kc*{6cR?&yE9
zHdtLMp=uL>1a3FRe7jTc2Cqa<Rw&%err>nEF!aG^+UEa^{2`JY$vZv}c?B<u9ZUXQ
z^{8SN98noARQUMsG1>U9ki{uV_48L+!02(r2Rw4TtnH><RH1#GQLMC)Y?1%X{JIo3
zNg}BiY6FNn@w%Jp#qVOR4y=ie&-AGYsQ2ZV;~|o)R3QX)M8y*KTtV8txl+jhO9&R4
zmO_CEIk^Ki26ww#Tz`b|-<jN1Hjlo~8phNo*=KJLbcjDfnx=n{!1W5nsFf%AZQgUy
zWE7f@Jb?o{r}*QO5aK{jp~L}GwRrXn_{HgXuwVcU#DB8nVRjkA_~Iw7aH%HtfyV^f
zzjAm&|9MR*Vzb<Y8pGL9lEdF1I~3<2;Y}3fm@m5<<JsdSg)#}M?rs8y6(#~%|8hju
zkBj>LmCgpizx99>Ry#KrV7%u<aA0dc@XL=sC%2&-o;I%T^L@CO2du*OxK!-i@m&4}
zk;9nd5xV&lq3LHl*jm|tW?+`jMFMp#`lkHk4>v5D_f30>pn;K4=(8!wYG2?DSB_BY
z{|_3i=Iz*>17X2)KJ5m86JEy4n1^2h@yfZZ?7kpDw;BMJYTwlat$^hxxuHXWh)oiS
zYVN!;=u}XKb!x7Iizfu>wQc|p%CCnF3V#%TbIy{nsk4&flTk4eFx*<bMn$L++bW`U
zK6f0=od}@`D?4b7E>$j^#|?=VoQe}fC;M!%d=u{{sY@RNg_VGmfb4?1S%6KnAz^%v
z#B)=pvE3!^bp$#riqiGY{h-DR=fJTOSuDESI-dJ(0@}rPG1<@`BP-$?jSM|`tr%yO
zMsaA`Ux`g*;jA|}-D?+C;l<i<#xwSUIJkX09Wj)o)bkPN9z*H(q15oriZV5ib*|15
zz13SY!vV;X5zijxR(}m0?ZppZ`t<-77;yTuXJ`!zUW#%;n#hYR^%2gqkSOZF&K%eE
z^rwj~0d3Rw@*^jsu%~b5m7nc(QFV5<v<XM4`M&iYvka$f&5u+*2~lN^V6P7Lc<*ZV
zsm0>|E=g#}z-!OjC<qS1s(D?kTU1l9$zw@8g72=DH6lR@u>w&n?)oH@=!b$?5d4o4
zmK>Z8Lx+hdqN?pa@LEG^IN}?*I&BKUMD|dc&l+oUlCpCnC30H{k&6D!K5+-A+q6MW
zRIa_nk-U}&1CA4pN%UHOGW+wh$Q4gOC`NTm7xYsVK@L~*!Z<hUjEm$ZJCVN>7Pjtr
zC{HV8tvZCP8>_NzeH3cHS@Z+kL{$&kbBX%f<{);K(@=y@RVZVwE}W!0Pj2m0xnnB?
z`U83>5ZtX0RTo|`-=*`mY7r*Nl$8d_L1dMjc1&s}*)tG0o(ae=5i$g2?F#(0eZG_>
zV#`Iogj3M^ZT@l(*HPb6_xJE1$^$+S@9%slxx<yy?f`A(waJ!9kMP0)FNxNjtY_(g
z74$*Iv3;V_8AUYHH$XsOjBkR~1H%YsQYCQ<t&zV6M2&zQ7m#=@k}prR^M5{Qr+fH2
zJ}1BU7xaaK2#x2{uf9#{*UXL)&j=E5p@TsQgk_JLY^~zOe%GK7Qz4+Z7cZ4zSl2^f
zo3b12)9F?D2zl>SE%-!>sX7VpiQh#XOyCfO5)$=}&gS)I?01_FTxy(a^rZ3?lnv<@
z9{#=jQO3LgN_A*DnAGgQgnoi*PQL<7Vx;byqsPJfZ1Sv0Bj(V-9;vQgCLdF13Pb$5
zbhOP(R>}R-oU^#Pq?2%Ub=mXvt#UAb7q<CJov#-y1^s=1?>1u*O#vLIItP@iQR{&v
zzus4?k@vs7|LO(eN#|!~ks#sL)knYL9r-QMMa%KyR{h*{#bYnn<e#%-=q7lH(5zm&
zM7T^`pI-^-=AL<=SrbbJ%L<FXSu_mHBXN>j=kfQm$$Qp1`4dY`>@tz8*4NAIlXK+}
z1T^YZ8EEC+c$!yjz!$-cX)$K(^&J`I>$+V;8HiMs|7%&QZUQa{39>83-S;3$wSDtv
z|DRh0+$zYMJ#rrf4=DRUv-D!J(h^r3%}3$2PXI_2(0qZYWYfC@c^V2P->@{EKVyZc
zgo8s}-L_R7a);uj-E^~sEJbzxxw2`@U?0}|#<T9f8AFj|;_e+oS=>!|T<&FgHt%F4
zBqiwzlA=%q8m#Uh{!g6hrSlF(Oj%N7e!Z_eB__Jvj>L=mJB-q2RKPUICi_Ywr=Kcz
zTi31dZ5(Zi>tF#YCr(Jxx{C;|=NS3U%ho&cgGB(97?TSZVG6lfXYaS<c*!ZodZiW`
zIc3@OD^KKuc^xCJL`8=(aM#ld-Rzh}1qso*(|VB}qX#Rm32g%ICYhfGOQFL~SEJb7
zu9Cj(<9#l!JUc#{rh4-_0KZg&gI=}rC56OGjS?pOzscW8AsSIciYk6WWIToux=J+9
zIj8a7Jxt@UdUNBT_;#$Z$K{s!@~7S4okh5tto_)x_6yjpja(Tq|D&Me{6X_}6X$=R
zgXv#<#y$7-wv}5slh5wK*Eq4xxb(vEADM(nr!#*Q&9NGJY>((&?I^ZPpi)%rVnzvY
zy;>b)%KXD~{*RmKK^wnR^Nb~40GM4w$6UG&mto-D0v@)ls-z`}Y#(1RYk|4(=0v5%
z2%XWE*RDj4_;Y=;1v6)$)5WwfX@1=_r;l&(1u$cOu7ka((Q`OW!>^!Tp%Hf~>$v^q
z{#0N6KgP@>%Abqam?CUuWSQQ~UoSP!9hdz(vajFU8iIe}z*b+G6V;P_D=jgd=-2+(
zB7ZqoTGJzEbV!>=!iy(0H#!U%{%cuuASd1AA2(-o`t*MRAh4V><g(nM(XZgk-pT46
zseCeUMW9x59PKQ+nEm4RUdl=87PAmc7|m90_+<;zje#}xhmq6}w%$bNcM@{>!hUb{
zlZTk`fQrpY%;MVb)(*|k<kWywGuCmra5rjwRHQd*xF+gW7n}d~1gD;O;7HA8gcNx{
zTyT@?x7~EP9?c`Q9F6i35$ux5s!!{kfnX)*Tk_{mOuUVBE~O9??yWp4gU37cCv@5>
zw3bYmP+|+VgW|FX8RW2Q{|WVus?FavMp!4hsUz?0@BYJp)eF%@S^XExWy?2#(>S&n
z3yLv;*6-*yAvgIQ>h9hINB#2M^n3(Xco+J#>ofQf%F!}8sTE9H{pV$G`>EI-<obh)
z1%HA@a$@Y9&-}+h+$H=mZRNugz8}&u&(VTI$1%QH;}~UQsCC|U+I)gK2b;W^7*_kw
zt^OxpsTj_ua&~ibg6cy#*+MTZZCi&J!Ou-s`sOFrb^kw_+fV!T)$(u56x{8;5@>FM
z>2%AVlJ-jpVXfxqHANTm5o<96ql#nK_k5Cj(y6bfGC#d9I5TCw8g-xAT3crQ4RzK4
zH!cr1>UQepP`fP=7uHIeQTdxN9*wNV<q(+$bY$9c^!9J5u3VgNMfzBc#g)Dvl`zqY
zR&Eu)8f2n;yf;U^N6hU-n}U4LKf#;FQWt?-d{+MA!Tv?fQMP2KCoWc7?|N~oX?xtO
z?~D{~djlNU@8c=l>PL@GIu-b)qcNQW>a~BI6{R~m9CtWTGy4(IAx>)YUxsU{E?=pi
zSW{3FM1B1e-uxQ+u*+R(r#T}_X9gVZUs*PL7xP8!A4%E`Bui7hi>v);L>*2Yb2q}z
zGq6SdsoM2BslGn5YMUFTegM<qzM^|)2?;;l3YW=c3Tn=XBVhd@$N4;3{UCeMGTH!L
z_4J(^sr+Hl|KN{82~zYG;P_`*A2gbRbqxZi(sYt*jq}c<lXo^0k5j+ebZ~21CHtM|
zqO4)7hx((`>vKCmQu0R-x&S9xN^$s;YIJ3%akU|}xbTQUJt`oPJ%~F+%!Z+gHsy0v
zt4eOM{<{lo-;g3toWl&;=1v6&G{um0A+Lqab|dSV+XMm~BjE7dD4&R^Q4py7Ls55g
z`vmI_eGf6lk*5$#=vH3B<4a??PeK-2JX?J^_==R{AunuoVRBbDmGc5Xia1>?L)Dq-
zA9mX=c9VurOF}5UMwzLcwhoc?lT~<Woi$6~fkzRwc=If^!HM=|>7)LVrUMlaXOGws
zS*C$glhNc>GVD302*qx-XDu?jE1+ZtJ559qx2}87kRhW_x(`E~3`V$#_p8I%H#hux
z7p{fShKTUSReLYBxn-iKo`&AHz#TAW?7%*D;jT5q4cdn`oT}jyDe)1Xn>OtFc;4?k
z{n1(#m5zP9cf{`0<E3f7Y+2qqn{f%n)gV;$DgNsHbh19dwGLdWch@-X2BL>CBGhpa
zey2%{VJYcl8V;IW4qb}EH%=2gIAF>e9V5PF%|3)mij7KnJSDvzhXIN-U(%({MBb1V
zm|BGpcYQyAsoW&GL7{xK50kZKr6X87_3A1d`;qQWxX~LO!K-%YaeUBGI2xyYdQlbl
zJ_r`*#N@hN**>%RPBvOMUPMUgwIA|O=?}|x_N{Pg>jA2R(Az$TGfy4EY2UY)#*pqq
zR%zT>X)BR4p|4{QKTQV7<q>2wktQE&h1Tx<pi&yc`6I0@(vKXqsb8RczbWO~1*eLN
zsh^JS6ZAHQY+N6lJ54-(+?a}jExDR;LnKd6$IR5ERa#50z@}Ipb^(*p<9V*`#0#Se
zdArrl20UA`<(1n1!Pu@~US#Tzo%RC@eFi861NQ1~l@iRBzlV&yf7;u|*cc5~^68@4
zS^Ac7g`N^Q0dZg8k`=#X_?JHao!sBDdmr5c`IWk3&oOy{n3JDg!}p4V@>l5Yhi2&s
zK&={H0Sz&sw2THs4a=;{E0%Cckqzr|%jq%=*U)7cRdj^j7jY2DqLKR}v|yn4z0=8Y
z{a(DD5mc#maJRe7rTauh*jiqx4cz*eS{cgJuX{~Rc8TeCcPD>|SW2vIEU`85Q(M?e
z;rG@G1%CQY*Z!4VHRO4Aa}&<?26;c(1tiO-!`Wv-J#oOAcXZe6LlwMXau@BtKPshs
z!0GeE^!Z19_Bp1M==Gt&A4-BO8NM|<q3@A0Bzg=f(OV%hyMu*O<2RSG$01(ZQ7eB)
z^Tb-OVk7)ygt+*JhsD^bO>iOK`w<MC#y_c&*f*bnVTo85TNML}cR&3!#JKy_ps)U_
zg@WZtf^T%09`Cz&TWQ7#G7&PP5BD|3ssjQmmfG5;+=#^1PyX^hya`OQ90V7*T)IoK
zj7>oND*OY&X~oT9xfzw)B#{e%ePUf@gvNy%VM^E(!)~Dw%fD4Q<ZPjApqjF_N(sJ(
z6c-ZYAXrsYxd0+OE;*ua$`7V4tR`{;aE*)GAm1a7mA$P?2l4>=e6YOB+3L^tcXdL(
z?hc4@`m2h)2d*n(p>(GRZTX9X0mAT3btR{>@J<b{#a}RX(luq^X~^t6hj5bGN;Kvi
z8{Xp7ZcEjLm^H*)a;(+z@Vz>Fq(sB0454@Jp5uK=ny*AZ9~H96&mOTscs?N+E@7E|
zyVcWpHg}(a(!kipm>XuDY-{ChL@Q+6ovq7p7ELn_D1ot9Zy#Q~iR*~()wo-+T2;{7
z8f@~@>s5TcSt-XPp4C-KdC*J5hYujPf+2Hzr!w(!@4;X_J8*<%(S@sd<*iDnYjB5h
zw0I7v_CiZ!Iv>Epmzy0TzYYwUjm3S4m$#5$9cJaR7Co(l3RO#V6+ptexFf0S#J!&{
zJq+gm`2Vv24459KGdouY)7sNjM+|hkH-z|%iGcfcvIucRtf-}`P&j)J+uk-uwb$~>
z4aIWRlTIW>jT8}-NUNHBQ}fl@C>*5f^#<@#8O~o5iI(&XR&XjZzq6QpPV&zxf3jNq
zN@f?>>=<2@c$8qg+fz^J;{B<+s5fd<A8x!s8<4-PRQ_)tyoP_-Zj@MXDUz`n@!t#K
zB;(`r?zXb^UwDS6clW&v+GEZ{hn7w!w-5cp`}K1Ge_C>5c9&Wl>CPLWRInvox9*N?
ze1@98+K*db_|?swQRH1bY*r#7^`bb4iEbpp;+fOCh&5@;!6DdbearW$CV>WWa#X$q
zisW5Ow2feG&*Uh$dFM^bsy!QltsCB3mL3!v4N57g?zhCS@)X;V?Wt2nwxf(2lb#Jo
z$cKnBm9ClPkMk@&k0a+k0tqu&{`?L1$amFz4{|PRe^J`NJ8z$LJ!oC*Xo_I6m|HyW
zgdOV%_-o6X2Z-v3JMRSAp&#|i(%MuYjt9X1n6Qi1=%w5-CRn0e{Id@-A=ov0%a-GI
z-A4NKe+YaVYgy@#oL76W=p;G*&IT86<U5zsqG$Gjt8Gf{SVctVkBeJ!j{mWWdBwkm
ziG)+9xG3W64b9lgFNzj*|BtQzj;H$n|M>BYI5N&ba*S}yLL5}qu}5f7$tZhP2oc9#
z$3A9~d8CrE6{5^zmp#hL4#_z7=J$B^dVjvZ%lC5mUM~K~<&ra=&&TuexR2ZYc2(@^
zmhoy!^{VCFK;%qLQF*<*K5AG{(kyKT+%M;NuesQo2^P73Z=AG>{UVehvC@%qS!l9t
z43{09R{Hj9w5-!*q(jTE=5DO#T9e6Jyw|tVv&mYXjDm}{wO@>^#hQ%s!sXuGNM+iH
z{XP8ade`<uz=@G&rbGDO-$1Wf!6s#wBD5F#d(_w<)1hBs0-rM^E8YsC5SNEsis$BC
zy2~r;N-kH^CYCDAzlrIfXMNM55MmXHofCHQVEhhdxOv?g+DvR1>>TY^`Up+Ye|z1y
zQ==b4))vioHcga1Ol-#OL0HU`d5Ng)9L3LMOU=|RvBzK9*k@j>IILLOi&Zy&+z{K`
zemytBNjyqk1MVA!ooWW13bh>?jU)O+_rW;~euSAOPsjr-7*AT)Pgn!`zfgM})_k8B
z>=Zt(Z#@ipx3^e12fy)}?J!{Z5n|++;i18(yBRo3j&-3~j+UVs^$s<K*H@6&+bxI-
zvq~4~3PVi?NG>^7RRph`FNLXuO)!$LpAO+>BrB!ni)4K!WGy||E#bJ+R5$nC;IkA~
zg5jt=049qJUm-*A(mg9tA|@4Qu(Y5j_Ta1%ZCMpVU_zI3c^!e?Fj2V;C(`rMQeA$X
z=wpss0Et*8vK9G+aJLP!$$#8v$3dhzYS7EkJzJZrYMz*|VkS37K-}hb1Z2wXn<lb?
zNAtQ5(75QZ3r8-_R2u@H8Xs;nwT1EM74~@Owm7U@8QkXf9(n1}cNKRDekFQm;76IR
zcWz9CF=%BwJ~$al9wF?vd!|&{Yjs3F-s%-wY@F1r=aBa(K=JmSJca_LR?j9H%CnUu
z9g{x}R>2W}_8fzq35-9<1Jot$chw|MIG$A*8Xb1Y3k+s;B`oS%&u8C7y?R^K=FxO>
zP6RlahpReHdv>)~Ybq=2X{f6VOx}0V{fxLPD>ThZ6nahfGdcWi=mmU4vtO8!jBMCm
z=b_~-Z?0FBid77VheNj>Aif|zE$G&YZ>C=@g$Kizr{oV%PkD?!!t3rb32ke@=Js#-
z#h&-xgF7XX`mbZ<{u!+8UxXqtcZ<PG;HD}<C>$YQioX_kv!;vipflFds}Taox5zjw
zS`WRAkV9~wDTiZuLb1&fTzO1XB4($TS`}e}NMO>l)GC1tN9LEK5k69P%2=G!A^!T5
z!enF_eK#L|$fj-k-MA*Vni7I~Ef4s(YN@=xdr_RMh~n7g_x-8;$#xz$2C^gv8cX3$
zmF?!QVSf{y4qLVw?PGsStMe3HUsP%OJz`_#;)H5(x$#BwZq4tUjT2zVIDhh~N9aGl
zj1$0tzb2&ut!X*P#4jPTLLU`4)kbSA;@6_pgq?jMjqG1tO81}HarYfbo)Gu5tJ8Ph
zH_6d{_UncZI!U4R4(^&4R{|_u1(SVLc!$K34JYu(sjk7Us1@A3C>`Wp=t7^9SQLGD
zc>O2+YSJdh!8_~z3Q%;n%W|hXc)MC!dipp!09Z){1>IqW#>HO+ffV5~qT-eS=;KL(
zf0uRWeaP<Q33R<t^cvU%304b{l2JkkGp0g45;(4LoOiI^8LlmCnq0XCj{-!{1{~?Y
zj!Ji9=x+y<SnC;pVD6io<l~p%sE~rJH3a=KzY4$*Vrowv&0l=?*zr}W6*lI7{3Ntx
zX4wt6&1imH20i^z8QtLP$H@j8OAk-~cx*ib{UydN^B)-c8#I$Od`)W^Ab1{9fwj>2
zs>rS4EZNiScwE}OeyvIjD7!ouVt({I`h5ranQ<Nd5e48A6e2BhCZSS3^r&or=Y1)!
z9t4{{qgVFNjgCqNybOge`TGO#p;^G2eTdND(0Ij1W}q$ys=|B&e}O#{Uj3^r%k4xJ
zUtreP8l}ts&J6fk@UeNOHs{3-lxtAtF>7sb>Q;cKLYm3y(~h!H*Lh3n8m4S_r(93}
zdAEJScE8UK4S!4Z{B^6eagn3siFe(Z%nSHWNYAdK+&TlROyowF$nx{rolVZpadUv^
zzNS168b}xTz=yNtWGVlUL}}Cug-lY37;fOXz_{I<XJ>mhz^MHXaKk`X`+^LZdlc9v
zUB9IN;%WWh`#xsmdm>zu?dL9V7;};y$G!m&hhO6XCw>nA(8dXvGJb|;`&H$%eLIgX
z7Op4bzxKf$wb^0z7)6>?KQN-Yv_0w!7uT8l1!XR%vQJ%d4#;ng)&c9eQ}jRyNA6;c
zI!(;u4mfm8$9O#j+*`<HvI3ZaG-2$HiK*oF#N>P4nx}qr*c*s~+PBX#0G^jY#+T=e
zZ4!*i-9zZFMjU9|w|yb;u$kmSqok0|I`RM@!!00V88l$J7Lbu0PtL~m17osgrd57#
zGvH`x{b-+E-~74o?weR#7w+xUX_W(yooe+0^><2%4#7q*k|Iw1oK?O*82y(6H9Zrj
zQ~@jwnu@B|b)djPHKN6(G2$yQ5n6r~!tl`<pqkfWVJiL~QEFjHcixKix<CGTcJ5rk
z3E*R%o&fXfBUi^<8jaWz?lA%w>ea7+___?5I3*BIJ&bh$7(Q{xPJ;Smk;k(?*PoFe
z1~i}K1OL2ah$(s1x`hrJPI1aFtccq42B+C5aKB^D{sC9qeHFv4q=@_ke~r`Gp+RRc
z;#X@Ao`8w9e$`iF08<OuS$_g|tPx^LJ^;tR6UbW1Atvp1Jq3K|*-ul};HUXVM?gb;
zu%YS$)aP_O<j3J}f$yBMG|eveht!j)B*^|V)u-16$g7s;5@LosPXfCjLQ98QpCo+a
zn_X8%>K?aGDu+bOqrfg+a(e#ytpzjkRG%r^6*6VfTh*8Wu*U~=Xk=4Wyqo!8a=8s+
zD|ghWN}2=HWVO7RTffGLS31fMfrQl#eDu4Jjfq|Dh{as~w(T*clAD^)%1uIqqT3tU
zQLp(-DQW!7Az@O%e|7FdSvvDB;7F%cl{_yjA)aX%iJ9)GQ4HZL*rPjvAd_YtRlw6z
z(g(Cdnu^mYQ~H5GJnPc~=b9@ITpCVGnY&6_7tP5^F^JjNK~?iZ277Z0)W$vGuU_Yt
zsmNAd5c1LTz|2vSHFD;7QY5+Jmwct6BCxVeRl4vEIF>*o`Agr;kAVepW2tWb?ODPu
z_)T&pU?nxv4_MrFEACE)O!jotV)hr_43}6oN{%EQ*F7!0xifFa5Y=^woFd{>Q9ERf
zqQFznt#{m{upv~Ns${y_kgzquGBpoe;hxPAP7MQt$9yHh#W)lDhrB^Rht6}AM|gM9
zm#t-$jG8lk@9sX{L6HTA!u<jBf)Z}UNgY{z;iW*fE&h2_e7SfhmNMlox*B+&5S1T*
ztY5O5o5hOK_ioVF@SnCI1lEW7kf0ALwR21R>p~4Pxr^q&6zUAGpj8uO?PqnVE+?3*
z`{WUFWu+#$6W~KreM&+nQ@4A4<|!v5VasNi;mL-OPoRk9foK^H_vrx2uFOu5mNlfw
z9s%3a&$APZ0C1MYybnwREiAwZC53J^ttlcS^XeTq^*RrnzKD7wwhMrhpy>+?I=eCW
z3mb=$_iE@Fg&<3xp}5>{SV3N!1^>aC$gs78F}Lo}a2mpWtTNB$yaxAU9k=d{n&y6=
zmeZ@S?R$UHPKC*G-Gh9UwsXY&E>>@Y^47-L6)viCT*8DIp>*JYCk4M3CHrg3xQrSs
ziQ0O#Z3uPDA?N<^s{i3GLk9B)$l>$=LQ`@jZFDOIVB62O@-IJ_^8_0x*ZK0B;D9rR
z4!G#*uK(FQ3X!+(u5@RcAx&Y@gunbmagwquSSO96Z<#Q3witcV8D9OF_ithsdG4Q$
zYee%`hm}Z}VXYARDyF7veurj%uyo=rxr1kTL*~<H_`FJ+&wYpNa2CuHC>#-9&}_~)
zBlZ}pdUUIic8W}7m&hgoFQxdQV5#w7^1CaRYio8$5>LG>WYf!>5KbK>#Mce{I0vK5
z9&N6|19sIafN0DHrbGa!qQFU`!g>ltylz^*i`@fP&gf?(dF%8#uJ_>%=v7lgOi{r^
zDqrmdjk(APsqQUSa(1`s`zTCQiGEWhfRu~M=#Rc!MhDU3Q@XtjOgq0{j<d{c5f1P*
zv>r76igXC|ex|8a2gW4n)X8;LFoEl`AM&9CyeA7*X{mRmN{Z&jOE6LXPSE^h;^b&T
zYxl2aq1Stzf{)U^yCg39AP(KRDp}rA^i~1jUd>G*=c+>R_fAKL9vQc<vexZ}dKuv}
z1CI9+L_hZ}hLC2O^DGDtn#cMD3=Y?N(zO_BfU`~NyjdiFNx}Oj5p$Uz^hz6uG8w{P
z&{%|2B8PKXujU$W)#Z6HuJqzux~wC3y%t`{_S@MFPMuyj(+oI(*5=Fv&!@rcrSEh$
z+VOt48kwcNd(!Fb|EV$0Y@qY|>C--d$QL}g3Y=t9(jk*Z{BOv#c-QRF=^S@V^~LH>
z-%@;BQ)qQ7!1MR{#=8GLF{29na~Uuy8RiHg7n$V04)p{{9Z0U$?bNIegXQH6*?x+e
zOv*l#+f4mpIrCbtfUZS2qrkUynu=d#$M>e|dDmb1ItCqU_`h_z=;hZcqKT8l1Xl$T
zua~rTzidU{)<E`}z)-!y0@%~>EPw2>{I(~%Ol+@wvC2MqY3BVQ(}hUmYJLP<Zytif
z+I-N@^5(SDN;w#U!MPS?tA)2X02KS>9nL5p{c<1bpCR)sQ8P+l%aReYrkDZnbz<#y
zd2w_30LPCGvX+xJzBv2=^K#l%r~4hD;{%2^gQKei);>sf#%GLiOIKBOGWeA5uC%)B
z<(>g467_*n#rxkZ8f20vOaA2TJ^HPuYH#nZMr*oPoNLcqb97NbJ>9Ofj9&)hkyBO5
z6Vahgm}lmIZp^z%$0Y)R)$+V$C+Q`|8x`AVX7lgZK0#_cHN}{j)VG<)e1~jt!+%nm
zA@+z>SUFu-yN*a<64(n}+1jnzFKHfsx@TZ4u=p&9ZzNr*0-yCKi=!wt+HR^L<nvj-
zR*vjDu5T_4CBj2ZbU4F>baPxS&DY7Q7}NG1>rd4*6HnDz-N8*K0Q0j3yzF=ESCwBL
zI*W}y4fHb|hVbaMwHGOpEy+EZ;QY84`$+!PEcZedZA=1;<H>Bs+Pw?;U5pq5C|N$J
z3Pi|0%nKE}BTa+97Q@bh@-6}X+t<Js{wnH*;$4R|OlW*U=JMKC2d8|+@@78$c9AU-
z>({<PN_znL)AmYRCgE+5ha~yxfGrB25>xxCsSVzts*f{A$wk0YwknB)<p0d%R(!|O
z#Nzu=gf&SRWE-0N*$UMrGEo_Z`0KSUMZjlD3i7Mv8!n^uP6&x8<vZ+tnH0CT{QB9L
zwQ#w#xp_{v?e=biLyS)qhwW7JyV<T6ntG!kvshG7&Nkb6l_v+q_u$uCkR4QKmfq(F
zgV8P*4Bj`$JhJ<fPf|NUFvIwWF6m%El^36XKMP$l_J*I&J?!u@5(%Ty;Ctb18ItgQ
z(VX+Ss^w7EqI9i^@6~~gAm862=Iu{_z1Wb_VB;Dxa^*ZY*Y?mPM8;5TAki7Qfx{zS
z&zP|erkSKIm+*OAt@0k@;~6TIG`Z6Fa+z81MubfxAXPqqnMdlD-o=xlJPRsd4|ZKJ
z;zMt7zgGSrHm(NTO8FQ(2-*j%Rhd|BfFNt(^n}N1#b!*@VOKu-<*pV?YYZ`}GDL<&
zYk%8t(!{FavWu&=PMJnZjUsG}))*Agx_DV3^4SPRKC9}n8G)wUYMzGbtZCwDRjSCE
zs+HYLxyd+K+cT}Fk9G{ssKl3EbO@qez80@;V-c#PXqYZy@;Rc(q_`uXNlV>C{kw0r
zSw&}x1YeVUCC{nGxQ(?`lGZw<>m^s`;~DOgp{sUiYwjl7b$gVZq@v@z#`%FiT=!j9
zA<o(tx_h0U#r@u8+0t#Pj+p{I%?OPWY-{s@#=l#06C^e@H2MDjWrtr7^?J%B?~`V7
zm9W6BtArAxi6Fwm=7CGB(F*}hNvn={lp3+8v5`_yLoL;N?aL5IJ=7)}HmH-ATBEy>
zGlyNz1Mtdp^RruxYcP4tpnbUjes+_ibt82{Feq4PXTW$ac*=XNnKisJkf#&`?do@?
z28yjlhXcUeL?^<}C?SgT#$Z(VFI%o}EM89Nw2cDG$VnVE+)NW|&SFSq9e{KbrMg78
zhqWnxO$s4}hQr1X$QEGnZ}xa_j`~QG(<%V|h&C=>CnMDpTZmejtU8^4=zGg8dA27a
zwJhl9PO7Y%U5OGD^hQ{W^{@GA>aQa*N!8X}bwJ~gbM?m`rJphwQh{oTYJI3$4zYTA
zT=k>VwmlawmStXP2Itb47aSq&(%x;aDAfX4L%;?vvU#$@$5ZRdXkF7BBKG@s8#yJ;
zI;%AaGYz{HZj}LB;@+n#3g@+#L1|m>I!I0*Yu|kajy=eL7=Nlo^~w~rbIdrBuH4br
zT)`jl6m2J=FOeALBZHVi44yHQE;tMg49s5g@)1@9rg@rNC^qaNqA#f7g}Ao*I;T>?
z0%%VXIzDlCDD2!sdSGE^Wk12GxO9&t#!0W0J1(#%>}^~*kUA1*3g2d041P(|Vx2&{
zY{OCT(#R@uC}60>l2Op`1AH?TEu$`9&28}#Kc>%W;t67Z1dA3C>Jk{{30K_aR%M)j
zq5-rc;~PrNzM36xi?z>Q@L=zXkix!@$O8E0(FLEJBczDG&{DiWJ6reR&>CA185zS9
zhk{bg(Qnwpuz4D{Uv4zzp<_84Bzf`b7}!fyby?uJDc5lTh;ZZ2NkKc@stn`7IKzf7
zcY;{6!(mrC0&pA^zvqQpB~B`-QT`1~tCxmmuYajStZ19!1O;7FXDnmv&sDDlUkBxJ
z6;t3GQLIsvN=P0ESaHQ-dd(<5SOs|BcQ)Vof4XX~#6d8&8A^zSocX~C!yfdQ2)E>q
zV9-!`-AlimPkK%$5JH9L&LH&$ztO?#c+N(!c)>pL=)ZZG<SI(_3#154<c#B&l#c%B
z$|G^eTYyl>&_LcB=`Ab0WvM(J8YCSrv`~0K%$3XF3H#-Tdb64+!!ys4%PmZ-_^sa)
zSL~Gzj0fLYRZo`(@=*T;FZb~^548x!08{k@IWhZbO}+Viwb&$4#}URf5IJO6SoVd~
z$l3grrVE-utj@ciU979PXM@&sZ1@D9sWF=D@KCKpudiXP1xj8gbR(tAr#ci(D9
zjU{+>IYS*!Tog-p$j*5yDr?0t4WptrAJ2_wYO$k&N+9@xX>W*7Ik*4Cw+sm}4s~sD
zevGkZKpR!5`TnKLrLGrs-EaLW#Yl^)Hhg0uG``x%*M3+5iLO7`amMAfhcn|rKP|76
z3YXXPeFcL&`eh*6wRi;@l~O(Sf&2eDIhS-^Nlk^@yeSkr<SCmB4Mv6ShRF1!mPQVR
z?$8`Om*zcro6%j3{s`N_0!vz=UQ4_{u6A!*$ayPzt_#Q}tSXPXWlBOy$_9@O5Boi4
zchU{EMXz8jRo`Y7L-&zwJ6#n@c6C-9#rpFxkf7d6r%cZ;`XanOP#;)=ip{N0K9nTk
zSTsd=nO^q+dtDZ7N45~(AZ|0^2nMX(q|@y}7e^7F!|Lr32Z@(!+iUkeN}6jISs7Y}
z3&D7i8}L=c+<hiCi{y2kQgQ3xYb;)gB!X(2g-<Q|geE{;&%$iFP}Q=IO-w66KgH{&
z>a5OBHeMDk4HNUF>j&TNFNkcwT$5!XMn@~>CA0YtM=)EQX9JZu8<Uujf8-Un)EUMy
z#^5!p8O~;sL@o1~c@ekJ$w;jh3GMo9v+W@6(LS&D>B}%ZEEl6{d(FH1>32s(YWn9S
z=XMiXb#r?Au7YZ$CRB~Q&?grM?I#h+H)$*o7OygadSN+Her7G?VecZ?z%S``uA%D#
zA?(M!gwr!L66E4%C~>6(`O}S2%Py+ET4vIb9PK7v-sbJmu%d!6b$?WsZY{yaT{rFW
zMZDsO`V$U(flg>V17S>x3q)E5%^Ui1MQo#pCzF69x_%F%oX0|mHsB{}g-PdMllW;7
z(-32Cp!nhHlnxeVX<|JjsP=MyuR~@}^7n0SV)NEdbMxf^)L8jPQb;SaDg)|S1a(J(
zPN?j+!`4rD2^ANWAsly|1l8B7`sb9=GSgFGlgjV)nAq<8I_#2&V8nA&S-%%&X^E>?
zo38e9jkWe-Jfl%7GS{v`eSTo`;QC{q9M3saKG{3j&UtsG3oE)42yetw3pM09w--d`
zg6;4HI)_>*D(q1MyGpK6B{v4t+oo)BhKRk7m0asL`XvThJr-w=iFsxaKjZ3z??&jZ
zj?z$WZq7X%H%83ZHTL*F|Na`NY$t6w_;022>amh**F!u9aKK_)5iLR4{T+Qj^~5RJ
zo{}u^jQ&mQ?Jwvk>G<eO*Gci+6xp^>K0+3U7W$Tm8Tz2x%~!X>O(iRLO+?Ew$2bR?
zX|3hho`CruCuF@TjZ$&lv3JXzX3p-uF8<(44h{9{O)9ghj`i;g4M>6wMr!rS?;Etj
z=uO&Rs!!jS$xW2mFth2blu*PeNpz(<RTwvyl=_+$iXNhc=0F>rlxFFV;?Wl_v0|9M
z6xMpin&`RPK&OjPzrB53pj%e_+}pn)l)th1j8{DtvcoHAIo_BRU0R%TJI2=Xd2@oS
zruPY^Q8;ud$0KR#JY(qF#S%>!z6R`<?TzI(SuTytp+|@)M5NYI#{SJGZ=||p-Yom4
z#ReG?0`guSoKuyuZr#FNLz!&c7=F207Kza<7;mnL?I~OJ-}zel<@`^Zoc(?;IiK8d
zN$*31SS9g{NK?kG?^S%44Zjq7{!lss_!}Pb8VGzRJ>U*#G|;l|r?HUXSwXM|_7b`K
zsuBKZ!TYAb9qj<3H3CyH-F57R;MO*jjUAYlipgNdl}>hsNHjPjIbcfS)g;xIR8%co
ztvnfw1RPSe@+Lu`N|r>ay4ye-d?fOM{}9cEQ%jhCQ*inRj3$B{COs=8d~RrxO*M=0
zaf%y%*3EIn!`RYa#^Z0SGO;h??^01@mdN4nE#@1baSx1ly?`Y=FI|FcDJVv^F1j|?
z3O105GjBJ_!-^Rz+&ZO-Q*})^U}sEOV%6F#1cVns$!SYfChw|q^p?a<Ze_eq5qnW1
z_tC<&DFyW!+e;lN)nU@KC4u)x-^A-(qOWRva|lgu3PMDkw}^U(w*!U!1xzI~yROUV
zNw@N(B|6h2(09wURAa}>l$>~RaixO2Qr01D*B1E3BaS*}!o8Xn-La}`0W7ldWf?xk
z#k}R$|6j@zKwm^qq6N|4(CqtfNFj_GPahb(Rk1-)zQ1s$K9J=T!lNxR2&KYnNOv`M
z{*V0kB);*tV^Wzn8Ssy**<gm#k$yVw(*xEOLAk{7ef0IRNZ$*C1cUidZi5P<HMpil
z*sB%=)14vLv`b66A>Voe4OT*89(aGIUkza|MyOWg6}~elfWMn%BFYHyuK+2_ndBGv
zS$D^LS|u17R*vngXJE`XLY!~wEpHc0$1OZ(l5Y%Uea3@81Se&^Z1(Q+Y*~hk^CKK)
z%dCty_<3{u@@m{_*$5~0z^g35{vpVLBOUTTN#q$A@_lX$^>}W0ru%a8hj0h8g4b}E
z2zEaHXOvGXC^?P`D)N4df0!y8(`tS^q~kZWbGdem!)Og&ntGsDuF8RR6+Ot|?}v5a
zd{Hvb?D5fC{o7y_$6x070aL<xp(;w{?w9*KEHww<7_f-_phYac>0a=^MGUh*TMU?j
zI{7k=n@&SLvHGQs6nkWk%4>bPV#p2_<LuoRJ5UFi@T((Lt=m`|J@EYdu?uR)oCM&%
z7swOTpuWp^B$O(4y?Ul0#I?Efg-+h{5<WsT^Xq2=x36g^WQYWCI?}ijEB}hN!pWJ;
z)XHd9>F~sl=k~QGB$A9jrrQjV8W(1z*Bg0=rFaF^8PZfE-%>c{3ZCL%960k$H7BZn
z?%Mw*f%_Tn7KS6vnBGv6S^QS{PN8sVEkxybvtRd`l|3c{jk8p#L;EudG5UdAzr-RK
z(Qs(F=RU+^HkqKx#jf62ge_H?CL<UF!!{2S>WK5dE|$@tf)5Q$T5;hGwJczs(QpK^
zD%LcbgOqa{=~W#bEoAU&h!2;c0OsVUxf)aIqpM)@)#<SQ*_1<tS8Z_<!`yMwMkr3c
z4%gbEmJ8;r2TzYCYufxc#YL*we7;o|5aBx!0?9_NGf5%2sEq%-^2b3yIFMel2+hp^
z873_m9V2M!V%vQVEzt1`m4l(&uPk8+r&!}CFlt!2IEko9H5m>rSV4i>Lspu5?W`)~
z_t(i7i7G2}_=Ti`@)2bhE;?x@+hRb#VGHA#uYV<Su38`aIEK_eoM%=h&$a@h#^>NM
z7QkP}!6?ZoQ!jEQtE4k_lhT6u2z<zQ5pbOSl#1#F+ICkqT6GB-54)vxjswsLs7RQU
z$%@O9nql|AUlXBK+JsMqx&Tzn1F>rDw&<qvs32W2Hr3w<ZiJv1AS%GRt)9uxNx5Zo
zUtB6>7q4hcT{8j2`56q}Ua}<r{L49^x_&_%yoh>ze6k@~$FMc`DD9|gY&K1^L%rT=
z<>?8}1S?T&G$ndW$z#K2xPU<AMroH=IrE}<HFN9b1Y#9&oe@XV$ah1ev`fmuyhWf!
zc<}och4VVa6r1J+8_(u1?nS`P9ZSI^7?leL1>0kEmPo0NU+-6FwGTwq2i#6*P5s{#
zL4@*k#>qNyX<R4$*-_9S>NpvS+x7{*vr)q4=1jM7$aC}Wb5QTQZjkO%n-cDj%Ar3;
z;8l1G_j7+!Z-OfYV69FrUy=&fP>PRAN;q{Y&<XzX^$ckr!(Q#4x8zE*By=|&{`pu{
zZ}G%2MX0?H`$U@9p#k6EABKQVDL$}NANpq5F`3AXsvtXG54C2DZzD!XE8}~NyIUN;
z+f<?2uP28Z0kU%t;iA9wjT!|l$K)cF-6s5pSc&=sH2>h1Ks3?+1X?;x8BcH5k4ZI~
z)WgKYf>V8jQiSZUc2h8qPTmv6=R~{Aap_5LfiYdlosSW6!&wu*F3@!~TkmxlRh_aF
zyOGMVHMf`Cp+F_PUhMAs>2pKMG(lm`(#>^Q^0Pum8^$@Qs?CZk?;+a0_Dk-IDzx1b
zbgQR~4=lwAa4Z_|Gr}w@r&qFV#VpdI<6Hxb08i)*!$`efh!@RYEA5RyYtul<oj@s8
ziW%=oWUMXsh~Xo!OI_KJ-)KKA<<|S%cdIHzsg|>*O2(0^@1fXaTDZGrR*E6lmq);c
z9AC9yMObYe`y@P9>vPFqiE1O^rO)v&aPRKixSHz2no4_B;vHvGO$fQe%58$1XxEIY
z$5MBVh&fUo-XA3qQXH*2CmS&R<=g7dVfz((lPt*3*Q8Y~oZ-rja+E~DReQPoVuQNg
zVlZ9bV-iR!)*+<(*h%b_Va=SwvnPXm(4KiXpYwlUi%Zu6bDN(GHA())-Q5fLvu0(>
zzMeyxtxU}+&NkbtnE9hp$!I2n$(#qUTEwOj3$}!NbOJaPV83+ROLz2%Esuk%KdFA<
zoFxm+p0r7rxG<pm(SWFcjq36sx{@gCXTnL6+v4XIo9_m?Hi+0o`8;s(9lma*WyDTr
zOcQk|SQ73eycB*7W_A&KS3(R+{d-6IV7=?Ogdg(=B166XjE$cimSFfi;qQZe`a7;O
z;N7tSu#fW7#*jp{Gwi!E*a+wM_P*IyE9}OUPM*KD=XWQw`mQ+P0h;)n+?lZ@&gp~8
zO%tG}OQ;icy1q)Xy#br&`y=*~Vn(Ot?06LyOd<d3j}Jv3tNZOzpjKxCV!UJYaLk%J
zCDD<ym}lEEpfaa+{^0p`F2lNSLOfe{IYwj`Z6~&0A<)=D8$YHLLJg1Y`KUvP7>K-<
zLaV>_^tMt_`r+Q!{f?S<cv*i^pZjg8^@|C2BM)X8sJX{h6vyvPZ2o(GHz5Gscw)aR
zxFz7{rcj`3kZUrlKN^XUM0=zeKfZX#uSUCe${O#vZ+7|HZkLC@?t#8_y+6`>7Dhxi
zPF|m8Xw_q-W9)@_sVrD<tm1+#>*M7GoR@ND-c!%Su%wRV=Qv$ldF*ttf4d3M6?sKd
zVss<a3BBZ)Ri<>ntIV?>FGAH2vey2yy`t2*)ovv<f#ENyYi~b<q}KYSG1YH9<4-_N
zsn#nCorhfcfjPK2&gb)o=@W;7#CBl;lIDO-E#Flh5;{)ST-aA|Eb;9;-X}i^qc&n{
z1GpLfdp_3)4@Sa0w`ax!_1bGfs&9<34iV0fGl)b><W_aRt-5X|O5|^U77}PG3iBIt
zhHddSV6uF?#qGxc_zQ6+Y9Vx?G5UZna?i5p-+J8=e5EJu<@aCRr%aSg`UY=Olpg1a
zwbH&yO0bf~72PHW+?C7u1H1$ITuwVmqP!jT3f#Qv)=}X>zJPM^DVvKS0Z#44C#ISO
z^V(bStmDYOk8>~0)H}^8fp-z>_OS*&G||qzCeoyijR^HNWVSs4J2ECyz<#VBr<nZV
zmUJ1PsKZ#LYeTa}$mETbYr<%^Y_w`CCejlJdpFT1Khxqb1~$j+SUH7KFC6i0e9Omh
zuAJI|nasagE&{5bKnGKC6vO}_Zi*S-=^fovgoyr7w<-HpS*YytxY1SD=_Xj@2nas;
zaETH+*(zfWG6eJ;Rg^;R-S4=e!9;0Jli5hUJgA6W<VS4j<D50=xX0=cEP(QWS`D~3
z#xK(2YDCz?%yB7bmS58n#`pV0>S=X9I)}0!GsJ`&<KF#Vy=l4??ofo!HY0F|&e3yL
z8d%PaC#{{_JNECnck;i*83rv*y?%-R7H1AmU@02(#W(JK>c~Q0OW(C-e?w%(0q-fa
z4)O`IYWBBFaRefWik^J{Wlq!b+-C!qaLtTiJ&2!=os9S#lP=DDJ}f=FR{hC2JgYj2
zj#Ib9X|`laqc%I}KB5)T?aQ`SF(#xDM8(H;JE;0*gzRii`%8W~G}p=ptB7-4QPu<(
zx%IcZlAERSEv`($>#aLG-DA{kZ0=xeFReJkhE}vhdDNtHCzJD5$6rQ1O<cpxF2OTl
zGBYzMvpGvWO|fj=Yc13b6hSmWu(jMfPx24<_DAhy8m9n>lHM}YA=6i}c@H1oV)9#Y
zMkG4)VMk4nSBC^t=eD-c;*y*buX7a_a%S0qH~FrJ%>K~X$y`|%%_q&E!cdPJ;uY88
zi3(yfC;5c!(yyXrS<vas52}<*G&K;>5m4dy$;`v#@zi)}I%bTn)<b+l(5v2pI|gyw
z>glcA0T^`(Cf&PHVH}(nBZVSk8J+w59j`rn1JcYYAw}!zHTjvqwdw{c4&3~i_E_ra
zen%wg>~56B8gI4)l?0xtwJF6DnV^nrloqJYVZc$rQ)zTr=rmi1hQz34{yzdJN$U_#
zjz}wQyvf;1@9XN$AyH_o)@^JG&ubK~KEPo_T!~2F7ZhT|jGBSg2cn0B40c^J8Y<-;
zmH453w!@eE;RbsVf9#Il7p)b-oSXlXhKp-fT;4J%^c+`3=lJVxQ=2KEH%(NXtV1t^
zv+@$0A=(8$Nwa6(pHOGEKA*Q;RX~g@_WeiPyUdXt?r^U7B9qbe-49pOMnD*^Cjymp
zHSTOdi6|)PhMpq8k#z+OxZ6QLDe{jI!8psG1$Tf4eeMdb+JfiVJ#^BP@x=l1Mz|Wi
z8p9j<Mux_rm!TCJ=ZyLBsRVW>)AHPV{^6&6rw$J$PHMl~@$r@<_5n6q&zv^V_Z@@5
z9ks~<L9G&Q%~@!$cU+jFJpw01KH9IVbl$C0q^HB0QmQ&{mri~K1t<vfU1wCOel`NA
zY5$)*tIWaPZ*>h(e>n|cHomI`^|W#4yZ`iCO2;-gwlaH!yIk|bI`F=iP#pW}BVv(!
zd)M+wsrS$0gH>+PO3gof<2N~;RA%F{7^KuWKv(B6)H*1P%v1cYbr8l>{}pH3zlNKP
zbD(>u3Nw;3B<i+i*XxBb7Z=e<$F}Sem5Xd}w^^T&UMrrNrnIg<rnt9BgE)r!l1ior
z?**}n(-E7}W;SL9jnD5{Q?q*_QJimOhfHswXN=~=TQ{mck^c<plKmxZ@#DPNn7lAe
z0%v+L?TS~e@5(YpES;ogz7`LG#9}zj&#ol6bx2%0R4NDDF$0^k@{8R6VtYoQYPv%H
z;W7C1;0Lh$y4e$Piokf#(!$yX!Ak<&$|mj~osV7^8!PyyHLK14!?$W804p*?OS9_@
zAVH0`cqMuQvo%y2TwzlWCu|6&KCd19i%hOS&{s{Kdoj>5h){Nel?0#7uO9>ZA>+A5
z<~;ztY9j<3djatn8&l?gOhiox#Y(t|fysamu^db*1`~rR@iS&A`hD4BT6Hq>zZvq5
z89arXK46XQA0hHXpdGEJjwb#x>&_mS%oH5^Lj#|LKU%+b+3m_jufxq%uiSBP#w0yg
z2D7?Pd!TOa)ds`MZxG}=<;8xT1c18Kn(LyV{nuiBF9a^g*4z9q|JSDry!Y0K>19r{
zac{V4uw`Nx*dpcqGyh#JK%NAjRkl_uu$v+ZGI}=xe8me5#r|9Qq#MA)F7M*omI&oK
z$YIf(Azu!N`GD;}_^I!wAoC;-(7e>-I_(`x+5jj)F$3*Ir{JDirleL~bDBjgbZ-`9
zA#f}|!^yI9fCB&?l|sUIxu!)IP`ye8x}|ZjJHOQ{Ccr<%LjM~DKDYwqyI=o(@NXJ{
zIT;`)yfq5iQP##5_gg>K^LN^K{k0S51Um8}uh$UImW~cWikf-<5f}vkJx4HUDFK7j
zWiS<fpE(tcPh5I*c(!aaplecACdSPL{dXI$GXreiu_Xi-bgJ3<u?-3dvird#HL=jb
ziV@+b7cbAmJqCbIg~9>wJ%Y{&0D5xxHU>BdRe*DD=x=cymp-F^m+U+zbacjackU|X
zX@nbg0t(=Q0QeF$l(~)nEyr&vfl7H00&Vxre)&(cXw9|oO<=XT9IwVevzldhcpLa~
zaBip-g0GJ>qWmU!NOGA5TJX=ypC`t_K~RIF<GM8llwTEqW!ciV0^aDcMdz6<1F+P=
zxB0g>fh>!eCx8exI7e{cX5Gw`yRxr(j03%!ONolzz-8xVdltBVIk_*`K}oacHS{jN
zfH8O#$R0*5!_SWp6KGbwkN4)CfNzh}w-en<a%GQ!k&rSs<ZVNNIWG*|b>1Fd@<+`{
zz~g%i0HV&)tL_?uuHzC2A1Z*_r{-b+*gGE`kLl5Z?@h%8-F8uB7l1+-2HvUcw1~-H
zjZx2?0Xw%~LBKgHNc!5K_5WXU32qL!lLPpAa0l%oM1?Zl6HP&ib6foSZw59={&nFF
z0E`9OvE~C7DAH1{qDcZU86dQ>f=rMzfogOc$N`&zDS$rjEkr(Evv@src~B10YO8|%
zEPzdW>)*@))rBC@?|7j+AP*&+2Nh2hvMK6@c|aYA+_5WI5{SzplS2(Ufj@i41a^lF
zFd@s%^#PP?3lmnU;|9PX-sGA5)bO7(fhL4Kfqjz;|8GH=(f}62NvTLtC2<>Y0@Yyq
zMAlkF(PkWGJ>pXB?(|LLK2xHuX}HOMKxGXG<-1!Kcuz>1bmNxX`T>Bmwm>LZKI+VW
z#MjFZ2t!lvhwOjLXz`W>zzJ6XY2OGK1vq<sCUJB=jN?fORC6cl{`&Km61qit4A9F0
zHQuAfF#1oR)8CKPYB;$Gv2#R!cP*Ozhv0&NU58!8cpfY{@bg|i2zn@!o$W)#k1OXs
zECMeRTX4$TV^0m%kcBb>I{7JebLfeDMMn9J|8O#BA8=Gd=Pz2C`Ep~hE~1!{k;nPN
zV7z7I6t5f@<OqbNXz%ul{`cw&tdvIVJ3#qT(f(X0TwcM;pLB{hTQJ)^Mhtw2n<n!>
zdW#_Y^bxqFVlX%?2%Q{vNmB_E;cI&>(zTY5()1DlbSA7`v;J2gj^P0-nK9`Zpvr<j
z4En;11-H@>gufm@zcK{AD7%PPA<%AZgT?XjVt@%a&ky4NyEr{T%BeW+5lKrTuI*2w
zeex41X|z854ea3WFTw@nPW4lPS0qybmJ)$TyJx@~2h8dz1p#!V%(@`t!ql=}l>~|y
zKVW3(;I|q0isl`#1hhuvfFwB(cE}wBz6JIG!g4}5RMB(XJf%{^ZlKSz0P&RXVch_p
zMd*qy^mqt@ErP;rY|!D~9_dNBOL4H6Ut9r=H9r;ZQ88IuJ9Qr@H93xL0ED|P-D)u&
zEP7hRg=o?Sc-Dpi3EU9EFydJRpOe&9WDQ>N!_TY6|9`KWK&9<m0HD(~gf=0W3DD_Z
z8n~-l?)3Rr#gz7Cfx`j6wrAx3K8NzF8$@{i4QL*18(R6Q2rD?8uVU=!^+?6@nzx|n
zU{M=9O$vYVp8R+D!x$&y9w&c4517D!%jxGj4mb|q889Se2)XFk^Gb!@fmR3z?)Apx
zQlV=^30)!pd#+ms#@(L)B(@%=(ejmAd;rqb0>4$Km}Ka`T}}dPKWzF(`~TLy22bIu
zkH9==RNQ|p8v8{KfCz@c8gv5nnc7aUGi=E{YxWj6)Ngtn^Y1AzB)4Vshb<Go+IOT#
zKsy6c<Yp|VY#%h>Yt)-xgwU^o-sUXW-Y%Z%HT>_p{16D-btBRU4FQ2EYZc3`nhzKk
z{|sDi>mh}4n8yvGuA1v#8|V!X&~v~EWVs%DzhAa8ZQ#7XDAYMlz5n(FNPqu-iy}h#
z7tI)0W(o7*zkj(Fnp=jlECKublJXz-^H=7(vcT0Ogem?*tXsp9RCcFOl*>GC{{q{R
zs|LGiK<4JOG*k*5IqF4OEj>NP&;^-10iV0~#{Zr#!++RqP$^qp`u);8)}<H<Nc;g!
zc^CpK_xS&~yW0>^UoxrDUQD+E!6_<oCi-@U-UoQA^GJkw*4E3{3=xdxKmXqOIp0MS
z;WqF4cu%CD|A=oeeTBQ)txjvB=H9#;8;%=!48&t4VB)abA1E}b7r~}E48RzdA6Um+
zaRhQ+?cJ?^)o*&J`i)9?#`NE1)ntSOu~q=a5_dE0ayvjE1PlQgpN9NgiuLrMmfAZr
z(8%Q2;{~$4OAxPl2~@;CLT&hp4$1FB0Gs;wA79V3dwm<}x|y|OGj`=M=lvs$i=eEC
z+X4hl{^j0(6IirwqN5DW{hrAK8EeR;qZ`E2En}>;q)>xZZ?)05AUgZx8oTzt;2pl3
zfyl)oy0;qIOTm`C_mDZ`qRXvOtB~VA=g4*?vL<|16}|%h9cz{*NSeJ3VKbg)#~>wm
zE8SKtOe#ImSr3-SZ&OWCcL9CbHJnE$a63Rgsok4n^uW)@%{hO5SKz?}jsAnD{0i<v
z-hZn&O?MIm^X1@PcErjpUN`_K=W_ur&q=Aazv|Xs6ysoh5e7f*?>0Pae6<Ix@2u{n
zK*)s#4`9JJ3}($7MJ;txMuFy<Wt3469p5eYR}_+~Htzp|Rct9zi!>#qu(-ysOp&-o
z!FqFo*r0<`{@{!ZQ+5UN`w>u)w}Sw~(B@b%;eU|&)YCn{eG(spf64VzglZjx3D(HA
z2zqcqx|coc4l<!;^LCFKd>CT#<Zt`!e4V)W{>F|(IGV;m-IbF2>8nmah^YYesZmJM
zpO1p%rlm`}pdW7w9XO;~(0+%Ujhti?UegibCp#JLecG<YVF%!-V6Uz+yu41P4u2Qv
z0iyARc}TvoL1wT6T`P-MjP>;m@g_jIz6AWd-H-~i#`P(v2{=s_$8Au7;@R&h2K2uM
z|HB#XaOyL>O(2FjM|C=k(u^5wLZ5E}oYOuQ2wj@{8aC4@f5t3&@(uks;1sRF@6cFL
zs!l_4*iozKOC!}@7W6-<xv)HGstoYzy9icRbKf;@>0<MN?u@zprp9?{rCpVX=MZ@u
z$e=&PNKs*AZ>Q{-=tek#?uVMoan{LRmVZMwV`jY%^=<m>L>Bl@aC#yxvzog1B2bl|
zCp*J<DCePUdhSTqf1Mc$WFLr$egKz)bFDrD4C{}jvzoXx7<ESP{Rj5FiJClF`+utE
zb*M?rTXWfZ!;qTMD@AqgwHA<J`dxAh5!yh6g>CgW7E8kGz8z{iLsl;KVoaO+V^7*_
zxcXjHp%Uo1Ffs>Vug$8G)|oTA==AK%T5^?Eg4c8pv@@rMGueP3;XB4r&@KD5lak;2
zTc^T0GQoI<!iFzrDu=@`co5k~cQ3J20_MO$YWSe-%Dop=JL>p$phZvj==BkzOo<aK
zvR!F-UiJBY#gJ9Bt@N0S+T6I3F+pk~3!IQkpe$!+@*T?AJOH%3r8HB&IO{3p<ezNQ
zRN9BGt1ixY9iiVWGCi!hS0n)Qc!h<npep=Erd;_i1;J(WZO^$FpWn?#KTofAs0~Z*
zBsofdUi=j<M8NVs094TF<N6b_+DhfQafdC%!he09pgc+}Q{Mm8k&mKIkw*x*cY#sV
z3eY;X!kF-H<d)F?wZ9a~y@PLqW){(7n7SKty0aqN6cj7KW40X{zn}|gt<hJd%d7l!
z)q^K+E&BG7r73F2tLS})Temx+YjP=@a~K?tTp0@0B45Frg4G1Wi^poz<jA>|%;YL+
zv$^Xg_Q2=Ggrr{9$x6vfd5cn)atuOEe+AD9hw$XwXn#~zYCl0whB6Z>4?vzYc^)AW
z+E#Pqw4(8Tz?{L@^+q0-$!qR!z=i$sK9L+%&qwE@RYdunsgv=F1|=0Q6hF8Q4BG!Y
zx92I*9vF8NorbO>rzS@Ci$cPqMsG9PDIqWMqa}N#??8b=aBPc5rHbF=rZUcnwYcEJ
z5ZTda@ev%IQt}{LmXA2sJYy01l9mfeh~_o(wU;NqW-s7%rbKD_toULBX>k$AG@X7p
zL*YBj5vu?uT_#|0)G}vd%w-xE11T6_t1!%0jHga6?*psQ_@>fGmZkT_UqdT+9H(o-
z=5Ob_zB|IMIf6V(LKomN_Z&fekwxM9LRN7^lGm*)gum~Vw=&5RAN1dIaCOuj`p!vJ
zf3a#_!=JXk!}+G#(=^BJr7H(GW&cctyDE`P?1#bZ%iZ!1*e{pL2kxL+yo$0W^`6<E
zkO`LPxUt*8UjM2(w(Rs@>URC}V(v=aPKBQHC*YqVJ;oTSAnVa`q6BN?^5!^NcFOus
zoV@uW@af@Yf%;oQ|Eb47{wz(h`Yn<uHvtkaSK+6kSeeMuw_>uci>7UB%0%;Z`qIY?
zdso8X+~aTh``~cyC~qV7ltW;9<93VRvM}jWYs?QV9v!8+!{6eqvh6p4aJ+e%N@tdS
zAIQk9XDObB^o%LTsuvx<9Rb4eRduVMkMo0juXeTOS#jzr8K*{4j&RC9oo#Cq88{;n
zwa0z8!D@ZBeY<AWPn?M%j{S<TL9kmxVT9OrC<{%Nc3w@dCPymm(}cSpD)YdQ=?}Ql
zVF^HP)_nxo?S-fB<4Moj1zzIah?fdqC1qv(!7jccbqQO~n#%(8xwYcwZ)a3Cn>9Gx
zQ2mg1-vZprZ!Epg>Z2X&IsCocwrC=TL&4j9_AnJpO~~_ehS7@0qJ9nn=_pbbCzIvF
zl1h7{;%Ixc^;tvtU7L)#dMG#E!2^YwWUMDy59aKpmLavS>FPpa`rxq<OAJPbOOUm_
zv`Q#pv&LxQ1kC+A3T3#c;uCyswAA2a?~A(3?avcbUb`h`eT`>3Ol_V0I&C&jNNy16
zPWvL0%vyyr+Kc+zaQf{-R@sk$zbx;TAYT+dUSij=3ihQ+MI+Rc_d=hut~#YvA<fpz
zjF!yJYx~7n<M~cTAd%m^^)YtAP_1j?WyN135bpcms(0S}mli=8ZZQ~9VA3`8tJ;7y
z1E+;?pwHPK>RR0rD<=V9A}X|i@k==fwVLm4>nwYQ@VQY4Z>ay6no=J>%Ni|*&F^uJ
z&GjRc$#6*7-I&w7jq#fQ^OaU+7<ltAW}{}Ac$KtzM}yqfaO2FJ_F^VBP*wosYR0ah
zt7J2=|KqbHrl9RnL;uc4)@sb;RHS2L2(|OYxijN*FV0|0cp~EEo?hI0Y}No5YgRp3
zyYJeIk_Yvr8JWf38V_TN^OE+~yN37Xv$)MJmI<&d&R|uakSyF7rRGi@%)FS*uPmpW
z7~r1!L>psSuH*85UI4=Yl+I2-%;rP+fBrUEHx26y@j<9V%AulT{^gV5*lQD19FFm(
zU{-1vj5@L5Q~PIkc$dOVRD9o^-zyE4H4~Bk<$f1;&6|8{3FId9amCPZC!czh_1ZhW
z&v%c920lyH_8aO>a(dY<P>2ebp9k%WN+jzBcU%KFZcC>tOT^uFKoz0Ogum0Il19*R
zhERXB2!Y7LS`n7dA<a5wM_@(5T45Do-BMb%4D`j{F>CH~L^B9?uj%(<B)zz}Z{E)P
zuxDb{sVPICuKS7qMA21ek#ciLwyyq)i}LK^J$PQ;T_~kffAxnFjYbQ%-wSf>`u<V|
z?#l0M@-si%kHsI2g;Dcv22&D|Y7rEJXIhrt#3`ojt$(+4tn}35gopc1$!ikCy|VTm
zo2U9v;DSyw`_KF`Ec+(I)p!W%IgjkCd&TIk;9U~!zRqi>2ep(UWpL{V;wRwQ8@7<3
z6u_p`7&|*WtI2ygB%U|%lL2$#5Q9=8{qwrU8h+s@#?3*)yHU8&*@vo|lS7vUkzZ?V
zyfRpm0n%!<o?Qaom{7{APu-X%+v)W@-dOY9i;*Ns3fvXE6(tX4+Ut^(Sla+|Ci(qE
zF2_`H@$nY1<wki7O}}*)uNlv`oV^ZTjd*oB-g#D8M_9nOxJy4NcsYGn!O@NY<`PRG
z3s9Q}A!WHj@VR$7$u~W1kB!SDd<Df4ERDL>QRkbru^jTEIOQDbSw{eoGxc+`tM(4H
zy2QE^WAHaW27^D+>bK@ZrWvPv_i!!v)6<B(Cy1CzC8-w2`Q|+f1^0y>Se)u%wv)o8
zO{qw6*j&twl^Z8*ADT>p1UAk|aBBWnn79E76KUG!^Pq|b6(&T2K#Ns=BR;djgch4S
z+1nSPL_82a?9s&QVOS9UkYQJ7-a0XGT`vYWdSr*^k`u|PjB72lEC<d5&kV~HY|5R2
z!xt$fO4=oB{-}#rr>UmZ^1_cWbb@%bYMv-FN~BBK>D6<XyO_&`uV1Rv-t0WaC*a>}
z1~BrcNOfdB))YEeE$hFPKzlmw3}xiw(~L2LgOd>@^`#%`&%l}1r4wdTY{p>pdqhR_
zyGzJ2E(7Lqp~4G?J%(^yS~99P$T%rF+kK-Clc8#_ohUhQVUI;-B&6GQr3Ow@P*Nvq
z%(uqww>wc-SN2kMrJwmp;e|sFF$N7~$$8y|TR&}Lyw+5T1yWH{wH4F{KLx*Q$|^Qv
zHqURWk~Dqm)r3rEn)g!-OH4tX!1^T9b6j>m!@f;|WO2;v?-~bG#Bo`N8Jac+e{=)b
zgQ>^G#y-i!s(wWnJz}jZB}X0PuGuTK(749#uZ3A-yzi%hs9Nr+^Q)r*VQqhbIltTL
ze;;L){WI_frz0+_mT6!O%6xgqa|FOx`Jync*`mjKXy!wLwEp7YZtWSR2qD~^UZ7i2
z<v9kGRRbrnX*#Z(NH63!ELvZ36a5jbZuFb(Hcb$9addUUwNb{4@57tkE@hqJr8HxS
zV3A*`w5Co*IwNIaTu3FPO7s3*Q(Ld_E92TOEoy;!+(QB(S0-*%OhtyOTsr|7MmdCv
z{<hk;A_a!x6el$#egJn%YoZbQT?uXr@)iGn>vKNOPN>9shQKh>bx}rM^?a{Go_AAK
zl=fx>3V}JDD_tFGO=?3t9s1Y1y4DE&Le|alLnXxfOw9GI_H#UdR7t(@QsS+P=#914
zRv8Fk1jBl$A7Tfn+w1u#KQivb`*L`%r&gfa;@vrQuT`HmK3|x&TYshVu{n6%sIwj^
z_HBDp^@jqubK>ljoJAYpX6UH}hB41TIaRsowGMM!!mlyDhY-EjRgT8#ga1h6tETd1
zY+nT-*$$>u62!Vet-=`#+cw!4GmjGW84-de4S>2#0OZ%jl+UW65=MmbBN~hN$;{w~
z!K6m+8Qj`tOUl9{A@G!t_%ld&8)a0}Y|YuIWz<{pHS(ErrD0<jB;SWwCF@g(QekL5
z91UIb;VSlpBZTA@vv@!6#dt@`swsk0AoIhJUWC2<k1uAp^=okMPK2KA)ucb%nvxq7
z9w3DCTGR~p#2c<>m2ej@(yj-a5csI2Vv!-Uaj6p4n(`F!t~|vw&lnBDbI(}WmiGa%
z@=>eW8IYeY=6R8N0*0{b6BZp;0$<)*4-0lpkRl3g&~=|{3fYuj9i!fjSyNr&SEsSg
z2v+Gr@?3V|QvLoO2mzA~jBQ`M55e63vpel?yL)Rir4a|0%w-g^Y(4y-kMLOQneo;r
zNCipNSFbdqr=nR^pAm`8{$tPA>)b9)Iz`ln1#W;ux-2vCs1lAYn&sxL7M&3E&jS9G
zFrkP|P;2+RSDIJYcZNGn;IRI@^94$7%m{*6Vb4cCnI$d%Cba&05K!$j*jNZE_53pp
zz{}CCmL3nVG{)vm%(9{at{-1~rZDouaEdzQj5sx&1pWen=xOI@r5*q>{@9~vD|##P
zVjCMv3rnMovdkTu%A93vBBC$YbtPY~oY}LDugMD`Od{k79!_dB-1WmGqgCY{d=z$f
z-rgGGspZk)(NB2(=6=vHw!66z6HFFa_#EAmL9Fjm`ZX^#>h;_Bu9Ec)7~__U8PQEH
z{Uy-nTP*MuSs<mZwXfazfQF}v<IJYma7K+L)*N*b^9cL(3`+`m{}4CfrN-LXq$xSx
zjM!P?Bksi+c`L!!Xfv%B#Aw)8{<BekT>%iXpi)=P{?kNi9RF*(Sjk+_E9<E|-l$%8
zy3yXs_#h=b;)+XlJ@e{&jJOfsWPSQkU$h^>m!_BaJ>t-u8wk0M^-F;d3})-?#@oL*
zl2W1?A7WbDu&{Gk_>ymSXH3_c+Qgdh^~mpxY&#&9K!o1qrtKW~SD$-$0i-<i&4thZ
zH(?WUO7oQLQ|x}sDR+=Bg`@PlmD<3nw{Ua<(zz*1<#8rLiFJ>@@TN5pM*|vlcYUhQ
zq0NS4r_`+;B+3}d5&RN+@s=^Y;jFa5^aRGTkcqnuXhB0Jr!RoJE%Kwi7WW&m9=Mij
zu%P`cY5DvV^%EEqXm=Q31N9yFRC%e+Yf9>F3V}Wd-K3TR))aM2sxcZ03u===E5aN*
zu2V`@2g@=0y^nT%_*+kW-2pZ0Lho_zOLK<pctSvwL_@z|=-D+6>&ff>tv(476T?E0
z$`L#r0jezseqk>QAWY6)`$qeApgHBXx#B>63Mu30oBo27<?BbcvOu%LX(z-={tq`f
zC+ca(AISNZ$@UZXQ40k%8N*yNv9v>b?(#;emoAc1yU+L%f^cY1s2A{Mr^l}PBa#pE
z0>*e<)x3Nr-^m`sap{o9MqpK;`Z}WsW2Z=D#NbHbGPKV1biq1b)HZzfpH?6#SXt3n
zB8-2}82c?|>P~SVSl-SEJQ^=S_rYfO+~{>~w7bzVO`ISl8}ZIi*zM4@=DfRD(UL+V
zbo`o26{t;Em!HS@A(Rl8?_jkE%v5OInXgR#ZNG!UDT(ks+`3nAl+(j)Pkh&DRui;(
z#j5YT{8YJK;Z3{<0R=;L@TpFqEi9YX2RY3nrM5~lNpf_qI9f~|NgY+j!1vNyP6RA;
zv=C{o-oXwai|OH!I!8HKkPkvc9JZi7$pNVdNxUPTFByJ7JaGr^RkpbqPjzOqr!ruS
zCL73wX5!Fo@vWTF8waFJhcz&z9kmy9A!z=NA+M=iE!%2`fNCtlxoU#Zpn2@Od23np
z>SpS}>4T}5P_6Xp4+x)0EO3CocNT4-`F;Vr6UcWXf#r0SG*6j?|A6WLkG;1Fi>i(L
zMx|4_I|ZbN5R~o?5g3r}mJkK$?hffzM35AvbLf(g66um25Ex3odwkw+zt=wBhx>$!
zW>{<8|NMo7{d)JSepJ~TAp0I?Vo|%*!X)wa$(;PhN`#g<xB=hHwy8?qXDj|`pk`Qv
zx+YTms5<_43tpi9?Zdgn{C{w9cbUPosN34+hhQod(hRrfE6bvdKq_Qp?O1(K@BFFL
z2&gi1OC#M9#2=fP?RujKk1v{URXyl7)=0LMQk%;Ye5(n(<eN<C-GXy=@&qToTJx>W
ze=C-E1<mDl2`b_1QTc|^`%k<G@w|$Ami8-t!rij6_+Pm4IlxBVT0NrlAC;FH?KK~L
zYy0RC_L$;3u_FLg5TNuNO|U#;YuUz$8B6GY<$c=cMUYMFv8_~pg=@8KMzW3R6LmLS
zi!#espY2DN#gF55v}RFoR&F{?g$7*#uJ)pCOJZKKV@ss$(CHwkpIAY2m#rJRq0M?^
zy)TLBhCki$vc5;&P7#EV5u$z1m5)<DKaEc=D~@A(O>NuPWmJB5w7utHL8a-2`e~n`
z3+}kv{11RmCdxB$cb<EEja`UbE-ZQRX0?jh9R=7OsalR2A|P+G^!lzeZ24}pI39^o
z{vPdZ8kIC2Z8sk^#1?|KM(LX<|7^Z6%@d$B9h1KfZI9Ys2)sG8XBgJ=&Rg{UfMcFj
z4yHN4Vb%7?*WRSzD2Cz;J+h)}QNFI^1%n!;259L_&Th@R2l{Bw6vnpYYJDBdqI~<|
zN3!IXjF;blXh!UQ!3}*?fJp0y2A_%bTND2HcX-8^*4sjt4Anx~dvpV>14Yd$R2v10
zOZl&_gTP~*y%PbsOpfzg|LizDs<_({PfaIroN_hn<#BA|))Z`4@Rs)|*LVA;W9P-c
zO+*XP32n{4r4XnTemetE58urkroX5R=n_Be6dy`R`kb01O&KOJ7Lc6@)7Ox+IYU*;
zcR$JeP_-F(sOYnotdoBGWHV>gQKA4>;)-Jht61p!_3$gRyfltEi3pB4ES#hv-gIex
zW?Mc!#auT4RQ*^$IJ6Pf_Rt5dtF$WO5AiESGm<vnlj?^Q7t99{`omO&+eDV8Lush9
zRmn<CxIaFUiwQCMrm(#whn?*9jC`!LrY<xfIPXaduVt2S{{r@kDgtJkFp-$A{evm2
zy?&_tWT#B*Za#_4MKsI$Cm)_o`9})gtez`jlWatmC&%5>-*N4b+OuTX@Eypr|1Pm-
z<(G{Y&`+9h6=P51RkbTC&xq~Z`vY@A;)s69*#tdtKVMu!ss<>{aJRj-Fsj%z@|s}g
zC#(!uyV{WRJNfw1WU(Lan3u)qLq{y9>ce<h>|~vnSo2%!2kjPI{>JnJev<0xwY#}R
zAY|Y_*trI}&XWFr1Am3071FL?raT<hTodUkF+JYf3JQU1kfU^%M2W0IizS?9J`ico
zCx-OBz?KMaAX3Y~e{OKXT!vvCF7bq_^6T;&=P05~r+~7+9w#w=IqWqRU)df>Gb_i_
zYPt5ooltj$4|AXjrm?jCKE7~pc#;Pwu*gZw*XQRk;%Vr@3XBHaRuy|-%f{4`Ay7q)
z1+{i);ZP|bv7@}n9^W@pO7Y3c6WuJTFmTWYxbvf!l)LM>&cr3jX#ArLlG|O%PGV#1
z0pun#T+35cFo-CS2#Y8JThc1xNl)^}7-HYRQTx3IAwtC}CV!89QLY(Z-NhC<lCqG_
zp$rIK*b>6k0*WR+uSf=1EqI4xUw*+Qa8ag`eQC3BgGM06<aPY9X?86D02ZQ{!gV$a
zT#4m|1!orYEgAbG+;uYjI4mFUSS+<XF~D}-6qP^mN-8EwrRpq5SHhM49d{!zCqPKP
zD<t<?u)<q=u_oZ(dYS<V`49WJNA|Ba<D^IVBQt<F6A1Nr80X&Y^yunclk>tZXjMK(
zi24ma+gxV8de|&PWP5VTY=~cgP320(z(&#6_1q>!k;NYCX6;w6-s=l(jE(p=g4;w4
zfWZ|_%_E%%j~uucij7G>z=FDVA6+_J7%=ECCUG_xdg*JJ+wcHAkg}g0C)3Ck_XVi}
z&UO4ZPk<5$4_2~r#@AaW-Y@_z@>zO5cFd!7>0&HI=ej=Hd6m=?CuaifgIfyun|c)n
zA4)SK7=4&H1-@E*1#74%OG2Sj)|7O*VvGy$9AQQcQ2Cl8r25_x?syqz)R5S3do331
zL?gPWQp}_MwKi)%8hi+QWz~gbDtP?#I4;t8{qDN*`}fhq%>_$a-Xk1_3o@^a8`L?0
zP=OLSP!W~qWB>!c=r)teTBX%<f$>4<jN=!Emju<UFm;%&XCia7d(xJfyyUMpDl2jR
ztK%ZUm-ynbM2*y~|J{wS8_1<R1E-IJ%k#v>g3m@bNHm8`I}Yc7wkhiFvx@)vlm%==
z(1#TM@FmJkN)I%ciAIb+-=<-OHv{IQ+UxWUtiaFNuUeSxj&O*ZeU`9e=@d7Y7k~iO
zST74pP)wZ2wo+cRa=353#aY?6ZKcW-#t46Du8BxmyOgvr6NXJmczBr-UxL3SBjJL8
zb{DNUWN<-tnd$N##N1N?$V%)eZ4!BCml8Bfe~1&U0_Po0>l-^LxE`Z@R?hX^9@=`3
z_|99m%OqOh5mN%4FvA~maV?W9oJ1Fn*Cfq)Cf{P=CnpqVY_JsN@MeX8gB_MAU(cU|
zYa9LE0)so5A&pUR3_Ynd>Hak7uL*_>skA?N^EaY8y81902V3Pk*gP#J`A=z6f}$i0
zHdn1WAag@wTg(yNo&~<!F>SgBZ#|itvn=>16ttMCT?zZ*0<xU1oO1$9x`ebt-Q22Z
z&w_wsrrU^VhS;=J|BPW@Y@X4k7mMNjIb7fBCWeKmi>-a_fw#Z%zohy_HScksD4Ow|
zP9W4g`~AHM8NIq$5UoCRUf)UWl{-y%aHW=L2Q3vX^9Bny)vJ~HxVJyXjUBd(iVsyP
zj0+Nehfuy?Q6}-t$XeNCw(e-9)L-%w1Gaf%$zG$1-SY><zJdm$F-H{R)6sO&Cl-8O
zFIWl5yi(hwHj~1<{yaW*`_cFhYC4k;{P%w${b-^0wDaH+@Xe+a>2QwERX&P;X#cWy
zyvO%(()vKkAjGq92Xsq*w*)?V4`TnTr~J%<zrzkIzCF6`LITXmZXzi03bX*Mo!}VD
z^9NZRt@#4kWp&Zd&8n^+(YLgDVJ4#CV({?XZx7nSVk5lj6@&vgc65wwtD4LgIyfSI
zsE9twejvPipQJb|5Xycdz}FWrzmnz_bxA}A)ylg&J^vQ8$4p3C`Bjh+XQ7ln19#v{
z{ebo!>0K%R=yw>kXzk#Hp;T{%IA~{&{U?s)B~6}hR&?hMV3dg-e|{;oOsyy3GmoWB
zrfs(rBILam0DHR2XWYN%rbx)AJxI^sbMONfV^%=ji+tJrrAL|R*#hB<+?U4|jBQ4z
z5yzY8EO{|kd#XRqqzCLTy5pivz^1^;ylqpA@duox2QQhDxuQ?*IS#pXp3mBMl^=dj
zvy>AC;k`>tJJjl5Nw*aXMB4H{ns!S53cL7+Y&0fat%&%)c^{t&l?o#~Be@l`7d<S6
zh!H>0IMWyibCXcDQsq(^v?0ud1*QV80%8MtqFM4H1#sn>mr6^T)Tug;n31|tM1zAd
zp($E-R8KaWEq~c6(Bs4WpCIt9J?n_7*UZRW)jeK|>8d{J5)};h6`6IEczD<=if?*3
z9P7^>PSsOzc-7PPWVa}+XE1_`MW?Z+?oJj_95hOLowhUj<<`Wi^R~@CsAC6jAT1~+
zndkHpq4tvblx*MfPfNKu5P@UGC&ODO>A_^Sj5lRxj{T<QNkATvW@T!7U0{#RFGg|o
z&Fq3ZQMrTh_F}s5k1&j`XG;qpKW``C)OO+dB<m4fO3M#ny5p({4$IPTVG@!ILRn36
zL9OuDVEsr~qk=guS<Vq5DlCU!hEWjbu`fCD6A}|qan~KH-Za)ABi9VlE9;Gpem=4$
z|2O_Z31yb6H~3?5NtB2P%Xe+O4QtQr(dM;e5%!@{5CtMeE&#LP${lR^o4q&5{{gmW
zw0wtx2e$3dw?4#%sw5`;P~kBUdpjWursuz4P2thLP%Ro8mY8|2{-tqKx26T8wWKHt
zMKdmQGYN^n)ikIe1p^6Xfqpba>p{J`($fPq_PTSO*3sS<JsYSV_xTf^nZMh(z7ZAM
z=#xvNX?qFP<ic@rUwaMM^8#zxAx)7dF_dZ#M;)sT6&O7wm|aq_26|~5xQ?#+<D4-C
zLwpm&qM8K0;a+MPh+AVkmhi}m8{K&sQ0~X=8w$<F3rWcMyE-A|Gm%0#Ab(yDM71FO
z_b(jtKE&;Zc)(b(C&}uf1KvE{<Gn3&hJ)>C%=ZzZ0F`^=0d?iBhpL^Q|6Ro25n#hM
z!yBzd{y+U>_C0ZvLCcL|+LVC^6Q0fd5BkfA#+!%=Ea^waEc_UE-q!O=q<w>w!Y_Aq
z&U9E==di>bCBm7Tkf57wGqOX^@g`zQ@gRT4qd`AUFaXG<4Xc{~5W3NX$7kq+U%<jT
zH;#In`}a&KH?MBFUm`l8C#>mpwlAVEg*zHGNZZ1tBlYQ~nujb)pgrAbgrHZcSpQi2
z^r-wf%UmqSZX>|c4j`r7=JQQ>&;)Cu|1#5wXIrRBJrqmpjBFF&L(Sp{6E!y-W&m<g
zg9E6KJjat8qSnovQ_u!5@(0VeO5~TYPA^drtvZzAfq;s!d8gX^q9hgNgfP!^NznYh
z;OS*m%ihOr)t0fhZb{Bd7-C+`^CmJvz%3Mm@oGk5iF?<5#a&kn+G?-8>9zjGU6Ah-
zE9H1dGIo7g){Nc!C-23tUwL*_d)XUNflXhZZVH~V-K1~~_3?S>3Y^~J2UisFfi(P?
z2~16R#T=aK|4YIvI6i!FD&y)bruY=*cmKS)a@e;3X&?;VA19Sov>+d~F4Q0Ir=}2U
z9Ry(<dIj)z)@S5xj|J9z@SVc(NcLL3KFe(i^El8($n&u1@+-a69o)UyZPdC?RSdSn
zk?o+>-(JS+j&x<CwIk@6Y_E|66mr3;)YW7$FxeU}6KElx1-58-qmocLPL`BMJtRY=
zsuw@?<eR-63|?^21vH<lNF8ynMX2<OZnMm>$f4sT5I-gkut2w`k_O%~-H9Hd&g<sC
zT#kMc$4saiOAzgd(5>MLk+k0oTXx4QER8x}=Ea7O*O~!{z^GSDqerO7K<ih1x{34P
zd*N9yIJUb9C4-z3Aa563eGE>4wlu&$2rHda1U6}>HoXBS7oazf6;bbmCM1N=i`X6*
zIKBPy4|Do|HAjJ{Byx%5h%O*1Ko*d_;iERDXKz4wdQ$dknYZ25!YHE9DhT=xYvVo|
z{v;+prop1DjLhd>qsg*7ax&{aFc$nL24CO`{T>nn7c&mrUuvZNqts7dbqVwjDq5Uu
zHd<0`xaylk#0GlNDe)8##p{V=QcchFJ@oFl^)0uaW%M!FM{8@Uh8xSrD<}j%aXEP$
z;^iU2H($qJ5OnavC`9^<mY=fUnF-g$SE?ZSi9#ORynFk`&w=wK0nB|Tb{rWKLe<-~
z<zBJ6R0iwW=N;!{U^V{tg%87r6G-n5oEH!UHz2i@@Z}3Qs_95+Q%HR1;=kC(^35G}
zNJZg*dPiN9G_(sc*)goI8Q)?+94PQg(9Y*wXm>_@mXj6QO^*P5ZlZlqq9y}*#dydc
zvjp<IsY;Sk*jH{{mhXb!nmNW#)`cR(ao!4-pXE*e!B%qodwYB}QBeSEX>eu@0({od
zC5=4NR@nPW->jzct9+`hWBT^3=lp9?U)B#dxqJH1(Ao|pioL=cSeh=rv{xLz`Dc&!
z20-uhv|B#@&n^0m?*2m>z#CYP(Uk@O)`7H{2K(2px+6JYX9vvG{Ts4~qMnlgLR@ca
zih!v0Cf!wukAuwMmJI$4_m8GZN$@TCNP`A(#=HN)Ku_SDloc*9004s5X8wtORgZUJ
za!m*DRthLCxBl0Kyhh(+zFjR=(5e=+?w|kkb(uyLrv3OA8!d1<_PHMXQU2QcT`72k
zXpri*8_<1?{}Ky3TNmT+Xz7k6Kqe=@diV<T`zpVe^b>Q(ThhWANwW9=!sZo_W&J_p
zTs67C45u|ymXOq9CNyOzqW||m7Lu+?B@|8PCre0p0kttJBm`x1gZb+;m=>_uQ!_iX
zbFtxc0!)_mJ3+ZNqF^X_>R;CXnpScED0Ga_eM*b`2LQ-Z?OrL%mYWzr==FI21w`8T
zj0zRVVs2_Uqrq5@*t|v9eHKSYL+?7dX#gv8MLJ2959zjkRPOu-WDH`K3CuSK0E7zw
z`;A0Q8+E2qBBiDr*ouxSPkR3nVk-m8RS^)7mCXDw@B<*Yw|{w~?u{?jto-{&{@-n!
z0~hM8Ws+q#uKulT5P*~SJ%u4hcRcoqv_aaYS7|3<#WFvu+^6#;h^m}=r#F$kWva$k
zG8GXdL>_%lCYWuCr!Wv8Ai`B9BwLNw0!NO0{;aM|Q?5mw)mt$WqC>GQ07@-E)E~%o
z)@1Rt8RvdBMgC|Gk_q!a%DG_H&v_q34(H?>v*ix3qxv_4LmEBRDR^i{Fe;SYf3ze}
zKx7^L@@p;rDIitS^=O-uYP`P8ThvDWu3?RPDBFmQq_1)O9Zl+M94mydSD8^Dc?={F
zeJY520}twf#gJ}<+>;|Id<&#eiBA2lgXB6)uX&I~N5TWJ@|lEr{|_f;ibSTUyQ}L{
zVfhF)KY)+PQV~dsP)XhW)@udrKkH0UL;18KmHF*DWpC~3kmqZO@YsR<|LC2MP?zMJ
z`H`Af9!W=Tf16$a9AVWLz!X^EX&9Zdo)`lg*B@Gv=g(MO4gl%@>soJ=vZUnQ-=fyT
z?}7gg3_o%(tY~olA7mde9t%!cmYCXzO1|eRj6|#aR8=qzhv{v;l#<lto0@zLAQmBN
z!8eY$8+$lt3OO&e4~IMs;@%SeWpw9%&j4eMk&^t+Np~2qGqkCdWL^KmwQ`c96!4v?
z^{Br&c4YNi|CLtL`gLbyu6P3ZB7l_S?~6d(u;pdFxSa!6$va278A|Y%W|1@%kpaz5
z|1FUJ`%VA*HiQodyZj<adG)`a`Zp=~pAY~4AOHX6(m4=?1c9T}&plGlkPpW*M}!6d
zamJ*%pu4%C^IhoSzeoBm1<CcGxSakAXcLkx<$*_Dg$*yYw;I$YaJm4l^0vIYgG2Ga
zPfq>j;t{F|zsVCMY~0WPu*(X77w_)JTu=9aanI+I)rw!>kvQi#0igDUyUUdqg@M<O
zU+<HL7XTdh9N25p(gKn~wDkFlit&G0Z0o_v+s@lQx8^MhqzYPZ)}EVlC!&vk5s34;
zAhnXg@q5=A<=9Dwg#OQ0cU(L9{k^;Nv*{W~;$8b`oaX@=(*{VF{>`t{WmGKsOS;;<
z0)ngyFt$R1&0n_m-qH`Y?;*8bF9a8L(f9sWjV~ngfjhvA|A^rza9|Aht^4&abX)rN
zU8eCckh?a14md9xs^04IIvf8|2_`y{$^mF9QU5qC=<Tig<3M~@)NT}XW6AOd>gLx^
zUlUFTilf%901))ycSG~TlwDCk>7NIr8z-a~0lFADF_j9s&+co$?O6ax&>Fdi``S9b
z5)%C$xghzPG?~bv^8$IQHKFGqfPWSa0YPd$QTh*pC=mOSWhwaYx7LRvJPq++)8vhG
zUD)MvG^|BZa=~=-Wc<nTJOH210)M@B(RS<x#Od2kmlbfc4uS5k;jRL|{V-B8oYwVo
zO8e<BH=Kwyg5|Y7eGlQ1auNgY<R@<Eb^8jj3<4=?cmR;WdzmM_cd?W~93<NjZ%bpb
z(<49$QXe0_$PBhpYN)D+HR)r3OkI2ha)BF;I1o;M3!vFO+W>l}rxp8@;1dl}aNYtH
z`3N2YwrWiIYW|ky#v?f%050dt1>0dN9YDGhg{lHq6D&b{Fn~<LA~gl#gx|q7!vVmv
ztJ(}(L*K33(4%<XNc_D$y8w#q3WGt0^#0(^9Rl7#Bbfa2hd;%UHYJ`uPC&+-l+WIL
zApT^sCIf|+Odx~22*eXy!A|J#Go!>W*#p&@BOh#U<KjT%Xfh;FCRm|@>giB!x$#@X
z9phg_RQ{X23S@GRLSbJkr}=c`s@*ez1Z=O|Za?acvjqF9!&8etjJ;2yU*DO}vhy8&
zo4|JGSHmf2O{1bO1~RQlUy$EQOvANSUZ@#Oahw=lNWmy00t^j{I?#db=M}L^;=)e}
ztZUZh4(n2+f2VQMy~t;V7-(@sao`mUyaL|dFiF7eG@N<-SMon)Ds1W76{Y%vBZ|AA
zS<|PBHJ!w}<C5IxS3ozufi+Q#C7h9gL=B;8^V<v)JU+t$xK3gJ010esJ1>H6p0o8I
z#GL3YYE@YNOB`0HRK`FucSRl{4rp|McVba>+A{&L8J3ddfllAh^_fxc3S>x-Ouly~
z1VGvEII8X4V@|+Zi&Rela}5V!(90w|s6GSLIQ~oU2c4wR_SkP<Zrz_`l!%y7-lwz-
zOrQep;f<ZIEBY6KewyBs%#T2nzlagZ$RNbhHwB=4P7&~18W{`T03Hg6Jcme8_X@r8
zel^!1fF@6H1rebTsU!aHoP&*>bHgY^eFPV)A}LAo0{T0ptlg#5oUI*+buaC_D^3C-
z!Y7_RfNj7aknU=U@frcjB?U&H0kJSU0QeYE+k3ZX6DvDXfO2`)t}Zg02%hG}x@Sl*
z!Cf+VKTzz$UMje16-dpoyOWux2vyY<WC(k5HY#MD@ddCEhi8qQKS8hXQZ@nHF)sWY
zTMK)P@RgPd(a(qM=WwtmPms_9EXhPyAQ}Ay)OQ2J1=!eW4>WOQ_dy7%M|$MAFakWb
zIBVG{C`V(Z2pFX#(35AB`{Ub0Hp5v4EV&a|xxFR_kbE&9i{@3DG&<xs{k~5OvizHv
zIq8s1_*gULWSZ9L@q0`3TWQ1pbbLb|q9dibT>!Fs{5C1y1pw~HB#sp_BpZ#86^}v@
z1^N>9h(wL?E5Mbtbqlxk8W*<*yS}DTL&GY?bF_MbM|%w*@;&t7`?}<r?f2_<PA@=>
zre39f&-OdFVO5GBEC9!1G|l6y-_=K@1)$pcwz*v871Dah2U7j%>Fx&LNES33YJUOp
zP&P>U#MVEP=H@L=et=C~X%ZmHI}AxJgDfyPfE0rFDlV62EY~h9IBeQToE!jb$+oTQ
z!4($})C|8&sd3u^8norMnI`#7)3nwT+aq|IDKH81DExJ3f)wG{XmLOO;F2i5?U3=S
z`V|2YDo;hle@Js3WQ2NX@%i7sy$5KS+K1h<atKQkiBT@E;}{oEdg5EtpkSwzL5Y+Y
zfZ(r7hQ7f+r1_|1aL+2=k>Qu)iX{FHgcY`J^z)+yy~ZQ}+;!My(416;e(vclo>303
zMCwGH>69M;xaP-2Wd*s*iy=%x20ETyR`2xYGN}#_S|^oZ<Fqf@>YNyVs`GliDBRr`
zd$dBgF`o+_KCZJWogZws4g~)&LmVJ;zC=}){@2&iV_UcnZIpkpaAv+m`aUdRG5RMX
z`B@7^vSIbWLBpOg5MkMWVWk>*nWD@<-Tyf%#I_^d{})Es%m8w8cCBLm%ga9Cw@%G2
zYmY4Vxd*VFoE!jru=Z)GKR}v)I1ywoqOTHVyg=o8&e)C)T+QR%4ZhZc)OIka(H}^W
zLXJ_^))Jr#Kaj6=8RvJq26&0-j>0n(AYM)(&vFa1ZWws1uEqbI-&JF5X{v#OUbgrR
zfEYOWTlbn__0_ZQ4Cb6Hnu@Y1MA+fq)OQuT!V{mqqkXJ`rx~sj0sDp{f?_U*hl9ex
z#9Qoys}PGi6~~05NhDkLK|7%1(TlsxqkWsB(LArCY_EyYcITj#_MJR&zvG4d&dbq-
zed$UCKm-v&Wg3C3gZjk&H^6Zz(qN14O_wKq*Jjm;1&CrvmJLt{X!Lgk+$bHOB=f5)
zZDxx&XEx)kJuLG;(Hn}WPWphz925(<ewpTh1ZDGe-k#<_tlAY#O)jgO)?=qe1{ZS$
zts{p97m=uIP3pR3E7PDuXLBKq-O|Z*n09Hj@7<9?S4`oN!jFk9Y%$afdGNuoW;WqG
zu{-+1#9&hCK5YwPSd!hF*jf@JP*AX%*fV*b;+93*UB@!%P=IzJjy&u+lp-c|&hT|W
zR6I0>X#@0>#7Gv?<&129;N5k0JnXh2G(L(kuxY3XEMci-Q#ovs>w^CNdB6%Ms2k)y
zlUPp`1EaaLlz9UpQ(p`tXeAmZz5uld)c38;%XJPy+rQO6vBluiN7N^0Pf!zODu~_J
z%V(xH!o1MwLKvd|y-KGB7~!yVH3U#*tv+|5;wj{Olzanh8a%Poa*@dODGeyavo}az
z4`I@*KAd6K`S<+P{BEX17=tP%v{taq45kgh?XZKaKTxQ3j*fz4GHn1Br5l8#F#6uO
zZ^V9z?-F47XegnwJBwCLji?;3Y<qyJ;gfnllGC{eu>G1!R!@izS|)wLYkt9g_4-f=
zT+CM!!G$Z}VyY#`RK_rhT4tFxG_Mj)uS292xzBC3@Ch`!DdK%gXlutE*Ui0ALd3=`
zH5>Zd$&&6tqx`U(b1o&fmP6<mHp=Hkrg`AK*Ez-2LQU7njtTgxu*)It(G4U^6$wzC
zwF7-;Bfh!q{owd`C}MRgJRXc3eSbnoNTWqT!KI-PSpyjXd^W-yb<z!bpgIe_G#w)f
zeU!3RISN@191x>Eevs>86>cW!d|um2&>*VL-~QPa;q*+2Bk)w`;&?FuT<OCC6zNXZ
zRqS+{5CalvPMCycj-=l!1x(^U$l%EEt}DaVQ#T&4VeWQ80ofQYEU~ufrLCJj5-ODK
z57gmI{nWI!8F^5hxT0*S{WAKyTl(@hK}3yK$-M)m(Y#-JdV$Pmak$>ClS?MT(v`T=
z4%C_n+l4`YqJOVILpI5`;Yb?SOVI7+LNs61fYI7?;#Q;c-0MNHR|oSBAa4m#Gk@L7
z@BpgC^pHd<!IBvWlBGv65a|nhTERfgY=twUNn7tnjGf)gYi-G(I-=!LA>;O)dxNVy
zZ;F`}<r8hjWK1EX8k<p1#)4rvno_w7+(EjKXXr0Wxm&u4fBqf>{xkf*FQ$FBpQF*O
z9qaq4!b@wG&OhfhNRxO9{lvqw9jQz4$%NPs6?GqC>OUB@bLukh8S>M(>ul%0HUk_Q
z-#tmqnwo)$T&eC15W}?7!-+(Gh-ye*ZIVtVRDR&^{P3$~ciL3<oytt7`OK~9v$~WC
zz-4J?=zN26&z^mD6U1XOc78>@zp*4TsN)}HV)~PlyTh>J9tM<TKUH`%RZ=J9Lf|hs
z7kApZ$5mksb*8cN1?g4cCvUj*lAbVAg@Xa}<bmp|_-4e^+YYw99M+Gw&zv+H-Zv>D
zby*-zGn}nE41kpA33#m0c4<!d!Zohgw?I^=VTEGsF+2CC&#5Or-dAgQmzFA&jtu~h
zbA?Y=3Neh?0+ylv@Hq3)1J+SwKxIPm<NYrh3ds`+S*5QhlRSUezM5a>PlkhPhVnkT
z%Um{zG{j?Fp*68yi{8d2pM|xPo(4-bef8m|`1o@mXy4!*ztV*i7U84j2Yzm#@$k}6
zYFw|s^gSz#Fse)vN{T<9Qs*aMKD!YD)`KlJZJVj;o1p%OP+$IdA^n^%Vemuu#1^Qg
z(veiZqt;u3Hy-F}VZqC_ZDOd2!QgqeVbb~BN2o1Rw>PQz7GnS@6kG~gjh~5@A;75f
zs{+zT26RP_jIN)J?rBS&&x{I}QTJZ-#@@>$gz6HnrjUI`nlUTP5)|B8a>-^AqMGdW
zOR+%N-~UuQ$fnIggEFF4*;iEvRIY$poMnpoj18)a(+ecR8~{cAq70f57o9$iK7pk1
zCDm$F5WZV?BDPi}r;5uCz3I_6G_|D&t8#=8-=~JaOa$CTx<5tU;LW-=)jq3e>eGfd
ze6Q*w<a^G%E9jwz9T?Z=WQKD#D0y!LZ*fgcp6Dlp!e;Z)8nSy%@CKv2Ds?4xN*4Bp
z(WmIt9x+o`1h!<8CdY{qrz&a0`}u%iQ0+!tH(PQalDge73SkKNJ()rxD!Tk4VE?(d
zk%(ZD+V#u9CXj8FJOADt)8;6DML1&f)hvx+H&q%wLOUu)#$_jF9X7c9D5>bm_2y!I
zo<CrGCd}+{^hjd%#Zw7q=^>4bM4m?D(_DuOKL&Hve1xOLLXk^wqm+lxmQ4h?L9{@7
z)ImtuJ^K?oK7p+V(Hn-H*ZM-{0~sM-%$!*7jc3isi>;E#V$z+2b)okL3&GITdcXvc
zz5*k*Zvru-9S9PfYxEPRVD>UX$G3*2-uig;YCI%IlvP^-ltk|65cZ8!V{9eB!1-i~
z^||ILPIxzx%gKrlePsKpumbhV7>Tq~vxQOf;CLUS0{_4%?pZWM@^%GAScbB>3Ggwe
zIgN);HWvWYm$mVs5cTD}byACJip(t&>-=huX@6+E(~uQJZdHb0PJUG|DFh+g56T$H
zr~7q~Scm;nKy<5(HydmMpo}?QYo@?tVwV}UMrLYo9sawPAmNP#-~6IOQ7$sZCR4nv
zMcL-s0an+R-GV?1c`W6Et$;(rMnl#^ufZcuJUCR75nPtw)Jiwy%>1w{5>6VV9&mZ2
z#T#(L6~Lygp4cUhr(WeZ7;T8dr*SIAv7m8PraW&3{pEQutXRt==_B%VE{RzEXYwaQ
z3#Y0%U?soeHQ>r!G`9ESxMm+h!4Uc97p=5QbQPO)efRBUSsha=9{u~yn^r%?;Cf4M
zoH7w1bTY(BDUnnQfI7SXAaLgUNEK%&#FU#B>hbi^EGb0J0o@ai<b#=~`p@#mHZYox
z2BNk7N(LsbHsM*g5K+0+iktMI=yQt9p~m_U@<)69Z{(D=38;-~mPq&WmA%jRl0Bm6
zU)8QODUBb&v{@Z|e;y%|?2*#a#1682kt@Pp$9ji{7Y*MS$emKRfPu3%Tg9FlOS^hv
z!>H#XI)px@FjSedK0B^z7Z&vd#csKbArR;nk4|xjKd1^)X;qIkRuSdl>gsTsC01>2
z*$~<%mP2`jk}e~_mgSYxNQV9cITqqd-2p;pUV0_2i;KsJb%wv_EYQ3AAG3=HUJ@jD
zS))H+lnfC`SP!jl0AfS5lN#+GEysNF5nQj{iK^eOYDJIWcd924Nt^sdl9n1PQQ@U^
zw=Px5nCB$YgH+gPHat(@XpX&NDRpM+SkxZ6<)-A(g`k1xI|>6oH5iTOlME7+)v@gI
z^BZI>bmz6oicC->tUBoMFi4dwl-CavJq+`O=zgKFGh<N~BzlY^*V*d5#2>^)Ha12d
z5&JJ7y&}B64kqcfh6QhmvEOq=zf^hNQA&{eMul+kRN367VUWgpJ9rU+aVoQzT&ECb
zNgD`r?!pZ{PL0@*$nV97@nz;igV?W;*Rl{O&~cyOLTwxG?+~2zVUor(%XEpk`WI_1
zm{D`juPxqdj?B%&ToXB%-jp%8kLp~;LF=@%53O<ZPRS523&L&_8*eK&Wib0abWX7y
z?HnV$G|`pQXvS`pR0nxwA_Oh1(tj!#rfs~2AXcWc#HBZgfa2Ja*IKnhq%BAXa&Axt
z8fr^;S3)?T(^5-8yHOYM5q}!C#hp#)$`eSXeX&LU<daG_3T`}_jwu`$^N{qE{y+jp
zyTbf6%?S_l{zxjFKNrmSiyGU<pk^w3;UWa&1Q}{#PL|5I4;4#n!fj)9Y*mDrDjob8
zb@!vRj9Pw|D<TyJM!u8`r+wGE6qxMltvLB`%5LMFEfw%aS(JUeSgphsZF)65vqJ5W
zbpH6eT;TLlTv4<IGAprWuPgOiF=8X0S@WQP5mb34!TP%N_7AC!+0BC=<m~PT<ZwCh
z+Ij{n*E@`&HUn=KMKA}?FaT;8&I*Nrc6Qp+2{j-53M|LC3&lCknt*ItwAB5t2hJ^y
z8U@2rdWPh-BRooW^!Ha?7d*uNLPmJ>aN>!>MarwaN@k@phvKSqZ7!Mh42?7S*fBQ`
z8YVJ5Hu|&M03ov+pGH_^(CV$hv#WX@m(k*W+j<c-Np`tUKf(5zkLLH1yY?5ur-SvI
zlgP>B=kqR+K03!*>jl6=SLAKwX;oi@rrkB(G^9dH6p}vN!~~ZbqvBqoLCV;LCYU*#
zb4JZ<aA)TAO9Vp{hDx3ZLX*^{<^#g^3mb=W-md`3UAaLA1#=rrb*sMiFkcogDzOFz
zhY_EQ+PbSrj-!&g$f^L(stUANdIgH>vnn$|ogDukE(ZMUR*Kf%3okIvhHlLOeW@fQ
zSxC8G;?xiJ*yiWcrWXP=NUb)z)>i=`@wb5|t+KkOUG;_ESxcqZ>Q_h>;m>6v`4jjD
z2A-1JJd1-i?^n0Fm>dw#7tXMjzHgz_6b+dScDNN7@#(wkbitq&sR(me$EU3{K8k+-
zx<*|ouzbdKVKK!rBmSjLcjZ^`0KG{$ed8)C!*9m_udWb7rN$O^hWDmHtgfm63iE<Y
zmHc7Ds_whWY3hnNk+4)TJGBInh=-%uvN*o$T2*9_PahyI2IR0WYmQH3OL1@$`iP8z
zU$P3%U5Qa@Kf<Yx6BnYc0V!G6&oe0kWnjp`6Nol1AnHmu{d5&4Aum(LqNzC0oBBwY
z?>pOclY}=3Lk-}UVEs})`%ZEeuPNSAD)Ag6Q|>d?K3JS}nV?3q-ix%r=qUd72NdiH
zKzYJ$q(`AG&LT2#D8Qhq{9A`h4b+Nglk|~7!TVWU0yQ01c9T(D3`2Hz^<2`#IxQSG
zEgh@bW9uYIJ1-U5SvJ|nb?YpFN>is%;ypBxuX0ismi3Z+dmlUId(XzaM0?@JQRPux
z2%(%Mf^KMGjiZY_Xc7|Zn=?#Ck!WIcFn(P*u_!nGS<a6=s}#&WbN1=0#bBnEk_`s>
zj*a*(IRwqqA_~u|{M}u<-#sO#_mZ?eazNsljVRi|>z6DEFkUMEEZCQXu^`#6O_#B~
z{0LnAV~hnt55o1NP4uCaj7{w90ULMwBa8&hQO%1VA14NT10)6q{WizBW&3^H?bp#W
zjF4Gpzd4B2FIkXbDf>q1G8RIy?Ey5v>sGvNVAti#zVTSE-<y`gKS=azQyYdSaWSn{
zo1x$Yd831b&4dwR7tJWD&S_odJfIndc#zCvL@P)3SaisyrkL+npGV3XwEo*8H3~O+
z&Q(zS*EE}kzu|0j`K_BiblPjjW+p~?WZ#rXRHj!rKk*K`bRH>ol5T4iyV{|9Q9_{}
ziA5`EMN}kQA{M4^EUOy1WQC^jZAsi~yG*}9JiM(7V-<)cykDcJ!JbX_=4`qCJYOHx
zr-~8HgpHrR3QWcXT7V+-L57HIp+0M17tU)%fd_|zHt1!~4MO5|<2dyoOfuPNpF@8T
z!H`aTnpd$enzH5H*)*>}DQwTRVf43ZLubZwER<TFXwuMs*=V9nDoR7&xmcy}&vv5t
z?}c$g8|wr)HB6Q?sHzdhI-WoF)5JU*LWLt66w1BzsbKXoW7n5Wwh=8vN}qQa6bhFH
z<SboDy|77gyMDDL1x?(k@uJ0l*qf^{7q%Tv|MvOFuz&Ee$9lQjppyaU3T3mG&8MF?
zFwpA0&mLI&JHDsb3i6oPx*4+QvSDJAqOjh&EH>*2UU4-EcTm<jN5-~z78;|d=Ry#O
zS@hN$+i*?-$mzYZQHY${*$vF6HD{duRE2koCP$eAFtSFtUkjmX-M$oB4SWoGzYi3s
zV?@%NZm>XcIE<9nnG79GJFQ-YjO?MP1yXvB>kV`t=?_tX;-$OvUMt@z9Aj7**USWm
zVE8APkvgGRBJ1Sb67_rmzADo}qVN@|okZ?oPEW6A-8)0JD;)u>-oJLAmAeR4@NrPs
zP))^;UGfKAsq3k1cmg!h0aD>cfsTxfTC~n66DJuPnhb!p9FIXRe<Hc{p1+C%dr;D%
zD_BKycZX%aEB8tc#Y*y|+|5i0(;x-WU4oX12SLRv0G*E|U!SzaDEGWZp4g-csyl<I
z3-rNS4~<<ugd>?`m5l#4hH4}PQ+L2OjGbmhwhGC9nq*CqutE&fV9mZB%#7-hmxMHB
zG%^v|(4gZwQ*U8#*NIbavUvtG&g!h<H!J!x&YIQ-Q_s3qNqQ@)wtq1?Tg7gckL+&R
zqPxUYvE)gfQ51YS7u~F4-=B~{z9R3&f`)kXG<hYaP>HD0^sCW&)V}Ude-|@wXB`K;
z33W$bFwOb!mK1*x{#4NoWu|4;PVoR&ybg=U*WUWm)KOWQ)?P9kY%KUb#fL<1EGNXH
zZ$~LiaLX!P;D5Gc_@436uC-0-`4v6HDn!vhdzRks;~QI>1T1RiB&SC8;Wd~x3GrW(
z2k3bSu4xwp6j^J(#1@vnEjo}n1<7zQnvus#Cu@qwhlnl$cZ6h+itK?3W%~CfRN8xT
zL%sK%k;8_#eBn4eQD++Ueo;Xef1L|4tRRU=<?dNAylq^H>-#mIZEszUn4Z=?la1N*
zF~ZBlxSQ#@Cx>dHV=&!MYXc<Pe`ttC--Gy;ak0+WcI>_elRV-5%+FWWQtkoF-vo05
zP12@t??R;A%;OtkVT4eSNy+*c-*MpCV~2w&_`sf31gcTClG3*V=ZMBmFks8hFfo=C
zT}!4N<wtqf+p;5(?8zw<f~}c)|92EQ9t=B^4|*@7F&N7^wn~nZS)zIS%>WB?rF%`6
zaqM-|llARPSn!vs6`;JZ9IauZ(5%dcx3gmc*j5Fjv{ZV~&SbmQDv#iJCfpUcq!;QH
z4yaTkzFRF`7r7aX6NRq-UoC**4pM2O-Is?xU@&(^6G#Zo>W#6Q2Xp`$If<!AS4n(C
zZOb!=tvhrV0rw0zd6H@v8V`*^r>^_M)ez4n)5V?+Cz!fZZHJr&hl?)RYfIn#emXLE
zm%l>V-wH<bGrSu>3+)>-E1OQ~*)B|b`TP1xBP|L?;G!g2U?4cOoFb@}rEw5N|F9y5
zSs@29Htb-eYu}Vb*86WHgby@}-%0PKwMcSmz#fMD#!(}!S(7}nf}L2Py_Xdd8tAiQ
z?7>+_#fXQOmTBgSjwNpF^<~xLx|wmlaLHp*nj#wlK!*T2O2&d2xo55|zus54@x|k4
z=zgqQw=|olb8`PdfRENcW2?n9%Ea51j1Y1>i-ORLPD4va3nX*5^Kk>F?K5H+t7e%-
z=5?$C(ju`!8w{9O-IdYpc?O_gJ-F%jFf`n8tUo#Gz#V3*jOfolkm8;JEo)TjrGAGA
z51wj=q(=2x9lS#y+n-+d1zuIh=$0z=b@F}pvyrRlhv4L;`~oHz!)yuE5)^K*#&xp@
zKaRP_ci|+<^4Fdd_vF*{tG-X+9*VeEn(9Fv(J`U}p}qc*d20cAdD~QFbeDqt-EE@A
zVo{{h2_>24oye0(@z!9L<q{f5I;GN!;Dl0ES*x%t%BZYAo*z7RXzMiSwCA1366{q8
z?2r5In(6$C3J-GnL&XHLC=+Q8hV}0<J+?=)l>&v#mfW3(2G678k%Cy7N+vQR2Ye$E
zo-1pQXsMeCzsD+**=$<kUz#uwoa8R=8hfvEm{9*QRze=qjVeSI@{kH64{5Eg5?pae
z0p2%p3-(r;`VPcd*I?&UZ^UL{8j6PzaUUmF-6{*+mY^?7lM6<eo5r(6%jh4?kuHGx
zrKC!i5|3d~E}3YbqmsxZ=MB_H#;UIPp|fSf{c+9?4@D7iGi)3y4I-QLYuF((l_=tj
zURyfG7yT$1Bj;*l5Y=jAwMVYqCIrzP5T_}<Gr%5mG1Ma{Pz8abQGYO6j<>PY?E56S
zei}+zMqQ1*s?o+(jkQVW)nz{`>P+%hUUUiiNdh1cE1VO~M{+hDMbHU`v9nIG9MuHI
zr-e-=6o_`BDaX6hd=@<d;5MCA7{1UblU7c?CL3n+S?@%;oQ&~w@+ASLgx<&R_{pB|
z_;kG)&l6WdxSM+A3t%}$cv-@OBgpE8rmCKxURaD>HA%~2R)G%U{U+(4#3$3mtM!51
zxtKyRma)zkV9a`GkrDNTdyb3nJ`?R3uB)(h32}6l^`cEmi|^gbx!1Bv*xEKW^rVHn
zHR#m6jcU17D=}?>E&HuZtQ(3-WBm?w-%rQ2X^)hf5RjzLAyAylthO(Q2)<Dq@Z~BM
z57RO3wE{d}E!xF#9g<pIFyk1KU+8px1aj<WKg`=(P$}y;q35s%GEp&U>iEmQPb!n6
zxhE4686GTVan_BQ-4hu;b_NtG9n@%YX-xR%M=jPuRlPFh2Co*%EbHVBP_K*mVwne=
zh^3)@GA3|J2v-C=^Du*Bh=_!<!-i4SYshNUWz*+cIg&$D&(&y>X)uej)^Y<AWx)R%
z78~_2KTJDwDzpbLJM4yz)><@sRIgs(tv*L!v{6JZB|5H+JOvyr6Na`|u5D`c7m6!8
z7>rj##9w8)jmFL>hjYLbSUa!EN?kFK15=9=?-qfqj^1KC<)Xtwggq%E)~+&Qv_a$)
z=U&%m7KNiL#yLxJFN1RTHjA{{*~%{j<1Uz5b^z@c(qQ@d0m-0+$qG`d^;;jXc4QRP
z+D)@qc9hjZs~$!3irk{34y#=ZEs{6V81FOT7(x*BA9m*p-P`L~vx{NW8fqLrrkW$l
zOkg7)<Q|pGe&9j5`TD+9rbzc~W+RjJpxodK^l)c0S*5D&S|ae4sS<qh0Hb0wkOg^X
z!|qpE&8xrMS{#3(ow5RN4OL`yCV%^CN&8GKIc3FtI|a+zGzCl^B+~C%U$L&jv|H`F
zCXT`2<(*Oy^*UDHK60E;9!W%U<E%s`y5a~99T{19tuL&P)n?R!HC9fsEvZN>jXHKr
zgLgfu{3^?{6#vgEmbAb+b}FiQz<OkvZAGj$wuf_Vbr&xix_V-e`TI-G)?^pv>1Q<X
zu%3`x*kv(;$15fh9nUb5IB=1-NWW5vCJ{-ahikF2?WP^LN^nKC$uL_Lq~Q`2I-1+Y
zF;@9QUsqKCNeI>qgsfJ2p-fI)3TB~<k~@7c_gPnDuJ?U$lvl9QVO*;FF|tJVRucjo
z(C&Ap+C$z$x9iKn?+D>fuwjej>ghI}pRVx(p}R8AFh<a9G@tv?p=p~)8%;E=TX>`K
zEg2%$j=NdbiGr!|Elc+9(>Rym66;us^y^XLs$LluM}r-hAy{=-y69X3?UC?MKgqK+
z81YrMGe=<R;LAHNVi@6m6x0bCCpgi>2r;~UN50K)39T02^OEKiG782Y#3l}nzrC<*
zG*DMMvcA0tzcw{Gb3`=mQGux%hO^nxFJB16eyKBCezPo)+{R{w%!X~&gqQkCa^q{n
zx!PUzAjp!H2A?jUql05d7E*G#25y`r#lKGFKIaSc(mpxV^+=A5p)vGa;-wEhRuh19
zP0eKIpf1bxz(N4U2u(KRIWe5%17l*jrd%GPm~fCyB^GRBd&wSGai7-HVys7v>1B`2
zrO<QJF0C-J*RL1XhHT*o^YL)b%=9VpkPYHuKNfvKR;5tu9%9qg#~gPXODfH|;=;6G
zBJ>sBW1nc4jgTE$Qx%)nERbj>g@?sgc})3}K1a!8W@Q@V^e5z!Ibej;2_hRv!e~a5
z-cwZ44U$L<6tB=+tJ62J8}#D?E#*Oj1O*221~X1?56YiNhY`craVb3FYVT>h*Cec8
zJ5p-9U+}Eq!oA#*o|Q2mxjKJeOrpL2Gzy_&5n}t?fXVYnBm@z{)E^*dl=UQ<bvxt+
z%^6L-P?ko3f+Y!R<%sT5U$|+%`FforM$M}*jwukTCo$S}p5(3S^a}ggtZcnN6j4%{
zXq3&)*KM$8-+QU8@x}Sl-bPl5=XG+5MW>0jFC<jnC;2HmHFI2w2k2bKGE!Em<OwBb
zj;NFSjjBfv3tACZy%OtI@$pN4&$fxF7b-o>Z5X?Iv1Z*%`#`Boe0Kda2JGT~#(XIp
zK`v^eWQ1Va^9T|NYIf=f3Jk6znqE7!(5N#yfGNFr8JmDr_edx57aBVi9+OI5QPnA#
z2)bEBlZ>i5x5qk`6%PCTR2G+h-ETUa4Qe2iAyoAw8WtNj#NZsYo2xNwx!xyu7<3ER
zC7Yo_?{8gZ>h6WKt@>?auvh^<)<gFo*()|)ms&)KiCng7rWhQ}w>wYB@HZ{LfAwQH
zT8vzg*Px6_9byshR2NIg54FHbF!TvZ8MW6v;(M|($=fnYGPNm>DG!*oba=YBj8ZA2
zp*3Y(HpTQ_U5}aO1yEGKf(bpr+w#q&a8X-d#Xu`f*_;r@?hzyVVsxG$fCc=K(|?Y6
z+eK$fx8O0sLOYL~&+b!X>pC!bg7hfK{F^!m3;{<~jhW1^fw57>eiYmzL3SLzN%xk2
zeC#q|RnwFsaJ^yj_J$>zaXZ3y+j#pDOD@$}JYmV2cU0%Knn}XdT^N2+&B`{rlO`d!
zk6TPl)Y7wMju6iT3sn^pMjP0}_YWbbPx_DPFYlYE`BJ5@v%>wE(o<5dl=_vbcV<VQ
z1=hq9Qv23M#5awcJ}+cvg^3V8OisB2(g9#u9AtL;<AMc*Q$P4+19HJVMC=fWz$q#A
zhqK}Q^;!Q`lwRk%P1r~I2q&6BbUIl+nP+C_7a6U(#+XNbH1~X#K-7KjPgK$_K1^xu
zpKt=E7-P-Bhn%DFQQ&Q$<FHS1!_OL7X6iB<3aGVd-!`v>W*e(J29{`>%WNH8wwOnW
zB_7J_LFTB@jQaeGWb<<vLRN497jNrHGM%c%e0->725%rV3U<`}-5nri8f6yf(Cp2a
zxobZ*Bi7?I7vyE}-8@%uxR3;!-r#GC9y_`zf&-^M1n>oa+MQQ4`ksr44i7e1rkdt5
zopg@tjB3mDNz=1>_=MUY8dRmX)X1dY^9EBrom@q0=|%KhSRCp;V;n}uj#xO20d}zr
z+lE~KxjA<~VYrZ(`~#0Woi4s)$+ic02)QL$zO{8V9^zRA<DWJ^PO`nk;H2L;w;Qq7
zIpV{c*BCGbdyHAc0-LbHZ|)fc0LoJw@mm;b*wUwA@2I>%>@_q;nKnuWe>Pj#8v+tK
z&&-vzk3)k2lq^`(&HcUt8R;o9H!yEGB@(T0tf!W|>38JR%=9V5hR*_Xr)`$NazUg2
z?{&>zCpEAlAMRjzbBV}9{Q7xzJcYcA_t9gS0(HW5zA%ydl!QOWL^1GvJSa%EEc3J$
zE~mQ;gjHD!Hq&&SfSBTT4he@Ta;V4Z>hoC9*hv)Ph{2vaCa%U%=Q5vFF9?7gBV$%!
z2x;G6nK3`Q#dSXGKG?8Ud1C$~sd!gNs>)S#(Sfyrj(L_q`*wb`7$J{Z>k+X@Egjfq
z#7?xEbD)PF{+<Ae$xTwb{tFPflU08Y)Y({i4Vj!z%k*bu2z5GQ^p)?y2<L-kb#C2h
z*~WxuomAAwqF2}D_kg=w)N9yfaB$Z!dQZrHXh#?NsP02|yx!4*{ov!Y2vEPmKH@=y
z+`g07sCSOyph$W+Tj3K?{x)LYBCSU+$?_p3Y62*Q7OsUQ+S)9+Ukcqye6<)iNLUMU
zFf)czMhNd(ch2;?PZ4A?w-nT*Tt>calbotTB;6Fg`c$cye>)@iOFVB3hjrvZ|6RoD
zucjj6tjAK3pU?V|Cd)~sJrr_+%-P_*=Y7{*0i<8~d0Ls?FwZdKVlO3TP}LPPF=me_
zMR#t$bM1&?Vl^me3Sedxj17%ns`1f5(;W+l^wZO~wUO=?NsK?>NG5hxgG9>xVPV}=
z&9L3P#QDk}NH~utyUENNP7m>{vDWT?p(cdvIM3z^F26lS-!68BuY*hqYS`r1U=qxd
z;mqBB{h3%<;etemqQ#jOBdq4Jk1VVoEn*(pl;l3tU6-A)k^^c4+><V*C7A>z9>EG3
ztl5%&U-Fd;Q{$w5QL+ar@N!or+9}shJLJ*mR?_QYJb%_qs;T$W(In51TtIh(NtfM7
zDUpurV_BS3#8<)HNYk0`sxzT-zhmsm*QU)fUh1Bzr`M9)$K?8mZhWJaq5e{aoFrSw
zKG708ljag8@Qq`-B$FeOwbzy){Wr<a4>1oLMHwwB3`prAd)wy1<X<>t?Wp$NLtmb8
zs_Bff*QMgWlA6Um>{j~}wEeLcOW>XMPm9=oI-&qnF15`0=Ip;8$u<Edp-Ev#Sohu2
zbUIRREFyg-g&4W#6N9DU#hjK8KC&6pHhv~)d`jHNM#_%)SU~qwb4Cp|`#6j9YC+Ht
zg;HNaf%l67?~Z~sUrdWpbjw;q&0&VKW(?62MrT7+naM1p5}TBK!dq(6omq=_UX8pz
zu-ugxW*^Qy4%(Y?*qDXVmQ*`O5?>P97|dzuqCQQ>uRkBY{1_aUnnLev{OQj@j)9`-
zvo&IUJH@c!3HHZ3TV`c^Op)Yg@4|B}$Ijgs9m^NQo6Kg6b$bG0w7Lv=sTDK0*XYz9
z|Hw*5-+h^svtL$6m}kzLd9Vx<8IUv_l51u3AUvHCL%pSNcqOVkqB$9b(({6LPDv<2
zq^GeA7UHnzrE6%VYxO++-d8VoNp50C=k)zgu@U<Z`B>+uARqU4O(3VaI^NXT^4k!;
zQmJSXc10HkztDKKd)e{VOkGFWRr)qIpF9~|43yZn6itg1O`SgRAN@3=3z=dnlCCxb
zjW80I0wLn;S?72;rstv`yM4Vu6;vEP0e`aw{m{#;{Vwu8tsMX;R9O5>=J_4M)#7IJ
z*rcTvziU05;{r-#(HB~&SMRQM%_hT*Vx1ujnIpdD@x$?XXF2$~_(aMq>Jkp<$?o^K
zHGcZi3{UUdbf2P7vYEKnmJAN3EBw{}iomI7Rj2Dc5pV0?{~jjfmee$C30LC5G|5P1
zhdK0|6TNM>p&w7M#W66?Iyw_Je6%b&0giKQ`FYc}{yUflh)H?A@4_e6ZfXm9FJ%n`
z(9HFHMj?E>&naD@y;Z(e=c}iv)pQ(L5(Dldz3a?ZFY_j##2&{wit|e|?y9|VWy?Lj
zOY7xkpZ6*a%lfp%SkGYQ{$qVBQOELk4?8fa*FBN%S8_h5@=9jrGrSEzNX^u&JO9u7
z9Yzb_hmFwN1GtIZ4QmWa_5+ZCdyAMe6~RRkU$j(RKm2f8t@}NL95b1yFcC%5OXAX~
zc(W#_@B{+23}c+^RxCGxEcY{0P9lY-WFq{xWM|B^Lb;{O23<K8jb-)hKWI=IHZU}1
zaC=Nnqiw^SHEF~ZX!jn(6jse_!M%P*URKH0YY@BBKE-0;H0vUu6j)MrH@DHO*OIrq
zlLO9rSij`ApDeZRKKfH`VDU<FHFSy=Z$C*S${EX<<|!uK#+<0w`XUNE><5>a+^$a4
zHS9IZJolVpDq`&ob3ftXpj|Y#MJc>nED0d_s>cn8r27)T`FWv#OakTO_rl%+o)A)L
zN#9qdqTo>iKIioS()EAXd+&Iv|NnnD9D5ymbL_p2UG^rj_sBS8B@)@$d#|jJnZ1Pw
zIc7#AQj`uMiYPfmUC&qF@8`PxuIu_<e_VfD|NZ`{PVaYK=k<C$pO5Eb+=sR`5?;-;
zPZ+*zl4tw^tC1ZGIcfZC^m?JhKrb)_+v1_8`16hPiK~YZd>63NtW`A^rGE51rKfen
z6lHx&je3X10cgFH%IgMnO?62QvE6tRoAGD30q3AGg~7cp6gb$%sL8q+_UFla7E<{R
ziZ_=FB1=ivs3wX|B7mExEg1_YnMX9lLnUy}-0sJQz9K8KBVHxxM$=Z(9gVE{Etv&7
ziCYovACf*s$J0gkaE|xhf(>vI6HU()Cm>wcO+Bvv8Zf(|FoyeqK%Kv9Ge3k2AyZ9l
zQ{F3ESfafw69~=WwXPnSiIjaP;m%3`Pr~yFw!H(fAsMN^d2a{LA57AbT*jJd<(5-5
z)7cw!E&2r>)2LK@O+|#EnHaN^SyH;%S7w*raGfN1j>o&1S!1sd2h+O?Rif;v1PVRY
z@(ou~`h${e4U+orCiRQiG(8p?t02Bhe?(&0fVEX*!IZ+ewK`*M$e1yN(St$@^Z^OI
z*Na2nW9s9ADQ9wp(6S5-Gq!8Iyl*r`%|z%4BB&lD$s3><UR6A!fA}P+-Kw6_&oo9=
z8s0Xh<ih67Cu|~jEtMQyymcpQyyS{hpxj=P=qRga?X`s~>y<dyZYy;<j<Uj7^Q=2%
z?I~xNBwIpVal|p-*MM9SGV&rg_2k8g5i1Wbk*32ZP0<ETJl*&2crr|rHb-r1MNhRc
z*6_MCMr$uN)KTJ6{G7`<Ts&4;MxtpwHzy!FF|1dG>cNcb#TKW<-}}u;M`nLl2Wz=s
zU4wsTR2g(FYZR)XQL%WW!0{AdtnQKGy75I`xx!0U=?;3DF)wP0A`J8_g%PBCp+KP0
z`qjELFQXQVR-x(wjKzJlQqtVOW_hWu@&H-e_4w!GE}6>Sz?WP1e&`J|g%c4NX8CsG
zb__og==EabEIu2HDO4vIBV`0FNDV+EQ6!BdIe_bBvBKW*Nr>OxEgznM!FR0hh;F<n
zHs5C>C22l-l6DVqg<ma!*o#a#Iw_7PDvd=syf-_4W466X-7bdN`+|eivdp5jj)Rc@
zfP<NEh6s_wo}?FZPh(3J`|I^^o#)66&qhlPZAA@MDg)cjc%}nbHQ`5y?Hv=Y!HB>3
z#L?gpww|H>d>`N$RV*D}Q|$^tV<kF;J1cAupp{MFw*u78>>$c}gjG|+P->;&D$rPk
zo{{xsX}vR+gR3s-_2@WP4L<yf+Dh4E@gsg4ZAHCr%u#x_88EE0nlbMg5%=R~!ixM2
z|Bw+fA6#X<Qcg)MY4F7R@sF=FWM(-cPL`=}CVCdecNZp@*xKzSZaV2d8wfLEKlr(6
zGk6U5y0``gUK2BR^0?_x1@b>2f-AnFjXqyF!2yvGwCV6bi{0zWQhe(p>=#^*4`?b6
zIQSc8*i7SnVd&n3*CLT1mONnwAIxu$hYvviC=lx1MiPLH$8L7GK&1;^>3<l!02EQb
z;1P}j2*H#(0M>)ye+zz*JYi>x<^eyBhw5nFO&$C_ayB7z44i!WiD2*6lu&bIfyMRH
z@83mY{`W=x_v8BiZ*@J3#X5h$Rsp24;xDTKo^Rh4+<e6UXgwOVfJy=rsjbT{T(BQ(
zqZ>}YK<zaG@F8@@ydME=11gyD^q=2Df9>Xgo*R>lSKbVu4c!HVmMLQXQUs}Jm$6p0
zz>0*O;l;(_-#@Jvz$+vlP-mt9w1N-7-U>xP{rVaA@zes02~bbOAZ2#eNbIM@p$C8Y
z!bGhCP+RXb#%(&?i_##MlRkpgEsvNqsatcje=_7TM1oL}T3pI{=YuKODV~A@ANs(y
zF=qqcut7Oi8Jd~>+qJYRu3A(oGqVsUk$5_wt$5=E@R|yK%&XjlXpT0(btbLN?$)Eg
z6ozV+(3#vLU`dw+pyoCNWQ+(fWPaQPOhMKqDltac{M>gzCvSBU{-G5>U&bga!2h3f
zlxw8i-uLWFZAn_`0B}4&0cRX>lqTp&;#S>X<bu8%RojPVXiJlU4_}ym?3zM=tU$P`
zb3kgj%T6N^CNU)X`x&4FettT1*?a+9V7gYo<j1ru7(*Na#29>DWpN6;w&2lPM1=#E
zh=KAJ5Py<090!(Zc4ELjOFxIp2AL2-hHL>N5n!eMkXAc|_F3hRFjC|nAaUFwa}G$?
zlV4Xmc_*>sEO)F4@dzoTndAUh{eoiuVmZ0|YV@(!Z8Z>zdWH}90pugP0Rcjqh5Pix
zzqgK$lPAj`neu3sE4f@Rmv0LAYKfNv_XM|SeEKXpD|-_l+{Mdu_1+U3oZG#h67u;X
z+dS0AfkTV?bP^p|wJu$=bMN!4BQWsD0$%3#CX;HGr-fTZbl3v(sKVj*E%+j{04(fI
z9snz~Xp949*aws%;MZ>fCP|&@h6CWd0qkr0Da&DVDL7`#=0c84prEP;2UtNKnopw;
zR?smP1kW*}K+t!tQZE7|r>CC*JT)IOMOv&g;z)fKr-iS7qVng6vL1P4)|v(`dW=G+
zkmBwFI$nv5k3~L~7};^~F=U7B449KSc?SaVVcArfq-n^g;Ks$3yodiTG3eglU<}sQ
zE(vrz;LF_Ego<U=6$rPu|JJQrBn4PV#*7gaJyZTsH2-$z^y7n%m=8Z;82;<plcFyF
z9I)iBmp9m6v}qrs;;<u_mT7vG5Tv@VbImr|XnFlb=BYWfPU@xh??QLU3EeVL+J|JY
zg5+@nxuZ<?S{wsM0nw{6sk|#!^5z5VC`TySQ7*45;%O-EQ7*r$5ocrvDsDIc!%xnp
zviutQYIXB%FCj=Nuuwr8nSBp&1-IY_V}{VC37n0q;J0G<XG<r;t4Fz{CcMv4?BJwF
zo1Y=<qQGMnz4GSSJ@AanVj<E8hBvOOw?ezkz+gx6@u?{*uFtRGVwDes4EW!FfloL$
zm!-%2#VFpRjGUDSXEskW8a}G@YOiDX6*gxhf-rRo7uLBqE~FEknQWypZ(QU#^M#Xr
zQw853sN!G;y391<l|F_r9?;a3I}k-sPn0~jHR79h$CBxFCLHs?&&>w8X`afFmp)jC
z*Z)%U2Z$^V1=cRa{Hr9T0(pV<%DSgBzk%12Zh`$>JsY3)F%rep`|-!ez!B+9%-4>x
z<i-xLQ{N4`0$qQ(<KAfjh@kNmz=&CEp>KB6#`fBgSwYcv|1Ky$co*GY4jv**C{}d=
zjDRRilPYOd06yjzgw4WZ0t7^h%<^~HOqVN2mwh)$y9=zhJ}hqiI{bK3`Lu6dL2qGr
z8poT~g|-aXv#6JaAM24$6BBDDP$reD#S%;bLV$4Rt@)nN>U;6wqkGfWBZ#7;n4Ex{
zkxU4%d?WL2GLqL`E_W-i(FUwGpWf#HgL@2I5bc&Zl1TXF_DAKHFds(F@oxdyMh*ZF
zBy9Q;jYl|LVCU<3S_o_u?4JV%rMK-3*=m!U^eWEFsUd;jC9vA$`hxpSaPyc|>>o2?
za(^-LiUJB&gnN-Upx8-<+l+|R6}{^G0@eqD8|vhZ)kC15U<3%4qTE4<ScsFo3XDEK
zgZ)|n@X0UL2-$dmX-+PH`n<fME8TXt?MWygXl#M|%YEn=s`vbC_Rch5L%XZ7wdeE#
z$FlTlgsAu-X9Cpvfh@3Qweafpx-C{U|IYut$jmOR(#h;0KLH`}_SxM1=dSeukoi;}
zFpHVR^#EM-+{TuEefc1Wy3Kq)rn$W)_RYJcdNVg5Q{&zp^nHZzCEx>kohE=@oo&m&
zu|9V?AZ|D+>^F6y0CX=K!kR4l1T+bO5d`MebaMH&1oT`krWfBOmv_v>Wmbt2?c#4h
zi5bXg5qQxM6FA$JPxBnDXu614tb4^I<IQ2YpSuf~D`^y9K|C=kS6+auW53Y+))nRE
zipaV1VDCAVATd=lsJj;cql2yfv*tZ;Kx*w>rSV+Ld&hsooK3AvW!<Qy*#&?z+Qoz1
zSHTpW+ajE+yFh|9zbOe&jMgDWMK(3H?1X-k7G$^7L*aTGHe9s-32=>5^UPD1?E_|Y
z^<L^u5szlckk1WWe!IiXE5OC8z}<Z%W$8q?ANZ0*D6J&)RuRbWneFWYo$oa1&5Y*#
z3HC4%OytP@8T?bK?&IV<%?wMxEYeLb)79(U1?D4NVb^tT(FMmU7JDh21A{Zhe_934
zf7Xa=0S0Q)5^oP~mG}-E7-rr6?SNy{O_mBVIA0J<hhG9cg3Kis8B<c}XB=(xVWw)@
zyv^L*O;Sv6kaFj5S|i`&YIgl5Lf##e%>0NQm#e(_=lG%04edak4D6uJI3yT?akMo*
zTb++|KXp2HZw~f{O~+l?9{FFoL@8KemQ5NXMM+buN{`_vob3Mb!o>nb85WPgbjdM8
zQ7*2Clso!t5yAE8S5hDWP_FJc33b94@)~{{geXvfp>75$Q`&8!9g|$ywnPd?xiF#6
zQr=XhoMN2JOSi?YZl+0b1;+21v*4HQKe_Yqd3L|B_TD)i#MB3&UtQ?(igja{{XM{%
z(}FO$)&BLB;KbVb$0?XB2fWz#fK`C=N&9_6_W0yyw9$F!?w$#MWPi%q(7Sd%K#ZUm
zMRvJ^NGmvMfMt9a4I=F~)%AJk$jw*#Z{$BM5o_MfGU;_t+q#Qpo3C9VNKFQD@v5y;
zG(H!33d=eGEC1qYK~{8!R0#BEDJe!cde-EB^<*zl2KbG@!xU-0lx!dCKR|h@Cgou`
z45$d7zb>x?OIFbq*p^fQJl|E*`~BvNWXPDh3hVYGT3S=H*y7*CbBS-CfBti2B*6{G
zQZXyHt}L`cP|OoW;ce@{^6kUO>+Mfo3)+KSzYW;x?%2_d?}G#S`!x_y%9`{*S~l87
z)?5Uzhtm=?$_YVGGI7oLG~g;Y&fMSvF<3RGu%8P$v$*)DsgRYc2)FU>B1y?hh>(-*
zu9+gciptf+qp7plmmk;W%evmH-ZoN~+4M(a^cBTR23hqxgC~~KI-+jaQBji5tS?eR
zEd<hNiduUyBmw-mL%?EXeC^pHDk=-&^vr4zU#GSIJ}hqdnBw>~B^n+;auoe$86s1^
z=lf9boQYfPD)au-pIu=9@OOX!{@&Zb)<SBf8EgPWW{}l>%Hbmn^vBs>X)W(FaAArw
zEw_}LoW759E9sR&Kc8o=oLpB8n>Rgf=E=;3$Hu`fVE;*2(QXguquE_Q+kOC3m9)~O
zVDp70n@*og{3cB6Bs7|0WCFzQms-p^B7XwwPB-Xo=9ZGK6?P6#Zzulv6f>@r7_^ag
zP5AN^VT9S!1$7oslWK_$0Sww1cImu8W~x8Ps?}CE|999nz%SgL&(3TKBntN1>Cv=2
z&V?Sc1zdouSE|1Gi_YeZwKjW<+#w(>*8`}3ka&Z<O_XH^2bG8QroDCXw)D)CR?Qf<
zVa{Ow)ySn{T_WqI_GnE}*kbXj<oXq2&#gDbs(Dx1eXci1+a(~<@i_NzdG%G3g|g_)
z`@b?3norT<-r@+|eT)RWU6bpy3s|WRASOX#Q=1ci?=4{3=Ky0rhdeY@U`;ki$Q+1b
zO8A7WV>78;WOW*%?6MqqmB{$J-Sx^`TIL(jcD2Im_iWodW7y5|h}vhYJY7p0V{^h)
zDbnA&juhE6dQG{GOcR@zSrS!80Dl|X4d-Q{kz``sQaYc)K;k(VG7@CpZouaFE?@~w
z@;<cZVh-Sv`!4&9Xm|F4=2eMz<`MFKbv}Qu98Ts`P`>Ph=jNRq$t@FaK_(<B$)VAU
z*keC_|0)wB3zN-j=FS(}w?Mv4&#R{8D(;q&WuFTTcpfFn)GyyH`c}x@!h~DtO~z?*
zYHs&_Mts6Tq2N)_J3WX7F?v;Dc_!oUC4Ps1KkC$=MZ6=&U*+1TIG}pFZaaJVn--?n
z8$cpJ&gx<qYS~I~McC6~d!aKl|K8nj>t-EQ@hksVwEb&dL+9I;ys)J4H>9Lx(l7D%
z8Y*x3O2RgH;9_dY?E-qjOXa{?;vUf8&}pD8Ath1^BG!)h+CK3v7uyxYS|v)aM8khW
znv}<dk}56NRdK{h>wS!ka5A+LICqQa32ZK(ZU(P)z)0D7!)Ju}$-T!XUcU#1t5YJ4
zex%%t_iP>7Ug+=Csa$$i7d6iGe}+WX#~Pe7S^y<0btI2hDx!m+EfkWn=br7Lg-nRH
zGpeTB@juppQHqFU+dGqEG|HG-Dj=QfHEZ{>iTdo`VuHM?jlAo6Y(B|>K1i4%p<!#l
z9a|tPF#DNm|4CEK$rK)?qLcRnU{gvzP%yj*J3w<KBg)W|3qDq|r*0FafrQJz>?Get
z{}WdO>~1X``7u1;3-Cs2a2^CKw4D!lPa18OeGFr!AfwEd8d%)6?WE8Q+i5lMG{1Hg
zNYGRQHvg;d<}vdaVnLHzBN6`JfOOf9nNW$w)}~FhAj*T%c;n59bxfWA3`W!%<V!R<
z)y0QJfFD@Dy>|AbL}%Dxuz2EC7*v0-rvcCcB@0=eB$DwHMk|~qPjby@k%WX!OjY38
z5-*WhwR*vO{F67KFK~A<I8>A&$a*EX!B;LsZ@3~oQhzfQ6?nv!Bq(1VGSZ6~<?<WJ
z9>)c1WU0A{_ugO&dh1;KQ@xaY1<SWFrC3eZ!3Hd9gSP24!pjx(jjI87h+^sLWLrXE
z6eG8Qw0|LD9>LZ@f?{uvgCu%hGLM(4E#W^^88-={<`QGsf7q+*l=dDErMb*pvp!QK
z#oc`&(FDz0xur0d@uy8HxFtq~5+i=@N<RqudO+H3)AI~~f6`7@-rfKrSYiC9%F9xh
zIn4<hd%bwu=?ZD}k6Jfy6k=a;Y4WLcCFkc^WbFXCOUX(y1;8f9u8CN~2WQx}C=1oP
zz<}GmSm#1Q40#EE#nO@VTQ~d?NwIv@HEMU;*0alRw!hJ;>dEI707&5|xBb`w&dK_C
z)3c#l&^h^6@r=k0D)gF1-h@GQDtip{)Z=iBMoflvUuNJY;!_^!Ku?`s4pkLHi*OLp
zr$ejRI^*HBk8n)Wx63?-kn6nK>_?*^j1T(_3y%{5VH3OJw6qGCK&M$Xdd?09-u<|K
zfXv8V_vxHX17=jwM5-CR_RdYL+?FI;%s=i9;}E{BRSxHPP$Sks*kq#n;77?v%{`<!
z!_nx~sJ05F+h?(g4MIm6ZT`CAlSCrHj*(}d`)L321fi<9DB$wt;m1=&aisE6H~mWs
zH71j6(Ui9V3%JrFRkSAwRI4|wIX2iyek9gV(feE9Z@=L+Hb-*%{ZWdd{cE^+lUo!1
z8^hU_siq@8Lz<6-rJQaE-ycMJ&s{}h<WAYPlh2y8E+lla9>K3&O8dSBUg|p?M5r};
zyYPADov`nBsu3!#py{A({IV6C*viV?&)q=DXGU6c{9s-Y{NYc?F;x%L<gtxyJlz9{
zh!P9}RMGO-TM@k?JJLuH>j-)5QA4-qZ9o(7M%6jyo^vd<mDB5Y@ve|wQXHwNsFnDf
z#t=lc?|Xi+#jir_aBrYPLEUEb4Z%J)kuYt4=)~fJiQIFDnUscz${JqBccWLxiFb91
z+NTfX#5Hc~wBO75HiF;CfctIP^@!#U$6WJK#Md?0c}wva<$+g|+p>U|wZN_?z76Mq
zA91Bv!8PUp!G~!{r+9PLWvbdsT#@pW-sJ(R`jQXzI&doSsOeo`*Yf}=Y|O~2;nayR
zdsGQwZRs5H@Tqp+AI=^`^7-i@#~^Svc_cAOo#V}dObpXeLQ`r=v;;t8S~u3z+}A&A
z+GEnNoSQ6{Tt?jS=b%W8vHsS7<6>!Cpq0RT5kLsz8jjeYo|(+{w2qA;=nc?j)}QV|
zPmm}YKpACIs-2fNYPT)p8?_6bkxbKU;Tuhx6iOF<u|HCL+xGpT#U5{@yh^*V@o}5&
zQ~(E)tWROOR_q>gc8+;%=lEjACfwSI&_rw6k9MaEHzXdHyl8B|Y2hBwz7B#@cl#ZD
zLuV3r$sPj-*J~foZV>wFHH{?8B>!fUcV)wMrq_)L^=nDQKIYgiCiAtI%esm-*Ke^{
zR<aa@M>JhR6dHxC#c*xDr0XJu!$mSAp9-{?rM~O1hw#_}2jevCDv&kwaokNbCEeFf
z;y90ycN6K_jF6|?iD_d}xLfUznx$d=SK4-W9>BRwUbY_oe_31hE(H~+N-VW>KL2O6
zBSUqx|5|H|7H+$ofRDMci^*5x9&D*4csgz2y+8&_gr3OPt@w>_^R;UU-2^SN*00*W
zXiW)wa|R>jU5~DK7PuwHkfHHj1SXtkX%>^uZsV6G--`QfS`bJR7}_~T=}k*2t4kkq
zx=mFC=^M;8lawYCj-Fika}mGv(Pp2ibe}WQI`bz^Q6E^)w%^@d&Utn+16!Ha(f}K{
zAEf>-`|}LhXEW%i5KLSFfY;r&u{@b&(R&lYa7rms@*;x3r}t2mhgXs~LPq@}PIh2X
zE&^#)n)#cC-Un_BTTEzL{2F(}7dTjVlRt7Vrr|&ztRiv?FRkTj8kPE2=6eWe71h+v
z5ZWQ2RYX*KA5t<=qJ&n_BxB4}B{pY@_{9}o!R888niWEfx|xIy)H=1$084b^i$Y33
z!SON?;DN$?CA*V2aGztA@j~($Uh*229VjQr=p@x>7F3Gt6VWbNIfjhUni+m!V*_1R
z_o5dA6<55y!cw(pXm_evm-c7TluR3b>o4xQ4-nAJb44Nq+p1sGWSwNjTOd$vy_k0~
zO_TlX@{Zz1qpbM2G2l!Fn8&7=9$?MD-86}Xth~k=^Y)ZB<FV1I(O6EbJo?UZG3tA7
zzuLK<1C24$UD3%)VB0z0@x!hAQ0NG6d4_lOb<%{U!~w(&Ub9Q?gMG;j9Gb=}^}lwM
zn`pfG5p-ze-H+&@5(ATD{zK{ChTECbS%)^9bmrqzs70p4_%CFM6P1Ae=zF5%OPNi0
z$)U6s!{;*~X3@j{3$9Y_moWLbLUj}OpV{+qagFYq>lYPU&P;0Bd@eCtHWyhDB$yO=
zVbrl&Z3n%}FSsUSpRRwo_C?#5O?h8uI0G6xa!#CfM&W`RpMMV%enWWh5tewh+^;d#
zNf%JiG7#I=Fg5MA+T`EVlSCbpRJ{*<Ol@jRF#L@i8B#;N(imj?33GvpL%ZhGHJ$8_
z(HwEpz?H?R#SuP|<>a}-G0|35DYnnVv@=um+Bsh1YFNym8Gp0Vrf&k4^OeF;jvvk%
zP<Q#t%rmKId2gdEXs9us?ilo}NNdKQ-^bKR4&-#(fF>J{N&SagP=&BJTaww!2--V>
z1UiRMOCwa^9|ufP;11M<wb3O5It(##{B)eMT-L`hVOCnV4uge)n3@+5L%FB|S1p_1
zJ~x_jSqIL6l2ccEDFm1T>!hT~BB{|$zt%%o@fk>KfnP5g^R${u#R)jk;QDH41F}=M
z?PCc_Ya2kzRbBuruW#1xRL*y2=~dwJy@+V3)u|jbrF;xk99NuHl8Hu-D9J~SyDpyj
zPk}lrRcd)xAIXl!mvztr5j`ODPu{*LVVv@MeX!|-KIf&N!a);KuzJ``=B#j>_<iCM
zy~ebMh)jAvc9lTooKY#Xi>iqAC0BY0XC&fj5&w3{#(M94|A=p(EO$S`j~<n05nKXK
ziVE$uB3g`f&UnG$O@3+<X$Ckb1=XK^f42?G``kb^J}ZnoW?@0`9I#PpU;=}0c2jda
z$f9agJK%*x``qZ$SH6I@@kh*44yNS91Q|CI!0&Nx?q2TLk4iKK&=w@rZ!K~koq#Y^
zH5vd`o|q0Zddo@m7Uct(qHk&1n{0=PP#PCdC}W)(Cu&m}W#Gy`k3h4?Cd-OeMAqG=
zh6=M&5lgYxhUFv=wIoJU(yMSPg<7r$ap<%wed~HXQ;C<#2SHj*LRY^-PUJFo6Af2^
zVYBD5O8ERh#Q=3GK~pD5z^jsEj<SNB+bJ^XVN@uTfb(n>idrMN$xT2Xs0>vK>doMw
z&=9}xs0kt3K_V}A{o)lNY%ht5d#2bIw9a~btnBZnb<f1N{{^BcQl#vd&uakm=UzlJ
zPF9e`oh1q%LrH0T;S*-)T`UVhNO4T-md!ZuM!pEt`e?86bF^FIg2!}vvmxko{r*nb
zwMm)>H6R3HlBEAW*Tp1uj|I+$2Pk!ZfCz`Es|P*yhbS?0OwM8b#Ce5-g?@2blUtHC
zYO5H1x2tX{7+44uUK2r5FiH$hMxeGrzY6|HB1H~epX>!ywN1~PN}#k<Cu9|T$hxDL
zB>JDOktx+H{x6ljoRuZ)CcKtV^<r{88-ldu`@qxhaCr%G-YWG^@E{{wn60v?ySc8I
zGG0n$@nH+_@^YZwt+{89EVHT=n7Oztq<4GNvYPBCH@zm>freiOg%WB@USs3U?IBm+
ztdRE#sBu}{jOc#i!ngO1r?_~)Oo>9mxBDtQJ)T=1o?QEeKK(Gq_USs|8)o&g2Y*5q
z3{aKzO<JAA$^G@o;}6>(UxG1^bYEcAt$>QHEa%z--O6QzdRTWqb;Pvbp5=<9j$WVn
zC;GsWY#=lCa<0x-tdicmFM#HJga6^LJq(W*73Ur^_nsGDgJvH6io~z))8`BP)s^(u
zdc@pk3OV(0SHt#pa~~)GsZ32#Z_U>!sRYcjJ~j{cehKKmn#zp)$-Q8{QHe7z<&Vcd
zJICvl&*z2D?>do@4~&9M^VWzoro!=W{bAe(dcWkZ%l-XVQf!5omYcqGFpR4J5XQ4#
zeRur+8Z%sD>B-@)q$dnwYh<av0HT6V`-`=Vm%c4XypCF_L}xwO&wqj}6KoMWsyqtO
zB)(<y;PNEi8+TxlWQQCZ^H|6zz`~4L*L}WB)T>5-QW;y};N9e(ezH`5U{CJKv3oP$
z;Mre4qs_9HZ>8IMr|25@?(A!->S-=8T2)qeH+3!Q0}R_~zcy(=)mH11W03}t_lo{1
zs7Lj}c>>NZKV}qqgYxYex|`a<+Q-{+l_SojL#KUP%lS6J0MHXapjp6zQ`aVM@I^MT
z)IaIAb3>WuSmw=iglw@7D|)3}TS(MpLwY%dEF@aWX0C;0XNRC1Upf{}RHw~wgMUu9
zo&E793dNpnvdJ7Y<61RDvz91BQFa;Be)BaUrCZ|E#7ZhO#9}b)NrkfQbV%@GbUTJT
z-}Zzt1ou<a-7y0D^*SGore)<{DV3><WRmD@wax0n<LV>h>YGE6eOs!7Xi8Y3`*4n#
zk&?yRrN?=mP37i((z+c~8Ul`4LIRK6cq89#J<e0dO86=_cE@_GahR-dEjX>##;MDG
zcR*mUWXV-Vdz!IJ(=sdet|eGiM&9Tz-_VqdowX-ZZ65Xv(Ymn$v85)dg52B)X6w!K
zuA7K)#BD?h!WVIM)n4O|D)ZK-H499lWaXrUcG4`{{WX)`aX_b@Titb6?jz{mm+~gR
z%a~&+8cC+IEIbW6rJ(goy3w);_}!C0p^B?m&k(o(v6{S8%jIyp*&KD3xD(`<lLB)u
zdyTUY4>yC6?$0>@k|n+5_cB0Sr`fk8cH!Bz7=ekTgPRt$N^#$8l2VI9o?Xk+?-StK
zJ$QnX_UaTrN(emRH^s;FZ4B;AR!os?K^kD~$G;i2Hf4a&0Y^wM-kBLbQ^*E7GPa$7
zDLdU;-cb91#!Jxq$7I=K5`bl#8yuoyIumteCnRTHYD)vjSO9sbcr!F7UEbImgxYMB
zP@guDoVG+OiK#57;dhFX^C)vndb~R$vL?+CsZYm~T|AXaFiaQt5FV)1QFkI>Xrqtx
zTe*JmU6Y*)hM!Z{YJKuu*@jWh=<S<z!zD^b>ESNHQ)j92rke%xJ~u)d2os#9Uc0C3
zx$6G_gB0&_rpEr5qR-QyX#b}4(ODsytAyY64nIgax^d>LE$iPGC@ERI4zU3eoav}2
zI1!9<i?_tN$;|p=@#E>Fp5A<T1RtsV{dnvr+t{_O(&8_(x`s(g<LGUU#T4)OQ^ilB
z0mP>i%R-uz)qKInOOL#%O%qRM2XTrx?c09g$k9rZQ;Rd-8O!S*kZ}NsU$AjfJL3}V
zQLXXgnTf;eGc2xXFI@+%bdhCiV`T96js9j{1tX?pj`1-tKVTXu@sHua#!6Nz6%<>u
zbc`3Vw8RGqPl0zhnYm`~-Mxh?wHHP1Ei>=cEwbWLc<bSjiqs;@PPX^c<?}z3>VHFb
z{zoJPt=a`T-sMN0$W*#k$BpCPsNdiF95IQ8&&KJHT(Hmh3`nBPU}rL)Q2&Txzzuja
zl2S!~MtfNG=rETtp`EAQVEK9^$GfqckFaNy%z76u<n&l`PjWsc1|6-mz*R~KU1-cA
zMNkN7|7>+C${qA=YA)OuN`}FPJf4KKYm>AOIc>z7Zwx3O?(;?(|Gd-ti69J<xzA&C
zIdb)ekmMaWm$e|KHsLx^@a7HM*2PS+wFiirms@-B7jQ>wO7*B{9F#U9c9@wuO~szX
zAAb=stQ}<9TGppKXJ}HM$`$z{IA>pkbw2cV-B%mehQKwquW4EHuD1*@I;Gg;s!xY6
zA;h0s2^B{s7!c=?L{}!4Taqv&mG3W2M>DFR@$^!UVvl1g6Ze^X9d$S&x{73~SQGhA
z1?H{EHz;aHYDD?S3z!9hE!MDf$zcoV%vv2et4B_pW?R;cYgSc&#Z?@biocjAXr>t(
z{1W0UT6#IpR!*wv!{ta%>mYHhX*iwiuh3+U?@&|!Gw>>scEMvvIgB_A1N;$!xX;)9
zS`m&+Tkim?_4cyGNsIrS89gR3aLg8+pAjs;y!D|eB&jBDjIjV^eWsdXmf0(ZZm(3P
z_Xh^t&mBIZskrB;wydl(;|-nI*ys!|mi+ZwF+@MFLPPREGe^N_&>>p7cjm+fkY~@R
z1g;&$n#SayIih8V(PT%lIcgXAsuyu}uW(e}QFRh75+V}|5yde}_SCziE7UPZ)aCeN
z8AEV$*`Es~j(Fn(1b{&lZOwqr;v6?L-o*`}w>t13+*V8Gc@A$c^Lqefuae2=ak^5U
zeoL1pEFHcNx;7GH=U{8@1K%@=l+8F6o1|-T!n7mFn*{<JNv1d2jA#Qes&4o)hvbEc
zGRaL@BE3mesztxA_}UJbVooXEIZ2M$Z}ca!2yE$C{|KK~v<Ip$+AT+Hr<~goO?URY
z##PP*@s3925i~%r!-h%9J<VYV_JKp*ucB{idkR)Gu2fWb9q8I6QmLX{x&DyuBb_Yq
zQeHd<qjm{_L2exc4d~|}A0LV9L$iF|LKsI3v{D$Nku)l53NO~9>b@2WeV|1Wu^Enf
zEnh|BsEr^#5JunDcr2PiT-1+9^)ULyo2T?)vv{Puc8K2a($}bWKf;z{Zo$lPgrtLT
zV3>LAA^%^2r&&ZWk>##!`mMscKX4Rmnyea+hoA+~#f=baFEK7nE{byr2ijD&<Grsa
z7^u#_+~&<va)Mb}>$eeba@d6R`lzYKFk$1cf>|mvRKe5|=W;`5=~TD6l4&%SK4pe*
z4u1u@alF?>+BNUQ_GvgSOTiddegj^=FN{y0lwJh2g?TrT6W`4`pD|&Zpc?yg<TLTh
z`mE;U{pguQ*mpiN3dt8(u>-Fa$*5Af8M9}}9LwB1^h{q7<5M4CHl*}$<)$j^xqeD4
z*-xD3uub;}^;()Opy9&zzU#P?W=boZ>dF;GaXn2nbCqVzmES?IKk?vsKBt~egEn9p
z8%GIvtX*u;9W*Ma-=8S881eDkDB|(rVe#aj8MJN#TFhP0k)MmdW!wc#OpC4tHq0y`
zCIsHYwNS+B3Y-!m(;Szd1r-7U*G842_|cRt0X}=UwF2=@M85=LXo+MoIjR+al-L9N
z%==0dyGS66`|dTv?K9UuZKj-fmUnm~{jAWUL=2qEoZbwj0yzBNM=J?=nUT4HdNV?K
z0c!);vY{s2o>;(>KTiKDI@oOhgaDR#c<d0eA#Q^Khr==HO+LV{qITUCLof7LnO+oq
z5$%)ks#+AqthVoe<WlCraL11F!_U_@Fo{0`($%?K@VfybfaS|%=$GjT_`vXSG$FeL
z7mZ6g)~xA^c6&YHP}kKP7uAn36RgoL5DLXwaq#JX{vZc|Xq<`iim*6dwDx&<TQ?D2
zF*!Lk;R|8{(~o__Njl$+?E^$P&PP>&xcPzc_8ZSB-@-~azwO;-Io~C3#VK!V$yRV;
zpC|}qQYN%VBpbJR)4huiob|pVinAFN5{#c2C4M*}oK+ZJ8`uBo<fhKwlXBO*D9LXu
zqCXeFEp)@O1A*3%O{oC#p^U~NthDWd^F5-#)cOzPffq<P;7dD-iU4%hB<wAnieRXC
z25`@IdFT%m*AJ(lDjI?71^{u0EwLQ%?tk8ez0<q!^Cc<<Bd>M)cSKQ{ZZ=droN<fK
z4xbBOw7lYwT=rwzCsA15r3UIRCg_3Z+^^)J&P7Ji(_LjKo+&q78bHuGHSnQ+qxvih
z3Tlf)T_spfEhVu;(6OoM8{`u$E5?5q>;u=gzJ!WMnN56|Fy(ik{D-A{NaW6f!0=~S
z?9*>rAXE8;IlOfrCX{}~NIW@q_@>{4N4}~e6}hBxj+&ia;Ei*DdC1=l{Rl1!0L|JJ
z4rt0DSF|3tzzNXC;Rg*slupm{xvH<{R4%4R@6~5o0ei}>slIZmW^1<;tH*^YXN$%_
zuQu3VZ~_D6vhpH>{5lmYoT>>`Kp)v|!xr~-Plprz5SSvYo=t9>#jQptZGx+^=M7|l
zYs%w}alR76BqlOd`|@8~1Jx!9Q1Y6YJRY?tsIVKy4Jk?oTOR|FzbT;6<x{lp$GbVA
zEu0EQ0RJz0(D*LwA1cWBE~pW$5p$Ll43)Qv+zQ$a6jR0eJ)WX9RoOv*TcPm-gyELQ
zeMIHKaHOI71;Ev&fwsQ@1Yv0f0{I0Ytv`tV2FCfj@M>-==yFf8h?siY)GTJ4gSYwY
zD=;bR*eMHJ<|*tLH$L<Sy*#byeFa<x2<Y45x+7`_y@lq9xAS0KSi|1-9#6b}8QSqh
z8ZYiys7!3x(|^KB6<3M6Qn6<*Z~k#9P*y~}y)$$<gD*VaPg}VQBvrGga`{W}DZeF9
zun9r?l1kyi*NrYxs$Jy8`aU>u0KSV<K;?kQoCpMXA!(aIzb7{?e!kGVpUoCs33m#(
zLENf!Qu4;=t!Cm%Q>YgE{nT=<g`dy=@ob)?t3FFA2TH<|E(boqoVEi@%u4W$J{-=1
zX0X$Ntw5OKmB(W3(Y`~MK{jB35zriuh{Q%ffDAq`xCYO^`(gYFz$`<zSyNu=sH6%u
z!pYBP-=XD83cipR=hnyn!dQ#oe&dStOXN!b4bVS)pr?L&*7`0`RDAjsG-C>WDOnFf
z{yha$Ux9|?ByYfz2LMp@63DNjfWAgwV5;hh!(fW)!f!v=F<5B)92|dp1|hV$<HhAy
z=wf%F4S5DT`wl_RCpcgEVs3$0+X3cl%jgpTozH^`WS0vGcacb<rEnnS52o*|T%;mx
zs_q<c6);9CDShzg?0AwpQ~oG7Iq$(IB6C9U{TrmL2K}qS10A=yUj%JKUuaLV2(>~@
zeF&BZy*VI1oYu7p21Tg4M?no=_OuAvz}Qre9oR^4KOj0-dwS<4=O?wle=fcIL(|!=
zSspw)R6YZc%ssRbr|`rzf7rWJ@ch1weVi}*&#ncxlofl$NMw|w$k5g%IOiNZ@}5m`
z@dNejO|ZRmr3c0TVfVba!M51}+QFUp_K+<AzEv|oya}wVr*6y*0f>!={11RQKy3_1
zBkykM+5pUiSM9BxrvU&=bPV+GrXfEJw9z-WPcndj1&JSgse58Y`6{{GQRz}_fcq6I
zE|4~VoW`zX3!pMwC=E|H&d%v596Z!wwO$8+pzZjMr?LV)6|KrI5dfzMUIa^Y5S0pC
zVZ}ZmeE>-9Lpr;Ym{jpH@9t5&Ozddp_bA+#<pYT{lc$*vE=FC5Zx3Dmy1Nlg2<A><
zHo@dN6+a=U<GJA88$8gwu`(JWJtUG&90s^6oxZ(>MiJ!^$U0;DYU2Vk<`#f`<%22H
zl8eAr&D-V$3yqrqeFC?4T!rvj(Y60JwR;PMq7@tXu6_p4O9G8e9s1XanC5-}o!*zg
z&}0jMQ1gJuenV}%ac4!S-;O)g&tDm;x1sl(p1tD10YY;ZWtd7Qw{ZyUZK+u>`Jo-J
zd2r~ocksuCfU(~|$t5FU=spAwt2lv{!>57XFi`&bcK{!hNnkP}1VF|QKH)w%|58^l
zeF=~*1(3RPleNE_uNyc3{>9zlArLettkjF3AUe~z=?aO>_CW9Becdo26rcvlpyxWA
z_InxpY-jXLum9!@^DQ7xE&4SYufy5m9<b7wa!zF9v-4@!nnf&X84Q89Qn4E$k9!6l
zXl`cagZGZ$o;=0>L<WJSBcK58D=$1IJQ%#RSDSDT{-e=8pl}_baJH2@I~a^kqADmD
zBsz7`<!y=>Ph;YS{1Zxn+du(+(>1;}I-uDUKP195+<dM1Ug!A2n@u%k$Ig_Oh4BU<
z8TVJ9DPozzY4$w`S~^z<6fPm4=~d`tz$1C^Utrm%h^QRxiq8PLWTl<X>~>(;dy~qq
z$iS{y47~v{N5c=pKITF;_jk9lb&4%+NNrIfF9F808$|?dS5|*C)p7cra1G_ZXY)^O
zS@(y)cWS>>{)2TblD%DQP_FDCPpH<Y=b7na=i65!(zlgE&tKmFen{l;6h9jhQ#R4w
zn>J8>=lPk?bw7NjTc95XXPDUp`aN>F{#EI+Igqi$c6PVzn-G*khb1tq@I)dr)gV~f
zE6~j;f#kG(`yQR?^+RIq4zRfr2&j7qVOT#%&WqK6b0v>3I*3LE5nYLejxOLM|4*$M
z%<)I^UT8O8%fi3TaUO<A=HDs}n9GJ>mB6ya>LuecS<%vD8lB!n))ZL?d)QaW0Gr=H
z4lIr5J!to*U5O^@2iTgl-4`xiK)YF{vEFh*(hHVp8yK7RZ_85vpU<EC3cyly%kJZO
z!4#1mM2LFn5eh{;;8-9eya}-HzQ~+5?nzyrM!+8Sf|$%??26Wq_I>@L;@xj*KA#*F
zK*WGz+ZKq(SpXmXUiKzjZH9a6gJ<0%K1hb$3<+NI{`B$T`;@3Zh!8+AgUFkcsv8q3
zfBdUg(Ol~=Eg%JES8eoI%In<u-x^?z=7Jl$iJpVgj@r|`S+{&x=jQq1LoQx87x{p_
z=4s%`<CF*0^j8-^OOe|mm^+db0zc{!XZF8K)Q9{?s(`fM{P`+GG!OidvUH44A3qTB
z`ULkZK#m50I0tFFEq;D2MXCQ)^XKRx2Fx71d!r1oVL&&Z%g~HNv>i>o08m}5f(lFD
zkoRp=ZzrVzSWMb~;V1i{Blk;|=p+{C<ZA<Dh~E-mCFn|xSlrNVX$P?PqBO5ndclQN
zy>UqBX#pG}E=Wg!q06Rr$}iSw`2_4G7Eo(IHNNWR-}9A6|DgI@=}+B|puKx$IEC#j
zzZ3yxCEZ-zS5lvUj~AEjgbg-x8_VNjw}E>1e?LI5gvgvO+WuuGqUsf7Xv#{pKTmj7
zfp8!#w<My{_$%VBvA~<0;XR2D-w7A+B!^F#C-_4F+BLn)^m_L0`F`uuM8c&n2c|$d
zH*K=1KzRYgY_7oU<b$EVf0hWj_Oxm0<IP~JC;6@6W4d`lit@zBh8m+Tz{uBO@o7_o
zCd#$bMD$*I)b(0sKR=5<P$&ih=(E=%^7>#*Mdfss)||HBFsTGt&^0sd$s7P-E=q$(
zHhpWDoP3)Nc?hI~7JjiHclF9Y0|J0!?sg~uI|kVr&QSsiCW8OERj7@|>USahZ*AB{
zUD0=YCK`!x@ti3q;!<@`s{~3c+>5~)3;9-#OiBQ4o!xJ&wMRmuwp{_&Xfr+qFDWDA
z>YhMnlERnj+^>w6UDut(0J2b<Jp4Aj1hcoGshoVS(sjN)N)F|<JWo*X0%_vg#}hkJ
zhe{|8q{~!XnstoopU+d+DvrxZRGexE!22Wjw;w&dHl(2)7?J`^2Jh!mrIy(ufd^la
z_C|B(+|~S??EU-@bB3N`WEtSaxqnK}UjZ@xV-4^~sJmv>QQl|*0|J_MY~L=npc63&
z`m}m$asq<hE0N=@&-BRj1~8X-jk%6E-;6wSGg;v?VPNFsBY~(CZN1|9_pI$%a5oyl
zil2-nH33$A2RQhqxl%=3+D-a}YmH13xLhs}<+fPsG_F8bD&9a5M}5oTo4kygqGBS-
z)PXk?KGT{Y4c^78QRjBePhvqcW)h%;g+Wr+1|Lf7m9s$#aiWe~gWh@ar+(Iy?FC@T
z+Mh@vHBJbCjeng#k}`Vc2Z(XR_vQjCG;70wCz1GI8u~pX3oh!OX}doK&b4A;ZplFa
z)M9vc3SUW$I!+&#I%ZVyw94lYCWX!xH@<LI==SxY%<OX)wgNgK3Av56G5o``Z<jiB
zmeyY`KBswGINWxjcE!_xB@7rM4tX6C=hzWcRzZ^2H#dJx0!J;k>Scf~_g#I4D?4SS
zn+?X?#?sr#*(li3n_h(kBl@YVuT{@fxVd>I!Qm<fSVE;x^Q7{?t7w!6*&v*K0gnAF
zfRwUas+pfmK)xg*z6<Cy`XC?E5-oAdHI$vo&)!%DErjPDrNS?xYWxHA#Wz1Ca)<%9
zpH5)<hkW6ff!MT(-!Jagr<dx^$QA8*tM1$Xg^zoj8G^TVz<+JlsH!sb|Ho2+47T=0
z7>`lq>%SZ^6+YZtYtZ7TC+VL*ayyk#9Nj!Jmg+Q|R`gbumD1N?(llYQsnN<W+O!g3
z8Y2vT<@$cd@iBRHlSOh*#Uk1!1T@q{2}~V<+V4cLe>7JQCDKd(95V3Wo$Lju?vgZs
zF73gev&0o_wR!)X6xjkM9tB;v3<I{vj)Wo|-fjbJ`^SC(z@N-3ouR(w(Nq#C`^Y$u
zzw!ssd7mgs*?YMu!R?+^>wimNN&c^TgUx)ANzqX`Q60k^)3_{B;$vLHwPG28#xp5O
zsi))b$VvQ-qej-R9}z{#q*x(1g6G>i;N-KqX=H#ETb$kOaJz&z()#8eE?oq$8<QIg
zZfx%95U~t7mdHe_aQ&cLU0pin;_~ygiB;sTvh}eJ;va9U9~2!mQ$kYnC(2%)pLAkx
zxE1&VG^YvB6umtFyUk~21&jcm&`U47*VhzuFj|5CeE#zb1g*cl8DHIGH4dz|Uc8H%
z2f!y5$Hp5MTg^QG53>~yUeFb+t^HJR7B|aR(hjHzQ+a)56$AO6E~^!DMN$)GvL8Wp
zLS=O92!>JYk9i&v-^vRtYx;v>7;5uTsZQg=?|JA#wed$)Vi}yCSBn(_YMp1pnr`&A
znhzq%0Sjr)1&BnR8V7#sxH7c&oIblIjrLmz@60C1ISq>Ve!KxiuFs?Ge;yxCS{xRD
z4~u&Nm-W$-aMgI+mn@S_+<aT@z_64hx&@#q-{{CBqn+Em5NtueTl(9OiS&m&@QLy`
z8e@2>zy#d;){&4C6qrH{)F$jeVfLTT$FS=~-iuFjgpeqf8%PEjRYHHz#ppDca${p4
ztKgVJMSlb+1C&$oAG<GY-@IW!fGY&Bf_ohh#08jr^vw*%xsA0x+Qjvah!KYDi=Wu^
z<D(t{1ng~HQnB6&`~#k#*xFD!+#XY-@<{!1<p_PM+^R=!W$ya=EvNT5KUqy9yM_B+
z2*<fy(L8x?*usp%juk&5N9^YBcPrsaa=2Z~pFiL%7vFe)K(6m@`^w2>vaKh1r5z+g
zc7uuRp}ND44hlZ()9_ohj{c*?5HG?Fm?s52;CZzN_=QfAa(mw<_Ad#e-CZxfm(<u~
zim;QNzPa7Asx2MS2l;he%<EAfzEFgY=(`|WwyCi1PNSNQaT{r+dGr2H_FF?>yOA+=
zVsiazs+-0@S?E8fj+G)?`DFJI$f+Y2Yp`}b$t_@%Ty85vW@b~1!2yLuKr)?9hkYv#
zn<X-v`XIdkMKRt;)bSP&SPKT!yGff{kKBIfMje84LFK7~X;9`-t|nULAqmc4;ov`6
zmH*S6V>90=DYzl-Zy&*QnuOl?u?`+-vEZP4ev*8v`)u1%CE)H1jIr$!8y!nIiCst-
z75htu0-KmB$X9AHDm~l1LXwo&o)R*u@wmVqZO2rUjOxZ_6o;q?7E7L<nsVT7%)EXB
z0{4o@6+f0<fX)?lhtUfHu2IH4Ja*55lBBvBw5>G=<lN*!@3=+G_B84XZJlN#V#4s+
zO_QGb1J^Q}bHC-T9JL@7lg}b9hQiTIkU>~MDAVaPp?oL7%A(+}$%8@<YkFuQg9*TC
z8<*bouP$m4`IGJ*+|er@(jf&lJLW?w3$1^E^b^18==L9w%>;@kA}lL=Kk=<Bh-A9@
z_eCCOQI$r5!)8au`~CgNq-(kEZk<QeuWKEd^uDm8P=}`OOfq5i)p3u~pE~YKFE`{U
zvmp-v6qVer39y&m7xa|(Qh?aoNRc&=0+Oenj<6J-NCd5)xQmopDa-|TgI*(zLYE|o
zEziUL`4Y((t(+k*^ChI!XF8c&e!k4f;xA$0T~EZN-dw|#=PX;#N~lBYoJT3!0XZ<m
zk>WmphWRuqCnG+-cYX5e0DHu<=V4Ms?fKSQ7s0k=yt$^3<*V{lK2WuxZH*0Wq48)Q
z4~_q6l1j=wx75JDxHK9xv~g`&Q8Hh8Hn<B^h?>Re0+<icris4=P9F-<ZTVZZy4-5A
z!4PuY2c4_5CELY631mzz2MHlnfJ2FhM35seknre$`|aNpOhdod4}&I?22OIgAS(`#
zEmbWcy_iou9>>!obG$&E)Mz&jnRk5i!awz0(L`qcJ{)?Gn5JmZc%K~)<c3|1bl9tc
zRWvGknil~@P<>|5`N<U$i&*3|rac-ZD75CcR^G77q`m;Qjz#JZX_;eOz!96a6v%Cw
zySEGBdtbyT4=ib4(PA8P0hB7qR6*R-S|#TM)HA^DGBnZFOTdPJIAyj|{mUlIihP-C
zi$W6*U0mDN!(U_F%u7n5SYQr=S=`s+7AxccjaJ>72L#?Oeo>qw{+x5_Jw;neILA5;
z5@dkc2KN<EvlwOW13kx^lnzSoM2=!H0{}wGY;(o4!dD`YMg^kk+L%-ryw+(ZyY7(R
zz8W+Mk<9MHRlp9ozm7ci!#Tr_GVg(*f??;Vr(PH;E}Rs`e%Pn8Mc1OE8yOgjJZt*^
zbaNhoY<bRmx#ut|yH7(~qz!@D%cn}U*oHIRf9p)hJ;y?Y1}Mi@$NlS&fz*0>Z}~i^
zaE0Pk0rga$p@0vTuBqmDZ$YaH6rW>d%lve5!<^(_7IZjnv(Tr<y7)?ao(SQi<S6Km
zqw9JK_geusEPFg(@*-D*$WkpD#R3L9HkKB%SGfeO?uQdpK?G9MtDSx0dF*atvxXGN
zm|2N{5%*#P1)rG+!<8~Y30$)nRIUs2UFv<}Hlxu@8Kuyl9%jLQ0mCGm&5qxso<1dT
z1+$>H3Le1FBpf{muj!@25XM}B)=W=OmnfK*<+E;XJ+~AGiz1r#@$J=Q4*Rz;X^?kE
zF!12Ac_su`nZW^WCSWo7c60@RAN4!))Dj{K3Qj0u1)$`vng|SGE7Gz*@3MTvCfFRc
zaYE%IQuKP;4bBpxlKnf8KT5RiQ6qTuaV|+RRAOwXUn3;3r`ELX*^h4&A5Ta=-_&Cd
z3<<3psr^Rz!+l)$qv!3;@z-;kX}L$o=nQ~**9qtOZj_6o9`jZju54Wir$LAdQ=<rv
zs6GHL^zPYCxttY?V@Su#Qh(p?GeS6bL~@M1c039sMQ#>Wbsf2#0N7YszGUtg!CjoE
zT6twnf!YrCySmLU6GFEpvwOBXPy`%#2H%~PfD04g#}!T~ReP|{{hSJy;%K_nk_h~*
z##;zoyd~+kCHL39jmw5aj;G)wrjhhf=!O`W`gqriVr5Wv8+fuwIH~^3Ex+JtE@pFm
zK3u$CDnoW-BmND`K}b{JbEDA|*YJr!jaUMfFB3|MA(cuiMBa+thA4`?k2ha|zD>8W
z*H=im&*iZ{`6&uRC`^g^I)$DGZA_myBIGY~48L`qjQ)jcDd={+9USwvBE*^BvuH4W
zugnMvDFGH-E!2sr+nbK-Px_ucdEs7UyriNmbN%6;aO<FY(7rG>KcY@2<N?jf;-BB%
zKc$ggve2@Wj%xSNzh!c)o!8rDVA+tB+`pWpyb7}8RW$N1Bn=R7@_0J`$yuj>Ag@=p
zb5Yb2)uD#AWP(rKWNMIWmYb_yPkCb}b-2@vQ^kk|Ib}CC8+b9OLQGMX2X~Vitfv8P
zV+(jTFm$U@bvzXiytSI0(-8wQx#(WelALk7c3G6%<7R^QbTSbP<Zo}UzR20ZbP7#Z
zru2C@dA<y<!O74n7fPx9Bh&O|U8LFOONJm5XTw(NxQfI;Hq+OoO8&s6CQ0O)ec&65
zgy@{xl{`;AmHG@9o8!Mheh{p8XBy6fb)4ZnGn895|Cv1S<RsNCH09~7jN!r=mrtP^
zYx=Y{?hJwLU1y&|5$`STW^6{}e^xq81+HN_zdk;A@T^B><5@F92U+_s4E~wTYt+$V
z#k<OujDl!A*FWgSzyU&|j@9<sd~AD97ooV}iij8St)h~=xBK_Rw)Kd)O{k|2tWbQ_
zY*^th?r(D(43fP^`xm;?Mb_RcHd;;Glew&2ZF01~t}pa16!p#jvXQRQW36STT{5#t
zszaC@l&^dKm&jK8Q8(eqC(%V(4pFx~F4)@$pXg|Jp!~(_p5RxzVYysUt0YLTE8hPc
zKRmutEv0vfF`cLtdJG4?=YDeX&~3o<=}AYj$M53D8i~`OT=I3bKpOr0q~U<f0F7zs
zvxM%~gnk8VX-dX}W1XRIQu9{4NpVBXe*H%bE?TvdUPZBQTnGiYx1DvmAU_8fzzlxU
zi8l9MfR&z^DKwBR4w88z_lW<?(jcJeijqzN-}YIgONOo91#mu=&h?4;wwEx5`yyvd
z$Wi{GkhgpCh?)I*+q>71_I_;*#%@wv!8?DL5mc$EBBZgw<=-NKG*k{LTRSRGQR*dC
zo4&BMsYj?sSg7GEXOWu4J(s{}Uby=KX!72@2l!rvGQ`oMQ<LM8YlV4vZ9W#(`sm+}
zl?rGT^?2mPaghF6oAq4&)D84jMAykW7C!}{UoY7nz)`=r-~xgE0q7Sz8AgRHBD!sg
zuCuOm=)fo~+T~klm#d)na9e3J<#GMAXrccA$9B2uYe1a!xd)hG+D3{;n&^|LMh@Qs
zeXR8>*YUZN;j?M`UTvHEvC^sv-uPd~(^(#RYQ-vAe}ij~HMNUu+1{W(cQaH5KegwN
zg>yQjlK4_qH|&6SjfM!`Je}+2UKJ_eG#8;x{6*KS$<EGx-ukq`-1dTVjX&8np^<4>
zC7mAFM*zo|bgqXELpG@#+XefK!c>2liNl+!GcRk~GukCS^^3WxX=6rUOv$ICw)4VG
z79t;WMjpLn#md9lwv`ujH#UXv?`IIUZQt+t5b|?lE~IgNsPQVUMDc&1l1Mnv>~z*h
zLMA-J_hrVWxkjdOm0B7<F%AMfhP(_!FG^4Su_67rmW?Y-1h4>~*P4k-f1=!nv|{b1
zPYIs)CutS9+gq+iMs<Z=OTJ}IlETzCw>kM|4~kYOOs!pwUbfR|mM@<?B<q?&<eF)c
z;V2)+eepHBMG_dxw1oBM+>0IKVKUt08E@EY@|KXJs{@JV#fl+!?*W><?XoO}sEu!O
z&Rhm$OYRT^BhCMWYuE1`GdZJLMs{RrNOZXK?Hala8d4Y%=V_C?Bwj+VuOuN1<IbHK
z<~l$T$6pem%at^Z9nh$VQb7r!iCR3H(t(K!eH^E+OfMI8Q)$`*n|K8$u_fDyEi@fv
zUv7M)$pEJkXG9K8-x)h@dKjL62VxNu9HVgH4#tMP%{O&>v5FEIK;YsnqaPm6=kTid
zK363$rxLikdIEHH{kTy|1TO9DZp9(2?ALeZl#0XLnMl!;o8Hoou0g(zFNkd0I3KIp
zxW!d`mV1=Q2KR@5>0RCi$;rV(_;Ht-cFbe|CG=W#_YqZf0pgn?XNCWF(lAi1=1L?_
z>c3UkF`aM*luRV9mq=eR?tL2-Z>?&bRWUQJ7xyaT|6uMdoT7Z+@X?Qmv~(;;cXvxD
zT@r$nbgCe=AiJbA$kIp)A}An8qm;w~3rI;wNXP=xQVUXxoM*ph&Y3yCzu?R`I?lYj
zJns|tbJulUNxWI4{9RKf(2IlDUndw`$TjfurHKf1NNJ@zoNf61@$*qkk<PxFxnuk!
zN{kvNC~9^q_iCD~<*3%PoPb_%;krwze`fc#sTMa#F0Q2xjD-(=!)k`oTE&wF+z|KU
zPnNk7SADj`a<c#HSCzw5fTyTyUcfz;3rEcG!Xl6ze9aSvYxAWyOr^3v|7IclUd)5*
z(@TnJxhf7Htu(^8M&wU>kYnULP%n<J2^Pl>6mDgwnR+x-WskJ+VXYRy)6O^~K|G})
zS?#%q**Dp47H#(WW}Qk{P)ogfp*e@7zL8j7g_L2Tg;smu4G@N`JeMb^(0^+e5Mc1=
zqITKwYl*z$c}X>t1=B4P05X=0by2rUT9Lz=N>W}rt~#<hiXCItY=^#mzqnMP0FLoL
z-5q_bIGauWu+UE0i7AESf9E<?G6OK37xA`4SwN4Fxtg19O+eBM7{`DidC%Eea>l_W
zt3+ClZ0rD0L{uNIq@Ph$Aght$Rw+4Siqg>lP=APGtKtTRf{>#6u@-B41gQs|(4^J|
zz=1Y-xL(3{a2e7d<0C~}nZ^T0p?m+n+8j5#5DDcsL3%c;EP|<`l4=1@^IC%LR*oT&
z*j1+Me%fafE_|<_X4=Uvc-EVImznOmZU{WB%A?h1?z;RCmLT)J3#2U^B&hMb#n_*5
z@(V^LKsM<#Y&EF5@0qlt1UYkR2qGgK+}?JF(RyDoZHwO>9me>%jpc3>Jd<x5(>EHM
z>TJE-x(Iv}Aomv%y4MpG=er3-JFOxQCSv<oIQ!Jbm19xIXL&<mV)#QQGiw1%d40kP
zs6Kab2P`tzcS!z?q}G%x{$zqG>LBwUH^=&|s0B(Vl(Xx{wXcqQ3P$6raTbPuO<ZPZ
zSzY)l|GC|$zLtWByeb<Y3V_=jvajf_oNZJDr8R67f#!tLRV)y}ELhCfCaDoR)AssU
zJtQ&rmRt-Hz<n+I`HS^(!~m6bkI%R^V}?q;P`sUsL`jStj6PYy=aVe#0>V{09oQdZ
z2!B6rRmhzcB>N(Dp#96SeeqlM^id_M7MY8t+GMOz#;Q-q7&`cf;ZRCr4O=q?+TtsX
z)t&8&G+q%g*a;rzpiRTHZG<voy}nou#M@|@Wl#y!*C4`bY}K_ovrU&?JWic!zX0Xd
zlf!+uw-*1Fie#Q#^hqlL`uYW>%j2x^*r*41V;<E%El|MCTsP3t*eI5yKWvDxvl|nS
zyGNx89hlmPxyNBlgpaqu6ILL~Jrg6&j@D3<fxI=1IqY?C!v1_-um23aYjnSL&4ZO{
zO!Yd%-i7lnWaQj-+O?ecuztCN>MjQPTG(^Z{vR!=8Jes8(K?CxT!RL3*09uoF&#Q%
zcf6qnchI)u9xg~D(?${9jJ8|mvk^^*<yg1l@{Jync==m7&(GW4g=+=UdUOR-ebm_R
zqvE;}XHsHhX*?8>)|~YFheg2|w?Ez}5o0sAt~knSlO<}1%_yY;D}8kG{rO*Ag3=f1
zYilD=a9b3?zuYd;?Ysu%ine|VYWkMOA8Ne4zd><o$ydwl@SpCH>vju#4yQu&<_s;P
zBlgoI78=Lm`CQ`F@C{V>^w0}iA(!NcIP87)vKytMi+TD}m>`()n^Je8D1FP>V$EPp
zEc2_MepeW3)pe02e_68OI-ExRSZ9;J#g(C|*4t(Rjs&8qz|WQgPngdi9JI?m{|5%s
zr~QEOcavWdTr}K+L9D%0Z19P}Pw}I6WXqfFFJ=gB3_<;(%(^9XE9-=xb_p6}w^I=6
zN=v_<cc;!xjg>{vnMze(xuT=Z6aUp&i|6gBE7Axxhe@J>xHHG)R~UC2i4Y!xrnbV^
zeM_HMft#do2NO37LtXWuNY2}~1lc76ecPBW2ot=z4&qwwtGZ!z1VB_LrM-K_d!HVE
zX}x=w;s(b<sHUV-G@&zdg6nwm52rK&cUB6Y=pSYrH}{EjMSi$v4!l<rLs<HeJlA(-
zbfdnD`J0eotQfZXg6ANa=3eWb7m9q_M(uH{|FtP^*jg;QJ1S&6Wm)Rcazc3oVL_Tk
zZHxrjT5j5ZGQQLHCvqI|h?<HQX|(BCces$u8AK5~AZ%#!_y;;kr!G5Q#A$YBr$Q85
zLgwavJ>y7WdB)Ajg%1}-D>AN=q-S@N+=+fZD)sYK#<gM^0Zq})FQlwwmEjx52q7_f
z4zU5YAl<6C6PvOJK=i|T4<Lm%2X{vlpoRWDvLEa`zQh*^xFlciw)avf4-xX5W++U{
zV}avJ&&FSaw))6M`n!PC-p+F~vg5cNWm{l2>$+wef6r1-fXgP`#pgkC=X(#a)wWCt
zU8n1PPIptS=2Qv8Z@V*|wpter_t8GL2Y0hyDa7rq()NZWB`MA@|8Md5X9Ba6z?sYb
z6ww+Ej+`e`1lU6D>d~g+A#Wm)s}X6^k01drAR-?{s{BlzkMl_D6H)cd1^21(T%-Q9
z#Cbp#k-4eVNU4xLl^Q@gbDBJ-i5<vC-o=CSwCrUiu)(zlz8!>uA{+1RNAgw!bA7Md
zxRa0f(4It!%|92N9TG8YqG#0Z*C~l(Yxn9V6U`iqN&E5zw+j^yqs(4PSvjg`eos}d
zTN8=lpU6(V>+?BSc!588_N#}IFuH30!==iY=nO@mvu))cPv`i~F&O;)msHz5Th!>m
zMPk?=I3ovwB9?yDGh0F+8zPw;m%KK7k^90yB<^*A#atOpNw0@+Q83r*B%X^lmQriG
zyHz_utSsyFux$!t=d$PCTEK=~9Q_RKA(a2w&<QYwPZljV=E{pWxTw(jn%4*U{mu`n
z1>qkNJ<b=s6QR@CWJLSZ!T8z>z*oeOZ^W2HBdOTsxFg9lw2m*Sa?o{iaT2DhBe~Q9
z-6*<)zNN0kN04Xnxk_8|NS3GM*M+U_x~W#@>a>IJ@RbS9CQfB+w+(5%jV~ISzPH#(
z$&W9a)CG@ci)FyO=j0H(-{33V+ILEO=v?>IVk!A+3|;oN5*@yY^Ejukkoq?~*9uF7
z)v*ryuxWqflz`SA+8R|qnX+~}ZSBS2YJY%V#O#T-#`D~>h_r*(+_%=1XhTF7{`2iF
zr$4%Rr@5QqK1b)EN{r~A$h+e)OQ}#`LzSc!R;Ln*KwauU7+=SZXEupcA?DfOew5P7
zkyOMoRo|ssw#n;1NCSDU8lgnL)m1|X=jC7R3(D5g@XfKpM0EWRJXI7|4e#CQ7uE1B
zgTWmsRMR9Jb}CUH!?&Hkexq$#Y1u&~;0ki{w1Y*1;)pS!7nYYFILt@nSg5S_9}ia3
ze@MG`o^;O?vv^jxdRkZ<UnC*85!^{|SaMBd@i%z+%WoiJvWejheL&Y5DES`ncQ_6F
zwC%Jv{bRG{?&43)!T3Dr;04#ObcEk&g+~~b7Mw(qX;q-)+(%dIi0J)jy8{leZnApu
z?&OJjEaKu9ph##i)#9t2DDHUv2Rgj)sCP{BS<7AkKOWBHTK*8jCyDTg!L3Rn4?)Iv
zI#%kV9&x+4MB+q|6Bxl)K(PO&(Sd*7*X2y$Ck)hFM-8`fAHV$I)F}4l6*C#Kad&|t
z<2F~=Zab&S8YNYpghl$<_~_Z}dW(nX4o2yRK8(cA$pG7RTNdG>kVr|iw(|b%Sz*~t
zWh?PL!EGr|xRVbiPz`buHmWlkEG}%QFhv^C6Tmu3Rw!|JlR*-xsoZ-@5|1MbvI#Xb
zxbL<r_%lj?bJ9z=N?AQUK!2ufx@fYtFzjN>&}D7Qvd6c13gsHIk(*>nkj3Irs2cC6
zF<M(*HpVF!Q^OG!J$Z%*LA{g>*>A8VsM!CJG4&>l=iq@+H#!v4bP;c~Mv=k^jiyvA
zOjHw~3T|&#+ThT<;tV6d?-{r#6JIp-Ws)_mWenwpTDXDl{{*2I<e9-S_*4B7=`}e0
zDNXW%g}vkfHSxc)^~6k*31>+rvB1bwm0OHS(6Q%X#bdUTN5}OQ$ow(y&58k?aF;dT
z5z-*Tly!6Msf=*}hyHJty<@Gbb=Hp=A~?gQ`x>l;z{&{4;#P)~i=Z|-=fxXgUPzC+
z`rN4+4=i`q%3=)JL#&CgeXbLf_||0Xoxeq0GaN7&UyxVQ2&U?(N2T2#9J#9;2FZl-
zB(RPBK4u+j`+<yF1(?C@wgCD+2Sa$;q$-x||519ZZQORv9y>i&Q#E|(G%QMY;`quT
zqbsXTsmexMyQ4@@B@Qh>qpAd7PLx)EM*cOQnn5EmmCce15);F|?$Gm=uM=z9*6!7)
zEq1|L!(hjM97DV4F8sYExE<sbfV!pB+T5mtp|)vI8)8hUt!|jq_}cHdwJyK!w03RK
zrGS)hwVPL=q6K;CqFH-5aS)b0T&+$`?3-3)uEUMpsKzKLcTK{+Ge9!a=Z~<1#<}W;
z(_U1n6<Y!S?4!<#8G^J3I0JtBu8d3Z&MDV*eDwZi>RR2*!phsz;>1!h?akwwNj96K
zCXJzdlVMV=Y?+!?_FZ6*zn_V5Wb9W6<^dbjSz~M5k<#xMd(;R|88;VRe?xv^M{@Mk
z`fFc~5*jY5U6)M#WW)us_<eklI#uNbLjt&T7!qDARsy4DNFITqR28U76(RP83+Z${
zi<?sG!NaHO{tlQ7D21yF9@nSRpX_Fl7E{tU0MWi$^%p)5!a)znWwkQb@J7!&mQxxO
zM}LY-K2)>1!2fjz&D~tG=1=ql#1zzM-gY(^M;FDkPnrP)9p!>~Q!9)nq>rLMIrx0H
zd54=K@xFYy7;T)1=y}5>qK?)1^-Q58FeV4P4&s&uI}8_+=TwrQp~Zbn-~{y29QQQ_
z8DaqMjD3)@owv?ti^g06qgsTalm*=cp;vsoZ7D^;kK#aQ!-o&=PU{FgXKi>pw$VCp
z?I<#M*ITkM7!fGHQ=`K~FxP+%=w9}xW~#qMCBTr=aSgbFPT&qj83~P-pqn~?i~jtn
zO5{m%*CW-qRkQF@g_2zNmkVwKss!t1i@x*cg<-GGbfu@y!*8$f6=_^go~G6jspa26
zb*QY3HLJhl;y78v!AZiy1Xq&_*6tDH1RlF$W-PVPV`{>KXTYl{EJf&$ODe7{zet6;
zl>VA3Iq3$7x*x8zd%rQJlO^XSl74rd*rm8O3ZTCfu)^yKwNad5ep%($U*fyMmCPz1
z64BxTCtW<@Ui4n0{P~9%stIINao?zYB4zPlE0is^d(vrqQ3bSF&?lnl6Q~{wH(Kwa
zz6?Z-bCP}NOAbwJKBAA-2nI8mb|ljN!Wt&jDDV4G6S++9qLr9h*D~KI{_HXARwGf=
zQ}o<-WM4<j7fsUDN=@q3h8?812i4LPPJx3S--ML2bg><d*}X(3#$S2+>OD+6XrM1<
z78!Wg=MJ$89<OnvC!I)^$aOmWiJbLL%YM5`(l4&|n7ef^g<i`mvH6-Fo_w)E^=c}z
zbSEfMYT*M(D1qaQ1I?19Nc?B|Vi6Ifr{bu3RsT#dO1xkmTz%h>BORJ&>itEmS^jsu
zHY9&#+#qnIZ5lWnkMw+UpZ2(ezHGXQoy<C#q{;Hh7j{u=%BLa&+4Od5UyHVi>MDOC
zNy{|%D%&pBjjP#btWz~ajwX)}?^)DA=Y7q+iQTVW{^KGvk&5L)KienWYx5sNUwt{q
zV_q*jc>5G>HGXLYDZXj|?TcSyA{IGKg4z11M&~$Z%t)OZQ=U7^5Vg{4JRx0ekCFRc
zrE~wFQ9Rxn_UWWW0BQIb*7Lwgvn)-lJmzo;d8b0t5Ygse98PF8Q^bSpu!fk1E+L|S
ziXi{F6*->8sL-P<f<fT&IQsct1q)Roysu-LJ*_7}s`$vebz`CTUw%1o+FC`n2N;Zt
zZ7Ib6Rk7a=b~O7k;6Ae<(MgA9oCtU?rhKtk^7VRTq@rJ`=W7SAt8|&B*E<!4m>I7`
z<8PuRV}lU1LlT0@I<#IguVE0v9adB_xOqCfai8qukfAV2MCZ3ktdVZYc`)4_Y7yBv
zYuJA$z}TcINVrPN-D+8KlG?SAL<*7L+w{2kJddxE7Vq(W;3<82@~*H1=P4z0G4-az
z#A$K^-L!E$?iZ``Nq6J`mB&KjC0_LXSDE*TG`$e?+R1?vcxYNXo=^_irsewo-2ZFk
zwb^N-Vr?O?j+PAQd6558O_)7bDN$2sf<wu!)t9a<>l*nB+6ls^#e20r+UD#pl{T_S
zx#-8~S$SCW{o;gz?RO>AM4r$o#*2TUFQYn{=}1)iPeiGJSC<Xel{bE8Nm!&^P*In`
z>z{jT-GF}2tNY@jb0}~>xNs5{h`ggxAd(8@Qd4De!i&{)iJ7tDW9PJ~v@apYBPW*9
zT-q%9m-;rI=<0zgSxs@<PcpG|uP4d<_$*r(l-e%xbR!`|>MD1E;;AbF&SD31rhC4v
zTyVwsOUJpCTvEOr?`|E6GDhlV>@YfJnli6e>K|2W+StcCUZ$(EC*_L=);ky~HEQBd
zwn5){A1buBwTeRYbnap-VRxDiq+31OyHwR!?+WOI6O|8)O(lSfR<tJglP<QD_M+f0
z$xIt>O9++yi_w`h;w5BSwG0*Xx#9QqhRgB%Q$w{E9I7>u2>(s7qbq;q^fcDN&6A2D
zh+Vp^%GHV8Ah^7<w4uuRw=`d?y#>oh62XW+O>~m?G^i;hCMpCFmnwUwlnk`YxhH7C
zR9Os~6Qi5A-Ez_=QO&ZX0ZiHAXF+!jsWZf<2Q3zgddlxue3jjI6wv(}zKSo_@;p-M
zl7ZC*#d52NuRf_eLI`W5Fy$dU`Ewslv!>QB-~K0vm8aSDFz>E13qn0DWJ3!6lKXW`
zLv0ND<Vc*j#%!aNA=DbrtzRmiFdFF^k$aA3NUhN`ewIzDp<_j<u8bes5v!T+XHQ&7
z=G+-)OLA*sx})!_(3FRL<KN0JZsjv!fmPNqT_@^q)xK62jT-5$9GwXq)%@=}Z{bfL
zoLN2jXJg-&@_^!;NWCh!dNwpAL@?As^ea1(Xor=bD*qQ}?AQH&^XD6%x|=S+UGc|G
z2-yMiTnl)i0KcW_Rs0u}O#{7)1~Mi(Re!tz<Z-c5Vg#&@ktZ-R;u)GCcS7mJptPZP
znk$p6=|~SJoBB`1{bRToTTU}*08!H-x=(Wlz#V>Phss?6;E57OMVC2}b~Dt1Z5^I&
z$L(L*Ng7BSOHVv}$6N7AF){eNGi9#Gk-79A`x(wm!K21wANeb**%a~VW3osoM9l2r
z8I5RJCt=H+awv?2I4Qo}(j7w}Ji@Y$U=M5Mi>K@yFrDr}*8j21nKX?F`lGg0yT2dP
zKjvs;+(a5tnQ{+~wFzY@s*l#h7S#XUo&#xx<wCAnfVwt84J6rK6yD@AG~9}%J7P~@
zMA6K+k%sO4MTk{zA;e}#3E_ac_3`fPsgg`?(T#GotVWSiU6vNV=wzGzNMj=Jm}E92
zFLmCFnSUJg{x_ruYHAhW7At`6IEIxrr)(+u_uvk<Sgeu)vWQa8KQ2U{<u7T?psM(k
z#ffRLwjVxEEb$Fayiq?P(3aw%Z1j*?)gRxYB|AEAAYYoBY4bj;sP;WK-SPR3`SM#M
zDdV-W$Rgv1P;KhY?Ew*dq2eT>vEk=<*9Xy_fIJ*ZYSfc_a-gV4E&CU6wfq&QI!)E~
zkeB;|T(~mTe!c3>V0{^C-#Zp2|LBw2vJw%9Ls^fve+J_s_ERSGGh9Sy$Q1F3S{3n*
zsC!eZIz(2N9zh(%eP_2-YH96_Nt~mryPcA1kHHcpaVaB`Om5JnN$giY=tK{+7|qHs
z-;AK3;DhN*pLOcQC)vC_Ni`ne{qb9ZBcha9cIzo};4dEo!ODG3vlb)RV|M-(d|=O7
zP736zwO^1aC)8h6VpVt7>e9F$dGX3qB*fS4UidPXEGP$2x>avHKlwZFecG<RG&v4a
zI<pBSR)W(Fj`%LV@@3;Y@%L6)x=T`!-ma_U;QFtbs%2oUdqV7oF6UH;8lhk6AP+ZV
zV>_cSiPhLAZmPg&<<AD&O4FK4?%Uy~qATtsxvQ!|D;u!g?Qd!h_8|2%$58PxkP$aM
zjO8=RxXt21IV*U$Y9_#Ep}B`JG}y&6^P3-d0L|tYE(dTgEF$_0ka}N|tKLi^d}*8`
ze}qK{EB5-#pD2|bDc>}!cUw|0Oo*I&cwdKT@f(fZRKV>XEBkcpRIr^nna>(NC*B}h
z3hl4MQDj?&=g#fIj9K!vLRzV|gSwz%3p<*h3|<@pT9+1A`<s5|3)KcF-Zm2YWAi1_
zA;XjLeu>e(pc)hW;qA*(_)QK$PeZ)t356qEJ{z{>*Q9JT<gvm`8G3Dd6>m7hUKIP!
zo`N_t*5ZdKelqTQb2hH@5GBDGUXra=BapEv;KqB*69&;DTd>vgiX4^yD<}GM@biaw
zmLisCjY<2y+xzIA?+>c7)*>D`$@WUsj9fRNo8H+?kIbBRSTyrqlU+icr1*l;i4(Ge
z{7Y}UZUxPdolJ=}YhAmr8MFr`kPAWkn~g>t-^}u%R(D=3j-CGGUvXzE{aa(&JND1=
z(TLizC0GIl`j*x`P+45r;54od2EJj}ASXDl8!k6K5#mB*H3vcvFz^1i8DEolrhwPj
zT1qKT2|j}yzv`nsE|#|8UkGRbre9tL_cocN9~gS@k_s;4-mwcN;T88#gUUWN2L#yp
z&KlcSS>QLh>;Eiw7Xa@b;Q$Y<n?0m315T+0=(`V_w(y}+-Ep=Sp~rIuuVdtTxc?;@
z{OIEeEoFUNnu{DfBwLmq^cMWa<`uAW8ag5q&#c%cpVd)9aK96h7Szdu+gj<e+g7;Y
zLvar&yq>yA2Zlwb4VL1B&IXJv$2>8n3j$t;il4d^!Hp!V(PRwg9_Oorhej$&S<AsO
zt(FJhSXHwO#@63xEsT!>uanX$UGgRdKl-^eDJTn!WR@8`<Y~xrau2xHK7CZ+8$)j~
z95$$Y_~FDMj>`i{aGr$e<3_^se5wIA%j>v_fC^R+3WA9cN(SG!Bh(DWCh>`$G(rNr
z4nb9M`dAJ8sL0cj!uMb#vbY?3zNxOVF_;J&U+|4q_T6A?DNma$i4eHoiO~7_iyAi)
z-8(13U?jB)xQQsx+l~MevHa?POF{z9;s3cLXR81G_uut3L)?YW|05&5`{q`=&5x7M
za<^W!sk1lVrHwOgy!W<wa`_>1qz7qV`Z8@GpN=o7d3*CMzUDTM(=UZTynkK0NV=~r
z<pz(nVD^4_O9rmbE{!cgC3oKNlxTX~e}J!X4`1UgX9S3ol$reoIMARqpY-bTlmfK8
z`$B?u>xA|y<ut)F_wZW{U)Fp8*(!B;;EVPEL<qL94WJlf0bzJ7py03BVR+Ib)edkL
zj;|k?6W0KOoE*TDee?Q@2xx12uA`<-A2Ce@k@d8k1s$?IpsSn`G_qA40Ne3p&q=X?
zNLt51nppKV_2eEWJKl56dq|_l5554PGH2>dJnFDKRbuehn3+3mgdh0tw}j1rLL!~B
z!AZ|@G4Ol~7dx6S(DWU7!b^aq{onK5|9^Y_6Ma}t13;m5M!5gn4?DdZE*;LT9H^C4
zw=pY+`8MpFr$0JB+$4xD%awW$mC1h@UF1gd;~dO<5Zw)8)vwPKWgt|P5qOkc{HBj;
z)(y=nV~Xs<uYOe8<Dxe-&`Cb|rUh^-!BE2S9(QL14wlsn*w^t0{sBn?1|W)%%)0jo
zDprJo0gz-IP6l`v;5xo~Apn*xe{2<y^H9T>eY$SgZB;XJ^I)Z;-i?k6f~mCDG~b}R
zLA6PT&y78P^>m6I42eVotPh5-(h5ldjY4AqhxM71+Kq4=T+Q)dxf>109*6)e)EgrG
z#MlxvwB@U&ddWLCNsu#2a@&B`Xh*>A{`c@7=&|37WtF+6{(*u4$X_(*$`c(1Trw)C
zKKjR1=PaACQf^6y;G;+phd(`P0!$JWY(ozF@+Y{6RUdShXEerxS)z~l@f2*)AVS<M
zIkU=;fLU6h>~hHHH-c_dG9ry6OF^tNasb+qqQyGSI1f%MYdV;4*b?!Ovjg#;PfBj}
zQX;lENmQRAK;_sR20kJbfxQ9{KZG<2Fq3gJ13hDv>Ge^woiTGot@T6DpGyJowlDf2
z_AvCV{a(=ZwTq`q6;mXwh3ZqvN=a+_rUEofQd3NjMmsZas=s4!Q-@_PCj?tnffsDm
z{*01p+*Z}YgSdcwdbeqWI&5GS3gF8z0Iti61-pcT#!%idCt%vDhfHdi5-5r+!^413
zL4q2oD#Ca3dodcoSoR>R0BW$}42Y{lLJ{$gVM(;02s*~#>_VP9B+<_-WgtDS!I9((
z+UNbeMWgScEX~ky5-UHT0M_&Z;R|Sx$wM#y-k$etNDBVQ;q?Z#l5lkPbnlOr`;Y*b
zO@%j*0DXK7b~Z5lHD}f-QZTcBIoQ!<=g>tmbex@|K{!Aeu>O?!ceWY__;@4{Qls){
zAZp@!11Drau1Oy@0Rbeg&TOVz5Mp=rH88V9HU|P94@{pTObsM7Apr3hYBx9%ePHc^
zF8c`~KH-#`1m=N(8+%XhlPx&lPqTBU<-zJI;tWDr?G`%~CD@A6U~q4=vf&O5(QNCy
za0KPFrY5)DDKDDO1MyY&tU?yG#EErD0@J_#T<uPf0&Zu7M9L_E@mEu&9!$!FDO%%f
z8WYdU?hnQ_S{09eby^{f-dBp-!v>abPu@`qt4HPZep*^SG<}Zvs(CiZ#D!O=6ec)6
z9{lwU*mFxNJxoPJ@%OU95VxpRScLJw%HE~{uNJS*Bh)#oNGRVXuhJz}eMCj)#D(EU
zmr<I&H~jn(w_Wf35WsC$iU@NO)sPz;_IU4o&-42s&Ioihrg^MVCgr~ZeYR+Rgv!O{
zXdB#OJePX4@q<;r&wBB+4c^vLx72BJZ*J2_$w1QKz-n2G6nMFZl2rzESSw1LLR%_@
zI_yZCW?S8e5{!msD&PV5Xe8y6_XknkTImMgmLEQ&bx!)mj~SLqA*I%oTRM^+grwmZ
zSYd#ut>6To#$cJwpxlXbbR=IO-4!&KaJB<3bTokO5d+5jy0)p)t;sSt!+5UE<~b;P
z_5dX9n$|F=z(Ig!f?{VNLT(8r%F4i)5nZ%CRv2AWE%6#R;cx|Oq>-JQ+SMj8uqi>2
zt<?f}oW>Bm5&E#Ri9>*+i3KN07RL&Q26W??^miEsk_p*&`d)H%geA#L6XDJq><k^(
z0}xHr>wxrGOh_p0-M{xYJ^UovK>`6?7`#Uej`zFlDi)8bx&j_|YhWpQ+A*Vd0N%=q
zVc8rIKEKrApm>dv4+q(pjx09Va>M~R<k6sSce;;IX?A(B#$J^{h7BOF83ZkVO&Ulg
zX?(emxGY?_>wqh}#e&|gVa&%arcuul<V4WUr;XE0U`BiF#Q?0m_=yu`ku4l6-7NoX
zR}@`F@RljxD&y#^;zq@P`MKI~EUm-4slF`SqEJ^UR)D?Z&GFp;9Vf=Dv;<z|Goj0O
zi>v`8Lsfw02)*cEhjwMKMCeRYLt=f=78futa!A)~TY6v!D9IH!---k18vVa)?4C7)
z5AVVzD3Vct;Z=n@+CxWVZl(Z9*&xOlAU`C8gB9-ablb2aS-=JF2paPPsH)p8tk-tB
z80)B$tDZ&=0_hR{s`jIAjsW9IVIOj+*$m<_TQarp?EtKOrM!;QC7KEHayW8Kieo>G
z20cr&F^zz)TtK+C3s3Y(2LJ#@T+;32AN>TabOqT3VCR<f0CcKVd94--Txbz}CjHga
zVcRdwTp6ZIql*MzlGG=_GY?Et2OrAi8+EDt_Y52#@P`rq-}zWLYLGWyI2p91kCXj)
zg3>`1HvRhijBmW%XocX8CuLdQvyW#$jYY=V(m=W<^GWBlHjN+_yDmfJp7->_lg=gW
zp)_hh?+6j09sV-Y@#=3arK0-<P*>o?Q3dz{ougO0A2cuP8FFvR_vFt24okBFfYMvl
zpm{(mA}mk{P%0fM{L{SuxnookfULKz^dW~RAX&!Q+(U44yaEcu`h@PJb&2!donN6i
zt^1M~?mj^KU~fIvL>0Ju!-yty1wh@H0K(^@c~o(y7Z6`uUPV5CI+A}zBVs0PJ@7=j
zrfC*vRXssFbS&L9G=HfINr}0Q0_8UOy#AbU6<k&DNn!e}%}|xLfejyUS-vO?vBOd2
ze+bAtzW{PiTt)jhMEiLA@H=>^%pWXVp2~Sm>WL##1qGzm6_3IVyq`TqV8aKLM4P4_
z9)cF~KM&N8gWo@To4D@!$JxdyfoBDWd2N*VnZoW18vl==4dzoC>_+%!CszNrTgV=D
zPb!+N!{D&i-ewh0j4rEDJK@`_i2wX{$6oaMxJ<PV+%S_#UvMbK7_!k~^Zfr-M!^Ja
zM^e0F8CC`9Y7=se-Zp?WTLs-@n+4~bPaO5=He52J=CPomKk_$7lK<v>A9F_&v*J5m
z#(aC_J2?3}PWT>7#eaax1lS+sf~zuBBTzvcOlcfRc@@z;s3_LSmvj1P0q{X!mm;1Z
zmSZg=XRS(6&D>b&KomFDKZ(!dA<}3DvW|YT4lV#zH?`3J)8tTwiMNV}8OfvZPmjAZ
z2vZ0Vjeb41#p7u~ffCy2lO%sKW!jG;a9#<EiI^Q>`8}jy_b6+I+uE!2474I!9IxDO
z23IBD$}6F2h{&)gpb0>FIjr6;HGlITf>Kh1TOt|Qg>kjbh5S$Hse_7Nj9baQqaK-C
z_1$`>=h1obXx>{0K?wuP%E~Y-ydcF2Q0aT0%oEr=@-E4s$eO>qkDqq7XOxcNTmyZa
zVb72e09Kw1<0B335<Uv4VwGcjaH%QZx1FKALB&rN;YOqi=>Y7Q_Kg}yQE^Wor6s!O
zLQEFYqJg;AT+4j#5M-jN07;1?WWk0|EbCau@xddC&q+k4TQGb<R!D05SWDu>rjRo$
zfQ2>ocC)^y1xDXJNWXgrhyf+wWQu}yb^#pJ%hL_6?GyIdH>o$ree`ChmZdK%1Zp!b
zKWRTb08r;>M+(>_rCzoS`Cnxqm!qVSkUAKSU1bXkp|9~9Ww!s;G99#bc62h#Ea7c6
zQxz~;_;XagO3m5^pqsHvb;FWFFiSj-vWLzEHwnmm(C(k-+%VtU^e*3@QVWe*w0>~x
zyQ-iUTQ3N0@lsR?Wu2X#Lj~|){j=<TuX1C})*tS-;cnfNiEsO1!zIy0*wK`rX=^;a
zy#;Keu3`zWx?-H)5o@E|(MpHCG*^yV51ZH0^^t&)<#C-$+u~7FOgDbw-P3RO0D$2t
z51I_VnnCH!rE2jKYz0WYQR#d#pU<!kE%(})a;`4yn}X6Ar^AE?OHHldf}&PLnHIO|
zQ`4&SH?~9NyGGU8kez@s-r<}VkGP;B-n>GOp$cb69N?yS@9T8)E;m%K=T1MFDJQj{
zAw!>5T+t(4Cx<<OfRo__zm@)*83RpnC`i_ZCXY-GTw2qP@)E%~J=3K!#Irs=sd`FH
zWbv%_YW~+hdK?8}jgltklcBGfxt_BfIdNj0*opI527e>g+joYtDTOs8E&o-%!sJ$u
zC_0Gr%(QP{6}j_1EGUG23QV3j4?3E|&vjUe=th1C`Gg}@DSTPbWZPk9S!709`fw3I
z4wdQ}h-XyRt0a8L2G?;uul@W&(OFeanWOmqMmIIM3WeE_lu37Gl0+bPRw0m^3Q^Aa
zo*l6Daeq>bCLJ%K0?;7R%8!97no;;yt_6Q(P_3*v_uI8my1J<=UXO=^Z+1|x=UD*i
zG;s9Kak;Yy@R7^<UIqjK-^Gb$^W#AjN3@Bnor<WQanL#NdtaEmx3O6!CDxLm`@lbl
zsEgWk)_rRW;pZD{kwDb6ZVUcm=#hUNfI4Vcd()i`Zykt^H}#74{RCJ**%v?)X*aTX
z!7|_Jm4J=>JAB2+RGzeeo7B$L3*UwJjf^@`#--1u(zIE(Y$(UP+7GoD$OKOo)6-f_
zz%>EN4Ehzmu-E%T?KvAUV?{BGEofqoC2f~lABkUDJfCsA(J{oMY3tc=oWFTrys?EI
zk7x%C7$hqN+>WK^Cw827xz-8*1pJ0$WYYs4#T>I|Rqp{?ft;lZk9(ZX(}YHCG+7yU
zyp@`XJ*I%>#jwclArW8yw@M0lEFMjnQqUXD)y{T)OKcjf8r;=`a7xW`#pxMAdk&vc
zg^yGFg<&ZteX#OWx0cKhr7VP!=hu_zrTwE0DHXv<oM@5gie;gA5!c5XS#qPdv%wX%
zSXtGbJ0_2$&>HWdP~Dx0!W@~yQ-ig)HX=Q6Vi>qs?f|Y9|4y|%Wm!vA`24xJ5f6hV
z)w?@SkByK<T)ofsh0tZPCC`Fud3msATxN*pxRd8J{5oh35AW{{W11T{pM8M+`)gw{
zDHmljq>`?ZLDaVNTA_r=hm{}0J|_pUIHg$ilwAu94CKv&i8Rk|6y%qFTE9UV2w+4{
zQ07>GHw}@K>frRcS|fh<knVM;!4t?9Ewi`1Q_0o4J+uq1kv{||_bInW)l&HV0<=1^
zzVBa~zXG}Z&Nx4We<X8<xcbzkffu)>l&n;NRL>&^H?8`(zSLj+?J<JMB!y;G9MLE}
z7i~<{Hqjl<;XXZ-wp3~f{I$k<@`4<51`}G*%^~<#PWZYjFovhEo)6#tkvm()pCo7^
zR8<=(Rp^5xR`zXR+{#^K1N{3NMoxAH8V53VPu(r4rw3ZOq!H%^%nv+|G#1QA-w8s{
zAyt7YQWfKbG$rlUh43Ivnpq`xnxYusWzWq>o;T&({QMqTwu<>{aDB#1eWd%)^CRF^
zYYD4oT9qqqEm2zpP*USM68fT0mDTkKx(6DsK;oqa8p%UJt=p5WF%cqK{?q*1tJMlg
zoq=uOC<KoE>D@q}!p!JPwjt;&a66TBj&4JzFKwa}EFRSet0i^T7}ub3U$b`SnUz}z
zq&fW?Q4*Z0%z>8K1-^OdhxLC`JPKU2?}^0znrGedNO{ouZ2R>YEx|%6hx)s#3A;3i
z7R@~WeZzXR88t&xb2|sE1GlLx?ae=Dt0{qLQCDY9QI4k|1cwbiL_kS1HYx8-yyzuD
zNPX*b-t{&7eAt!BRUyseAtEf2=Wg78_&aoP$V0DQ6zF`~Y0;&1s~Ibz%$BrKI!inn
z`0Fv-+98ZpQ_JBaGBdDu&-FU<7)%IB*CFQXPuHNR^JS9jso6NF=~J?CqKK`K$|)2x
zgUOkM7)4D^EaZN@Bg#~etkf4n<}Mh-@ycZT9CtYRUFyY{MYtu?rQ-3wn^mvCtS>y7
z{lDKIeGUjv5L^cLM6Nv9#W}Wpm^q1da#vyOoD1O4E>T=KvFfm>RoJt;UyFO8x;FBf
zJXz&@fw%s`M53~C!Gi#Mja3P$t^p;Ev}s30a#kXHK(-TuVYi^^KT?@-{aSBmwKL2(
z_I`(PoqNY^k17^$cKzfagDO=~CF+PK9Uu<gtV^5}PeT<`qKm9cw|{})BFy~tUzZ1Z
zXB8SBEcYcq#P>?Zefv2E7caBwztIILJqGH{R|ipO-5Aky#mAPqjo&j3C|JMOb;k5H
z-;Xi*3i3i1hY+XVNugA3z^JjxRUdHbcD9y54}TM^BA`~)Z^>85ZjPxXFC3_scj>=t
zucHX<PwfRo@O%TR_t@FOprpElCtWa{Gd}#yB;+uvC}!Zi-7+UJIL85N{tVIzyC)tF
zbHri3S=xL>*9lXoNn?DS8N@^oza~&&jT0V|V<6*FtFuGG`hKT6WSIM+Pl&tjehB)j
z-fcISL3eKSJnT<tmi(%(D~D!SX!IDjPVFLc%&9Z%3M*fomIEdK_o0Ye7Z?x_5Uzmj
z5x!KoCALE~PPZ@9JJ0>y<%G^iD%x6ubDc*&b+zF+x@;8BF;2$P?uaFYwY@=}LH8Y1
ztm!J|Wsck+cZLSOJausPs6hDR@|<w`oYP6eyCa(K{X>+;5Je~8E@HJl`NM@SQ!%s(
zt^-!iWn5tN5@+RBhO>iy+TikyL#FP)&~3Wnw+dnpG=2Zhxd-+z>@$=7a;eHHLB3s?
zfg+UJjm0C+)6=alGz(|qZ%04F1!SU>j`&<|rKn$7e0@n$(YKYDAFe7!2T=WWB8D$Y
zu|L-zv&kl0(t{L&eANs;%JHY1*cg#0;`P53iFBT|u2H|fX5*KR&va`Sqc|9A5N&3j
zQFmD>v!oSQd?V$vnJ!my??0j5*q;C5-MFC4#fW@rUpS|lV5mtKMrlcDw2WZTTQOHO
z{1<x%#asNiwhn*x5Qva%9!x~|3b;?d|NZ_|*1)Xt>Vc$w($zt%GYhm@VLgW{`Rtli
zRa|pw4iIG69IMN7O4l4?y5O4|fB<99PB^Q5_Ew_*zjkFQUQ}YL>$&fO)a@HQ`Yt8c
zS>KhckDf7j9E3Qx7sL^C9?Ok|$Wn?^Wmk9-UN*kDI?U?9q{2T?ygwBl)r2g4p7+`*
zjE2=q%v2o0mmA{yDOSD%(bzQEA6EwU+}Nq4%4!xK>{N^Zln)nI8Ysogbt&nc2@LyJ
zlsv%s5QFyu6oC)1r1@3|Nr9pz*mTKwu#VaWiDxR2t*3G5$A%=UB&&4e?y-z+@f(lj
zDU-wmA?D_b{e28}n6CeNDA#6!5H?FChbpKU5bdm@vUy(o#qxKI|3fQAM=UoT#;IaC
zmP~oP<9V`QsKWsh%Yprs{_S@Ej&>#IftA~lanx#$T4o+>^Hm)rvx`x1M)Hg$jj3E#
zjghXFgvh?LViK}(Brt_Z)l`5(l4C4Ut$#*_G;Y~U{kvh~*)v~Wj-)7#fYox4EJv6M
zl9c)$k?|d|aj11MrV;fJSBtY-P~zR(d5wY!Y&?uDNGvcIW}MNa?GuWDc4xxxry`?n
zd$ZzykPZ*?yNrVt`Jxy*3aR3VO1^nG8^Go)>wOu*C|iY}2Jm{Iy1$a7$nCMMoklQj
zL<qrKDPPe5w+5p2U!?xlrH^{m{c?3?6j!aksC-uFSxC3?$c@ltrGiQ7iMGusxip?+
zcT@Ew4`6?YtVZ5|CDAYg5?f$NKE{*OrQt&T_=<;okMDsb9_icdOc3gSjk{+N!`4nu
zbXPBSp9u4iUrvWcgiIpLwLc?rfpJZ+*wMh3wUX(#y;4S6R<ckyN{36Dv^&m*4JRiU
z_u~CkGdWrg%W1?~?Ej|m>+#%2qLzCacC@|e4{z4`cPe2+yKgbT{xOi*YltBx^lnGx
z;5VAnFT`-R=bdp9S@g10`mM)8m7QaLgK&9UOOhir`j_j7SSVVbZ$qU1paIaVD!3z~
zasIl21YTBj7YB-+HtiRQGwUd<A=MbmtS@epWRa%|oG@OsL|ks^$)jql($l+R@qw^D
z_Eb){uGYpw)<}nzTI*ILyMG_{iByTIVOYuwEpnl0FY|eDdDw=mL6aCLb`Y2@CV)Zd
zdTZ@0sx_<p48Ug($CL+G9qo4tdlXveuU4gAGO(Q1f+U#=25R|<tw0}XB%_w(opCTJ
z-!Q>t337jW3A7k*T)Zo>Vfb}V5yXbJ?&VVPAjRy2XG;|(-ae2Kl@o=&+jaE&;CEYk
zapHX~J2Hfel6DnsL--)R$O@j?4GwW^o}E|pAyW-oGT)*2J3>3*HK}gWSe49Ikb)HZ
zHDZCDJ0*Ds8GiH2P0{_DTks><KLY&}t7^L+xv>ZDQ%=#5J`pBK>}l;>%lb0n_|a7;
z=7+=y3UeUerW{8TD?{u}%uzYrtU{J3jIHIhq#z>GbrE2bT5U{kITY7lN5qj+;LA*_
z%nztW$uTCpYhdEg2(&6>OTWoADgv??tk%*0Me-Xj<*4`a7ET6CsI(gvX-2ah&D0iR
zemeA88z_WP`sz;GZDIOhjxp@942-9#W>iU+BpePzcI_5nN;7C!v{QOIw3F+;?2o<X
zl-lXJ(XiJ{r>qD^rB^yH4f(n8HUaU}`a25ea{tas|LTd;W{|;r$Z3217?)D02d3XY
z$&oeLjg<$x=Eyr0SI7>G3}5+99@x<_9N+2Cge62harPq;0|R9e$gl@uG972s$gpSz
z4%XBXf}TvtA#R5chr=Gxw>9o03p{5wg7kw*+CmO*DVZ2owf1j81p47>;tclPSkp6N
zRlWkB$fCqraXn=A->;1n@U#3{&Tj})U84O|Ki~hnj*CK+*B*2FimbllWJvl0lcP+J
z=E(m+Ih(9>q+|5C4FuEjY<^>upQDRMLNdop6fL&Z)4-Y2#idU-5MNAN7>J!Ugh3jV
zP>F$}fQrjfRr<5(hk^p#Xcae^cFhBWd602z&r~SZ#-$cF%0BXHnJKb)6n3n%#4<1!
zdGt=S%R<K8@eMSjU|ioN_kh#Ygv$F3C2`)9A`*j<9_DUHy{t`0P1vBg%$6PPR*tEa
z1J-NyxJl^|PB$bmV<Gr#xYqtXN)}&WJ%;k0645y}EuN{S>KI<`@7eUf*?$(z^lRDx
z4J2%)X(*;|uoh!~^)an=<RD!qRhToHl9|{Z?4*=SYyB%)C%lP@r%WOm*{J%Gl*VQR
z;i9`5Xq%o##S_Y$3ZMokpx9qiRTAl|Df<lA^4>sU8L9yqa8lrTw@)-dnIjOtl@oi=
zmnKFNqR^|3R}wD<c^@ljpCd6rNj*RlMcJJ#+RLfIH5%cIh{LNnkF;8v2oTQ{pE~gD
zf~ezjNE1XM>;+AG-Yx*e)hm7C%UW-9A#lyAZ+hF^385qz(61lYx)X3v(wRVIzW^F1
zPH77+kGE)WSdU+$?c%C9GO0U*R;w<m)iE3+VsS9N<+KRj<mN8f^{)?0*u;hF4|<vb
zy%cl|ttO)|ZTbvB;T!3P1~~eMHlT2|V9c2ykG!>$9iVsg%bMp06*sjz;u}U}gYMBT
z5xl+s=%MeTUQygs_OU_HL43FIDkkR3<0b{Y{*z-VkNDu59<<}65%p7vNq^!kh+K$d
zt5gw50D87K!#kf#xF-ZFKV+i+eLJ>*fj3^KHj)aD64S3B|M{Ul=7H+L3-y;;5FU-B
zf9y6Av08UFK+2!LY5P)6A1InzQiS}olpCgjBKjFP#g;x*Sty-YntHn1@8w(Vln0Hq
zg?lh@oxYurNWUYNl=cmt4TQIA?r^D!bI3}iM61(`&d;x>1@m(9IIrEMKAm>Mm32<T
z?E?CMj1sE)|69Yh{9WZ>Zx_WQcpF#1t`!TTI~@s{L=^jgY#l-6EZuN0N6sd-umNw*
zf0PEL#PoaA!os{cafY?_i%~IO@JT00{t5-v+LvN2hp1?UtN3#qvjcLIPg_3CGweFB
z@^`DDA{ke@zqJtRhuslI4BJ0eNp(O4*4lrFjM>B2KU(k&*<$0x;**{qU~=nNgy%d#
zSw})IaB$?;0dT*%_3ja3WQpiwX?KTOPG=Cw4lDO-+a<dN*%TOP!Xp`5(D7W!MC-q4
z45UeU({D)?L@~Apj5IJ3btCc|?6Z0d@BX0`e(EC+QvZ=LKOZR-wn(VYpUOEaTk1Rz
z-!FCGX@Cwjg+y8zt=rOBs=MtHUQ)9d6cGL8`DtYd$*z%sSXXB|B;-U9lXN3@BClEj
zVl1yg{KElYUX-vjtUS_395J~KQ+SRvh*%zBd_eCPVP(!nQJM+)*|m2q?-^F2KP#9w
zN6zL9`~=l*_H38C;C@no2Pa4aNgeOZh*|PK68G-tYL%+v$;qJ~zA03jWw1b;)vlQ1
zKr>84;Wm3FOH&vX^X=Tdcp5fofdr)@t{g(A+TP)Azmv1CLtz}J3e_*jljWQhat>*L
zn~`%#SR}URdLp@@6jCj`JF$@;ya%#KTSZ2w_tWw>_QH9C2=!Ii4B~HNf4Ol!>)~_q
z8g@p`;9rZ?YdtU{!CrT56kG@AxbJ=C{Y-$V#RND<hKNhd&baq5{8_&VDdG8ZI>zd}
zR+x<%xSo58z!6qh0z%xVN5sdMa;M>YS0MzLB}mB?JMC^9#dTN)Y9KUvpX(`t_um;L
zSNq?-T#YVU2{A96jdS#NcUS>&R#k^HB7_{mbuH$SG#*xt86&|wxy8lAv;hdbBCpD+
z>_GDDx-;qe3&^#L;mWG6yi*jwL$#rP5_=Fe{;ZI<TQ7`o=nLAOpOWot!u561Lc9zG
z-L={Ys8r!&{u2Kca)yXIybQOh`_3Icrlzn>Pbf}yj(o<!(A{nv`RJA(*Eco!0;$;2
zm~ec3rX3_`+cjHDLC2(xGQdg4h|Oin_f*W397%tz>9bNG9Px!Z-sYbsCrSWSZlARN
zk7_Nm_v9$?h9|$>ems=2;zWRwUhUKQbV|xL(+dO=rS>Zv(9rI5dcyU}_B!uzQ1Xri
zEhF<*R+~|zg7E@X_R(w@N#gYrH0OU#TYwdl5in#vkdsy*S5F8?wlt$~uO944*Cf;z
zWmy7U30*cwMhU6P!xpKU3<r)BuPUDhmojZ1UZw?f?1dGLNu1(oSgZ7}YtjfKP%Y$u
z&Z5tmG#AQshd0D-L`p1&&s;o3mf@b&JWfN~_MI;i+l+>bdWe?iXF*3tP>QPhrpS^w
zt7T6~aE{2eC@vamzxVN#{}!w;*MnH~y+-T5K&coue5i(wN*mFaEzDrhZ||jzYsB%K
z7V}C!`5q~DAa!Z~qTCuvfa<z#L!_DSLs|%r;2O)KNC*||+*3|cYwr}1woj%B7M$%c
zT&qb!GMQVi!JI0au_UHdMoEiUHQD-lEfbV|3k(shJfN9{0*I%Doly@mIE88U@b}20
zYLWU*fRXq%ea8v`m6mvCt7Db*S?cwQ<;9Nmme@M8_oE=J4J`4%wRMUnZs+h8O6^A^
z&v*a@p;Gqe#TSn|>Fw~%6;&J%Xx~Uj%L<&_>z=O*ha?&#{QaHN!`53Qydc)~uHmty
zz<cB&Ur(SxLJIH0Y7XS)4;u4a(sdO(@R3wEGsU3sYCh`>#_R7IdAPdMJqSA>m_*Jc
zlix{A#{QsdK!RNUF(ehbK4XecH_eH}-{JX0LwcJO_QrDHU5)qOZkpTb;*8`zvKGOo
zv}jWKf=RLA_hbqbogb_vofwYK$d(fxE}ii(0Mvv8R?3|?@ox3tdG5_ZQ?{X$u^|t6
zjKiNg!*jADQb}+R(Wj71b5iWFz-*SE_XN4Htc*wWlgD~630*oKaG3kCjz)A_|IB$j
zEg7qL$yLumE9HLNcNEq<(~rvqu=+n`RUq`t$fi3r>UCO==$Kkv>{Eb%RskL98X40w
zG$zKK0b^#QA1kxd$XfFwXmsvdC?p#tFNl6O%5^IBFQuaFb#mH?b&z-E1`HGvj3-o-
zwYt2#{@)^HYx*kRYUTrYe!J>g9Y;}8%u7V(WfbXbHnY)By@wV@VL4Qr@aW+u$IQa(
zB9tQ{i_j00`-MOFsYVXhKcN+rc87eXtG^zFV(jASk;}TTMqVcI7Q#ng9d+k%{DFH9
zT>mGg=PpI~JLy#3Jq?P@OsS}>adcWy;|`B)nGOz1#r2$)F5KT@5{QsP430ajjPe&K
z6cNJ<eRA+OGBe#5%e0or)Acb=`n8-)1wWejy-`wH50EDzkjiXVV^>JCoLsxDSM^}(
zxU^<Xq`|&VdvG49jErV8I`GuWum~^dZ6j7FvEkcEl8&SQ%=kd~2x<xOtfujEPvQ~V
zmlNtYvO|@G#ouEk=I=LRb=M5l!#Cdf0op0RA4#IMM`y*>41>@9Fu>%phLQv?e=H_(
zLs$QPql>4fr(Z=e#4--b>+TxFaFO;f_QBR+-+Zrsi-r64kg79BF-BFrVT!~b6hRx%
zQBwcT`|^XINVrfnk3qk&w&V$%?cMKjh%4LD%cAI_wDB(dQF^BhC-O+{*X5SpzekY0
z@8G%e?xfkwn$gaS>7^MTz03wn>M1#mD;#w3oIhHoAwILrgK|du3foTZcKB<-$~u&Y
z^pBZ#_)Sex%+Vz=R1*(6RJT?{jE{c!`M@Rgad|drF_G#z`ZuPFd1k(u9K(w)vvX=X
z%{Va4Px}kHNvJ_R|52o3$7^8A8`G;tML<&n3=d8hCcaeOtRBN9nCMXMC_J7H{JG=I
z7g~sWnfD61&XDq2{Axr<nkOJF=*Pj&=);^c{vbw(^hTnYOa1q>RzZHcFys)yOR7u2
zHy+G*fU7F#XYx(^taPiu?mLi~3Xs*^{`=ZG?~Y(X?j3leOHbH8EyRl6hR9KH0`ZTx
zl$_<wTJII6pEwQY#ZY-<g;K5@xCzn4S#~lz>keNv+z$}MH(akgj^O_S%Cp+-#y{)o
zWGlkli=D=)ZMF5Cc8dNyoHp~C$F&tHpUQ{{>&x#y%AF-Fjt`FH;XQrzVwiM)66R)@
zqZ+NC)F50cz2cg=Cw3yfp#-Y@9FMM5z(4Nql{McsfNAm1DoEZT2RX9VsQPLSFh7>K
z`6247eWt=18DVh~Wh0p>(JK!7l$V^n@cEnVA?PFV>k(pn0-4Bs%1cW?LDJ54khvU6
zCOt%m{{8v9Y3XC~qhmjT-BQ`_hb{vv9FYgk=eY$U2*;Q-k3p`I--dovw+nZpAZAPQ
z4Ev9hGoKKrl>BCTW&T6ZdWmUc=zNxM#N_<&>$!|`EGa9id4wRfqBh(I)_n^dD=s!9
zLvwhqj>Kp$&0LVTRuvN+h?|8C*5{d-Uj*;6P(&MRkm5h+O?cs1Ws6GIrY#JKmpq7Z
zwJw649L#;;UKl?*K2tQ>$+#rzr*e9gLl(HF?YWUFE(UHq;#&=R@#9C-N&fp4aSA*4
zkw(fOovV#Bx~gg{Bl6Dzx5Q}rr0?SWdxLL+h1=3aL06G6KHiAV-oh*2JjXR%GhMs~
zD`wOhVnEHdkaC8MSL~C%pE)(X>GN07ZNGn{4F7k0w2}Fe_mJ;(-+G>hXbr7D;H)LU
zoDt`H)+g{O@m6@~k<Y(6Tstpt`yTYp>Z72E1lj08EHQCg968H&!siM<H6BVmo*qG-
zn8xd2WsaHz#zKvsWPf}B*ftjMa<Ln)<kW9u%9N!O5yLCarjx54zz*~Rvu$5kK^z~=
z>gXBBC^jGD9sMy&e8(-V`%Rb0Aks}9D%_t|UeGk`@%gpfdgR&?uQg0l{M+BpkE3pQ
z<gJY|*!V<Xcm~_yf6p$tQ#$}lIo4Y~e|ZbEIb}Gb_1{YfHXg#+|1b95Gb*ZW+ZI+t
zC<G*_6glUdB<Gw#B#LAOMU*T-K%hvHqhthBa!^Eqpd<kW2@)kq34)TNpdfx@sr%e}
z?swX2?frW{Zfkqngu-IgTyu>%MjyTRF(^vVtjX^p*?z}m+m}&Nbh%;XVc#|WFNFj;
z+>5z)MX?R$Z*sB*&gLlL`tcoSyS-LxG|7VpkE^LpAs+0y0d8d1p#?e(g+KKka5RCK
z*y<X7ArT9aTU(n@6S=8yg0|$NIib<wq>gr(h$)-L?q{#03pEL)Bu5(h-I4y8BLPg0
zF8)%(zVugQPQ`Mn->phn%$cVQaq)?+a9d=UrqXowo?KLJ3U#*YX)+9mx$b<9d1y0G
z+AW-bL`9f;mTA9*Y7?J~;&`&~=7y#cE^mpb>LaNRQR?+8VxuBTCz<KCM=JR$uHjl~
zd#>)C<}PD$>I2aqe>GF3@sC3f-uU<2cISi_JVMenYpb9zA6=~Qj2-LA^w8cfxo;A1
z0KB*O<ZtA`l?cb^te@2L5hNyMR`&_OHgbY-RDtAXL{|t0H>|&qQ6=T8a$UegX%*;8
zr8&lFzWU_BPu)r1H6}HB`(|>Gpc*z*p*?g=6wPL(*}<jSCUh-Mg1^k|My5#LSS6pE
zp0kwORlfExb*{x{(`_+LJ3f3jzsh`V7(HtCm0o%8w?Lm@VG$w7eI@64kz<@~uGz=K
zcQ5bIzu!EZ&QN;=AtBuIn-F>6(+^J3DA8!x_g52#eJvCL9pouq%0|$^a0N0<p4Ev<
zC&-ZQA+URkzV{%M(~PAUW&=I=n<QCt-@g~{kWAmy8fJoI+6wuBZ4R^l^Uo)3D4$M5
z3f$!U(PT;tDLy?$?jm7o_w?Nnc$4E~PGrGn$iu6RDxOSpFPtpXL6=|1;<*1@>!vLj
z#^;I`Pdu40$xt8<1SY9i|1{WQ7o8+`9lOqTN8o3WiF1$(W$=v3U4O$<W0hVX?WS0p
zfROvMeFNJr8!<Iz9W<Tg>co4J13uU+Me$nnUDUHQc(48bEG-3O)Y@WIv6peuU=q6a
z!qblyUDscr5G+p(XWsVdQz$^YUI?X%sS)wI<AZsoIsbDH`P7YM)v}jxFe%&lCG9al
zXCZgKX>^eFLLRia+ra8(<$LY*)v8tIfh<(QiB;fmZ9vmgsbZw~P|~40VFR&KvEjoO
zofC3d*F%DJ^7Bf_SHWXmb02BBPlXoIGeuZyaD5E`Urzt1d2hYpOZ5b%5x`}%P@64`
zmpcVCQe7V)gvyq9))GF#Y)7h6=m-X5@~&4eji+JqkeSgQoSQ*Zn)S}LkeyC_5V-{t
z!c8isYcC@l=Qa5|Br-5Z;_|UlF^LW)dLIHNGuYayI+HJwVT-IitZ)X(?3uV@zpy~j
zYsEs+zw@hmnPAc`4?G;@En(D*Rql&ifw8a?g{NUbBNak%Ng2)q_jd!VP#V<rE82~m
zPO>e0-ZY9f*<ID#_k~`wG<UVJ{HLHxfGRIl0}C=7^t^I0&>!8UEeKF~Fh=^IaTb`Z
z<aj_}Koq!*ov#C;*Mq@r<|QVtG4fgwu+_0eFEo~=5Co7_q3Ihk93`LN4XVVEG2q7z
z0W>hR)7{^4H9z!|?Q&xHO<#S7i^`CgixI)Wsy+HV=NwD5o&w{^#CFy{%9Hv%gt|7c
ziP4eV%G8Htc#=KawXGVM@)&T>S6o>Rd;HUn__&hZI(X^2f~6|1yX^V*^`<${K#zqh
zR(<dYdFann>G8w8^TdG<AM1a>_+yyoa6_BKN)Djq4EeD+wu_#6ORtbEHO#E=k|<D&
z<4(cM0oNic*uPmtk{(vTHH;(Sj^1tz<*$@#8^0GaF5Y)p{Y=&|X08wtrc1GrBiDw3
zJFFYDWRB~X4flsW5625qK*oODef?L70&lMIM#_FooEfS?e$6wH<7<z;4su3#%BLpG
za5nr}CD~Jx3DBxa!ew|O@TlO`WU#`Iqp=VDeK4|O=5)GfX9z?wqlBmnf7J(nvdjPa
z^MN7kQEIlwKVkd**LVN*_Xmuy3}+k)hJpS1n||;={&Yf0GPS5{LI3$>P87g`Co?#`
zjQ-P`BfMd_>4tU_*8lnCQ%Vqj6xp?=ft&R|elhZ8uo?k0N>b9lQU2$b|DXBj4;)d4
zpgh9@jj767rqk>;M<C3IK1mOAPBP_}<P4<mAy1hATZ?RpKX-rZjDExY$Ntv^UKiTJ
zg9>*&QlSTFNmE(mm_bWgoD><4(FVlu8K$FQ>BrEG;?NO(!)japq|@F0(ahX4U{php
zGr>XRd<fC5U0?p^@PMpWj8G@)Hyhq2ga9X~?C+c3MTx4xI@>ortApQ^MJDm<aJwC7
zJ5&fCwJ|7v;r>_2#cUBvpaI)LdoLmX{Z(hRc2F`D>--5<Fipw_ro>Eb7j0P;zLZ8t
z>@Yo^Z3ViHwlHpw-V48(1!1Kk{~4shhAH)VW(Xf-njjml9h;zhngAioPPdWt0-}E8
z`ueOX1|Sc~7-*5M#u?5bw>NkLgc)Jr{m(43KO2gLz+aKojZjU$joZgp18E}y(ax;R
zJp*pqpEYaw1#32_^?Rd0mM;I(lQIzXJrfv^?O?l0uf`Ct!>x|n|5fDY&QOHx-js-r
z*vLh(y(&Lp7i7ByTPYno{&PNA|1^-m+;!s_F!=z53IV&9=XgBBTG0vo(<vK6+M^-i
zkQ5<|j%Fl97r1p9NS_aEaFaHG-trL%HcZG|_w8@P)OZa6kF(vi_P#OgX#Zx0PUIZ3
z>jQ^z+CYd9rSvJRf$Ss$@-R@x6_{wa?vFM1ukDQ#ZXzN0F3K|u7Orw&=n4F?=3F@9
zUV=;e?|=p))r$GC9L9NxUxt?NN8)k9Dsvpi&G9Q`eai4@0NSk-cqH_^*do1`6Mq<~
z3b;>UZQd{!?jcWo3yKxH_*bvAhk56#$)6d+NyK&Z_)$`qrq7o6uKs<LgB$xi$qe~k
zBc^W7M&kZcJsPtfBq(_-5}XMZ2{?p)bsKPF=^(t%6Xbl}9QrOfmiYIUGh)@9e+QIO
z((am#ZJ<hp>C1iZ=y|ujEJ18|{plNaLKxUpVoH(BX?Vqbc^q?G%jeWcWcN0%qY6q%
z|8ctAJwYHD9yq&Ip12x&nN(2doB2?6xb&kJZ`Ot3ob#8w<yLw8N!+_Banh$)9TZP?
z5V)A*lXH5k-@UCb4(v|Z6OAwbJV&>X45;p1aVKoZfr~i46oG<qzQ1|-u;B<uWOoeq
z;Q;?BOC$vxgPC)eJqN3S*BGwJo}uC!F7<@jhNz#6MGfPa7jXXY;Nvgv6B&C#T4aom
zfy7<iL9Q*aOu(M5Ffu9CnxSl?C3hcrM$P46gZ0rXhV1r!177ar*fub_scdwUQS<1f
zqln#@YP^py*qpiP$5L6}xBJ$Co3(__rDRDhfobMf3FhLYZ^jFyZ5Zu6yRQB>2o@um
zUZdbO4%nsMYzg96e3n&^f02N7-Q)WzR##Gu+oYDuDFh?0q5BcyH5_r$GE}<=^_cCm
z(9-Y{$6eCkY9!GLgUJhA<=`7&3mEn7Q1sk)$R~(kdYCR^C))-E<PuwH$vnGIL15=f
zRB$}y2>c!fpknj(=heak7m;dW@uvAsF%FQJZims_tY>Nhzwh8(WcJvLeU{PPns5={
zxBMVna|KCGLmhSYee<hSlo4b}&}SKSg-0X;9(YM4QUv~L39aUkfJ7zeW^K{mhj>^S
z0+VdYvv>$hljPmez+dc`gUsSIG8+hSF0#Ba4ivR1R~FXr&Z-|6S7eMF7SuDS9{v|M
zB$kQ`+kmCW5uYMuA{4VJ=nKzTJHq0sQg}9I<sSwLsUcMYi5j{-bPx0hmC#b2PUFCA
zXoC!8&Q3KmT8e+{+Y82m)^*7HJV_`@7qDJR#xm5~!KA?tzSq6_`7UD}D6h7_;o2Q|
z_#EF1{}?(DG!eN~D=Rhqr;+(~5-{Ao4K8tk$tpWU?49N82Q@3Yj@;1GQk=X%fTx$U
z=(>{GNEq-~{_W<_#d4{PKl@agqWYKRzn~y9Nvu=8(O}^BxP@J^cj!0cHMO_024=_P
zKBpKJgf*&c3rE3p2sKfX2ZsDNn1nZh{~ih6z|rTH&XP#X-gT0cRr{cSD@N;>x5-VZ
zHF(tJ7J=etEy~2V73e!BOL#IGv<{{#z-<vttnM0B_x*TpW1=H`A3`g{VoM>5(|JB%
zLZr6+taRLK+awAy&@0)HFBcL}2GZsc)K1{edV=UhjqrZ-xm@(SGLEZH5Wr>5t&5KG
zoj)Hx$U2V9ZE_o5xaqE$v%9NP954LLa4&)((8qzU_`gmey}Q;!!+(+xfWK7tg{mU<
zL9O;0X2N$Ms;JBeOtf5TjPOkR-RFXAl!3e-^&x-77f~y&(<WeCnf9J^aRcc>)B@XC
zWH~4Uo=Vtf(*xOmFzH%X?WhG$CWpz_uRFK#j$`n*m7*Ym$E4{<SXfa}j&l=aWQXio
z`+iOZ#y3qPsg2c*LHA2nPNAvpyS%8w-iUe{;L_#`dpI^yI(1aUwZ2~lv>E(A5RuNH
zQhPzzrz>k8xgQ`z&ay8{O@AWass`*{>AuEJ1b+|@ZUSn#br-FN4rHPPbD(vjjUc8m
z@ghRM^})qfU<@$%F=XjiHjHJ>X<Xps6_|rNwU#ZE#MiGsGAJSdhNY2&@*=&0%1!X<
z>jv&8zg!#w+Rk;Jqo(95?<W4qtAA#cuV9KyotCVk)WQ<lFNI;UCObZ5b6YMogg6^u
z(LlEd<3{#u+U-eO`IhC51>sT>GR2yF?OBZIR~c=3b_$3lCW#}ye{#VI<f!(}@r}I%
zCmKJFFR=YX%2(W<O>qozuXeypsD{Um1hyK;L}Zm1v`4SdoEnYG86?`>6rWuR$xhCh
zz2WOGla0C+0raNP*J0PE+pC|?_pdQI5ImC=Rn6rb0e2jlPbX$reX5_x5p1_jvT7&H
zaz4-Te6LBDFOLymYAU%xG5gZCq15siy<@i9C*V{sXV=8<Gc3X!zClR}?jJDP&Fe)=
zr)AGv(FE_ctiN&N-xHed(2daB9RT(&HUtRHz$69PeyouTH?$8P&6EZ<sGrOi1$t|c
zZei7gr2qDLB3+3~?w9sr6!sCq#bueY^EW&0k)<s<!n~V%X{2My*R3EGx+$Jgwks4r
z*`4J;5YCcK7HF2W>+_I_G_fdr?%R}y{+`vFx&5v898CoqGuW5GPRIm@5w;_G-}%I~
z*J?iK-0<@4wHyDc50MPwG*WW3^E$`C_Q_#Sz{lcOg#&T8pIDqof=F`9`=T>?@Nh!j
zrsbG;`GLh277l)mHxVKaY@NupJR#BYizLJo=)6_o@HA2tWLCL^au~*U(25(>YI0=f
z>Z`vq+58hS$4=t&<1NivC@!V(@*;EdJqg&8`)(kPdgvcE>|yz0Z5wnk>SZg7&eTt0
z{FP*559oa>HOg<C9c~Peh4}24hQ#CFNy)H2?C{^HTciRFIjpmCcRLIq{_<+VzR3k5
zeD3)32~WmDMvH_hy!d-j9Jvw}B{!(vQVk>-;9c@DR7H^$r)`8fjzB2*VmwM3Sghv-
zL-jOmdS<g{*k6%YU8sJnb-7vd2_jmnS_(76UG@u$@tpSde;h(-yf{0Bs)8{%Ej<2i
zv`mku=`L_!i>6IVJ3;G=Ea!EoY%Fxm79oA)6b+U^$<N1p0R5^gE`fduGBceYi{I)5
z>*QTX(i-XBjR`7=)jEd77bulK+Klguo?=H_fw@wc4(~uMz9|s&J(`0~t)qVBY0q_f
z@#`cL%XH*+{lY!G^x=`5lAwTXJeDx#2nwJjvmr$KHqxZvmGg{py~6?-+{p61$fwej
z^98_VvhwJzcW^*&0(S7lqz^=Vn+3xIf8w4W2A(<k*2-w82+o8l&|?^IqmF(a_>Z4#
z>xT-$dg!ilH(=V`>xy4jKq^?z`Q=4z9!dgThnzL{LBzQThhH&Quh;UdyN{6j4+EFe
zw#O4IMA}H4Gw0H!Yv1Cc$X@&N5y;odjK4S63d_s5n#l$8i?h5w<7su>-UacdDZ!G@
z-$VnQ0augg`y~~*-7zYRByA*CQgE<%K*n(syj+Vf?SBB3>E&)laZ=+_jkVJA+{ZgY
z!TVOmZvKMRozjxW_flS}$&`&soECC{BHJ^bL8gpTGA{CAn(i#LpQ9ojFkLVaz|)F~
z&wu^bq&s90ufr&<#>Yh4`k~RAqH*wRBW7z1)UIoL#F)GfSF<XvezB7}uKDZ@VW(s8
z+q;ONY~3k%jNa>fG5=dcW+F+0hr|*u?1JVL1${z!v_d3mUYTj|I!hY~TaDlN-p_x$
z0JuKu$ekgOAED&L>4T(h;q&X{4UeUgqEWWPJ0P;%gek<%5gR8wR9I_o;J`<%tJ1_U
zXf?iqK|UX(@88H@njqKa9b^JJ+Q{_TBCW>c19^SvRZ+*V4G{o2#`q2`;bf)D_dy|P
zeNdyZP4Y(a!J|i~k%ABIa15+(h;Di#>{zHDwwMojpLR1}J{JgT9U9+52i`o^!-QjX
zK)JNyBo7p3yx$aCbUK>`Kz}l5oZvj|%;U)Myg=*nY*1zxsBFY$6TkJd$mdRnceavg
zpw5KMX^a34(`(^fz<XZVO~5cT=>tE9bcqeik>xSL9G|CBP}}cQ@gx`0c8fl#-v}98
z+&rSsZR$_>!2uUnh`Ysjm@wzq@Hx$Sd@kZp^zEk6$1Tk^{TlZ}&fzkty~_Fh)`M8#
zIrx5#&?TcdxY5wvQ_r`mHFt)-{O<z2TLd8q<<jjUB~ll`&a;gZci+$Vm6TGl&6L$@
z88^cK&awTYgLPBsa+f-$L&niwneE;+roj*HYrXxT%v|>veJ;3+mT2oo)V29mmrIs*
z0q5{`evg$3Pb<4j8rTU_0Ty*6_xdjGW<qWGqS-Ig{c!#|Rfz}^5@1~r<*=1t6m<x!
z%Q9+SabIyBnH@ZJmn9`~$#NHtXkVB)QRJ`gI)%)0Rb;j=VN7AF=7U_V&l$)sanKI8
z)K#Wx{Q@G77?2q>(2PY=A;A5+hXYj8PFZyfzVl?1A5e<Sw%@^=!C-70ftaAtK-ehJ
zm3agsS(urFEw12PTr5zc%Pz`xf15dSY{(oTNJ_xbdEf>!4$Gnorr*G$m`Nb}r}&Fh
z@IU~7)9N|EuX=F*Poy0M?ZT{aLUCUhY6rkFW45@2O{h6Lj-SA*PY8CCb?7D~Ag3B8
z7kK-WLJd>w3MKmJnLt4I?(ah_{TdRYp0GSbB8vDPY(fa`u1}f?ZvhJnI)pmps5=7X
zihm{l@kJ<!;^BOcV8C}Ct+zNIoOQF23rf^%IwkYwreeJC2-5GtAS`(a5(W9I)(=l!
zf^+OxsU8TEkvKex9BaEf&KvMmEwbdTUyw;+vPA^8G*B|w9zu~_>P@GAM#jv<;u@QH
z(!sY?8Uc_3YDC?xH6*ZcVjiUp*cl{a!oe=W@>PCiQsvqN`qmJSuA%HT5dXy?dc8WD
zw}o+>AJ>sG$_;e3drih0ohORv<CEZIW7QLRCn~270k<C0OFeW5IOc0mS9SU(Qj}Pc
zl)%I~=1;qwQ^?TyO;DSbx;<LBnYJ-xnJe{AMgdR)^<O9fo`hNBEs$WSY;`dnA`>Dv
zL2g|6e))^9ez&;@jOzVS^^LXTQPPFmG_J<d8HKUvOxKqHQRE1zZGoT>{Si2;rOhh)
ziXDt@{B8X(FSckO(&dAOQu^@)8Zv$osV|I_BAeCJQl7;kWo$<XN|H;1>aPP*f}+yZ
zSg#$(C0;|^d}go9g3EMj>^gAT@Anhug|_pmq<GEb=(g$Kem`3K=SYG11dzt1Ux`H^
zmm%aiw!uvta)jd2?L^!uYdA%GMucpq0wCPwemv2hX}$Upl+KMYFH1SrUejQ8#z&tG
z=`E|RvuE8PM`mA@vZ!g$$ldW_U5R|FiSBjf@YLbUI);sg0>G;4$Gk_l<0Y}1y(HLs
z^JPr;-vb`Y)?a#~<7L=ECz2V=NJn0Q(X6j555umQ*iW9HyfGId#8j|n#(Yy{aP7uU
zuvp00w3&VEXWs{>LA;xpe}DRe`eiUr_>CakL+mmVuWAP%J{sOgN{SX$+XMp4BA$Sx
zbS22NOL+x{=>zw%8n0RDeM~p3#6qi<2qqoo0J|Zpo4C%isB@aSRL$7Am(bTyFhMgn
zIz9Lcd*4X0>K<dhiVuh)EN#X?5-Ve^pC*1Pgwz8(GY)__2#{D8U{1}GvWBJc+(DPf
z*o|@-aL=rrKyjNdQygJJk<p~|%#DBn)F)(!2~<!&`9G!7WPukn$UfmThyz}d!ZmLr
z%A|p~#<+oZM-2MSMdb@+x_4#d5sG3qIRE|(j!;QUNRAnT%6BF#4>E5_IRHdz1qx)R
zYVeQS!6CTu@H2*${3!LadD{<;sYC;*t-gd6TBhO`C!-2<>F#vzs`_yHmZe$wU8@>^
zA9@`S>i@bRnt#da7DS+Xi!a}j-?Mnl67YO}bBMc5s_J)or|<%?c(^H2xe2G0DA26h
zNWv{n1u%}0_(~%o<6gLMC;h95e#kJ_Rzo*Co?NdbXo*XxN0NPI9b{?`B`Y@pQIoh`
z2WoHUtMi+hewl5Z&)Frh_4Xt&2e$3uzFt+D966#^F#aw>vKRrhG@{(NbrY-Z8K}!q
zO!H}YdcI@~YKidjgEG{Io}1jB<;GP4`T3L5I|IsdXN+{nOwDY=_Tgl3T$|Lwg>+BT
zbn)Epv&Kmsr%>q21NPq`$RM!=QV9B95B79d@*jKa4E!(mKiLqZ@vKe06>l)k5<0_c
z?kp(8Y=(oi!3wo%y}&2@d@60PL*N$P<$X9{LX3|SLU!O?{Z<`uWwC`vg~TMYA@%ag
z*~M@XB0eIn^y9R>8^r9{VGxLA&5C109O%cpwvJ0qA}%C1s$20|-Aiyq(qA_Or`i}V
zgb>JKF%){JGz|%}tp%IZ!ECW2Te;pzj`TYHYFYv8&%KW%$mr(kdp4~yAF1mv9ITyE
zUb9BDc@iK7Zc`%UI|GAviv_`2;pBF%yI}dXZEwP~{;oRn0{2(z8T&)_d(uDbjlYHX
ziu*A@HPSEWI@@BCtGz0nLEZ2-clS(!xS$s@WD8+=RT1v4et}EAAQou+rxv1sv)TQ+
z7^#_i0Smdw`U>*a<4m7G!|XQLl|+nRCNaW9^l1A_gVb?CU$;&CzDSp}FOuajN9`uB
zeFC$JB~;R_+>gSLw5OH(ZWxtgwvdfr*@*rzL!40novsPS=i1~)wKjcdgG!fycZ%Ok
z!W6-rLX3Z&f}!A71@+6T0bn`^w3|jR!Z$j@#CpC{z@X7N=ZP(|lRf%pk?}pQ0At|y
z(4Fgj!HtNjTie<#UB43al@kKnsK4sl-?l-p1)M;`%nw)Lle*|K-fGj|I{!p?u2oAU
z0+v*m5zkzeD~}^a^ys;3dNq)}aS3AyOh&XrR6@(pedm}d%1ceFSwHkq#g}y4cku~q
z^#NlELX(jmNpT;l)tk6BvHROlhMv@ZOvj6P(q40_(>h=`mGEpDO3I9%ac{?#-cKok
za|?rg6drLl#@9^33E#1)q8!KUw<*oj9)23cQ^+QD$S($h26L6@?&aU%jxioCd*01t
z7{U3bNV~H(l3Q%hO#7sxyB{3$8Wba#bkLa3+M5Y1Z4wT&9CkVW;Wfr;?krU66iuY0
zcV}b9m7OSUy+2J!zI>$4bPSu?miZjh@$w}>RcI}n*q#?(-=hwxJ@EoIJ8IjQNZF?6
z7uNZAKmtIj1a*{$M_J*Yn5)8wz>U1Ze!rVEF$z{;4D9%GD0@j+!F#^5o9)VK?jC7^
z*_&y1MS`L|JnGZ(PCkF+EcHasS@QD}Zh1YRAREMbG~O2V?p#>cWjMPua;FkekV6Kn
zaDdQ$UV2m|LYgzI$k^YePd7crk7NzdFC`B=C4Ob8`p-}HL96G@cia+xcu`gxPJ#Zd
zu@S&2CgS1ZzGZpDLT3byhzGSuvXNHRhqUcnk++ZwUIOe_0SoyP*HlVLJM!jW9s~_z
zjzx-xYk)rO@IqQD+hc7Cg`$%@{F{keePVXMbJpNd<N_^TB1y4AD_jECK5pp&{QEEj
zM|L%O{JTF2xiZqY%hpe5L~><>tFyx!bQ$jAgP%y+_DXUr@(-k9fn2%s_oWqhi~ruU
zJ`)H3zD+kn>QDZj-~m_RE~mJZ3%|A>xrCZSFE7{y-;iJXbHS43Z;<1IE3f!mRs+j-
z2DyX~{RfQ*OW}VP?|&EXe|O&h?PF`fb!aoS!RdmuWnaxVcwX{*<!Sx%GDC2OkZe)m
z)AXy&d2vG?S8$?k`RVyz+3fX`rz={#ffMECceTGmM7GG~tJk*=TXkVCe*u7og}mW(
zC?4@C`*>OJaR49NTY^utL)vuz)#)<zKT(MJe>~_;sT<q>I5;7eLAK-7wpfsnqF_Y)
zRWpY~)i$WI=sN-XY>@+;)d&p+O>OJY?K<6F=@@lpc?oPXf^z1d>qL*B{2NeMNiCNY
z<w})<VSz2k_?KqizC@<hb~I8L9YP~cauL#hi77~+ZTB~@SU`x0p-?h10~3)umsu2S
z2DZ^qc?C368m4@~&3#PYCZ&2Ncpd2h!CY>yAX!N0Fo@}qj`ZrYmyZ#mgJ37xe*fo*
zK2<y5z?5jKar|*DAPt5tVD#TEpwsh>P{nPqyTQkx0-FP&v(N_AK+4|?_W@+IyZF2w
z%;?%cyoL_9^L}|UIUxm|b#t^slX2_Bmac23Lg#(}Vq!Jh8rlwdIo){}X;uw6N(5p|
zXL)<t%NA*#A=M?~OJaG~{?*BIW-Vb+X}6zJS~9vF!SYyQBmknxnFd&m0mN$01?@T_
zU4iBo4*oK1ydaaa?M`6ALb6Q&E!sPX2O}Yy+Rlz=E49!*xbvJsfPi{H&VE~Cfg@71
z;OjF+7?_&PIR&us%Te>a2wL|@kxrwlU`gd=SX0zxu)a%GCKI^y=+V!=N9%+05Ws7x
zt|7{W^Xy<_g>R2HxY&x%Km-e#>@S1Z2-*UU>QTr`<=wOgml}cOLVMFllj^sQmX4ka
z((c*d5VRX;_%Vp{5idXqh))I~($tA4-iB^>V`7ZL4piq62oa!N6W&KYFKfbdkRC+_
zqeV0&U<`K~xObgHa^usoI|G8ySn@^@1K@RA0_xeRXks#yKnfrpGKe7r-9e;-7tR!o
zSrH6L&aPU7{U*m)xe5tbABAFi!)%}zd>;d7@A%jeRNqvH1y%Kc65;p2k*cJ;3h-4E
zp<IgkbFK?8+5BoK{nG<-AQg<ylK9||tlA?)_HLPdqH2K3e>8AX>BT45@?_AvcquV*
z(MVO~J@a~o;v)Vw;+LpG5nu<S=?lYdt5W!Vg67_y)2l6Y3X$v9D*h@Wcjus?N~IeU
z$*!Ox5?C0<AUSy<oSo=3RBJtpIHMH0qJ~Y-@F|G$C@>2>=6tq~{YS^EpR2d}22}E#
z@;&<`E3WC;<6NDBm6&5q)*<J+A@uF*5C`BV%@+M)==(R%ljI@{3w4U9Lf>^2iin@N
zEwms)pVHTv-#j?GcY4hnI-P0##uHVj{&EJnJobqnnb?|ZcKPvDsGp_`f6@XK##^|N
zwWj|v_|U<2DG7ao#CRPi^z&vE;y@Yk#mLWhTPEt<OKc}i$Pkp(#uDuno-)%v#nrZ8
zlTz^=vLPVHCK#Q$jeh=FeW*FuRQ2h-wg4U9Vhka9n)vIm=&gWVKL;H2Lga%xa)NIG
z+*$KMCTNb|7oV}u3JYvVmqhAZFwLs|G$9~wQuRxH767G~OShV9qvgP!L+M$Q+}ZC>
z9`kkZSi}iBy4YIrJEWa*Q?&taGE~h&xxc7p6l5aI@6vv7Lo&3Ao5^%I`?|eq4Kenb
z)_BfZKByEeEUUdypn#@^o=(AM#i+ED`pK@#!-RH%$ULIKtv4!VM-;!EvB37As31Kw
zvG*lP{#@W07y!O&*u;77ZT`s5tGXW{YPcaTKVr?$;cnd1FTvMZI)P(I9=Nej1==3}
z<7ahaH0HkcIm!%gq(9BJ=6A4ox&r)UN|hc{<0mwa+Tz&+Zx7adijxd7YSZuI(3t*n
z3t*lyfqYn&uoptNktPsT-e+>af=5*(U}MexX^5T0&EC{-y41L@P^!iIsBV<~ghnMD
zAEoZsg?mBvoNjVN>pR6=)s>!>Qn`zMeTGkzWjELZR2>vBXo!+?0AG><On!K;fMnX<
zS|;==44rd2J5rDu_igX#0vmBj3Kb6OV5}$V=lf0COQM~EHcc4%O5mQXrH1b-M_QWL
z|GX5(P4Ysy_t_o!HL8oz=4^VkahgAQF1H^NFRLwbJIhRem$OCui=bb3k^WmlD;|xC
zc3e)w<IT#aDii$F9wZIWT5|TLd>rLvXS^)ZqB$hAl12Y$%C{=_d+D%ZoKdkUo3Nk$
zU*JVb9pO@IR<aW-!;*;MW+qN$sJuhQnIjf*XM53G-0By%3Lbj-rt&|}%3n+^T6s<n
z{n$E$nuFgXk|L=fk|eXq6IIByslz;tl!G?_<ZkVV@12v7A7haeV5=`-?%dqD7ALu|
zAv^`hsjHxUKuX}%+-LaWf`QyLhJ3NC^$2;%2Yfcb^K41ui{*rD=^gX9Jl75~b)VfT
z02EFpVNEhA<3cG(L~hDziZ8Oe@o(E(DT^g9y7PLd%9r$tdVU#D_`VH5TC27zC7WAa
z6k#hSw;CY>0MNr*tyY|%rnIOPd&;}~7f=X##oJ?d2^O{Z+|K8-r{in?W@Y+{YS!uY
z7Qdx>c}|InH^Sh7(hRDM`zki6Rb|Ykp|&X`i=edCUFxizZu<|0U;7cJiuTh95|dkO
z#w+sn5n@r<8TSEuWX}eiS?-%wwlB%()CsepG|HZJ!*eF*bYylV-%7<f3&~&X66vq9
zX~>bGl3j20k77&PIKbcITWA=p)qSOtPR!mJt{95<U9v>4({WPf`_slpzu7#$&)BD^
z@u13o$!s?EnKHPUxLHlu1oWHV#jyNhn<6sZi<zZ0(UJi%Skou2@jL_KB=Y1N>C3r*
zHy!odestpf5fG#dMeU_BgV7add=VUK4t&YT-tCupnsxbGM$erG$5$t%iwNM#vdApD
z)Xfs_S;ew7mo8);Nvq8IoQ;ZiMs9;epWIgX>bQ4I!w$*T;~OxWbIn{8XC0p8+JXqB
zOm!Jd_i<0?`>2+^Jy!(4veQU%L2AgJFOTf)uODgS>j-E!M_UL9W-tZV!PkB^{)W^u
za(B*c%-9kEkKShF;R{waJu6P|8ClgJc&FV(#37@WuD*aLz&y&XnZNn=={=haY+^Qk
zI7Sl)@s)01ff-48+0*b2py77{>*ezGO{V#_d>vLDE27`a?aqJPiBI5n=X{b&ur2&V
zutuAm$d}w>ji$E;xG<Si<ei$cuVq8LZs8bO0%#?hP2@dA`ILIQqjpxqhqX)Cy8PBw
zvRglENh5*&np?-><H1g`jr~^Xq=Uyv(&AbM2Y=!GmTufLOJc$#&EyEvED=RopcHaN
zDY&?EBofSK!UsK|(*V9QZ|A@JLW5GRQO&RQ2z3^9?yDby3BJZO(6%2fRFIomx~YnH
zXf8D(oCO<)E4kAs?M42e&T2mh>&*Eq=+_IYb7QUOj^cL4hacg2za;}HPqX#s#qLRG
zCA(s~jyszgAw>2S!?|F?w+|j1tzisk#-)qbUD+|6Z{z)aC^WtCN$6L;-xsVVLyzZ<
z*l5MksL&H#i806JUhsj8gEt%eP;b%G@vGS+BrivZC6DwD@ga2kK40L-zBwdf`q2b9
zPVUEh-^T$~o<q7gYQMmVcXmv%`pxyoG**$vr|9#HzCE&*VL@d$rV)D8^a^Vq?!EeT
z*|@x?QSJ6i{a5dt->A}dkR(afT&<y2A~-vWa#J}ByIgy4R!Jar+$UW;Rh~4~tZ)SS
zp9J|4r1BkCm5j`Pm3z=nwfBm=IVn9#;QNf)<*QY<Bi<mPP+eNf_!qHfziGRGLOes*
z@2jGd)MZd?x8o*tk#u$;K6WDs-13rm<TGS6x0;?TetR;&BES%5d$9z(e~#)Cb)I5J
zOfK=gght>^9pecG#Zc=HT$ke<TR;}Y97580NenCuPAM)yJJxk>rMNK8{Y1u|QM=7!
zQSpEWGNQ&9t-7M0TRV3kQ>yQx@Ce(CZowPUihW}6iSAaC!^Nw<8m(0HQDPqV?#G2m
zG>mEUVI$-O7kiB|%fKggm_p0mlk(l%BBXY4=Sn}fowXA6@x5q4<7&U+Q}HNUyS#yy
zgtV{q`Gr2fhSQ_P=DU<|E7%VONy^87p#8SrXo6#b<Q5Ni^rqyzNR;CxE1DHjcTxvZ
zp*MSBv+1>JRQOY*TBPaNu=>~aHPp^Y_Vu@2t=t9N*6-TF=W<)jbgukLI;e5}vLD?7
zX_bVEVu!8_no6G~_UGoI(n?~NKFj>GKOpj5)J4QUMwqhscUNBT0neKpp@a=lGuPIW
zm-oLSb~=`v8Q}s49LkiT1SAxJPpZb8en6Y2;Pd4Yu0vNkB3UIN6hmu*z%5IP6M)P=
z-v-UUUaXtxE$g@aV{}!UHV-h;1ymygf=)bQop&Q}@Jc?@yKOE)_S8>Q!XuAwho^*|
zQ9^&H#pJ!8YdC+1*xHHTi{?C>JsIJHvj_&+ydJV5CMv=mggrE$J(R^>T>qrY#UsI@
zz^FD%9ymU>g;H(uMR}-*LgQURz$4BlBr@cfirsqC^NzS`9t;V3cCf4dDK)c@bJ9=9
zscurvOX$nS_De0NCv;h~k?b9BTO<8dU&n-#>esY+n>^5rB~9HqrJ;C)<447OUoGdg
ze>$%J&Za<or-FBYSn0=)#>LxeOz>ds*|-wqzBF4Aay*<vPk1Vc$xQc-W*bSD$@V^$
z)@PeB$>2M_{ynW&XL6~~uH#I$KByF$q13qLl0897AwOR;yJZ~~s>EXgXSK6e+en!1
zAf|5p+FMesUvyS_x`x)E&y4mle!@>`t`&Q9!31SFAJ23cx2r!+Vs*ZWL|KIhH{yEs
zGFhNR?sw)o5~V58Asq_!^gWs}PM14tNqc0HPI?kp%=>o@KeB|(KRNVk;yUA6G<D3T
zj#Pq;J>@f-mc`l~xf9L{T+7>&ksIxjUueeQj!s%m@n4BG3jji5Ll+4SP5#~3=b}Xz
zl`UVBxI8t2>;I6;{|6o6L0KD#9nq^%`+dah_S^dv^vhK0a4Jc^4Gl()Qw@erXhm8P
zo0hw!uRe|O9NLg>@yOWfnZ$Biu$5~~Z|-&94;c*WI%%b39?&nND1MXi;>mvUeLASP
zGU#Puf@#@ea(orj6&}n6(3_C;o(#Fr-r$K|<v&c|!M`~#4)tlW8aY}^vNhh7r9<`G
zM-URpJVOQEP<CbAGaSvdd*v6#(AS#zcttX*h#-UaEXx<lF<N1V$cP;CN8)*N?~fL~
zuCpEr0>dMBvq14##qfCE9qGuIsIFdZ@+h+Fq=^JxJdR0ya3CbdY4%%Y;g8wUZ)`EE
zQ}~8=UrS4s=ApCxnz8+6u85qf8BoXHsAdiV4|+^_ra&aiB1JY3nM#{E`H<cI5uyx@
zkI?F*RHFz#{VHKj^U3f9?S_23t9{$RMDrncY`Z^RccIL5U(sd)S{;@H`AH;EIW^RH
zENy4ircm1j4+MN&SUox?gYO9YcssB?S3Twfo|@9Ql1-<`P2-r#V_6%}d8yRfV}1En
ztVLI2$=AYMPxNaMr(r%a(OPJ(Yw)8-=`G2~Ds<{NW-B`z=&=#b9;cfuHmX@}>5|K8
z9fZ}lI+EHZ7%2Qm8|(Dx##iN}qZhzjG^DF|zbHV!k9opG&h#Y-H#5Mz3@NE1ZX`*(
zc)uz5rF=tKz5&nq*pbhBI%xb^DaPKr!LpK@$Yx?}-|ve!DL!=h3%3lQA=JlizMp}w
zUDOaOD?BnLviaVY4G4}ltjUhyq-Q#!f!Np;B#AFRSv=q|ZX3}cgfYBWG6Fkj-UT-6
z8kNGnxfnLW75xLVuk2%%hBYgZpZ-a2%i*KNB<esQ-%CJGfZ<^xZT$gt;ce`(${REi
zt`8zc9Cq_Op4Du4K>FGVj7AAZcau~?c}*f8#BEXktP&yfplsTmEEk#L7Bw^i#pHy2
zI2q;_me-qn_=~uFl8jx6R5(nXDBr;6<mcK}6w{RmI&as+ip-7BbhXdrZ1*kSZ}TTE
zqHJTm-kx{gNMMLOII(VgJqMgqayD|6c~!d$+Zgu<E7eC7RPuQ98!yPe_m8oLu1~(>
zUw+1y|G(Xl6&M~(wpV?NluRx4v_;E8&xQU}cw!8rVx&sxQP>!%tsQG;1$&by5K1m|
zdi~g4UR->a*~WS}tEw=rSG(+(Q}|+rBPHy*ZsbLuM>Y~^&mDAFObi~=SDh`rkSHD2
zDMOmpCWEz29htbi&vaZegvt36^it?l_k&rS?+@^(phl0N)4t-IEqNDdn^S_(eV0UA
zM|yc@;5+g_QX6xM$PnN@-6hn^*;IM6<jgZu6vS$%Cdy(i!fei#NE{rNZH3}z4YLN`
z!}KLo;I%1)t1+Uo3F}giXPY!Xrli0|M7kN_`T|<LldAdPCD^X@59#<nklYXxRkA}k
zc>TVddo}S9ENb{QHsqL3;cbC*Yf17NX=L|jA?VKff3j7Q9bfFExwSJTk9I_>m{*Wt
z9f9$DyB|MBMaK831zf(7hE9$nmV>HFP-UR4kj0nQ=FX>+iZc&khQ#~iem!7D<)YAq
z=o9DiQMb;?7;Zf8zP>WqM=$TnoJVvxYT?XBNoht|t}9{mCQi%kBBp)&grXf3#H(iV
zz3o80U9`I4Q*%X3;<oWV)}ymgVI>PF;xZ1W5<>!?^7Y3awPup922?Q5BCggUdX2Xk
zxmkY9UQ>4Zgq-hrUjbJ5&_0c3VY2oiW*Rl7x%FdZ*l46Ih03FE%7bqq>3}MLd4*`5
zo0}p@B)Xy8d!1XBBdA8e-qd~3UDyv4-2XW!C6gw~hrb`3ANl~?>}gcH0rOOt^=qzH
zInOT!l-wXySRt_3+@P|ZA#IN!x!oRXaEhq;dW97>V%y+2fqsaEWT4;x%=cA{yqoHX
z#cw}1h(#uJH)*BNal9g6K22{t&(oIB)TI<>6J~nWRvz7lE@!2AeCHe3WE*>BLj-T3
zszjZFilHL<3Nnx~nw*tUZN>I9si^rY7@Se*kDo|_vd^{DLE(%f5!du(0Tb!a@CmWN
z8Lb?OZyk<RWMl;sj+G;mU*u^U7+`LMFh5RBZb_V49J7iRx6|g2(z41e1NAM+)}O#v
z-4=`2P+u#&UQq*c5j@)N_g~s71?2VTp@+Gy54SvUvfP>fwN{r|1gRtN#s2i3?&0#@
zVM=GB>9rw7Cdn+k0srCRPcO}HWwui2n4!|NInoyut7U{+ldAD}is*SWJOo2U)74d~
z?})TEF@>aRo<8@9OeY0(E|Z+M)mc!n?bmZlMf3XvSEo)=2(<yt4&7VWb>I^6j`JYP
z_M9FVIoq^p2(F{$JGm`mpEBG^HzU`HIb}IU&WztD4zTXUsk)+%^RGnc*_Ci97~6n<
za)!6JyCGA7to(+*8L;s~Qx8t3-H=Wni|a4)<-{etwDoiL5FBm>yL#l{WVqI*UR^Eh
zXlJF=s<e1$k@`&_ApXcrX1PVHF9MP6rPnVkGget@AM}5`MwHgxHC7Vl@_n4oQ!t&<
zz|1K>?9kTA?qwAln|EKDr)0W#R+TeF=$+}B%5w5c-`NhuvU%tG7TdWa!(es>OZl_+
zT1OvBd=%bfvC~Q{!ijdC7MQnzm*ptZM5hlj#p>9Kx?ktTg!Y#aNWQ#kR#tvZ!s7it
z0wEtbrjJdTuEDLdp})}=xmW)xox?ydKHtXHCc|D>NFsfoR!3v&fOrqDtkc9Th+-ax
z<VWYBn*dLa^7Bn9(N>Q1S-RdKA*8%-#XKkt;A}=a_jVN9Sm^}@eahAT$otF0hVQ?z
z-v3Q|1?Dx$3AUEDhZd;&G<s?M95%Gu8pNSpO6-;$wm}`oFsD==T$tltwf<pQwPA0C
z+qxB2FM(0?0rqdB;$^?eojst>G4SH@nZ>2q<WkJUU;ktzRmqW;JcuGO+oC>BoKIXR
zN%a$5yb$}@j;2#ArlPlQ-6+=U1)==4n?i?J=NY5m+EXI9FjkjFlgOs|$%6SoHBU~o
zXQfs@M9lPC3+HU#ZvEJAS_55ID|yhtV$|wR&e1{u9|!-!wZ*p6ZGz=wN~GRqL)T<Q
z_|0n8-tlyQbmS*>9+LZd=W~B{4RY)Y#w&*t5iC$q_Ac|7)qm7jbe!R>OCan?OPLwN
z4f%eGr`({<sn3h`Fr`1q>TkuYK>2TwMkd)(s-6p8Vnh|`OGOFX{;LljzoYA4>3Y61
z>m^$;V<6C0tATc5j<_Gsq4KaP!*)ed(R;?%Q6K#RNGAU2dd&e`^47OsJ^sbDhbsS`
zwdVtv?S}rrre*)W#N!-~L)Ji<qFOL1tmeAXD;MQq(Z6COu&Cy7=*m~AOrNiN{sU{~
z?3`^t)BPW+#b=yzH+l-5T?l`)7bp5y^t4q?oVIz9%Si4r@$RR?`3~V*Ec&N!(VG=c
zwWnKC<T3A^rWljyU8g6k5_K!Fpj4aa^h%j?WM%dHBP3(Qxy;-mkpD0ACX?&X!Ls1m
zEe>piN-x3+IfT4|Czqx@rb!v>5{*UUgEg#}V%}U}A)j-@d`1_n)cT(500;h|*f;M3
zxO1C`XE0#%F{&?`O@px1QS=uxCf<|#tiyR{^@!N;#2}Mw@su~1*ZH+Aj>Z?DO`TNs
znM{*?Px-0JxQhwtKZ=!PY!!L)_+8;!ib;|9CTs9ZG!o!i{U__(B{L{CrEGfSSqy3C
zI>$h-L6ymnR=ySZ5hx1Yim*uyl<6%zZ2a~yG-8T$?Mkjp?jA;lbDl`k_iFbwPoM?+
zpwZtWkdBYdNXGT>tW@hV@t5asr6#%`nz>KgXueQCb?YtlZM!`9`Xqb*rU>xWJ*)er
zo=?w2c)Z<IHhUOVol6mwEu5N3K$=?@m2HW6kkJ(-qHC*HExyTQR$FJ2x)l;tm~Z_m
zFja5Fdp$MWj}LgZM-3d}`o5w0Y_Z<ny(E%8ycs(h`Bv=A{%aijnRi#P-r$_SF(q@2
z|84)Jqs(QXxn|${*LGY4GQd|EX*%FI4&dQ#@gA}&pG(9m+(YOyCV{d<H0ODFLH307
zVYZvt=Y^TvJg;527{_KEnfm-2Yd7m{HQuG6qWEYf6HQJ=#wQE}#+o}?f!W?S1O-WO
z#qDh0g{n+1h<);P0B@qnYB;PK1l4dZn{8*uSjK%0Mh6G$9#N3HCP!vr>J~D{%Ns2e
zre-B_laGz*HBmTXPM9T<=CDqi?DnaOm8mQ`r(~OLXf~OBq;<Ap%Y8;`7!jbgE!mnF
zAR$hIIgepgdFSmCY2&&uV>_}Tb6h0qZ3WQ_j;rh(B+H5T->csHg)@8)=mWS~5c+*S
z%<BkxYnH0e&xf>>@Yk~D11(?_)rkU<y|W8Z>R`j4v|&WTwfc|da~Ei2MhqOQpn@NG
zj4Tx+`)sO5JF$4ZqAFVsRLp%8=ZtKltXBY<xK2s8!TZ6fFjg&7HBKGVQvaBmwDX)A
z{kr{>Vg5BhwN67PweFR5)!_N}s(!76^|K9kHNFAzxc~9ut{+faFLwtEooyGqBa~1F
zgD2ut+K3qXQ46oP-qr`lK2Qu3C6sg*ihHY#o<xtL$TYt=iO-ZvcXT6|%J*tKz8gIZ
z)h#T<>*Tl$7`jrm#98!Bz^A0BXG{;@WnU4)%}08J+0c{-yfYfu{4)TTdX$x*uLPyG
zE7tS#Gc(P|k;a$42{x|t<=Pas+j3`zd=S0pk)%Sgq6YJK5rU>Mb4NN#(Pd5k%Y87$
z2fP>l5IO#BoYI5)6rhs~byM-7Bq0LZyTE3B;;P6U9$8xpl;oq>x#Zxm)>y{imez)1
zYLzf{`s|`<mB<Tvhq0WBshk+$^iArS!&pTkNBZigFstPl+trm&+gQPf^j7D%&#iJ_
zeU3VLoS{2P^`@JDUHeu5F=dm%jPD|yb##+nHkHU7+mjq^&7{4g+`jJ`)nk<FXC>&(
zV?C-XvAvtC6J|Xam)6sLG92FJ_7a=aWK^TBda-*S@^)kDNDK5WCIwY-AALgak;U=I
zp3f<q5Lvw61rDnde`t7&II#$$5G}9@x#7T7WGg)TaM7tAsm2hccb3kx@V}g47vRu%
z(p<l+0o{g3`1={R(P1c4f|+47l+pYLqmO?YXIqHE3UN*%4-O&Kz;biVKN^|V!{TG^
zh*C=pe0Zt3|CLvg?suU2Z?*C7tlj}GC0SK%9nP0wRt3FuP!tNrpoQ8z+J|Kx5K1$C
zOBkpP`gN4+0esX!xETB^m!b8cfO~`xj(*wazxt<zO9hvQ%y?tManYKEh9nWvF0KbV
zpypC-9wEk^UMIHDejrN^WN-`RtY-c3ra@BU0ZCx_NOkF||5d;#oP+Ar++hcyE9v4~
zbjwfbgNqvAqnA&4n+u718(19_8dPQ06w>{sI;A7);xRMq{3l&J1fEdx#|FcG6~;YW
z&g0qicMNZNX<w;&!P6U!47eJpDcwd6Q^1~C4`X_jtN!@_W$?9U>Vr6-KYc;0PIi{Q
zj?BV~?#SYv=1{$ifYgYF25E}r%6VDAJ^62zD2zBPa8pG7g4qb3{g0Uw@`I-E+E7`M
zX()8ox|!e|B(u+Wfw4Nm)fzxPfZvZ{U-&mB$j@DGllqOy#^@-t^>@qcX#MRzHEO|?
z{MMy7+2eqtuuDCFMr}YgpW*vFRRj{2YQVN5AS03XG!1bikauZn5%YvMrAKD0saG!-
z{Ha9#n<(O0W(yGB(Fn!Z6+Gs{kvibf{lWs%Jh=c%r%bT=z^Qss2;1lM+s1IDQ+>z)
z*Z!+ZDd?Y@C4%s4@Kke@aVJt*U`pdst{gJZ2$^GVp=JAQP>m2FZJ?pU*#_Q~WIoe$
z2rJuQR17})5re<uaRJlBrqz(wnyw)fsLIw*_WGWrW37C+#N_8t^f%ltS)Sil+L$G0
zG;=*ia8nVd)qKiZ(pq+Ra8EuX#;T_`r!MG~?LzKQk^bJ0Y{am%@d)!q5Ul4R17X0Q
z`$ZZ&ikRw(ah4LpQk>R)kQT-e!-87}bL3d8bZ|n@GX_l)+O*5n$-fp`wj|rdK;mtX
zeaZ1;i{yQ=JX=D0tJTk+$D|5NrcJcm!rlDC2Gf{T7(id^C=J-x3XsC1H)ddH<%A+W
zodeWXUM1RIdVg=2%7bIDTqhrVNkG<)6{nW>aMo=XQqxHwT-qW1Y|jRd#Xd4}aX8Bn
z#Vxxbv#x{4M<IovgH>^V;PbBnJw$xV!j@Jan%d_<E<od3#QIc>;^xd1RC@W`V4*tx
zGE8k08{Ghe-RbfR?^@Ufn#+>$<{n2Xyhp&3`Sf8E$hfb6bS5W}T8!>M()2O=>ZM(3
zV9eb!iv4>V?`rN3GXLiFljC03-EWoXbcNRb9^zV1gJ`4MK0XF#{*52sp>HvF2#o3L
zh`B7H-Xde>s7fJEwwyhaO*U%f`}wRt^lU;zaB!nuOYXrYQ0vA*^W6n0C=rDd(}gom
zS`lwa?{du0v8czzsBAI>9lAx5U_76WBIH_9HWTwJXO{;HBXoiK!F24J?)~Fgx2e-m
z_d)lm;d;~`bVRXyE-qUc7i#kcq>WbdD9wUf#+f~34{FG!H$!=UHaYWv5`+~Cl)-&?
z*gbFH)UqGW25f=sZvxaD&ahD%cULFJ5D_E-1~BX3DSiPSz>|FChzS5S=<~@U6;36W
z+{|HCVcbVfx^KYtxdgm;BJA<@eIQdjLA*wjGCu;7sWqtO?s0?~-^21qs`trNO`;&+
zX^+FI)6sKCuT(3eWFoHtYLg=iU8nOm#s&QW`?>-6%M(~1IoxOhHm!@29ht>(2{=7K
zui+q{-_{uiDh?5rTtKly3^NbIgI)v8R~f07Z$LYVz|dF86}0daJ!(0~O@n}?fCR4x
zZ-x!vv9ichr~m<vl8+iGo=3}Ze6$tW4VRBEZm$bYB~>fawMmc$cE>&eUe)ADN=wpw
zVb(4aE?q`9#C>5EQV3D<`y})-<9^>(uEFR<UzF2utMVrZ=+8iB$!SI7@+=WpC~jEW
zu%dffS8z!K!ilKJi8l3=ow8t=vf9qr#*TmQlU^Su;K(M;HHB?-J5s)uH0k08PL3OV
z=uZU5q^nMk<MFxncf>n^!Q1?XCSegTgq@it+kCRzzX^}vg!8&mUg}&?bIEexpSwwy
zdG}YpF^GPUc!@ersVhvf`9`LLW6~~~Pgq>--tdjdc79QZn@v}M$FH?udpW~j&H-uM
zF0^&{h(E53mj3{IHMzkpyCFZ@H?qrT*VE<>4*g7OES>cENno{z^e3j!!F<{b|3e4b
zegRszQ~giZre8Sb8gY5#^La3_yvjV{S8913-h1+SuWZRDX>TAtIpmHN*|^lZ2ZG%3
z?Nkl7$F1@hFFhLW*91V;KP6rNM2`WWSR2zX+<T|qPGt#hXKQ6d_`m)U?%_}Q<K1@O
z>#^RGW4-MM!jxq@w9e~M1AhroQbfn6%8ii^tAKltwD>HxtC{{OH0SxYaqBYt)f{#R
zLn{-oJwilZpgGTrVZcOz8^al@{L_o%3gbv8K`F9jA1ZhM=u<?lY6%-T5g|4H>JC5(
zv%Bf>!xD4ME`-DVbTF~uNV9=bIWMcjd)zMB7@=*!s$SW#rvTtdZPSXWo{V9xbE$84
zRjXIQ{We|w6rNwv?Zb;F^JrSIlH0(ylfDFgOTv3QZ}^*c-jlu{ly#gSX9vrup)}0M
z^FO?%LL}31`lFq=sQSPJ!$8vBjd1pRZzsP68hyi4rPJY0*Z0rj;MtMo^3XcpI#J0z
zE@nWbUjoptti&zQp?@egEY$jT&v$c5<O6g}6MVnuEcjl(cgmfO=6ZE9iU@j#mM9{6
zYRVZ}^pDp~+Gcgt-vl4_wp0Hee@G+i=ZQK?99~%p9<u&@RlOcRFk=M~>}hA*XXDc|
zaGxVm1x%sc_61|d(qga&Ib~J~jGf<@-C4vm=s2q}n1(r7;?yFNp-rCV#pNh#tZatI
zLR5~n-l-Xdm)iQH6_@HZ628DOgb}Q*J8t?kS<IWf&%QI}O=Dvb$qX4v0W*}!jY1=o
z{sqlE5&e)XVro%N8Pkr2Gqt=;k}~-oAOWg*{bMKlBGV^9{70v3HGB$4e|P{W9)Z4C
z8d#Xe;z`(wwk5xb89lpQ`u57_@9j#QTey=S;2AX^6l)l(+o{3irI8Qs{*8Zv{BZQl
zZ4Fo>UkjcNwgBDwa4wu%<wiw%woNV|VNCS5&yHL88N8v)No2)?>6~Q}YHM%4_u1gh
zize(*$AC%)6$qTrAlQc_!6#DUk&P>Hu5npq<Mo9}JN#Q8y(*@OKVJwaf9)}^CIwOy
z7VERiBA-{d#>SAryIBy+l+#&mr@C{evY(%5E_e=uDZwffPkFptC1potfa%<(Y|Ett
zubEucHhus1qs@OdJT)TJQu`~^`ha^~ai3|u7hrvIHQ`qix9X<7vBxi7H$dN>H((+V
z4Hu*5f@s`8^qF};P?U@;4vuw=rI>CtGsA|5<B{$!+bw`!lNHuQVr9<S&&I3yNFumy
z#*b$b6woYg0-Dcy!iFEEZEOV#5d;zVvdZYd>xnGtS9z+M+{>@2(B=}Pp85Dru<6O{
z%4B2R3me!UN#+;mey5t?{DY9q-<p>^hE(+gdGSds{#+U<WCt;Ifj@s-!MLVyqvRvc
zz1;N&(KVjk7#G#suoAZX8xT2&tT6S69{NfcC`9Ez@Yo%tR7}Yvw(>wU-#ds$rg1`A
zi+qJ%)=#uTx9K4_7b47GDF`>~i1SK4MV0z&nr!)D&OmpeC=B&9?Og^lXQC(QK}3<^
ze47bKT<Vsw)XBde!4?cFBi2#r{a7>93fFTL_~4Rc5NkDz^K8`nS3$q-VHi|$5wf4H
zB@3FEUZ(!WmkaOoHE6s@u@M9Hxw8Iv0xDq$<Ez{td_#GhJcO)Gf3Yv81|hOtsd6r~
z9oDUxkIjJKFM>;;a^AbV>>=sBIv@myS9}G78x?{c?Yl3)KJqF5<wg~y8{Q|%FLJVI
zeg`!U-PjXineT0+_7i20L<+ob{G+E=C`23wIYlbm5W!V6a>cCP?4~5Plqh)!(dy<W
zPBXEX?lZmW0wtZ<d2xdpMz6ckq^{$G&TrW66A9t;=K%IuSbjHI;80K=Seb$5wv{Is
zod-$jfyKL_U>m`2Ca%73t4qP$Nn3n|T@d{0#`912tw@>jdkWPi@f<P^G=qQ3>f7Ji
z)8>Dmw`JtAq-aO(!QIAZGazgu$>nVJ+dYMin2tex<lgHEAP>}dpbhO5I|QHl2K-ge
zfsE@Ps!2&+!bDkiffVzr8Ep3WCGPodmV4kar_RqN6Sz^%tWV0Eq~(UgA~Pt#Ox261
zU{-+b%xNXL*sDw;o5Ba4Z%-bR1=9P>9%p;shfV&KXeo>8M9Cv!_t$Ox;O3fz_!}^V
zeEPl(=a$y~n~1Vke<_s@;vhjUDSUIm6uvQ$*MfohxSwgr5EgW)#XghyCSpz!1tUGW
ziCM8l8`MfIOxY*NLN-`1c~DGLF=H3u=<gl3<tC69R9Cscr8f&7D`r47{fV~eqqcrv
zb*EL-f4Zj&b7iboo98ZW8jj3RjS4PPVI6^BSZ=o80Nmgj_`eDa@*wY-0PS~|{%Qr=
zF4VNhY5o!rq>}aTM(fD+r6ZY(kK{&AfEc|zc_6nnt>%8lwZ|D2&`Aqp#umxkmhmEG
z1w_<wB#HVL*v3DNV$alQ)7NHPviSQ<emRDuLU%QXN#X4$;*d8iD}SmL9mWvmch;Ii
zbKiFIV8Q2b1y*fqpDXTmlL+PpQ<5>yVFHZ?z5E=O*YEBfBaGkMxgt*Hzu6UOa5z#z
zFg-R81u514$KIQVQ`x_7qdgMKypYVZsASH(3@wsMhzw=OP^rvTW+^Q5Ol8QJgpwgL
zWGFMWOl8ir5X!WWP^$O3Q_uIie|x|0AN$|EkK;K!adWS=KKEz1uJbz2^SUM;Kqa=*
zvr_)y{0knnnLI@>;(}l$umem_My^Ms#~tZhZ7Q;a@YN996Jy|I?}dzya;bSyi$RU#
zY_=t6!&3pU3wmvQHip+@@50Cnd&7KIMSsLH=bgNLOBwaXjT6Z?{$_o!$MA~n{hfLq
z&pLqT_1L%w$8O_vD835F?Y_enIR<WX<>|`fEQpznoX6r2Q;y@PAuMxQ2a9NET3(5>
z^GB#iQHqA}_<%_fQ*r`xF103I+5Q_63=ckPN?(dU^VbWY<>dJXc8CsJdohOeespQ{
z)S3>xaJ}G?5lD(yuaBMFeQxKF#P9BIH|^9Dzuy?CG3Ro`;m2VXIoj`b;P}4B1x^WO
z@@0G3vUwKe+##C?+0^f#;rt?0i?)Zd-LuJHHrF?Y^X{GviImy_Y2sC2N?LgB+*}@2
z1P$xL8{uFleZtu$e{dp5)vSPdAnXeUO%J;6eRF}avKy=MWN?j<rSsJ!yVpo=8VQ_i
zGrUjyt0$vQcM9&o7A`Hg$Y@P3b(eg7X;(-7e^|@>W=QMa9k=df^Dx?vnA2!!bZ7-6
z*=T8!l_IDOg|zoy$UAjNkl`KVX#?eA%2O8#uv>4FrEH%g)PH;?V%1gy2j6qBa;omT
zLU(%^K+resX2FN3R?g*U83`Rl&`|9Csu-7AxAph)Ss?RUb^xn(N2WJk!si7h(X;Jc
z9$v4I)-=ON$IYTc2Y1RvoyK%_2mBvo!0((IK*K^w)k6k6stC^FBMHQ*jk1sT&92Ve
zJ8VSK<d;YD9A$H!DdKkyB@b`xlRRdP1ED7Pe<6qXu(^uEK~<kG^Mxm4c}jiXfo(&4
z>BlX|hc0euDRGbk0+wFrJW<bB9~LI#Hq#zK9u6dbCG354q_Ks&5mm~Sf%kA0q;Gug
z7acg3?TW<MC(?#f=!@kQ)@!e?Y`_6YX#QOO`{)7E(<wkK?!7*bXuwKQeSxH)uys(;
z+s(ut-$EGSQ!55C4YD(Z7eLg3-MU~M%<}vskGGIUzCL^Ao)F=2WjH`COSWnYCozS%
z6_9l!NB05*TPQVPqq%~+R`Cmafl-4WPy2CD*}mdIkZQx*u>hy%6v>X~G)n@@x{8(m
zSD1>6AZI^fR7dP4Mprb^E8{Eq2<U^Nr{3v=9U2eDzfT7%qNSgsu)mYenC#h&LoOn&
zAI20<X5m|<1Ha>^K-BdMP=RY?gMvf9w8!vaBgtiu&_+h}2+}SfrhH7wf~bQ1(Cazj
z5=5n={6~>?nZuB`SyZiMo?ONAsdYjGi{92p{6Dw#QQWX1_cTyQ$gOpksCGlKf>d>+
z75cYw@m!cm=a=X~l+Mfi+P{~imlGoYFI8o9bAs5-ilQ5P&~+RP$225c7bvC5nM(8Z
zeLGdO)KIP}Ywdzfq7J|#qRZCLzSVi~$-?eO;y}DCv@ZR_0q~OgDI}onIo?=@dd250
z`9T^<We<Lj&i)FD9jz*ghvefdY+Wt1^$F3kJ*OI{zXI8RvUb$%?G-%?Pz=C{3DSAP
zMg8Eg<A)Lyk~9y9T*w5s0&E35mC)DfYi>*otjxmJ-ypocf>2av3@!pyv<zCNW{s(w
zdlOq&af43>Xtx+>U)X|GSDeoZ!fJ|t<+`J0i3J(T1(aJD!B1pz;{O~Rp4j4lG!L0h
zq`At@w_A`|YK=!@DcH4`1If**r`b!vA}SQs=YXQS$v--zcmv!TXeFq~<%U?L?<$5+
zl2MhfMh!vvy$p#V1Fp&5&M!2p^+%*1ra5>aBqKhdawa(lQFfStjHHQk0iga^^10xF
z+v~plt3`MQs&u61j*W12emjG46z9S(NPVO5T^Kv0)g-C;NcqnX2?sXe$TMRy{;(UG
zM<b=qoX#r5phur2;3A`cLN?*RG~4Am-_0&WQ63lqk**+<7-*l^YrI{f-LlWiEtLq4
z+QX(7Tk;Z0cM=t}5;Ng}gAn)CCX}B23ejxd%<w&arzgR#zu+y}fg(yP%wcBb@O@3T
z)hPgMBGMmge@6Hk2lh{=atwwFlVOxmkxdJs=`Vql_U&OK!^0l_P%b8W=+@;X^`dfK
zqwV0%v9p<~09+yesXs+0hZ*bqNx}L4k=N?1G?4$bF^q97_C6Hr9r!WO{lgkx><#&l
zMMz1c7`CP5lRI%eroVpv=nSS@Pp?6*bw*>>_34xWDRG46=d&N89?Wf(T(C=*{ak*9
zR~h>uyv$kNY1zmoqiPt_>sWGioJhb_p)bQeS%i3DeFc8Q1%O}(iF6M_`%~t!KA{ec
zT%oN`Osx(LuckC}LGYE%Is*r2J6{QL!J&cj$m3suMw1dM%8>~z+^>NUD!W?VtmaGU
zxN~D_NZH)^=SXM91LiIbULi+eCC}D)?6Vsv_bncV)oY6sVlaj=3R5Vv>$&%A#77<+
z|12{h@r~da>%FNjyN-XqG{!ylR~zs-0~XiIOe`G|)re8`MUWC(=PiY2tIrZNBnddr
zA$V9^5An`|JX+7^&Pq-29|r#LG?cYEF&9dk!^le-J;^C1(%+80ZN@aW%MDrWU3W>5
zOqF`5LVRWch7+?qY89aG7=c957HXEhY`rYw*#5LV6sm;))mb8zN}U?f1m^fqt8mj#
zJR%+dA3hK{7=weqHA!pQI>}~G5Va?bu^350k$@bl4ZPFhXP|DE1M}e26-(AAMt`e?
z45xuIyKZ6Y{O2|0mSB<etj$2|4@`)JR@b<HPX{UlQ6s&@NYL7Ji`W8`N6Wy=@G;_?
z7hAIXr*|!w!%y8;#>X&yqP6EQ3neAbkmXrLZQc#N#XhcvIJkMc5lLwzi~L4~uNd{&
zj&Kdy#_#0LeD^+6y&l1BC5MWPK!MAy-ZuRf1U391#|CRi8#A|#gU?KgEeBrMo2)O5
zP77ar-(PxFMX<0A;>)jqY<>WwfWUmCt#n3qo~pCytsST+Bqx5}3JkYr@_+Kctp9JB
z$qhcpUBO4|upsHqW)0vokFm39vAN;xNM}?-WI`mfIoTZ8+VET*3cAgyoAh>#FEMR8
zVX9u{{F<-XyHS=qVqKln4}4k3(Bo=_*YP?(uhKDpy=NMU6EB^;rAvlRV+@>(N={K&
z-%0K{(b$d8_-Aa_+c)bsAkusF{4|0o7?Hg=vLmS-B^V_)?tM-&DiO^us2Eqir8YHc
zgJ+SA8*rUU7eBp_;(6~rb3SF)m-x8TAGfY=`#s(FC)foWAKqtwW5fHwnM`8?hmlM9
zW&nPE_W0cA`ui6Hej5yHiD-=hCZ10R*KXMI^w9tw-ink4TxTx|f|CgR-8EjgHWFI4
zEC5?m9t=|7r{)4=+j2(~+he-vN+2i)JV8GPxi;UNZwBAJK#bNM2|-BVzDEHr9P2W-
ze}w!RI4U3}RNq|&EZ_k8jF==Vr7O52MtV+2*#DuxunnRaxOM}%#*xnE4i+a_a!AS&
z?X52X7dS%ULTieB^!eQ==oS(M@piq8ge@r@N<D^o%y~{o@e0v^yDPhYuG-zMqLFJk
z%sEalb^9pb+6yU$&%y9V(?}MlfO{6dtI{fGfNM)l7d+v4<QpQRc5|xF5k}@E;>qlk
zr^^lXAY?=lgXkMKZTiIE+6a;k$ju(FLxzkCE^yct$r|hq^;cv>Zz(Jw@_l!tV3V$!
z_7Vzx`jyC#E%g;l?~aTtaxHqdi;q4ha;+7U&kEO~ks+HVywKW3p?Xx|S~-K=7-D&(
zi;UX)aKlO%wa-X{hnML}?&iBAgSbE!^oF0SM20g6u8ky9ZeZb$T#+G5s~*ta9qJKe
zM86UXYhWGs9Y#h?$@Me9b?RftkX0Jy#q1`$eTU$htdi8{`*6(>LBcrKvn=6{j?u$~
zXqiEdg59D1j*O^knma3uj1n?xx+1z`u+skjN%;TuNvLiHrgI9PTEbxWo?_bqpX&3R
zdz7>#taMHl5+7J{Uk;O%qIGq8bzR$DQA%&2S`eem_8tU*4&djO0A9k^<v>e31cLFU
ztRVX5px}P_gKt51cl`+T9EW-RiI2I!8s=m<T|mWte@U><{o(l?N;ONM2ERmWg+u~i
zt<(2&E$J0rvKP9_*BTRP>@EGYte}NS2kkg_gMQIHc94tz`p~=$B5+if<ZSLcA962`
zk{EA~*J<5hf`y-ANQ*GTee6&CKc0h*{5w{UtP7EY>MNixBVgEvfD1GYd2m!A8RUgL
z-SoygvnDGAk5i9CDK%4v?>{Ikx^iAR=JxhS*6$FLErJfa0_x=$QzUB`$kXFpfZW#Q
zB-%z7tg_!&iFJ8Pp4p+Y8!9z!i{ZA1z&lX|q9{3^FW1a~aES1+%MciwJRDw5Op|(`
zT(go%KQaxg6SM(R=id>Og?qnIX~wHxKYh(0&2oTFLC+!AlgE5OqFi(!DXyT>E))c-
zH0nqT2G~P1vr>?~oH<P#YS!0i$)1VrSoQF2S+Vhg2by)3BV^kZAg9E~dw^Pe1W3tt
zkh)zC!vd3~Xk%q!!UI&(3!~LI!$YRFJ<h|2q!JLVehp}}93c9WgOvIr1al5hDl0Fp
z|MsS+dX__D{|F@4iy%8HgM!8ogoan(*L1^<i`Dqrnx{9*YG&|jgbeLvz@TlqAs=x>
z5U?;cmkg){27U)Vjx|I=8R-CWSG+m$^qPuEwGPx@D1AXTzXXC{s)ji*GJJ{*mmJ`x
zA3(^>wcrIoHAC+f#Fj6h-qVJ=isT5yXKy~7E&0skFU;mM_Qz5s2)4)f$3={r;0(RS
zP4?_Cv_SX)r6W7U5G&pP+b21o1r0$AZKLA*wH&Ong=)Rg0#F(;sDZEz%LhiZN!Tq#
zu!VR0Z69w=EN|pkNd%Ow+X<{E(KrfPHY*@TUkXzS!-8{Wz>jI(UnNjxYy#X|tjV6N
z!4Zmx_n;tQ7lp&*q6OfMaseLN<^bbQAJ4=e7$^7{apR8h-U8K+2BG==9OB!qmv#63
zlfVzMH(6@QSMz<qDR_D~psgER^Q=Hg?u}~2sCjuH-Ul9lo;H6w(cT;sXV(d{5*ov2
zo5WFokqR8;JtXoS-YZ9T*WL*fM9lk!q2!$G{WHoTQmmqVbnA#g*$xq6OJUqOk#)%&
zMA!N@gn>81d;YB#t3Ry76wVtdoLwoQTvwptdUe*CGbNH`y1zeQmFo2;D06p1kMm3i
zfGWD>pyTNxLd-1|z<tV4O`&mV@PTBu6M=67anj>0O6Y~&mNEpV7}k;Lyjv}AG)p`=
z9dEtllQ3R*CP_SHt~g@j7a+`6#IHt~m{3wPyJU;h{QCL547l8ipsxHR&FU%Ly&^L%
z)6kOX03q-jb5w*~Xz1`2#Kw25bXJ-0(v@Ehmn}J6^NA-R#TUau@hIod#=}Q;kcwoV
z1~4#^7cr7OgAJg$%1#r}`Cb7E)y*(QuuP?IdK|a9kilrJDH7C%+jC{PK{_Me<k4HY
z+icS$*l&Dic?>R8I*mO?bah!)p+v<0@9HuOV`Dd%;PGln<8lpB_bd$54N7muQ0Mpo
zVEYEscSyslT>|B>+;Cs4ya$^i!rKK{PJC(W!OOFW9KW2|R>y7wLMFpN8U0%feiiDE
z)(v)KI~TsA`ZG=*UUw<>`r5~7|4o!=x3iHY_;)M)-E#hn?jQt9$H)V~ffmh#@y5+|
z;HL-eyQLDnGZH1+sgHlrY#xT1U5dBC06GGZN@Dw`Or>e1=&SX7h^Lw(UrD`-WcUB{
zZS<RY;!klQ9hM~mxoU&yd~vA9mB^#dwb|-<eC~Sd<_7XGppYy;8^!3Qt<Ng8_kQqc
zn|o||o6QjmZ&pv=nz<NtbNo!FUber^ylqr~6m&&sA+`jWu#oK~W0+f`nf}6cCM9$@
zn2*@;wh5P`Z&4I{(&oYUE<e{x3amBvr$5!i%?5iGeKhLe`nY);`;#v6*Up^UFaXzS
z-czfD#4Dj;A(Z;rjcm8xu6ucSS>`T+#fDw>4Od#@@=ULf&vh41KkIZGd}mqeePpCS
zP0uAA*y(p~0*~kS#&|>|i%X4;cTr>y3|}s)Tq}l`>3ITXC|Vw8+lsj<nm}9j!_7a2
z`L^GKP@mx&PoG`odB~`ULF^ye&n8ajeSR5mz}tBA!6A=l0<Ptplclyy;TD@<GOm3i
zts<=3u_;<~`UKhKgwwvsiFGh(<L$%J_!e6=1ulD7Jqfia8bwEs#ehesNe%2BLI0de
zhsy9$yK{mXg+jG_9B8_&OBV)VEpe|m-XGd_XI_X^{dUNWBEGVfZq2Hj-a%+icp@T_
z??k0|9^JS0G}k4tZFE9t7&G1i+fs_IpR*+?7|hBMS9pXFhbw;4E;?<gHtlMQDclBz
zi|pFY>$J<=2m&{geYknJB~sg*uxvGBX>+zrS0(MJhs|_7II^9cspy!yH}D!xH0J8^
zs*LoJp-(q}nplZ6#qc1A9&6in>~EYXWO37%x|&>8;mre{V@LN5Mn*cs58CzRpF_+<
z*zHq=MuH<~WJ(X5jN!VDSSH$>T`Ls0gbuIv1XWTy-Q5hsYFvw7PbFZOO7$AcimdC+
zqpP)6td)0frn;y9N)zqN9JTc`F%{(zxR#ghadEL+-epo9ckb;_?jJY<IqOc7{Zuny
z*$v?Z$S;0gou)AO3I-u|fap4TDQ|U|Gg{72eD?BuB>t-Y`+=2ngZqLB(Y0GGYOkc-
z$4C(ssDH9Eo$z)ujBd6^5g549EX)sEFJn4eWBO4(o+vv><t^yh3$%^{gOiS$G5mTy
zX^i!Vz`1o|$LG=li0&GvNMBD?U$#0ICTmLseGhk11G_WzTGwmdit6BnRn^yt?VeVV
zo_bu|g;W>T$lnn=r)(@d;KZXC9Q9peFQ(HAOxyNY8$Q8mad*4RnixM*BONSnszN%e
zGs+jcq1DBMjbNjh7B#jy;l-4FvhenRiFoEs?!^nY3$ILch_7=uPB{andltdLXKVx6
zalxKvmd#gh@wK6OP5?HEBP7z?90pBJLi|E2Y@6T%{>FSiZEOzqmiDcJV9(Z8&%Tx?
zR+RR7eO!xfZRy{r)mVjPSc7d;5<f)jsBQ<_vnwySLWC2wvM2lUr{PslG~qoTt^_6_
zwQ$>?ZWmlt?C;podUum%r|QkSajlifvFI}sma4c_@!I*d@A}Km%lxtbV=3wv|672)
z7ETIU4V2O{Fm`(oR~Fj*6hS(UCv;@3i52IlQe{!)9o|0>z&|@KFw9|J7{4vCp0e_j
zC0k=OmQ|n~LN?BAJo>ga#RIb{R-#>)7Mv5;&13n~i`v3)&Up@SI!4XB!=LH*X_&`=
zKSz^xi2Sx)`(~nj`E1NT$77nCQA|L<5=%}_{)VE4DAun6Y%nj^*Iew&IctfME5Y7X
zcd${-e=A!+8dlNA64c{d7(yf~3RW^C+(mZy$a|deC{3A(?5C3)XQyvN%|m*kTD%#h
zsW|&c|G19&A-(`9n~O|gle8L4rvw9EKl5A}=hvR0zM8g~{^2<|xrpMax=J&=?HuE{
zl7$)w8vuJ-Nb;V`_CK3%8|6z#mg2uJb#*9Sw7`_8%%Fx{K@UNQwn^}O!da}eB+$pm
z6vEfJPW3gMmEgBovb9`eXXKaXHYhXa9TLBv+Zt6GZAZD}7PEaYMgCMi;|S7FVD+2+
zG9;63_<#4tCy4Y?$?^%pY_%f4D6nINT74>OsXP;R+=`F&boM1;!|7XY1ioggd6-CU
z)_1JWe`KXS>XBu$DG_yn!GxnNt|hGL44ELh;X0K1X>yK^QA$w-gmb^e(27K<E@x7Q
zj3bz*BeY8y23RxIyItq~MuG0LI$#~X9Y&n@r%&8{{atuMRH(8W#l%C;ypH{O5Fdiw
zci?a)-O(Ycqb#)NyL|J_XNgCx7uVP|lvrjX++U9hWLZZEz9|*{d~S<Bd>u9-X8XGv
zC+MG;Tk}OL*G)494q>~FV05UdQu=~>t@M1B%^FUQAFRzW2#~>@nzB(Sb6Pd+sFT=s
z07_XMiDHVUdLd<{X}&i{qgG@8YDq0A?#$BQlrlCZ)G8QSe6um|G>_QqYuG16DMjym
zw0V5gJ-H}|=Ap>A5+;#klVd3^!SIU!C71pxdGBDi7A4bhkl3w`@_T;+g{@o{re^>Y
z_EQ-9v~*hB2i+fsHXvHt&z-(&+=34HjDP_dDMbiC!AB;>=r6fNCDQmb<k_%PLNU~9
zC0%IvAE{bmyymSW8Q&#%taKE|t_CN?&J|PTa7-T=Y@f!oax0q=e@s>Tr{X!g7$25{
z;k%YLMe7y+22vwwMyaT+O7!5vnM-Q?KegsBKsXr0Jx+nAlZwIa)7Za4MSF?%3T8iP
z^zeB!tJA9=U(w+tbA(M|C_ADT;=&|^_KyhXtln7YpQdD_bv`yQk930xG?wZr;m1%B
z$r|?cM!6g1zmAerz?h*_AOK)}@T6CDkTG?x`l>)$IO&m4gj=Qty`qp{wv@F+n7v=7
z+nI~q1jWGMIgU3f7KWMZ5;imwYMKP0)O*wgwsnGj=UzRfaYQN<g`GPA4Z%G}Yjcv-
zDs)cXe9K}-I)>fK7Q>9qpGmc?XFx&E!B%=lFTGsvNlTY{z@M>O#NhzGqz@kcx2b=0
z*3BnY;3PCpw(Z6Pnpr*o0X?yMEDGOqgsGo_%3Lq3f0(gAt)N=7*)QW)5PLlMT0jGA
zGQQD*6+%rkJ8TAQZ`j(rw5+~-<ao(u=fE2)-@}C?#HRyo=4@H3Vyy+{g3~NjmLP4x
zr^{WG%O~geZ#$EA4!@W->bk<bDEOgbnkuj~JpaUP8kQmJ^No1#0F5Hk#J4F61+j+^
zny1}NREKQ6Vkj#^BV!|jcYJ#<`7}1TBE=3P{MiUNHOvC^X;c;j7A{;HEhl*^IQXay
z@g4XWA-kP+;gwfviQKB(+YM<V*F$J6n+7?~57fq#!je8Kf6$9Jx7t1LeZQ4;DU2B}
zA@GH0G;?YIS7Qc)&b$!eDv(NS*y}mqSlU>&NAG9`H{P`LCyqU@yS7Y~F{EQP7}cDr
zGk2X=Lc9@^VwXHuWH-xV-N~#J>d7(@YCeETYG-UMqhwIj%(_8yE1qO1CfmOY!Jz}D
zAR*mvm-E}i936g@ky-LFD*DjXnFHA1QapcTe0b?;RXYFRZrO)!#nTGeN7)_=UEARH
zPSv$_AgbJT&dQ}`JKfi?Cx;snyo!+{wNa8h9k&yxDM&!Z;GNqt;?<~u9y54mY?wf=
zsJB7Y?PvFa{eZ!z38j9drmVgd!Wfvg61RUH%XCqbyXtkEqPU0H(#-S6%hwqA0|mGB
z(RU=PuIK3!M|@cQhc0q8(8dSXD(2#K@H~wxroV&&v}&z)NCB==-T8i9eZFA5g6(;Y
zqbJgSEl-@C25u@ne`1;m%Dvi#d`apR!esP&W!f#IQrzE|%(gc>4dno7(rqDeiPltC
zw5s+vHzpO7#)Dtdvc(33rsr_}L^cUiY5CEh6%{fLs!}S6w{|HXv|h@o_@V>1RQZg7
z<8wQ5<)YJ%)2i<u<1<fd7sUL|VW3Wnl%%etok!5u<5cC%)0F>&XjY+4venP9@E^V*
zwMyO;rB6qD!>v2+=Dk2hyi|yK{HrEirP$tJ{^-(ni7osf0W4WkoIj=^w8OJnd>hwk
z^mzQ^?L>}vtR6N-OQTqAX@y+H1~zR5E9YGCyb88k2@ciCp5*I|rlbfyhcNCubgNZ6
z*VfkoM!EBZ_Q&)g$HtGEpJcM2y&PqZuNTtV&!LQ?YV!HI#UD%C$Ur>lr#5*m|8;D7
zwVypHnRQK{*h0pu$vGQYQfgx#0gFJM(>fYe?A1vuXi>^bAH}LA<BD>i55|Gzlmzef
zAB}=V#x%F=qP{Q_DZAO-L`yv)?;!^0_3yntp$$^mNL?Jk*8Br5nx;Xr-K+z`R29`C
z;ctkk-O?KY(_g3<`0&>C46rxx3l_6Ygkm>!23dXcEU464=I3`IXc$$bEyxDpCIUO}
zWkjNrhM?Zl;<U6M0oyw4@#*lF1x5GOkonKmiX_ud{ZCC2ljuWOY{-em9WGtzWKlvf
ziE8^Kvf-eVUL#+9s>Y+m=3c$oyr=cUzP<5jvbn$o(Vj~3ZRV}$af)F~`Y>3rqr+s~
ze6%%K{-l&MQ)}!dibsh+qwH>dS=cu5Q5DoJEb=G+39yd#9Dv!zlYP@mn@^af#_mR_
zoV96pBSQydUqKT9U{O=6MKQZZ?TGiCsI3C-l7`SE7unv?lroz%DtQj+%i3Nn{5}ca
zL%GyZ4+z$>!7J$TlQ35W7wJP>OrGu&_RG6)3Gc&iaDdrq%Sn(4>4Czisy#0dUVOo>
z*0fltzl-$_yYgl>y>&uKE>E{;LQG-}<qCHf>jObe!I72^5o2TRimmHp@hd=t3VRVR
z(PPw@hIFLG-~Eha-ALkng<SYyYQgSz`-LVIj}z)n5~9&K+xF4dQmj|02RhZ}DPpTx
zW4w!aa2gXJATwLXaL@AZpABqM=pTD6t=Hq=gh}x|nk-T&E8k3rUZ|398_RkQeMTN|
z2F_+2IXhSrzkky;_D<EnJ|UdopqFP*ues!NV0Da)CLSuvzQvkNpi^EqSD3BVCE$XN
zy-51r5G;gaY#!Z`*^DcJj#0fB`Lj;<Qnv|PNc9mBS9qfms*^O{CmV`vU&`Rz@c?@E
z5O~Wuh-)`*jVbd#sVa4>Ur+V+GAA5=DZV*YS3$t_klQbRtTx(WY#FV2<2QE@$H0o~
z&KW(Es%jYPPE-sgB%&YqdZny9IWXw;eCDywi=BqVfDfQ$cjWQ8Ccc=wf|VanwMuib
zQPqlKvleefB6yo+>jN6Lx1QJEI(Znkk`UOmVSZ}1qqDQ+*t*5X!0JftS(_^^8PEi@
zZJTGm!%TP3a<q+jSBl7C`OSw?tjBDZ29Myjs;jPu$FNTvXSF4?Xs!21yj_v$7Vgfm
zMZk~QWtJ-$H6j_@e$EyyGOQy}vdh+P(-_~XSelcw;k}(LQn>69^JC9J!0=#>BGN7+
zvS)VOV&8?b`xap`thrN)Eq>d}(DzqeY7ir%q9aysmpd!+%bY{4I|C}VVj0zi2RG#+
z>t_ZBC;mlcuAsJlng7e0z82ob@aiJ7aDWKC<spYQ9Wvuu1_R=tkN>boj)I{4sWnri
zXN#a;pd2dG%1$z&bZx=cKbwB3N8DPxylr3x?;HR2GGdeUwfg?0iPigjwc)D7bLt8w
zOA{7WHF(mnn-U(5zPxrK0!%i{_q85Q>|j))(lMumIu1@SWgME3vi7$^sbLQKY%Zr)
zXBn1%FA0(1s-`6RjwT6>Ala1H{h2OTCyi0ckd4LFX#t(WD{f1ZpU5N6zq{D{fTEZo
z!Gf7dD~7}ElxxN}uf(`7Df(G9`*izSBB@tci$q2vF>>N$ml3#Xcq6&~64HwdAKuIB
zac$bS!zce^BS+&~dIp|~^LZ21g`d=JC+ehabm?C4GA6nwGKnX?jX-N+%JOM`?O>Iw
zyXj|R{Ejcqo(Qz7MO9azi;qo^bS6UgFSjh?39LN91dVLpxy^$=&RS&Hj`WK%_WpJ1
zDUZ5?t-IK&XM$)fT#zu6i37_&l$wsFruA<oqvf~6H(A}aPUHWHG<l2H@rJicWt*gg
ze~YfTXaA*FuJxQxu5Yeqkzg7{Z>T<_rijPs)2+v}<g=ANy6wc#vhoL;6zphzs%Ku~
zR(1H)6=NyXa%Crq^+0X=@xb}LW>MGHTB-)MTRpz^*;)b=fPLlG7TEjtY250YRm6dm
z))_lBWL#^?#yE&8_Br4$Nn$jRT<qAl*JP_Pz_XcaGye@j#35=}sMG?D(;^Gah89Ez
zn+{fIyOx}aT2~Zt!u#`IO^{TfVya%h`#r@TEo90(^HQMD6~B_&R=V>QSNt@$yCme`
zLjgrUwD^0Ut6YoHZDm0u&1Q4_G$1H4*jHAaUL7wql1-hgcWRkdMsa>ue>{J{I&}zl
zvR-N==Wfk?e}2=_CN}JT0lA42-z=^Zd4Twso9pvuUh5;Yg0-mgh~Z!WRe1=cxLPqh
z3Ce=ad`fZO=U|=FO($QjW;?xeTCZ_<CO62l2bx05Ffh_MhhFxQ;tr$y#OgsSL|Zl5
zIDukEIACm-*gnnV5;@43Y6FJk#x*K!&diP4<J1z{D7xuu=_QA%ou3_O)kzpe*kw%b
zeGVD9Y>*P$t+AG@ZoeiwgG`U}@V17S{Nrt_eJsk($RB%=*4Xr&94@Y#L-5{?l=&H(
z>u$Y#UtKY;Tnfj7dA;HcDV$5HXH!M-;lP7yIq*5jxACe+qK1y39m_Z`A`qDg!_GIH
z2{o6TNjMdF&W4;7uRkD@N#m+z1qy+Wfif$7V&|(4{1dpsZPxXkQj=-mwtr1<QSm)n
z7pppfjJUbE+V!rtrrj9M$##x8_69#mW4!GwzMIF5#nOle%V94c9lkxyzDIfB*e0N-
zZsu(=WKvYw$t!z;SJZvJk}_U(U5{OPk2v6Vyg#KMZDO#zF8a%V`}TX`AHdvEW&iQl
z#G?~QZ@ouvLMJHoAoMjl@GSX`Z!cky<2j7K!bcuYx-WSk>O8GM6gT#XLR*oNLvVEI
z+g0-*mUxsiSJb8P_2?tZ;pakDTZ@<R?ud-$?50iS!CEiOrr!Hy>utl+oM8rw(gQ#Q
zk`#Zs-LLwp?&~*BV|>yVFk97R`=)#7KsGkbmbOIGugQ>1Dn<!E!he1yCf>(OjqmE@
zSVALytxXyLVbr>Sl)MBUv-L5l{J`ho=};-ir>yY4cPGz=$eYFTs5E{EkJB8moQgu-
z$zu@^Orts@uwN>P!JVq#w$8UJ3oX7jcx?0F1;uO~uhP=Tt3n-)A{)^w%+|hPu3^Gq
z2S<Zmeo0@iUMIM}FrAn9p(&|R-mJl(C)<1!FQ2iu<S|JXdj;c%=3RB0`mUF5?n-&C
z4X1M=Sp@e_(+(LgCf7@g^usQO(f<>}iq~GLBxtnRsnC6JB>R<LrPn~IJ<n7xj7@J}
z_l_^iw+G)z7EtsEi;4L|LQ&s<EZ}^%jtLz7$3WeO_J1ABAG8{LyoDy`K5JXT@zW|%
z?{<9B<`f(sH4bzgzD?bF!Kq#6MyPFuf&!k`|AiW`T^f@g-ingPx(uzQbIET9`}V4l
zGwaQAS#obr8UB=gvF6Di67D|LzBPIAnyQ;3jz{lslnHj4WTQ@Mmk9G&d>k^b;YDlh
z8cglj^a|#9E5)mw7Z;<z6Qp_}!`?7}YVN|*k~7sorq0{J>Gen&kM>7BJ!5&OfXA^n
z6sYnd>&?Bx;M%N#&&s@Rxq1X!qJHyPDE3w4GozTY$A^OlRHk+9RioX@Bkz_RSC|go
z)c3bmooq6PCUOrOqw8PCERC!>Y20~}erAN|+$>c4675eqFynpV!My{^g2X(7i%<UA
z5_|+C;BdD#qPOHoR<{Q}Pjp-lm7BNKozhO<Z~baxl+!eN0=1^37`}6&uU^{vxzeu&
zn#+f2;W4fl_PFwZhOjw`jFTNMO<B6FxR-h-Z`%>PwxRdW^?#n4$J6n+G&O~awK0lo
zb472S&c@yv@*pSX^=zf$XobVDNxsQ!Qylzf@ev=X_l3)BmypB*Wj-X$b?sY!?I*<w
zYOg4id=C8tgQZ{VcS`5|BR85GM>Oy9mnsTyOswxTt`h+kkCv2JJG|X>u_4C03gOox
zWy=HyUa!f3X|S342b;fQnzX$fOw2xUNNAz{UE?j79;eT2{7ud=z*hLkUI6k{^}Wpb
zM#RvaLu^;;m%Q+@)(#&JL>+ZQmF{I~r0cj@E&StK>{e4TiLEzG8Q(I)ah1(~<_&*Z
zQ{@Q;hV0R=oF8Q<4vn945gw(<Q8eM3#@GZppXIS3C;I?FszQdy>8TSx%!#4;LTl4A
z&-L6nKXHrCJU>p*)X}z=mAa$YhR_O0>J1;3Kt#!qEz%#ruiVFYpWFmv7(&mj)J3x*
zhKWRptc!KO&!yXlN!mY-!S`F-+UO^CBr@FDQOBzbl;m10zbgxptV8k#ay#r>w^t11
zL9Z!%ner0NqVS_(3Ix~O;Ftg7{CKT-{&3lJjV?m#P;X+Y0_9()r;oHe)&V5Ha`JZx
zP2R!^4DxQBxV&Uvt>ih9WOuQRxRPm-xCKGwAVDRNkoYMzl<NZ6OLx-Au9t|f_e6TF
z&U?o4)d26RGg^Ip>0xK3K&(>v(!;oF67!O`jR`KG9b?i%&bx`9zY}?Rw)zSo`T}l+
za-IBTLsQdSVBH)x+&e2{PF0GPg`k2b)NwzoacWNGY%2{JJtWtCn+ex65?He`a@pzf
z1J2=w<V`52UdGb82aY9f%uih@RQB=-v{^SdE#*<+>M(ba(erbvUr+ZAOY(N&SeF!B
zmwaT}eRCA2kmCC|d9GP{lS`W6Tsb&#d!<RaGR%grQ16W4uB5F;lPK+bPaMNBsIRC0
zBzV528K}pZ(*6RJCi2BkkU}QU?U&x9JvK3*G9c5<!1nT_UZJ(Z`l2<rj06w;RY9#N
zNmgG%T@^v4m{56BUe9#mnBG0ft12W5SB9Eo!;ETZt$M6m@hYMUK6p0&Z?YYxju2=R
zCBHx?Ds}QTs=g;+-6j=-x!dALSs<DGBWVkbLl;OckG^jkJ=22u=cb=7R+B?85dZ6&
zbB4}{U(MC^vBJhfk1e_<ta;pQmsPBnW&19F$+d?W$3x;$@hPts!FwNy_imh%%utNc
z4_RHzFq}Q*uJNVgqXu_TH2`^py3od~4WW0v6cl0<@`2pJ3o2BeD0Q~s=<}0N*E)Pc
zl6<i#&X;)76x-ga+#=L@6PlJ&$@j$6gr;uO-<<7;%Ap(ZD4oiVt$-%Tc!dD3!TexZ
zTKkEA2$)gPjD2D4B^nOv`eF<HPm`(#by3%<7o=|SqVKD$7!=x1TsW%9-^HKr-SyNf
zq{p~^k4cOo4xF>O4yOObdEOxCW!k+Pi2jE!;Nk)QWcpfG$v(*hG~X71Mv$(Uk(>+T
zE?MhFCtq~7_1K_yl|kZslj3JvzUgxTR(EfUxpNe}J`#=zWA>YEaUFV1Jy7z7btpe5
ziCn-ewRqG!&^?uRmRc(C6=$iCxK||Ubei=)b_0So?Don@jD!3aU$c$pQV;ghTV9Bs
zy>RdC!O~!vg~V6y%YtNS1qa--S8jh(Ey?iy`lv#I{!;C;Emcxit<m?LEN&{b>ayJK
zzeQDPKn4F39KR6Xx#_aQfwVH&i!cf9Bo^`S60OP+5|(3xNi=-hrwsqzP!0NwFtqDH
zJZb1G^Z4g3Rjm*4rF)^O@i`OR3&??JWfwR#*a>6}jyMPrDbjAz?Y4f*It#;h-LUos
z5W}*@sb;9v>VLQz)%X}a7<@Q>q2L$t{`6z^Jk+fQsZ$yd7vcv`)%e`Yd5q8@e_upo
zZBq_6wUGH~^VY5cXScA9XqP<N-{_4$h^1->O?MgcNw$B}-~Gz+_V0b=q|j?|NAQ=b
zr~{kZc=XFYwo_c46|0L10|ka6j&qjO_q+a{-|R3hZ}z;qj?fQ<T>#nT1mVnV6%z+a
z?aa|qyalu^K=hPD!;cdd&i_`MC`ktZH)qHLief`p9_ze!jk=zcTrPLG;S7-5=rc6&
zN%{Qe?qE>>nuFN;(Ae+}>MkE>yU@zZRnyq50wK+S$7p|~mpO_C(R_3Gyq#yd<_<EO
ztOgX)<+1_}ml0U2SNc(E>B^oJ&_EEs)g-t6hx7m=(~mq5ZV7}&dQFlHs!NaSfOY#7
z{J+g|8yUhLXZrh?0nNq{x-XZ)Zd28ee;sl45Rg<2fcmOb?Jb}f(|iZRhSp;Jnf6DC
zx;JRV%Hzb~r)yb98@2x>-t315ojX6DhV0kOuN4ZkNT*(tzMU$*sk4IIuBV;c>;7Yt
z7hLbw?|SamEP<mK@EM^^#w6X?omIQQ$bY`7k0BI3dBfb8-P!b?D}V45cO<N8JM-cX
z`)8K}^zYX!C*<xA3JgyEZ>}pL82EJV2=WI1@Bit`!`=Hj&lmiMMDc(5Eq!kI$o;DZ
zz`XoTxBMSp^Eu(}pRYdYJNLi29s@J>vsb~c>Uj49{nyt%I=H*LjL~PXX8Nxm|NHOr
zCjqE)uRRaM)PFzlfBoZg2;BW;$`SWn4(fkh+5PqZDti6<$@PQCpC2tl8j;BLYBCRB
z0tOn#V)jd+KZC=J+>)xv!gH5*yZdpkogf2O{7}SQ5~+P(fCCu2=SItHKNy+*e%t?p
z<huJ&)CEW|0Y5<3zZ+mS5cCY1H_CzWVxFPT-|aL6u^!IOyA%XJIHbuJOoNuzFM1S5
zMoX*uboa)|#9N7$7{fCx8R-}ek(A2`2K{d`5FVXlFpF^s6cWdfUe3Y3_YOX$-hFP-
zh*M1tp}ZBGYV;ZZp=F5*MU3K+s-UF#X^XJz=KvdifRxG^3=2}4GYv{^f-6RrH3})^
zcmN^m14sw2LSe|P*HvWI@1BuyDd-xChb@)D&Va>Inft;(C+J03l>v?i&_B0``OoYA
zw|F99Ph&T14A+njBNnbLEc`Qfzp<p^c3%2{5NHVaG*A>1vPefnd;7^ija{0ktJL8H
zl5+HWnrLNja^g}MxK|GYqIc1_@Dxu<*eM|wErrwL_o3NG3_l+<l-1%N$EEo`vaezw
zQ(5`D5%g%oWIp!J!M+gQ(-kW8_y$O#hQa@1e@GLMjvQf8>-VC`X&IuTkUT~h#0$cE
zA1y|oKaR{VCCPx_>Ml^YX;<hhr(^f6yn&5#0O?4I_6Iiw1LLjZ?b`s*IzTfTxh7f}
zQy>r`#Hh-ybmb4w5&%7(^z>OpD#GYPv`xc+yxT&3<^`A;3;@AE4vt-uFty-sIktb+
zSkE~)5k5f^O;?2A6S=36fkzEIbxN%9Ux26=gIQ~r6sML=_CMab9@_mWX%6e|{#$g$
z5&RnAQEHHgDHo4k8i9@&R?rgXKG3jER)FHHRi<)OZqaLH@+>|de+`d1wp9Lp1dJeU
z!p=f!8;aP23{*N_`VPe?xW#rYG=Cj|pD|}w=hiB4UhzMsyUVv|6@d9!I%}2#i{X$4
z-=*JOE4X1P@b75%9p0;#hio}nSTd9X(ml9wuicmVULZ+kM&L91Fyq)9t?1_v+BG8j
zSYIGBHr0f#W1cd`p?wG7EI}t3o4dA<&<4pS0{Nc1FQ|pQVEk1xTjT{1X8Q%rC%;#&
zm~f8)F*HshV-P=!P}btLzX7E)h9s_uw3{5JS6-A{7_ali2Qo&8784HWJ^0-W^?xwO
zD-Z^_W7QKQ<i*+#!W~6k+#}Bd)}rLvH?(EQ>!3*-s2=fq@RV^FIcBLpqQqkYut`NL
zEO3;;wvVvQBOdlrA-A|DOZM#G4ti|^IEp94<q{MaxRs81T&@KPhXaroD?uaN2lgyS
zmS*~I$mE6W?n$5HVT18Wlih%rAgdE$fXkHM_|0TYMFEJU!HM_ghUbVhMI7!B$#ekD
zBFL87pl@(3uWK_SaYJT%CQ*LOjQ7uunAMBiqnv%v3m(R4gelGxtBaf@APl-zKx*2+
z6oa16xMu!gqyq>~D#07D`EjyMgu+iKh!2A{^18*#o2B5B)9U;5rg`+Atyi5Jxl?H3
zV{tgN%8@g`vCUZQcR(S~!_mm_p#cc3OIlTf2yf<8TE~jUIIktJMW4`d$`PRUWe3Gl
zyO%G<|C~u#NPD~MRfh-fAa4>mtRrH6haKt*2v|ee5GqFOAigg!%v)XCB4K!Cu10l2
zsh>#Ir=(&c`e(7|BmO~s6KMr)@B>H0{E4&C=sj|gBoc6+TJ4wy=cl@!E{n7|qj_`l
z{9k`RN)SSW=VN|m4Wsg3^)&Ef!Dh*VHN)to!9ORjx*)u@_u6I8O5q205yt#c0VCuH
z2snV)KiS}K{5@ihAUn6s`6q|q&_^MgPUBJgD0nN(smSr6viBu~iO4hfe|f_Z?AA8a
zeV5?;sDujdm?dZ^Ih8#e7S-R^`Fbu4m)R}Kt%G`T7}*67!n8xeDdF(TkD_1K$hNhQ
zu0qR;;ziKyR)9UhL_OF(J%Ew{NF$BQz*(jwT+~eE7Z`4r0t35Twx$Q|*pU3<8<bjq
zZ~aKarNR}^AAdj$d_nOy94Hf_A~7H%frx;<f#fdc#SSop$nBX}%I?bpCDhA9KXi_=
zn|?rMZXp|jk<beUe;NlVQNzsPz9KA34`XP|e}{1cVyqG1`#FHaW+}vRLy&UwV$Z&y
zU<sk4SOfIk7HQMV<_tZNAAxSurgTX%gsXK8w0jO<wi`Q$aEnn`W<8AS7@Ny8vig0|
zLIFgOkUj@gxgD}~4l|51bTdU*#G&YBE~`Uetup2D@t#S@KPo^jL&D4EYV6l=8G%`(
zx>Gsf?~8JDK}f<&;I0z!tPEmYlZAJ=krqJT=HFVZ_~67}Z{)>kgd;1Nz3>HOpCbsH
z8SeV96Fk!@rXoZ~fWUkmwz>!$EYiX{I4nJY=@~kIzd7qhrV%rllI>;Dua<Q_!@xOt
zrkAO@N;naR7?=DBEi7O|xtNr95eOprIAlMu>LZC_GDJX{nRQlmdj=t@4<pTcUcH-k
zmA*<~Q--+O5n9(%>t#W0p#=__OUj@tUHF{zol50nm_1uNa2cNc<|wlmn>GNBei_(L
zbP}kp2@s_s;<FEhWnDL5THW&c4#Wm2M7(x-tH6b9)x7EL9PB%EAt-wo&wRuesv&Vb
zEH`XFDiv&zQaYifmk0C!9s;TGC8EW7ARcvxLZ_7epI<9q1zXjEH$&USba2nGt@{Zr
z=78`R+G2+ZT&`W$MSMM$5O!^t1FG$XB`oA`v<0_!S2lOVonNsgy9&lhJu-2o%i;o;
zovvN^`4(3JG)p@mLs`s*m>NZmeivGoOm=8xw<FAgn43U1MNkP(9dQXbjI%z3khz~D
zQuK9nG(ki5;W(n5-uRKq;Z(xla=Bi-YBG>erz6df{x$>NA>l-rE1PsEVp}l)R$mSv
zk8d|mZeIo-e$;}|X=baX^|~z(Y|60qNpC)UV}7|^73>676>b9&d>B?&Ms;#ZwcY88
zcc(?Ki2L8K59o@~m;|wEhdFu7i^Yud!;B}nq`G9nwykb~_vD}ytN0PSWnv<zA!i%S
z#|p2vHn80{wFyT&UxuMK%>D%j@W}aX;D_w%Sw^Ty-9pfM?QU)WrAl5yAOCqq;I9|J
zEzD)n2cW~yaCx2lmX$q!6HM~)*-@(@tJC~k)BW#fK<liw>Cx^4%}vll2TZL37QvUR
z13jjYWR^e2b9@+pc}_CU*2i;?*^4cgWxwtGqAGD>?COGs_UOalEcR&;3K*TH@3(=v
zpM8qIg#YK4<uPSM6M=tDO_o7(^aJo`r8P5=Qp$SI%T;p5D%_~b0Ji-L*XiLF)=QWS
zv`LHOhq@!SuyHOAxohkm@D#|VdR)Y^2{N@qI$4mMy=n&7Tj>LwBiJvieyaS-Gm)yF
zmQgCeowxKliS)TknLBkV?{vNGv0h<Ijeakblxk6fdfVP`*1`A2^^BLE@0E378t~fm
z8qYv;KxCSP#wlc6f`E;V^T4=Cm;*sCH^bZ|fL&tC=xw+?`qaFMirdv&UdiUORKd1q
zeZ%1Y<!#J@R$VV&f;Wmrvt|{^1x$((*79;DCrXHxE=6~lH`WGaaGj+$<_fIxMppS2
zniJ!}fa>T2m~PldcxC%M{o`A!X;!O?$sl-;?yf%Hn@mioVqtnjGZtz4C3ZW&*~f#V
z_-w&ukVRQn4Z`M}oax=UFVORn{O)v~#=hn2lJFC;gOe2JA`_GoeS<I}Gt1VVhu7M@
zs!V4#4&QazfV~p&uSp5*9fEzy9zv^!)x9PjY);2gd<`_mIheF+uI=|Weq*66-49|9
zaVA|~Q(d8nE1_9PjD$HVmT)@BKIcH!_g~bu8ST9w<lNM>OhiKOQNxE;o91z0mz#iy
zn_4|-yIjX~>P1K0`D>PLoYrS+!7Z}Yyjq*}l*iMK?5Wmgh>$&=CJ(F%;?pWeRd6e6
z{Ax`pvSqzmoF|rfHt$Bf7<<dSb^z>gXt!@^Z|QX9B1o*8Im?qwU&21}@6bJmJKTTl
z{lR+5T!r_QH|YjfZ{dm+A3)zJn?cb{ohS3k@gG8|wuevLGquG?4E+-~N2cg|HQ72O
zaGI|}&inqTy<~#wI^hC5@3-^;c9iQgmS*vL?IBx{a-+N_I^#Jt+)(x=I8$4_lWTqP
z0&M2f>?TFgqwCilOYdumR^!AuZvEAq8bhz$y#Dwn$Kj0fLE_9g$U7n)Jp~vMWgETp
z;nd9suhM0;7&dL}9wEkXU1h$pz7O%Msoilee=hh5NokS1AtbdGb)=Q@$g>eSSb-Ba
z9^EUD(DHNP(l6?UFvrT_RqE+g03J{pG33wLzLp!lnH2Yk{fdc+-+@*#9Jg90+a|Bb
zOL(XrIghT^*3==pHk5eN7tG5zoGm3Itd}xqTV)Z&fXh|YVUC}$eLn|7a`)PWcPiBf
zq{v*U8t~h<!63P@eH>&uHkAsr+ZX5FdQFng0admG2_?5Utr4M3xJAvoWZVQJ<!{TU
z%);+b<dYw`{Zsd>iw8r3si%t%91pf9<;bNOVvlKJJYK#1HU7xgRV)gL?~%S6S9uy)
zr``>f+6KwqI;ju)JGVfGJl5=4q+_83lXXqo`9gk=&jQxq%r-*YgaD*<^Pb`qyR;X{
z3JD~b9zOOoB`w}zmC%&-cr`st+)fr?RH`Xq5j97A-9o&E0bbIM)3q2-tYH+u@b|4Q
zx^?HwJC|0t#lDxAwQSTAXZiQB$q7vUn*MW;ZYAy^(srI&UJNnC>p@n*!Z?e4cvb#d
zL`Zd&I(n18u!-)SwvHbhBq!KTWOuj-isGh!o1_UiQ}gpIe`3y-hakeI*9ATD(Al|m
zRcrP`>u-Yjtk$Jj3KP5&`U#Q3YS!%V$s&IF*Mc{5^m%W5O6DvX25CuBbsm=Vbu2dG
z`QwRde)sw^lU_-Q*@;U3Zga`7og+^AcJ-oWT3*S{KQgN07>VL7rf8d)8q0ll*sWxd
zO|Oof+r-u3=#4;rgO-s8acvjm3z;VlimeHBf-_8tFG*9uq|`r}OHo>|meJC@?c0&_
zlZyg+NS1e19D7yVT+7YIt-p%$^ek2bZ_o1bzCYuQn?POxuFJ9sqfcw<o#M!wDKr7`
zTUFo|n-Y0kwmVI#+T;R;(#zv0m`pRanmX(hmVp<s*xL8FlY~n5*fh14&xx5Gp8E--
z8i(9sR&@=$6NVBOo|qC9PjvAQkYF=m@5nJ^<L-V0;gqnMUAnvUM4gy27)1o$F0ELL
zmAM-1^U<*PF9D=Sj(lsM-oM9tCyT#iTzhxG>hw5Q>C;{~W-R)&TdxUZGSmoyU}y20
z#OlFl3y;N9)v}TvDm%hVukSPY&$CjLhAITByMZk;Ub|UAs@5w$DY&_frzpO<QtO;;
z-Tebpg+o%*g2O7;o`ZU-`VpVTPv)rWO!Q4@QmjWvE({S-V+HMJ_bPkj3aok@Y-Ffq
zJX3X_>nMSJA&Up<gn<e{=VJLZuns?4NfO55;P5~jfa-!-sn`3`<*JzAbK$OK6{f&*
zio{TV3y{0>7fI<6Slhfxi&D%sDIyhfBw%%Tj6OBZ6#24RK0DeZ(%M~w^J8G-qhV>K
zDgzas{D4{w#o7-s;pe>+X{ND~bJZHl2i+v^G5H(3vWkBF5<<z4YqZYPbJvqDkB4dd
zozyn&UOi2{0pK6@wEN#|?;|7|<!RX$A$kOF$yBh!nwc~z1MOZ&oDB4s4UX4Y>%;Dw
z4i|fXv?I-b1rPQL04#?;rNr6mBf;q~G>YSRc=9ljQjpu3wf2a1*LMijVh6cf3B^CX
zu9b`_W>7}u-9Iyu{kF*pd(CR@Q0uvTt3#QP&{?NRtE#d-U@hwYsPh2vEHpO@Cnfr-
zEb(y>`;Cel8SEopM3I{44N<Zrf<)B4X)q{9#qZ(2qj=;+43ph>L1;dH{+#Ysws=i2
zu`c9wllkKcrrYG<mdmc8Y^4B8wQ<dERvZ7+#daZ`{L(&^g}&dnfW7rdwd2L=mhF!u
z=4eCCaK?#yIWbB{JQ|;BZducL{VeuUoxGPUy2G!||8=#_W*NgUpn!{CwwO=<2?bV7
zt$)eD8wmP->+ub!?qtPK9fMXv?W^XBL|LFk1=0te`R0z{rD}Zw#tiym-M16GZ3ebl
zRN3veH`ad@o!@Hs9uSB2pjsO}#yD_q;=pZN!p7pT3`Q1xghZQPE)hIpzhIm3MQ(^+
z8$Do1_z)m9`APCxPZp`U@y@}h{Gr;J%^Jl<y&FNkgJq<qq2a@HK|I|@^i(Yk%$vg6
z9#Dg?b1VLiHs?p`v?DD3`H}h4)z=%9DV$jwYZNlfl5v<A()r^h(>2CV_i05@5$|m*
zdiy*F3T$n@3CG6jR?~aBfiNqS+cs@PbR_^K3ydk~a;jWSC}c<57W5q8aYU)PXc3Ca
zdoEF_-Fs)C@;Ccnhx|-zI`1i@<w|V-L_L|Efl*;6R-cd@LHlFkoxg%7F6)MudYmF)
z^^JlJp8RhB&wMG|B<8*6?%MtQ^IO(OGY-i|HNX0~Q9Nv4TxaakTE(iR93ow<{zOWM
zMsBE;lF<!ZZz3^DEqJch6YV-st<>0pqQw>IiF=1;_jM68O6}Rn7P2qKW*)mq`vDlb
zh~pxfl{C4^J=ZGr?Y)BcNd+O3{Zj%q<SY2v2DT7Xc*}rHnu1Aqo1RTXUWCXRm>jun
zxnL^!cD$rlOuI8KD<MK3H@(3=yVe-WSdsiLbijoid>zaB=^RyS>o73_%|jg~+sQku
z^y~56LUb=Vh@o;-ypHd9mz$(438LG#+gb4?I^sw5%F~?;zoyH9DMibpKLD#h&k<w|
zQftXVeBN^o1rw=Z8ZU`rdM2>Lr$t&&SYkQCW`KVZM-x2C4(;ts4P~1HK@cXP8T2^F
zn-?Ch<RL0SLG#ed?Ja8|J$e%yD)@Yem}|3GPgt@F&u1cTVJJnEkr0=-@VLup<uiI)
zj+|5%j#^PR=<!fxdUDIG)O*THkVXyf*+9p@Gr?GN%BnrnZ$NEs^=6YYN+NDXHpI#W
zvX0RCu+q#+3dR5fz33BZW8M_}W#BY_?gl3@p}Dt^0V$uVE1B4@L~%<|I3F`jJenfg
zwu%j2U4}yI>LSUaT;P-2lQy3d`da#@Z1k0q#-B3!zmxsM{k67=p<8n8JKs032s)6m
za?SLMZ*o!htvsaAruk*J+J%_B0T@zY`}97{uFr9MD9++k{D;J7_Oe4)9M*vBxYZV^
z<$wk}DTpt~6CHxq8Uq$}W+^A5B~8m|$PBc2viZCy?lSwMPml!oI%JmKXI>NobkxP`
zd(XsgaptPt#OgfJPx?tHE&IE>%oI6^I|pYT9+n%>V)fVNi!ot2l>pq%pMeYPU159h
zC;>kvj*#U0wYu%RvM!Ua(buV6XH8|H1740J`N5*MKVot#kR*?kWqpe;;qFLv#8=_V
z*0iYdFa_|w8!xTlv6pxyX{;&7^q`47{y=;fRypTnEIsj%x7o)_-4k%G%`<GjS_bnl
z?V_{u6Cm~F%`if_b_nM8sL*iR7D3oA@E_#&8#0Mh4PO9)q~C^Y<W^x!pRvgl@VO<Y
zbc@bWABf_neS|Q*XywV-u?D2IY;j8_A&)EHuV_Zv<Qfnuiv_Wjl6T%xsm*9+^PI|h
zI-H(o%KKq$3)I;x0-RVl>KfE4U$bYh?JWA^Uu(THZwyyxTR<8nOQrz2z_<C%RaO2u
zX|EdQDVQo5@z3tR*exkAM;q7zks`YQ=YqIv>jOssG9@^!YS-Ne&U6(%JQw?g+r}3+
z>kCd-Gz}3e=FjebAvC-X1sA<;K5`u9)nsm~&%vX#$DM$oiCV?`ps<h>dxI5A)lT(k
zM_H3C>lo4v4Ac&6RDD4p3)KWMO($%<(2h{!zh#jTgv|AIR#OE8-N|>z6BxDV8U1gy
zS)I7spde;cMIRrqFx15zyT8J-TYh}&!KJos$nXw<(8?ZvhHuLy#vb)g4t<DmslhYg
z<kiDY5OVxN{@G;<y+#HBaxE1zgG&%0P6sDjeGy12?NlB=;&syMXyQ;x^$bQCO{6n8
zx<jYya=lMqf^E!a;Tiw8l3e9ejq5GsTazyW^utYn2wrp6m9{I*&L3=Sk{@ngx@fp{
zd=E6B6=`}9)7*_1TZRR0{}BJeovh!+ruLA^M%dczw<@!%<JV6^iH}bO@E`*QyR&1#
z_^FQ?Rl?f=lj=XPxz&vfOH!HTXK$}+*CJ>J()-cuMnSH?ml(S%xxrs&Rt*n}_5YO;
z-ABMmY5$iO;XDgsd|h%~H{V1$0wpSta&(HWhDf4O0b?1iSB-)Kakx3KL#OEtB+*x}
zwuL=h1}1qUX&NWhL^pGfTCE@NvjljbJM-s##e3-$_yncoTE!_uaFjB%hEbD@8LvVg
zbw$wxgrVgc=krgp@p#|W1z;*6!>G^kJdyIgAFDRCE@~P855_{P##Csn_wXi!X_tR|
z+pwoTeA)8|t<P*g+|!02N;{2Edb-lAp6S5vq#yZ0)wt}1_fpL$Og|~`!p+_jWk~Og
z%T<?OI$>YQ_$M=szW|+{rmjH7-mnIjS*~$RT3ceAH_~dUF^4KU<%MoKIKsAay{^C8
z({D}`$z=X9>w6Q#>Y;Z!%L>!x?|z^dJhRnt@ZrCOe-C8Mahtt33TrNuTrw+FaOY9B
zKNL+Ezm>ELL^dDzU+sPOSCd`0Eh0kb2{lMf=oYFp2|d!K*-(lEq$nUwAV`rCdIzN!
zK@n7>C?E(ZgeuhvhzNuhr3Mh`c=zM$d+xbo-185d@%bBu<RQDPz4lyl&AGHF6#2eW
zdPAh%k<6Q5$K*uA=yymJl>ORx!}(v<jox^=2-2rxKbG2>K^(X6cGfP2YjwXjFOylm
zZMbZFhgAFHQEsQ=2fTpH#dd}Ed`OwQDWKaZB9ql14kXKSunVp1-ytP3j3MmEb^&KY
zP`&8`1`W?pNGCM?+4G)btHh5CMPb7L;za2FhJ?Qgzo<M7++4mqTw$a0P?KYetnr(t
zD53n0CD$>47Rs2;ynx(Igh^g1t2Jg<#b#vNjhY^G=SHL`-uP4oAI3*j@$aYV3Gkp-
z0ml?Mh-9mq=5L<>IU)Uk$Z4{w*^`<KN}HOUvBU(VgN3_s-F4gEC;zK72<4~#Zz#`b
z!JhK?TW8a1xg)9cyW|_@d@lx3#f+3`*}LSo9)y}}2<)-&*7-7KCjYx*9CNojC{x(y
zeFtW*7xOzdfJJECCaSNk3GkMt1PViT9ydgXH&kB}$hZbz-xmQ$J?nFeQ8)E-J*N3K
zJxLNfKX8iE7yKPQV8%Me+PcT^N}Hatto_JJfla^nZ!yt&Ki@|p)+tqSofun(CC|{`
zzVFE()O@klXPhhW5Sfep+Gy?u6~zK>=9H0RUoL@fcZ&6yJkUR5eJrnZu|J)g&UvoX
z45tQJpa5kp(_CB>Pu~$ypJcZ6lsxUvT`IbQ-(X5golC$SS8n1*Kjz2F01J<G8KT$x
z(7a$mtD65JP(G1k&6VG*k=t~MuQe{TF=iZC4sevB9{I!N-*6J>aS_<zoG1YR${>cv
z%M-+eDFIJE)Y~$kLf4xUn3@%@zp5tL6?L#`^9oQF9#jV1*(z8K)^~4f9Lj!CU}Gk~
znIp@6d#d5krNLeH;LS0sRsOkvDO?FSpPTIGa2HpbC=}`S{Jtq3qCy9Q>qIltQaGbj
z6>xCOiY8?<z5`>36Wf_4g~%os96teivN;3BS5K^A9OsBvgC?@{HsSFh<e~oGePc4?
ze^KfO(ik7t;DWWTqW==B6@W+ve{ucML`|c}!cXQ*M@<FXVooE?klF=4?J_eKq%8mp
z^s_(62`nNMMN7=kXY?{if%vlQer<WfUclB#V4j0kA<8$THMFhDX9elw5dn|EpuE3?
z%)I<Key0tt!{ybfR!eljW>L(tWTMjFQPHugG8|5?M#CB|T}%%(FP}&R_IO7Q^AEss
zU?*ZO!_t?(u;<BsVt2-#+tiic3}P&zlTEsWGMAGzoieT^vJS7VVY}zP#_=wE_8G?(
ze%QBU7og^xP$Q(@mXN{1yi5!5QbNk;ivwW$oz3G>DwZOa243t468esw<Y1TmIC31=
zQx=cyzQfd<m>bhMz6AV|jQZPEeL*2YjrWvGN<i8M+<IFIeG`2%^H_ow(Nt3V^uG+x
zXZkI_yPp1w;vQ1Ec7<d9p5`k%xQYpEed>%pjXRl3>CrT@!2oX_y>Ltt_&%jNM|z6Y
z-SN{=^P`!1-4GelP?X&u#1WvQ3sfd6o<^r3iBi>%Ry_Oo#qTsuvghIY$%d9~k5YA-
zKOEK1!_krLnR45*>#~z?057ma#;4U6ngT7(Os(y$zys=N1Cvn1&+{cSa>j!!ojF&_
zy71YIw%dc5dpkc?40W2f1X63hMwTX2$$2x*axb%5tsQHiLJq>YeU8mDu2=ABY-2ZG
z{$)JWFdqVnXS@|RE`Za=gx(HQE7D~e$iy2FYpn(fQU=ItB5|FXGx(5J<@S2}K-ap_
zUz8+$e6RB3!T3BSP@!-aNf5n6XW+<KdvC%|?>8?k^37sZr{boKjk0huv#y`$kzdHx
zs?7;k^xIRaZ#d<qj$f~QFl3OYS3mi^;B>0=96*;?%pZ(oWpjH^xq+5s)LvCEMkP{z
zCx6#|cNJ(l2nPaTmpFybW?eYu<5)hnf^?SxK5ahoJuAynPvPtG8{H~*cgO64(@nnv
zVViiuuVt@$(A9qR#FTXdaUbOT5B><zK|t-h)4&k@bKLkjNd5Qz0eyYzhhjbrcY_we
z-BiX7=0`AhbQ6(z1~!f66P}$xeG-lg^kkOayi(0>qTISS90F0D`q;a5T@rZxe}VEr
zP9R`i>(xOLuHN-`89%!8dJO@c#ahN2a{|nBs#Zs?A|dUWvnqv?dBi;L-Tu?w!Yagf
z-K#oDqHC4)Fb<q5W%ku6(}rQi56(k!<uKnMpdaH7giaFSn6(=lYh%(g9fo;q&T~;_
z&))#H%N+ejVS;RAZpmpGk-qpmY{7`?kP<31@roU-t1V%1duY`U&>ZyeWPnm48@k!Z
zHDZq!sB?iNgySl|kRB(=3FB<}CP)dUFs-VE&lFEiZgL>hK2kJXePqh8aqNsf`Co)$
z11j~ZW!x~JIhsS&ncKLpXTs}!&hFMKYv9_=ln3@8XNuXp%b_%^`ILrFH|F<gH>g(z
z8FdfIb+0D(YUxFQ#w2U%+!Sm-N+ZbDf*z_Dk8=lnl({#3(XD5mUrL(gn(I`%1l2QQ
z$An(<rff?e;E;haXPZ`@_e!B)se&cXw&s34=F)iKB<u3+BWg%)0jU)Z8i4vl4XRg;
za>mBL_0cJI#+PjLVNiy4aCy_$NBl*0e(`tPhrko{uP|LDz|@uO<*by`bH!%MaQwRi
zZmG$1xI@)ZgwdHs^mW-g;wF8p_CO%X8i799$Nqavp+gNnz6f%t)be~k-rAICIxpIb
zGuP4En1)pSmPBxQiSU7`igZ3|Np_gIg&t-du67C-XyHU`1EusBjAM|vh5Zl1GY+Gf
zl#gY{EvEm0rd>QhZE!TVep#HFKZWXdXqU23gE+06D%JKcBJPx(WlGj9kcP*9yK@o9
z-ii(0y<s+~7KX!vMUd@*u<`54x-mNN%<3u8^r%LG>RYOTC8a{y4{#0)%?Ur2UX4Kv
zt{g>$!lq!kgcBcr1UHQvi|@{tFYVQxo}ZAx+FiROpD<ga%C;zBU}vWe9{|E8!*KC5
zvraFzYcJxFh3A4R2KSAvMAka(Ozb7Pj+Ri=bcA%9uzhD2q%zr0*mV!!fGwzpAw1Et
z#pI^-)ryvk+QdyWVecqb<1aW0kh^BFzJ9VxP~~@+LhOVRxbx>d_`QkHVqZ}zSQ3fR
zrHEvo(niF7KxjXe{Ha3(O$_HR{iT8BK>eHRUw#+@7(+a?sIPJZLT_W_7VRU$;_rvP
zWG&+QL_)sddf~${E(xetq!gyt-ZHBN%8>ppMbrAp&r7r3OLK*l7f5tbGWYLvFugp*
z#WZFgRIlFrp-F}g#uvkFkf-miFskZC3P&)lSYYgERc#*&3==aKwyn#}G&XnaH%`y5
zw)1pW8SXJ8DiyL1vxT?&&Kl+v{<N(K(NinL-wX6>$sv0er41>pJm#<WW*$ru3E_HG
z_fB>c@E>9l{;O?b0pb?y7=JE?onkt4KuTIYGtOo@&r`VVm>(qJmYKMv8G}F>0#Sx|
zNLvhdUT6n0H(J$xs2{E)JI$&Sk&+7vyTc>Tk;#SjL|r42k2D6yqP}Iv3+%k1?5tbq
z17KF9bzV1GdAb1Z;@JDC&>nsGWylixNs~qlvd&_^O?&GOqRfGMFK*E)QjaAOC<3=Y
zu6i&sT3l}}d{S`pQ`<V!*QCJqE$uipDJn=3xFtP@u5br)Ib+0Hj)~?ju)C3vlo;Az
z8LZ0B9O3C!ZxqyxwyrbB-S|`9Gz8!FTu$=9zmyACh;y5B>T?F{6!H+p!Q}K0k@ck@
zhte`Y=ZV%cMP=SONKM{krAFXzpn=&VMho~YrEmN7l0>oULYPo^DBKR75VU}61nha?
z28jk25F?4hL8)c6@4!<}rrr2MRE>k4nwL26FDJwm8uL-?=K`oxrP`ru6jI#+4UF#i
zsmUKYucPfF?|e_+q|qXS+kE`V$khYXU>oLjZXYeL8@$MRkxKU~Bw+%>E+(7;7%}A6
zV+UgS?Xd@zmX_6lmu_1FGu?;h|0vM@_r(gqZe@@><c-+Jkc}GqAwM-VH;dKz7x$z|
zR*_A?IZ|8eho@K#Y`%!jK_c3uw&<@ww}MkZ3wF9mh5?TfIa%G#RvpOJ5HWgsIEhfW
z_vOPMoc-N@!xR26raHhBOX})&ff?>$vvc)q@`SAdI7hVDG~@x$drJq-Fh;+sJQVsA
zN8H}`U2I*1bxPSCFRu!B2Q<9u4F#N?`|OYO9Z`~a>bpN}h5ru=2;#P=gGIBA{RuJx
z%?=vMC`!Z?*a*}~KFUI4c?j%&J5%|0Mucfolf~V)b0?I?H9qN4*&oOIt)TtkXjarU
zAXArn7EGpB@TfrUuTev)G)lUL&dwjI=aoV*Mnld|xc*t6*!z$%dPg_Zzzv?YMHj^8
z392J6VBc;7%SG*$Y+#A*4rE|zfGLKesN>BfPn})-VW4m1L&AOHTPZfA6(N<f89w>r
z_;?aR7aaF))&9+7&O@@E3N=u0vBOaBP4g%7H{Akm!`Mc^qH);uO<E_)(-lSYH$pOS
z5Rnt2MO5z&;}h>Gx2VC6I~e&BiH4%7QtyqwoC5@Ew8bA0SQl9cTGcJ1{r6n72f_T9
zZb=+SCn8}&cPC?kR`Bi!kYwfw?*rQ7cIQk~90>sk7G2~p;0>~8LTVr=D#)5Zx7i+L
z*RV<a8lB&=v$^6i!}$uJ1L)LSH}i*ffUbM3{*yn(ClV&$n|LXn{@2*@*;(+jEBl2&
z^gB(Ot>|4y6OWGnRthu!=}&-eZEgjwM`)gPz|7#DVvWrKOvMe@Z*Alha6Zd-2Si5D
zg}ggzVRVvn!*F&{3p4c!l<+!-K<cInd{KZ1^}Ev_&A<em769Lr-@`-!{ZEa56sqz2
zk#*JB!*awaz>jPcP?<acp~OW5uRawc>;@Fm;$)<}DHOwoJFp{xq)2YJkt*CDW(pva
zaB3s0T0H>6QxT9ob+w%<4A5|rO^iC=W<JmN>f`|$S}7<U^wkvY<{KdaWf4$%W(KL$
z7Q5U6s;Yki+7(Do{i5Cz+A~086<pt0{kBNaU%OL<b+Rry-n(FUdrF8J@ni$kUPpnP
zoeR(hOz@rmBuU6)Rvc-8$cZBz@p+oaBY+%U4Tzk<ejm>&eF5mCpslue{w<5eUnMgG
zi7y6D0cqK=B#5Kjm3uDnuWyDQ)Fbf?@;SUeM-PEQihy_oX{k>I1J+I)cA(?*fX2^&
zeIO<W=|nbytKm|bkyJH6D%IQbATO42!8#9d!Xg;Cq?6;TT=T$mEgx8PKu%_Qw)bZ^
z&oA2ctATX9U$pd(e*ArJXlXf@<Um6KC94+T=PQP0!||Z83J-Ls7E^Rss)3;ON$-{O
zFU|(CSwg~vs=r1Y97?5<%;CNimZ|(pE91@X@9t~WwNNvjZ|1eE{g}Sv3vx<coZ}+a
z53o0*oh__l&wl=8z5#7Y(7yn_<|fbi-~Vh;K=TK*#+;KI7iexIP^pAefLpcx90u6u
zotm9{{4jU#fuI*%5fC&BiN&acKD*LpaO;MgSO|3Q0IB3=@XpU_h^~~TWNsCSF99cp
zfyhrO>fe>k@`v6$1C}D%=_=>C%~FesMRX*;S$%NPFX^*5A9FkB;f$D$qr_d>GZy;V
zWp`&R^v~#*l^GdCT{9$f9lR;roOm_(n!N0@vNFE=b#pIj@OFXE;P%ps;ZfJ%q0wFS
zmCdG=MiHT~1DM3#&#s{iQK3Wh`kK@cG#nwcv0FeISjLGuMx?3px#({A(GIGz*t`I;
z2g8ySIF5PLhh9#e+byM>GuQ&2PWtyXs2Mc&t7{#h4`54a5jqrEsD(ZN`o{?gnw&wS
z{FbAn3k(|WJ0TgNG$QX<!OJ1_qfcl=phw=14<(v%b0!;TQh&?{B@Wi)-=+f}dg2WD
zOd~{HW*T^lP}p@-O_KAd^r1uG3GUz(_oM&YKPwof0dWs{CIH3E`}2iMkbB7Uy*Z^f
zlAFyd!Mz^~4}PxYkujy^RYZo_iQfnQq*375Yz@_ydcH0Dz4UvzhoPmWO#1>pQAWJ<
z>w0St<z`v$Z0Q8d2n%NC4Fk86hG%xS#-{}~B>+)=am*nE$$JR-;t`-$PGkZAuWStX
z5X=I7QrF9XLD>UXv-wS~M&QNY-fF!KY?o{NEhm5`+-wx@O^HlvZ-eKrcfYOJ4uRI$
zwPNmOi9&vv*P!~-3(5h73jxnDsvIF)?kDke|4h4`0a$k@=+zbsHL1@fQAG`D9IPfk
z2c);m<lsdIN5IfIu@Xd9?G0Y`DLljFAk%#So|^$6?TVnu!-Dz`+#{G3B_OJlcdlBX
zU%+=A!7Gs3r8m$AoOK2Fs<$h$%VP0|l;+Qb9(o%e&<L2`J>cSOdjQ5X1CUu>mxFye
z>;GogW}C!%C#$&qT*2C1Zy<j6Xv^=N7Tvur^<UNA0@F0aEQNhQ+G!5j`B*4*b#IFM
zoEt>`=!p>cd7onxNW|t?50)=L26blymr?CXy(9E3InG^Bcg0%4W>e|jcRh_`rg$QI
zPhYltZ}YNmr8i(|-GeIiO<*EVLV$n@u<eAb=PpB*(%yrC<31LEC)d5@)R$#G*O#5X
zd3?~C=5qW$1E)znZ3Om0P{3#o*rRXh6EZ0g<`LY3{-E7(tyAfBHD>Y<s|4h4@dZrZ
z$0|}F@W2wKDFExG^vHv3ALss@dj%Q{8V***pEcD1r4oE0z@6#&3KrTLC0YX$N!QLm
zs0BrZr{y&t3}<}`cdZ$?^44>Po183=Zyki+uU!7zO9pKucjT~?FqD8R?4Bn;#nXNI
zHFn-7OZ9a<5ZEZQo`wv)-sI<MKU;ST6%`1_u;NV?ffCQ`=gJUYb#CM_0ri-VG5JC2
zAhw_TsRB+%i^V*qsaWD5$CVnEGa!tF(uk>tZ6yx_5<1xQ{s<VND7NaJSd(XtOhFE4
zDR~+z^GuAX`i%W8P&ZV%x;r|YL#Iv&JG*Bwkp~EZXOY)^zTR(!N9BM5ekr|qS*%%>
z7_lkGG-!(bEYdqZsJVgJ^aX3*u=~qzLj=7G?8PSRMLzwIMhZb+mQ`nt!t8<9@p-fR
zt&+Z|m>yTW4U`_$2UR{YeanjOy%M<&HpYS&P0Y+*T4vM32GC<hp5_za>j!+kA;Kj?
z_5$(#7X=&TdC9U@``B|S7hOXr5rU_H-vf@l;Cw$=LW+ZbNRls>b;jJ#?N8}b>mm7%
zTxcJhjTuf4M^D=^nTD*T=Tgetv`2b;6(vu5i2se!=?PTJt6YD=ZGC(T9(Ze>O5J@J
z@FBh6)7bBWO^<O9C`z;BT~fSk2|@~_qO?w{nf>b(uXpAR9O@Hqzi(j`<}qhpWeQsY
zin-kmLBmpcUBE)3&%weDVzuM|k%Kd&Qs(R3&O;~dzk?fV0qyOXM>mM+&$fr(94H9B
z>DK#*jk(lzMR$+B@;--f)Tt#B3$@1fThpBGiFDW7AcwkvuDSWznR+328j==a$cA+x
zmK6%&!|(S9D(y&)5V5{nBZxoWAZrSoIghqzR5A4WcJE;)K+#ru_03j?^oPlgMw&2`
zw&I-p*p14qn}C68x)Rg9{}Ed}wsV23s;exZg&e`dSEuC1bs~rsmkuXX4s{)m<)+2>
z(siU&?6ChpmcQ9}=Ai(YkKA5s`c+%$jz00|qbn7O%y@N6n_VUNgIP$&aog0W^(t8(
zKC!AW*}Tzm?#RP^4eI;(33)j9c2F8W_1DJnjfy$o^y(w-jD(zdB;j7mU&nvd%_t^t
z=aJ;Q8`h3qtQ3eAcYQN}SjXBVG3S20;$Pg{hDu9S0<A#$w;7<Z|Dn*mKY9Tk0`iaK
zAir`@^Z21Q;%lo$%Ua(;XS;D`*E$f_G>@q*RP!y!iavAF+LYvM{y5<E--x*i%bIqc
z(wqLG1I9^`4R+Wm7VssvGG<yud|0NH*<ZW{RD?l{gpEqz$YW<fTb3jRTi9;*T*{Dn
zB5!_kIAvP4FnD)-^L6HSX#GW`X?CA4xI{>SY%8Pzgd(+5&deJ>K7x?=%~*NARAXP#
z|B!w3eV$aijaJqW%0q+l9!p}kv~=(fBOhnUG&;e6nN&w0T)m*3sVWRPmCk~4K@b0s
z`p+U(Q4c@uqj}t;QO6rErap+Z86l4Juj||I^*O7{`(LmxebV~MI&Wu*nhe|R#h5}B
zLNW5vli><^b9MO)nSov|D>w6yUoQ$d(;U*x@DCCd=NHjF-hkb8Eu;|Ua{?q+#$unH
zUcPFzQ29N7>CG|dkn~ZigR{v=v%f7W++TFg1Ux~7;cMK|a}w<}-T>iqem)KJ>G`(S
zy~JT}7tkqRAX~F+cU6oyfeM+ud5eUy)W%4m=I{D9P4hS0eC7Si^LMmMUPCDQp%mE>
z06<H84CMiJsWXtAb%@yaDE@Wra34Qzk+A)$hijk6A@$%^X++qX^@(xlTm4w@tA=en
zQK!%=pFb|*lTyRF%xQs4D3Jjvku*quR%fq+>wk-gc-2E9Yt(0}@wJSx+dK};4%>)i
ze0xy1p+Y<4kH#ScN2qVLvSxs$Em`I&t%OzAz~!Kbvi7<e5_1ZD+z)A8dHQ(S9z?3l
z0sPdcXvk;N!?ND6*`cq}_*O=_ZP~A$dKI~+UKxlO*(7K&wZvK@S?S${Jxm$c5h3T%
zcaO6J%0BOEc)&K8nDna-d+N(x$+;f3ld)S==Mu)cHPYzg9utc8OQeG(*}a7CE&;gs
zZI$~8dDEuJxNfe7E6)pSA|)av!mBawx8-byB=WwrE}ZqaeJ;)#GMi&?iy_#EVlH|!
z3!9)vh48-5?kMYxm$0c+Sm?tB>({EH)lPKKW8YB*dLDLyV{cvVkdo@<+74DM<Gk>k
z>`NGHUfa-`b$W;}A9Qf`y3Zk!UbR1{kWH9J5<eG33tXw!;}INdFm0sK#Y!tztGC6M
zHqBqC?7@DO0R%T;)zDy&>bz~x{GMpmWL=6WRNr`PT4L9?9G+Sz89C#}JH}c*@uiv(
zm1i*4zf+QY73`I@%X%MnV5-MS0eAP^hXsFKCOIQ{l8vCeHk9VJ5d8P*l|O0gq!cwl
z{)CA)S*BesLA*|QU`$NJTJ*h4k+_X0(?pXA@^*HX0)|hnJR&8tmDm7m!d%&{T9v9F
zSg^loa&}=NzrT7Q#-k8x=b%dN<30biY(LJXEhn>i(eW()<|$meV-~w_o94}KNjBPG
z_JR{UkGN=Uo~h#|42Jm7m!FFj8pDq1T$Ya!1eSw7I`jH5yl4|=7e<T}==zxJkPh);
zx4+^Ytr1z3LCG+Tnz@Nihz<F)q7rR(bWQ?epF>%`PW*bDEOM|rF!P~J`+b?}dj{9&
zYyBt*C)<Z`&m8y_xf)N)t2l0=_FTEWn(s5`c{Zn?Z%sp<$J%_(0bMxc=9UeKzW&TN
zd*G5|8hm3tB^o1AX`s3hINneRiMWdS)!l&l8p};{Nj9GebY|@1>O4}MwztmW-F%};
zW;?{XHOh-dGSqgB)BM*WIx?2>n;rJbKY}2;&Qk{vfa8kQs&?d^%5rTgS4fAQNmTTF
z{IqqtvyB^fVNn`tM-esCH`~{t67Adta+EPKLK##46WlF~xP9)n6UYp2;zJ^pvjSe6
zR{iFcqK=&Ze5g|gnnZA$5CD<1l&(w>w7DDGQpW=dics|fmd*=%N5`?<7nVCCQa@ji
z$r<7N5h)#nW6&Vwr-U=>ezqea8}FbL+kcH@w{s%G!&lp#y}#K}p_QS=K%c~yH^x%4
zGSxdW=B}Ue&U;mR09wqxU{+&I-H68>!=m=>CUiIa@rw`5B2)VYu9ITZr;iR5Qp$T~
z21WREKk+C^$FR&)6`dt8VxM2Ji>ilPWs#FSNE;upSr&0>OFr-j_^~W*n#Rj7oDjPT
zKL~=IyNMn_4=xAH-h<fVk77NEVwj>i9uI#1VAjgCvFi*2A_^;42J?)hjuUv-s8D>K
z(w~TGmXT_6+~#3<+CQU-s>3KHE+PjS)`d5JInFvW!oZrvF_kqOd!?R^M+d+2u%+{+
zi}aoC-du1-ynfM}z)oT`N2rq;GGcohv5E!qi-5e;x&H<<>dYRk*i@eo&Z{18xJfV>
zn6=(LGaxpX{&0Mepm$X=h!xNZ+j4$Yf}FcyuwY=|C5y#XQt6TQ<bNlG9$Z3U!;eIT
z!rtoHc|{5`l@$Vo^nOU&U7H2XG|p<%i99b;K&ioHK89%-B<I7rzW+3C!RuDFPEV~S
z-w9$T*{;|0Dp0&9bCBN=a*8kc=hHNAcP8*i1_{BLxjGZX8MNuf<@bn#crkAk0ORDs
zL~o4r!=&2ksypl3&64ueRTMj;U1Z#B;V`F2vxt_cVH3iM{titm&(=n2FIE!W#fT|d
zvN>L_WpeNBu|4eg!-fS0);cZLQika2=Sk?FQb(u=p@ZPq(iuirwF&QGZ$1(uHq05^
zFtHjFjDeRs15Vg5OVgK)xNj5+lfBgO^&bABn3XSLd|@yx`{ImaGbwp_s~^bS5AKdD
z+XT-AVR9)9_WLhPR-1mja#o4L$5t`v5v~)ed*W4gfpBK-czBF{O!3Be*?P<RHGnPM
zKqs3qXe@0E?0W-Ug4yO;BE#{Sn;x}G6|Y|=dle8(b7KnsCXo9Gv#lS=&rX8CmY&iw
z#h`(FLO4`iqO8sGhWJ7uaNjowex#Gr7K7|gZ4_x{qh%hsLZ_4LDMdrWq@CcD^f?^G
z-QfB`7<M=@CE*Ii4W}oDWk}G#IKE<_MNh%sEDkb|4i9EjAN90Z+6Z3!wi4q1yv?jL
z_Ykd3q^!_IIK#z`qZpy6enwsQnd^p?h26nvw0`a{kB;G=zvC*B1+vw&O1|^8;!i1U
z&1NqdyV4W#-zg7WIL52Z{M2&2_v(T7_wDjZh-A_{1xO`L)*iJs6^DiaOeYqZ$gDIb
zqXtZGjS;?119{owQj0;6p0f295h5@(BBSHIk<~a?<4;6954(UuRa@IU@MGhjzt_1P
z(}_1i?YV)9TSF%ukBn^+V&DvH9@6O7b_`_M?(>PS!*ijSYQKi5G;&PMM(ME$xbZzb
z(t0mNkLH43pAL-Qm2?_htV(7B{w`}#;ro;q*Y?T*5@IY%jns}$pQd{sRfW_x0^&;~
z?fX1p3_{&PRg?vF6=$?8Q00nOx2qp|h1BPJz(P$|_|z*35mkkFYa?&%5Y6xBg2pY3
zUB;b=5kdOl4JPk>x=_!Bvqko#mMQRja=n$WG4B9Vi9z6b>?T>+y^-8e#|xNZ6p+KP
zBYc&{UwuijTWvuXhkVQPqMbPlq>-=iZA|N-MJ#(;A##iwsn*IJM~F<x%2P1`2?srq
z{#mWGm2jZ-jU5uekc?d{zMdmYQwZ|X2Ng$_XL3+4;V6BWXWn5`+ZxW;?p%GjykffI
z?~_+%4`Q#4r)zKFPEG;d6EE6xG3{QlWuVPP#U>whG?Qye0BX>jUjr~Nhn5^D)7x6#
zZu8CYi4P)|S^|G9epxb_ZTO^B_H-Q^Y}@DB|651u5!9vp@<Cx~*pWRgUIq<Ds+miF
zhu2HwK1DuB#@&#bEo+ZP#)(UBtJN~`8{tE!_ExJ*B7f2c<@w_9)ws~mXy#6DH{OC$
z9Fx~R1fOh=U&!=%qSf!aO3}g;=X->#=dR=5*qL=MpW~^}8HYE?ykeNv#Q@b^f`}he
zj;oL#G`gP`Xp83C1P>@?z*ak(qL^?c;f8i9rNXW$s4EZZo75pc?msGq`~qVU5sea^
zU_BnB^d%DXD>s=v5x;D8|9Wz+FCN&2IDSWmTf6BLXdh;RGwIJHWcJ9|@>RMX^-6o#
zM+j>M7nZZ;4{eeMVJtJg==R4aB|fWz_(R{eV+k6XbD$2<5CzgW;v(fCN=4Q6F1ANR
zn2f)KYZCDxf6>xM<C1HuDiC({D~)pue)kmC!{0@akK*hwdU&amWuIRCOsqX?@6tqX
zz&FA9tG)`D6mem@SN<e509()l*utLWXd1v4j=)Y-P$Ce(shpOBI9N`ZL#<xV1k}~n
zZMlc0nT5Gmgc5*x5Jr3?GWzN=9piqR!z^gHaaRW@fRoMI0aO*vPL8^|@pV{)<FKh9
zQJG6@517LU{5YeTMg~aE&9YSWky7)$M5>6E(BZ?4yuLiYe=-Wp<b)~cDipBwahA&_
zAx2>7(<>F>RhqjqDkyGdG_@}UU7UY}b2M*}$m$R!A8=QH@vE)XQuT|Ate{3=?Tp8v
zYJ^x?LdPyZI(`)ke)&7uiUvpfF{Jz`Aw$b4&|02{x@zNN^cCjc^m4a-Ls|EOJI(1K
zIsx2FRZdyt!E>%3(yknXjl5`*+~$rQt{k{lPKW!rkqg3)lHaq~UpssI+Z=IouYW&u
zBo_V>05~Pzb)-a-P#L~9>906O_3r9;sshXGqfM>wL<&;-Cr+KVa{ISJ<8Q|bj*wSh
z@aO;Jk|HwW;M&`nTebv{&-9!>F}QRl31xg3G$_;?ZMBZHdJ2<iqj<xaY?+Y*$g_m=
zc6(_FNMZM?cN!RRWxCbFD^wv~;^tDeI-goKpB|}v(69LQ1oLDhy#`(qCm}47N{)JM
z@`+lII1e*NwXWP()||jaes5XUgSjOzYlW}iZGeGj_D6!Xd>ez+t3d_?E6yEorw92=
z7Iz}U`Z03OZOUFjAT}g0x6*dvC;C`cQncC%WQP=D;VlDZ<$5%FucsrsIMHoYGCHFo
zB*lPh`ko2?TviP}Owv!!AUIqYD>>J`OfE%eh6^>yq~09alTMu8$bJFXoe%vvNjb1f
ze1a#clmP-H4~VlY$tN%)66JdBazr7VjPj*Do90_Ar@!C-8AEh{;j)4pKJhdqj}DU?
zh_&=K*VL6p>>_?r&|SdLd3a^sr!n!D^H9|Pc*JQc5Md>g!~RTs38D#ik8pyOdtN)X
zTl58xQh&6n>H@=IQjLA{eTUv!OT=(PeiIDpc}B0(jzr#m$69el6&VIoLw=G+m-c|}
z&kS1WST9$tts*s=N04<0bI*>=ZoccwNI{GZm*Sk?e#t%$D<bW5dLs<M%(X-<%yfCJ
z&2G$I<VD0Nz?V{|rvm+K99~l@kQXPzWsP@X0Wd8XJE%!rT&{^S)3dTU5|VDSg&Jll
zkihSyhy;0-Q?zDKKa>M(C;LV_?>mDuIr+}sv?SqqUG%*kp0Fqdqb&jY4~U!fSD$1K
zRqbpjuM^(sjC=x?67TbXP3f2D){x<KaiVJ8_f4?>a`*IaB&JX|`jIee{y-qZgeNW5
z(-<%2nn3Nt<|!XCbb`#vrF?z}rhq?cL1&9Us9BF-9x73rA9iNl9KQ5s=W~-F1*3gy
zN`S|;NG-Ck`okJPZvG(uX&|k*EEh5aaQ;W2tD?B$?P>%<$0osnhL(6irda0tlWM{x
z)~^x5$8aN4JYf>8&abD(00T|_P8iuIlcX4v>mxJi6s<)27I*!{9Np9_<m}989-}gD
zTOE9doeX==sf-2IU(VO}6W_-=A1?Bp;l>Tf=>L#k1f2!Z!U;)1n#j7v`pyccJUyVU
zU{y80Rz81}Zap3s-^FSxvuIDC11)OwJ962Pg`;-}Q8`WAMT{)vCzW5k?p^|JKIGO&
zno&x0Z=Q-z4y5FN<VE%EZ!#J)O{iqHg^Ue6fZ<Pm$|d}2c%ukhaXExxk=OE#qo_?S
z5r#3Ix_BHXC#m&lhfd;{0iGR#h9wW+Y&|*GOep2(Se3==9=OQuk|oe9qPUU>)`9F9
zAsV+)E6XDFaa}@iH9OCg(B@b%rV#{W-7hPD6SZj_2<%lJjROJs49t#lA_=Hdn7SZ{
zR01gEbx@G~XkE#-)>~yGHwC9<)#WwF{t_I+Cn9p|%tO>I`Uto3NT1a;6qd$ASX&0B
z+D6fxoIH)3LC(!A|2US=m_L?%U^jC1llC21OCXTa^;5r<oen{LXz7EI^jYa+#Jl=_
z!9YVmk?>w9j#LziNW?L76X&=h6)>=PTnN3O!)6_hXwGROwoqZ4z&SwJM_3Rx`vEB+
zrBi)@dyp+{7rmI08HbT2xFEPXPfzE8@@)mG+{N-c+c5GX{3e`zGNI~}vE9>+A8qR(
z<IiJXJK}YKwQZ}N*vPLn3U^$-N@Fy165Rv3XtM^H(i97U$5TM%aQ+TNNxl|OW+^J9
z*m6_@j`n)PCCZR?pUCA?_G^|5`@-!I6}wMHfw<3=alEXqcT8gM{L$BcFY-I0o=Q2=
z{{X1^)Hv9^t&(N};sTmbhP?@ly#r&vRT$1JUzR9=gP9c*rHNu|a4^I_y!Nd-*n;R<
z+%CN%8uCoK`kX~9MpiVfJOie14DslKc8z4JWdkA09GxKidoFx6J|vd#L^Nbo8o;TJ
zvF$H?-b@GX@V#lu1U7F?kaf1GpWQKp?Kz^Q^iMKqxtFfy^rxRbF`-j)e-tCiw8u71
zEt;^&77%Ah*t$zO;Q5$EzV0Hq-+IR;L6taW7^5tTz0Zd8d3%}=&K9oaJ%uEnaj|?}
zF%jvnzeYb;evLkGfU&4N4>bLY)`>_1Hia+!%Co$uzp5Z7=;z?pc8?-(3~}QkgiW%&
zeekt(#(4*+>l+({6>U#t!#RqsjogLbHzXhi{13307Nn&PO8YMSgpml>;6}%OOODgG
z=M~6vu#=(~^&TGQJac0D$!yO-?01WbK;$_Psx*LAp7+3HU$9*F3~0MQj<|7I8*>t@
ze?R&nNwR+m`>-R6B^hOV7cOH&5SS&uGI0s4+HdvK-B+^KJ|#(NIHzshcksMol8@+4
zdOf?1ur=G3KmSdDii+i%0aojLLaa<Dn}D+;P@Rx2Ux23?iA0%XV6IAFdX>d7MQJ#Z
z1u0Mx8yhPkinr&o*;<LF*^z7Q1d5)BdW+nHI~m59HzYmo#ndnkGSA6jA20gg5AMQk
z$v<><_kKB7^uKTi*GI>JJQY#wO3<9i&wv}rt%HA4PXisHgX!9Tf}RZx#nX~4xsV}E
zkur%B7r~%lvP9t$8{#`7Q^62f@m5b5!;k6@zbq!@ZMpCk@Hj2>kQOWFPJ%OQK&1YO
zM1R(X%UhodxC>YPNS6Y<$C$V(!u)Aaaw6o};X6%Kqx;BmdGmuP6iy4j-NKVZ7?EA}
z`@Ovjq$mbGkpqw4x-2LkJG^$J+E)dI7tiz5E+ro(VHZzFM$;5|_4)2zSyB}DDZ3QN
zS1#+9E?kqi5hKn?3N7#WH9~KSCUYdHM2n0p!KLRI2f3Q=TMnYGoZ_~{<8Ie|TKK#P
z50M)*jq&_}Q}}+l!}kaGeoJ7y$Pb04=XF{V9{?!R+%l%HF&4WvpuCzDF8KED<*#^_
zI+gY8+KzoD*SV5jqsrR|@89eKoqIKmOP(GnjIkTW+HRN{m)r)&&B_ZGFN+om60(@N
z22eB*h4ZY3j`lHn&gp>XRm9qjt;<N!H~*rhtcP@(JEI>w14Dj}(1NGeK-MGAAe)<H
z2w=^4CV0l~6x%l1@geH8$AxJ4wKH5^+HB>*<<n}=dnxQkWMf^|h+AJ>9wf?%FbL-=
zevRCTLC<j0n<kB22|N*H)tG1=>~umV;;Lgmo>paOyNt|Sprab(4-7G!To6;N4MwBI
zXZVGQdU*AR`khiYBf_z2$%>`nlr}ul$mXg00G~&uqBGv2RE;}GS3<{{X)v9MF~WKH
zHsj#aJpoxZ9FD>_2Tu)?ks*RWp<EiQhAlf9H?&N~9*jA`7J9s@2J^b`+aV(=z~l*#
zh~wN~$`P*j6yX71jwEgiU1Du~jG}$77)x$jkVr4{8L%V0f(^+#5A<Ez60xpSiKZRo
z-^;CiK#|w)c=PWK9}aH#(NCX&`fB=Js;GH9elKB|BAWzv95Kub5?J&kR9{H68khah
z-9}VggelE1+0&KWxHY|@X##)kz<aB+%fF_dv5i$YcgJl%=@L~}={Z~zjDAKJZJ^kY
zMY$N}K1(Cd<ea8G{UuZxD|d75!BbClU72ER{UIetTqKU&+W8fiCE7{I>A5SnHt+Uh
zJ4=<eXayczhNNE_9b+XIekfd`b@}SdGjzZvIB_lZu6<i4?jA9Rc?hOur1RZA&81wm
zOfeIFiM0MY$f8V)&7a4sS3N*Byigv%q#TZK(f|#at|2!ylw9a=?3F^88Or48*}Lb~
z3LXzz!hPnfNX}Y>J(IuLb_h-1s{HlvwZ$?Tk*ZTLLhEy^O32Q42P96k2`v|GfNT(@
z$?4<9+9t~s8;SV!@2Ssd#y<Nh|C=d*&|OJ9e$Xp{+j0sKTr0uq8_&I=*O6WiY#&<6
z?ux1FcEyHd<On}ZUN!!tNWz-j)jyUO;E2BLM~^5Ybf*WHck8hct>(<*%<}qvW5)%&
zi$3is_`2bQxNT9|*OCOirBm<B0x7Njyc08U`<ss()Y+Azqpwsr4^id-8gJjb)t5<=
z42$jC5TLf2%ZPpVfXyn=d7$7`fezD2l#|mq%6rVLvci8Z)RiESy8c#8)^(`dde}GQ
zQEy<3k|>sRjX4?+C6(5XrHaR_bnA8APJRf^9`@r^VoJypl9)jB7oWNAQJ`XM6r-8l
z7;9}fc6_D*{Svv3Xk2hC5c1Au1^`@Bwy@9Mt<=@5zvZYsfY~KzoU^(M3bnFfu@Zaj
z($kV??CbldTBb5LPL1nR@o!rgE;`+f;x;HWNQsmAWHQ-vi=cNV_Ueie{>np~nlRgw
zG#RexlWT-)#BVT8qqdY+4zgontEX6(PTQZIVKO9zJ^U(ivNNRv-l;F<b<$M?^T3cG
zIJ;T(BudUF5|N-~Y*n3mL2O4%)_>9O0@3w&`cDfw;nUg<k4m;8zK7Zw$gNVw-d<~|
zs_3))2ul~?0EPk%Sr@#d{05A)!Z#S#TQ_<Fn&1+|Y0>oB^zxuroFf!_wKPwG)>M&q
zRWOe_wuTA|<WzQ8y{^4~!Euf)bM8)&Kxby*@HC7-QDXZbtSClFBtMzId99&cq0_?U
zD#`i8LyX=(ORj1LT<|i06%G`$Vp^~qA*1-G;4aRhYKBs-dcn~cwi>S{z`=Md<|s@P
zUydEII2!&m>JlS0P1NT7L(TW=9<BM8ANH#v^2Yi5osBrzXxgemynFfxFM)9$n^eLf
z`$XuIQ)H_Vj{G<q!~*dTwS)Qdg`~a;v&EiBDncepIO`M*n#TTe;8|jl&&aHlb~{%b
z9U{G!zbNfvj{JSQ{-j&591NZ1Q~W`d4}TEFdf`eXr`+bDah;8j(36{o7qWH_rMjEP
z5eJLkty%y7Qv9%3sXYoKhL(4~oSHgH_D#$B4c@ml=X9D!b)T79a!V(?KEe}Xvvl^e
zLsb}i6Yb?owl)r>CbLG(QUx{rnD-|6jTLo|+Nyi96_}L{qAt(2<=o_cO^)n>`*K}8
zHqmn%M>d^ZAzduwuX@z->Ur3O;oHxQEeDrC6Yy?>&i)z6pL)rKXF&Va-&46m9N=wR
zi{%imuSJbiA}B6`F4*i9F}6OZ1KuV$*ENo_S{y6}V16_`z~tQ4ac%qi)@?vX>5Fa$
zLbxrr!~m?6?>tcrl0_rHP8$4xUh@5mkC0}0f3F_gd4A~npR->xYUBIrv^G>#cn8(>
zLbgX={j)vppVcIlo^1p9LQ9SK{jR~9tJKus*D|QVZ<SDeG5YiNP?}}*eaf`OpZB9>
zB=#WLty#cX<9RjB9B^mk+aoa*cZFF!Xy`v?Pz_q7d9K^08@$8`?Lqdqj)0uk&p!&O
zP4zuh%zD>o)Act$0sJWe@Su)qr<0&o9LEOg?LgD)+1SO~fPAuKyxeD^DDPmj45$x&
z%LA6@I1<|+niXJtf&Z{l0rb&cKv)}ZILu2N%=aI^419g_jLY7D<o(!QLUx5XI8&fu
z<4OdBQ@pz720ep)GV{9k9-n8+ArOL1-ny3{f}=R=v^$Tx43%GY1O?a?j_9b*8}(5A
z<+0%;2y9Z2^@9o!EIT*(<EZlnt+uA#7DR>l5cqt-8sdZfnVPCCFi~8E`3j&mAgK=u
z_erYD5|Aq30oymu;&%dy#6dv(xdhdmfZev;3Fu9aRKq~ZE!wdTq$xeiD2IiKj`S;|
zZZG<K0GeA)k^jb>H2o*B2(yLQZ{eSH{~nrup7;+D4ouv4D8hxXEQb9vrGNjRzb}7}
z1iUqAOZUHM&cAQ^TjVVOBlDfyBJg*b|E`{YKK?%s{WB{68Qwp{Cbs`9q(9p$;XhmE
ePi@*323*ZGLVuk?p5mYa{|wHUU@Nrk!v7E1<{bI}

literal 0
HcmV?d00001

diff --git a/torch/ao/sparsity/experimental/pruner/images/prune_2.png b/torch/ao/sparsity/experimental/pruner/images/prune_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..5aad9d0451bac3f45eb16dcc8d85fed98eac867b
GIT binary patch
literal 100490
zcmeFYbyOVPw<b&o36|gxNN_?2w*+^0O>mdS-QC>@9)bpUcMECUg1fuB+gHtd<u`L@
z=KlY!%UVrKRh_D{&z5KJ{hSav8PT^$?~!0&VBU(02`Rw9z=2?3UaTX$27XzsqWc5`
z^TxzPP*6@>P>@*8*2>Vt91H^^780)vuky7QCrw>J+}H;}46y}qv?vZu4emvOZ$va4
zhD10be`g?A{&!6ng_6CZ&`1qTWqxoAO=t@}&bK&+iVO-WD*@QXL+T~Yo6Vv0&qj}5
z4+mPDVE*o|i03rqv%t7~cUN^6LNJpueIGWC(&PQw3yUGHJ5h4R#%2VjkMGIJ#p@+;
z-(z9y_r3JBCq?^L0_;RE2KW#>Sax32RS?Yag?j-F5lohX6+5|hAw(E$uUE$#txRE+
z279ppf=4w!Z54|<>~qZoWAHxeQ$H-<8R2*p(zR-|=-Foo9fpqztoI~C@XH(0G#EYb
z{-4uR5Ty+}#PP2d5@R_0wSjwg@&)T-IKL+$y%}23>Fzt*d=<LE4e2{x$+I?cdS%At
zKd)JSh~eOs4G>KpD-C5+kmvHxQ}zajC{Dg#C9L36bb^s96hlwWt`v>9u|+8+(h=Av
zT@Dc8H-BSO|B)4>Rx;d!6=X*eSoA8DYG?~2U=DL4kjX{EZ&9VnLaB9tTrwDEGi~~y
znpV72HahiESO!CV@JZz3df^S@pn$v{669NO2@<525zS0spJrG*SYa-l0?S74wYfp4
zGx%N!ts9vforKD>nD{FWVOcGGZ!GG1m<3Kq1EXm(uXY@6K_RN+_u;)LqD@~-9EoZ1
zms|ojN!vaea2<ac6c4BEGCxxC4~~%yqJ}6Bb{34zv(<20k_<%1?a~roM!)-*zqQ(X
zV{KD-eIjMzsgXq`)69^<$|Qr0+WJQ5B?`hTnhbB+mr}4ZeDkBKQ;1ImA{{SKd-!nB
zU_aFu>WD}@1;c4!@{wF)l9EKD4k7OlVf(`7KQZMiYNqlTJ{j{rh2tM%t#YBgvR&~K
zexm8ks53Z*Gle`I5BWu)t_=oAvp*hPyqvXq&oVAl#2oq}il*d+q#iC~9Ip7APqS+x
z&cugNqJPX7%q{Ct4>_Z|S=~Cm3Q(`0uW)2WSi5C8m+l8qi2AVf5S5k+cU{%ATM*}c
zBJ_plbL<KKY(uu}@5aY#&V}H5>8}>aVF^}L;kACE^*cHYoQ+%+;HMO%_>I*>WMs-=
z?21xJVfNtR@vXw}&1A}Fw$oo;LrxR#+Ie<GCCy13VL1H=x`%WlR0wo*yUShoNgahk
z;FYIhA19cXX<K>I@O#J}gLu<s^D4jFCYmOqBn}r=xUY=6Z*;9meV2(1sS>)OVR7d~
zP}M+Oa>4B%>>R{~*=Q2ExOVN|_X;8&I9swmTP8$}KEJqN_8b%TfL+dDpMSJivaeNM
z)`#WE8>ZW$Hh4uvP}+>P{8a$?@Qa`h5{(bRK=)hc7hY&@I@-n1kTLkFl3<ExeD^e7
z=X&GmpisQd(ZN%BJLw~>Lp_an>HUy_^2K*;`Aa<vo!{Ovod);8GWj;_5kI{z@kjqi
z1~fs)n+g$pbXEvFqEJV)@kc^&;n5fxAzzAUydJJ>ELERk!CGR)p0Xa@bv9cR6Jbj-
z)EJcYx2K4ma_<X+eq^w0TaUwof5{t@au5y*eEJoz?QV~#7O9lgG@^FS=K7)~vNn@`
z#PW^KdtA)V-9H(K!u>k+OjJOiKZd&GDtJmE6FP5?d9*uc%VqY-WFUBK$WFf0Y#6c~
zH-Hu6wBl+99YM@aVy_kNNfraV$p0dC!HN13`AFgl_K-@EU}5$l%pnFL<SlENero=N
z5h9gEJM!VM^P3)v4TlY*%}X2lijZjucd30)j9B1DQ3V*v1PIwiY^b=Ic$0Xyn6lVG
zHeu!KnecrPluvmv<m(OVOs90G@~7CRpKa-dQ}m=yenAxA;!NUl;w<7`_mN9*<|Gyh
z*5tgF!z`E?XCJpUeS3j*L3mLT7#J83Sk`6ywn8LEq@jx^qUt>=Gta2K3Cb4~3w&vQ
zb%8m4v#f>OmYlI2mUA7~_uT2N{#wGk{9b%VBL>DCX12_)epE42e=%b&Gj1KtGFCOK
zr~w&co5q_E{B6vhD3Bhh9oaNKt2Q!GHYS>EEm$b|^Ven~WTJOWwYW-h>*tW2=J(PF
z%cN31r`AKI1<b}MJlsFuf9oFOo#D>m=HVW=3URA*mul5&6)Y0A;Iz<rXnLSN7(5i*
zV&96M&n4lG1@?kBnG%_<K3g&HzGcMgASA`FG!`3-rWWMEeZ!2vg7!WRcb~c5^0+q|
zjBA*rUq4?ybli$=xvukN);M%T<kwe`RKbhDTht9yTD(TDZ36xA>ECRN&W$HZYhhgr
zpj;3-NFlZc^ht_CIyv`u?o4ixRMZd0B*p>QL5N||KxA@@&Q8^Iv4#@+-p8W;`Dw0E
z4c*qx=FaA+j!Dv9z0yvtu)BnBl0^Md{fY^L37SfVO23K?RTR}1DsolsRKkj<RMIOR
zb@a_*^{U!)hc+8Zd}ya?|JKf4rV%XA^;lMF<}5I~Wwk_kL_WN}>qp7=%k%5W>{$6u
zw!6JS;W)oM%@NKp;FM&bvQeHQ^W{_RSD&vnV}eGxcGYJZTO3=;e@X)L-fHvQ4;+`#
zrO`Fe&1oDq?lwl+r|!)z-*yfDjyO_UaU4uNBs}DqYgv4#3HlZESN~Txt!&^H#qg`N
z!8GmW<`x4jW-Wq6rNy1ay%ww6?HkAq%_aMu<nh~+)PuS0yOH63`X#HjCGAI<NA}02
z`}C-C0uuZR3&<Kw7>rhXf_E*Sl8)>0W_vC18L^fe`0p>BejWbf%Vi^TH%AsX);q;u
z$zYmwxgPjPA|p^9B^Gi70xhBi;@DeBB=vWf#Pg(F)Xf~t)E%GoT!p@VBw?bmby=4#
zMNAwnS#^v!CuFx<@1;=tCNSQZFFJxUQn?){znvfRODVGPojw6sNI}@EiuvjIimNsA
zHBk2h%MUgVOZ(34ypz(C&#S{988ASgeF^7u-0$^YPAtD`ekUzpSEY8*uvI(_iu)Uf
zO=+PY7?sSvDNrb1$X3G>5ts0@mb=8we0q99f3DjllA6#-zxm+0U~O2M;YT#*N{>)U
zNOK7MDB=#~PSYswnVw60^WhuIuG?@ro}jkCCj~Sb74mJg*9gi8>EPD}r9JLx*wHmi
zMS@r#sW2Tv*;gG_r)FYiQYv}ZA)EU>JN?qV3n`*QlEa)dBk}8EGBQl5UmO&Q#^u#$
zEK>GaT`tz?RO8H>@MEG9r?&QfN8N@Yadygv2EFomh5d^A>-RkTq{7dNOc$$Pf=t44
zxvAgK--9`LQh8fk0&l|EKNAia49_zmC&x1uxzuj#-&D<ok%l@BWe)WwpWSIBHc)T2
zrX(_z>9+RXXWc}3g<cmls#K{cx!d_X`AQ?71gr+ohRwe#(LApalCy0c&#5jxKht|$
zJ>zY9)OfU8`aVETE7_7@sUe~PQgzgK`Fom5ZLfq?>fCHQ-Z!gQtVmw8SdvzMtC1`^
zlgm8L9Lj95F|u)CRx(Mae&>>Eyl@d39on)NF-5QTr0b{iCqCl$2Q!vrbE<`NlPc%4
z5?XY1Q)&yU{#uMW9CMFGT9>(GZFkRtS<C7c`@2Mf?*tZ{Yffp!r=^9n;AS%uX0?+1
z!)b2Is}1f<w_UTJrnURvL)r!SOUlcZYvzZk4|l&VS&5F><?R06F5e1<3zFA}9aPVj
za>#3{dzgCct;7VSe7OIB^00Y77b$Yq6-st1nn0LCnCex1q_R?R2r~J)byB<*mGX&Y
ziJNf~QWN`2w)STzhr^})^$w=dv{7X0Tedmpp`-bgGoGzII%m!NmTi}lzej0pM5ZUE
z1ug3y1&@_S8-bJPC8A|;n(Us;=es**^Jec#yEUv@V(ix@2W?pAED<fa+fv=`E>e9t
zTb#lU(GTP2r&_fy+pqaIh%UYMEzV3iPZ`cvIz<tq4tX|tJ#T~_*X{1^^Um|Y!vkrG
z+)i$b)98yfH=Wb(4(M~}-8jRZ#&7H%sGF4GpO7B+^rn(ZFQ+P}JgjiqnBAM!QoEZr
z59<%%kIojaJSlb<_h)WKw$_qmDbv#~WNxHxH6A-3CI3w2mtEbS-ri<&;gvmoh4Fhk
zH~_aihyt@+0K>4{5W%73fV2Nj$ieeKbeUIo!lgpH87AK+_4ISv7+gEgw0S5ot9hU7
zHAKg|u?dx>rswm*!Z(dC&KRWU=NB!^FTBTL+TW>aYaa!+w$jh(l-Ip5{!z@~M-_lD
z{3b;(e*3cTM0goqK2^4@|D>Ccg6tTE_worA!7CchE|j|!$UQ>AD&mIH(lAuOX9O6y
z7w=(S0iRv~FP;}4{^PUA3ksN*&%eXMzyz7V!2R1fGQd0ZFB*73uleWw<@Z1sc;Fib
z@N&+8{r9usKp8Lp{qu`;;1~@5S3z-c;Qgz<Ef{QRXKZE9xfIh3{D5dJrfLTRgF_B|
zy%1OUcnpj`W%5nMUPW4pL*L4RPS?On4@~E5VGSJzhRc})_-FyP*ClqgFt@bhaONg`
zK7#}J4E>p&l=%4+doylQ6=^wQK`UD@F$*079Rn#35-~9`m#u*zhk}sEzg-S|<0dt>
zx3}h?r+0F4qI3F8XJu<d&&bZsPS3zZ&%{IvoIz{nVrj4IOlxUJ_Rk>yK8_IBPT$tV
z+TO&<k{CL!uAY^HJvS*S^hW>j_s?^JolXApPL_86HZ5R+^w1;pjC2h2|1majDHrrt
z4mlHNu(_&`i3K1t;2u0**uHQ*pZ{Nu{O2A2<w}+RypoZDh3Q`}{g*@k`=yF@U|T^e
z3*esiJpZ{~|90`e9{jfpx#*!&{})mGBj@K|0YUR1anb+BqVXWHz39&Z))C)CNcJ1>
z4rtl)-zo5)0(e2+f!7wDBUV;83=AKPxDfw0=NJ16$juKENxa93mUg-bcyB`Egea)U
zC2T3bqsQ#Xm}JX+NzM`w$E_0)kia0hl5BWb$H+{P#GRnew*6q3bTd40)wbAQoPK1L
z?!j$zWOvE4rCK({bGLs4_L#b=vK&5B9X=~|`0-OAn~xX`0S?yZ|L~Ro<Ky6)qyjs>
z7W02P482AU4MC6ey}<u%2pT+CekmHf^ONQO!|?yyQ;&4^|9b3yF8?3V{a=*x|0inG
zjt}>Elm5ic{df{3o2coidRz8%TgJZCM4)1n>M&|y<u+G+wdj_o+;%&$Bm1~6+o*}t
z{cJcY=m_^mPmfki35RV@k2_=OkM^fcCO<M4AC&4E6*(@a^~caf@ni12s!|rr^Y0SQ
zgv&V(s>epDwQu_D$D~<vo`Cz}?K<c>E!{1$dK~)^NnJH>agPmJd7fn+hJ`7iz&<@J
zJ*5t@t%cjKhJ4EXVAag{@OafmR9N=i?eTv2iR<CAu1%9&NBUUWFDZ$*KE~nM4OE5l
zk%Q>k?oUbY7gllV#P&*?ESV6mB!(6?S?a{y4r-Lq@vU6?)h`>rM8ZR-QofJQCv?Bb
z+9p2iI$1Tk#rt&Q(}x1?;pU0kA_h%bl3M7k8u8w#%Ht18Z>D+L#Ww8mTs6qLKHV*)
zA3D}Gy<)6jDL$bl^ImuD=3JD=QS^K~OGn)=s!89i^199QdJlmlsmY^Z>ccWsB0@c#
z4m*ilNC<UlJt_J<f`e1zSnBK2*^2MeCV8o$#Dql({RP61Cg8;Od;~eQ$AhIu+Lle`
zGV!ia{z(iai?W8zALp^@_t_dBAduPMx)8*^HG(7OZhPQ)+R)xU&`3n%cwH!a#mlg*
z_}d3@&SysNn{DB@DRIEw?tbFE={;|~YN3DVZ@cerZ@n$jexTCyxR?lVu13Jb!CW)p
z=a&kH@9rUg)_ZUOAj{I`)4uA*+r7Lk?jskIAOc6+`cWaa=7j^x#6#DqroiL6Vl|;=
zDM0w)p)kU;eN<UKt4zU3*TYZu!}hm3d3k3;oMpw!uK_Ww0-}7^b}gJ78_D~4EyRc~
z<WDxpI<4ckiu1LEn&KDFX^F!cK5`g8j;~oFji(~q@^n}AD$=k7q2m1>_t<gC&vioo
zcfREbP0BO%4QZ~s1v`aBrBoL5!68~=5sqBJAtT43MK7Vz0?^OukdUUOhr>>0{kjE5
z>PgHo7(Uia-S+{hChevq;O)Swd)U2z434YCb{9HOEXU~EJrk4Rr=zEbw)4}(Hm-*?
z-bZ}_1vthEM`}4IJ*l7Wr@e@bCy-A(6_8hGetzJk@`BnINEDcl5^~qMT>brdRHJ`3
zn?}q0s7X<Bv8O_rR}t8&H4xJ2J$58>ygbiRmS~Q!AP{FYFEq@x-;B2ZD1#~qLkR?q
zBoB!fj`-;-)vl=&!uK_^Ky0H9j^dU5m-^0kLTYTV*h_`@uZEDjtJP5)_Km5f##kKT
z7)MGrADyKLBK?y7PI*4;X(!ebE!_+`VN~4{J#35aB(~jyZB)5Vd&tHFiEjVug`9Nf
zb$j`t{jF;~iLlD1_dE&fJYCM?eJ~-3hP-57j}{!|hbQ=kkiNm-1jopj&pdGCj)r;m
zhxQQZS#4Fx7%U)gK5-1<#}$3xJhUfHt@yA;Q^TCF<u&K*5p-`#bC2aWR@0m)%GBCY
zd3&LH(fntB>Lbp)?z_FPPT_}Yxc}7s$X8}EKL6B7U>#^i!0KKnE<OF8vwAOwkkdD5
zdr50zqdpkXCsmeXC)+=%eD(qAju0Q`tlB&(dZumrX(gMswX@jfMB0Cz0Gh<6Y5n`v
zj#Jy6CW%+L#byL6Ffo&Kk4x3D?%Aus)hB)_soY85q@*{cF=o%s;d4ZLH9967Vea6R
zmYc=xp;KE5!YkNs>`Kx?M5oa~af+o8tcz?9GwBbisxzpzdew$1@nGtpr*qk-Z`L^B
zs6#xp!cF~}hu;Y=%Y$-gYp1i|()H6mutf!a7RK8-TO)d^hm*SlR74(&tXqe&ugSZL
z^RQmO@v7x0^>LN{F$>|`=ggR`={-9C%01xB(pzsQ%5tr|?&ipe$%`W}_e$!t@K`Cb
z9L>ES6k;s+rOwNiY|t^&6cDNX{Vjx|ICk@y^7dQr4tFlfygVAuQjaulHJ&g!XINkv
zF8D<dE?oy0yR#+f8nOpQ(r*TgM*WEHm%*Ru;TcCtZ%!VFAdu_w@Sp!Ua25n9WJdm;
zibj)*00mu6dQiQr>uBugFF<!N491QWi<EMryzbY)8++a_Q7HIFgv|4F{Fzuas)}LB
ztL9MMmtXF5Ak#~Ur9qua$Y>x8#y0ZMu5Wb1QR^K}XM>vTo2%@24{U`FnYReFpNF}s
zC<bOvG1HFy?T6hKt0+c}AEvu`l#XlrC|diqA2+n=QYN}OjbAwIXE=La{~@BYP}9*m
zaGA5HqxFaEP85yh@mv(>jap5;R^eTKfl4t)NFOkvh=%!~9>_ru)G2gFi9qFXFf;7w
zczfmbbhT=E+^5`X8&RWhH+${H1~)Lnd;J@A6j|%oSHN;lUHkDg5jm#iTlSJQIniSw
zF(|OxG@IJ)Qxw*}gVEvuzRL!S$|#FiWz-aDmYHn!r~9)Mn)VWhLB0E)1Pp1V)gR4S
z69yagGlnS^Kf>r5^LvXj%(!`|-K1FtEh=rN%v*0Z2j><qYeDMYzNqr4?KNJH5WKj)
zS@*Ub1Y+aJI_>q3cdf?((wFFT-Fup)BP?2Xvr{bW6sBj@dE6zvu*R{<J-g&53-B+l
zO`2xln|gije!%k_`wTX0rOg>{4NYV(7@CEGsQaX~8Z5L<LY2QvX?s2Vbkr`3I%+@|
zuMkupY-0Q}?A-PFht%uA-wd%Y2Hk*YYgklLYwEb@vZcT}d!`Uhpvq`jN$DO-Yw%-V
zYjmA234DJ>7xQCO)K=#&)1`rAiQzE!m3ifdF5gX%!`m|{HRhT3#V&j5OS1-h<>UoV
z8w%Y90aE6ysIGtg-e1J&OMY>w&{x$ped7foBOEa(D%zZ^YrU{7Xe9SS1+m6FD<4$I
zHE2n%^?b8LjD^^^E(&BVP9M*NjX$2-e|?XJ3-w<q5iGO$z0_HQo$s{g<v2*+ig>RI
zQuevkzZ1S@jB=5R^-IDgFD+rXBf50C`=jv51-q@6Is%FxQf!+&4wmyuo2jcc;r^VZ
zKi$7}beReec<r#dS;x_92_NuL04vpDQdL;|VDeI#@cY9Ou4h-IIgeKGgn7uQL4R;o
zyW{HC?0(~T^LY+EZKz3YK83_MAS!;TH>FcH7?`_+o2Xvbe~|I2fyHMAAdT`STEK%O
zwh<@#->dOhMifRL_rU~7&8>qWH=_+FG|bq70%8!z&uslYbn5L=WDIX1aIbtj%qF6}
z^)JXpWsX9qg)J|3C&1VT0Icz_>p=YU#YOfsZK^wjmy|C$o55%*qSDs+d_<IW#v?b6
zc;~iin5U^1)Cj(?ohVck-XYr--HqTQABM(c9M}V3*6!M7m_@(6uAID-#Tr^JXNwwc
zcZ;@)udh63`?&S@bsqe5`WZill4GA{Dl<9eBkM~!zM#fY!z$>M0UVF${56w|o&aZW
zg*}oBc;Z85s^-k?JJKN(q4wq!eMhXypotBr3@XLSe{Kd5gI=@vxoDz6Ak8{qDgQW_
zMFgprjJgiVJIt|HP&1IeRHmg)Zi)VME|;tQu&bQ7*WlrP<hDe;+14*kynFG8%#Kp`
zd5G;7h0O=cdfl^3RF8c6kqk@A`jx;g344@a)i@&;xnt4`k>%VJOVcy}FJk*}cZ*X>
z$mgjm+*uY~iV^iOk8T6t$Ut=^;&NWZ<lss74;Jsyo4@PrnE}SX229gizPly;II8TY
z1Mf3z5;Ng^05G2G#9yMwq9|ZpyDF~Qo*n{pn7g^I8ir{NoSzgRGyDkasWR`^J|uoL
z7zV<F%I&(|RUZF_M$Zi~2*+3J$jvWlJ3{*BKjFNSw0s-__o8dp7Nxpg#Mr%GeNo{)
z<W3POVEued)=EW6F6qn?9vsjx=H8@DNPU1`YzH%9IHl>NsXydC15UiCPVhzr|H$e-
zXNT{kX}y6SDc-y?=3<z9eKv$P2+VwT9ze2`5@=aj+H>P+4NMC=GEP({S`f*<tckJS
zOD`*3hyo*ODJ@~ZV_V4D*8(K=x8Odc;Ql0)m&Ks^5#z0}5C_0-ni}6Tg3!EZPU{hG
zf7%VPdmt7+=7d$D{nb8aofPeA_=C#Z9Io0QQH`Rf$8^9Q9MZe|zTq`=X&J_#E1UVo
zWv&(jF4EQ~8+ZN6;uXyig}jvAt0;Z&O4<epE278-omBTvC1jV$&X8Y((@)EhXP7%K
zpZjw9S1+e+92){`oI*C)pfJo(oQ4ewnxQq%<MmqD(#3qyle3T;j>Z?O0Co48S0BKx
zi%|pDYYWt8yweEyw{dw~IpGAsffF;G$R~g4O`7HnCjqagz?%o)!K^62M9+&dPcSfV
zKMEv-#lZy5zd{f4+e~^S*O)Qdb3X;NCQrqM1O(|?`Mx=CD*=TFEI|J7uMpu28s@5a
z@TLc_c9q4hn+IL5sor6+_h~QoRZ<LO8LD$17}4>LVb`>GVB+W+PGO_(P!6hFwOsH8
z;4$)h^Pc||lza$Iw(g_M^Osxt#Y54LrEMb%=O-xD4B&XY>9m?Tv$B@j-no#3yHllv
zLDsgLK|l$Y_Zf;Q-@?8p9Yn*_n|MPy_;c%ZE)g*^D$n(@UQJDsz9<@|2Ng>?lU783
zyr7n{mJ9f-N0g^hg=}UJQC_!Ir$jUv#|R?YVhqoZa9KulJ^gkMOz)=p``ps0N&T#0
z*rQa(xYgUDbNs2NL$&iuaLA*DZmSV6cY0eJ?Pqw@kL!~I0wGW<mBE$$(opHV-&Jno
zkZZ^=DC^_70V1(x?3JVyf(RT*_w%~>|CwqzbN~nR_mMEc8$Kws93|;TQbORg^*E)`
zmVr|;)ROjFt`>8+t6=orelf;QXEL}*Z(|Yg3j_EGo-vJN85Wg48Uc*1TrToOF5xvg
zW>@f-3};DwBpzotcNjMm<F?4`jo4sfZlyYD*@%8V<UUItrh)joYypnm&ZdR~K&^dB
zEqVkb)Ht!(g-2YHOI_rgEEP&jR%9OghaWw|R0wIl)38`q38+MWK9}@xoK%2%<BI>{
zjUhmgD(OOEe6cm;)M|HDQ|4hov9X#JMrT^P<as9`b^Kv7fkBKid8bc}Zw1jmj-<)D
zJb^glsCrCB5EHYTz)6DyG%vbb%_Qv-T+ITD^*(eEg>zp{&lbV(ri-=JhC<IbV;I~;
z)yjbZ^VQSmY!-J)JMLFq%Ynrp_n}Rc@5REA=T&A1;?<0a;ex}I1r%Y%6THW<+egsL
zn$){G0&w8zh)nT**eBm}%oMTkke4WI8P!BWY*)V*9Op{*cS*HkP6j<jt{SJJY!E}L
zdO<01JO(n?$3eUr!AKQc(de5-oQ}lpx-yeV!UCMZA%_+^nWBD4DipppUjHmVnvV%F
z2u`P3(0@7fj2u+VfUL)A40gp+T9GUBPs(7h>m~j<H~W_HcX)%gEs<1IKLjpd#^Y|k
zO7=5Fl71Uk;u$+E=NMQhWx^Q(Nm)G4^*N|4<x|pweF5=Gsl@M-zruN0#oD~`mfX<0
z;w3+0aVMOfZ#=eMM=8&ghTUM+_aFRH_GM$P_8JiXld!qhovP_fm#Twa^J|34O4k#9
zJpG{}a!YsfTRfeeg9(*GCi*d(q&gY`%PMHldc_YY5)D{hqw2Xu$mJ<FQq<LYH~ynJ
zQ^wFl3RzFj@*!$X3`o8B7*C%;o=!3eGzw$N(c#C1xh?Ez3G3ijyNYiYzzQnDbp8ay
z?)H*ghYWPN<n^SNEFiYzakUUc3K|bcLX1yZwzR&OP#(@!wRE!Ya{AUJbV)o*(E@v*
zN{c^EH!*xMrLFB4mJgA4ctO%5`XdZxK<lDV8T+(*+6Ktey(kjjNaWMnQ8Vf)`;>V-
z?7iGGwnW_Y`Uu$MbVOLvUn13Yn<++5yer3fn{ELC+^NFv`|3p8tHES_k-|O5>w6h4
z`A_OJOI5S<S3ISrT@81OSFN2L{rJDM3wep2kV8$HOr{XF)os$T?F)Zq9t7iInQ$Qh
zNgW|RS>kiOGXWcRj894qNrmg1p1|KWepnKpSj592&La(_+*hsow}dg1O{}u8q?s`_
zFMo=XgUq$@AVN;&bE1?j!HAZx;**>K1jr$`qsYrTZ!K5_VE)nyV}2>)VScG<^vAU)
z2F{)A*ihRqRE#!<4O=+K$PPg2OT?I(@YE$NwH-D8#eBi+-gWsU(RBG=yXJ*5uHm10
zic8Zy4?2pGx>#r8S2(|^q%;tlY>2gAQDJMt1|Goa|1NHDPg;AGTB1aCu$O!D1laIb
zPQZb`p6X$Ku2&?B=)4a`WF#*Z0;bxKD-=Q|H5>RnKpe36-#<G3#3l3I<wGDq49A5)
z5GJyu!w858yYnG%taznFK-~l8npBH}PR|!(z#>Rt0AuIC%FB%eWn)Rk(@fUaz*P2w
zMYX(5j=-jf+J_e^4?zA&afla;cxVJ0LQ2A6;}9N3u7^+xPw2cXrzCC0Yh&W|ijfiT
z{=Wv7R2)nV&cK=^=kZ^JHC=k78tksD!U9ezQs-Sp8b*bkLJpX?o$-F1aA6778@Qz`
z)jQ=$xF|)cO`*P=kp}`^msRQUsU-U#NN9*UJaw?lt68g}hj|pp2){wOy};(AQ14`0
z#IGJvB1UO<{JpI#Vy}bvCb5OI>czb}CKpemhTW>Xji%LVD)rd*a~Y?`uXgt*-H9p~
z3w<b`&+b}|JdX4#0EE<3j18Z=y8582$>?Lz`O0$Pv^0rLdG9qkIqFj6MN`=#?>68?
z_z?#GOPK($DpFr>{fSumT?POQRUo}V*VnVgEP)7dc0n-HuOX!O7fmVFqRX{Rnwsp7
zK+%yyPwE^d!j?Z{GP0->Lcs{SoVOeH{TznlLqN&2OFXIpSG{A&2znj!qbqU<7qjap
zzYX7Jg3hBTg-oqN_WX{rBU8q_Yo3_2JViIZl!n8EBDn4?^<C3pquCjd(dO?ZjNT^Q
z@*Y0<-m5Lt#T%1@DC%b?hP@uoDhW2BNuh}LMa9?%kZLrGtQ<~i6MWs$qxtE)8UpTD
z;3rW<GY)IJ-=Lowe`1T<5w9Tu`;AFzoqpGI_+$yU_bc@4mR(C7t%vHT2@7F#`0_aA
z)Wqm_2q>lI*6i2Ieqk4(%gOQ8I|FkMr>vV)u{Kw}$aEAwGW0FyqZR1fibD(_(eRKV
zwWOybV$f!k<~-hWj!-EKRb~PIcrXMa;JATirx&eIX-;k19;Wzv8poxsd4qOpzfTuj
zsBNo_J-r~x=5{-lu60?SZup{ciBtL%pCmb&F4&BGV6%!EzB9N(AOz)S(-dw_;IM0i
z?C0H5e21pLCAG7KneB9&)94VPD~=M;nwtO&%+u@9#MW=Oq<R86l5_}N)kEK~4J0aD
zdMr8Sd7_0&r%-ERfS}$4H|h=~lghP8;Ej(s;j~7bD=U%0F_yf7i@SF~K~i+)Vhz9?
zmUf-k%5el#aj5(>#GlMi`f7}%%}%u#PFfL|moyk#?CRDRw1Lz$E&%?l8(JM;F2x^8
z_Ig|)dd!5l@FS5}Mt@lqo}t~s8+H}}>K*QR8e2056=*W+@2>Ju9qN9+{_rKms@3I>
z>*$Rmw>|X|L$_}d?(3Y9GDJN+`+?QSf4m(x)Y~2N*R*6|W5abaz`e}WS7iA_jw=sy
zbB>9Tr<$+c*CoV9#-+AUwd<!t{05a2gb?Kr5UwMxTl)h%QAGQZZlg(htf~lu)H&^v
zhQh4IeOp<^zYAI+sXQR`(I!A2B8VC%cwwP-57UPL#uzzHS-LkQ{_X9Jwtigd)M8d)
zI^%hF&@XYV7YSh0=O~v6@xQwuKq_LC<aOWTTv8SLW4V@KRztwqRBpd|c>=g}C+Uo?
z^vCnOLA$iGpIJ0e%cs>ZXf1uKFdjou`(Bubn0NL;Npnsl*1_lCM?}Ny58EC;Y<P0_
zrh{6u*1LtHOB(%7L&`QYmkWZmI17OmD^`W!JN6-@jza<sgL_o^Bh*9*9-CfYuTCsa
zgdljJMnWDKpqa5g0eTOu`<}WV{v7tatV;PR!=BUgVs}NK=uQO<Q*uTw(Ag0UQ_veL
z5n37~E+2hP=3!2Ol1?p!EgWu<Slo(dh>oDmoL(4hs6t2HlId4<;(ezl2~N=rvT4<Y
zsw*x%J)qo07@9=PhtWqlD>Yogb*V<8Zc^{qO8;AJuBKALIuA>Ag2+G7(QS}<Do)dI
zJ}>h8<&E%eq&JK7N|k_`IC1WhUl+=QH?mmGXl49I>cmcbXc<u}?}8$yma0Ur)H)fr
z^9SFYHPA_TSV~Ny^`-X&jO1{JW#CgWF53KFah1%BO%}%bZRs>-Pw1~mj>725(=xaq
zt1DZZImbDR{!*gDf~}HkJ2G8i#oja5p*kb7>3S~$$7x-&Vvb#A>{yd)7Q&XNha-9%
zv`t^KC@$N#E@P@ZxD~_|Zb3JJ*nI_gX=<A!9?>};F;+aUnzUy|enliZO%XM;5ECw4
zcd_EE!YVPd9}zB{uA%rc$WQT0eT&Pz0{rSuNeYGPGc1z~Y*)a<gp1xQYilj;_}YWO
zIlzE;_<+F=F-^))Q#vh?Yx;!MaG0;iA4O^ePVQ*lNKj{CU9cZ}|Dtg&^|-o2c)PS{
z6zhP$PrI&Jse`eDjsM0UQ--30?Pn)y#>b1VUl8U2OTLmw-G+Bi7laBJ#@t-E{tpUg
z93xCY@2-F4D3E8-yRRT4-w5u2U#zG7BWGnn<S52Wx<LzQSH<K5)E1C+br!c1#(Bw;
zz^{;-+aBiHXeDYq3yw_nv9xSH-5*LR-I=8bFx7XlWekTg`o&FK)GdU9WsEq_?f~#6
z?x$hIuG?fjoZ8iTl)CKCCJbxs<YQ7-=^BJg-(0Sexos3o02g@`a3!gIJ1T%;JC(2W
z(LzJFJ3=6Jzi!DB?7lxyw}HrNr;dhcft7a!AcWjH%Pa$ELjVEj<^nzdZ8xb2Q|lU5
z`FAN1g+Ga5m7W3vdQH}nB;Ri0ump$b-(UA?IjdCVKwSsB%}_v-J|~&jl=%jzMHr(`
zmsqMYlG%#@a1I}Fr`V+$@XTjGLb)2XDD7OvZrtQP0Dz%C%%D-;`xVqCr2C7s_{X<q
zNU5~e5d>n=LY>44n2aO{Gy$RnCpOyV!VCc1D^XscgXnc68H%DDniiX&0A+YXpbVth
zO*>qu|I<dEcoUisMVEE80hL_%oDyqRtR8gvx#Vg}gKXzd#bnU-UBi4-hBMG~(w+rX
z{&6odKUlBhvXFO8Y?{xh$CIY+ML|_@mEF*cU!ZgwBw{E?5bsg4B(nBll|(XNWr}gr
zCAXcezg|i*jug%E+7=|*l|X8c!ShY>hU{k(gADUZv$FpY;QMC*>Lh7l@KGR@xo=SS
zU+TB;5l^GdCk0@byI;S&!iaX=s2Yv@BW)fm%6mH^EN5t8J0koJTX&(sGNdDr&6YMS
zB-L@YU|&_1F9PtEDz>t|c)~##15wRHhDxw|wGF{6Mc;kfFA5V6WwlgAbzj!KYuOeS
zwh@vlN7${+(u}c*st|1FpPEWy5&P8%;Rp^<BU@;j=oAP+<TSP>(~&oV#}X1YB09&{
z#}^30_Pz-+LuKly8A^3pblG4tdmUQexF;1Xyw0KbqJrcQwsNI$uap{+Y{Q2p7TVxa
zm|D&4V*8T6+~(a*PJZ2ydgzm`9=dmbSQ1{apO%NH1_BQCEuMz1hB$%#0?G_h=X?a5
zz#9huBcbi|90QoYO=F501SFV}Bkyr82Y~WWp!Ph$Kha!Bryv9|1A_{#-5r|Fz+pG>
zppm*!B9(r+o;bK9JkNlS58mmo^zFJ=z+!I_QaK?N`rPSIJY~u=x($FaYQuVeGoHWx
z+gejm4fUT>QZLQGF>;m@m#6*o^f=rtTERMxKs%#}V!9ZF=1_h#0sp)bU*n3iEt^le
z!24)lVPYncS@nZd2o)(Y*rF=yrzrvb3odF4K*Q|%bYV)Dxo^k&Yuph7v-B6u+~e4_
zq0zh2celToh(Xc*H0df47nMEyQWW&cus0fj!3SudWuT+ebonla>t)zSrh0zxs}Y|h
z!1Bt@Hj-8MCGvfh*}Wh&5I)qpS?78A8`ZO1WdrXev?+n3(7gI|66Mvb!q3)yDo;?K
z_n9OZ5~H^Oxy@43u-DpJEKk}N0ylcWGcqn*VCEo0LyV62bA%z*4+w@at_k6#`yE+6
zLU|4o(gDiA^@;NJ02&`yS$g6yKP!ST-cSyHSohXbDyqkmTW>@RsL_()xmnj6%+i89
zZ7b4$e%&b)Rutj6ikVIU%e7=$FYnk$&bAi>Yp7PU@uPh;OFh-T$%~xGj{w}M0>fEh
zSbd~r@vX&Lcy-8N;tHsh?KHyjm`DL-zo>FoO$5dj&%Q$_92CF`J`<QJ3$4g&>N<ip
zVc_JL3Ds7d($|K_@Kx4{ai|!5gtn!`;O6e$E2Cq=a#qUd<LOMjQ<4it)VHA5>rnbp
z%!$Hv+>YVYTa`z<ql^F#7oJg7OxlcEZ0egjEv%uU^ZV@|_d}0ehgHFu7XjqLyn4Qz
zDXDd6n7ax~hmc{rcxu2RURIqDKp^JhU*njb(<0$#BuTIFN(eOB6WC}fyi7K5jV7jH
zG!f!bg6$QG-~%}k^F#M0k$PoArlGJcl%7YS2rku=4pC2isBExOAt5uym@NXptO@lM
zk_r-pqCCqnl&c8+rid6-^dKSYPgaM65=jGjO~7@0*s8TvDYxAUAItO%42eHnp?;9Y
z?sO(fw2joCgY)Ia99(FK)~>tl0&Mo8kREX~e@L|tcWnLOSJcewai!Q>LRg;-EjN7o
zxBZ_WaLQp}^sV*hQ$wR>W?1_YQr0noQh?{trfec20coB6n0g~DMNFZ*Withpm}n2e
zNr?MCzQKKA(v4v<tWzoB?MI9b#;ZyD%Ne6Y;`m_1x%wZ<vs-%(X*pVGm>L?8+-=9y
zt=Ek5@+lDj{5JfRmKFDpx1)j`6Oj5w9-&&>(obm&RouzzSq|B2vUapUig6`9^EIVG
z%XirIc%Ha4s`)C2^9HffKcGcZgabG!0V0wpIT||HD$UYYvL3r%`>nP}-v+_Zv|ZMe
zF*hBxx}kS8n93<ph+-QdQGtVd8}jd&YlT=>-0Fe0MbYl`Z;f=ts9Fi}0?%8AeTGTZ
ziOpN`Nbd>rgPEIus`DDDH2z8X4R){o5<*)!n?Rwr<2@;o35j)~vxpvUxl*~lzfD;!
z{F3JfB6Lh>?XIE}w@VLprl@T13E}P4T%ikYSjfXE3YM7k*+MmWQu|^3S|s(y<vp#d
z#!5LIsj<ZBJ!HE23>Tmmw`g87p_J3M<CMBK1N5Gnx(5H$PlqKLt4mog7=(-@zYFIA
z5Q~W!?L&W6<QS#GGi(8*_*Z~9g+H%3UahaSoHFHS9!-8ubBStSYiUUl&X)GrHyrcP
zEr3fAr~FL;2j*+!eClbacpvp8C9*3evilB@w;A_!$jmw|d5qCD?SHQf5bkpurk5c}
z?EP9voLCvHY&|3-+gGGbiM&OLyh91!Amx%n1-|l&^C<kf=t;W~d`em7nd2~}$y!cW
z`PO(?wL+X-M6|D{fD)5Cr)9JLHDxRsCGy#ZM&zd$CiWPBvl10oH$-j0ZB2-={?znl
zGC5dGbwzwej})9(9#>MPa49S6Qo{@;|Kb4vap~lJnzhcVw2R{N=pe3-8}a+(Cc|B|
z`&s~m)ST_9=q5~wWLpi6bD|e>Y3du%^2F0B;jxp0#4YqFM1K0V%M>f|DqVgP#AdV?
z5Qn|Z)J0(bm!DEyH2cWw>5kG0th$M5@lky^u0nS4HZA*2SFaRK?o{h#4P_Hywv=a%
z)Xzl6u|V!vgq?xU!!%QdVXfy8R_?#fZPvD9L<y5Co5cF&^mEPm<83tpn1`c9l`bVe
z{MCwQZnDizY}p5!v|LQ8e|M>W&{}v5lC0Gbo2elMBWIrjwKY~S2C2)DIf=rikzMc&
zdn`M^)me}sy6F*BSnT`=rcO{`O-?$Pz5L7=^SEjC2)Y@Xt4MG`pi4dT08uM|5)?o_
zUu!!kF3zFJz<G{Z`oUJzY1@tb(*Q)YfOSnOJAe%66Lm0#<Nwnqazv2AWPG6^5GAX%
zLnT0_OI()w(N`OVPiY8@ITA+;Qb2J|Je3%)EQ;-PC1KmbZjM_(2<3BsxR`qPA~{;r
zdFb&s<=qv|3R6mJP{)_y&`f8!@G~TjzM@*QS-Z<m^r51T1>)?-X7b@Wi<I`mXBE_k
zEC=jmn+cav$vrk4NDU_Y83S^1wTrHMamsech{#VlKwIEkR7(?JRvcrgq==qMc=mBR
zWbZ3Qk9(llC)>^kkAiHQW7yYHm~fHNNoN{qlAUA#$^sK1Y4xo5hyFQ@#4{}A&@Zpp
zn?mE<Vh*tO%su40CRuL2g&^v~ta8J*(PYOP*;AbZjf9P-zHsBgIns9m?Y^+h10QZx
zYB9uTTnnP#F7$<X_a0hMc85yLP|*`ux4)o-r%e3{zwV81_X&*r-5#jg<Td}1cM&m4
z12RfdriV75*7V2`it*f~Dc?qr`oQ?Yn+i?MV#xBnuqp@veG|bH0m<KgDHB3l+}l1;
zNPHS(z2kMQ&R*>*DG!4DVJ}YucKe-YHp{80NT6j?_F03X|1?mM_?+EaJHGo_quXap
z{-wr-?XDefv0Pi?ClCU)yLWAXa&0r3(k6!f&!-D!yWj?)rtJyB>eT&6`Ql#N`|oj@
zxENudu4-Z%LKTG_r^MKU4J(rPXq<B=E}@(+$|I?m(^AGx-wbIfT;y=XVwC7VG&Htc
z0ZfOrWoEoq1y4bg=GFQ_s8Sbs97flH1+sMz1=4OBDjb(Z)@~R|B?^Z(#aG01jzHP`
zJ1a7o5;%3bi6uBBbo!0QmIxN*zf<^BkbO)-7%P-AS4E#46wvJ`CN7&fWTUo;&?LK_
zF;W2IwOt&SCB!dQK$37DLt4tP{)lDhR8W|g(M10`t%+^qyxsmKxxBgGQLf~U=|DYQ
zxZns8>ydE3n><zLGa8;Y{uS~5<Q-GU40A$<8tc$8Y6WNIh2V_1sJG71kqRfcGIQUO
z^f-rjsz|A1_r3Et_pLM}$@mxR!;^jt*{f%B0MMct31A;;){v;@+65?*59rpBq=A5X
zOgE5|6~nn<d5_foirZviClS++43LAJ-ozC%b9_2Za15!K*nUZP>|Nvdx&T40r%F<2
zhWn#By#0c2l~$8RD>|a%T6tJxMwVkg|KT{Cdj0gy$0pr)g3SFMpYolAMs&<WWm#lt
z?fcbt$c&`kCbjiy#*Q{mPrw5SeF=De99SQAXdxN?JKM8d+lA19LOE~q7=X~Ndwx?R
zH|TKgNz%7e?;rY<XWUojU$?{a$T?>A9KHwFMb#<_MViSw^3Ce_y$$UQNM#2yMRmfV
zInru1gKF&aM~cBi)r&A==f1CL3XUGW+K$w;&h1~Cg0U(|;oB^0fuw8`;Mh3-Zj}Dg
z^OEF+sPk+eGklAM?S6p*5smu#*AiI`-gKbQ>@N4gT^5p*L{gVRVfwt-VFB%5x{_3i
zouan>saRkk2!)3{<{Q-*nlplKc}<R0!RRkZsfQ`cPV)v?m$(jUrnJbfZIv7)-yk)K
zfpZ$*?5o{^R1ld1L+%b>NnZynIH7Q#|E*q&<V;Y(^wpouT-7{sRi5eWV6ULeWv+$j
zR0p=fIwr4+=l!&mn6c>}lJzwVeL+^f$-eb7mWeVNn(D<NhGR%or*bw=gaMt){Yfw@
z=2gGsU#Sxwh!{rjx771cmyw~mv#06!Uzjl`_dq9>i%o>?AACX$2auzcCh+Jn0hRge
z4OJ3qlOMr|<cfrIigTA^LwHy#EGN<Wg4xd*myo<)(Il%qJ^B5tS(!AziP#!bRMu2H
zbl~0iv1*=JKwCFoX%3#FMl%|;YH<J4c}Hepz7`hlE#4^!Bdwl4+c9fNl<LD6!%++0
zdf2!p;k`dmq@1{%d000VqshN+#A(Cn1?Yt)?I)miQ5Z!b{VevcKEQK*m47QA3@DEZ
zKcG1yZIzUXYqRmco?=BfC1XV7rb9@Z5aLoO|Hf%^wtlP3QZC(Hz!{vm$k-<oRCQb;
zIJ6GJZt9O~3>59`w5lg2Ge4W_clFQlKhXAQ%2O9Wm~kL)8i{l7(98NDA!$bw9Fp^!
zJMABu#W!f6`KBz3MF;3=0tr9{KQ9|MAqT~&%}_KS_($W4>^NB6t+ig}rEW>wwnc<I
z%_Uv8sqZ%D%`r#5ja4plBS2M-^xPhJ(g28X-E@a<o>P$-QefE1l3C{pU{Xm6z&dPe
zGsX>>R0Hq3Y)$xpMd&fr0K88F>j;f!He!+qa8lCFZdeCEIc<LfTrcz_fehFUkB#FA
zz+u;8QUQ`3JPop`XB#to?*l;5#@no8V5u4Wo~1Sl+$Bo(rzRJ4q&JiQYukf3I3&5W
zmBJm6`2g|rG^>e0w`-V2?@)n}Kz=~KKlV4X?c3+a&xJl;ZV^2a5ma%4pB0A|NCmGU
zGo6CKNWD5h_V7QC{*Tl_{8Bwt>S{oDv5_EBUy$_g+o=tI6QK0T;*U-bG!=!~S5KeU
zObHf{doi$|+6k&8CIWyUA8Qx^28|ZLC^_oye!8dv%qI6T&nslr{DV)ZZs$aU>ziC?
z^B>bzj~xvQXq=gOx<3JWz(Yfp_dI68Tj>6Ul)~_LP;v<#KXgoN;a+Yik-GO&`cHtt
zsh$0a=Okn&{PU1lUE@T_D6OSU*duG*P~R3ng7X@KQVZeEo*s{$WS%axpW+|4ydFO-
zoF>#sIo1Cm7%j2#*vk^DvkdcsC-#o^-U3?CO}<8*Sd(>a4+mq<v|@n24cOERw$k-6
zV1DbfQBJ5(Ey{q0su>Ij<(WgxKms)a)O~=wiF6LsLDGk~ujcK6%Kyi+g}1Epa;--$
z!*03HxbyCyj^1DNAI>5|8mi~9ACyK&0~wKKUHHcEFj3|i9|>?o?5A~uuwg?kAjNs*
zNl8*~U;lUJ0kjo+wz_#AZEml-dDBV(zSB9-$)fkT>DQKdy0m&qAiT&I%>nAz<O^4=
zS8w&Qyc`DD`<Yhn&xU28T#DOXS<mkjN34s^`W4^VfXZ-a0CIl<(2(JK2mc7QqAW`2
zyrC8bT?h7P$nzICP$<-ZRtVlP%BPdIJzYmViKX9;=Z#jSKkT3HX+Los02$NHTPtV5
zMu0NS`rS=z@N?@|n}{2rBoeK4-SG~`*7vT9JW=C2%cAodc#ROCVaJ;6{I~WEa3az`
zC>>CSbCysTqs{e7P+r^9T^oG`h+;fOL=+}+>2|Vi6vzcs5kla(xA}L-h;GC}K38Zk
zyw%e?cVrxZvf2C;G@#+bryU7u`PfP;Cyl_g%GaNj9|=(Ydq81~NXUR*#8pn)by%BN
znd?N+D52ZF{5)!cUq8Q>&?h^kVnj<OgfxI@Pe)(_LEjOkca{I`)Sv_PfU`3Ho$Ilf
zhxp$^;-@W}4nko)%G?&HTlO@MB(8v&B@EUkJfNozs2(x`>>g8&69{Hl(Hk>FT<GS3
zo66~!m!?qu&H)g2{@=c{OIiwH(pWh%kNd!62VnTS%mT&zF}KCDv{l=`Z69rFiC`VM
zDsrg5cV8Akw?U|RJzPjKi~tnDyrq)Qc65MK{04AM|KC0om<U|r%M<c?{B|vOL>u&a
zd!Zk^u!-89E*t=C9E0wmI0wiWT&XUb$<XwR+J_#%>%67ZD?`K}B>~!Q&VSfdq=03G
zBdBN)J}(UHGrsa(1A*%C@!j?-pr%eCaW-R=o(gm!g3gV+E{z_LF^t?uGa`8aa!j@$
zD)*O%eIs7#6Y?}5%ID1ibTM*E(cuq9`$8&!v^xGbLUeWrVuz%W8XYxd>QT*j+snPq
z0-tsl6KgG*H4#9w=c0KWXHD|3S$&|Wa3@68U5^CY(xFH_qPZrox!&p?*q~t1xa^Dk
zyj#S6CnqtNbAAP-^C6?^V}C)aL+f0<=b!ONd=0i}bOwFe6KGOMJ@dIg39#r=r7A}r
zz^g=v-ak{!iP3rxq(YOfACA1zuP>M~FADm8iY?UDchZLPZ6~2-cG@ge(X`VD`xW9U
z{nXW}XuTP<(kidD_@FTna&+rP*~eKl>`!?=%jWjZxz@%}ilE)O!8~y*6^Hsao@cG&
z`2pF4&CA?lrFT4!>7CmEJ^b8Ce)cz8NC;44Vk2-EmnR>BB&b^^^ITR$mF%R~&{D=U
zAl_uA4(c`SM(th?b?gJ~S{g|7Vn=x&*LZW?fJ*KTut`R}QZt_?q}n>d)gPRCw#UWW
z`dx}A4;pxF{#T{JWo4AdX7lZZ8tSLt6#(6$*?uElIhW|^y4$2`kVW$Du*qr`=o2Da
z!rA-2iB<buC4h_H%KrQPP~3*sO`?}Lv}ZIL1oRi6-mOfh@7oRFS+#D*cpt_{KQbAC
z?4edq#|Dr11%J{yu;SpgRpU->CHJCo>uJ328L)$6_<TH%hxwp(&hnKyQqA6V=^ss`
zpYUsM)<Uan^nuR94$xyYYTPX>LUL*B#8J=oL*byX0LI9%_`s?$Y_5@##vBf$eM_M6
z1E{w@RW(XgjN69p&fx|qlhV*G(b$BtcIqO~QOYgzI55lcnk)F@>;k2|47z_YJ1FEW
zT;mL5xQF^%6oA`SZ8}j{4X_Qe*P?iN7J&T=GFw0GWNrZsR-fm*e@tn+NjcvZ<>goa
zHrr6!-4~<R0_IV+P;G~#mlYv$4+SxR%aTI_WXq3GO*<FmdHyR)C271Q3*UL!Cj{dP
z;J{}H35v>H|FrHv6sB+oc1omE5Q&8XB~RjdOca2Aqn!sO;UId7h30W>Pzs9)olXN}
zKVp*mxur)(*Soo;w6v3OqwFoNEq1e>kl(UuGs?!p)*|H|K(|B!+LsxFKAe5~3;2)J
zWl*9(82e{|h~tMyr)Dyb6=ZD+72eaBD4L>x%=tH#f(#-*Nk|ug7#tUWAj-9h{%VU8
zpm1)~EeAjggQx0ff}ul209{BxX<$~u9BP6PK+-q3@3c?9CuWby^0rhn>XTog*^O%h
zF+I<sb_N8nY1^puDf*G@;%&Zh{t^;>N>x`5R<rAiO*_v%2R3JYf%4|0ppEjz;|^qG
z@&25bjAjGl=}v%YGab!u&^b}6hYStcxJCiKG_$cc_49ue@yc;vRf)|AZ~q3ds0JIe
z%oxOO0x{F-wD3r6>dhS)vbQvd{0Dw!Q!kQrta929sG>y}8Sv-!_KC(S$EGyT&hG;e
zP+DpI2k%WRugRfZ`rQ}R6>8wx;&^vWr36WWjfpl;$_NXT^jigN7GiaYFYHB<{Y9u#
zLRX2lxNf{7t(2A0T<3mPY{XsEE8xQ3_MH)bb7AGp@H3Cnb9@l6g(FZ|K1;F&_4S{b
z?jvZLJpLQV$EJMGpdHxRDlXCsQ5cb#)x4YYJl+K^fN^I=2&ASgGc?UN0MAGLIG}=Y
z2cV`1RRLj)#l@kvSBX~|#FPT1h-~~&{ll&v@CCYKCdFKJ%pfdkbA#T?Wv6VBSz#<+
zR(}r4-D46>qOT5;-vX^I1CZzMy5b#Eck<SNG&`DMEG?+d9nS9TYoy@FK2T-_6Aq|{
zu(SdFd5O8u+oUi)5PAx}c2>i~fz^`1YVH7I@=xQ+3-MnTf5zbhUEt$Dm>7dlk_yum
z8;|6}MVR)zUaa^iNZld9#)$5{jR6k=gG<=I>|AJP3qgbx*E%szyI=pr>QYm1)6{m?
z1S-odCih1ZL!fr)tZD!@jZLp(7eMzW%sc?Gzs5B83bmuw)oty=(zH3yG73n_7<WK&
z_rJ!cZJ9AGDhWOHOL87G)B1{)p^EXwjf}D3^RbVSBQKS~(CuE?()z^jO(+;qV{1a0
zNl^a(*m~=zsJ=IDTM#6rQyOFdX^<{KN*r3chmtPoF6ka*kd&0}l5QAMy1S&qp^^9S
z{XNfG&mZsquEp}4eP-`{-=FKcFVbG}&t)^KLM^t^y)e`7X7PP$C!Zh*PayO(eLrpV
zppjHY`~9%Jjl}cC$JeU$pLBAK>G@ewMn`mk58FS+igc3}gol%&@BAUj04>c{MpM4K
zrPW+DKBMG8onj)cMnp<gyk=UOj*=?sEh*B`Rs4G#mLA=DO>1YziJhnLMR4U$*e|Sd
z5)uA|&$&FxzQD?36&zE`0**ibe^EdjND!^mz>vj}jMF<oGu}&jJmHnp(k=C^Y8$zv
zU`VpYg#W{=KRc$;Zrz!;9SZuF@QgU0Ee7*mvKgjK4~WnV3HmbX-Qm6}iu^Yg%?q6Z
zl|}|&&ABaXJvFOAa%T?50DVDn$lRYHs9yT?D<Lrdx}~%jzYAX~V;14R0T=yv_TF$u
zkP3D8>6{XJALvgZag;fcG;)S@ldDuZPS3S4JAxVcbc<zi6~=)`-915wt6OW)q3P43
zniYly1C?&gXYKqco8O`0_dr!T?`DfhLPzuQF6GO1Mkw(C@i#@GzK>PW!&Nae+JbfX
zza8*NO6hA3G+dtAXAsJnnnd7q09JeouKhokQD7G%G6=eSY7OM*o6+3ou!+pD(}A3g
zYv|e3U^>cpv}k2Hc;P<!yPJ<DYIm7l4iD7)aVD8HWV0Q&k*wv+nk;Oz^*~A$ghqE@
zy#c;fZ-<J3hBaT-;3D3ox5_bD<Zev0`A>P1KJWJ%Q!zwAaj{3{Q}<^530<?#i>)7m
zgl3QGLQYz*ILa{|GF8W1H{N1vmRqeBg2n5Uv6JL{|Ec-0GNGc~BsJKxS=?D>`MPOE
zQJ;K;t-Z1s=ISGJh-U20<2UTy*!*?r>pDz^Hv04s`5NPbVedBJVQRM6!`yhk>N&?I
z!<@5sAR3*aAth23m(%kc_ta4RNlzm?b;MG(Xu*054+9}Wrw8TgT|3nYICfi`$eeN;
zvoq$!`r-&<A!NKSkGI{gU8L&B{fK!N2qK}Dtyci+&7*V!{=ZuyHZriRnVp_u{oJ!=
z93OSs?$3vse_7yBqk}w^Z#J?N5{4vSMXBuRH+(`RiLAF&#Y;}hnNTFN-d#7GRSj^?
z9)nDGR(C&~7kU2maucb@|ChA}R@w1arVUw^xbK!A0{>c9(aLS|#)Fq}MWu|N7-U>f
zUKKNukhVQggIL16ay*(y``~P2Iv?Qh)q4zO52;Qa%4eW~l@bbIt?B{ihp)91ZtMTo
z1<)MkCA1mM>-FX>ad2eKB^raaRc_{Q&Q2|ZMokJ5&JfqH=hmy0$DqGgLoO_%p8hAl
zgewed{KmgsjDkj<Xo_;vz^ei?=Ry<n-XL=TRmp$6eA8|@+#qKC+;vGspt8*4u8hs`
zc!t_5xzHCtIQd_>Jcb<zhGXO%{RpGh)yJtNKQo@Pv24956Gxbu=3ia|tZvhkOY@iV
z14<f1ujFR#-NI&0y&3iS?@a6VBQ~wO`N>m%UtE;fs@MbKt9RUxU&#iKgzAyR<ny#>
zoouu9XU<s%YZXR&Hsa+@J}S=sWrVIK;J-&J_U5mkM!NvkA&gHx26O}af=#yw{yT+N
zO9cef#;>J-{G>Wq0o1BG2*SrJ_m&e`wopYCHNK@=@&dSawxRQD8lW&8fZhqtXyT!G
z*WOBzljcRAR9>{IwF<}?Ih@x!hGKg@<@BHwtML1$I2-fI!}*`&M+gm5W7%kA`m;dC
ze@|>K=;YaNVi9wOEeu)Ns(M}XI`YABESn%+q|}^4Ay2tDkX)IAA+YUV)jj)(&?UA|
zsP^*erdUzg|8W1zJS%j|fSF?f%jA{4ek?O?t7ESI*3U<oA36Y}pwL3(`2}=!*AWCc
z5U<iWw`Uo70D=Qj<F3sWJGl}<?ZUGosg{{KO9j*yuzo0Q_3e_X9F>N?0{Two@A*e-
zaR9@$UDyQxHdP{$t&lYj$3Nd{6}Jj$%RP59H~4ciTy3<t%C)Ud`eIs=?IS$9zi$I;
zN>dqmptTP<Qgh{~z;iNfWoqk)F{(G(8`XqJgh_6nGi1CHuD&aR%KbZHV4;1w8+iRQ
z<|2oc?^A7}Z+58dig>3l)fO2bK^jJOaYue7{&eH<WDA>5OyI;fk&Dvy<%856q1U20
zA=}Rq6}^RlW@cpbb90}CyE1p~7ZIE$wnQ}hPYp$7dXn<KHzZ2vGEQcH*&k#LZ*>i9
z$J?96(CuwT1AW^)bym)usGK$LX<TA&WkDTFd{~M=4Th-m_zHk1sZs?0S6PgdHKyx9
z>Fnxp(14HPq7X-tuLlHd`-t2e37UU@XW1)bEDL|%@Y<nEf5o{}8$+d3M(YYmh6
z`FhVsX(X!Sy|yRNLo^5>c4BAd4IxKnEqS@^2c)nXEJE{FL}FIupPyh0^q@HR-yswn
zLsJhf{Lal8vsC6IQdS*_D!Copbu=+qjP~WHT+8`Gb_^p<e^Q<CT8x;WDvCMrsVavk
z!p5Y6P`f%fD7gx<RbUAY%ZUfmNUN@_Wt!$JUCTrkM>=UCXz7jgT*0zj7$n|6Nc-}W
zqo~}nY9))ZXWDWOCD!O*m|llG3Ijp!Be<lH+~r7dK0zXAw`$)+7&prvBXc1oL$$Q!
z7-)3LkS8izPEfN5L9oI<IGqmyXO6VrYANz!Ar=j{331*C3Pf;tBXCE=WZCw*u7GR4
z($9fXNbXa4ZBOfzUjkp+!sp=!BX1*S?jP;*Me?>(rfWlnm(@J6LMy3L&1ZZLE;`6O
zQOhnCnm?)3N#G7g4No_VC<Hkc3kOXp+MB@NsTHerWxIB1L$JuxpGP}c|Ndmbap?MV
zO&#ip_P&k1r3|Km1TMZJ6(GSI@b%5r3o2>qK@K@c;J!@}XW*P6T56c$^Lc6k^0YVi
zyDY7;rVjP$LVj2bPZ~{sSozv=xTc=<`{18R98jNQQ_D3V?dkdK(L4QL4X6|yg+u`i
zyO#jY*l*M9e_7Fi&;vDJVNjv`a&wN}kc@ZC&`{Ic!T<M*@k*TwogMltXyfVHbRWtR
z&^mmuESLR*mY$|`2G3Zu`CHV41!DLznCK;PqG^Wtv1WO`u&hdMJbeIjXMIbJ+7!Dd
zZpppo6wB;W>1-BHC}UQu1*K1P8>$X}!_7F4X<Ei`l&2NMQkp7GxWEG2S0L8``a@3N
zleaFtJxT55moS+-N)?#}hz9Kk5d-u=zr27_f-mOrGK#gJO)%7a>!nPHN}J@$NHBef
z;K3DTNv%AFU<oc}-0n5r<#wSFop3yLEX(R!&=HbTc1w5!aFZ<8=J{*QM3l984(ADa
zIjZ@|VDadI(v;X__SNQ#-rtW4482LjsujB(pB@<A#VfKg6bCUFJ$94-DX?2~%1%+6
z!My^i)jw)ZkE9TMh@6Td)}Aakm58sSg(dE$e*na&VuE_a(seXGr79&f#Lot2Ld!02
zFgegwgL7lTCI00Acis;t{D^f=*QI>psOLSm9lQv=-2!fMo4>dx<q~wyhk0*hEYEQk
zglsr^;=L-Dt4&vo{MDFtuuZ=Ug)v5W(8^<e^DaL~ju$!Rok;~)xZQNMu?^&^%aVus
z*@tdwTnI!vPqrB-1O2~#0U&@utF2Ot{-F7W85{k1kV3<jS682tPG{vSJ3i{Bapg*P
zWsP&uq2jjb6u+JjZ)R#DyWew1l1j8?H_Q}2BF5jV<PBhS6xvR{!F$7t<PIQC9CLKo
z(AE_C12OGtya9Z|dKbM6gJ;H*tQscsd?Dtb!&3Elxd-4f=t=E!Ah2Db3mzumUZ+B@
zgDMsD6Pb_@EfeXSVDci@)ni-wN{YCa>lI&Jf32C84XX9pmjh4L!xdiO%X?Ze-Kdu0
z*)4n})<!WYo4l->Je7zx{F_6Xd}%&pO!5*cv3&$y8M!+KdSIc<QY5M*YrFalTl!)Z
zN~vi!acrPld_pKWB3nF~H(Yo#spa)p^WTY&Q91dP(_1Zi<hN8nYlB7@iwQK9a`{xq
zO-}Y|wym5x;G5lBMV>3ZfT~%=0X5g{?|v04Hb#j8tVO_$sWN^gI2>b4#*ZNkzcIsZ
z$-#F)zqc+p+1Rf6K~?v@1`bCI`#21nhUTz0?9S-EIq9I^GQsuMrsIyUsI(@0@vp*{
zj6WR9l?$7j?(s6c+ZkIm<P&g}nCbfK2Yj@xu7r&LOOg%ZX)|+6G8cKbs}Db2kgnd^
zFiz>G1#@C{edha4tNE~w!4wj0kF10m#`C_2cI__%Xv+jOsxE}bY5OxP>G_3d(1>}8
zfhLq85C?-lR?#Mh*4u$qL-WfG>y<hzL41IItLrJRt;UIHZ7ozL##}r(y#CoE-P*n*
z0T5HY(14-8Fez0n!2^*Q2SHVs-K?&bdFnr#UE<YSgA*%%5dk*tA4lFe2Q-+m(_N##
zCQ0#aH7QwUHW<Ft-(6m$oco*CEhse2W;Ts_^xHqIa%18=etDR0bON+0Wq-PfH<d(N
zfL)R(*#vl;YD`b{kRddr4VaJ5y>bMUbo-Fq!Sf@_)lY3N|B$S!*-=A)g&(6+mK$*q
zGQd3uZFK(0w3$yPwDYRua%1HS1?PH!cyxli7&kcHqE+*-uE8WcV&SrJlCC`gXGg@`
zhN(Sa^!9Qm&A||zVEb>n)yF>mZ(ak>?b+NotO;KpoeRf?uN9cnAtx}bBwYn5T22(o
z(Sa7Ox`Fg7KmFt*Aj6n6l2h%%U{GPgc&^<b>9iDg6>XBWDZIVUXB^jTn_xt%g0GFv
z{bQtBTJZj~JIaXTN9>=sbQOB^KYf7ezCAUNOd*l%edAYW7N0`Qp<rC$$IHgOF7y<_
zjY@u&znhn&wEsrb69*|tNJu7&<)<^)hJLDxY`&vm7S1kN#p{Wtt?GupjnY-0h)IJu
zZcaNrm{S0OVE(g$)cQ<bEW?!zzvfqBH8MJssrJyqv1VOuCd(<l!W=ns6ay6V(s5^V
zM@5s9yiw>QyM~VXtL$Bu<}FC>1Z<ff!(VKR@#;PPRqLx&|M#$w+?ctIlCK(6=9H^N
z)j4p7`w`sOHbXTmw#ZF0c223u4d6sf@{xvcdkJtW@MK!?gRX#>Tx5A8D_d4i#2?aY
zH3OH;@ire9ZXjV%S7;J}v03^qp^wlsVl~4hdUZ$gn|di;&&N|e@3i_n@ZCwfKg%vC
z%~q|n8fxV03bMO@A~a_;&#ScXOs-x4r>(W~e?YhIynNN8843N<k1k_}{0Lg!L9-2>
zBJ#?IRl9L}iPbugVrf&L|2X2vd-L&~F&#Qk0IY<YBu4~wT4tWEq}E!MG$1RgK`}0q
zxyF^3-s9EjE`OwT8Q~%!yjADRV#H%n7hu#+wWVPa574k^D=tr)34sN0A8LC%fjTZB
z`sQIYl6qfU5Y<4zkw~;9El!y9Z%tbKp`Ymt@E)!cwUEP$oH<c@osPnYqlhIj!(q&)
zlbXGm?{JE+`h}BByYUF?l;ZJ&%n2@=n?9zlHj;cE(!4Nwyq&#b^S0ChUPZplJs~DV
zPUvE?+u}<mZTjchfsKaIVG71wn%OOe$K<z-qG61nHI)-?j_EkR%~UPmY0cyJSaS>D
zR)^Tg0!P5Va$qVpD2&U{+H^oCFB5p&QM-Hn=wfD>ew>b+4a`S<H=|qETs<KN+PCP}
zpNC}y_Zl*edR-T~aXbYuUSl5oQ26X7?p@^Ht$Y*BxwL%v1|`>Cuu67Iw_uF^F(&5L
zxq_%in$zst?io-x{!}4s)esk0IGfFHt@_2obFgNyhSJ6Fdr~#U{Y!-qCc+kn_KXAm
zpD_o%TZ%uq=(NkszxG|Z5fsfNyYn)8XIkPG)67O8+iP5Ww<%}20uv;CXt|R*4^ghD
z7dt$2x@-z<87f_F#)C+;{^{=DhT#Lz(@QS%lb!w&Y8yPRS+&g4pqXDl_n3g0TC$@4
zM5mj8kp5bIK!+4Pdiy5h=2p?s?3FTeo$t@KInU+VDVBTYP86|{za?4~nG9QRMADgu
z-Eh|0$52+_;@DF$!VpdU{?*YdbtcwS7#?rX#)niq{W2h(WO)HAL!J(E`#M+bq%bUS
zjQF#ygtQARYge;q-mL5E_KdCV)P>MCMxNH`d;6m{-&M7TiX*3OE(<OfqdM#9^y;oT
zZ&mFc%y&HWJGFLKTKA>W41cE7EsH&)rNFfpD*eM_+nngdJbIyWIDgbHl8u6Zyp8CU
zKcXJ^zb}rHHodeKNS2<USr~Nf6P6@JY@&&-Aw6$+{4xf-%6dAL*9ZKY!92`KcZ-7#
zNArR)ur;r6vfpEc?4C@`_7SbY;|<vb<v!NMC+`dD&;wP7-hMLLr)=04YYPVcPSyP*
zaCDGzjV?R6q+9y_`p&h>Bq}e7A{%VBoS9$at?6w{#(&b=sMG|_c-F`CeY~GDiNHP}
zH{2a(YLhKRI3S@z7LC&JohLt%{eQ2P`1hh__`a=*2$v3z*e>Bocd@B1D%@$a!1wd&
zZM4_^OdlGROBO$Wxwycwd-~2ji4=J1*JL3Et){rm>x)r$O$m8a@jDb5e0XZ>Q1Hf1
zx*ZEDrAw(Eeap-D-~r{-(miZ?P-y_>XejLTzF6DM`=1(3O{DC2eR02~?);)D*3v!b
zvzCN$P&x6than+^aqkeBZS)`hOW}I82#gbs@(vtN1VNxhEKhl@Q?a1BcG*|!F-^QW
zA-gG0WRWa{h5^y>LhP0hCO6&cZIS1-+b~)2(g5>=%9`3my2~WY7@qT<4Yw!cW?rP$
z>9W1e7K}fv&i3Np{_W>7LK1Dr(x=4i3xn2npPi3jSc=`r9=Yb~%?yp_R)Tpt&&i(p
zMW?m|wpQ%u!tJI5j3Wf**xojvASq$NEjjN%NLYTRmyS~IgD$4&v!L&-|6p45?=<W3
zEpT3NxR?eSBF$eZ|Hy$WgG`sJ0OXsPT!kX59)+|js;8T){Z+g0UBHzr_4nN2_8po<
z(u389TLob2(p*epSH>eG?9x)9_%|K$jE#x+W6)+N8jU_!(6uiV6g*icWK7w2*6|!y
zKk4zQpn=?bno;A6kD6;#X!KcV$J;-H76yaFmO<>}8{`<`uesdquoy0Um3DnIS&bCR
zrg2z9eC%KPC3$4-Qg}=U55_+;{m3|Gqx>~v`>&qo(nl6T(fEz*qc1qu%f}fA)BT9k
zLi<LP&$G92ep|Ule~fNZFEVKQebK3Xx>4WJ#}jgZG{$OTJ)7^b7(vFiKJ>M~mBbCj
zt>3rS7n#0SQcQ!OL(uUF+OwG=CB?bAR4<UF18IM#<e()}HH7B6`21~F1XBXdGT*G#
zK0B6wSN}?>V7-x(l6%vmtzlJN`jV%_tfuX3OS(llZ}sdnnacCQi#)0sMOjMVmXuA_
zWLN+i>eq8C>{*Kuy2>)aZdGG34ivYf%8AI&ruo6ooL7#yHD}k+$Lt5sRKp|eJ(-j0
z%lye%@n5?gXA3IkLey6!Q)BKl4eHd7c`IEgIoeoGcBEa9OF(uy`e=4bLZ+eIo5Pe@
zU9HOt)u!iZoZKcpW)DBb*-PgZR33k~Zw*FT#dm3}Ha$@FNK?c#J(e^81cdWjkfFzV
zXy`D#x}yl!S8H7He@vX=u06=_{4JUU<d2<CDtmgKOf<GY=QjU{e&Z`0D4L?AhHvTZ
zrO}B!m^W_mq01+aeb%ppQ*bPF1Qu=^82InciXQ-3`0cT^y8w;qpGv4+xzfi#mnYO|
z8+e1q@I%5_(nRc06orf|t0KE-*v|Lts$`Jd;(iN&I;LAgx{Q0xe%&tVPOm3L{jjjN
zrYCdfLHFWluYa;={8vW`X|V4a63k>b3g+ZbJ<Tgej{u2plYvs1n^Muw%@N-ML_TxT
zfL5(qUs#B1k|G#iQy@nKm1G=z4DmyR#XGiLym!i`+{4xO!K@Jfpb~bMx9_dXO;x9l
zMF-lisnzQV@+&)9yu`W`zePbsDS|_%6<ViRKs6|Sv$SZ;WA%R-*8VDF@|O07teMlN
zGk~m@B1h}j8Z23PqcFBeN<u~typk=e;wYFfd_!!dXr#Vp9|+QYOALXIGey_clho-B
zU=DfTbkNM~=W7*r?J_Th&a@cs`XPFVyR`W~E{1yH+NnenV50Vtr#|QGuaksFdCrm@
z2_yfy6$Erlvunex0XAXWEm}d7z%j|S!Z0SQ6oYxhjVAu8w%*i(nL@Pb7j3}hP&IZh
zN?RWaJobvqel9aUjda^7&90s$B*cBDFB$=q=Tp35DeA;VQ(&KGGg@k&Z?CICu;5RQ
zt8KCtxergfKh_NJy@&=<2AI8FrTrEk?P@=9Rm*viP~{JkdF@$z035b`lSKO!#zoDX
zobb3iyJE~aQ{{yU*^5X|5Kk(b0liATlh0%Dk47dhU#dsh-WG^&wn+lZLlT|6;^Jv}
zh%3fwdrP`9=Yg->pSXWYm7k^8`_Nn^-N+J#gS~SLyMP~ghiZEXL?H~%rDEj8vMj;q
z--PDhP|$_MTjT^_{cug}39&4qD9L{DK_Q+8Okczr8R%{*6(gvog~H+Yx0`1bUf7zR
zsujRI2W1Ufa8|0zAS7cR)b}pcr@it45Q!{^SWa2ml!!##BR&s~@Lsmm3ss)ff4X#m
z4_rwb&V7xm%`w3++Mhq${}c6m2G3#G4+|-aA*OQJbeecH=`dMFOY@n3XDTy)+6_tb
z+nlTH2@x&~SE2md>(|MNU+`p!iu6XoYP33bSg7Kb(p%+Zbw<|4T(F33+{=l=Tn}ix
z1H@=XefUyz2bgU`Pwi|>$$;!>&RdMu{bxwkz&;~rgT_W1&9KDx<gp_P>ViBcG|y9^
zN{UC8SLIfSC)1p6I=4D`*r600OM$7#ULCkE1UN@>n%uv=8tH;*4aHST(=Y53ss1dh
z4tZDVq)cZ?Cs1a)_-2aR6iY2AO-6#eWaBMxgYa@K@W0mV`)+Lr@48L;I1MCJ>I)=d
zf;{mEiKmZOD=E>qf8fNpm*PbS_epOuUcD^25iv_)SE%=t&uBPk_8ydF0BPTU?(arv
z&l3TfS%YlVw=ut4;q_$w(=nGr$RUhLXK~{61+Pk+d~f!0<2a6)%Nb~}ll}vkw_Oy0
zZKZTbSVvVsCYukY)`*Y(LX36iAtcFArX<7PmH$tU<C$FEp)L}kok3VuuXNR4j(cM!
z8E{7|mudbet(B{dwzIJ~OcSZ3Jkz9qI$4AG$OLL>!y2N894FR@VYOn(_?ocl`W0I|
zGjzeLq9ZiJql3&oL@TM;+fGi-lT3DQZ`dLKo*IoCiPa9uW4Rmf@K4#wLQUz<&%hS_
zTwlU>rNv~N7*R}FKIbY3;DP<1;%?nJ&ZdgkTUFfqM6J-G;VXF!wbG`uE?SW59glp4
zI;U?86yZPK<Uqa9W^Y!nCC5WU{d!ADNJg&|Zij%g{u(lkphnlEet8*vrcJhtU=xl|
zB;`9<6>Xi59ThY0CDM04f(O#DJ2q67zpLBW(r*`M`2oft{U}d;q1Lcz+nz{NGS)J5
z#gd9XtXU5uIPQ7>-7CFDe_H?a6GUdLhy*tGy6#1ITj-i<*upzY2!4Tbs%fLm_p2Di
zR`|GQ`|64i1pMXw4sO4(5YS+=Kj%Ti=o_F{KUG6vA0m}*a#Eup=WlW&i^7;K8$EBf
z)cr}sHSJTv-jK99!1s+90<M($Y1i(%{Py8wxz@za$mm@$Y>b`9@%I<|Ppv>Q)~a}Y
zFoiK3zyNI5cy&}kI!f349ojM`#h=H?`Tkr(Z3r)&g!aWsA{4-6&%WI2fWt9Bl}nUB
z5UXP(K)!Mfxg#>PYe9%S8cW1g*#n=j%gqUF#e%sHkBm$<CIpO>Ga|BCR`xL#hu6+Z
zjv+}HSvPM(CyIq-;7yVr9)4w=2L6TERY}I02hEX%lD+zyzU}6x--DuWy&CNPuotyo
zF!RHq{lIfo<L2k=-!2brh?F0cjCB!xWApraUh(>`Xe8Y%op<$tOVTSQuhvPQSS{=r
z5n{PV_YLiXT*KXF1Jc&aqqbxva7V$0KKG>ocZjN81@W$zL2E(gN!W!CP?2CzpJVKz
zw2b{3_#irYF(Q}zlk8Kh;kI7%CVb{)JO9`6v)^u14x3BmVfNc)H@LvLG>Z=f(z6s8
zLLcl3^6ie~f^>;8I-J8SU#98P^hi6@8Vhkiy40;H4s0b{hYt-Bn=H5ZfcH$L{kH0m
z?jSO`OE=dg_KX6*vpOZ2U1wE))^m9&k?jcoq%hhJj2Mah!~D>h26#Se5lwT-z=6Fh
z{2)M>*vNece7RLXX7Zs-ja!+LaZq`;pocf68go!TvYJ)7CnV5@WD*EZHKt-S|9s&U
z$ow)r7bmeBa^MLfM!JkASmLY=*3|;8;E(dI`RQ<*jhZ(-*yG>SW7&7NTGtza)XT=n
z*nW>S7w@o1;+<lmz8iWn;OF|H-I@ngD*hM7viR1BR|x`{I1yswmUR4b6a`f1=<%Lv
zaxR+L#o2Jwpz(py6xXy(R<TZF0Ku5!tRXRlRPED@+ChSB)Z0<fOkp=KSm=(SZX?zK
zhbGZ3qhZ8IT7)->&c}Le=_Kcru+Am|!ky1mEX*nygZCzAc+Ssl`u~fh3|+dOzrU@7
zFh<ewHGPW~?gxU~vaZLEj?_%t+*cSE5;<bPjbVzZa*iP>e$nQh@kA?0+}jREcKAq%
z4ck8MwrsaF4+!Eb6dx4;B8D!6UHtE&V$L_}@3@hfwKO+MZEMkTBzw7OQ}<9ttEO?j
zY0#Dx9|8pl3du%Q3w&hc#Iu*wsCO$u?h@}iv@=E!jW18#`8v5uG_~$Ak2J@X=MKM>
zoI6Vz*TW0H4VpA0r`cbP?sK0+*H()eF(S0n_O!ZRUmXZQUMvA!>o+|i|CrRo7+n+X
zjA7@B2Yfa1U~P@+%@+r6r3@^WZxsi#TU2@Uhs+wvq8pXlw}zCO)>g|OTzB}&5d4GA
zf8>DOnP6RF>*cNjxRO@-_1~KEDgGGqCE4rY*=u@qgR5!W2DOqJpWR>>Std+{Cfx2g
zYxsesF9Y!dXPA4Gp-o4tvYKlp8fdQX#gU|^0N%}5CYnUKrkMoQy1MVfA4w*(!xOI%
ztI2}6F=?;>8;ae?tDmdka<3UZ6O%vXozVh;a}USE)F?^mLIT2$9@#lkneUKdx{DtL
z+BhO}cp!8NFh*IA9uod%P<TY9hKf>0N?i!^J<Kv#^s5)aU=2D0lpOPIjnf$Y_@bng
z0r>#`qHT>(*9g}(T_YYF>M5Z>pa{J3$P#v6c=U=89XVD4B*0uU2Y7`=zJgqr9O!dO
zbv@(JP``D}YswcMwWj<Mom2wvnfATMBscn}F`x4_il|HSk*i!-OABUAa~88?$`P}~
zOMOssn|!bw)<N_(-aN4pDfbQOM+5gwAMuuWz7PvI?Ch(}cMTJTat%nM(CUX5O<Ip7
z2s3QZuv4z8vc8$|iJvdG<G=1>>YeNVkP-{^P5eJ`#7u&-C`Pk-zYB??%zqdPyR%C5
zoM-J;aExM>+GPFoWWPFb^PXC*lWVqgY|b{;W~NuS8ale%6DP|8gf5Jk;b%X@EcQtK
z^j=F@V2TeBbf2km;{cHp){xz|_=NCDY8r%I`N9HH31jugiZj);a}E*~RKJ=2x<TQY
z@=+{3;G$^vXBNQ!W0K>I7cy7bsma>${5CYli@ZioPPswFguY~brPKH=ZfM?s-cawh
zc(X1pZb8n53WZxWf$Rj(L2mZCDA~XAVFXF9^QorZA7+|hsqS3A$TC6Yc}~fw#l_y$
zINGEFks@#Q-Dw%dL;JP8Imb2FBOXCUn2Bp-TVtFOq^4^75}YKTX%4}^m;2;@O!DHL
zYwzP1Kx@eo)FigBtNy|hgWgQ$>$uk-V+u6TLpxp0b2oGf7_;-o-L6a|Nh}H@B(}re
zZoOsop*)!P$3%}T3q3mCR@g|c(Z3<Y(py5pg?vBbVdGGX?TC&XKLo1+Ig+X<Mo2dQ
z)G^IImCz4~6#(93u?y~EZ&&ueon#!Rjso4N6>C`sk<*QNSlpW-Lc!V37-kOY!yVnI
z{mLr_A1VtEi<q`Vk<#hq9hq#a_21jWn5O?IfQc<PiP&tzJOQGOG`iO6j2^v{>+Peu
zinaIGinDl#U1u(b*Uzj=H1b$*&Yp*Ct3TuVQHIUxtObkrlin0A6{vBr={!vS?M<Cn
z{au?|8CBF`=TR_uThlTJh8r?|NBl<3dz`OudSC+5F;@46XiN{vlCVT~+gM|TdMwbd
zgUV3N`VH-O=9s)(8_kE?)42M4YB=HtlpUBTWk+*D5z{elps(Lh6-`TJh)o^uR-874
z6>mCw^=ZXN_e*Ia&@7gC{B*czn}zH}-|l`HJ@HFwh)Ld|Y?jA>lo`g?>+_CW?XZod
za+W^@mZwV@7F&WStfbHb1hD#8utr$0Ua??xvY=%uFr_<6!I2H&8I0R=&slg6S?=0!
z*<>;g)m8|_wqsMyL%yHCPODDLaK_7kir5YI-<0~i93Dc2r-+!tpG)3MrP;gHq%mfd
zN~b7`t(-LPFt)qZB#r@=13mlXEEm7K_Kb7m^z&yK)qiw7D{N!GJ427uVlQ2D|DAX`
z!Yz_n^tkdU?SS?VmU`@Y29MBNYq#eWW6urX!RGKF1vtoqC5e5TigN7txjH{S=Se2&
z05mikiuzYb`>%KSud!fztM~KC4Caibwd^NqPRH(#mAlqyr)5eudZ*ED88~UHKIIp-
zCuzhz(9q5axARD2xMa2(i1bDMOV=!>zfjYrJRrKqwhd#;&oYzZ+MbSvYR$=Y`H7!!
zm0X$iikU|RLNTVH5mV5HVX%<LiOS|w-l|Tk1!Gg~^9P-@DWzKVLG9w0eV1FUdJa9C
zVW;-`Wa2nzC=`mC!Nbugt~Z%S<-Wvy^q(t4yR=^1YIr9}BHhPXKZs`xre$m^>e>>E
z%5^S+wZYO{Qbry|Iiz4{B{F~YYbwIax!v10S}(S9IbHB@i7)Fab;f%w?<fH)T6Mm-
z#2A^WXd`dr`|brq8XgHIZ$8Jg5q|hPscwkDcBEt2n&@+`#B2UfuSn36zH}`mqB6*+
z;qoU(^`ep0g$75kO$Js>%x<Q)Jvl|qgw9&q)AgT+C83%}XNUM|TBt_L_lrm-Y1RZ_
zN<N1spG<(NbJCne^PnGY0xt|2?n#_WA4XS}9ZvG7rk$$PWAz3uzBXx*SKHXaXZl$5
z81iiSbQgGitPNf!jR}^kk~rHK{$=Kg+;xwQ>l5f7cy(LkhTQ&rtvJh)HC=jVWJ@+d
zr`r5;f-0`yT7cSBU&u{u;cdFk?N`>)zuXAS_R*WxdXbA5PqYFA*?a&;!iP-f;Ri(S
zaxs&yC?|wyKD&Nfl^dR?J|QuPm+8P3Tk8nmA=-jpeO<;o7kZ=(j}%k+*r`U%Oe+U>
zl3CK8kpz~ym5ze9H{Y+1fc99>96rvowSM7Hxcmu#rPgvXZe|udH#0iPfu~9Hy;A9)
zLL2nvw;0IZ`he%eL0ZqtN3D$qS??GthH+A^v@Vr8Ka)Xn#0xi6oz(cpcsj0Yhl(ko
zlP$uDPi_UMfl#DWl43-^@Nd>6%gzLPsFtD{4p?^pV>=Ip&LDOSh1lmDQ<JCiztHm`
z!r+B(iGU}*K6rHggm0VH*%V*@O}XP&oq9OS(QLp0?9#d`@32VSE|>5G7>2anV)?xm
zAD_r{ESGDCjKcRrs4nU8>?8X5d`#h#Vm;)8=TH+g_$2vWmSs%gm)k-_gs|Tqm;Gj~
z2@9WSj3cK`#E~04O;IdEo3Ex<P=enU1ETBKh6Zu9SJ?q#J<_4!ZgI2Wm#!5CNSte?
zoYgHuaEf1l@k=XC+ws9yJCV4^n!A`qR`yOgkr0=5r(7B(;!RyGM1%LjdD!a32f%;R
z(HDkoOifFZxLnK2^NcYKdu`F-q*dFLv86B+rqT}2Wxt+U-Oie!vHMXEv|;L$QvBD;
zaJ3E%a3>>=d=>%InZ>#Qn3SizyYkZM@k-&e%yix76u@antFc6=0j&F!!lMCa1_WY$
zFPyHC(8A<o^0(4$o`~06y(=@g+51batuRcpOg9;5knX-_JqE?9Mqrpv=qSF`eDBMg
znHfmQg@0480Ef`s<&iMfF7~LS{kFZ#sHV{}!NDjHA5E>@rwxuaX5VNEC%S2vq=iU{
z!y{zMPS*Yj2Qa+TX5JFpo+bFzl5$tQ>RH7x(Ahb0mP$DwZs|gl?B*%^uzL5su|~(z
z#73ucD?<wtDYs<+g*Avw)5F|l2kLJ1VIIZ3>>S6deD^b!XY(8cDF|KiSH~=EbNp+E
z?OM3iIHL1~Wm1zAvw6)%wORv!&=*o&6j!@TtNaYU!9EOz7qcyRtc<)ipX`&0+!L5*
zEVMLAu=K@6CLYF@k{8&PMYmA@8+7v;F1nXtv3(Ehv0PJy3#;35DH6d877acxmM6El
zhi>tr-zYs8)>7Cb+HS@6A0En$?00vfE-J4yT7FHdi#05@%XJphC@r7=aet_6R01r^
zE9R<FGq0al@)Yqr;|b#Xuwvgjz)>pAW|n|xdCVhRDvW@gwk<D62~fVMiyQTMlZj~7
z%N#;zGm*5e)KC;0C-1uhu;YoFfBV5AkX!tm%iZ_r(kt_w7FkD9m$gq?Gcig6>f7_=
zqTS$4MnhD)nro(e-TLlC<e>Ea7wGaiPs(EZXQ6|^(JvdrO~?69D@Yt;)B~s~k!ER%
zZ*OR#UxGd>XL=|QS&O_oMZn>wlzGrpniMKR>Z`{7WCMEJ6lg>M!6}n+b+3^+E%Rb7
zrbl04lCp*62I~xG9|{|<r)s?>!GP?u_y4vaxagl*j%D4FVVj&Un9To{-@Ep{8i3m?
zmx>JW9Xd32eV{ta*kS!Soahxc{X|O;tkNbiGpb~<OD9(t3JllimvCH<m*loMs|Gy`
z`nP?^gu_1MZkU$b``Gh#PMJv+_w)`t_qdTrqUn_rO+z7-_<VA>Eaw?!GF0?XI@%Vp
zucvdE6CY8e=D*T#vFUY^&3;BhqP`V~>to#cAzSBP_a?JV@5g1=ZzfCa$+Ip~H?O&l
zHAp8ZO>?wTH$DDCV{4*2)cRmQog+QfhN+!!2j1-XT=;8>!QBmeIVJP#Zdk)>VU#;!
zSIWYv&kGs*dh%BJ&a>=M==*xi_R(^Qtb)+Vq!&9mOCXX?5*Nd~Lzkg1{3Ho3;O=9A
z8T5A513QN{IX&u{TfA?$C7td7=tBd!2jj17stuEQ-rPLm{V7|ZFg|YPbSfStkxZb~
z3NQjCWw7Yt)s5%ZXX3~=K)Qv?DvDW&j&#_j`=tO|pdCt@R5&f|o4z_qqULFxraE&Q
zi#ses#8PA1fe5Dd-zR6i@oe9;=QiFoQAExtzFIi+<_&xyd94WQb|giS>YyfmgrQVl
z5yVcHw;GlmL<XM`3B}&)@oh5)lDs44C5aO__=G|n@-BcfD)}-Z&)Q$QUAB1sjq>UH
z+2i=<3Fhrb1cFL>R9f@_MFPI{246qiS1jEgz8<Lc^8oIrxEX?^|AjTL@wF8oLD=ug
zD=F~FJ92tx$pgSLFSxGgm_Qi=<iG*!)%Jit8L%b&&l9K>E4r`5v;9T~&?<{QD|oLn
z$ZY>oNY@q~(|o<pEkJ4;tE?k3*o~Oq&+dT@L^?p!vXXNX^o6!I8I19=d?f`?yWpw?
zvpqs8{4B~`j2#d|A>~vRc|F@I8TjHp@noxRy{IH0%Il=;6~yxyx&sUyV9zdA!SJY#
zjQ8o-omh)3%C=&`Ml;obw=9>zp;KoVk&1BPGcg>9k-QNxP!bhkU1HS!{8ID{*`LP5
zYEM~B+K;7pPVZ1E2%>&`>*YIzH(NnzqlMHQSvXTjv^z>661Gh}{sW7d-?Be?hQ&g2
zL&hQ&xor(~205HKV}T(OQ1;J}B4Ot^K+CmG#?BZRnZe@3A?q9GH#QD39{i&2*|_Sc
zgP)bl+d1d`nPLB3wrmz)O>IP(*%Y2Evvb?w9d;;xyJYZGASMVig?aCIzZ7kC0L`ea
z6gaK8itb+&E+?B#T}2%HkA8MTIS=YfL}`E9p2$1+8e+CB6Bx*WxdWmJv_Z9JjB`97
zE5shxG1|GYa@pQvGrcz?M-O4`57`%TKbI$zrp3K9U<YwcYU<TM43@AeM|G-Lqieja
z&m{=$_0+Zld=){@_KT{!#3Fm8M_A>m9I7Ek^~4uTmSP7wE}BPTH9I5LU9R>0#pV}w
z)3A?U%Z8ov)~oIEH4OR>T<a>&Yxak#13#h(gpKwn<fgI8u{X51w8U1kYsL2HXD%P<
zWZXc`bkfX1>l=ON-h4^_KyKVy*F}9+Z-3zP7@6`g=3@I5^1@1nUQR#rN<pxJLc3bm
zSjZ&byzRqsWoA{fHOZUX^2DtlpGQ1rEu3>afl)_GTN#{Qy7$9{`dp@{RX{GDcottz
z%-p%Ok)pGCG0Zj4IUPU`x5Zt{eVq>+AGnUpVb%NX`r5W=eY&3$X->u>Wgd;>sc<h0
zk8A+${r^%>9b5R?m{AxmWkN?-o>49rYzIA;q7r-lvo5J-Rm@pRH<sx?2SwOGeG#9x
z_>54xAdt2JL_4Rx(Cuc;TFr~!_T`jBub&IbWN=C_CX1j=1P~Y{9W<hBC0|zi^XHd<
zuGH&gowik|y&rj8<v`c8&qZs*Qm<E0KIq7R9YqF8Hf*4ucas%8FRXo&%AS93@zNha
zzG6wc;JcxbG#>NfranAaN(CGf=S8n0K5$m-rOZ)!^11I}jJq6hmPJGK9mUACf`WO@
ztE34iQ?4YhTI>hw^c7(r)OIe)U-!03KL`t0OM&N-bfP0#UfR+9k{!u4<m5yYmZM?N
zW$fN$IbW^ApGP_6MV5&Dtm%CZ016#t1V*TBnUe?Iy~qXO!7W%Hom_wSLzO9Pt}1T6
zY20iDv%Tk1y500`ne|y1^jYN&kGP>Hz536@!6kw6EfU7nAI_Q(T#Wl=eGiJ%=%Y<L
z_zm0qn5v(05lhU)7)a=hM{Gt#00?lBB%nYVT>Aym3jQnr=|UaV!S@?BrB)}ZXvvkX
zQuxeT|H7lq+Y{|8!ER@s24S!5CrDQ-h;~?z@e!5NL(-IZmDiV9cl6GKgY_9<@)*GQ
zY&e&`k*m9xqw)Qs`cI$1PkIIQ$DoK15poY^J<pKVKlIR+en+kC)+E@w$i`VQKq+E<
z2~5h7rB&)*$dl{Moo%r6272iMF%XhyR#lSePCWoR17?-fv!KYwV02*qxNre}*ok<M
zx#>owSEh5A>xd^+;i#OQR0$uE1Jq)7lSK8Jw7sK$sY$(~gI4~;=OU!ul;78CZ3S*8
zY&}u+p^fi9zYI>csTDYrKUbm*MVC^*=qk38dlmhRmEBIbxr4}AzEU41`+wL@7&h`2
z1yAz3WwVe&xc1KFhc|aSdd@RSB1fNg;7S`N;||}F%)$=8>ziddv8#M}KesgM6+ir(
zMsjRWe=||+ml$s>D)KVkJ8=9uJDc8A)LUv`GnA%wVoG)9bBzJ;>r}L<FF<w^dW736
zKL==^KHi7@O17Y08+zEy^7n%WG7RyD@!3me{*;Xb{&f-9)6r-X_&hw#d+U4r8v!~$
z(%(e0c=9PQ2i5xiu`~p*d5LagxUkc7m5LTT3EgZ1Z-Bu&t)f!HNL3(yU~m<bDegSu
zxp;Ym1l?Cz0h%7S=m^n4D2#YshIZRHjD__di=JhkZH`fW(P6TjZ~yI)!FF;)1VK9k
zYayd%A7Hgo<J1{<C!wEs0`8&gq?lh*&At;d<UItG&gFv}q3klP-Tx7(06$1XGoo8@
zba3}zbS4aJ_#Yq4u3+U=q_^8hG&FkY7Ey84uBH3}3Tnr;532Qlho!r;Hv~V&Rl-S_
z^B?|D%p2s}5+CSv=dsx)w<SGqD4;Z}wY9WJ8*kxNu<GvKs!FUFBq3yJPJnXLrCk+(
zF$)fVsX6KoHug0C5AkO~86bjqCtwX|^;5cl6L3eoGtT<^G+yw~HpO8Z+o<qN`b_-0
zJIOjPH<O{}vQ(x|(CU%77>wd&N!m3(`0Lac|6RI?VQf3>-3ndRp7<BSb5JVZMAwsy
zVx8=Aq?7jhCjIM9k~m;8SWbFFvNK^EtrwJIRy7?Je*rMBM|H`600MpfuoVDTC_Hpd
z;9+f$EqY3f6W;87f<D>`sS0cH53+7+KS2)f<d8u5_B|myFnJgKP^q5--BsUF<e<1g
z$U+J1Z!kL{6Z@ZfN^*RG`D7^=H&5c0n*xt++O!=~=k+JR8uPl6Fw4CZ_<GWUSm^Oc
z5~oTADLr!y6Oe(!su*0P?x5j<eE$xRFB%2&Au~09BWe*j+iPjK5NJ8Kr^FzChP4$$
z8<7h-K{o1SO|}#73EACgqzM&-voht9n2aJ&a$6uq$e{$|)iSbHkHAp1x$(%V2&qMB
zizk9!<|iQxE8&}g$jskdA>6p<s7O6M0KGpj!C$SR1n1CZ+$$BQ`afPH<9l?os5Ty>
zgqs15WZ1wA5+Nf9<%Jv=Mzi}K9Pj;x`$uUv+%lJ5-mq%!v{<XP7$tL*yIE}|q9mSf
zGhR%VU0J`~89)2#i7jX8uVlS_<;goihBuHQI)Q>e0Jl-r!igJEFy?;0sn$su{=vtP
zk;CH9)0r9MvxI_T3)K>7au+YBOci9_IYd^8RDBYs^68jx1`1x}ebVQup3s9c)`wmb
zEb9wd>p-K1g5JUEEr?JC#lRh@SLWZ^ePF_38~Aa|PJHJfdJaYxfDhara8{&TV*yBl
zWs<uF^P<E;tb!jhuQ`1-w4E@HkKFZsQ)h*N@U`NKK_2YFU7zQCEYidRoCZ9^AB~ad
z!-IT<tCj%C-1~NtBx^~Chx_-FFpzwZFP}tHAHDJPNI#r4>>k?mVtcq7F7E`Ist`8q
zD^iSQaB3L9kNfy+2s{UM-}=U_@NZbX?&H6oI`VHkc;lw~GW0%A{PWr3`=jhTBQ+!n
ze*mQBM710Ln;%LN`%cq7a`6z->>SUC`g?~u;ltMivuCqWsWPz|e&@gQ5}UXnU@+$L
zfIHFC8J_v`OMmunOu#ed3crlxUauNe?|Kq8lVnK+i<mEQ5J2Cwx%lk;x|xYn0DFAH
zc%KOsPr?i3Y1W6r2th<<;Wo^=@aZ^MeAWT9+ZX*AFhK=7=nF8y!hyXLO#lEMr7&qp
z!1ZoZjpuOI-E6~%h!(`Bv)LTxA-Sftizf(wsbUynZ*~wYWHyoxzChZBACu%`%O^ek
z`Ym%%+`_Id4{dL?gJd$C;Vg`ovb|+liI{gS?6wS<=o1z|)R-NKYMIx1gV36Z)a<1*
zhSG&1+&qk|XAY>@T;a9THNTEgr2QSc^6#=i0NIe9cr9Ndn*Ov=^BGv0<Xn1n!VSe=
zBMj@q`HTef3G(;I-`quU_xN%s{WrBqH*J=HZbydW_<wnAKXh1s?}aWH(&ZzsMBqN`
zGSZYgfK*QUOhm7VM8`4l0$L5Y1lsF~-D5}rPe)&r<p`j&iR=k!dzk}d<>T_F)U%y^
zB63PCQUX9H3<l85C#Cg>(|k}DUxpR14Vy{+kmVO$$53V_#Ooc?j6C1}G_zj^So+U@
zz?#CU<s~gmRFiztwJefKlYK=@o@9vIsYHJc%ei+`lVQe+?<7aLS!eV&h354u4#hjY
z-gl5oiij1~EY(pW7DH_0;$HF~b@ObsT#S|!%Kj@q+3{r=b@TlR#B_Ku1=RB2EIpCW
zRbS7jb3u`tPsH|%gl+<JfG=WPq>VDp?!j$_Z0LRyudk`)9TY?%AhW?bdS;AHcH}g7
zzpaYF(%i?)6h(z3UY8#VvZKNOK~v?Gy2aG|F8N6QsBN<;DjBG~vE^+F{l}`@BL(Ku
zf<#lXyS$RX5PU0b@($N>Qb8BjdUQTH+{8f=wb^qV2!sy3uXXduVdF&C<R%U*bnIiU
zhw1^xs~GO}714N7BM(34Am;ORza8Ezoei}XV2qv8ytMfC^6Q-*vjM_E`A$xQEaEnT
z3VPmiMrx$hyB62aHxkNRHmwSpC?A)Cz{pS!OuoJlT5#$Y`5*%e0=ftcI?z`^az5M>
z_M1_{4^4)Y!M6Ka#}|G+HPvdZJFPw7ff+Ow)_cZL$;{X4D=%u&wcN%+!rp!LA**5h
z;PvODkuE%*c&rY-@-HE|iJ(UvI~1G)C8I{8@o~IM{P9?E;{0S0r&ZSZ&AzQ-iCDff
ze=!M8%zIMVUmr*9p55<C_$y+E_+7bW4v`QtT7+Gbr)J9wskwIMU;Hxhvk5*W_J3bF
zS=y4~CbVZ*#dz$T(oE#9Y`6AU<4$w>%|PR!Lja$sH<qkpO<Fi8;7eXeF%t26;&3u_
zr2H^y3_NKC<_Vd1iFES#=!yiin5in*^t}OUz1Z}$k&zF%IgfWntHf81JEN_;t*Nue
zz79uYyTG77Az-@7m_tg<j6+3Ty_@z+oD2BTLuh-+KRT8Cb5V+6O}(~VjOzSe4oY%C
z@I6VD(sh2MOU`2{Ba0C>_5;2qMaK0gFo;E9d5R48R?F5d4M`PslX&cPY10g2;~&{g
zjcroo(06o|(dZryp15J>8Qh>2v_Y#C9aHB~-K^>66zo%=bWj(ux&0u!@~3*K^t?4~
znuWx70l@0NpJ1J@>zW1VT#-|&XU%<kdqVmxgnUuKcYRY#rwDJ34W;W`bbD$plnVi^
zv|cXNb2z!{p{=NuL^pQxg+cV@EsEBzzF;ykjdU@b8h^jk3v2@hB&8Od;iLK&0JueI
zCN(%3gLMFZ&)FI9;|n!-v;fq!#LXI-gkX&iqV`uDzS)m)Q?T}6-(FN9p2E=9e=;9J
zdkW2EOEIGg*4gJ(JlgN2<MZDZT^)+KMqzLh!j@{J_qK2P_Ry;05n41H^cxP9hI=<&
zI^7O}y~p%P%`1p<tE&GN!`sKxh4sf{e9q%q`_4IurxCfNX(oWuyJ7!?yfmmEOKh17
zX6IEu4MGG^TxFsw!OloUlP<-krp`i>T8GzZIB%bqGD)osv@wW(^?TLx`*n_E#X^}y
z{%uj%?l`9@T1-pWKU%-R%8FsuLeuUyL3g9S<!CZWgi0Bo=E`nsi;|W0ALmlK@=^1j
zo9h1&xXs*MEd4}6m=^kFxe$*T1v4-TZ|+SLhWnM0={oBr*^tJGhyAM!QO~O|e(35y
z_jGSFl*G}p!@;ODli#~kjnpKB2AZchTJAj0Q#8->9$Q_s7+gNJFVi43?xSsy2u`T3
z?dLiin5MDx9~w0y9yCY@527qN%k+hW2`EmNnZ!Y>Ro_MNTO4MjnX)<+e&-Q=KV<Q%
z(lGi(HJo-xB|!%1lE&AQG<Q_se6`j8G>j*elMtcG-_<!}B@c%6w6;KoMxr$WzX$<%
z=0xqXYfa*=+cE%K@y-)-5S!#D_+4~kSVzND3R*Ie0tpvwEixd)o&+M1ugkluzq+b0
zNh?S<jAo~YKTFzbx$m|Kdt~kBm43bQ`nwM>QgYq3`>_>;#b-Yb_HLqQ#+Y}}*kk$B
zA2D;ez*8Te5z^cBx}G-U1J}z6y^%+_j8f0O+-01S(w9K#_TK_GKb)hkTUYN|eda0I
zzFq83++k>eLPGpV7Xy!W-@HB1?Ke;kyuy}mU2$*vo({;wBNn};ryrnoIy6^{!=HM3
zEOHF4-T-ez$~f_4awL|UyJz`s_>Q*}$pxub<P#;!$v+M>T1HAH{{$pWUCLMA+Gif8
z9e`0fJWTFZ&gHwlho&@X=P;Q<ZB(O4$e3=Hi%td8`w7-3#8<g7OZ?9V<?kVNh}Ud}
zlQO;{_~J5PTQDa(1Y+u_h&u2rZ%OZgFHldXk5(<6xfV2+d9SP!%bo**fN7aftj+AW
zU-NEVEFz#^G||@=;yJN7Bj;Z54si<&@V0zY8@ArOoo)APGrJ~Gv!AwvE#;=LpBBLI
ze9*hMjg9cc5gSIFI3dOv@bmOc&ujfweZ<*0fe{k7N!LChR9Kj%AlNFD)&e<nIkG_*
zw@OWmlkkj6v?AG&&=5DVh;sYo!J5|<$!<{2kh7Fuf4^G4Zt6a!&9y32tY&qz4C*Cx
zT=;;>Gey&~C-xlmRg!4QaSi;^Xa52i8tLksT%ynT{C$^w{%)$q%wKNT&?;eipTlZ;
zKh*Q*s@V?o^Idh2b3i37nq?V|_~Kb|bL8cEBFJ#w1?3B+JF-E@-VO3TIK|v{J-5nJ
zocegTC&wnColM(fY-EK%XQSu-eOGt}u?Mn+g%tyPxZR!(2xWbtFmy7T9iz<`^;DLt
zl~=LxmSvm%9%C7o5{JVkl_7udEFZhu+L^F+>kzT#sUr0gkQ|0mWv>*q7L+)W$&#UG
zTLMw4W&w^2BYv!p7K}%YhCLy)hZ|b+Ws3Jzb0i|96)1*GW8WX&P*EbUVBBI03F(jX
zGQSau29I?$jai~;?L&4rfH%94A083Y$(Q^Z^KU4rA#4VzzL-dRTv~o@FW80IR<Q5Y
zyGn5+CYDf?Zzh=>Y{E+KdTb?tGmJbVspne0<cIvc!T0hfvv)>*tbfTIBPleKPT$9$
z&~TDVC2Dvm+&?kjuR-9o%zhV>q$fKYQ2oUrjB^;<xi!>$ytE}6Jw3~!Bwn;3y0}gM
zzb=6PkG;2ys`87zMiD_NfkPZbK)R$u0qGKfBcMoki!?|{BPr5#Xi-9?ySp0!>F(~3
zzWed_ANP)X?-=j<^?tZt{2*}7dG^_R?YY*PYtA=7`%5(Zf<}<$RPxDIDkWjT<NO^C
zk@J}rKmF_gYsdNLccf~3=h8Hee%)m~-JalCsBaGS#g^l6&?T|eA?ZcmcOR>njg>+%
zNeJ=26cDW)19M4Bx|^#e;kHE~0l*J#`x(IjUvXay<E^^yomzYy-3*LxvuW-5g5wUm
zcllg$_;5<=-0&{{G{p(_;t7ub5OqH`Zw{j{%Iz-f1y12ShqL{cKcd5L|484ea7@jK
z?3gR|G=9_dy>M=gMsA0CYws4l5qz+va;rF094Hv->OXi2)C2^kJaP)T*pCGreD%v0
zoKrFqQmin<foZSDaAMc_pD*9@K88s*6xa=Qh{%_y)orS5+;aqeVSG9j>fIB;oyjIR
zqvOv@;X4j#0$<9ETM0U;npsJi<uEMbj8>(NzCZgZusv9~>k#OSWUCLy##U^q>;eGZ
zCk{5bitErTw>K}D)3%w<odx!ip}IUjQhs5PLLPU&F4iHJs}O7-t`&{WHu&Gp2k<Yf
zmdH6Fx6DtHVrjnv*{oAMPA4{^dJadw%|pW6PZG}OcPkyMLXhJRw^{zK-1EnW87mfc
zs2kX%KQy5d_i;sA$E2>7Cc4F$By66opAYAfHKh`6qC-ELsB?3^R~r&+ug4}SW)Adp
z+>-QJZo-kUofgU)+mNoOpFs98*1T5G_QW|P_^Utaw8Vkc9I#bM6aOL+WGU`hs?W>w
zBV{T5Z&{&BCkm45XQ`5u7Sq5E<W!7(7U+KO&Lc_juC3;%@BW38&3`g<mL83DqQrC)
zedW<T-T`WOw%Fec^$N4V4ejZxfn`Z*EF-w!N64)dX@()5jycctHN!jcV6kUj7S#f{
ziB?#Ni!Jj_#}Ff&?sdiR>AvaXxg-a<GaJ!QxTQ0;J!AMC_5g)NJQnec=HA@4xD>Wr
z`Qot4MvAMYM0_Pp%c)62l5lt&Jt@Xpj~Qjb0F()dMsrxncl1bZp805deRiY~KHGWT
z%>veNX+`;iZ{PhbWa3ChnP2HEtn|ruW19%wHFm`n$ArAr9sl@Huf+;_XwL+udFrO1
zR~%}{Z<=(Rrm(R-B1lrt9JpZ+`?1DrYW?9m8NG=38*<wk{LPXEsp&AdDyOSRu97{D
zpVnK#<hP{asCiZO_ov;HC^4u%82IWvau9z}rZmsJp37xyjX(Izu&<N~Q;W%tl=aWv
z_{FokYmc;A&K;8YCaQ%}?bbYs%?488EZPF$iEA{Qp8akMQR9YMjUIRWC4+1XUd8g0
zjOJLNI;1%#%zMlRL)ws9_^Ug2@La^|W%~9|&o+lw^EugU7RtwZeV)eW;Tlz42r>|5
z#yHc!mHUaI%7-TY#m;^U-hi_^>tF_sgen1fLa+Gj#OE8!)Tz(&@g++BX~s7@x0f?u
zXG|6K$!!+tHgUwROmu!+et(o)v~~%pm4*0!_po0!6`m`eyPqYpJf+^*>wQ*)pTN!L
z5u~lCJBRzzSI-*30_Q5?s$5k-iBY^eNEI6pT^XMmPm7w~aD|8n@+tiFa^*|oTAKiZ
z$fy}B{kP+_KUb{`$3QE74k{Zf*(3k4>lNjDD?-yRqB0^Bb?_hMhl+g)eM1mOKx&mR
zsCeg@9br~;a)ZC%OKh*&w+uT7@A*QvP7B~DK%}1HtVmcBETUR+l#205-EJzoGY32B
zQz3TFjnL*Vb_yQj7wXK**G4Pl{wESx?<kT|2TnRMQD<!r9j19EW_+lA<j1%I>z8>c
zF^rr@r_}_rtf((v)*pV1gyBchcRY5pB;e|Q#kkT*rd7P^v9htlV_9-DP4u%cMpoRb
zI`>A4tnmnPyILIepVT<p8)IFpSUVHH!N+MaWSD9&TF~yuIrHCGTX&6sIn7{<#ldqz
zWsw_hNBK$Y*`Aa^jo<JIFYcNoC>dE&okJQP%gH8)_eMuqt~ds;oV1^fklkd@dD+>g
zyJ>}CII#7l-|nGO^jEY=3&FPKM-f7WxLXyc-$?41WT(ct1-jM?m45yBD=D^iHOD2D
z(adQ$mqD|fg0{rYvwno3DX7sjh5Od2Yh$@Jd2wqW#Ye<hirfbtm&wM;plpq)*4<h?
zKmj+Z{e*AlL|x+TGd}`9v^3SEGc@E$b1OUJ_8bjCuAk9;r>5yFUVe^kzbv`4=bZUT
zrtxyzFypMyzaP>@WA@T2o!;hh+%MCcA(lqu;%0y9C;n7BbdL(3P$v;BfFL1DzQtzE
zaIl}Z=L?B>BCKDKQo=x&Orq!A`7U+1eO5Ggve7q=7g7<J(fASE^9Je`2w(4)4#MXn
zGP<T3To{Z7z`5*Un7{z<o0~$cE_ZQW<7q4|s!A)r!({gR>C<MZ8*7^bJa`2+<cG2?
zSCWX!+dAD%;S^;2HIP=lsecITHS37t9#NzCc-~^uk)39y>e^!8-mUq{wcGH5Osgl0
z`MqULB0FcvZVhWk=})JlE0%@fquxu^uxGDVb43bSk*pHJB=qfm<#0lV+}t7IHRgqe
z_o<{|!d7{GedaaJdkPJz`6j{&eKhk`>yF-(xo3VoS`i}~YV4fJ0yg|{S2oGOW+F|F
zkGIII`8~_4Z-M9VYuAcjo)>vVZ7R`%hD>6Ys4{p5TSqiUYmkD?`79R8=MP3?t5d($
z)B`Rz2ZYs~)ZZ#J51H*w!~D$dZ49TOZ}G%edX@~+RM}gemK3adUfg`Fw8z;aHXd9C
z(soAGVV|ey3#$b3+b8fdKV8nbwXnOl>#*W(Pl*rImg~F$e}<6Pyz{eE61MIe4w!>`
zOVUC_f5naLcU@D?8jP-8)X-LT&Of>`xUVEFr%ZKcJT^EKP4CrPz*2kZYjUAcQS8Yw
zLpHm~QZAjFx-wdPb%V(ri{*3jLLU}{>2Qy7UY%Or%eIJ}snr8F(bM!Onk^DD!N4lc
z&qCc3VU{z;gYTV!Bv`ERqo4RE70}L-_|zsTq*t(<8D1fij3;tI-#dWfv>D-Ss4k0W
zPT~248Q7*3)u%jh$e4V7OuNF=`|^veN6%Sf?v>-V7RGu7T%mT)%EL7W<J~(hL(5&4
zsj1nxgEagXi~9a2gB4iQ8MmL25ARPR)fEjTyu{@Ewd~uS2;HQ5gI%wUYn>d+Y3J)K
zWN_^STYnPp&wfihonrq9P@Z`g4a2j0V^19K!1ljU6KTGFz|%K<2hC^kFE<@w^##^f
zIaiF~O4-Vpn22he&<a0CZQz^wtw&Shs}3qIy`=4&>rWQYELtW}A3cdxec>%}99L0G
zf0tCbSAB?@Wfe23ACuqBc)OEMookE#Xsft4n}xQROW=48lEYOq+xh1IZg^mg9$9=$
zC^n^_<JSDG%}rmorkBMmn<5YxvSl_0zk}QbQ|Z=($mh>v3*71F@>auaUw7+=A066^
z9MCYdNB&XRl0~dD`tnZNAqsbvXgHrp<P1&XQzc=|V=L3;7M0Dc=Fu>Wj>@d+=(M&9
z8E^hAmu$oiZSx^!tO*V3v_kjyuEUX?VGb#eDYNg@X_qfU8!>rTx017@N=SI|H1mrF
z8%<>Keltlu=d0X~LAon^?Un2ZTX8Lho*zYKQ$HLEO{-@yso?mqx2{Bd4&fH1p*pFR
z{e`RcwL_L@^Oj~g7{&?4VHCv}+eXH?@}f2R)C^MPYu4u3xF#xzp(vKU6i(*Fg3(p;
zkeZ)%PmJzo3(G(@QG08f${*IJ)33p(LFmNXq?0fhy6*`i&wanm-tjy<RyVVZ8vlUa
zUIsVgX%rrH&CKir5Tf$JduytLf?YdD4y-~@9JsXtCZcwz@mxFWcixbk`(Wl1<qx>8
z@>JC&1g8$Z1;r93G`J7)tGD$0%<=28LqqAbd677b#PUa3oIKPf3Y+7|Lq646CsCQf
zzz0VrettU<Ru8&&tPtm!g$eRjYUJXJ_9B^AP1Qp_7$1`J7F)(U;OW{=+ihPYC!JY2
z^r=mq&UA)u4%|1K4sVRsdV`jgD&;**_o&i^`f+=dvn3i?iq!nB_C{#qbdrcO5f|5}
z1W6UVfP^bO&35d3Mek;XJRG=*Wte{w(GBHEE8=>tyOiP#*PZ?nDIA1p9PaGeQS8Mb
zm<8pfb5QnToA-v~;hkJIpTv!#uZF6|n8P7LvIIz`&qbl*bmiu+3{7f^2C%11oQ(f~
z=hmO08x;rk8r|qy_l*o#UoRt9X&&ozq_nsc9B`98J$h`}|9mSK=dG!VNcs9Jar;wZ
z5(;UA(s8S>K35Y_qI8ek4v*@plZed2F+SQuF9Opb%-eztR*R%@IX8l-G6t@6E3^-9
zbu%Y08X26L<$8PXux2CI3P$+4?x0oyp`rL!x#&$98Jk;4vEcm5OuCn)wxT)nva1x8
zsA>M*WSneNfBNjYVD4dH2rF&a_i|S=1HWxoASjrGq+v94uJPE@S8pygvdm>?$%mGp
z;mQBG1kqIDwB^|d!jkbgM*=NX_T`Z5vDmKCKZ`)*_+xZ@{c)y17sr-yeWT9zU(dw!
zbAWPVkKR?A-tuRmws_go^R={w#m_=MLki+W^!oYBw3G>}&viG%(B=`;wA(F6Z%-i5
zZ7Cg3>cB*O1TtZF0<oBe@kiFu(wB{ow!hG(bix?@i+fR-%9*rZYh>XUnZlHV+R!o+
z{<N7s>FF7jxG4Sl&CnhxnCquiq|=p{9e($Yj>es$d)&_NKK02FmFr!W>Gj++O|}@K
z8J106euGn;<uY<ndtzgAjXFY*kUJlPL{E^Q4PH#B%5y;wMgF=d!NZ{BV*2B3$2hH4
zi3Hcc3=pkNb%(@KdpLJ}Y+PE%>?^|f!1NxeUiO~nQniUZv}hs35a3h&i5=!0PnMn-
zoEx+TgcTQ!_ioRl+28!B;j}=l1^o?SNI0wR+r}QvpN7riS`1cHE`yo@ptODQQ(LJs
z*Ec^yn~F+m1n8>Uhy)9-D^I{LzXk<Qctnc5oL@a`-iC7-<WqBo(T^oaDh7*%5i!NV
z4H+u)Uk=}m{646}38CWbAr@0Vt7-%OIBl8N1SFVTO+$;o_KO+a9F2FcZw9tJSzdv0
zuf{en0Ldw1OF^i&SlEKw!{%__ZnFyFMC;3z_Mazo47~=%zj}0b?FIHGOo9X-k+8(^
zELQP*1qx16vz;-Gw%hjE98Gn))9=$4cU+!TVlZ?YE-ptarX~~YVPRXW3fos8U%;oI
z3ve}zv<WG<wzikb!wAVfS;STY^~uaz$cvMJP*=<dc3ZI+i6NCMfY|NzJnv(tEvY3Q
z|06sW&`5(%h!}OVZIc8&L0(C*e5rT25As&(Cfy}S7rBl8fH74o{Y(_wVW3*ktGxPy
zU>a{556y|8U;7Kjc2F=$!{0U*0>WA4px_#yT`r5#H1Ynsv*dz1Rex9`PWdVha8CaI
ziJAejIih2*wvSFS&W-I$7W|Av&SYJ$e+0u6>G&Z$0o#%n!Y4l!Q?7<Mu7{R^T~V)-
z^9}gIJ`(tp6V`#ISE|pu$YA=w6ZpjKRBfwaWb9BIyHnA0!Y6txzJId?=8=Z1lOo4p
zMzUY55op$HIk>Y=xvRFe-dHv0Hhe~^bI|1^XG(p=9z!+aO5^>_`sN$C{%!&hR+K77
zi}d@H9i4b-Dh3hIM9=V+tSY?NIFpk@Z3=2@h2-)*u)Y$n#;RsjI9v34J(o*X5fdj)
zs8e>YdXO7{9E7VjsUxVKLe+j_A-}|I+*Tl4p`0(Rx`er_YOJ;yUjk;-&0q6V29?lq
zmJE7daXDWB5%|BqR89IQSQ(xJ%eP|MZaFL78hD^nDQKun1H?Nhle_J(^3>~meDqN9
zW?9ib*@=GQ;V=U<am=F{JZ_AEy;KE4d11Za$%rx`ODkH%58wm2<E<kwe;XBFOD8M2
z`*>QEe5;aD{cw?j87dcLK;@z`IUNEtgs-lUF1H^Z;Svq~$!m@i<pGn#QWDILUp-B<
zg&3`9)UW;AyV3o_q6!JP4089)>%f;A59Y`21HIguXDU_vE~OQ_SPnRbi<N~=>#crl
zglh7L@%IJ5=APW4cNtiXNx**aBt{Qm59`!8(f`w)FGxiG1tP?WB2%i4Ffk67ChTHS
z5-zU0>--st&MTeMc6k;$7Jph>Aq~;DaS<}Tzx%|c5xwoMEMZYTqqOaQ2Tc1_{B)}v
zQq<Cfm|QV;XB2gfsfJX)B8tMEl_*-G_XmDd+#_f$fB1a$3D+CogH^EN=El^JSOZxN
z;P5{Y%SsW9cqF-vSB9J9r~{po5C0t&O$;Jb|He1allvhFdM(G~5aW-LgUTR=V)$y~
zcpi{=aRrk%JHrU?FP4tdrXZ>Ol)=_5aA{*veQR4fzzUEYxY3+)8oy}NSAzN4-u8DI
z;rS$723HuC=0Y?K>RFSKMjsJXX*b5$@2@xBtJO>{zq0wLOz@CG(p5W<2mnpduqw|V
zF=tyB&JBSA&DnZ;USkSlKlLi!&86NAU=3?!DW;9vQ4{Vs=K0GOKqRzMRug>|Q7i0-
z2t+ov+f;@bO!h=%1J{&4x?=^Dm*CUjq)3XYwYH`ksj0#L^>3_T>S%_|@99H;L;bV-
zB$#`iJq76K2RlHE@YEo&%}>k?1X-N+y0?Jzpy>56e-h61`}M@*IikLkLgc<@N#d=?
z+?j{0LSZ`(o&rmar?RQnnX?xk8v{P6HX+k*VF3;}y7bA@0@qgrVNPlIFQd^(S#VrU
z=~t4-T2~F4d{7{nPzFdyd-yPX3d8?nwQfoG<fc)33eswJ!ccIcNOrp1{EdV#g`}qT
z$uG6OzmIcS1MKvZ&0-HBRry0uh;u#cR;%YODhv5szPZa=(!I;yh)9W0nl6LMK|}hg
zbe}IySUgC$iU=eEoo)=-EkHIP@%3LT<PIw2y)f>1YUWk$uRloh%Oiw16Mj?4)_Z-w
z>e=ob{u50M_8ll~2~k1jN}^p`KxS%MmXgoT+Nn{>sdzP|RBb4MB(x?nrkr!Yx@cG`
zB(|t(Y-fa!db|wZT?@^}0umi8@o0I#aEoUVTUGcj4gW%OTW!wJvD&|Qh6bq-LRH7M
z=(D_6MY7jOScoNF<4@^^y-?}z8(j&;PVBy4zqxG9=id1p?)f2v%8m`ytbE_Wrl=Wd
zg?=V?NsCfq%k_St|F<vBd&nz53pvNw&(cS+`guG(%L6bNx~doq6^xUtMyFwNZ?XNW
z{0~!>hJ;Rk$Ou2#%HMJR_^sG)!mHdW;tm_pBwR{-(SdU>Yy5My@OhV_wZW?~nz)x!
zpvD(6qi&Cow)FGhJac7t+J2|#+O4dKufw(&UpkIkf*zdYFh5)8HU;zvWA~UHOz~lC
zLE-QWDZ#@|+9QfnWziGmbM`kKy7k2;DtkE?l@ql!&Awm#v8CcxE^G}fflWl6R%POH
z9Z4D9I}!(Nv?|Mno2`afUvDc|MV@ry;Il2^#fg@-V&UU-JSzj(sA0$^sO8n=`PXQQ
zWZPNP4=e0P(DZ9!`%+I`s8&>~gzSKouk5o(-m5%y){Ky0L%m-S^eVQv5!A%TeG4ri
z8}y^=($T7DcLVo(ucKCeI%GPEp>Vza`EOT%&GC??5CcxwG9O>QU!s%r=X;UyiFu{Y
z4-^;GXWfi0&)%!X<Px3E$6xtcnVUO$vHI`FBU8CCJee;p^sfKt^=VU+u~;xBml$Fx
zi)-(68%V$2uZX?KJl&Y`=N2&gP4jNGIQN^i^i=b<#=ytX`uTWH%KY~Joaibe@Fsbm
zK@&_dpMR~FWlt$gONS&VDZKu){FjaU$1J(63Y{PPQe(BNPlrS2q55aLIMTn!wg0ml
zK8*BJpg%eZE1a;MmoKM7XqAhw$my7P@6|4jlW#H4m&P2YV~im_=ZuH5EhnnCpWFx7
zWHm6HoTJnqB5Q%Ncc*tUo=f9hkUsYjYfXqgS14B{#+DK50qW{DqVe$t*YhYmf@^3R
zNWl*eftoMB`{Z!yGsx*NRh1S@7{xnW6+7ws+XF#aTLM2tYekCIwmZ9|g?P=h&S(oX
z_KU%ctK|#RFMP$i5C1sl{bXer{XD<D)e~#<jNP&H-@;RbLg1s9AJfs7n?N5UItOTC
zN<n5f%6DpyGdb!uv~JjxJ2p^Fayzq!LXa3+QM|DfZn(bm)ZD5R%7q!6bGJ8ViVK3O
zE$!?xk!wuUL<lV<dk+2GY3H$rcUv9N4xvu%&mV^l0I$t?dUJ}Xwkrr`+hApNj8Tjl
z9F#pVY@U|bkn**!2v*sBE5t1KKhS=<7_Nppwnf>FA9d%B-}=%cPK((nl603(WYbi0
zjM_z&7Vj93C>Pfw!`+R;ek)KVD2Hl^Q%1sf9e^zfM}?=1{fEqP1)>g0FZXKa3Ak>;
zE@RiHt9jg}8EV$yV!VN0OZ<jO4yqn4#VmI_Xhyc3e3g(gS&bbEjBs3hQbTfA-G_cF
zQn~}zsHg3EHhgPsUuf|D`qB5?@v+i5Iv?;7aSp<=h?}l|i%2U>mMfNVt$Q%)s{kXk
z5VyH01D8)nRzJAUK*!z)iuk_z60$E4_`4#1oACOHKYWygJ-5W0M3?<NT4OtWG;&sf
zPxTe{`115KMgZB&`imT3Yd_&<zeD?Sqvi2wAr|)q+q<*N@a8)ls`RQDFr1iBS6{aV
zB#L4sFd>}Jep!UEiG>(Yo12pmOIuyCDbt*jLY=U7qkl8(`We?hP(>aYvNoOm;2F#7
z+HONbkx8PFGEjWHhbnISmM%BeIcL3nHFw#=@X9rO{Dz?DFL06JJ5tU#;ac`lx!B+y
zQ9<}wCmi@O3K=xq5bcsxu>?_ML_8e6?zLH=8Mk&wn#y>ZXx9+5?DrAsbC8aV)y#_n
z0TGd$D0A&Ub1tPV!LEbJrevyrn~v2ySJW7s!Q7lf>Xxx0hvX$G?LvFx5w!(j9K-PV
z+0|y-6Z8r7Rw)^iqJ|Kp`u6G1N#cLYNRMIpluO)Y0dAXiIn78c&>XVUwu(tbLD!A$
zGw_5k({F-t!Blg7>)iQ*FBx+V5z1@~q^ZMBDG9wX-O*YhU|jXYO8WEpj@;){g|;NI
zJexO$x%dQ~J7`~R!WMF^fWhg2NfH$Ul9sSFR?YkwYGJxiv5Ocg_0zo7{1HC4%kgjH
zj;pDCssDqUe-ss~8u})ojo~a|GIOrd{1~X+HsR=r^#JZN6?Nio*IBf4(OQgSJ9fN_
zg@xmGSXUm$II6m*Q1Rp|0+(afv&!*b8v6)`P0sVsx3|A3uYXUg5>8}^gz%r%>Cx6J
zS?}%Wd<`9OyhspAGx9o?@rqHU>!jXG2^x#od$&h1O32C_fobdGsJMS}X&3IMU7r*2
zz}M(ec1e^QF&7thLYpwa-HYX~{^b5U=xmZuF%DLjm!Y(bPgvl2lH(w4f<q(`;_mlo
z46<5EVWA3*Em7`=9arx=fon{Fc-2B@OEVgp8UBvSDL%3}3IwKOb9#tO8Tuuu?X&eO
z6W6x+mD!n?ff^zY+8&qm-PH0wHQg%t#WwzfPd3M&=KLIR>c4AIzXst^5f;LO_e|GD
z!8CvRddebGgr8}_SPczx{YM`gHEWyQ$Fci>KNfV)1N}8;$}ugL4cbYcI?=3njE$A?
z&C)OLzt3!FNNWqUc@T>V<DME`hQSO0ej;t5Ej3-L31BJcH=iLqYi=EZF;Oo`6W^L9
zHBQ0Q=i7yYAQ>SV*+*PDG=(OHycVh4*})A>1g(M>@QO<5{!fLU!tEotVE8;b#sPao
zuBF7-qIA!<`+m+!m%Q<+h;3_S{Qy5whY^NMu&d<!gD~YpJ|IWI@Xn<~!=h_+6rJMI
z#{m9S8}h^MNt~TtNLy|;&al{`;r^x<ZiHwHy_2n<;RX(mt^FiDConKg_wvhBk}aO}
zy+Da`@51c5SjA*aj>yR$#_HOCe@&XNdVqPrnDXJxlcz_MCN4V)*Obpx6RY4&2@+iy
zp0mH+cLYBj1*vL$uMKgs{J7}=D6brkBCqqT#tgaAg*6700$_xyvyo^et8A{3zN*J3
z-;)5X$Xw_5n)Doztyb`(v69XQ2VuRT`6P^4_g$o}+Vj<9mEIJ)4fZT3D{r3MUoNQL
z960Q&9@wwW`%4LbDZCUw+ycooGq^stNc~J<!+`qnQGBQWxD`uBPHpPBFE)GUj<{9Y
zrbgg$z-?dfv~X`=?j>5Q-*9fmgC0XeICqF!ZcuQWRnjjFY9)a2wE`MJq406Cl3`{v
zCM$ISjhVlzE=?)^bS`U`VpG-#%*7P*<q_!Db}(CIx8jl^-sKtF?G(OwipSTS9_z-U
z)oX`tXOxA&&*i1OBT>aqh(X0tbVQV>>+2M3brkiE&}CrrtI^S9rd~yKgMra5)NW>|
zaoUkn!?os#X?E?JHbB1uP{yjC!il(Wso8W%yS23zUcFv<s*cT-o>Di3ucR%YzU*p-
zASB%V1d{C`^{1zmzq+O_{cOfg!i(MP`>$L9A=zwS#ZIC3=cL-)v91KaQZI~fwSVEH
z`#_7LmoNykTZo|TBqUCs3cpyLyomm=ekS3$>gJZ$fZx86`630IDjIg2HSzBwmlu@h
zfhLvVF62(V2eYKrqrJ@_L)7@edRj!o0I%`*<peGfB@9bU)DSTQ{>u?bV+UUfpTI*-
zqPGF__{PW;tmL+WX;p)m5IGiFF;1ec<N;ov*C--b`=y1`@-+G-FNzX7v;Tr~p@X%4
zG?}GWG2D<quB>$Yue(r{IjcTRgoS<Uc<!}o=4on}{ixYB(+=%ek_!bNr;v~?yy~j<
z#(cN%usaki9E+qXNm2?_degqli$xCR&oUEDxwiVbDq50^;&$H6r<|qN@p)XZB5g-h
z|F?}p0kViI{am>wyeCiv3~kOLA3_xW(^rn_!N{+*XGli3TaNlRxwE<LBMcNE`Il^=
zcR5LTGNRgYa8G&ptN&f2-mClfkLEb%)Mpk3><R8Pb1#9MZTLrx3XIU`9s6n(Sfmo2
zU#kBt5;JOu1Oi{`W*+Z`3SA1TB(E8Rdzj@2jSMyWtFs?%<Y2<@eAS5QH-#kgtNX_2
zJ`rC#4?;{~;s#_`p*PtNwFEM@skw7S?6;6IhOCWCYzSg~nFSvXl1!4?dGyOg1dd$*
z{dQi2{dMI~Y<>*8+<n?sj$TO{acO<??S^G~js$H0$`t8?PGUtwW)xSRItv{HEB4;7
zzB9#pQ#ql+VQ5HhXH^C_{B{-s$^>kAEsrn9J50&%8s#xMq&wmNP@NO6J<a=-Jtugb
zxKQDS#A|ylF}<lsWNWhNwu;iwmMn%?|84{)>=8q_upjWMGfgaNdFu{LbUZ?Q5#C$g
z(q4Nmix?jw(O{Z;A9Y~4>-Uo%4$y8%)cPPpcUo*Ri&~gQC+jY>vns59nf|wEq|nZC
zaq)d%@@3!D-#_9k9ynNCc4GS8ZweS})|4o9Y<|sGv*Y_a?oJgQAH#T)T#a%fx|Z?H
zE?+uDzsQY@T<xYxLAMZO8<c=*4in3<c*nn00BMrgx4qRDJP^XzkxX9bBAM;Z|4VRH
zvGb-xliJg<Y+|<RSgo6igtHr`;X^)e%J{ZJppYRtIdRsBs=+nhZ5~o^B(oelWsV~>
z4ZxK;V=nyGgu)#B3arXA0bJQe)ZS@R!9mEGm{i*?^>C`^IB#HGFkd{Zq;xPXU{`Hd
zQlox_&8BJ~wk7%b%-q$e8mv)U;Mho&Op?IFWiRiF^4Tiq9pT6y(?5iGScM9nJ7|5X
zIGPsuB7HO=SNXgEXDaaaYT(vgcVGxTp^^Yjv4!PX6<fD7zw5y@p_TbR`K*M#w^*)m
zDMtUF<`1<xpd<D!J);~SOB3VeCvCFZ0=2go3~YW8@SEx#NX{91G2P$BJfty%4I!b0
zA;$JH_+M7FGf|Q)9Z@7i8Ar%g?$R^RKK8+Fni{PR-l~Xwv<T1IpIC)lBQ;y2-hmke
zb?n>OQ;>)hN;ek6#W}R3zS|TX#@`C+nVX;;Zz%aq0)4m{!<F*3U*;hXDcFb-;B->h
z26a0#H5LAjN+C!BHliPJH0cnP=2nfg{8DaIk6jGu5~%cNRrGDjcihssdN^egI1)}6
zn#I6dwXoZG?M5pofwNS(ZGt&0m-F(n+gW3dJUys9$YN09+&X=vtZ?yN_{Wj&nyr$Z
z<$7~9Y<lLCysDE^fPFeiGpkgd$%U5o$(7GFO=>$1?&pjhQUK3~vn_6I^EF6Y2E+aI
z!$eIW-#SWk#LSnO!u(Fx*OWE4*L!m`?@UMg+?mlA1`St4JB}|wx&g1TFTF4Nx2f;?
z<Ky$%vtu8~$BvDaRTJN0BK)N9ldGzU)r&WCki!|&$$5#r)eiTO){4Mq!w7*}`?C4^
ztva`kcR7EkG+K7Z<xOkieK}$2Fqc23yZUM`x#$cxmlkP?T`YVMs-t4>-r~Al{b!z7
zD|z)VP84&`Q1+8N3LnrP3p0p?ZZrSJa!Gw4e~D|E>z*;_4D1=oZO2}6qZ&w5QMuy>
zVFXlE^;sPfnjT(ay4EFMJeENT*uNTzuaEi6WfIo-n5Ey<0LipgRj4bw_(T<{%&ab3
zN#3y4FDM^0mJJDGR|JX7HB;6#5&h13KockTT}c_Zeedf@nHZC5;`YY`j3>|@WVO<;
zwb+QpGDwk2tY>>&WNV@Q=@BsbL1n$l1LxujD=#W~-0F8gqKY-ah@VIF?i%&A$uzYg
z?4UhkiG{gn(|;s0R|Ig&EMR8o{wKP+1!)3%!)slI0q+e+qn9z>_!8J`<6Wk>KIDSM
z5wkfRyn}p{o-}uB1=Eqr*e&(bz-~ZEhd-_7C?Hc}IJ?hmfd)yZh*qKyg_1=PuJP1f
z7$>3>Ho!Es3W|zqA(fN@P6l5oJXT>3@<Bv|SO32O!Z#S9_qwkf+Mt{P{b{ITI-_rN
z=2uB%hHWn0z9s1od3z)NOZ?9*)6YaX8Bqw>ne7V8?PUtd3EAV9BP%izDb_H860QZl
z4KpqTcz?}U@3Ce0K+<nvL8Gpm$mI8uuRHtZftg1lF&x!8XDtjgMW-|b^Y3#luy+?H
z+7PEYBJJKOgy;kh$rToK7VtJOcT5NIdI3m0!8DtDs*d*I8?`p)r0ZJBSaU3Oc$Zf<
zb9U)J&(tE=0YUNY$<?OJ&rwPmDvPMc3pWvQ-b9B6BCaI#O%io|Ca~?2NZJu%IBWfv
zr$)VInHl7&h}^(hFZEb|i&s$ulW5Oid|7Sq%m?BkA(enQN*_bA=MLcK7=z09|KKON
zk2D>Fx5WO2FV#YRHv4w&i}{VQku!c&$>eJe7zP+*?~wmdk|QoAx~s0ZG+m_oN+8yZ
z{LZ*jGKu3ZlcC|tGi?sT=9G_VUmH)0#^&f(`O}J)AEA-3_+U6CjDD;DnNk!}HdBqR
zo)Ix|k?|w@Cc9<dEA2N-U~&S#KJ3hmW{8GeGGsnpRtdV|tzpp&#Mh~2xz<{){XA!`
z4_<5vwmRloA6no^{RWP9$F;j4vr>$bj;~#&FJX1X|1!;Iw3gx&oit5{8M0v?f<(SB
zKJWsy!oOB;pR)d=<_^Qu-x?YY`}`pLuV^ZZ0Xm(yy#61AgM$xhLeqAzKA;UD&O(jC
z1hS-2{=9H~tjVhsW^_660h6>Hd*K^?jW4eqLnUdOHStev<@wmYr&76;T)eH)!QTC(
zH3q+BgY8CCMX)*ISfMEz!j5Q#=Nl+vc&@Dhuxy!H%yGM_4NT9W2qgX)YN~cQzL>2p
zb{w(Mx9@fJbD(9+k*cEk56MoUf<RC%%`++q#Q_3pB_RG~#74m7!ym*J%j1&y5Py*w
zp@m-f5kFq$tHLM@mG2Zu+~1};Zcf}7x(;dxo*_H&(c^=7F&@BChHpv5)-1y8hkH0}
zS+W@nG;7BlX+N0&gF&@dh`>0y)Gro+E>yY7X!@(3w5I{BZzQ!z)2#cN7k!iZp<xs{
z*ZCiaF{(&h|F2AjGz+?CqTsU%W~k`@`>Z|TVlcv<xquTis3xHeE$FOnh!jHU%TA2A
zcp1+K(!R%|W%K4F-?{E>z{&|LkzI$8Tb~nTSIZK_k7KB7dpkdS@b$j%kG9jkpJKEk
z<qp2I@x)E+=#2gYwP>P(-WW8WKZx<ZM4h>rNW0dE5?_8;LuU6q60KhdvQ<5lnzw9y
zQIXn-bN?VaDEJe~7F98v%MM>(W%S>^cEUoHMr_B#<*z||Kw2F10uSQU=?o>r<o!^4
z?&lmG5fGrJ>u>!w5JQ{`3LNlx9r5**M>`eh2UCAN0#t{Uw;67T&s>W}vvu)1{PIE0
zpuk3o^Dt|lR30^S7RZy%hL;+o*n`1S<9Wo`sS8Pc*<EZ*6NjBa22V<?1<<!s?;*bc
zcoLVQrn>lF&TJI|7Hz8Be5(l}jElp->8Uk<2P4<n@7Alyr6ePrWY@<Y#G2inUCs3A
zGe!^;uJaehs0YVHP>KvcWNtaB@96e6uZZI(atFO6)U7no0rNuSwElbGxW{1k#MNVN
z-GN@b3m<y%Ch+193(QkycEJ}^#Ki_OllFPd%>Pco2=sp=eQL2BV1ekRpaput0uW`|
z^BUKGzpf4r5!9Id2kTg$g7su|JW^S~1}~5K@3a2~$h{-G-}*85ktNP3^iRIZ`+xgG
zJ`_Bv{EWNGIrwnu5@_HzZAw98Jos6X6bODq_Q~=8-lhSY{SBX<N<CPpIDGIV?PPhN
zY39DUy$YR!c74Dd`Tsxq|Mym1i^EVtKm^c%Z)YG^-tybC+nc%i0eW`3;MF5w5xlcA
zRlhUU&hpmmOcEvR_DJ+L8hLR_)=wbB;o2q7kVR56-ga7JP3i^h&pc3J`d@#^;O%^y
zukdd7@pf`c`rngi+lpU;DTZ0-<|v2Ub!`GEJzd;@Ce5_9YmZj>-rumjhXqP0GkAB5
zTOqJ_8tC;ma0g_CnwN+14awu4;2Jg$ckMNr3}#G4W(*hvQk2g*_Wb=0(X-0_qi2=<
zJG_}MzzNB;uK(o@HaIIa*z#ZaCZS&Ud9RtsFksfHl|rVWKc)5nE^fMo%0X^mN^T+_
zG>_Jv+`5ErZsYqv3uOg$Nu!bhH=_PGd+mPnsp9Ri;se+LKyjrrfS(UA42Jr(7YcX>
zauit#Xx;}p;D+BWpv0XBH3^C$OD+YBT<NUV<hK_Ex09a0^Swd%;&-_9?IFkQ5Y)6T
z1$NixJJ8dHfTI_7x4&3QT;ZRq_a*R#oKOHI!&5L;U8w_5vJH?cyqr|LYE@jmUFo(%
z`QSKz3%y~r#J@N6Z@dHIfgK8+<9{u;2hw0A>|DkGgVQ<vB`qTq49f$BmZ^U#iLQ&|
zy@xtKsn+B~n{xL9y`L=(HPNiR4*&R85GZ#~5rh8fY3f;Rw$3Vd{KPXm!S2|l`HQyq
z4j|TOW<1gR_lANPAOJWi8I64hk*ZOG3c3h+a{0IxtWotvz^uc&>@?4hm;HNJ{il%n
z%nYy`o_)rCe<{eiSEJRBpAaf|U4{VLawp)EDcY3^sG56<6=9)OTdQ%J&=&jO29qfa
zL*J<;P@nuGayc<jP8-JmZD+QwMh94nD58H9tR(+JTg_z_DZJ=hD75h8sCuj}e+Epv
z+Qe6nWPS!D{LYG3KS4N5%F4aZ{11X$L-F{!FgNPom#O{$+CJY)XIs(K!Uljt=??H`
zF$Xu0S2RdUlm0eQUNxy^w7O`0Nx6nBTu+nNgtD)}11nT_y8}XAKa!#xdxpI-uB}xa
zkN?;K?3CwDjwb&WZwd~f2&>`IPoU7<=t~1FUJF>frj>b!f|1913yx(Th1b<Vl?qe*
zC%M3|#<c5+v@%AKgJ7$HRN);)uS3Quvf2FrIyZ2N<-s?<Ubq{?0%i+fecE^$|B{p@
zC@42As8oT=0K|6|=y7N(XIon#uhcR#p(WBCdJb3FC%xsRP+w}i{<5aQt(L~oiRg}B
zEA8Whpwl)E&!X$@f-5<|GrBB0g?i9a4EvdZ93jEK*=;)+Ly-K&>zmF0uC{UxRI*nH
ze9p`JpL+!_%`U%HKUKVE#?T||m|PJeYvq-nceU#=;jFkXyj|FGF>s-KAI)?DXK80B
z9_;-+y2;xLugi*mxnup!NkIifE#4^|R_LMN&26VdycvZkNvB}Cuo9?fYRJBD2U0DT
zqKCAiQ7eDcMIQ2UbYt^O`4>mHjJG_FyQOt29d`B@bKX&#FK!3~Uv`1O*p3X0aO<84
z@!Nlkb1JU?9WqzH#Ww}9uik3TtcHm+WRR@-Yo~ElG+}w;(T48=&4|Oq7K+w2X}?Z-
zI+J5^{Pj6J{l@mhrzqSn#=Gd;NU)a-Eb1BWPD5&ID>{jGgqEf!=+s$54&<|t%QgdZ
zx*qAk?wZ$iPWz$D6hAgc6FcZ-tgWs=r=AD79%|#hbPB~_QX7fKKf|X?=;0yAPE+mw
zCz5Lfe(yT2Mk1I0;wmJHKuENeqj&wj<}kKkde?nTumY+C{S+g_s+nI*+J6Ivs6NnZ
zw4RHCcs?M!Y&CVS2kJ8IK!qOVaa9`n@cEg$(N_1-ZBAb1u`zD{yBR<p>Gl3!ppg;1
z_<icn;hz4lyy0BgLNN(y89lAqbMf)EHxzRR6jKtxx@7h_4T&_vcF)vmfj{y98-{^G
zLT!ieZludr%Y|^RLG72DZD@h8Jq!Sh2P1rQ77Yhv4Zm+T3vPZvZTBjbA5=)l5>`@k
zT~YMZNOTnPpe_Z}6_W<;z)^C6nv{9;hCq?y^m9+XTgWKeQmuL@Hu38>sHRwd$>BE9
zQ=WEvo;KhNlgBc0oR~cTeYoAb7Q!HgXu@*0{_Xwo01R-r#J|#&s=qp)nY*5u=F#;o
zk`dl~y#p8_nuM%dc{UyvHGmHmwcOgpn3n9C21-n2Ioc~vE@RfMFixsP334Mv5h>^A
zo)D3nIs!G&-32mZhTQ0%GL%E>HVK)umAQf0BW8EO220e%>bsSnQ4LtHFB@*%fSLT!
zo4;8?N7?lgeb%E=nWCUV{e~)V2eMK;?RT!{xvbN-?WvZl#CkXS6gwIA0I_>A46z^t
z^n?I43K8wbGGpU5H8r^e{OFy4wU)?QG>ENx2=pTv&?t(>T?25eCj15(N%Axils}-y
zuk5}CT&M82Un7^Hf`mOe_F@p4ATT35d?;O=+h<x4EwF!-cI$pwm3CQ0=T_M@4ep5k
z^8+)_RkOouYxhL5Ibeid3YqTgK;mf*IOa77K#6|vKJVf3|3&V&#>WT==z0YxX0jMK
z(U3WMG}KFu9#3*_5X!lR`L)s!t;k}qMt6Y4iY^R8`ZWs${~3_Q@QudDVoaZZ)jZ&Q
z2weox>M3AY^vr!=8IYAO=u}G?ifHO4wRZlxQ$JL=IZwNFg@8PrXF3{V0L;#lKu*!{
zngiUW(NGUQdjB$I8jR*9p^5WWfe9%=Od6wac~A1bco*%TdEK5xLxj0#h+H2{Maclx
zu%m=>Ye2*kYGtQQgOi%t`gwuQx8ap12=F|Y4ouQ4a~bx=X%B|D*U*k$FhxZx5uBZv
z9Cd7Dr-@)na~IwCQ2zDlf>Nh0%9{mBg)jalEfYslUmLs5JonBNnm3IIa?U5Rh2%43
z%hznKM{Qm;)0h*`ZPIm)%0|rGzqaNcAZ=(j!+#L$n0u(`xe@Pa{;lEUENbuH&8&S^
zJ(i*kZnCsmz`^UUSKHyL;rlpn|22OV=+Fye9-0#PNHL1PT)ylS{4o$(v>$2oyt7rd
z(gWe^qFwDg0Kq6(bwA_VVEvyZcYc1qABy7jRqkJs0OhTo7=R}#u6_>Sl@R7uP?rh2
zLxOB23?TvzSO(P)@%=fpUUdUmboSn2#<A+rG6o=UA_Ij{lhs=)%N9<ADI-Sk{$5Hg
z2tGn$5e($-fp3GXPqO|E7)W>YEd=+dNPYi$w$4x2CT!h^hm^pQhEU>(wwqU6{F@{2
zuI~0A$z<$@uHrXhdN2GZjx0@mX(hO)BNz+}(;kX|Dp;C3fPhnqnEv$s_qzh5GIQVK
zjj_@8L*az`+7K|CJ!o^=qL9)k8-fOTd=p>{12|f!^X0eDm`R-D$fwYa;KgQ8dR4E8
zhFp10FFlSoc2@VNPxwT4rt9mfXzKy&(s={`bB57?OW?efQKaaDpB0R-?&;WBO;GSj
zTZtUYKTx1WRrnat@`gIFvSUR*7ppM(Vig-cFG_UzlcW1|qRe89?#hMH05y??S<=V!
zBJsilhHA?oCiC{ZbtlksJ&<8RwJ_Y$ww@6yjW}m1TvUWuz_rGtn^SK<4|MLm#a{gl
zOF?0<HNu2hKdSn*GtK#0x4F*<Vs~(={h^t>U1rfc@Ah17<Mg_|WkWlD?Jx*1F#kCO
zKDu@QAC``}kE}&xlDNb6=;ap;O^8?Ky2CkFA*Z_yjXT&-T2+HOU{P@6&hmDjjptEm
z3}+y2N73a*NW+m}L-aMvbSVRjif7IHAaS?1<fX(z(!d3|*ippn)BVR`G7r5VVz32b
zNwD1V6^(Pk><Iu&2ugr{Arkq2xZ*w9C~zGy#y_UXYFZAW4MS=JXe+OVYl7$BfJ<cP
zt5L2(l%o!m$<f<V<0HPiqlOE`NK^XOg&Xzb`KNp4?Xlaw`49jcC(8VOyN==ZXvEwK
zTcGJA3jvs#$Z6I9*n}xLN~tE#9RcT2J-GZ$5bHOwf%k-C&vzfGJLU@Z?+@ME6^+yX
zN$x7xev{@rP*hyLnQZ3W1cojJgHgW8Zcc+!`bybmfIEdenxsCLK6jHxBQ-vNzoj#K
z3FnNfcH|Y=k2H%fnKv3%WPLefrj%!_sXk?RD-5VBuki7Rk~eM^`7Bd!uhz`;;?B$!
z-83~{_x^J3z>KToP+Ssvb{c3kQm|*0LH@UZ7jF6|hf-O!%Kif{?$xO07Vw5hcuJUw
zj1B$chIaAccJmtD&H_h<s<#UwMU!jQIkB7s7un_DjP>wr2X0n;Svs}3J+&FG#^kaC
zbMp=acjC?J<%N0TNk<86SLFCVbZu0Ihbk{2>Rr34bQRJ_Ji>K&3UW0%KuOKNdoo;Q
zzvsc{nb&RqRS$YG{0IjBW8DRtyr?D7&?4ITG{GX`_ZjDr6L6I`E^M>yd$(W&SaMIC
zpCH>ARkbFSQx9WKK>&XKDZ<}y2ogImneW+PTBp`tw#INu#Auj<dp7o+90vcr9u~bG
zW+Q6GIc3tk<!sC0sd$P_oe6+)T95EH2S}P~MbQ4>*fO}pci!~iXAB-SYVHq0xFWb}
z-aCEXSgB}3KQ7(H#P~za3wOX9@AEcR^gN8#Y9&Y@wGu&VWEG9sW|&v2bT_tr)zs~a
z(zW#or)%D1cn9q)%o*dmt4<}T>Xhv7Szq`uo%KWr$+YghEBc&v=TN`i?QA#8Ys+Yu
ze@hQF^!<ILe+>efSJg!N4Y+Hs1izz%OHZ&<|1fJ*7dNzV%+Xq@#Pkt8NNgu9pFDx?
z%(D}-N-Pviqw9&XNz04LgxHC1pXv(ydR_=_{`j6~Q`0H5CNR)de{ap;y6>(nTnuki
z{Qa{#4$uX?I-chFMZ^Y@T<uLE>2(mFH=b(hCN~TGO0{ho=3yjG8T1e?kZrD3iT#L-
z%XJdSO3cv2by6>pcfZ>ufOR{I#5abZq9ui7trcyUO<^{igr8t|{?tQHh!r=deJtVD
zAvhc9^L{~STW{CQ5_8-#8*^!GL|M@a7V^zsA|=A&);6@~EaEKEE+;L45Pfij<IA~5
zO)HaL+TqPd;O-qGc(JbB{_8BGN?K1lz-}vH?kM%FG>9;E>5i^Qdkenu*De+m2X($O
zh~6MDD%@bVtYKg+!rQ;SY|wk2=aKK)XR(T_JNt(-g^&H?lsTn|izrsPxrlc7*+=w-
z>93@WkMiIAmy&{_BPHsV0hdAB=&u-+UVJ77>QXcP7c3OR%xICa+r|-|^{QHESRlD=
zK<3C&w{FMJoS;5^atWMc$LajiWtl!~c=cCYsev)xVJS85S4Bk2lyNXrg=+rRd?L~i
zkYu_v(-FlUQ{dit`sy{hijhxYDhunSaO?9Mj3yQxG&Bt&4pgUUldQCwxgk07WOda-
zBnCH=+F{_suAEmww_tKcPr<jfQ^mYUJoj_c)DuI?G2|#|t|9UOcWU%x)0F-VX`zbD
zC=V*pukYpR0cd27p={QSq6dVckvZ<DwHqa=4vz#I(5XG3`72j+p5<X{`VY^W5J}*A
z-QNOE?E=9&*o0=+@nkWL;i#6U+!xpGEy$-n{M6#hublQ~Q%r{@Wkk!LbiGa4DCJ_l
z{l=hPEafs<k{as~t>+bi%cE8ze0Te}gKX|^d$iJ<q37tP7dsygrzhpT*6!azIiPD^
zC2@MC@w`q}HBfg)L17q`l@fn@y+3O9=X<sY{P<Q0KAhNcHQ3P%L7!xY$~4Sc0b%$@
z0Tz$=wpM<Li_4Gdh*kw@QsnQNRT9qy68l=xIS=g<&10vtuzlQN$~wN%0*Cf!{czPQ
z5A2wqqOQ)uK<`3M4V^2tx8krmi}7~LNc^L*P$pDc*vO}2;&qOs>KgKAE+6T4HHz$D
zoBkrO^ch`>>DHVXGh3spBq>c@3LO@?8@MWO@^F}yO{ihM`8wt9*ckAV7U)M?J?*y(
zy=rm#6n#Jz@*tXu)_#`P>vPR=%F<OKh3OxgryYRI_LPajJ=K1N-=%l+^MA>wFscPT
z0|r)iP;AfJIA+rrUf*%0Ae4+bKY#U|z7#Qx5F#{*jt^04q<)XOxlQud9K!SowJ@><
zWrL&?Iu%_amxGCa(mj=O!jt#$kn?jOx+_7ahJl)cM7f^asAlc9k=ua+mq+B4k)wxw
z6FNz6-eZGE+XrULez&m;=Yg_GV5~(IPyP#0HAOTYLw<1HX|cxw@l?Z3HhwoDoRnFg
zfDUudNk#v8=pa8(4La))tv&m8y_b;iDDA_LtcDj_#M*|3O?CYtVSf0fs52>QWj(LF
zfzy|7t$axwH8ZHs`|FvXIN=TII_15#Cdr+Vp=kcbyu^h+8@u-WGaPjQmvxeq`WO&=
zIi{?t>q8E;(iV0N8NaGyzN%p75;BE7-`o~}-ErC%M~vSQyLEDZPI_kWDsmZ!PZ70=
zlpD0|1RNU1KU(&`67G`OkdjO2D9v=k>F~8U#$HBCHY;=Fu$cuU{G*7fb^cmF(r1#q
z-2SZoqCKWDu3dHyyXDbCvq?14OfS3ZdBc&*hxR>!*3J<=FAEIPFyy_-Z;2dhJg8U#
zu7}(<Ak4D-Xsq%@^}HiD;(B?i#@U9z3NV*NV+A$)$nLgWSpT`T<xa)-TRqd7UWChD
zxytm)6+Y8#O_4xQQGek>zQeB3ap9wV7t7MrhCJ+tNlgi&9n2?+P<98_syyaC^*pQ5
z>koZtjK*d^txmb>+?SpK>ig(ru?1ih!{o~ong7aHxITbipi_vY+(ERAx#h%Ah$Tg3
z=ZVC?^b3K~u4o-<R?UM7!H(QNa93t_KiTFNm6qGteuV2xPQI%e3-)@bdL$1$2pL+)
zdC?`-Fa+xq`Uz3;L_!=5+S>O#J=OQ5&(oR_3p6Mt-uv6Cp=<O5mL?Q-lD`en#nh7&
zbTmp-Qfq-bm%U-;gZXr}vw;0=auWqaMEX8!I5$Re7$N5g=JlPM9w~&2DNDR_!{v-}
zMAo2s(U!y!uTyHup}f*rEzJ#?^2OK(np#>qjE%|f*aO|`MxTHbPgH-13zbgrAzM(|
z$JgZ%MEM((e*<gYX#P<86@ehi(pa*s*3~-D@;H&9lS68m%`1)KtOjayjeCGgSQw@r
zBk=)rH;re*2KFC$JVjJF%!)mK<NeXCG2{<s4m;hiOJ!cSHykXB<$Ode9kuhb`5*!a
zU#jC18u9-NEHzn%h+Y}YQ44AY?E<#8m_gNcv=H>YM{8Npp35Gega#?wkes_^%4$&>
zXf&pg%Pk@cUV>+pc!q<x%3&w11m37tH3m}gu3yiz9DlF1Skdp9%di}E<UT(=EI*v$
z7ehXxY)?P${QF46rXbDCRy#s-@8gkCWu*xjzx88D-bB1eLdnB17wM5^^7JN&mdk>0
zqqD*KuqNf2NAZU&B?EbzHKRZkNE4Rq;Q=EovHSV`UtYBR4sBEd;j%&ai%HAR#rM9l
zoNqU4lbolo6rUSotA<Bf*(42zxU{#RDsk@=^A};*_DG7owSUOh<$6EjF#@K%5&V~V
zh7bJ3O@5?tGYxJwm8O#UT97=2B{SE*#O!7%In>Bd@DfqQSKY_#CE`N0CkYB1z$g*D
z_%530m-gYwdsG}J@&&Z$y1RzHTA@zKu^&BExn@S3CxU`S|9%qfBt~3!f2JTeWA3fP
zv+EN3(xfrsYQ>#H$$Xbt=3L-Od3CA1;-G36o<SWUVJCmSd|h3#8^B_M;*X2olRg=)
zqnK8kQbJY7tcrJfd-{!DoeF)&MFRGPpHCq5kMr}N4Q57Fz7Ibg3C^AOw_3ydF-RXK
zgfK}+etW6W{+f}&30++7&ZexJk`PQr^*wXRuu(c8;pzjd4WXf5@d8^QWzzWJzJ+ws
z1avw*<aeghReARNtk$M=7Sk$ZC5=3@v<ec{s>qgO<u#|kOth%jSRCm;T#m;FUW^Q1
zw60rRVw2B2M@S+GwS6y;{Wu3QX~Slb#(Z@fuNa|Z{j3jLc;K6RPR-Y^tAa;3J>1t`
zFCQLPzrQ;w!+P0yuRX+BwIim7bc;(>c|WOKo149h7z_*H^;U;p{v@%GXf*+u9Xk9H
ze9SiMv|l>^LV%WUqb8W>e1`t%8v?e&Fk13BU|-*VE{q@rK^5UhZSC^FCk+1(qJ4WD
zXo%v(+OFkJQ&Hqj7yejZ{^X#V^>>C)E)b3~d^X>2(Ds5Ajb&(9Vu|xs^>#qQuX))O
zhC5U1Sgl)lM2;qECE>kH)9Bk=wIStAM?^4veoDYV5nWMn*Cf=ot9lvK*%lDeI#q`?
zP8z_p3|3pCSn4U>l<tmT9FS!)Ym?B(is78tmFxN7X+ldB0WlxurX)2c8GWftHCQ<-
zH;$bK&(4bpcV_|*hYEJie%~gf^+@{2pghm|A}zn^|89^^Luld<9#_(ycENM<tfFz^
z*N?eWuSa(WWDj8S48GedVNY95l|+1_C0Y$pf8Re#{V4cOW+|ixVRQG4yCT`Hom2ph
zPx6xaOmG|=9e%??-T!ElwRf4yow^M!M7MOlGfWPS^H}2(!g{%6(L7`>m7`AXk9r5O
zb<oYV+l5zotBawWiI`0n`zqxlU#{+N{2!eC1yodT*awQjI0y*H&`38Z4bt663xcGi
zAPv$j-J#SVl7b-J-5{MJAsvd+ATc!O9sc*+d+z<dv+iAYt<kmi>|yV>p6B^JExuRp
z%l3q2Itg*ZF_g|WPJP`@5Jh~??h>bY(@OKT*BD4EW;ko97EMQ#wm2OPmwbuQ+NXe@
z{L*(}fb@9Iko0nac4P#Ut-sGe)|JrQSDXLr-e2e{h|PkyB-3a-X~{4yN(eC)tMN^P
z(~9AENqDTsN#cFY*=uuUcCK>oI_T6%-dwpG%LcKNVJZUkI31C^-ZI50-Wh?(suSmC
zS~a73Zbq-$S*OW#KlqZ(#tbhvGc(dG)bqe_a=-<@qf{CXqqGN;NAaEODDUYMGwIp!
zdfE0as^d(5^lY|lZy9dnPti}_dP6EIuD_|=4I#$^dSh7BvfYP$)LBtQZH3E1OO@M$
zr%y-cRH+&+-Ekkp66wk6?Wk2|{eFLSmyIhoHOMOA{`z4ePmgsrw33jl?wo6x?y9_O
zK^2B`K@{+BNuppI{Zh>5l);NdVmnH0t0OMx^YVppQ1q?o#w52++^Hb2q`yT=*Txi}
zaNGQfF!uTri)PU0F6SbgDamP-?}%V~8F$WN9bSu1P(sindEkKWq-n5;J2}vLOv}I!
zw0PM=F;29)DW2naBF>206|9Z2O<g9D;hTUSXepw4ty7_WU5A)_h1-6Do)^}bahZED
z{L$rt&)}p~E$9cGf?i++mGf}58uR)iAsg`{Lt?ys@`EJbW}Wo2y@re3b3JZ4#uk@^
z$fLc_Y%|2%sFA;)0XMd~fwt;@*4wAhX(bgH4LlypL_v2SvRHyn<8-iW+4M{U54Dw*
zesuSti=KZgG+|RVg4S1qcp)*{jBj|5KR>KvCQ($!#=$9=Jg=*t!XV9vX6yXQ^1XA$
zwjuMg<XI|?(1}f7v0I%0t)TZ3Y$ORHr{UoPu5V+qhi(PxKB%x_ec3lc$@_$N+JqwJ
zCq_%~DkEv-Rfi@-EyM}br|oy1;pF9>SbTi^ylw1alC6{cA&I=K=K~yknHT;dg8kcM
zq)zQ7u{?fapK20$SOn&R4Dj&>Jk9i;QlS5{6<|GEd~Myd_-Vh4VsO=1Vp?Z%#&N^(
z`Uq&;Mo;Vrh1Xvh6*y}$>*~wZttoY`+d#G8e?Kp@>b^J+5O({NyqZb)IE7>bhqX3t
zgiNND;^a{rTA6T#ToF9@A(1hH-7NRB?@GP90V}s@o9r<yQ;RK(sGECgzvx;|n&zl!
ziW*;LzPQiztzq-F<3dZ%GwHXO;`4p>`yOl_$RfX1jb?e+rR?5~gOk*84-gz~+L?$d
zA2$7<tP5Hu*u|36krVz=ugDzyWHcyL$VO+&^HIUley@loM2}!TcC@uDH6I7~9^bI_
z0o*t=p{6}_2h{nA?qFB;nw3%{{Vex+Rz6L?@(S$NP*D=@q%H)_G<;m~ua&eIJGOg{
zoM^<N)GHW|+?3Vc=uReza7)4MW8noV3|aFYsGR0g*9Q`eHLRlPa~Q=fkAAzoJnJDi
z-kGcL*k4LhB3gaOl$%H&BP+ho+C2IkUMh?r>fD^?wGXFG#p&d2uCi0WJbP2U$-nhl
zu&gvS4Q^6276Sj!6?NY-(^~jBlP-$h;moZb%Q2CZ;N$SAs2f4Bix?;U_ll$3c`G50
z;koYGr^kH$-Ah)UYI?m1Q+nJi4R0%m$(m?YX8ei+wtW`=p-lU9(RJ~vAXwY=b~M&v
zU#}-cWBGoP!4^Xr^a-4IpJteZCdt6*{ChC}HebPAHfrrT!;8~00%0l`IBtCE6*c<S
zY{xv7;CTS20PGE~Gr|HLvuTRaURh-HL7w?+ns@&aUs7Jc8Yy0QjI2jkz?kJyjsJGK
zK`!D^{RN>iqu+ZplPqe#bZTAuzs;a9%eK?blV!(sRJ|CuNnq#-h`E9Ujy-@pANhu+
zCR1{?|5MM?3tg{*>s0iz#Oi~wfpavE@zVLI$0*`cMqPtHjZ4@*LeGB9A9z`$EuG^*
zS%3z6Ex!Lur9;5U$Sq~E@}%=>YOp{ud{P?P84T%ovFL6W@^p|Rr!;GwS6!q2-{vgW
zxoY;WcpJu0lb+#8r6d}UASk{ahUdQ~iB(M)8v<XzH1m5f{O`@iRS2fkB!YVSHR~<&
z>|tcVC%s5I41)eJ%=?5w(1Z?CHYQy)SljP>m~(xO?3mUdj0Wj;*3T6Mei_wIn9*!n
zHMF$e`VD(-?=oD5lvD~YK}pNRW?79FqDN2Ft{>@fRA?mNJ`P9Oeo&qK!f(=(CPp3c
z7_oZVS^5qQ<>>Ecvu{5HP%}CtwwvOP4h-~?IP3?f6n5@Gvzpj!c;xLLD^DOQ&BVmy
zO*crT@h7^TynYFDF?xQukh>p^^DgPVhp>?4Rk{GnhhCf@sg{Qh6pP7r7fm<wS0<oK
zGK-R;c%Y{Ty3`ZhvOgN}!Kz3H?U&sdX_bbhtrqbxlVkf=<freK+IReed*oB1-FP^^
z#E>=1XRnAmG<sJKO4>zGnlF6h^#1v8>94$i(i(naEj~9b)&11)Rw=b7RudmQFi4d4
zec+W*<F{gt>i;4R8NS?C8L?fHdc@l8E7oov;VN6kZ{<a2NvPN9nDT7NcoMNWbLl=t
z#}pR9c5@53us^vr&VO9k6$y`FEEv$NvmWN}TkkYw-;+O)6~pZeHlq<1mYT4M-u}dl
zfxpEvtiw&2uGUJ?MfMH;R{HVk{7_!f+dpkvGAOw*x&zWGu84%qx7Sqo1QB>a>#l8^
zO?Fd`L3PQ(f{0zMa8?bdfY;8G&fK6Xm9%x^-G-Kf*I^HRbVs~(e)nV9!u4C%5q8Pg
z+*Qmv_00!jv7!GA+d(P#i)xM>r-~30oFA9k0Xhw|0VN-hO>@R>R7@>V^BhmPoq<<V
zglIz#QhV8tZgri!GGg(l%{|B2_yj&9-^wxQte!`SMYoRie0F={h<#1~4G%L?RQAo*
z%%qaI9m{P+g1Yf(OmAV!#j)efh9eVZI6UykH7MT4?bG7|42F5}C`{>2*HYi=4?IT8
zeGo{<zbIWI54rTOLoM4@$(wL-P_Ol9yfFd=P)N>V57ipP;^EVoZFDiAb^#0&27$NO
z=X3X7j&3PmH;Fr(V6IOaS{dkbL}8Ru^*=u!byhHW{*BCpucybkk23Wr>>mkx=eL$$
z*CS+R>Md9%bB8<+W0+kMykgSR5jEyc>8XZ)TCDgzy#CO9^Z3>hl`To7I@aA?WoXdT
z1AG7NwjH2g$InHkcPKbLu0uZpGnr8XHVXe5B|~%3m}~(F!_8UT;$}q;y8kawV~i*`
zIYe|gzto$K;A#)F;i96x#U+pmNrobDql8g=mEtZR2Y>iW6@fWT5Yn^Bb5-|&o*0c4
z+Bv`?GI<_`SRA~?d`CzhM8*>qFB&-JJQlN3iN-jDTdXe7vWnxvm!+Zhep;=pl04}4
z(&b`LP~Ci&q6G7Eh)UZxotSSMPQJn?RViukcZ{K7AI7i^Zg+d)TrTQ~Jlm!(QrbN7
zS&beK@a@v>^Jbk7?`((cWD2KH+P3?=RayT1F^}zL%Fx<yf1%`;_l(BD31;#6BDP->
zrIk^8a^H`!>gS_)1Z?UF_v1FlGL^G<nPQ!r8T$Kn2ODN)8oP9jUUUEcm>DLB%k948
zXB2P9<;b<D@T;-Se4{xaYzFc6v81B9ETsT`ad&mEbop4xJ{^-~&c9M~BsiW>mX@!O
z<cfP-zaeHmt8n{I{Ci42ro+AJjR3dJ>m0=US8XnXRU+TwuV>@)lgVuPehY%mCAnQb
zZ`gW6{Kp^(tmws$q5mq>5tsHS9s#cyhRWcG7*&_d3!hilb9DDuWykD?)*QaB&1GKg
zn=h!CKZ##2QIGM^A?ZWY3x3MOIjr;#)Oh&cQUXhRLZY-TF#qU8Lz4kRreQcc9ILBh
z$<CfqN3AGfaN*5nsP@f%CkA|Q>SzfGbZ$AJG;1;kaAG?QX_e{H1sdYBI4xa@f^34g
zk{5bE_o5VtZvq06`Ut1C;-{SF#W`}hX17b)DnI}dULX{I@?Ka7&mfm(uA`hak$y|y
z=Xi19P~o6+WR=q%OO};R3d^|jJ$tSePBR&DJbXNjIkg+p^#1+&?bKxW0rr+U+_G(q
zJ?Og%ro0A5kqrr%__mn?&Ep0;8hi?wDq8f0F`Cw}>u1zqX=5cff7=xAPjeAIy4g<p
zYx-!58IY$|FF{qr6!hDWAF<fU1^paMRFPAJ49>gK`ysGgjP3X8X^`hnqCpr5*AZ4e
z{zg*6DsOtcw{fPN;?}yE8iwv0|Gx&wgT**HWLP>bprAWM`JtcRv@WS_Fz~pKHx>qz
zqjO7{5+D3%?%X5YcxIm7ObqD|3BpQ((^hYY*ZyGc?JaE^ejQ9VS{kh#?_3<)YOJdz
zIGWLOXllIsd&uWSgD4HtaP@}4wKHDgAHIFs-z(ushc_gbXz%-@RP5rBrB1DBd2F$t
zbRWjC#!zm5jhCr>JRG{VAE9UJpPp<2A|2Q)h!Mzy#d&`1>i-O)*nuY?VG%C`yYm!=
z2Vsh87-L(^{1X()ZJKX_$Ht;HTz&36e{ldx+Niy@VCjJ2W4gHuXA5dzXY5D9K8r$V
z^TSs9EM8JlFu!?{elB1et0!H+!;(r!NIf`SGd{at&$3)0-=-TGO#8jFt{B-F^<yC~
zeHeEMjJQOx!A-;cvDUDS(Q=lSSBS|kSk=8q8O2OM<4X6fsBfFcG~lkW%Y%lNW?e2F
zW3$;EG>?w0IyW0?Xgs|;w9h$Wbg32DuT^~x1ZK*;v$o+H*^LozaDTl&>SoB&mSPC@
z9}*7D6=%&d*CC9>eXaSR25+af^k{1X;YyCP-#B$kUVOoZMJSuDi+@nr6qC*6=t=B{
zZ+*|ezEe76%pVv3?;}4^Ubz>Kdc*5B{aPi8edvyLl_NT(bMi`>;<?SmtGq|NUr+0f
zpAPPscfvpHn>eMk8wz<xQ=H)pfn_}o_ywKeQ>yj{$Xb(zVN~Mw>P6M#{(oxBeKqyx
zQ&*7(I!mrRORf7X6N(NNANjv@&`ZMpV5nT%HA!5IbU`qqc7%~(7ziNzd>no|u46yy
zqzgjJ^k_VcR)oFmYx#Y>^2jn4ONV$#sBHZdwRrWYsaNoKE~eI5n)wKN^#@y|w{Lij
z3Pb`s&wsJexr=B@p#_d=*>X|ZI4-Gvnj~0VQ-=Vd8U|jR2q$%DM{T!g5*slolrM;C
z^&6j<Tjy)_vxX|qYzE!hFBYuEjV1R=A~Y>A5@k9Sg!Z205b%8(V0k@J`C3s1+Z4@Z
z7Izz${#;Z<)H{=QbkyJ{es=7E!lO*at48!{BS8Yf$2fzODQE)iQ>H&7BX8KC2XWNc
z*ZZ>vjbfZTCstXR1qgaCBlB?Fv(2%Rh+mXU=Lzoa#TAJtwj@!phZBjSbHjth+Rv15
z8T+p@jwVKv?oHpn_H6gZYoF2+EBJ#m_A^n`DJ<0eUB<Zyyw9rvqY2&WjFu)zOvQh`
z4*omd$QER_*Sot=n3}RVg-X9rVgAL8RuYeP6lOc+Tsu$=5@M*?bwRer#D2CTX>e=&
zrcCSYu&qF!5xv6)?(NCaN8XLs^FASF4i3koJPtltCXZ>CaCx8u+&k4}h<P8Z$};uQ
zsQT~w)o+%!#C=n{J@{N62K9-|keZ2Ee^MQD89W@tOH>lyeK!tI{sF>L|F?uYP=OEj
zs|iK1kmd&-=@&GKc1n+>KE6pW7AhTC&?)w$fUAE#(H6shP4i$^jzf3iZH7w-1u-l{
ztg}zFK=|a>&A*+cR*_KojbA$1M$OpP5jlVUiB1X*@(#}&VP}o;>Exc9RFhR|syriS
zdJUsEHzjB)-Ep=iUHf@7R6MMva<Ke`0XK9s)8kQ_nDTzH%%^a-{jr71qo<>-I4l$w
zk7wxj)MRp1z4l($W_dj|U9edj&odBmvUv8+Zu=0YNwZXxT4@4g4h=5KXBgH09jNxW
z^6hqNTJpy|QvKsh^=eHGE&#*2$ng~$xJO@H?oOP25aDOe5lHkw>T>5nDjsWRb#E6a
zU!sM{N8D3j+Ui;%NWwi~U~yXSuCH!p*0evoZQecCAis>qrtM4MAQDtMN=Mi@<Geb~
ze9<t)%s|X{K?gBf<_=3<O!Q@|K2}JvXG1FhV72+pG9|sI_rnu@`Ikz_jI|H2WB=3P
z-ZUO>%`k2==fD#<rDW4<%`x69_vQZ^WSa=r9UIi5@?PNz<i6y$gTko0C<&Yf(~_Ux
zoYDRA$cZ<R!T7bTe1?aem_1qkvMpJiX*I}#PW+;F`<x>B6>;YWFK%9Q2%X%hjagr0
z1qSOcG>GDgrZd}@XB59(^YP+T@=B@jH%!&xFTaS&<oHe;>cwBU+3E5}BzEWr{o~$=
z5Ajy2Z6GEjwD^SbsbU{K&^;A=dA}ZCY6B))0=Wo3P<y|oW2UWnxl7mrNP;g#PYOet
zg(Ko-_L3JH(c99olUe2U;_Sp=*F8gY!5)6;GO)SFeXZ}66#9lhIeb|}Zt8489z7An
zgeWK*lreL2``4da)fOXSZWtmeoag`WTuWz}nD(CZ54JD9Y8bO#USe*sYAI+5@8Qo0
zL5Od9@0*zGOPcFwAC*8hX5OBUdRjv1>e=?AXqlXL_XFS+A-`VbQ?5*FWxZV8X%Kum
z*RjX4A#ZQeBGR5B1k<xyIu)0((9x%ca(sd7{J#po(7?Wov72Gl9CX@Onk~`2GsDXN
zA_pszDjT^K2#F*vhO;NVBOZJkO_WG1ZH%QgL+sO9f57gTa{y9w^jS#RA%t}rB?f0P
z^`JL3_M>HK7Hd3~^3dOMSe*r-PP1m(#i2e5Sike+xJrp17-8M65>FGeL*rZAkFwlt
z_eq0HumYPyrRUAuioR{V!jjGjHBi7pt(1#NAcnYMtraY;Qz72u=+@5)<%T};1!3?U
zDZ1e+Gtz>?{8*h$p<N4RjKcFw3po1FH+H6@pU%RZQg>@=)mzU!!qqWWe>%#1wU>%u
zjH?`+i|N)ab*xNRnxw)L$VK})9ptJUvr3*Pi{-LG93tnU+Gk9DoHuqEg8!rIR%FP-
zA71chF>blnPdBCpBWfO0+`x*?a0ju4^=Lm6{WwZg=YV%P?T^gnfDmm7Rn>BLcMCy?
zES7C?o$X8+u|ZUwhO54AQ;mkA3&(@{Dq;pLEUes}nTkF8N_~|Pd%UA;WPu{4{uyE^
zZG(mb3%z1?tLBZ#I4QQ;2;8LIM-KQunGSyLcxWY{--R1z?E6~96+~G7=vrj%Gik7_
zsc?T|ij%C0cbK4`5XNp8czS~Aw*@l4T(R(3`<xn3Yc*V})zLcVgZkfqz^u_W!W4~>
ztA?Lr4L`3jn0j+7E#iW|Iuoa;T`mdDE&D`@;s-jV3rz|h`KKnCmL0`C&=hdSu1}+o
zF-85^<;FCRn!-i-%7<(062u{9Kgu$F{~p1l{6b#@gfy2>c0bAaFLNinBNv|lsznV^
zfL7V9`ea!^r5JPyHED8KXxPlA7>N?N0+!WL=^Hfu(Q)ZL@xv>z&5|ymfIR#izb1W1
z)3DPcyBVsz!}30ew*=F;axIDamSB&r$`5NO>4i6*q|whQwMv^$T|%abH2Wg%YKr)g
zYw~|(SwgeiqZbKJ9|wtGItp%kv`WzTnP40D;R#pbXF<uoLv)-{j@<^|mJ~8g4}MHN
zHuw7oQIp!*ead<3E>gysJ_8fNcsWR~$-)+h@8|;g7yFjf0pzMtUtUxlU6gqRn?ZeM
zo^<=lzJK6y%|>5`J+h5Uybh43PlmfLp;?#i<mrVL)YJiQmK#$SsEvQfpW{jKxhw$d
zkJdgxrd(DGdx`-Fp~>RO8eI2YHhn(<v$#k^XufRgd#=A!%@tTIH13P>Qjm;9yqDZC
zL9}10hfcG|d~4vnFkUwQ{5_%7jCIQUwnv2?aXhd!ybzJT&72iveExCu|9j$v$@9p_
zhc+Z87*CQQQ52?eUA_MTKbY`^bZ`^ue<?38^wg2Y4UG7pVr@<&>6zwc&uL!Z+l{uQ
z7*-Y`Bp*>HJ;sx7kcZ9fjPvbdHinh1@CLn`ogeFV|82GRqm)XmoBak@-ig9>W1_`H
z-c5<|!$|5`jg-f;rjU#v1nH~Zph{FL&;6bs&iqzq9nE9pY@cI%NJr8Ij(y<W23cR=
zN(Vp0lQQ(oO>05M6n9+GsR5yxw5m_QM?Qzg!x8q^UCkDm|BBVE5#oC%oPPb6(fJ+v
z7n6b?qkV|WB0|JRSGINgz6tVidV3rr6a&b-x>cF}j4z3|`FvP*oS~2F1w|Hk|MP;C
zTmvL-+gG7<mlR?$Ph0V~2inm*E>&@~8HqA0aXnkIuQx-We^EOL87tv&ALQ0`16iD<
zAyaI=e^&ELWQbsYk{e_COyz0zXT8F>Kl$7&-Ck#=CTR)oOE_XXQ^olRjwqK2%~o=i
zmu#-fsi@Fb-8#;4;&V^f)=!Ib`}t89Us&9>FWZ6zwun#;*b`Ut8&fJv%apPD9c;^K
zYu7T?G!4oDY|vKMfj=+7X>V%!kANaOcwhwc_-EdQK#7>G1PLYFOUFs0AosM_*2t=!
z9*~=zO7UIo_y2!0&N@JbrHN7kOS9|2;iuFlYlg%to}sL8O-f{fTSgFy+?>J!3iZIg
z*R$cMyA*}roB>|FZG}ccrwT;`1n!YlLs1^zkx|n`9_)|W(p$P8esc}D{rIDNH|$;8
zH@P9f-w)_wW3@#dBYUGYiX@9ODq7{&(t}ipIqnktyUdPRcMMmvu9Soz&&Z8{9z2C%
zoRuk5nB<<NK(k&mh(oD0zqjMYw$%(4#ho8QNGZjGhvQC~{Ws>ZND^K1^6b~Ze;`Tk
z<{I7IPzXj4)b<JlJPuI?>d<wWT9eQiynQgu8jvq-5`n_A2KtH(?aJ*QOs$_{A7S6l
z-0tkrOBU?bfSiY1m%q0pAa7%KD*4X;Y-a!kDGrXmQVfw9Q)M*3_k|jCV3m4558C<s
z$M%20yE`wSEFwvt2-E~~K>YwK<MClHPmLBI6V>7hWH7;#$^KQ%FP!?FmbA`WUkjMH
zQH!T^B|Ap>--G%Y=^soyRPV;6d2l?b?l|T(>T-Hg%8g$cueDsqz|5=tKNu>C>4lm{
zla`FitePX=S5RXTSs<SvWb^+y7-4SbtHZu}moYJqWe3#a&h9I(@6VWP6-3wYtZVrl
zNrI-5YK{JB5E>&>udF04e|{BpN5GRaaov$+^yYyO`&s4`${N$eQ1(J6V>uJ!uW~_#
zyZ&iu`}KUgYLHV5{4Yy_4~)Pt`hTY2cf4AGS{I`%F2BtWKMt~FbpG{Tp^kiE<$i5f
zJ-&$>GE1?J4?;-pOn>${;H22tXd?M@uqnq$<8*CVGBxXs|4W<Q>a1%!WdH3c=!07Q
z){i0Zv!(s(CHEiC74o%>--{MY*<1adNNqa^<;-K93b^VED6uJd*!K$=<d}grdtVGh
z43k6chz#yDFdq69M%BW-tSRwm626=<#6f=R+&1ff=~DBUs4jn<#KquNys4+>$0lX-
zqh8^<UGBAC+tZ_xblJ^BIG6Q%w~#q`8R)b3MV-NeQEZ6*^JrGlJ8?mlORU&6&4{14
zA!|Fi(?M+0)NMHy+xtq;lo^RKFW2?jPZd@)E?=G$UptHbFb3&HPu{xy=jr}G<G%mP
zO}o1)c?2?$rm;<l?g|?GUvAtLFhhPB9tBd^!lMk0ULq&=zx~1%nQg}>ql@-`b3e2r
z2btNFL-%s#uAbNb@~V!H$aKjOm=5ay`Udc&Qz6Dx0167q?QJNIWcvTv?PjMHy@Bu>
zaapU1i7Zz%ss^6R=xRh%P?Qk!aMUVXsuK$5jmm|7W7~LVMl5FeoZCuHjxgSPL{I(p
zq|7HefC7PCjo8bqY4^`GwH$MO`Lb1;Z}`GyN`I1Zm5Y*&2J*jrG*BUv&)|h9Zi|{w
zS;_zL>A+Nkk4wT(@&4zhkQ5b@fJmmCJo+6*#Q*q!KO_C0e*?bx|Mkz#&;x-5V)Eu<
zn={}klh-7Ywt0uSf=56Vq+Z+q$83HxAGF_IkF6mD%az5xq&o=T?qynkNxN!p`LXSq
zi5%9`)OR6~8~8s?kaO1vhx%U~Dv%30W&43h{hRf=A56$H-*u~D<a6SeJLXpvQ?z%h
zq~Q-~zrATT@;&@^3R=8@Cdb!0Z_&<{UfI`9?SOV90&Qb%$Vp2!Vkp8{NURqXkV|J1
z(NVwbZ);#*o7{T!Z-_&sBC7ZEZ08@BVeatVrtLChXAXVPH90DyZ0x_q=Ak>-E^pW0
zUuw1AnXNoM8k0~z;_&P}T=DU#9^tPx?GDpR=ePgjm7#`Q%)lD(HpS??K|JJzFx^4G
z&Dd=vd);golL~yLOV)Lpok{K20|MYJYN~ZuY>JC0e0(==5vc$9ZxWGT2{KCfOfZJo
zrtnm><@t2Sw#HZrq3?5b_68z2I;wJD2AN>FKJH8N-5rJR%&<^QQKmEJD3YF{;xge8
zvA&Wzx4+|^I{Mc6)|u)4rP{f7dyfK*;N<qd@<g@{+yVV`SstK0Ut0~Qu1j}l8W3AQ
zfG2Q0CrlG3M8)fW2VQ`kU8IOSQH5@h=Wx&8$uI44LWy}qX}nK&ghm?hJXK-GAO4sv
z5!H?=oRA==YQqGpw}hR2nECcl7CR(@KwWe~upNX$<1<B}PFjH}roj@p-ec+zJwfhX
ztt!n+xUT1FsF<ZJA(Ijwg_2M~9p=~5yQNr!eqmJ9@EMKllD)mT_EQcIiQL+IIi&{O
zN(LkI-qovBMsDRtDdaOO-pd)nXWtI77i@A*#2D1gDz6V(`o_`x^1^@iw~-Mq;)q6k
z^7L;YQI-LIe98O+tB^=MdU<$AWQldLFBrEOltG2?^XI$G&JM)~tI!tL=k`%->98|{
zyTkI6vwZv{cQ-a~QaMjkBE}22<R@Ixp_UEFJaZ~8X3tW3F$g}>ck{0wkONYX?2<h?
z+qAh`IAw67YInMU><S#spXw6V7VEvS)y?~D^GgA@ewUyta;a8<S{>*xJy$pHU>nWR
zROnNb@JKEk;|D$J#=R~IDYEwS4f1C2gB_51Q8AFn+zgv&bg>%4?;#Eb9d$=|6=dUc
zy?;F)<$nfeVYXictCEeR_-}vz@*;=uCI`&TV*Zd6R8qVs7P~oH=eOvG1*mQiV3!MR
zz`H+h8oP|}opz#A&pI^kT7qG=vNx;*X&KE`mu4uD1pdP}5%)crddcs7{HFPI&NjAz
zztsm(NA%KgqW;lB!0omB(TG5Ega;73KLLrZL(r%Qh5s*$5O|EFh4_3WeB=RXqNOpV
zYcA`7bttca3A;QGnpzR}yO{m1%qhnA?Fq*Ysm)76=TKS=yZS|U_u~nLdC{Zc&w|uH
z-C~u>P*Cga0HKqJlk~<>Zku0^%iy{NayZOOHoyg)bFE_GT&6)|p+@dMzinjtI7GUQ
z?C@d|j6e&A0KsU|gBSB9JyAp+nyqkfgzPTxI|KX94%h9=TTiR_?dN767kn}Ugz&Z2
zW@<2&ZI2>06j$PEJWk@q)93W3)$U3#Zk>6&*tBKZ>y)p5UTY~QKY{$F@0ez6p;dg=
zGG|kK?h^ix(zjXOfqnKggr#!X|MF3}PylGF?wrH%fVA<?wwpC{%_BC*IHgzFD)JqM
z1@fr4+i6!JY4Zmg*tDOnl*QeoW<L9^^^WTu1jy3zm5m!8ZHww<TX^qxbGBctUG9R&
zBsLH0z9F+@ZJk3ugwg=nv&}|gOs+PeS3bLZ^B&87yF;AqHS2LAqo)fl>dRl=bMmk7
z9bWFYlN$(y;gDM*`5q31;5dw%Fd%=E9t>jg=cW?a9akVl6jKY72=2Cu2CP`R14CI4
z@R``4Q+Z~WUu1$%tm)#H<w3=5q-N9sSCxHmnkDGGkecXWPtYGLEnBc8n*<#$7?rD_
z0XE`kGKRyw``8EB<({~|sY%BTj}~0U8p9{dqS-fGPy@{Xn;n%f3P@E1Lx{|F{RsH<
z2J*DV=?dJj(K+O%rHfyJaHZYC&v8}Ya4uSAxar`Y)=%r(5*@$m=!TR>mKinAv6i>2
zN!=cNd^1y)QJ^;bCFO}Ww2Jv1OJ_<9m6&(K>^lUkt7k2m^OdWe`p2gODaQU@<usKn
z7pJ@12D!hae$<F3a!ovSXi|d8()b~X*{TR1oEq8e4djxvW9jW)1D0tA&|%e#alqm|
zJ@Ko9?&(T%1rjE*CiACDr^^8W&A(F&>>?RcDyqlC=VKegbdt%R$m~Z-^5J)oW4uQ-
z-2=nt>lN&RqQ0_%n}!8x-vEi_N8SgrGtJ5EDgrA<zAC43$bE|m3f&C;dT2*zE*&fd
zhVUIM$%`g**MK{YB8?o~E};%1-XkiX9c~ej&%86KE>U~BJ3o)b*|JjH8oJlZ2XeQ=
zn@fxVwK&vNg7Br`=~4rdCYAkY$teO8gGn5L1rtyqGv*qI$TrnT$qKt67A6%~bPx(0
z$)Dti^?Je{Qf_6+_M|1u$gVdVQ4z#iePIQpQ-vg8@-ncz;KJr3pd;7jc<NgWY*MqH
zh_j7!#}dHLP5}rYA!0hHS`ltx=$Dq1kIV2_`gtFhYNvd%G+&7z@E_h8XkW1#LZEh#
z)DS@!VSgpYZn_2}Y#n{P=y%Wq^x>EVugwpZZIDrq<%Z8hwj2Qd8&i1+2*pAMoECmV
z?V>xXc?zgz)14Y}uWZ{>jE2V~g1z>K=AINcm>pq@9w8Hq4%9KQ!1V^|iaNQtujg~A
z&nAHvlF*`%4kR~j-mOa)18I1f&SAQEdA;8V3gLawF6f^n*GX0X#s2s7_Vf1jCvbM;
z5E}o#4I%L6z`NmZZnUpKAI}im`rhAIJSL><7*mvF3O?URS1;S%bWa3D6Hy_Z4pjq8
zr(4D47rqf)P%47+uhK5X-#iHZ#8HXOg+_vn97l@-Jn4=2ub?M~wIPxCQkzc!g${$`
z*iTXEaEB3Vk^G~{FN`ZN6<W5HYUG|q{>rwbGA#BHZ|@gIm4vi=>(SQ;QoHTa`b?v@
z1B_G{)o|2Q2doz#G4hn(HQ*PQ)_p|p=#j}a22DQYK67_L=Rd)<KwIRB5l}f_4lUNz
z)>tARpw6te7s|5P)8O5OCsZ%4k%U2W4|xtl$wDJ<t}mOp0%_&>gMgGOW-x_#oB)O~
zhf@wR<@-V-M>9l7+`QE5(VnP6zcaq$ssvrfeZG8TQAtorEv5Jd6<Djr2D}JPfI)yk
z7EbXt0zHh~-KeRyJnK7y$`5tpB1SrmZ9oA(ym5<e=)k-aP?1|M9oq$6>jF|}wUUlO
zC2vSKe~K72B$6@J@Vi!Ml^kZERF~}er3UyfZ95M984f`L*8~?yD=QLsu>W^%B_0s?
zT^v^eBtROb>k^6?BU}nAm{)&N>I;Q2r5L&E(O(yYME-LgYflV`^rgEn0K}+?B*v(g
zy}RdVs~|TZgCG>M?k%hVU&<8pk*)#4y*aOxBK4K`*K!t@MF1ylWx}uzr!2v!Pr>d}
zgJEjx=zU{S&)hNipmYLXX^w2bpLyj|TK6_@+z6jZ^IahwATCWLr4hwEk-d{x(MU?W
zBrpvsq2TU?26X}U>IkHh)3YvF2at8gKidI&BJ94y5)bI{S$^_+lrC9p{{j1fn$FLa
zot%*Z&K;=ex}YZMQ|gc7fR;X@sHP%mNg7OmNmsk@j+Xe53wvrz$OICdDVkS;uzd49
zHuF>Kkb3t=z_xAx1=|5dzpA;#vF@iOzuhcXx@}ow5*7ENgQoRs2Fa%JOfjE1AX0Br
zIFeRUq1|k*zSk!+T9ctSCo{(Uu75S=K?sRG5T-M}EYQ;aOMnYICKSMqyk?{^%`Itg
zyMru+ICngbIRq1WA7cEF>}h}7?TwEC0C@{6wv%5}JJU@)ofko&Kbt?9K4nO0UAO3N
zK*9t>!*tNh<~_VZcL>l)swv%PP2Gps>t~U2Q_o5<GP+Ne+jOnCyzL(w1!Bpk-1LR9
z-^mn*lhYy~_m{gPWlb00v^Q})50%aS22kLHS@SELJ7C)dww=&&GD&?c{+8Rkceb*R
z$`)|f&|1gkrA)t5y>y?wMl9k;E~J#5&k3j6^4a?H$wdt*jExpLV3V|vS|t?jwcKcq
zD&8^CN&kyYW)UDk>^t6}^3U5<6A>}NMF1-gZax04EbbA8OYH!P5S%>y3|Ni?*_sD|
zl-Ez2UJo_I-l7<#?R=d*xp8*corq*Y4pY{tN#*X8q#;j<Pb&4Tr)tY#FL@m}RX+ik
zusRy%jb8+2lDc7@A;Jan`1g`_QAnHwaFi|JH5KHzNr6Mn5Ah1MAu-!lnO?nh%l<x>
zd(HbLFUw!aw?u&A665u3L^d;^Sb`<FV@stf5Tfff6(f-}A1zJ+T#nFRMKFRD`m7F4
z?w~R_RbF1(4q#*HGzh@b<t?j{qAdF!#qNTII|ccIUUre&WzD10#sPjng7OprQK;_W
zvTTv4x&zAhR{%*mTJ2rZv{l4EsluCY{4ALT@|v!Ln6d6@X=N|jDX5Y#3)XEtQ!+!u
zgM&qdD_s2gSj)iccd}8%Jb<x=!zKnVDtD=0+i0pT$#==Bx~D&3hqyfh<1%tDCSyc0
zfe`>Ma=oq|7>%@DD04LSjEd}wBC?VBQTAhE2-o*Hw5CD!6r}c~CP1mIvu{0_4%-%X
z1~tBka!U}lfs!#<(w{J)O|{edL!p?Nk@+72OeBv?E(r{9HsIzD<CUGqrO+QAycRE!
zzhXDd#j9`y0!#7>SPYA(m!U{qT+$xw-Fwyc)wVb+?|O$t?B0g*#IK(<qGV53m%X=3
z+Yp)A0S%@5px1j_uFBwZN;-{VC|3j?b8js9A3eT)9vKteHH8Y*$Hs-xs1{>_>)5zZ
z<)Y`K#y-2XwWVJl1%ui0u54-ovGD?lVu%VJ+yT3bJ7DJ@Q`{4QCpvBDqKe3V)9*(a
z4D>+%3S@@|8C)?7*0P4&MfZ98J@)sGHLtY{0I>T>U}D<~YDcde>YUCyN^DOS#G}^^
zlPt|n%O(s@y)<5IyZWb$>;@s0uv1gzX|WtM*A^TSIkzV<^$NNb367@smTIjW@}O$c
z0p7-#;PuS}TJdja$s3B!gS7+dt+)ewD3VJp5YS~FfqeBjKg8l6JuHkJfw#XDag%<I
z+q_+}KQ6%dfsNV{@yh@FwFuB>jGQ{?gV@OPy5&AehRBSsi&+01&uL4JY#_w&$q;ta
z93QegtqF;=g#7pow*4y}SadQaA_n9cI6-2GI=@KnUL|h%SwOpvAdtNVi5PR_G?X>z
zZa|5o%8AbkJ_kc>ewTA$<PL@G6hGagmC8|2vCYlCt`Nyy5sK3H3(J&50s%i3+7d5*
z(ZAPz>)*UpR4hu+2iTpJQIqQ-y9QC=TcbryaQ@s1$mk(_s^G<{9$8N~k;C|j)QWLt
zbf3H;AT_~&;ffx-Q-yPNA_l-gTxgSyagx&SiK7(p&Th|7eKiO458_*uc|kfku^B9&
zB(C)U1HmQ)E%4Pda$#_MFX7B>52$1<)@cr2U!Lw7<!PbJZp5PiRmiFkA1wZNq+p?p
zdvdoETTx(NcA5!-QApe%hIG;@nW8_<m1uN_L8NM1t@(r;$%&QW+!12I%2;Mm3OcAj
zGm5@e<O|HM{I3TIv&bWGF$jS*q#bE(KoiX$R4q3K<ld*r2=tq$@!;7Thm%uaigX&O
z;OHk67;ryW3l{nJk=|SGfP(ra@CGkXm<jer+IQ66<`ep%5Q4YWK^Rq+J}i!uyXyk6
zzi;;V)#IKTYryj+v8I!ePa0PQ;(-iPrQnW;l)-7vWjwB$V*z)*gLD1{Xf(Sdfm>jf
zx!xDje0MgOBIK1#lQhxY)i6*=`>{(O5Cip?0SLWV=$FYmw9g1OVglb=&mLGn-fV&a
z*Bs+zpGHfbfbXA+`Q?1Pvn5R-ulS5|qVBFNkt7|ag9b+P^aGf%JY;h7ZkYJsEpISe
zOZcPSVbr*SG3#GoXk5x=dxP%_Gut#&!CuH_+aRw#mTAZ#uV{F|&v;~tgTQFWklczs
zaMSfpi^oI`Cghb4Ik=-Q493jv2Q9p&(+~sSzfd<WWV%~pKIE0^%R)NjRl+0iv$P!T
zPH+*+0OtR`!?g)1=xd<J!ANr!$AUZT8DPvQ;*cNyc!gBp`=SEQ7X){j1{w0|3+oF!
z<W+wR_}TDykqO`?@<^GZXd-NcBxQXFaxgimQ}W<WiW(TRGBv7AJH`uK@cq;zNl_&7
zl`n9>mG&f2Q3UeJLLU6g@{M){V1o3>Eo0!y;6M@rdmwT!<3l!|!5w90Fy{Ptlv-nG
z5mI5_+Qgc=14Bh&;Hqfzf%e@M4^m0wt$Csg7V|xr|5JRp+MUc!kwgyW@9Wy<;7&dg
z|9w|@H^&K?LK6MSai|R)@|D^$G&rVf0k@Y~hb!Tn@k-Q_ssb5@a3G<tpKo6Ly<rc3
zw;?^Wxvg*eH6&7hn|tadbPH*#yx29Uy%V}0Rk9nqpc3vt<x)UW{SH9CRY+UQfZMZe
zX=tPziP4X`IUCmPA72xr39XY$-^ymgP&c{&dNToKgxba;ZPKs+kEq<On;!)lG1Ul2
zq@RH-rE*O`g!NPj1IdtQS6*0Y3LHvU;9Gn}8Tc8{9PO9=5Vha|Oa7O;0_8W@yu&8E
zBTuz#USVAoZmbA-Tm33m>vL&97`lwS4o1hVW3ONMzWok8H10b_2e)`b(98rDqj;*0
z=8!O!S|XEb6|h78d}-{b&F5m;cD_-K)U}ZqcqCN*;j@b*dJpj)y4;sKSIu)-*4<eF
zH`es>F%?McAF%*>2ayZC&8xMaubTrJ4WRrVcwoETex%t%A-6kE4!J@n9p+}U6{hY0
zT+RX=dRLyUxTcTN<5MY8%7c1D{I>Zi)IX35=LalG{9gYo5W{D{=CbpR#Q16EdF9BK
zt6@&BIpp~-ptRNl;jlDUWBr+(lJOy_F8x^t&jZPpaJ;Y;8RM>(3f)jR7SPo{0*v!o
zoYSNhm{9cF{jV2%m}46c0K2+0JOi<Zz8o#LH<ug_`~e_E`_lz#Slgphx+YwduR5@v
zIefZzeiVpx7H91cK-}VdpBk_Xl%gmkW55TKQ|<Z^EsLbzHq1O)l`CM#3#+UFPn(Bx
zn=cgpR=6x{-m~$$Jgt35Dg0C!9w7!3W5`{Nq2UHx(+9YUc7QB#qQ~hz09M$8%hH<f
zO76S0GjmAGh`cPD!Wa%gL~ak5qyw(laYD2Th7>4G9dLaD>dpP`wFjnYG7iao{ZCkS
z8J@t?_GL5_6sGO<M(o)mePK2(fj3b>6l2t(r7sZ60qu*P>#<8#!)O~Ax!;i{Iu{pk
z%%%<B^ay@C`V9<!HJAiU4J5;$ag80YS(O5d^xS=H(v>Cd2z9QqQ<*N=T13JYGXziq
zAkN7do3F@CO;Q1LLq}EvN=wDr12vl8c}>>CFVkgIXcsK4dnY8AB|6z1n<s`-^a)qt
z;R30If6GJ}vQBu}gGx^C<G-d0#tP1m=LbTF1s9{8P8l?r9iTe7q*O3E^s!YBYqn$)
zUs->`0wVC!E-0qbRvV0PN^S?{dV8SHo_ZzxHar8>kajYC_9_9QpTpVOZ@*o-+y@mc
zYG0@5$-Cs>tsj-vFa6t30w{Q7w(1n%n~N!B&ej^7M?5{ie#DMo_`o%RQ78;-8QI^E
z{9!Hp9SWFQM27@s9;1o|&wV`BI?rdmpz}go)GRQ%HUs6}4gkR;?cRXtY_EM;8LkI}
z#{TE|<)p@7b6%mFshCL8eE7?vhLR$>RwPl}6MP@13RwB)>bnmXD$V;;E-F1oY__#X
ztBT@C#y}^n*GMH|49!e5hKz4;b|WcNBSnG)hVk_tuI!grt=>Pu3>|=1Lct!j)WjZ7
z?64zh**=`gyAUhKZY+nBM9%_m-vD@bb{H1*!6Y`2vZRqiD-<crXnY{n7zC9mhv=f0
z+hYLS8e8|F$RsuV`k?F^?>Fr*Mv7l@uncNSy91NzrFej0rMY{YJ{Jw$n38z+Apjax
zz|Y$$_MEFI%X1BrgmL!E=g)$uXO$xu`vjsSvpRv7x5Nf^tChN3Z-%%@;LDykIF!Oa
z#EhssS5OUrQQx)~&Y{*Qnniauo`qDlc7a*UU#o7*(5VxRkI}Q&o#pJcS8wbd;KCXq
zl4sK2|MTe3KmzDtLE226ltP$$=K~*<InovY4t=J}eLF9de}h*CIjaSb*r5vSDkCki
z6n=~@r%(I!sj@Uh6vI^qXKe#h2FW&TjgFy{MkPhpEb<qyEuVO!l}9lp|HS7s!(ISx
zf&uU^73%z0Xf1;N1K2>uqnl!qX=jQsd%0(`N>AN5ppK!xdI;3nwhb{L&+uiN81k@G
zgP7lMNYkS~@_SSl#aQs^trC7+IOFboJz_tJhM%u0Z!nVzB%6kJ1`p3yU7yY~ccy=-
zBAQ%83Tz5gFBL*O`{VaW%Bgt)vG!K!;JVu13gpTH$|Kf&czILxoZ?^?C<{F>8HF#(
z9W8cRiTm}QfrwLql>ufYC&os96ki&nD<LzAM4iiubC(WOv*Q&o+*26``43g?N6nPf
ziO0GGCcB%JR@4SLKY~hP$(J2js8|m0j-mnU#8EC#DBZsiy(D!>h`|^BqDn)0Ox0}(
zYWr6>ulJsx;skA#HLOLY*$t$*$9{CbZyvaB=B4s9HmeNS?G=$g?pU{o;SLv&L#(j~
z-mQlF>Kb{Fy!}py?+CdUvqNfkqze>}p%0ev@8G1tq=biUb75S&Im);1V^@I2cOou3
zOuGa<WUL{woV$8E7U2>&`>;D47m-~O0kLy5OIJ8hd^~UHdAVC3JN#sU<g>mDZGFLm
zc6pTtuT0uvgLhS-uxn60bv+mAr^gDsLHe7Ir?jA3Sx8yDZ(gf`1S4dsG+3r|Hz=0L
z?kgREZGD+%1SR7RKot(t?JVME{ia9ORPwOu-&ho%HJQMy$vrQXBrfTdu+#yhJ`Z{%
z<rREn7&mq@<y^JN_@GQP8d~vs@S*5|q`KIEPlOoKERQT3NOhD$;etAr3JBW!VhPxC
z<fsT5DYbv`9j(TR?7_EAq(4O>G4V!=+L~@spX~e5={p+-QjW91-Tp&Y><~Q+0@#WI
z7i(<ZJk}xR4zOqZYws5K5LtP5@2VA8{aIZE+_&(kNrUws(AG{Jg@U+ipvRBt0w?6+
ze5m#*(p37FlQNbW_NNPJ|1RDuX*1v9sOW%9w+93)n{flf!fSDoLLn-O9||rx$NsV}
z<W#<GN9~&C`7HvvH<}%IORl!b$F{3-k>Sj2n)~#XFz_DHhydJIWnTON$waa(F)bKX
zP#Bsj73dK_9J#OC0&aouBmjESu&5iYMDIH7bGi>KH!3zgNUzpI@iUWsnJOi_rp+&5
zhb5n_fgrYHr_0y_Y}r1MS^cz8vW`G!`R7uS+=GdQIU9~&-&uC~hW*c0(UJBLu}NIB
zZLoCvkB!g8zvk}~ViM$^G>(EJ*pGC0?mXIl2p}7sh6}Wckm+6L14`BdsCo~L`N{j%
zU8FVB$aO-luImA7L!geb#DVgz1vV=7_Xeer<q3FX+2z&9CB;<L#l`PbWA7E^@3cVU
zZ$Y}X_L(nYn>MBe%35a3Z<FW7@-zDkG19C>M}7e%h}Vtd3RKIg0xSG|15|w+34M}|
zHEdMJw|WxV(F6p7H;PJ2_?qF0(5UT(OzrUcmGd*P-`wxj<pO|4@FEBdTS;R^>~1j!
zF)i3t4$R@y2wJ}a98#_hv<Lna1Wz4+Ky$A*gP2bnguQ!HQ&E1~idd>n2%D;}r5P>(
zC!sBaF)z+F)mz6@i0i7jv*Q-PBHz$6;1)rg;R9@UDu^Ml>|1?E<@c;5GQ)jtfU8Jv
zvyW!-`6tOwSkN2`4&ZhXb70}Q8Gt{JcE*aTKfCoO`vb|vmU<EC(n-wOdwH>4?g-T1
z_OXCojZav<)GvYq$9(+`I?=_5`xU7DEI?tma2#NWlP=OQ7Z(K*JG1tS%b!tVO*)?)
zii>Kp4QK!5aa??+e_#JP0+6bN)Q~W@nhKEo?uAfbNoeG(Z<VV_Huc+n+S4v9&IZ&V
z)1`7_36bM*8T%AH%ddmfN0m3RUh)y0*|aOPk2NuA8j;TaCE8h2p!)3s8;DD)|EYJt
zzX4|X-Y~ZrqZy$CFUDB17!osDyUK6GZCRPZNTXyXe~m%CLmj}<7by?HJHRh9Z%(p<
zAAXz*NG|a)zbO3hhliwpOL9e7_Cd@cFtK}51S1U{PllAHxl4We8sCvTbXJNeubk2`
z@cKg0GXTt86R!fU-4K~{-?*G9yic}%b5j(pilCZ012{|7lj^_DcOA-72_EuOp4|S|
z(Jx@gIBH{P<v4;m=g##AV-i@$+9cBh_0$fsiuEiUxhCLq6)VY=nF!2BteEnW_&l!}
zOkYc28gSmYtC0nSP#--zGtCP5dpdxx%=SKHYMg<T=^0l~B5=deV6u8W<1+i+pe;9)
zjZ?!Lf%gnX{sE>=|A@yz@!vIKS!9Mrm%((*8((KQac{Aixzm;b6QYVA@{XZ|P9$83
z?_JZRdgosyY0*dqPiY3)I@u~<x?q-4c_zJs>3DB_J({ykG3d>EXzD3L$?@f-VX^rf
zbj`5gY%Mx5lvw}GmZ7fA*L>fFACra{ukgj&(~ek6Yc$_3y3JghbA+rE4Wt-2q`B_S
zP5qIEG5Pv-=6pwAzXHXJ56?Rg+Atlz(Ku>}r9bzM(#~O;bj-A|Z}6)!7(AA|3Ifny
zH0_-uto*fR07`qB2B2gjz>8u(fr68mAolq7#Wg!pV@ty~;r#RvDT=`7j6CoyCczcU
zhQbxFO>W40DK4i$Upff*%aD!z0!FEmv0I%+vddt8vxr!70gj)*!uX+LkrvaLYypPF
zTJCUco)ktC;ecHNgrCC!4J1S|aUkij*3~B^K604^Z!;D;B^n)C{fW$#roC&KYoU1e
zum;*^5zz@DC~?nb!}sOY^S53pBVt`>p~0%s51tG3reRf&3ftm~GJROl);MCbrkB<^
zC~}rETMZ>qlaD2vS}{a=tI1D>?@@IIvz!AX>l0y;HBayJ->F7VO*sxQsAL8JDekXl
z?elKj!|mw>FpL&6O~<f8rf|hi@Wqgn5m54#9Sf=G#;cIZ^Uv$@={r+MFm#Yn#p&p*
zTd=`_8EH)}pIsFSRK%;hcUD9%v0;1Cq2UEot5$njb{l-n2TVYVJD0#F9PO@<IGr_L
zw9Nw$;hCbs-{O*dGM-N_w;J>R<-H<JjKC{-SUu<mutNiPTC?|vpAZ6sA~+qQnX^G%
z>`tFJ+i5*L{qA7=m15AD%D!6>d;a4Gm=!H%+3lWCCGa9NIf3b&H9S@@s%k)Ar_aMs
z>mp`C6q))T86`TF@RoT701$M@-RIE>R8-BwMp)X);~)9TBqUqXc5rA;kkE&K6-J`E
z{4<ZA!0V9}vctO6ZXL5K$oX&o=WP2rJh4yKd|(q9#!&;7LovN@@XpLBOrxP@3Na*Q
zBK;jNb_y@g3-g<GNC<g@c-H^@y)S?nU&$AK|7G$&UO%NYQ(^!MlSR%;%I8V;i=264
zoksS;BbbonCgD8ym-bsyv$Mk$H+}Rad{29gfDp_&8($EY<e#;`fAEn%L^5oEE}CsE
zj5&<a-hN9;gzLv4lc}%CUpdai1s{RhE1MRM(NGwaS1KXG1yu&V`?REM#%BKTiDZoA
zn+}Ztp!Y3gk2$InGR?LH3KCSEFwow}fBPd%vF?dROR+Y<4g4k~n{bbF^!3vvpZ$}@
zjw{T^?ta(;9{!rRdwmOvpV_=rejz^H0CQF$6zHbO*=0V!<`f~BOu$mlh5FF{jk)XL
zdIt4XL#26AN!!KNKsT<0q01o?XnOM<lZjH;^?;w!sRMNn^?L$dw-4!|C)2n?onOe`
z8=T(MY&+}^Z$6ozr~<>_x=x6sZ`+*VAT0W~3A|!$K4BuBhRbr8H2zX*2cw0i(5g*}
zaQ+f_e|=&?U=7@usJAWZN89+@apN8LWRkYmJOBJZoAJPw==qp%PJ@|7cD4U3k*QI)
zldo~++qUg>nm^+WMlOYc(3&BJ6j7qUlIJ>gOHpg<F`ha0K5CsKSG&6Cq3rU<=WjT;
zG9yPi472xTBsHk_&+YSf1T!1ZT~JYf0p&Au>-GBp)tvUn1+#`LE*GIqllNP~VJf7>
zII%Ll{AT!c_jj`;($td@eVo`|JBgo+z5cK)K5E*m-Va%;6k;jQ@%fWtyu&y1-4dYo
z&w0qGpg}2MGZJFirn-Y4=Flg!Dm~OI3*`4QTjy!TvK@C#erE_0o=f~Ke#F|#FE8KE
zvDVAj$;MrU^@w`{X2tht3_3?qMf`NuQqV~}6PvUMsv<jXatTxr=?C2(#(Un%hx*+B
zSCt`t75bGq5pgAkDN0VDwolL*-;c02D+^BDI0GP>cROtUKCK!?nBh&}>Nj_^Ao4KR
zC=SVv=-{_35_$0|(sR$3+s95}qUsnN7^z7AYdus$jXgHf_hINw9Iy)DZoc&6SP_yk
z=>q2LFI8Kw{x0dYs4w#bcmVeO)djV(GRL{++AHgfXBbU3En$%kGcAaWaC43Hi*N4_
z08l@cwEP>~TXg{yx{`M6iTWt#3oy<^-u~(j7)F~5WRZ8$Z_AQq!pjP1l$Pd@kPQyJ
zhkmiJ+8i^>gYoX%JY~DQ<)0D)JEyeqmHek61cnQxV;=9QAibV>EMJQs@WmcBJayjt
zwbcA$#(7Bz&XNot1*F`MG*OrI7a87ZAEuCG7!w_b|8RAx|D7r>zs3WTeApdAJyHAC
z#Bkza3$`x1k=vAZfhC!BAx=#?(QBR$Chx*(-lM&I$*luiYnjJ?C%UW6kN20CaeT|<
zex#8?&{N)OJnY>EwuH^xUcixW1kACQoQzFFi5429VI6{9u)+Q4H<Iu5pm<^Y3EEIo
z`r8gca-tqs-4@L89nqdHlRT+3bcy&!v?m3rO$nh3WiD1H4QAOhL;TV5nP6_b?=@;C
z6ec1az(1`yo?Wi<a5yyKbTah*2E<RT`N5+fHR<j=nFEA@B(Qz%-nY2DH2bTRfdZI6
z8At;#?i)s1$$VWVPYQ03vcs*)<+?p!=x=2XsJ@1(W6nugK5X%RP#f&5Y@)IOp&}-|
z#i<Pz=9w08-}%e^=3gjg&jZmy=_i`Y#C1a)&7POiMN>gRsvHGo<)*+{Y!-5y1{lTv
zM|7klV<@H$?)e=;^9Zy|CZcDWkt{?@BAv*S>QU^w2Qi0u982$ZytBF$`$O<KkewZ;
zltsZDTOGi1C(1R59v0Uc8LNuPEWt2-cZ5Cj!5^waaZd}AU#r}KrZ4)(y9GaUB+2*)
zWtEaN%IoY1%!MLkfL7C*!95Q_Sjds#;*g30Ka$oqLAhKu&8hsK(Tqg<x}RUlCds~&
zkPdk!uZSJ&2i%IJk;)cGhdEiiupb*n``*XHbns3PbpC10iXK6O{2zS12RN4R|Nk#;
zd)>(1GbP+csBA)|qRc{8k?fJJZu6F1l(M6Vj50$uA!KCl>~+hY{a=^Q?|U5I@Avor
z|Bs`?J4bJ>^FGh(I?va5J|EYFt0Lh^7vrZtL*!^-7Kz*VXH>0F*A{9`<RFbwh;D%(
z<ByeiNwnYfCEDWq`3?7>AGSgl?|WU$cZOmr-fiuEIq2QkHy&lAt#ziS4Nws0dFjYV
zJWAC3y88TvqDvm@jKF&E>cUJU=}dBPxYnHx*c=j&Li@7~4~T+cH@-=jKrgft*G7wn
zuH?^K#E$4<Lgd{<M^>pFcJs<|JURj*Z%h68znUwb>!BG{GpWudv>>D<rt+kUR-IjE
zB_SqZ52Ym0w+bO|dB;H|7Ri+=5$u&lO2W{1Qlu*9cXy~s=IIh8xot=o))O8UbLOW^
zfdKSxRjWcw-*8a*EhdrXf|QDw2&L1v1{$#s6<4RZzA~=`j)!LnHpE8s&88YI=1ts4
zGXdEtIaY@*1oMLK{!Y#dn#NO`He%*kR>4wAfA?R={F0d=93^TC`6)Bh_GJ)L#bzhF
zIt_J&gZz|_1-0sD-)EGejit<PkPoS$iJ>Ya!F-x+fwtw1Lz~5dO45McChW?4r5#SB
zvl=lxsSh5?TUL6-oDb6RfIgBhr+~Kl6(8jAB8ALzA%nS>1erXoSIUCH<N?X+b?m%>
z>7Gd8jx<{BgKhJZ%Vj@^BO)|3sw8yrf&aE^AorQVUZfMmbcGY?w&8fEdCUlN=$#<#
zcTE1}xWdj&J@UK*V?MwhQ=|8(3Du_?P@x$xP!U5`&9#eo0H`m?E8D86k<N}HUDw(u
zOpOe6p(j2--gWiW<GYG=!WBz}t$vr-O8mNV{sydIu^PVQPoHK)uq$n+SNNQ_rF5Hs
zJ!7k%n;nH+!IE-0^S*z!9C`S1>{Rz;`!=~My&HD?W&P>XGR$OqPld6%kH@j3N1|_q
zx5eBakNyJ@&@(lf^I5vJsJ@y6&5Xha{FW44COr{~-bcH$>?hfTgO2#QP;XGQ#oWAS
z{*3bft7=%`WaY0=cakJ$h<lm}spSIFH}RRfR0Mi4=&lOskXsN=Jv$tNEPbPtYDvQ8
zZ9;G76;2ikmRc5F7PbS^r)$doE>l7y{2JtZlw#urk;7X$T&vFm(VqU6H_e^%FSOcx
z<yNbRH6)moHb}&6XPL^Iclef(lAKKAbsrD-4mtv_7qKQ8yr~^U$yaadSq3^Ef%<UT
zp?Udx^D{R(Ue}@eL-WQe|9aM9Z|!8cHSWzxwtmd@Qrb1t&c$uT0<)q$Dyxc1TF=WM
zuQ&z$0O&jG6-I5iEPa=YUT@Oo9raS?X_sLBZR%+nBm1i}F;l*UOWq5wM>hq@tcUm+
z#mqw-k-V(C)9Aw-4BhIF#6Q1uDlNb5DDrJl*P;lCYwxIeSe%s-3iquk7OGDKm4mb0
zMPc#cFz+XQ=xx!rj|H{sM?1g3=2r<FVO=MMPg_V{rc8lqIjxN+;tUGgc*0V1fU|Lu
z^RS)Md`$6E<Ri{DKi@72t3q&L;m7`4jvnqftiFG-ExI#qah-M$XQ9e-(IB2qs@WbJ
zS8pT#MGzH~&080;C#?Z9$iGNzEFnWC@*TuLF1kD=0lJAi*<^$SLn351!zU6S>j<q$
zH5*i4k)^e~<t#Rhug2FB1rj8c7BPtPrmC?E2Pwa-pB(p5v{CX`bZW1gvwBF7%DO=v
z>f|`GzGx`&xZ7Y*<;e}gq+(E{SV{5pcdqy`P1`YVzZDO^6lkRSh;`xe_ZP06N-XPb
z#KrexU4j<ZhZveT{3BF4UI;awc6-u+nIsot_5L%K^p}layZU5@&}11>#^G{4*IpUg
z$WV)#qV5ooKhKg!Eipa_MMFVouK#@mncg|rhh-LSa9UjqPr>GEyvm{%G+Oz(XI!fu
zkhh4G1ALImmzmGaWh7_Ao+OXIl{l_>p@13D8OgRtnCyPzC6a3py8cCYAw(s_B3d^j
zgT-+7y|;_piK*~j$4M#ld>R!!3+FDW2Gy)1bf<nmh`HKGRlsJeL|g0yck7E6p^<`A
zDUD<dI3AX}IOWnev{0?sZnbi^BxnrU>NXqx^+UP(H~F66cbeg}b@(0k#YqjaB-*0h
zt6W6iceZ&vml1Yw&V_;Orkn(e{cwb9IH!sbEmgl|Ey|%^#LGH1kghmB?rvUx^lhI|
zqEJaI8d{=afpK0PYB#I{S)}gX%kyj@UvBP5#`F)?nJ9bhL5YSfnX-+x%2%r|d`^9f
zMx9Mvlm(s4Xc1V}iGJQl!9Ygsb)PqQGWXJ%+rhdONi0^or2aLfYR=Y?>$2NzN^d%G
z+h|(JId!w-8zTc-!Q;&w^&(knVKk{lL1p!F!tP*bP}f+6%&!OO2N#V7efo~n6|9xh
zO=?l0WJ)V8`c2tS`YBlO2hQSKQ}RJV|1!0x$f)?J93`)z<LRbKdVDMrt?JF2CyqvR
zeAHWNoB6H@4=JF9ej8a)+tgNKZeo&~OKTQGgQ*vcpJHmfO*-t(rP3q1Uv_MkVZH6r
zP)VLIpE^4E1}CD~log)mmS_1(tvdMR>P3-e`x;!Mz~=OLDj5wKHR@AV<v*ZV%HKJW
z-GFk>qFT9>Vx*vvXnv8hPd8>b=&x%C`OU<t&EI$aeK1@(SA>84h*u}BGSETFD=mNT
z?lq{lvegSl&NsFZqX<yMGqu+*E=!+Nuy_29Az^8X^prH>xxj0wZV_76Y{0vt&abrX
zV<u&AWi1g>u7XJAS6&a-Qo?keG|{i!#<%=X@}i|XQE+07tVZy$Uu}p>yC1KwW}aE)
z(ad4PERM%I{uQt)7JNs&XUlkMIV|k!-gNR>yg{>8f?VT&s}Yg<14|wYDSPlTs&sn{
zio}k|)}2{GjN#ZP7Px|Hw<@}uIXaPvpsVbZ(nPnpnK&;u3TH&_RQL0BjzM8-k%TV0
zp5?4b>){-!JUd>p+Sy_5sKW3mM&(W^w*&_|u$1&8qeL?1>t~3bnW-(4gkL^{k#TMZ
zSLYrfRb$D?^M2RQTSr$raOOQ_>5uIY4tg{Uinbph+ZzA8ieWit*VCh%f@4c=-6!KN
z?cvvsz4>f!y5#)bM4tOKP_|v%4&VILV1r6Rn!eP7&<Xa|AZ;`);@F_`4;Su?FH=r?
zKJXTN94o{pThkwV<4vpmR7%aWQ6~%~dZtXwlZdSNDg1V)2(SNV*ol<TS1^)<#PB<*
z83)d;jkqdoF6IEjQsy7cSlim)@8rbGWT*K)>xyxSd;QDzKV8E8U>oUex)HvJ^j4>-
zz@YC5X#cs@HwGb<Xb2o`FS@$zj;o2%fy%$3-9EXi5~Fe%t;V<YJSwAebWKCAGV4|*
z9bME@Rp#l`uwTCtiHJ}&=6U1iu$qLv&BljsIU=2@&<3gD`8S3po=7ydNcLI8TA=7q
z^iwJ2((d?x(;)kGxvNeXDc{mWY5g|A;O`Dy&ZLHTm{*z+Xw5K<&7CzK&QmpucO1(a
zE(vz|{(W7b#$NV<boSTvmiMXhcoHMqjF4duP9n_jsDy`;l;eMua&tML#{4Dk)0f9-
zi?(+}x#H*uIUS6lPcrf|t>9*~jLYLak;SrV77cf+B$D|jtfXsI6+(;6(2iIfRjV)H
z(V2m+<;tu?yG9JxG8eT=1bVYw$t&-qMc}%5#C!VLTZ%O{FO4k8w~cB=cpr;7i+|?*
zjI!jFF#O<ll71(0S?qnP&nKIk7T+l6Rd+dx*T&CnHw*<nOq>GU=DU|+TH)qhIzyA2
ze-m*68)xC(b^-xK&z+c03P_$CO9+Kq^2kS2UGPrzx?9#;kTFHI4s+vAOJ~;bVM2l2
zr06P7uyc`%1$7WrQ+=N0l>EAxgcYw@+J4))uMATtexK*pWY!p6h6mSFOF`N=vQq`J
zzlvpfs~rB5`RA|+76t|q&6t~%oIR9vj<@Gkt%huA;nm$d0Vt7r`=n|eJDk$2mGi`d
z8X1PjGbjOrz%s5F;tH`w;u!QK|DTt1=c@^cVjVcXK3~Mtd!?1TaC+3v@$U%UQ4=CD
z-1b{Z);v$fQo287+NrY?p<3~2*?+LZFGAzugL@K+>*w$5I_-r@M28U+XuGbmHeEe3
z8Bab?d}T0FN;`Uj<*%1^EX9j8RVCNE!Fh<>AVjjUx~R`=(Tu@<v*tNgqE3dgp8fM_
zcb&YOA><1uX9>+H?dN$!+99Nz-%cOdC7tfRi}aL8bFIy!LO(Ki@IMhH908Mwq&{v+
zu}sri<$T+n8T2S)n@WYK>pstR`OfWHb%6{U<ite8iK9Mx%g1`9lVsKqlT4*@|6wU-
z`B18U<CSDX)N~`8{Iv&X8p*`^?Js&Q6*LWZ;-<}|%`xU=6J<~0Tz(tdcptvIhpCrz
zO?yDALCGqDV^E=@j@*Y9)~^t9OJSZ)=NplWE;UAV{WZ35Qs|qgKM9S@@=9xVyA98R
zn;7ljW&1GdTBg^aJ=>#@@fiPd5ad>-WJYd(f_IeU#Hk0AVvE*H%B8sdBla`=qs-V4
z;2v_T#md}LB<=X6!*4+RhLP*s?({EO48J4kfGQak#niiv8v%D(4`m%Pe`4;5P;n8l
z%;4LdJefSK#CR}tJ@hA{B+#^bM4#=&C*1auc%pksPUk$JJ(RJiMLC^vi#V?}l=p4r
z0=JqMvkqzf#li_5MG5BqQMwZEYY)ZZ`LsJV7V3f?@!h0nAdCrTzU%jEQ3t2Adh15L
z8j2AmI3vw<!|NnnIzt?7&8g_+W@*J+B+JU_*85}whOU)blit({R}*b<Pe-BZ@P8#E
z2DR$aXTqr2XGkQ(hH?!4@>j+3#u{`BbE}=OLewOWGRgQw*e>{8SHOIg2qrVvRQTpT
z|BQ-DzVt(%YQcrB`z;N^O_@(fJ*u_pFj?1s>C~YAeDS(2T^dm^AvLHA+r2^=_huO=
zi2UweBvjTOx_tiEIfasIkjCAdHTph$fr<W{C94T7X@xS5yZSdL)wRK<L*bVJ<28@a
z6>^sH*n?s>noyB@`O`t>xUcVH=~xzZ96In%WFOcXON_x=E9KvjIQ(#O8}xX8AjjN)
zroPgxBpQEylgF2GlEIAHOFBO#>_{cpuI1NonerI3Rh`XS3WgqTl8qB5%){577(F#&
zX}Yj%Z)>u~WtHmEYGf^jZyUx<UMcW#b!z9&7juzCo1<~1-1qP%b3Wt@ouGN<(}_X$
z6RW&W*O2|BQ=8BjYV|BO9?ukdLAbls`{MyC`%n9pp4Q!r4%&cX?fec-7s4q@tMCqt
z4Bp=SA;JEHgb~>SPNH#(u;Rj}we*#qB0Y=1t;rCqCK-vuohz6t5{1sE9)qCf9rSdI
zn2T9_v+fB*lNhUDs!L~abo_n|@@&}B#xKjnDs#|-JPfnIsWdZjbVbpB3+iVqpzZ$T
zbiPF;-lCQ8^gTl}?|U6DBt@~Oa(yK3Cq@i&)8Teb;bS=F#S2Tush>DE@2Dp7BnWjx
zzO<9KHM9yQAB>nK8s)JjREf`W@cWnIH5p!YEok-qS(+6r!P64dkfynI+W6&qj!6Hw
z$4=x7yXqSyNWy`1$HWyrJ~~b~o}#*#gfL*jHS?Mvmc8-=UtLT<bt{rbvE%Q!reY|*
zzrf1hmxScJSfB363>uaY7BYU#=nWrU_58r66B-wk3@&A_;T$qo?6KzJig)zQ2hoOu
zmkE?2wIws;K^^NDxz<+9HhB5f1Frb6UzeOUB^`NoI=8*-IG~fga+xm5m8$jFZjy1U
z*35rTbc<n{=aV_d)T53ovLvQz%i3R#9=y0@N_PUINH>RPD+i5J>jh?WttI-KJ(jUw
zD)}U#usEKXGuM3s$Ka)pVJ@Z7eal9>&~CsI={uvErs!|-L3o3+E|Fgbb*H5{osQ@$
zUut1)Xu-=Z6WnlO(C|x9QC)P+Rmz&!)v?eS@zu4;`efeC1>P`@$svu&;qZI1Gp{7e
zxUkP8*HrmPw2A`8&b@QEU<keGVnptedvw#`xVd^3)pb&6XQDO6w*rk}!3Z0V%!WA`
zWXCB_R<Q^S2W{nFr~nVK!17?xy8j|eb#h&ecg`H5vvv%&@S+RCt{cDNX0}u^*EZ-@
zN%?rHk%5Hqw5CL(%%Cm5th+aNW{OYKqnnQ-(UI(h6i5ZpuyO5X^&LVNLYnxL8M+X1
zDy~Rw)Eh+qTVi2?*a~FSiDLJlFyosl7Ei@!|30ItU^MI8v_D}`;mfqW-z{{rgjNx~
zuJWG!H-l_%eJ)eZL3iEcqk*{*l_FzwiL5VEmM1TjYV3bhw;RbCsLLmbjEqjlmb6hp
zMd}r0{R7@d8p)JB7mJ_oNxef0zcjwjnx^!XTAZ`$eY3#x2AE#+Z&0}?*NHH{s(@JC
z0`))R)%>mYZAq8al&!hwwBpOIetWgJyBx87W=nh58E(tSXNs`-36o;yOT*`pbPdVa
zI0z@gK8G->Q%an$kGsTFkk95+b2zG+Uz7ahiZ3hGbxMt(5M>u819MSS_!o5LXln|{
zCmkSn6v<83NM;PMXOF=L4GmTM*L}(&w(7rK;^<%@I9j_!MLED-p?4(mN5%9b!zQ)w
zylAQ09n7dhXSxcLfvl)C)>d7#@cg;o=TLKBWB0m;Sg4pNhjbzXD7na)>bWUT2&JeP
zC2SWrmXq=~1=dXWbv5RfjMjcCE0IFQe>KRx?VXl2ARgy{sXB`~v3a3Z3dQl&+{C!w
zW_7si)GP`A2x1zb9H2eR@?uoHGwa63yJ(@asPPS-4on?YfP>7}ZqbCbOWzW2@LtOb
z*`X=!P~7+8wm!!^-YxKN=f7A0m(xfyd&fIW$VS<i2w&{Me?=5a>aLW@%0!Fr5frQR
zd@=ueTkFB12Q{mEq}!C&m6tkGS0X&9v67;w`Hc(2HNR9l^(gO1`UQlEsxZkb_-U_?
z)NLE(Tr>$(wo~CK<k2$1XZ@96tYA88-Sop+te%NvttJp{$2<2nq~`-uv3vjhB!~NL
zDpfXCoUH3AEQJ?WO6!Ku^>UPiPeT%!0!;BfYinv{em1u}_i}r7Db*V9_20a=A*!AD
zyvD69(3vqi8v+ifvL{27-A1=ag6z~fzNjv=(3@XqgC&QyL<uGXB2m;?_qvmH=j+*Q
zWfHw;DJDtfAFsD5sJZ)7n$a^Eycb<+EzW;4mu8VQ4AK%uuRnd*#q)uS<;1cNC{xLP
zL#to58N-C(I)CNx*XpVF2RH1cNEnyjRLWz-b|TAlJ8bjA60Dd{zsX==<-KqI$NUN3
z{WkFMIN}=CyK)}oNHw|`6?j^pQdt6VYQs`5PtSI9?Zp)|OF8`Eh?J|3H@^Ss?PXQt
z*UM*$a{2=zr*$(mjLG^R@OFoUEE77$&cxK+Z2hSqJ{q)b-Y_O_MZk{_rV<x33(2Js
zt5{<VM6VMONq?s?>7X+Jx(43|9-ZK`^zD_eX9z7(R_BXf6{zihu!3wpY+Q8ON-TQK
zOVKJ)64OFwie8XAl{(9F^C;{>m+S`VdZ5d-$9V0ODZg~i9%?LZuR(JuO|Ij7+HU=J
zQgC?u#j6dVj*UkWnnLs6{}F%}lYxK~AM~307u;Vt#4qhAu@s`#Jo&MFozzJ?b>Gfg
zznNg;o8OPw&L^%W4o82hroJ7yVw>d*ZVggdCrSybqx(5c7u40J5hH(WXqW0I)$Pw@
zVoQ!2&RmUGq+2FE61<{4qPC+$uN++bJAGbp(nh`;-ZwVfG38M<`)>2X_x80aD3Q8N
z@oyik4SPAP*!1pQXeTzPs(Zm+7a>?rjxqQ~XKR`C`qtw}lbL1Ei7she;F8AC@Rnvq
zxbC`PB_c~m+)jwU@#E*%^5Wpl%74Zssp*vhEeJV}VOufj$|lz)nY{CpiS699Z~-&F
z=)RqXOP^!XWH+>p{WM#w-tb=VxdA(RGK359&x==%4LavkL19XmNiy{TOh_xYqj-aP
zgG2&ehx$wBd(YIwER}v@86&#5ZJm(fzyZ45&YpxH!(tN`sN_tR)ClA$eSeCgI^9%)
za)o(~vO*Mgk33gAd_)Nw$S};RE!QXuPMoc0!WZDQLV0vPmQ}YLAYMXMQ#C>&P|^9!
z5O6U^KCUNUXnn*{Or8HW^wHc0Q>aecm8|Ny(S49b3e~xqy<D(47NT(?25aXRlNuAg
z=BzTb-cU;<J8^h6D{SNNmwo1k4eZ*bib!aL-PWS*PglVSs~m)&sX9!{LD*`tI9QaO
zcY&5mzTfbri?2<>oiMWaD^JRvD2K^mZPeb<&ZuhjJ>(^O7c%~tc$QGYSQVpRcQvrA
z+Td)lRtP~>6=#Tp>TAk`m7g;scmInz@ZSyhw!A2#%k|Eu0#2yVf00KE80njWN6I0H
zTE^Ka=<F(U>vgq6^8`6-Z#<(V)zKC1oQ=XTU&L1KMX|fm)kEt<xp2+=l0l$&`5fOi
zY(=KcofaeI9@(#RITB_*?!UbLWbH;WqdKBs=QYg<(R<5Ocu~4EA-JrX>2$G$Xw}Xn
z7XcF%oM=v8V|D9GGNu1K(7nKzVoVL|tfwaa0a0^7FqGi}dk$h!cw(VaExBd0UZ-$1
zk37;Qu)u<h!)Gp|>M%X7*P&}3^sFv-i2Vw!_3Vf&!sTi*WB1R$VIX0R`%BKGb)T}H
zYSJ=4g#Z4UtyYh45!Z#hX}S-+Wl!)e@?`U-)Vx8}PfdeJ>1sKL-pENa+xg00P5iKY
zbZhxidMXk~ydTxOgLHaT3HXEbJtSq(*A%X1CUGC;T%o_lMN}56lJcmo8!hzdQf%`*
zhNura$>8p{N|9*RdXFOuIiV3BklNay^xbUlTQPb4DsIPwIyc;nM{&0AbUSg^&1Z+i
zHPK6=Sw+_Sp);zA1%n4k0q>j`4+>@WZxk_MeZUX(Ce`Pb1EhPxl-Ql1lwJ?@r)Q(6
zBT2Yi628aM#T{R}>+`Mit(i{Voh4;UtqtFk9eOJP_JAEOtKQKRo;G&NU@02h)vi@v
z-bk=JL%4~UF)Fw72TwsNe;RxCQn=CD6&3<f(?s0T{MZ$I%y!=wGCj%@KSS*1!`&Bo
zE%;1R!KAC6ju>C0Q&K)3`whwwXQtyf1L|W@#Ygxj?MoTVuv&~l@DH_|=|e~bQtU)P
zXTrMhY)fTuxa(HVe;(ijr0gp>>|qE`$z$-0tlbduIp(iCr5_J7Igrqd49|%f)~27P
zKLr+=k?b>fk?zHr8$3sM-;KXaSf%X!A6^%@L4=esrQ88KHHcSlA3P9;J^{n`TzJ%D
zsV%3XWQ#%WJxC;R-zDupQc(L3!!M+I#HaHZ#@ur^?b9X1KL>xc>-ac|jGteb;s>bj
z`$Gc3A1c}Hqi$h2o8nn~f{3J4D{h|p-`xp%p%xY6?c)%r1;(#yd{q7o)ZzbbDA(4*
zdv@|*wCd>sgit8hK74sMpkVCrzmdEjT~>h`ez~`c3PInk&&43P8G9XJU%<*D`~FSg
zc@CHlS0o3ZX(>NegY~e0KDfsy>|;4H;7-Jz^Z#kkonBO;A`1M_Na}f?@&=Vp%NH6t
z+ta8?ziO$dJ`NYfHV9pg%HQS-YhD!8jt?qaL9#Fks1Lu@={kS>+iT%wZ?3A#Jkj_l
z9Xe;-W<K+RNs{-T9)bo?2GsnwUy7O9_v#xK9fK#d;{0>nN$EaNVdc03j&^N)K@7jV
zTryVdRJGeE@}-)Pvj2)muC+)`exTFC=gM(r+?W50_F>Bfm;E{QKKnR47XxihbMYMs
z5+k}JFmO2R;3ph|b3u$!+{pH@A9Stuh?778nxly2a{%h14<F_hy=Y>wm^K0$3Og9x
zR6@9cc<>F(944^{?8O0*qI&;@e$W7d@jxO|gwMfb>aMGn94o84t{DjaGbufC_}ud_
z-*GgW3CErvk4x}m6p!pexpSaCoME|{KgOLl97Hk0P&DC8^4}UbZ1jt^l>ksmoiMw!
zZL9-8el&RWs3gIIDURx|t@i@O`Oq=|C5o@7To|}~D#aQR{36bY62}8*P9XIP#31?I
z^M~NeQqoHY6=p_yZ#e0N&ZwAfBOVeQ+zO!|<`j_o!gc~~OKVvI3`$EXqLaWzJsh%z
zA=92I3TG3Q0ML!)IE__pLH4?P<u5GKYN+v}qFX=sbj%3t@A-frav9!xn~wlYz-naO
z3f>7l-M~Q%H(Gq9V*Pp=t^G2Kgk>;DRN9-Lf0*|dvUoGU<o*`&&~mtJR_m>|5R%}O
zO_2$@CR^xFM6?K~i>+RZ;7}wlH0@ps#gQ;I5{mp-Dp3OE=Q1=bw~QFzZ!ricCcsQy
zHtF>zJy7TpO!;+XU4B$ceySg|$(B&?&Q}6v(c-8|**WuU($lX82J2<sgQc+>VKPja
zc+*Wkvm1U0{=2ESU*IMwM4Vg~i_aJ%+ko8*m?8V}=-!EvQJ>c0QJ;tV;5r>ScQ}_B
zv(-B>CmQE#qfS*JL?bs&fGCcZK_2r3R<0>ZhMb`+Yoq)(nK^pB6euY6uvT9hnC`TJ
zsl|K*-uTxkLY6^eL7Nv|xi#2oV0@2pZ1O9#e#=U9bBn&;UJ$W(bK6IC6BY(eCW+&d
zi4RW;&*TwSAVC)Y1U@vS{3DcVeS)Fc12_#ugr)PZ9E^C54FK874%*J4n@a=TdAv*^
z2I05n=^1bG65^;pYWIE_P*tEbEh*!10Rc^aVXN4p%H8#B#t<sYJf17?MqCaA{br0n
zra;M>wBQi`IORwT*~MKH_w-<^@q-16k11L?_O6~oY;1d`At&(Qtsepwhy@ZqeoIF>
z5XC(=uX;bihRX9`hVifa@4Yl#f(Pmg2%krR>tSb&H(w0<kK5a!3Y<W0*OKJD3!Q#e
zTLtD`X=vO!c6mE|jq`^FbTQ~)2cXNaa3_Q9uiPhnndzJo*yFo?SHdDaMOvqk(MZaO
z#xh-j0hM!h#5^|nHC0R{>58si1<Hpf_6@IYAY1;Ywk=vha~gJ;w!spo)p>ewxhj|C
zy&t-CCbtcR9!6{<W3Ph^(X4hjR_wN#gj2eB)lMzQZSMi7<7<SE^<N`^dQtdrCX^YU
z^SnsTaIWv(r+l&=QP^cXhKlSGk>-~);%0pJ<>6e*&J_2{O{8oKTa3sZsLb0R?5w4R
zr)NC$qO}NI=0Mn6q-_EZPjE5Yq<Fme37nx2vF?37olO*(wKo`~e9-;S>H38$sx_c#
z^pJqIom;;}7%SF%pj`so$YV6UpY%l>W^GmTnpTAhlMy-5cQ@Sn2}8n&h2tZ`6`35d
zuJRaUU_EFU=KLe3QF<~eFNNesgAcZV+Oli6TIw?Ol!9Tcye5cr-|nxr)XzUFi!5|G
zXBW}f+k(m!jDh8&V1ibZ#Z#z$eF%4(E(~5L*%w@jWpw_qP46ZvcMAXU*2|7;(S;Jz
zbb5|2TrM5~JDiSpk4CQj6$3YK*RnH;q}Y0VUfD-S{VC!jM({CrH$~k`{44@1w-tOg
zoUv9bWwS4M4aky5^Lz+#ZU}g+68iI8s4q};TYU$lrc#Vb@>9ev|7g_PhMy-VIK0vG
z8+?)KW`te#KSpLlFpI-C#~$bMk+ACORG+QRyFT-8&30Nu^6bHq_D>r63gY!t!^7nG
zzM_YQ;+fFbJn}3*r~iOYhl|jD%m~(G3AD_fDiiPsukW1pQanj%{lk-iuW{~IrfDY&
zY|1uU2<<00T*w{yqTDSA*Zg06G316k48yDeS!NKN>*oJqq*(J;N2U1x4Waq0mRJgg
zvJsX#^E;q`Qy7^8yOD9v6A`_4P|ddEfpbc!AM`YKX#sm5okr&R^3vY_vlh_=hy+Yc
z0pix_uf@*N^>IRw8cM2Lu=pwCXRi?{7+?EEK0PP<stpr=`~LUr|2UxI;of}X#dc_M
zQYiZT3=8cU%^Vzp4*<B==V!s>gum!G&~*{|_lNr~dajtbiw9(ie56!Qj|3WtRzj;M
z^__TV%p<UrS>2ODH0S@IlK|BKp<2d2_;TX47zB^a!Eg=j{7*W_x$?o`c-_AXYx9Vi
zxHK>dL4;RzCU=XBM|*s89S(7s+ll7;Bqz)TwWYt0!I!j*EAV4lh|kYm?$gQR-yw4R
zJKBNQ5*?Gvk)>uI*;J38sPq3<3AmCW@iV!uFudbUwB?sk4EbcP31k=mF_}{&XE?2_
zM+b}Y5jk2YG#HQ1-c97}EkArEqKy_O#9|LOt(u}mABboI#RQta9YJ@+AZu_ny-|{+
zcbk>`Ph+CFV(gjt-0R&>C2BlxQ3ZrI)r0!j78qm(k>NsH$kGRP-$Skf4!*bxHyObP
z4rn<D(EyEu!>AZU-@2jD3%}HIY88BejQ~%Oy$0%-hxqZD)L%gf(@3`y(?-l>q-;#q
zwJeliph0Lo-4BAx<)5-+U%`*s+CWkt-gtjn?%%<>!}f@03AjRTt!I!Qf@Oxh(;fWH
z%0kEi9;hF3th#kT2IBaSR~>PMRz*$0;Zy=r>wya%3)UU->ybqdy5EieETyrS%=yw@
z2>Qpm?MbJiUW2Od0kG{q=QW5;RzwIUha1lk&)mwnU|UaU@Awo`%M0}{=*{gw3~+qp
zBb6Rv>SA}qk|?v7^Tkf_WFXWp$Qdq;&+#LfDH5|943~q#4>lkYWCPk_P&%SxLF5k|
ziUeaI=Wm9pbI6-ol*kVgPdI$MJGJX^;AHrTMA%6%UtPIbz-yAES5D@~O?PIfnSS8M
zoGswTCju|(VEg+&5880V-`GGF_ya*21ss!98ku^Uav{>zRQklor$Y;q)1xN-i5k|0
zFIEy62H|v*5+4y4Jz}@@hX_Un*dPN4b|ONw*r+DK>~l?bV8S$_nafkjk^{umF#2lA
z3rJ2{G{-?JwCP?}AI39U5A*AkbwY62M#93fCH{0V9P<4T(>Yu{TM+!d-4s-%Ir`ZK
z#Iw4)xAlEJpJ8-;{ok=gC|4r$lBbaU=s56^f_T`Lq=N8eq39ICO#^C74T6>clz9r`
zoGk_s;5U`tPP<iD#QB2Lh;=mw?YS~M(2ivAK&WE3BzAmTaw&a*oTpTRn~ZQDd<O&I
z1e@bJY45`K;!1|q-#<T!_ZG5;VC!>YhlAD0jc-}lQbWz{gU)%MfB#@S8F5IeA_`Nb
zcl8EH`;qT1!FO3>tSkge*{7<<fY~)(OZZW(1PqLU8jM47bCkj1Fc~99n|Y51;F1og
zs<Zbi-rn^ESjs%Tvb$O`QW{(WmrzNR>}Y>TIKu3^|N6l@5~oHEsC$>l_lnJO_~C0B
znPQ6_hR&#63#xy2_%1m7fm9bO3}6~b-{g}$n2+~?)36T|&VL58WP<`Xaw=O3^5Di0
z97rh3X@+%LD%nVxV#oZ$On^!Ket#qF!8^S+am~jO%0+b5LAcy2WDada@`8wJW5{p)
zC;J9T?UNa{KvOR7KXKsh)5{?9t$*A3OB$~7y78JJtV|a22=8hChg{hs54X*LJ1-ZS
zjQ-!z_V;DDKfhj+9qH)~6N7l{U%z2Y&68La<!adJM-loTyMoVGkNK{*>0y^-iVQyT
z#VVd?l#KzKNz>G4y!OG#3Lc=eL_hXYuJ;nLe(MeAo;=xERa4+)Q2H2U-Cf)h(XCZK
z-28Z30LVQskld~0g=6;#D=bzV^>FEx1JL{pA};UJFWg|!MJP$yo3C*o*km!jh>smV
zfD@fv<5gGi{chPaMUp5Sqr{^nBj#~3GGE9E`fpwponFT#(#a>vdeI`pu7(KMSGvV2
zH*St~sdTo6Rz#iYem(dt8we_kQoTn17xxkdB{j}ic9Q=|kP$;()!rvguW_tQpNZS<
z=pLLZ4?j^}d{YlGxH%jW@7NuzMf$-6Hgd1B^Jt&1q*s$qrwE0fO{4`IKfWgermc$f
zYunFu?E7D^=R8C}Txk))HQw-zZ%uW9{?PcDY~W>{Sw!n=qHbegHAG9-d)<*+Ls%V#
z!Yesn5tGqCfk{OJKpKnifq)V^Z%pEBl?GTwr=e<dwOF2=dsO)My%X+x%5U{ZfuJJK
z$Fc}JY8pZ}6#?eZKH~6jYD>`M@~(#^;p(vNE^)eh2Z$#oS#@_*HP?}6l}pJKRw5S3
zF6RsME+0QCvd9KJmcW8yF$F9_iUMraReX`k$iaUbpp5dRV<hbM<9R=-RW7JQ4wr{Q
zpTegmLe>kXL3_jbE*ziY*{_|d#a&lKSl3U8hXXF}BFS4)ha+3Ch5Ln`0bL~zCvp-_
ze7LEOWc{CEB*Hg53-1MylCY8x(ngn-a7hsA5{i?vT1?L~jJ%aZvFz(6kH7Mc^Tm+9
zQ9czX_ELM@4BE^8U&HUYX84Ks_`}4)`5S1vfrW$R6=coAQqCjSk5eK<<#~Hfuor+9
z{{uAo-%Eb~VQju}EWb&8tQ`o1nK`MRNee&Gg5b(tVHY1AUxgC>p+{BxV>I$zr#$lD
zHQg|TVonx<mGe2P|LpkKK)@Xy-3fNb;Et1sx|xT~AK9Jm!%wtOnUoyI{<jed3zN@f
zkK>xr3`D-a<O9mVlg^`n%-Vm@sOssCFl!%@KI^H$Xw=hr;8zp>nj?avN)paY-TUL^
zDaY4DkRnB$S3dy-<bP6R=HVB|aa~O~PJld+1Zc$lE!%NiL5$24PJ<f>ei00oBvIe;
z@l}|RYlfKJ<UB4zc*Ccbj!F9ra^wH)YKD^TJCR=W@hy!IDOIF_6R4iv20szc3XZDd
zxB_8{h;BSvIF2&@4?@bl2v*7m{<r_by}>DuK*XafYx*%1MuSsUfhnMP^NSQF`W{j=
zYpeFSc)X_)!#~(%ywuY~zWeXxk}f0@R(NsdGP2MM2=1hO{>r&y+zAD8c%h>dkq1u-
z=II{E=b__sOcc2$Wo6cd;~?b!=ShLi4(y=+SZM=x3|b`)H@b&FD`q1P35P4)hsHAW
zEOth<Ohd(ERI^syV=KZ=kWe%ZM)0D4(GKLYPsp@`l-X+Q0g81WVKevx;7y<Rx;d`s
zVPCs}kp(e=gnOr=%pCz!l1LdN5AS_6p;qN|vw&o><oE~zBI*X)r)PH2@LOLmO6tSj
zh9vvt%eT+{hp2M>KSULPJzRz3{1D09wxH{vy{QM>OxA;Qjk<0zwtv>z_&;mkay&RY
z8nln7UJ*6v;_uv~3L(Bk`ZWHb=GU`xSo;01^>1$XJaFDaV4H0cc`fIeUH8v0UpOr4
zw~RPGHbIH-+oU<<R_$W*IiKJ)PkD3p*-L(TuhTy6|2NLb5D*^jLG_>{j$iXOSW>e=
zsgVSWn}TF{Hk_Ykj+dGZ>6-~pL16cny8`l@O2daxzbS^8g7=)&931~8NKT2Qr29}j
z8Yb}jces7{ihH;$l!&AbDS{pDZGPd8ma8><+Ktxi8aMrUPd#%HY6~Dd^8Ja}q%&bc
zw(cNoHhY9TeNASd9vG92c8(ei3nWSvq0#bxwZ1{sV+jh9Yvu0qT@6U42I<OfWWm5N
zkrUM(iAro49*mU(AWfU{*S8g;0&WA;C1k&UpHZDxsP=3J%c}*q|3=)}2#MQhCiiZ@
z9q<R-Vz_%XPrtj}+8*{`wu`3Fqlp!R8*u)w-h<-}g&Fl)M4Mm&LJ|+GIRd9SmFiWi
zz;Zv${}JM8Uy4<uaS6&P%SUtV=tk)_5@pg7SwOvL3$!!kyf|#*c@CBi<Q&Q)5i4xw
z`^Xaf(?PB1F1Lc<_4Csw*`@ABN`*=tUWDRc+0Iw6vKYZyXIGwP%;vUQ2RlXq;(Ko3
zjI2RUQQ*<42bltq`?aa;9=TKGaM=?b2>_I^%Q_~5)5&xTLi?Ska@#gy_Mj$y1h?=V
zV5$h5J@<wbEr_pp4JXzi+%HIqe!dUXPlEs;c?ili$DcR_U{C#8Lzp2vx8U)WY{;%X
z-dJSFp^Z0AAE{XfpTUvcu{7g*W5X-PNu-8`;D7$DjV6!#9(-u`H&1rCOC`M$1yU!Z
zz%umb$McaOOUMYFYb*gE882a4iw_nzk#Jp$<#8lyWY%CM*@S}NLvWxM`ew%Km_*16
zwuiv;Fl^YvniRPYsY|uv_rpgqzO`e<?ZkVlgAemObOAzkxY|>utVHjmT6=R|jX^hh
zm8t3!ogY$r`!{L~4k@Gt8!AiXZ%k<>OfPh0)idQZtSM4Y8ThL|f4rv8kNd#pPqNox
z)@GGPy?>tBN9w%@q*=7=&$ExYJ+4a8!#47Q=+pRgIVj-{<En*+p#-%IX^*Tr1!uPq
zL3b(NKd9a9n{5EPq?!7*L5vL`%<g)mJ-RHq9tm0`S7`)EJw6cBR|{izkR5S<PHKJa
z;YIBgjwf=Do67J|axB-;#9ZIofS?U2@*1dM*d?UhKz(EGRa^bC|6Ha<l&e$XZM2|k
zQ>dl3v?alu9vmy$u=h8kvMCsm`qZFMY$%3~JilB^J`nnG<wJ5JQDD{0Ud`i%YJ#s0
zo?kJ7`!5SR$6#u-UGZP`=kgY!qqy*KsGO~vCmIjS7FUj@N>p(L>{wXcX@XS29yX`S
zyy}M{$`2HNn(QD|ZG`6tQ6Bfpwh$oceN)Cj!e`Bt#N#QZ4Vt}%aCqdp!9&ga4ak-6
z+UvS*zxnc}-wmw4`!6^q`#1nt$#pgB1*)T;hMa+|8<v3a6uK(b9obvpPxTlXieJ$f
z>f+Dx=~Gd8p?`p(eKqiX%~<>C&6xZ%Nhk{*ZzzgChsao({KVul^JbxmCSTV^RvUqu
zOx}s)9NGnLAd{$2T~<Z6TZ4+_GUjWm>uS!B!RNmgUIX`D3f3fU7Suf4lffxXe^R|R
z7Un`zhj`A7LD2GioOi;FH!JTNFr*EBm!Sh+2>~(CtG2T45eHDBqR;M0l3n|Oc<9_R
z00Mj&o%9Ut^d?RaUR5n8Bo1UtfqI3@svPM8WYL##nl}fY&ac1+6pel&FLXwX`Xh3@
z^6B`ax;7HiJ-kHoqox2&<laFpX*2#&nc&K#QtSX$fBS3<o@ltlBer}EsRQ>xG8Ew?
zdYNBD^@T@XXkneWz-k==*bL;E7$U&E%Ab5H^1Cfy&u+X~PJnD0Nuf@h{E=Q`!lS5y
zUa$vVC?H36;7)Y_i!ff1L*6SN5HBQThg#;Qg|E?`_Yj;jn6bwcZrGhRBVA{^Aq-HI
zH|rh<%kh6D{aSYt$D70_QA4L|TaqV^+5&tF-GdXX5RR&S+4PH6`Ku(G<|02}OL0Il
zm=QUFqPw?!_aA=ii|FH0JL{F)51HC33Co&FKZP=dkwe(&cCqblIc75D%bf_}Z$Rot
z>{}~Hkns+ZNSZujc_W#!o}vs86n<upR)&p%m-6Ll#ms)B94A;QB_=<k+g|R_<LK*u
zeM|^)fw;rK=j!H3>>d(=RP7t)9tUnhbTIC8`U3(Q$EC)jt<2~PEnn`LchbwbK%u$_
zqW_^8U^m~tCnWr~09B`+3|TioOzpr}6fa>wC|4R+gZ1BVs@!}*%=~COAfv-wIvSW0
z)=vjOa2WX{-r5CzAWoC(fki|J!UsF+hIM}lIZ6~|P*zuhmf`KYmTmm`lWPs0Y&Gte
zgCd1>uW}i|UbN_xf}Af3$X+D%oAWr3n<2C2)h58|Yt)ld4HMDGRrhz*{HpTz9Wz5v
z>)wMuX0onr5Jnu-wJ2?lzv7-Qy03Ne!R>e4bfs~*a&TbgkvddBF;ihj@U)tTbSezf
zYc-$q2|+<*6FR&4QaNBBcV5u^3~Br#+Qf+Jxujk@71HOGI(~TR@x1|?2PM~P)d$t}
z;cXSlw;<NEJ#*glI3)DY<`J<7<gSs|N?;+h+bbq0cCSDACg;Lu=v%DNqzIZVplKb{
zZ~{-2tmLLX1pp_%>}4wy-&R6d2FTo{qX;IA$X?97x`q?4sjw2mkcw5{Rqp^{;b8S$
zg4v+{Ys$ASEee$P>u1d`_5)gO;iJKin^I6O=^ZW>8hn$>QC6AV2agV*5iQ=_{Nr)=
zII^@vl$HfQaxjG{guIr`vAipl)uT&r>J494*wmvH2s77ldZQvuD1kztoxadKjWi<4
zNo62&k2?3;SvQo?Fw%f#xHcPGmv#C15^%aPsBq{a{8nWmUww~s4XtkQ$ixY`gfJRl
zECuof(_a`wPxYVOmU6>Z=hv^4!s|fZw}(L7GBT^#{CYzFQ>a9&P?8)oKAjII;KG*3
z5XoAh;aW-i$w3TrH_h+w<rwjr;&)Jr+x0X5Tqqh=R(l?@)skE|Ijb&3?dnAN3tRC0
z)Xgn~JK;4F_q)x_-#-4Nx4>-*n$pI8WdA)bi@~PUf0>@@i|6X7jCu8wa%uYqQoeg|
zj~XBt{}C_Vah^9+T;%mb<`I;6Y80BXcp#20tQrn}P<hh-;C+foFppR**!XM#g}xqO
z4`m>m^WG35_-`icF!Ut#KIUv7$1r0oe=Io<yEF|TxEf}jzexRm>Y6$JcT#hmN%fsS
zaFCs=pvG75&QMyB6gbupGfUUMLfXwu##(omGy=}C?BGuKnR=#>iVN0JjEa%D;ux)&
zCw-|3!WF4MdiY;wuzsnSvmxKk9R*mG@c&>{kXZBd-|{@<G|-d4x{@(Sr|CbU+Cb=U
zhT6NoqE!M<5jQ%Cz8w+youZ55?dN6Ha{f1Jn;B)lTDkqGPLRzC)x|%@8#)J7bO!(r
z6srW0>{7VCrMk7y9z=yAW5~fj$3EGM9embnY0|&DA4IE&f0CGG%?Yn!K?HsJQ){|8
zkIx*k`hDMDxkyWP^|4i?6mzqeD%%aY-$N!mb(F@J5pN3j4diU>@NcA>nnkLdK*3{~
z*9&(jgc`4J<1<$jamsIap+LV~ory-4xo<bPJFO9?gs;!|v}%4o66_Mo->i97NwkI_
zR2ox@UA42DHp}4QGbRfb7x=HY<ltoqd2?;(4TPWP2c`vojlGN`jSJ=;e|+&&l#;NN
z7pVZ0H?#s>l2Dlfub`>+3}QJU#Dq>CfH`Gx?_AU!{Tz75^Tj}4PglV83R0fUS4O~M
zyJHp&{Yq1AW{=?&6qhj?&!Hi7iah)CWxTvrVBEs2-lr25=ak4b4CoC)p2wc6Z9;i)
zoU_xTk8Bh`HEis;TsIQDuBy{!4dLAl)lgC=28V752g0@x9x@>aiY@8ob)NBD-!(CU
zro;36pEay~tpGf5dY<plI_rlPe2&huSTG8A<7L@XWUhTgmPT2Ve`Ck&Q|Q{s`gj2^
zw;eySK;5plS6}{tN;rP6VOLopNcIxFe0l|^1-l-7aQ~-$MJsKIEmgKzfvg;l-zoxs
z%WwGhEKng2j8+UmHqQlQLxn%%P6#ptpDCA9eTw1Kb#2v8mBHM*7MFfK#)s*9%l3S#
zLGQ9BYaC%MxVPF*)rcyQkMjI@D)M=b5Vicy0fTF-0gB~#sJ8P%PojzVTh~99N~m$F
z$;S$0>2Ds98_UrzFb*$QuG(^M&c2cf>0K7EHg>VH*F%eAdfc^>lsSl4D#1;2kG>cs
zU@@dhxQe#^MO@*NNQpy}IpPd?-#v}EL;aIjJ#ReYd3_oB=4&DNeEKX~a9A7BTbg@~
zpyOE9$#QURE{dILxG12Bd1gp>F$-E`ho#lo^t#p~RIA%Q>YG3&MVQpB%f3Y>$OosO
z7k-IRA>QGQsi}$C%CD?3`~x-=M`MNiy>V`Ko67d3pjmnj;K|{k;0)EIzfla4Y;vgW
zN6ZAO80~F=8rxQ-lH{{*3_<(I(n@wxyaHdVr_1*{<btJKQ-WiKBF)7mjMFW7&5j20
z`Z8z8g#>}^ki6dL7FZB8!BULw*)XZ*{{zB>Yb-;N?-7nz(%&Z8DV!Uf2&)?}&DFov
zM6^4Ja1ps*8%onhNI>lCJFaqaGO!en^LqIhKMGy+=i{3yNxNJhHvHQ2_?P3F)D;;K
zK#mSMq$U4HW`M??IfYPAP9DfLKA}A^JxoTqzc?aCJHY)6aWNS#Wpfw@^=6v#u50|w
zdwkehsa)J|N$J_zB=<w<jSXQcUk?43TDHjxI}{I(%BBz4V@QL^3_=I)Gi-J5FF02j
zso(K-g5=!MeUj<=ojF6!w+v7C#s2Z$iIR$O!>(Z*hvj7LuZrjTv}c(D$5MyBP;W<W
z5W;uAMYW8wCr$Mr3|kr_@`}|?#AHmmVP$w^f{iej$*wP$aU<YLU*a|6qnhC|$h@AR
z-D(xmk(W}anjQ`R`2IQh@hzm?De&1Bsvjr$(#K!k3;&ambex4d&^2G!xu+O~Pm3pw
zpw!3-;a=sMRy^?9R(7fRuuTw)isi?c<#J4rez&=ku3)hk^j)a67O3rZHPzecv1JbA
zTnt<UaT|fR-f0)WuF*(D@Ju7#JP{l{q!M*Oq4Zuc_S`kqc(=0q(0Td7L*w;`f$%ib
z1#|Aj7Rc^=6{nwdZ9+1?QJtLSHUbD}1s1Cn?|8K|Vzr;hi$Xd|mG3l0vQ8OOxUP1T
zt@*X!wj-w-F2t>k1MYEaBf|PT4}pNOHW|egw7R5p&ydOxE9F<_-;mIkec`j^vkKf+
zrF!u?iSldMlNVpZT<9lTygIM$Nk8`MK0-S+1Y#y?#G&k>)iKIY{P<wXWKXG`xX?G2
zRR~Agj=rBy;c~&R{hCOoCWmKFjKt;2$;8O*T}ZGHkqHy(RtLSRbJzGt*SI`oZE(qR
zo-}J7$f(`qS%g@%T)($Y`g_lee;MxZ77x%E*$EJOu8T9Aa^Ppsa#T3~y(h?Zmn%bQ
zC*aKifTxDw^%?%r3AdvWhzA-w1BI(!CV}N<$2F~3<+2cRw5182OodN$>0?QL`GQiT
zgfbL^@5+*BT^;D*9-m$j3RvPl8SKB2noC$fFqOJQNX^<~G2I{KT*zQ<6eknEB!FYt
z)u2e8<Btrp&2i(}ros}NQjI&6IzTJxA10qK!WJDT7<}>_OW<(Y{-t)@B?xL$-WQly
zh6^qsEdFzSChOdxM$$BOFNm0*VZ<cJ)W&-rlQAS;l(4=>=0Y^QLtdjSy5*2_%ZRrG
zA;3wMbcW`is~_SBr1IIkEX7Xv<tX=afoahZ6%my>iv&%o_Y7+qyR|RRTibrVsW+xe
znRZU%Q-2|S|Kd<G@>?pp(}|@vJE!nUP1%$5OIL2sz+1u#IEt@J%Q34`F~bSCF7JEo
z{q+?2P2mId8AQLs`~`un^H1AW^`i$GT$;S7hcx7gs99}XX9<_nZvfzyncrXRfy@5p
z;_QR#>kW)8mwFJC-ZH$4<xvL8BGBg#B|{r&;V5dAg8AetCmc0LPXuW#iCgN)tmLaQ
z{K01=+2T^A|6QyV7n|U2d1ngI_mj7`V3RBq{GX|3yswC6h4&72#c>G$oS#<<PN9Xw
zx+j_{SEpziHROvH(p+}$i)f$?{L&7f@ia5UK~aQN_O5?vPKjc@((I_hius{XnrcL(
zzI!TD#mIPTnZ7j2Ex_p4rUL<rnJV<IE37_q+Xq!+RKZejwxJ}<hENU9-kl|~70#Q<
z<*gSEJ2N$71hQQ1G3_Vrx%`6++cz0q_!QOgl%&KpUmshYxb%&7l$QLEGm_s)sDt<C
z8&XAU5iJaXf)MGwZpXqRyrQvEpv~o);%Uk3hCUZDv~3&<WgBrKcmYaiPO@qC_8aU)
zk0-c@VMYUWB#JuL7u!t8QZFeo2X@Ayezi3j1SIVyd5NOBIWXo=Y|1U{8_JYB2V#qV
zTgeB<Xa4}XiJ5bS4wF@igJIiu9=}5eF&)ifSj?VViiV(Js}PiD(g>)u^zCmvDPQTg
zNa1j*ips_MZ^tY!Hqf3y4kFei;iT(W8O-PdqD>0I7$;hbBhrF5JWWO#1vZVZid|3g
z{|85OiGb1Izk%f529uw`>S7C2V&wx{>Cn&h{49u%xHb01&Y0eNY|azXt0yZYVHtg4
zL3r#|np1qeot3*M5ZXc#-@dqghWzuOBL;4ge|LSifAb9(AG-K8W#e`T^)NxCfmiOI
z)C)FgyX%_yL6(mz4vONVI7@kD=?`n}8NBouJ9%}69|=$C85!%lui*(MS>2WEOMd;^
zG{#f;KGMIjy`g95S@fOyQt#5>{&~Oc^^KlAg+=3tjGH1goWVDB6hbNLRLjgF-(O9b
z4(>E}y2u|Y@pZ*nAv12m?dV`vVSifFctPJV*TnNa*}co2pYFEGoE|EgB-A1@&TcZ5
z<$oPlCjT|g_SVp~3)COiW&)<^4eS=LzSr;N+I-M$OKY_nareNIeX;1x=ycx0j>2}+
zgOajpqtCfi3e0BP<$XmuQeI0}(pUx?X)G>bTA|R9KD*O9Q;VRo-LN`%Ai$b%5v?n-
zO9jqI9uePx$JkdqW=w+l2%PN281n-PJ?~DF4-Q(QbBBLlem9{L8onCIBZ(HT|1MG0
zw^^4MzhG$3^th?Mm2Rn9%G7Mto8G|sv^PLc^3dxWan*c_VcvYbpS}wg7A5Xmt+bVy
zwAw7|JuY-Lp)4|CC**bJK=EmFkoLy267r%6FJ2$d3LzJn5%-&*9D1mV@>4qU{JCST
zWi;*g_j%f|$oEM1(4XZ2a@%l{KkQh*bfB;_53ja9Jz-@;XWLTrIByGE@~bY^atUM3
z(K2~iXTRf{St^@>ejn#&k1@OkzTMV!^dcB}vS=&VHSkp)_sX_r8IyIL>s&=vsRX7U
zjXK;h%d?+E#npW24b?rR_S_tc-EZ=2agN$&Nh&mupJMIk-v?A?`HSx3nVvnvD45Aa
zJIe4)jyY0kDiC{}r*trRE7H3YbM!N$q+(ih-)BCNvR))r1w~+{zBoh3pL*)qSqb$H
zk1(I@B+4nDsiNZF?#Ak<z>j;w7nL-Okr&=<*BnfcXWU&x8LZcgk?<vZ_C=~0SM|w=
zb6w@$>kNB+BRE{;^eRl34OTfg{0shMcNm%si|)r@pQD8i2)2Wgg=_8^KdlFp?7D+q
z$Kwx9rP)me99EH(ikZRVJo@7S8j(krMQrzpgR~n7H99d@Sf(PMTj23$c7B&s6&b4R
z_?#O!obp|jw8AImKlMW-!xG=_YHGd;CJyb8i_ff#<jx*uWKREbMy_PVd31d;T5Kdc
z!O@JT#gr@a?%?wDnAzuhZ4XM8*%|}a*XuT0imtTx=kyc$r){MssHJUoy9qeXXV3op
z@F<{<W@-~Og$rfQ6SX0gPahPnnqWx2A1*tUyd=s)S=I%()kw|0<qrxkGf$#bY`2<A
z@v6QRgeyuT?HHfu+7zFy-5D+ACu8$u9LuOQ>?z^MTsX&lhUt<`|HOd0w)6MZ7fl}w
zjTi2|ko|7<*xQoMl25zs#~(Mc;Ofr|I84Ff4a_nv7G}?i$C<u1nEOu!Do5+~gx|mY
zv9au+Qw}%@`o_iEI##wG{bwXrX|qSDQs4fn(czW%JDB2L?d_$zYw2Y;_peWcE2vkm
zubf+CyYzni{DJV*3u;u*kC>^X_|wNl*IFE-7_~jUy`WRIOh3u)oI9J6UNxN&i4@6v
z!K0|z_i*v4zDwi4Fq+-t=a{B(yTtGrDP{Hmc@-}WIVGj9l$4W<HGe<^;a}3d`^9xG
zqRKG6XG{KLdN?Pgwc%`_oKU2<TaDWn#s`r@yHe~^hRO6+%{%^(+3oMLk?uLQ*7Pe*
za2T}QmSs*Z@y{fz*^)WE#_L5aN;}<lw{^AYJGokg_r}{mqb-@F-5svJ2i;>wTP0iW
zyj_)m_*>Y`i(0obo86KB7ceqQ#<2L|j|ohmF%Q~e@~nKr1}{pL2PLCzJ_HlEy`?*L
ztAIFlGsmU~`%runf3n|sn?|2Q4a;l&5Ide^@b_?Nw~j+zD5p#EDatD+KGpx<Uo#CV
zwMA+tFEvVnV(my#^um`LBc5CH-+pIMC+?1(7f)kVRLWaEX{q1elJT=Re#eocZYNth
z*%xw4!*3;&Vz&A|j@G+vxNlGbyF-v~x5}E!Ae79&5Oh^xdmniZa^<sJ)4K7$1CJPe
z=fgSS3?3VW+)`}&$9AL`mM)=Y*=+1(g=yGJ<&?;(mSpFV69lA5-T3HlxpFKDhx0d_
zu-Bh-cVn2nHqnt)>nW=niMJJ#4tH2(E3j=LEqjHlFGAPs9rp3|4tFPrRmT6Od?#;w
zQE;JgBayO-J5Pf7Ol!mFdfLAO1&KRfU*XTKbQ|P^n^-F0rfP^X3QsJz4jxYW;SHP>
zGAucX*0HBzW^K-{zrb&sD|*UzwB-Cs<!UVVVr)t3sPT~J*Urg&=Jx;g>^o2}1mL9P
zkyl)AJUa2Yi*HM+u<vkg%-L{T^!hgz29tVLmMz@f2$$yO57I9k5}lj=b*CM<50G&u
zEO1+eXz84myC@UXPrh?8PcfCWLX&06Ez>T(=KpE$%EO`j-hTEGF-XnGE@a=n5n*hh
ztVOa+vLuShzVAzxj4fdbgJg?rF(}!asj-WQ44IIfEMptqXZ)_;d%eH!_x<nv_nqrp
zb3NBw&vVax&biOMe9pP=2SVgpQ;{a%e`-m)2uBa7c_-s%KMFt!kUSR8p;5@jrk}5m
z7fEn8J<l_RF5l^$yho0^MAZlBvk?*wcCx$Jf}~nkSxnQyof}vJdOa;A<aLsV(<Kt@
zc+bTB=XB6LF-Ou7Q1}1dpx>-m_gLyDJ2$B3WbNI;)w@A_-c$xa4#Ux{1SL_m^QwHd
zthDk`qX4cLX5`%-AR;e2lw?HUM-vZC<2G=e^Tp#k0ukaVY4!sb5~I*uRR_IvGdz*V
zEIPTg5_VR}`ER{+a7JqU*=}uBrkR&}r7g3X;GcsEQGJiH{rXw^Q&YPXVj?sLvJ)#R
zNB!F`_cCk_`*ChtSPwp^mh`p_i%C#U3So%^UoikIR!&jLi<TG-U=_rI+ze0<Y&`QB
z01S1la(mhav(Q%>&K2TRKz0GRdaHtk>><>+k8jSd%j4>Kg&8NA;O_+Y@Pi3XM?)(M
zT#;_wO0=9vIb9*Tpx3coe05~;fkx<@Lf-ceCCx^VB?rw*M4(oNt2?7ExX*TxLzUGP
z%kF-9f)>4*Grx6OHrk!L(!XlH-8VN)bgmQL3>gKhxNQ&Iv0|FD<*M9p*8)*p<5AV>
z;;P<HO7vThVP&+ktf=6-XegVEd-GmO<b$-~bj%8+(6-z9Uy8<G;i>39f&Ia@!JP1J
zI6nlE$Lwq6;O1?4ngbj07EY@{{H(}hqcZ2M`GZX`T5fHB*4{0fiB_iU5hFKde|<}q
zKFd4Oa;I47V;L@Ytyf#_7D(IM9x<%_S+>fpp%(G*ZiIEeL7Sz62vz5gf$$Zc%_of>
z8#ABgG=8l6N9Bw1B2_ZobjbuE!=8sHYT8PWqGi;2w}el$jE<d62Q3W|Fm2u!*x;3|
zwl!ymR~VW{uk&`A-p{v02>-?0IVZa!DE>Q<(@awqn$sud51C;$i9D!SwU*=Tjua>|
zre~31a_F1_v~PE8Z|!4xLC3so<$gNdDcNesUA$dgiVAG+i;3TRT{6^$o)&+xO)^Q2
zBs<nv<sElE!jr(zpK_@Ubl);`tB`MH#9=#WJ`UPxI;|O-6ZT!DL%$y3xP<_5U><-)
z0uuByl<`oq1uqf!Mdw^7nn{E>em4mBN&Nj$;&nT^DTR?^;b;WOfV^wl4+r0gQcCqS
zj?YOEa^ffgoyM@JZNBJtlgTB0Tno(tlH-;*s{*d2;Ou|NC`-;q)$}j@DrUa~3hBBs
zifqc4CUDMCyYuiG&rw&QXW^z<!A{@!(UiK+JUPSMZF(=}3Z#T)Ou-&V?@iogqHJF;
zu=x~Hy~LUQWLg`={UZ2z{S=$M1593-k6NI52@vw-eGbC|ZGKv<X$VEgt2-V#9sX$Y
zMhwe8Cwst*aIz|uBUkQgD74%s+PR$0B;jC3HA!bfR%Eh1X$=_YPL!x`m+cs)_EIzG
zHf7x5J%dau=NX?XGS&stEFbP|BJRueG-eX<yPU-KRwxPnt!b<~qQhG=I4f?cE=3N@
z;kdjf8g)2xzSYIEeTPk#ljsEjN;ilc9}!3`qXc$lgl{w8D6HLf4)uNvd5Y9tJ&4;t
zP5XsgV7#Vv^1q5w@fA7MKMTv8qoyG<-P_f|g;e{t`$-W;#uSrR0Y^9as>AZrDZ7Kz
z<-ahg@kUyu`+$f0gTT-&r*cg4k_NYQy{(~pazzsa@s3Q@l~lkYTm_S7fc2Kmge`Mz
z-RpJVIh6NL3M5zO#yh7gJ6z?u*`Z3xRZIe^SVnl}`Jtj|*$}_(+9&r_@sW~9AdYAq
z0x+C?T#*-iT<~lZhZkrH)J|uOs)9z!i{-US#+TGH29+}sx6LK?-<2mH)r3u(Y!`-z
zclzw(@3hE|r0kU;)5<Vs4eN95gND-4K3g5Ejqk#?Ol+OpOq@=cIK@{CmxNY&QmUy=
zimZ)#?XKO1T7EmdVIDkRg1;d<vWUnkIUdUd%Vl(v%pJP6iey3`zB!v__O{VDBO`xF
zdX!X#%sgt?*J~owe{o&+>MOU1D3O0zTotxmZWbtF=mauy`n6r}1Tk_NFmbXos<&3n
z&>TB-<@A5^SFS{VBXju|wfZ8*HzU(!@Z}QuKgHHO=o^+V#SipI1(<R}PVM|PzLh~+
z$;qnR0P~Q@vRhKd^;)W-JMZ#x@x+4et-DY!vlUVvzI18FVxX<eqW{3wNm(~=-^^34
z`1F?Y?VN~8n^wP-0lxkY1@Zo83%%}Pn_HNMkBB#IW#pVWNu^08$4{G5?G<@(HBA-n
z>42f21S2Kvf<eq9esQbMobT}}e>J$bKNOZM%xlQW?JMrs+P_DDOR;8`VFKZs@b!kG
zu9^qdnqnpF-z!l}!pTN`cT_VL0Ei;*Mm37^riF|q*||Fj!fJU2=BO>r3Gw0{^O<gQ
z8_nWB#lKr6@Ur*j!mn->HOswcRnIQ0C*K$MbP#7vK05HI_pDeQm%f~VyS6#-ZT!(l
zxLLBiIu^3--#FvEbtrD{1e~#td%1lR-=jv64Ue<bY$C~8Qlt|@0~Phi0UWH3`lsq6
zZyb;Lt48+ARB4L6LT&fh!nsjUhx&2Y$Aif=v^ZDTg^^A$Bvub`1|_cHcwrYQtV=@^
z^4N|WgIIJ7ph!dcYqPgq7=d=s#V&Agyd2UWYt)qIe-IBrULI|F06|ukPd_tzTWLJY
zH5J81x|u6=>J&TZx{jtf(K!r!=Fzra&|2`+M651i#ijGk3;bZe^>wtk<pU-borKeF
zW7wfmXIu1AeYX8|OIh{0CV8nk7p(yZ{n!ZP3{5k9HoI(o)z+6K`faoR_`0AiP)sT%
z@CCCIv1G=~5b)Njtbf^=%2U@r%u1F!4|&Nvj3*3X?GcGhJ)2h!9NpNdfOC`RvY88p
z^}Vk?fqfrc*jV;&J(vX4F}W1hdN?bSCvDpt@C#Fx;_@IY<BnEJ;H#V?_IxbdREDOx
zSwF}a+>G2VQCoR<C;l?FJp}*|&uxmgxiPUAYz)IoAuQ=~kCANr@z%3xo`KR3J;-xs
z?0w0M^Mjn;kN22kQNMz`t@Fg~TpNHI3*Kt2^WJ`et>F@d&9938%%JeWZ;sDdFee>d
z^{)5iws+v^&0zw31`gUq7@GTE3tqs3E|NHozX`(z=V3y}cCx**G5$XY%ktKhE}fp_
zLL@P{#xD3%{4i`$ys&k`u{^Se4I>JIfAI~TL~nXdjS#k*nBV#hS~Q$j^0eXJYC;yZ
zO#1fiJ%$&m_4QZ%q?S>Pt~etT(9SeI8*T+d*Ux;n0^>Iw`fWq!b*=CtFsgmpvy#Mj
zd5Jqc6MxB!?gbvyrA_SB@@)oVomcKOV3?#Y**-yCAU350)ib%av(jPD3fXOiK7Ue#
zIu+%aIWS~nCTn&i%`;x!dhcL-bmvCM;ut%ohgoFeH`8OjgE3st0rIW#ShDs6{`ZuG
zr)yO+cTjhR)YYMfL92Sos@<J^bld&&QJ#v}s`sH7^6!kQ5T}ya%e-z(*1Q2Pal(?T
zFPP+mdw@F*oBBg58-h?IAYoI*L;5$)OWmbB%Wq97ZVu5|O`C_tVZ2b$<1<HruzAvM
z&RcE+!99OUXd<A&*8gt>dk1`l#l9mY5cZyYilm*hA!a}mJ;l05RM#aVp;Wuej(ocL
zYY>{4(~(P%F7=78(%nyx?De+-e;BD&hveMHR;Tc^TIa6#NxR=~oPhO1BvN9MZ@0T}
zk)I1|^e{PncHplU>oifM#<HKwRvqr<=QPtk1R<dv%yJ7{Kql|c%6lJY8HXHFA<&&E
znF$Bxgb{(p{1!G$jv20!d;CrB{*8PIlZe&wCfV~0$>p%Zuy(}+vzcrZ)8Y=#9J5^w
zdzXvs`K~AYK*aa5Wv6l6Z_KNgZbdW`0=$A5g6pRF`Ukn#WIExfB0L`=9e;0Qe6$7A
z&5siAJOocUP43!ch7X4)Hr=$f;bG7P8y|~U{4o1&faQiO6$=@1X9TQL<a34M)9#2q
z-+4muX&EUa7A4?WW+C);N5P}wmi14cuu0Z+BmeB5Qf>LuPvy1Dd*Zsw)0yRBU)6;&
z`^Ku#M-9t`LHCYAhvNJmF%V4^PscvT^s!3P^@VIC-ur>glw^&`wD(yo$uE@WTjy%m
zgfg2(u~9wvimh=ri~(z;XGP~{EuOt^YN|a$U1&H#C)#zlT1%53w>PLqUsoe!nO`Bq
z{Jtnau2SvoTEOQAQE_e}F)t8(bcT#R@>s}JeTKc0G-Pu(*ccnrmQy+FS<US&+8!7$
z%9&oM%r2I>B9EqR_*Ed}Thgi)<=Qvj0U$_AJJ#!xVL+(w4<$@(B>UF|U>2HWEt8b>
zE%#RaTPJ`+p|3Ho>n^<xUpfNH+cQQ+lhtI=$w8%#hb!sl8ZobZMACf)`)@9N4n0xV
z5I|kCXp4=WsOw`070eG|4~FL9PBnzXY*T`E(^lddFYQ<;HpMC2fa>D{cL(1-rcWGs
z_)tnhT1#)(*)56hn6O>7lh@Gxc~{uMgQK-f=Q-cedR&P^hGjTNk~6;63@OhY`k+Gg
za67%Nt+bBa*ad<Z)x&{BR2d{@YTX)3sHc1_sLH>LI{vN*Mr)%|S8HbIw!WRBjzKUF
z72&mB-Q6z<(^ZJxocWX(3_JMVAN1jA*d4b-!i~20iv0ZC?;5TC3+vqC#(Gtm-LZnc
z7lk}*fP(%DHgeCo-Z_-kQMKeS-Lk|J9HfCG4{MP-N3$~F3pIX2JvjKZ?=_Fqo%H;+
z51pv#ZH-jN=FR5;#0yhn)X1Zr`3}HNXP=7`pLTJ-Jd@Uy{dBa$v*$^BFG{rXd#w&C
z{f`BfTP!dz@ONJ40SY#!<SsxkZY~H2`Ym+RptUE>bE<Sc-+weaSm!&4gJBR)0tOs+
zoxfeCU*~nIkKjpYU|^TtF0%xyDMMDWh?9WrMZ=6|RfYXiKpHjAum|s$$GeXxm9gB<
zN3{;0H3VE<Yh*c9`0OwYF7u3A!UnFEzRLIArU@61ZlU@5MNuF)=2U2RTTu4nHGBNm
zKI>1pk3M}oxJ_@#Fu`$H!(7W^90;Ud^qn@<>M=)iWJRAAZ$;+xl?s6Pd86o0t6S+V
zZc(tbXzC!5>=5a~USL0W>|*2W@okYa(CZ#q<f5)B7I>NQJl^yrFu^dx`e#3=lw{zI
z1K+vur<V9%$p?YHiSfVEh+Lxf<BgrgR1GS5cdLR=naf^(0lq&y5N9NRZ>QH=#yyhd
zT-mC;%LijCD{+h#!(E?#6w_QqM6uIFe>>yI8+6Xk3vkWKM&1Rq31}waEAYqJO?Ju1
z9$u#|Qo!IU8=Rek>Qe6$c6*iIP6=#09`!@Ze&=$H`k!pTZy;adH%oAVfzrO&$Y}Oo
zFAqR2uB*TEnEBP2UP+WEOc%^)CX|pHuIJhQd6wmKYQz<4fL3Dpf|o-ZQv6)wrvN&c
zE|?c|5<;Bi7<(<--X`Y1umEI~EN*$rOMUhc_(a&|atJfO+MJTbx3+i)L=bZ1U!t&u
zc=C`y<{ixo%}8<jT4qqKInwdR+TBItGI2&Od7I{?yK9R(XnGpUgRL~VpTY*QEpq(O
zEoQD4ERX~F1pTWjO6EVhBhu})@#Yor?3}9A3QXX~m82%-Vq$1rlSRl?8mdUmb2Kk3
zA&$sT?SAlbnIRc`h-vxs<OA#-gy?TK5j0W1?ES({vY-3@q{1`ItswnS(HjBA*q_@w
zR7!EAjXxF8-#Vc05|GB(jRC8B>u<-H%>tRX6fBj_KX+b5y}QMrFzuu*5S27P`!Zbx
zYYFu(Pz?{Nn07;6;KS~C=N<e&w=T^^1t+jW_9QpsmM>yok8Z1Hk;kUp37K(N9r&>E
z_WlPK7M2!tfuxAeIfk1<1~&CZd9ee6i6(SiA01lO?i^o9DX0}RWEKnrAj~Ubfc7zd
z8nSqX0%w{T+HrJr$#OxoW<@3!L}2TYtabLCATC&O%2Um~upb9E*BI1=Fgxj_lz{T5
z7H8jPTEE!p%r2P0g*8>1)SDc61$6|H@msYjRvSz~vvpB}Mw}HJUcpcroKr>5gx7%c
ziVw0y&MxPxo?Bd7^*x)(`OB+N-LbGyzF~A{$QrcN%!X%i_#1iAhaAwPH`N^b6EhVW
z_pn>Bs@?J+NkZ%qBX#}lSGbJ248YLQig-tsKIk!U{E|N_tQjMJ9;D^iJbg*n6;bj~
z2HNis=JL+9D3?6V02|Y?MX){6bgDal{PhpBZg5i8T@=&!(zW{o4-9dxeLe)4(Yq__
z%&ad--3NMmY$BQW|EvMSfuLZ$h?z1OilXp~XcLKHgf%A6!Cy&m=v|PdWt7n!Jj9W1
zrtRMB1wpYO?WdMMLm_r8q+5A>t0xM0T$>=V)7i$g{h=?kSs3C5fVMat)W94ig8sz3
zy4-z$C5vuU_Fm~>4dtoY+uL&{Ax9g(9eDB<whR)YvF(3?5EB+QN?+PhZzM08eL8eu
z1ry^&+rh$$?K~2W?eKM=0HW2DyEOPW$U~d|CCSR_h{fDvXy;F94^QY@Mc|x0`Qx=x
zhln}>)z-C(u(ct0+e7u1D#gmtZC9T8@{F3*{Xu%7P)Ao-jF&pCmYoe?(A^S!78xnh
zz#~}AXs?w+FHvCuXFQ7>*cguypr4oZh`Xs6&Np>td9kuBEB)i2jK}XjJ#o5vs3GbW
zAKo5NcTC`YtV-BKlMFM|>qx#f89i*&al&@e;3l_+wYg3`G;7^*0b}x6qe&a;c{H8*
zJX1-*!xC5=1x%_u#_reh%u&lE0x6Df8=&#;1Z^4DE>6@!YE@zK=jlf#LzEN&h+*!X
zLAmyH1zWtV=Qp&=J_jWtzQ7N#8|4~!4YdXVA<3SSXYxIdf(Ti$qkpFG4?=)shRSP<
zk&>hJou}XVO)T9NyEb9aL2xIRZe{SG9#1a**{)L~p+l*$!$4a3sEwF$HyljVY6cOr
zO>0zp=xWZ3r{jHqy*HMihs&F_V4>J~)47gk<E!MW9bKzx@9M75_K8P=IGne?6yG~B
z>^!uwvI19LoqCD{L%}T8Je&a=z}7SRF>o}(Pom+LD!jX@Q29br#zql&{E8q!h6&^K
zncq?8XsZOa5^xqi`cZt+PVewu!~14Q(PNfKWh&y+E1u(X)b{OgOVOc+<(8bPNZQys
zgf<KWmDfY&CoZk#lGO4{Vk4Yymzj$~gF*;v{D$ilY{$PCV7r<rbPmHnCh0J*+`(kl
zCWXW5hrr@jH2Q1y#C?PE%f>K@U4ULfW<~$#gI+QTT{N4BXy6Nl5s&{OLSaKq<=qzi
zmidz4Td{bb_S9}R>SHF_9y;%P;9lVD)+@6fyrlrR0wwW+sU-L;xC7Xu?=udTq+1+G
z-lz!N-2H$kw~0w8XY2N-SYb{6jxJAAqZcQ95NRu(x6ECxTJU+M3SuD&1B{fH<pBfX
ze+#ufcBnr(UxU|A9SpDXm0r7gRQmN0JeH*~*U{o)Og<lz;17W7Qil4_;DL76jzm#p
zqsq*J{+Dba%Bsa&1O5l2UxEhY89XLH9d%nBD5R46AT=#)j3{Oji1z_DkO*0;43Cp3
zcYq>SIHrX+0A#t<49lyFf=K;h++22seRwDK9h>z}iYJf!>%o70a{?o3k@`yqMDgbS
z*4i*29n{#lh2ax%rF8s%wrVYa)6>s#{y~yLc<dY?SrdY}K|q)O*(?CMmYUBv{)67X
zX-@!1sDsUGGn0StL-E;919T<yy-r-AzZ3pXZSX&PNx25*+TMZgy7Aw8dD3aOM8Jq6
z(u_|4iGOqQ-|R^MTBd^kOTynto^%}nEb1Dmp)%3`rK4JQPXM}{7nT1Vr2RKZ%0;yl
zF!-2jD%<}nOBG;%Zq!B5zxVQFK>tHb{xJjpKdJxsN!<-63V+s&-wi4_1$?ewGuElp
HvWxmJV_bSH

literal 0
HcmV?d00001

diff --git a/torch/ao/sparsity/experimental/pruner/images/prune_3.png b/torch/ao/sparsity/experimental/pruner/images/prune_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..1af2c3cb4ed0825c6f2258b575e21fc81eaaf83b
GIT binary patch
literal 87843
zcmeFYbzGF~)&@#9GJ+B!E!`m9CDI}SqcqapAtfLU(j^jtlptLqozf``-CfcR=N{kv
z?Qd`QyT9}A`Qz}LdFElBx@)a<t?OD7@=RG47wZ8Q0s;cA+*9f22nZ-v2ndL8G0?y_
zXUa!N2naal=2BA6<fNo1pE=l?m|GbmAUq9;*F=Y^brGg%J(n}{#dwNYk2zcr2YH2p
znC}-6g@P|1jw#U|Xsp^?8OEUDq%J*FiBOgoT+b3(&rbLv&bc&$fyq`9sdkrnPUw8K
zKb_n3Ms2sZ-W6eRb3rcqTOKchdyMBxPiYKGMT-Yvqj!G1Mf17G5!achFkx>$gwQR1
z>+0@<Oxb<&B{pU&ed$)+2~|>n0>S8E8wrwt7~Ucs!mne`d=?6X%;&ZOw0d9KWFT8z
z`fnk{&lg$l&*rz0Fipd3V~GcR&$tkb9z@djAc-H6kCtPdy^NAQ<Zffb_k|+89peZ_
zKBrDYFf`t7Jvg9av=?Y|8Pvt%3`ajR^6X4L=DP`(@Fr)sgcKZX5;^2S+4@$LF2xiM
z8)?}!Ip<r^ct1QPBq{Et82hEs|Cxa{M`D_>D>y`b?7<>=skpi;!m}?=?<RjPlZ`lc
zxKl`>FS$)UAMixN3dg?Y5g*;FuLD2s1vyd$7NDjw^{>%MS|RL9eiCMpur7be%c#49
z{k1R79%ga<GOci~cz7IHMj2nL@Ak=~<uB))eUhq%SZyzYzhbp@aY8I5w^;_{f}c-?
zGu*Q`M6)sqbu+%wfOKNh-lbylE~Hc=BrmRF?~27+4znf<`NnBcC#Dxioc{&SC1zkN
zl48|Qhfp3Shb%04p0w_(jnXp6p}q@q{Pc)XqHlzz4=+TsuRVWwn!i%ShN?H>*(NLH
zNfgneytT!yb36MlXZwog-rAW=%5@wmd|b-+@fvWXk?&xjvShqvMOH+b5T71?IgWXo
z|D**G?}s=c1c|=VME{BWZ7_;1fjHF}0X0<=UO)B*#eF}dyj!k3b)8gklUp;1+wh0K
z?kx&KP#qS0WNulyGOCS!p;)xt{_6LSz+37IP!hP=J4T+geZV^^UBDBH7|HS#QNfUy
zGmcmehkkPDi5ul^q-?7thm}nY-mXwoC!a@)nk4hW-37r<5q2J#ZbjQc46?qwKPZZd
zWI9eOo2)5w>B;@j#a(`cbKBF*zw;0mvl7O5bMo%hdqEpxb*Px#Evx_DZs6qmMM()p
zDTd~I-ziKj1kK*uDPyp__VRjBYJxMC!p(n><<sx_lc-5_Q&z!>$^}8lpRBXrAOcFJ
zZ_rutW}DhYIs{#FJoe@%&pc~`SlYuMG&ezFX_L8SF%F3qiFXnQzLa_{jC!tgEGfn)
z$A*+kpR@3K3Sqp|#+-8}?&)jqyN|H){mJp!o1Sf-Aj;muIj6&Ua=fUc<6|E05g9L}
z`3!;S8|yizD$V)VNTRs|Y-`L$s7z!<b&z>AN$lOnQu<gdzGS_fxNe9(5S*5#rx0v>
z38o~30v5k5ZM2-XB>Hz4(6aSOptxhcO8U$&%#*j*8FwE0EzLi!L16RWnrG7%*_o$Z
zN7|EM_oICD?mY)Ystu>~$-}#RZRnWNEm3BV$mL{)qgkZ=7@|mi2!Fo!(zj5mic<YY
z@ehM#euq2eGBz}L(RY?{4=~%GJ@^vzA%l0_ZWP@(OVx~8ki1WlJ}Y3|(+Ts{dyUNR
zL$8kb-yqh%ulmG3WP_vsfS7>0^CJgExPQB$Ih2mB)x>}nN}>_+Qy=%2s9yVIiSjm$
zavKRhwyPgAKfX%KIj~~F24cOSy*A7J*faGjs@VV^+Cj_?6j?tCUj<^RAJmFe_XxT%
zrZ9sra_4m|=<De5LliSvhe&tZUl{V*3)*wqBipl=hQQ=K6}Rc4p9VgXeU8AG&_=To
z8!Bfh_g${@spiw2&*Wuj6XDxW?$GB((=LBo<~m?IP(8SR!0o^;lVYf}pVjspCC)r9
zJI*=|t(#U}C_C|sRAn~WGlKkyQGrnh3*6&-$K=Of0|NsC0*gD$a7&*=Kl#=n8d3fL
zk4JRa$^6daJJt`CB(x-_BrG#$a_X~3Hh7Qp-#idWZ+NFGBPQV^zBgoKCTQuvgZiPI
zhxxH3PqA6UaHiQy%hF0Zv-=kD=46AlpMT~n4OI=TnjKb{nroU-j5XxXd~F@H{~7YL
zYvg5Nxx(7Ven*{{q6nL$B5~J-U5y!n+DH=O)|h64UnGabQ^dK%J8z^#v_y(@t90{c
z$?FO0*}QbT@UD%n^DpjS$R1545sw6R8Lx6Ba-DM9a){w_lC+RhKP)qQ+84zvB}$CL
zgTV`V5J$YtQ)Bb1E6SMIB<Xd{bV>iOhPyV)`j02gLWiDYsXa-RIu5+RTft)`sr6YW
zdp!zk=AU(|-Pc$O>zJX-p}R}>JhqaKUQtjfIj1>iBBwwx@`FnfXD?D;n@K_M`{a84
zjdEC_wuZpgqk^7knDDT+K|_08d);`;7<HFnQM+!~Wx@*uiXO!t^@P3z9SswWtU?p0
zy4Fl-4)hWlRyYn#FTK%!Z5eA=-jvh7`t7SP>v+{*)#OPU*$mqcn=+m386J<!`be+$
z*JzhLck=vm{eOIFS%{(8Twh^unVyFUhI90~CK*9jN>Y>`(^skas#T6indUfF9BQu#
zu4%S@4a~*W6TRyFRm_&g_Kj^yd$)G8_PtZ;*5v#}M_+Tqp2mVpU+OOTuIN<#>~&>O
zR?y(<tWH*yz%_;e)U>`dy}G)3BV8U{vRRGUjoGbw+l%${wsV#ffh~n!xcjL)Q|p&Q
z13m0>wvBUoH_A5xH{Y+)BTLAr9+q0SEg^&<=r$$1trFMJe>1<@R7H76sr$@$FiXEj
zU*gxvys4Fk3$F*?rFyVJFw63@AL#EXOzCnN?_oz^uwq(cj^HX_X%U@JPE!jr*9q1!
zw{RQ2kyd*|#l_^{zN}P)nK<xu(Iw)DT)=U;i{aG^$<f+8*`Yf_W$S^e>v_>x8t-d~
zUX#&;<cFb_PQ&6$PnWEg=sK@?Kky6MIJK|m?icNIFAhB7z^9|zmUl}hj;VRPZxf>v
zL!B@1lG$C`LH!^oZZPgXqxI{+$Yg<4$uFv3_$x&t;u1bqiG202g28^ip6Yae&rI(6
zx^Cwze`!F8<3p6t!Vl@MA$1|>!<ZY48{db;4h`Mo>vnN$IxfQ5M1vXwZ=XY0ptS1{
zGz?9QbYnE5q92}V_oFJg3Z(8mVj^%36<Bm$9G{4uNGTIrZd={{vC*T{HIpLSuP`9Q
zG8Di3R9TrT^|ABwf>G61EY>O8eD24~Y%k-izCVnPOdMa^YL2`J!xCy&2@OK^MZJ$I
zq85|;FzE}oI@j^y;~?{}91&(5_A6sS(NwW|_rUXT0dDesqk(BI?BsaP0{5zw?ep@f
zFzQg3{!jf~$%mKPiQkx48&VRviVYgNt}@RfeL~OjYoX;(4NphkTR$c2{eZ;)*05=!
zuR2GS($5?kMzbplj}8rQ77xX~-)P@B&c*c7vMSUk*l0h|rhDn~+I{dKmDx$-UXfd!
z!)W)UdZ9XP!R*(xnhWh@*@+yUQJzp9>y@FEW6Q5&Y+9G@sb(|Bp;4jrTM^^zuWk+e
z^;_d3nn^8rldYI$j?By54!^SA)v{o=X6mWJuO_tes%3SbO4jrA&Y!fYsCT+dB#R-l
z7Fu#mD?BLrGHG0AY0mTNYu+wQ#O8EG<desy<wuLEZR1_m8T1pzlln8B>v7V{tP?(p
zUjok@2QTI?q{5|WE1&LEOcn{M>S%dccx^322c?i+k>0spy_$Oe<gg=@=0Y}sJexe#
zr(zGfP`XQJuC}&cxD=T}&pRi=x!P74o263qF;vj`#OZ8<z!YZsJ{6aL%B_EIdf`xX
zZHvuKC$E0peSdH-t&zfF-y*+$*(?91Y;PrS?Cw|DViX+#Z=R#g4a;duqM}Z1+xlpy
zrLjJHz9}0_8<ECTkIUm!KcRZpu-&`6@zdiCdM8b15-SuZKHJuZ7D5LcM+@z;n321p
zt76{g(l^VFmshz*xyA#%Y3d@b9<#8!v-aohFrppyY<3T!u-nmd$7|;An&`J!H(Q3|
zNku2)W#eA9gpE9&-<MK5zpw7r?4s`-&YpTRY;bN*oDZ!nC95!|ryVPwD_&^dwBIPS
zj^`DhUL0Ipd=@4tzEwl;$L;GyneV%Uu%3^=G5;+>P{WyUn@HN(dq;L&%;2Ycsa_pI
zo^R>_ciIR_lPJt8l#<V?Tji`x|84DeJl@J5+%q#Tv>&_SQy(22*YhC09Yts&da0+k
z7ue9iKBZq$jcE3vP|%+#0Am0rMJj&%q<ddx9$ht6rLkwflbnI(7lPQyEfR)L6pCZ0
zNCU_{LXDwvCQ3>OOkf`a0R`~^0xH-;1aDD9(tqwhL1aKc{{1~jRD#SAQ2ud_GWdl5
zMS(Z`n!i4gV*(M-!GHMR?UsS`KWC#@Wg!30KH^(&3_(IoN=^=Zs=anFHnwpzvvtxC
zt`!AeVA?%>>4<<pNDF@>$~}Md3*3L)`~}nrs-!6R+SZ!Qz{u9nn9a@F4t^g5VK+gr
zYi;agK<Q>}W#cI5CPMxD3_-9D|C*he^7koDmLk+prDv2<whqRWylfn79Mqy%l$4ah
z4n`(|&!wOI<8tty2(_7$lbs+tyQ`}!n=3b)t%E5$r+|O}I|mm#7Z)oygVoX9#>v2q
z)y9$LubceOeWZ;YUptuFIhotqP{QwPU})>?BtlIMf6zbw{%WVOoB97f$;R;?-2xqC
zhaX|*WaD7}=e@zD!tieepP9QETfLMvw+3Sdo*~N3|5*6<`Tud`f1mg-S3>{yN={C0
zu7ADsUk?3wsk)=FgOsf`c&3x+|IXJxF8<ep|F}??9p3eS8H&Hg`S-VAphdBS+5b6d
zqF55_huL5rADT<6ya1nImHqx}0$T>~hJS*0Rgj{_4m|>bID(wC#0xjX?Ns!dM>0Qe
z@C4RqLuxW8@nsxzGmYffN`l!`xmZ*W@M+6q47#X)QN8q2&=!}-e23b_;zt?dPnkE@
zc)R9$wK*vyB(l-poBHka>sqpqqiJnHfseY5&hd#rE?t*!atucB+rN8>cS4e+{ggUK
z1F)!(s44&EB_Zw?@xbrzu6>K)M2v{ZOQu1^^#1RUKP^!ug&^Mjo0sn$8p`IJRwn88
zyMKEe+}8s6Z`(sXXn^tv<^&DM<Ny2Pl+919#ZjdH?j;!h7A56Q0pz~y-yeSj%ftOI
z;}mo^N`J)Qe&*ueAAgHNTKV@Kxx3DRgl{ESDEZ>y-wgyJxNjEk-wizI9oZ9cKcyh`
z;82>snHX>!-)a=?Z#ycT(Tef3WTOO2)&B2~gGrtI+XeCePwKx+@c(BfHA4#Hs`2*p
zcAMh%EXcZMA@F2S3U8wE=D7HS0`JD>*vV<f0qeDNp9brSp(i|hxwz=Oh?6ii6%k{a
zC*i@ons^a%bjfWdNd{<_%dsk(6G~HKPr=fP$f!vE({d|5i0A7cHr`$~20~iHy6m&#
zP*TbVM5o5|e6HnFHu>(ZQ~lcWxt&ULUnQAxo%7m>mL=afZx}|Oq_yVh;lkvN7e7AW
z!{n_;$(xb+!w558qR6ABJZk9`z1-%VFp5<d#_{XVknrE0_s6S>*Lan`SaqR_f{!>k
zt-xMyl$-YEi^%D;<HlC#-d_D_TK)CO)X7gxy@{g{6?W{D(257Qs(wmK+ZDNg_Th0n
z=!5_Ft}!iJ>lE9DZ%Ul)_wGjq=(qA>v(;qu@lIKZOc|m^>w0#X;!lkmej^~=QjtLl
zu{BJYC781qQ@zh^yB`^M47&Nf{$jJ>WFt3y%&Ed-_gg1h!<nJSTB@s6?M6<lz8XR0
z1z(K5ni%&E)t?s1O7)BhMp^gJZQi%4liUOLLQP5ilruRzmY!WSmxzSc7hP2%-ySK1
z(MEE^@l7X_#X%Q6-dHNj<fzBV?V;H1YuShP@9J+Zws?0cM&wKKiS#3;6iV{%#_<yT
z;b?p{2+XSV{e$0r?}=3rHBa|CN%lHh2-<JSGz_6lC^2Xaoap5qnBE-OtUjmdeP(J~
zxBsNgr&)3(D^e7zgTm*kl(p;!b=6M|t%-+@y?4hHuTkHjq@0X?1{=FE)ju6b29vJ|
zHZMw?$ZpNj_Qh6l<j-7)1;nQ01m=CYv-5){^6}yl1J4tLW;E69!t(wyFqNaCw^xU5
zUi<AtCo741fs`)2oSm?`!+z%G9L_f6H|&_a$h8;F2Y>kUROyTfKiu>B%iZq=`@Y$I
z?rW*0t~1VwO5A;qT^2*=Cx0!pT6d6`Za+fN(Ih1H`PGcx>~?jw@3Ig;H2Jl9_JKkf
zh4<M))tvX`gd%s}J?fX^soE00#s+E#IbC`RCFWeujOqWJwg`MMma*Nw_c0I+eQplZ
zr@pPFkt%S`*|p+UO<EL8#HlGK^Nx71W0t%bPBu+5&;95i51rMvYeqHb8ckmufBV!=
ziJeqF;`@J}t9bpxVxQ|CUAF~Pq3x2^?F22G()*g)oQ2keI!;3vp;McO4;an~dR9%v
zE|X`RMwEEL=uPFRW?9WRkMFqVvNz|5s561hN9TKghLk+{40XRgvD<T%@3%LX)&nA^
z6^VM@6E2hH^rQj~pZkD4T{Bh5t2ePuzjfmUhd<(y2|m_Pk<1e~fxWf{Gi1A<dP!%A
z$-A?__J2GAx7n`v+v*31Rl~(5v(WLVn)SGjYh`|JTCHBYMm|=J4C&|jRy^Tf;?Pk=
zV0=|z4yT&YDXjd_a4m*ZFWI!f0}S9D`EPx|-M5682kUWz#s8pGop;XDF@#6m@^ck#
z9-l_w1CcisWKM$;yjz9k9a!2zdv)bXJU?Gl8V}28$(5L^7=sO-DVfZlQT_|QS-ieK
z>FBJ-p7rUBYdUNF(eT}SiTu3seN^Jl)*?kOew@s>>U{?`zw>?W+6Ki<*Ya!Sg>g6~
zCU2iP-%p2VeYJ=vZ=yd9koT@xfWB0kis<=g>#7Nplbg%EDPVWo!8Gqn&R0{4l}vx$
z;vE8aBBZf!adX;e%aY!(5J)-+{0g~m5PhoCus1qz#~xo^fDQi^!ykq~{VZO*9F^3t
zphR*siSsoz@h))UHbD;^f+L7IskO7@vEvfbmG`n6w{8zmFgC*(s`>@D-dT>`5!%)_
zP4{y7<g8R3#*c5+UF~n;18i<WIzIhBT;|i?#^!Z0VQR2rf3@G)`Q=&XL&tFIZz~C<
zg7+E^C1DcMlu)+Jexq2Wv3U}l-WiN#&whb5*g_D6&4?m*#14oORN)Iu?bscR&92>y
zxj3^kOyP1Ra+MR4u9&=ER|=;8BOaI_DQ@5HNBX=|=##<|F)HulO(FY-!+d<Nj!-l8
zbk<{_<gDhfU&uPyIDzwB%1VM(zs>iZDyYEbekVmWiRs574K3?R=!97_DpB}rDmtfv
zjAz`x=&XWa$H3Q2!b5=Tw0#gRxiEi%;vBFh>up%+NrB#VH@W+AG^C~Bw0=*<i>-D&
zQ)CfJ5IT6m@(5Oe{_X$reKk@M6k@X)I?g|X^~z`8Y)wRZA7w#oA^s&5FCDsQcOreR
zvdBEPOYa7uGT${U)n}wuNco~pIL7UBdpWnGHFuHV1qL{fLF%ok^aGUguT_)X2|6w>
zJ$5R`4OjAr^gKB=VlJBTyded0>SB^>)>ObwoT>k-u>dDos&3PG?J{MOVvSQx)|T8(
zsMV{X=jGgrgwpds`;*<2`_Yi>PN&#SX&=wHw#%rhY(Tq`k9JJ01?^}l;uudbmx}Pd
zG)KVF)!~3G^Vf<P*3vfoX0P`Gs~8Hj-X7P-<JMuURUDfu>sgWS2Y&7GD$rr_IzNul
zwE}L4=vPI-f2^ih_=44Tz9Vup7+mb9U-E-G3}z5SW<qLRp^8GLEQ#3C3Ia1pf$WS!
z4{H?&>_J<179*Z>BGyp7E2X}NgmXIptom*%8ehdmPD+t%_$M2&+bc(!z`>WMsrSc}
zfX&2EtvjXA#-gMw@(%UtJ%T-s%{jpT(=v4-W;y!)ZsNiZ4M)OB`TEgt9~gl+ZrYZg
zY!FD&Qbwou!f@uJ`)V?;#b{nhivVypRx9zE{kE9{v?zJS><wqs35t!tmEsl&?bc+G
z*)<_jwJr0hEcP2G>X?8T)8wE_gD@-H-Tsym0}q-B|9#<L@h%?&W0~Y@Jnyq>rh1R-
z*V}^4u3hzz!s{qHf?Pz9Mcm|rIh_#j_7VKd^6e-kuPy*EvdYg+C<8!iyy1Gsb_zre
z%gcV(2JUFeKz%92Pg_Ob`vdVP$o=|%MAPihi=Brki5!o%qi-~UA*slH4mLsM@BbsW
zQ^$O)P$Kb=vZ~wbbjFm_w$2^{sV!Gvzm1?(DOX!!*5~%x?x(={a%^kmT8si)2r-Mf
z)`4^erCxWkN}It#d(6+eM1}1EZ*!j;`u()I#*}IQ`TZtjJZY8((*qb|;YkwP>B}|=
zP`lk4l~RjQV6lX3<~6ycC~o*x_^%IE;)$b)iYu1iT!^jcT=aWCj1-21=!f3N`>4>P
zUTe#G3!-Ph^S$b5{SlAAZizqr_y@_iaxUBn$I6Qc`_JvHsR{c42qj4%liB5KD`{R_
z>Situm%9PxPc9KGQ2OZ6{7q}<a~8e6O|`=Jk3r0sd}aC77_2{c47_0E>`bLeiqh41
zP2JM1;zrTg(#>S=iw*1c`<gh_nF4Dcx!(m?_(`kj^HCA>qx){XBT&ZRG`_t)t(LkY
z@Yzc7!6u5Xl?QNlFaQlm_5<wFB}?R}v0DbP+3WQAW}IUG2SGj{R4FJdlgE(a9uRhX
zX%K!b_mF<yWDblyCv>l@hYiHI!+2J$dWJ`^QH8?N9a`Aq4keF}Ke9CO5zNPGB~(qU
zlHG84kIhmdUL!;me(z-oCH^*mR42b$@q&`<Kp>w2u8(XjI38_R9i>O65WsCEtM$Q8
zyge-C@80@j_&)U~{mIv`9|55}NO%Z+LwQ1plnP9vYW0KVlfpz`67&h*JJh!#_+~i_
zYgx*BxkOr2MiBWprKIoHP`;P}>xSx!w%bBL6^LXB#yH5t_)CHD-yvV1lBqS4(B)r3
z1+vY=%nuI?FkRZJ*3vxs`}vo`p&wF-rNY=7+#=f*1!+%Y#V<f8Yf;()*r1}IxM4EQ
z^Po#V)cL1t<K;}5v4Z>P1wbJh&+DiAN&kk!5J!jnl5<_vAfAY#lH#g<*X3E3_ANkh
zRfEVmzMp<KnVl;2TL%7l&9hjz!%fDPG(KFYi4-}3@`AL&bk`;JTeiY#0!b-r<F~We
zbEc=sT4qLUMbhp+!iYNGY-A^L>G?7pW*08icqZass@;UK)iR`j0AF^O=82JwOx(W6
z<>9WO(rQn*?9G7arPXSYl=<m<uiR<!YP$ZE+6TgJb|>SFH$45h+Mjf$yQV`K9rsxr
z?U^@Ez(j>P(z0*Pv={bjMs#}`7pLuYVW+_XCwHV&oY(c19$p9AuHv)w<vooTs^mmY
zWjd}iJn(#rn`pg^XADZ?0eZw;W&Xs0ilN`XYEhw;k49ldH4q`^x%{8-0T2n|luFYc
z%rFj4)8{({gvJdOPE|WqQ-sQ6ca<P7rcvqJ9=QJO0wLpg8(0#i1j7FEY2&TeDj`*}
z*0v($qqpJL6KDoqGtOl*u5<OWgA1<j0&>&61rOiWQ9Yr&u}j-d#GiVmTixF^2HFW>
zhvmc)7`s?zK`iuF_bs`qshRFs=^q7&T~Va~C@NL*(q+Q994t2aO;m}%Ifx~s0c3>b
z5OaK**?cvWv<cI65qL1;s{Wij54G}NZ?$J2HMUns%~QkVO`VPj?V)SVNs93PlRNUQ
z!iT&2O4LOJ4OY;;Hy8*j%4nrcI?iSOcLW&<IdhUtKJ+3loIt}Uk*ol+q3X~tjjm3U
zJ6GM^U+Og7-m>AF=wWZ1rh!mBbhDd*Jn;OP4{;p!-blTk1L;Svre>({bOgP0&?D&Q
z9l;H<=&g^A{g{@IT+OktAGO6~a=lh}+Ai%hu?jQp2CyU^p0ZNaQkJ-dCqrQHL<FKG
zf9QK@SzMpswPUFlMik!t6Ez%`Sf=g#Qys8G`9q^G2|;K@8d?2w5(k-(`tDg(P1$^h
zh$YS6o>VucHa(!x^Jd|*lWSQ-)+DPsJ>%hAkcBkeHyzB7*l!8VdHP5^sad3BY51Kb
z9ug`hRmbbGcL8@KA(E2c-kFsy$0!75__|4ZXJ^WMyS~^I>c^wPs^-Rf@NvfTu#f6_
zJTv#4!OMykEarD%4=x`%kE+^xkDhz%)hDt7uw=jM8A<%&1KyN7WWkQ~kg{z8#2*jo
z(&uQ~0Aa?-pI!rWf4QkhOZe|0c|8~910&3>X}+X{WV=+oBZ5?u0Y-U;a(%uZv{Hj(
zK<w5NgFBeJdiIUw)e~(}JYFr;5-L$_8z^}lKMK~DjZZ=3-3hNO@*!5<`(b)7?pcii
zL;w*>(DOdG!}gQSr9&arTVRAy!h{cdxs6F^(kP#tZ@!{3GcYJ0mS;95pB7NYZN^C~
zxAi_up@M0Hg=qXWOy9TQo)tei8+2KVO|FDdHv6(6R)56HIR`)N-~I2A8hjQ?OsIBy
zs~e8LmN+oddvla_#u%UdTA)Je1(;<5EZ!NKZb~}JGRhW66!CtJw&)9VB(xSB8&%*u
zUe>(azpfnDCEd4V=*dpheZ>H?W6`r`JfW}QMDz-KFVo<a3aM2V-1uyFy!f#+#@Ez*
z@nK2PK5e5Ega$2wy_$S_f>43juunhv9mKH}moKhzl!{_{1X79Na8<wO#g6jPp)*cj
z?G^USc_T{7ZXsX%tA7ul85H<WV-Kj5ptPX~mC(L>n7oRccgZ{__MoL0d?|on>YUKK
zIY=Zg3uXWmgft3UkxYrfeVR6Gy_i~his=Hh@9`@5R-uKG{DjTX6Y2A<#dQbrve>+F
zM4VXB%$U5{NCe>%cd+RTBlX|T`=z{eV9mM%=_=9xLI<O~D)~U@hs*4cP0TNH|HOyl
z5iAaWqyLr`=44u*Kl;a#zm5{;MomGd@Z=~IgD60ZW5f}v9zB8(tN=e^P@_zRAbhc&
zMPL;KD;d}IXy2>E;4OUe&dryh`T5+#wQ%nK;2Qoyjer0p+UCF-AryI&4GX``1T4zb
z7b|hNR3@JS2t&D+4p7+p1vmo*2o=SyH&j~XF_}Mn-)r#jJCrODSa?T3{p1;|AMP<o
z?s`dy6JI}S&WU{`zhn$x3BS+|<DcAt673*Mb0DUWWb>qDF<Fr!OLD{(?s;yP@GcL1
zx~D@Swair<upie<7$XJ*jzXa@NVpqFM~qRCDChCxiL7)z_uKgi+0?hgoV%RIbu?r3
z{eN~7s0gdN73N6Va+J44|4b_}ss*Ooa*Wwgr!U)^Jd`CX>-hoL0MB>eZ0NBV!u1_N
zXgI~-b)9uzLupZAB6l}5h$XVp?>)_Cj8`g<j*r<Wf)BZ-SANBxL!N{XYx^#4wpN<c
zptBaa0IyIT?$EcdPv`JdQ4InHh;-cyYBuy=9)k3^sPO6A^(OhJ!K{@~*fSdzD_)qs
z|8^hWT=<7Ye(3q5#Jo)POn$;rrx=A7(Z^5e)y0mF0d*%EPM*NsDTCQiCko~d=TkV8
z8zdLdU>+!uD&ILuB!EZdrtZJwIjta8RWQRa@<fK|arr|UTTEW}mYGX;{7{>}iEv)Z
zP4ttiv0jJ79IFM)6ISJYCg|jD89Sw&<f{ePKh;HZ-mozwp=0q@J8%-=_WC2HU`x8p
zd7UPImuq(=&+3@IXr)WB;z}`!k)z`e3=>^tQ(Bd9=+lYc_QPfiE-SS*&zuJEE%Y2q
ztqPv<$q?z>JJ)sW<8fJzmi-V$0`doj%Ptjf!-fay8@sE15$D|+tIQe`+Kzq3K4+~I
zy~sx&*m*>Us!|VR47dFfBOVG23q#A?A5!{HA>#?_M~dCle1jT`T}}*G-JGwn_aq&j
z{H)%NlDfl5G~9~}nF6gd1oYlH%5n50nzI`Kpef&_+rOi~OT$24p`L?7S2qtUDxsA)
zwO*O*AFi2Xn^%4Z!#27;*T-f=>7EC2sRCw)uYfF&9}rXZACZ$60-S;g2JbySrD@<w
zjY+vp!uR`K;e08`Pc5p>7Q=!d@1ChKjg^S`4FKm!I%VHU*0+=cGtW&mX${21Uhthe
z#B4SJu&Ih~)~(gEG8f`H{i*}P{Un^&z=o5cC8AhY9r*ee;bcN%!s5l}W#b(;sVp$N
zn_r)2g`-MlT&Ao~KthzHHT+K8|7|5$<R7$%?x&%L8=x$KaL@xt4Hid*tohVpg7*tH
zHdTNd4T4nka`m7vze^pC+tl$>Voct>(DPGr{h27w{O{H|7>W5<s|4|W?CF)q)sjyP
zan$U{>A7ngN+E-WwtYK<w&csNBwhkc-iOG|9sZkvsOVD<J%2S*p;1Zdt1lr&SSRYZ
za7K=EMOQw<C+6>W91tN&5GT*5kyY$Y%vTe0yc6Qg^cwKT@SC2TPtDz!klX(7G%hUB
z-luJ%7j&IO8&hp?sy)YFtO69q<oQ~<n5s^c>mp+_ol5Pcs6xp`gP>Z-9|Xw-qpt=A
zZ@q+!yKqOSyW)dG^^}%_rBOAN)FvHtdrC~9nrXKteWA6Fj=f70Z7$o50AU174PY@B
z>XW<iw*}X&;9NEW?u6=<{--5-TgfPfwVa<Rha00__MciDHr@;EVz@Hs#fWG}$dkY+
z4Le(EgAjjOHSZKBZ;+>j?0P5akIWqB4*wYiX^>LsV2?Og0lKX`A0DlmcIYADG=JVp
zI;du%{g&<fc3JSxMP``^kb>E8_w&Do?o)Q0VZAWWSz@N1potVVByV!_b7Kn@tRbM<
zhA;;a3$+#J8zm1%ei1<_28r1OFils|$7avk$Ry9?Oa1cDrO6hjAR}gg+I@faSSGw1
zled8yT4NgT_~}bxw&DLv#^m%tzZLU2onQCj>JjX9<V3^eE~&4x+Pppr2@)0_AV8&2
z{cs8NXkU6$lFxTL0tENKMp<a5B9`pU`lq{{XaJ)ESD_;`{}!3rr%+)Gq>ni$oz`zq
zD;cDSo6bPWg+~=gIb!!j+n2@=*XCKtVRl_#;#L*tzBkfPn6R5FMIB_wrgnpF3XmX@
zX07*en<Jpo%U#$v5y6J|R<A}O@@4V*YDm(zq5qCIg2naju1hzg5n>?JFqOW#SEP!8
z-u5gR+-lLM4M93(L`6)*k0i0k?p+k1zbRmH^r`p!PzodfxVJILKLl~8UH(O3-aE6y
zR*D<D%rjOmb&4HfHfKB<jnS5t;2N<){&sl-c=A$MA|w7<&t!&o)@>;SiLcxTe;i_|
z>vgPtxYB~#L&ov}4n6a@53;-gNs)tzlK!S&-A_rR;=!E8e-&YJv3N1q;&wnIaXCSj
zXoI1Er=4uq(@c$q{ZPZ1iO7|1Co&tOIj9t3@X}sGq{Z1HJ6@wK)pb@Np~>XdD0SpT
z4O!Yt!*^4}3bjjtyf)J{7^}h{f0f>TAkC(o6SJap6PHJ!Jp?hISM;ts^rOFQO1OH-
z3*w+{<j(>lgV<cy%XARwG`#X2{}Jhslf*&m^5tXl#0A`=lYY@hXHPDtU#pQrcRt6>
zHNGM@PrtcHBW|Ym3Y2qr&4gcmpna~*xN>OmuA0S)X~=rk7T@Ya=W~Jo$j^&0oND!3
zyJ$Hn>kr9dZ(h;gm^U8j?%i0;&8Zp2&I-)EA};sxPN!DRFB<k3a*>%+C}E@?ncf5G
zeqye}e`q^_m`?usBfq-LJH4fTQ^e2z8$}GH?JvO<66g>hRHw;<4SBy{=)V(l;;2Yn
z@mYk`kysN>*8s%THejK*VLk<nIS-IS{*y7!Ktg<nrOQZc5ukK=49>rBK0Zl_1;{w<
zOWc3CKgh0=;8{<p(>@S?UK#Y<Tmy>kMyK(5WgAXN(b4`3AvK8L`$|LL=>%_Wx5T90
z-d|I6OEswJwFh|0*LWgFQSbnMvSaJx(WDWCl7gsptmC|hZ#Ai=12(NMW&es}dH&*|
zubVO|6GD&;ap{hbMRp~~f-CdkomZ63D88IM91xp>o1f6l*^)JgP-%Id$Sc@DcFNBA
zeklx^lza29MA<VML^>QC4Y7-LnoEFKmK-Z}az9uwwR(EPm;OS|<jIB3Z&0moh;-Db
z=eZvMs5zg4{Jm5_U%(})3Xo@Zfrwoe!p(B1gdZU9lQ}206wbX3c%Jg|e!-qV91{GL
z5-<6#7_R!<z00ZZV2m`ft`9xoW2F19V+7iQ_T?6c20K8+tANveH>W<g)thOcPmMdk
zlT~#y6%Z3((EgT!d@nE_m0kgsu?m2SL7)%&sP`e1njH4=Rj0a4c0u|r<&HMeeQv%f
z39Q5!+z)Ynv;k=QasVtQifR_INP)2OwejYH>VdtAJ~-_#oeE0{IKpe)q(pr+1%+W=
zFD3XYw198^DOB*_>t}Q~T>-383gRN)$p^zwMniqQxOPcwRwG#ouZpkVcWXh)K)K-o
zZ)Bc0kW&D$Uu<3ZQ{z5r%Bac!$knQP*uMYP5&*5`958)*Nstixx+6Tqn|$9m;iXB)
zxTc<$F<i~4kv1^!N4N#v?p@_d5QR4}VIm-uYQC1VVDV@jUY+L69LvEWz_>W;S2!bE
z_pgks#iJ-XB#=_rVemo+exW4#dPS350R`F$5aSi7gwTvRG54=`vwmx=zO|T?lXWMP
zmQQ}^T`XoE0dF42(~HC1s}J-CEge7H#U9|Q%mF7;Edp{)`(T09BpQ@-`=t&zhhQ}a
zCJn9zl>;2!2^1%h6tH7`zLJQi>OEncxN~|9T9%oRyC)O^O!s6o@Q-#9SpjwBp%xIJ
zw&4tHvj<lydmk2+tVEDP{<_cafs91(Q!1-isApIhemt)2I04iR&1!uqiUBxlR|0a5
zVQ4og;u-)Ocm?Fs>2`@9;LIlAJ<5;Av<ASal4v;%$<XNDx%P^7(|a*%bJ=j;PwC6l
zi{cFtpWAI-<y4JX-wr64>sJ~7-a4Rd1M?Pi0o@_hczo>@rAtF?O$UcuUh?yB-MkMF
zItJ&^;sB+Fc}2U@8Y-JRh(Vq2$w>DtAXY$_;+S$<im;BBAuP49DLogyY+VDQvK`&P
zk$j6Jeq->phUMV;O;h##wWuU&{uY7j0U=L!+S#xC-g?W7KnWTB(?kRT@k@(^O^V6O
z<2Pag_auXKNPYNL>wB17k=cDfDsJ5j3}KDcHC%%9gx^PqMKw-QE*`)`PV~82qJVRl
zlR$yGN0@`+H~TtL#Ny8GM=?w{QJ`#|09qT?N#SF)*Klc5h|*yZ&Rkqqwgb{c`|vB#
z3vD4pX_C$Db|SqBu*#Vi+3JB>urS;yq%gV%OzQ-gV7=bAL%EXt`iLVF1yC6ADc9>S
zocBQ)n9)cylN5Y1&48?q$%}yrg?)4ymUDp<xPzahT5&@&T5A;VBDj~f6K<6%n{CQD
z*_reaT0z=R=6rm<gVnj@i@_O;aS*;TT=ad{B8dEr8MM@eBj!EZ13z5ra{zZDT*6^!
zdK$0%fkI?$N+bIo`0k-f*tY=S$iF#f#lS;Y)7wXs%_ia+-<1JdkP(If)Pd#?2Y?GK
zWvzlyBX!<ht%*g-?Wk+&7=yA5i?$+(I;!<6afb<WP<fE?*6wZauUxU~E@k2UHfV$;
z@RFsfSx+WFp#=e_e{CM7i?(X48S<K@t~>4I9w^};%fu~!8X8DN?cOX;vSO#4JeGnh
z&V^Zj72k~hYjMiaU0;xwDB5A_){Q6&1zh53z^gd)F}N`Vh;*EyfeE_1tR|aIekmw=
zpuy|qRBJ6}RgRkCgaq}i1A)k8+P-sQeJHCH7ps6Y;aTV8lY9XbI{_(-r2wHhR3dGC
z0Bfup@hvH7n{xzqvR|>`TYb4Z{Et{<O$(~jzScN%{5H1~ac&gSi#LTL?*%vWFgWVk
z@D+~QzAW&yp=w|l-Djk$D=RS<4pjQ0ukDlugw$YMr%p0g*k(bo4V1%cO=ynfnICj<
z=*#K2o{z{PQUnQi2jMF-D&Ss64BLA&$cl>(;z3kB__xRw4Sd4`33G0E1J-HaE832%
z`2$)rf&|?jn;+^OBxD6|+mdFqYbK~=rNxpctT&@|!>yzN5bjn$;;sgz00Vd1l77~#
zCcS=-*v_|a)-JT;GF@X|1>%$Rpn-;<5IB+<QN8Hqm7EKVd=NnXe<BC*2vBP`iXoo@
zUnAlacg+IzV?+B<661(0EzVi3M=0jBXknf}Gj;)@*(9j{AlkQr%6owV8Qug{_RdDJ
zGV2wAp@a|vN6?ljJnvurcHA^BeG=$oMnF(dV8)xFcZVYnm;ZD2vB0cik$J&1FnQaG
zsY<wuOyCkq8-^q_3Oe#}RlFWvSrQ28`0~uLU%<5Hs?<m%6Y8S?hY4B#*I)wRjokps
zB|!Kf-hBo~8((V{LrXtifG9mR8A~Gmjlgx|caaR=w4Fb^SfG?z(+BqhD{w#&TK^sa
zqx98i82;e_rGZ&nHdjlQPz?aTG0i(^Mw4|@azW|c2lA!kv9w-<ZZg+Ac=gurk`lNO
zYX6(fgmac7?5|PaH4IA$|2_||XknlEG3#;%x^O<a>TY<1e()|Rg-sSB`3fJl@pH0Q
zr@#`kp5MA=PZNwEr45u$yPnS-e{(^~o}S>|UrE)x5zG@=>z@Gd(Q+#jNX9!62>n2a
zw6%hj7%ZeY0v2NuUKXLI#H0REu|a8d%F=S6!e>SK*_tf5dmE_!^13|eVW$xt1d`Z@
zV~l%;XYz%A8gA{jKUg#`lKwqB3^FyNsTrdc7S{Pj3aGTur8+%N0cnF3fJWi8N+gP<
zWCiiRv>sH9L7l8M0uhe8wt)B(np=xwcJ33(m{USxlGL07083i^Jf36ep2YAVI{$(E
zPAV^+D@+;ao4lalK&j*%`&7%v#OHgMgVjx>s{XDw^xmk}UZ64y`{an*PKBAqSxb+m
zIp=ie^Yc4L(S6#qp(jt83>L>ff+)zlS9drGu%#uOlRYAsCNM8}v>S5NharG$&Uh?;
zth!}8y`~I6(!B8X(Mad;Ec!z`@zE7fV>jEHbV;#v2kLrwt?Z}tl{L$CO0HysOl*hs
zH{&p9goN{ggr6Lc2OY8b&SNh-*&44L;JK_664p5<^!jWmawmXDw*;V&L7@1!U&6@(
zoIJQIn6fDNYUPVc7>r41(hI_?WfBK>-ey(Ei`as!h3%y9mNY~bMFU`F)l(q2;}_d{
zE|FGsJg|fT`ODDZEr|d@sIAwQ7H3qv+aU3qn%pip5P7KL0XSW;Nx(c3QBMFi%S`NQ
z0nZv_(s!s9*nQ3wJXO4w+sixNi<X@F2~m^y>X`2V;8ltzdPWIb!k6fc0!$=d!n2%u
zA5c{`G>ds7FE*m&6@ij6R&@-5c5vYIA*g?#nAr0^{caY9(hgS<m(qd@?)7-ap4s%U
zR#)8DfKqAjZdO%IOnzYOPi-J7c#yCk$a%_Dm;?tQmmoKV6axW9_O=)Hddj-GQeERE
zZ3ihprSA3}WLIJE754s*6{gr6h<I-i&;K(JQ^st*tq{I$36<mbB@`I5^|{pgzTZLm
zyCeyOoOVDQ6+%Mp2rNfItdHK6c7!rdu!NvB-d^wRY?lwb7Y}b!7H0wOQD7T{ric~e
zb4C3pZQsBMbIk(7Z^dL0moFP@L`5;%GdFYrHCQD6=5%^eL3eNs!sVI)&W13z!GXPj
zxeKV_o0@O-UoyYMS71?%DsL!yf^d@VeUUE$l=a^+9bVXIS@bQ(8q~T35(jdeZ0X$!
zUeGj{ny-Kf8dv6DK$&{8^);&=DBw@r=dWh1n$f7D-?j!XV+4Sny0n$K?|xfbObd=W
z11X0Pxdps<0WSJZrFra<T=Vp(ZizzccT`PFR;VCeh}Rn+<`a1#o_@pSg;Bmc06ZFV
z2MN3&#=7zPglgK!@*x!_@6Rb2wW?pF;r~-5?u3JwzHVxRl?JcrNmk+DK1Cgv1%huc
zx|DN#^v+Bz%_03vGTw(WxQH?V#QlVG1fD}V>kfOjbAr-sP>kBA-PDz!?a<ZBDpaP#
zH}m*p$u;`|qlwIBukqG<m0*0^WA0M7&a>=6A%W{GsDBvhrLRavH|zC$9(AtP*-MX|
ztirY_lvnkB3)!Z)@#YO&J2SQ!ivG3&1$AnX#%!-T_BhrKi@lEv;PnyR(~C5Y1s|*c
zebBx8eNR-?DhlB`MC-kA1C!#aN-uYMo=kd+>s1Gj#7s?aEU2^5od7igAK}$7fB-h$
zO&C@a!zFJ}adEV)>7Ci)s(#WdY-fEm;q^Sq`^~c=^=iZJX(UcZg#k16+*FrWN1q}-
za-B*NVRE}Rhj@I{f*G?-n(%=dwTc-}p=`Yz+!0rJr4cSgd9;JemZ<j9(N~ZqxULo%
zak|)32|G&AU1;qFjhb&_cSKT9c%d%EfMS7?-1POT!$weh#$B!a{?M7I*oN7AJdWu0
z<zJ~Ci2uY$I2IqMU>pd}1bbF+#R;V**&&$2GhGEIV>*C}t^_Cin$eZlur0VZJ+6J8
zsqw3*CVdp$gT|{B_VPW?H6f~pU`}Y0JD(;6(;H5N(5EH!0e8+he^`T=jBW*FBeFwj
zt-+$-T4#EwDLFl@qibMqdd#UJ{_q{EwbFZKz(Pdm{1kszD}-add;S_F2|5SX7*$)1
zhgyZX-XtF^_u+=A+cyu>Rh5yfdK%c7+DXVJ!_G+89*DPELirTd3Hq%f#{e#x2fwpn
zC7|8`5bkA_o(s{K6&T!Z;W71Xe~iKx$q3`Tr!@eSKPKi*fPy@Q8zo}cgTi@>ke|wF
z7~Kj0;$*Flfo5vw45_CDz&nhF{NW;Sngit+{T~%NC5i8&tbed9V)P7Mv8Us54GHxn
z)F3C1D`iGk;i0zKFbU?^-HiTPyZL1wB`vpxyf}+RCGFzk3oVI?M`?~j`YoheUZygE
zu!h2@-w5$;2Xd$hFboeU^Y5yi-ZR$pnuZ$Q_dw5pSh+EEJNvC?yr4jY847R%%Blp#
z$S=%!7F-|{&7bVh0MY{~MFkB4#Zmb!2J5MR3=2>dpWlj$g^6>3@*h(GpmenlALc%G
zuPfNj%X*KsK*UV?axNbK!<i4igOYybkIFtN3|H6i%a<Etqv%MJa5>^s<Q<u7&-<HN
zrzz!Iz=C^NfLhdl%hKmZNU=%Lq=M0eyH3R04Rripa%CzGwFM)7>8!-`ZN45T2QorK
z$FN2Db0l(ls>7xPk})m}W8rpIe0_-3@Y_fUwq4axuWxeF8}>)HKA25mykRl~HcaF+
z*(LSmKq=EO;y68Hng9vsTzqW6E&h6nN`>V#kJtSqJ01gS#YTTOjW}V5eU0vO6>`2u
zg_)sMJIjFis;F|QQBGOGm(eA+<3}N8*|g&>GUD-fW=x2zQFyZ*4<UHvQ8kk;<b{Yw
zCu{;eDyWHTCO*jyv485Ix+H<R8(m}UxK|Jt+QA-wVt#D9antAigZ7Ja1-SG3uMU4H
z&}MHDkU!31@NN$@bs<l5j7X<BEokcABMOj+LPzNnCbm~}NtQnUBB-ta%8{@eox6J;
zebp%S^_qwb;lAYLSZg<Jc+H_{lc8Dp%&SJ?3m;2z$#ZIr7IKg6=z0)r0<ssiBl#DD
zgZ-Oh^*qlPo?Gj8@8Lw<T}|R|Nxd!DNyLgd3F4k+MBmApN^%ta$hXT^!$XDCl;$R3
z$C3ymcl0!bar{`ttS$bi!SE$pSRajozZu&mKUHI?-+M#QKx~tx-t4e)HMV=0aW8I4
z(+AUln2!K2FZH}z+xTTmFj0aEh$vc$fwJ!(CZBIQtE&WE=kdKyG}W{2yz<tj*-p7g
z@zjg$)?jVC0)}|-_`_|oWuT?;Q})-eXVuysP&>*oefwIlIAygMn0Tm{0f#h85VjJ^
zT$E&{cA#{Pcx`8)C38CAiI%UEb4j<Bth+4bU2$q7A4gRE^o-i+oJq^R-k~TP)8-?+
zq`);ATOq){p3|KNQ*<56xXc$jZ8_Qv=UyXgZGtn*0&M3$Ykj(N*MVkpH+j_?)CznP
zj!*f`R*JJ@awHkp4Vw)bpwTo8t0`tCMfC&c%K3wlPh8GF8D4kNMn+e6c7FJ-lp))T
zIEl;GFvA_xG8;ixAx?2in(`khkj?xc)HV>}1iA9!!Mn|_?!z5M!Pc-UEqr0O%pd0v
z^k4}Fi69aNLR#W00FK^=AZ}xJzbZ<DrSB~u&OJU~a|7z3eoc8jNU-m_m->7WI?2_>
zAge(u1Q9fu-BPP^H?%*Cy)hT}BJ|nNjoF?eF)@s&Qfe~<rVJG?tuH1dn9^KIn-oI!
z);nijtc2tS?}eQNC|K+%B6(Go^lIi~7Lccj({P<Lsgn@ejG|9zbQiD-B$B5%IP4H?
zA(uvnj01*C8|qyZH1m|myhao<=9zpMIxcIm$C0D*jzHp@&o6%pBcL{#N7G%TU-Rd%
zb3Q~gfrA=T0J*fQt`2%k0RbsjV-!Vi9sK%dEJ);}3T~|P@g~QkbU`hm6e(rZeL`n{
z5eOR+j+ee_re3>zjEj=iil9n}i(*;@>w);$(v4hW<hpqJE5=eEivbjAcwnGOyK^dr
zbbGlOo?JXeJdWx`Jmo62_z|O_&npt)0wtJj(|;l88K6flpENeN;`&88M#SPTUDfXb
z(E9_Pjjz9Uqm&_+3AuL9zTVK92)fw6zflI4-vf-9yUW)-FnMK+K3vwLq^usiu=$G<
zu7A`d{#ms-(2Hbw5x>PKjy@FXeh*i9BCQz*3D5r}-FM-aipaSRxso@9!U5_?$a-kv
z(0xcFlCPhyR(9Jy@qC%XedcaR6rU0_<i7YeQ3Jg`3-XQ)NeS1+A#K)c>Tb&P5UMq`
z5Ol=A<fvyVD24%2L+3Gg{>p{vB}vqx2%Y+RETnm{nTTA+&9)fQZKPE1gV|}w4N?79
z069t5Q1D_*-gz>yu@wI<HH{yUH8EoNiPqzcwO9H`9qxxM_e+AcY6(Ni{b(^0gxcO@
z7P*BAgL>1Ns?bdq9lZu^4{iG3lHiol`=wz?%L_5_(-3A%C9kfUBbYgtbd)J^3)!id
z#sZ@}N5=(PwOA1zaoGbE)j4U=no?mvPz~|=fJ&Hm846;pOlrX#G9d@eX(8-Iw$K-p
zzXsFoQn~UwR!~oIiyF0T>v^OXGq1Y6fW1A8U-go`2E6~LNGPZ}$V$a$4{*_R>T^UZ
z{QPvTTjU;MOQOxL0Otwm9%6Twza{YyqNd^`ZNbP;kf!{$%6HL&A2h9B?nnFI1YA-}
zB6*RF`5f`gj8Z)FZnNE$4M%7ChD47BJ)}P3+T{n0%$i%BToHZulO6A_H^_L<3gFoS
zRR$G=re;Jm^m%uB!)s6oaW|H|JuWN!2-RvN928rS+$&fE#m|B?)_?uHX7C9Tj`V~3
zR>bQZrABtKiV-0@s*mrQ4z^?@VT$>yr5$i0<UbBEgdD}DUp*>BCeBE6=w{^HSPl_j
z6OrkZkFA5lU(#J>O-=^mrXG~gxnc1LIYK0)X6(BzOt`(1P*!!k_W1{QQ5W7~GOC!7
zz%-e}2(%M;jMG0!1!*|p`7*0ul;Vg#qG@6?eAr1nXjz#XILg!6`Xcp$d?Vq`DA{qp
z!|u<v>@NrxX2>XoAo-)Gdo6so5o8CMxWj4Pw3AbS`h*Z*Bi(Dw$a{a5c7HTUx|wV?
z`0+2v&-0897L6<KQ61B8iYv!nQJFaC9Fm<MC~GqoK2Iv)SZI}u2vJd;=poBuZ^-ie
z$k&tA`j8NMhqIYU{)n-Hv&edK8%S)>b<y~PCNC`JN0Tf&@kG}+%x|d#32D1KkKMqJ
zZrUvLS^9c!^V7~#<tPxFlcQ)LG-+f1bPWZ7Xwx>u_^kc9;wShy(AP{Evd$H9`8n}9
zevmGkR8Ko73}8mh0fbGip&N{T)Cy%7X1j}C`k7gNS?T*H3R3~Mi?l~<BXsUVb9O#$
zTql5$`~qvdN0vJ?=crqAL#;hTG_M=M(0;mt?}ezL=sz=QJ}gI%+abIy=f!v`$Rb<%
zS?aAA_(9vEw}nZmBLC%cJm4ovO^l8icMG6<l;NCo^4Bkh__G-XG68P57j3?K3Us(0
zwQqmva7Qr+{vV&9lG1q)Y1>KQyd}26XZFjTm+>b>V}<H)@EdWowxLFUHuF%9@N#?J
zCxpyS*9*m%H+(KvvQWoBp&@{>Mg!q;Uq*NKG#Ts|aYCyQuHO#CWYJFY<Zd!*!n%wW
z{q_#e|JQJv>qjAER~8;O`~IYHBHELM)3(&yj_uaB)r8G~*XEkPXz^x~AsitaCe3IP
zF(>GbbaNQqD@m>Kd&wO86{&|N>TDWE-PO&)K2>Y4emJg~yS>7%`=Bp%&7D7`?~l9K
z<za!hoN_?Emr0JqoWKNX@}JkV-;bx)t7^DXJYPqKG??vSybrF4UmO}df#4)#(7;Nr
z*vA#K9hC?<D>OF|h~zKsv`x$V;krLpfis*H*xH{k(9-tCA;^$-H7905nmW$v?PYNX
z3HoaCpTFIz1@%F!%!Gn#rqdznHRR_-+KV4Pr*dKDDh<lUbvV5y-Aog9qaw#~^$oZx
zz~t4wg&GX+K{8TKbS7^m(8AbvyN=M04zs7?m_78oWs}{{{3<DX0smXh&j*cHnD!`n
zyga#d6VU$9CTHWqNNkh5VY!bu;3iW843|MqL)}?;jbf^N38N-W>TN1YN`Em;{4==3
zhIS7T%`^gw7f11zZ)ecQW=!88lCyibXT-hpvIF7?zJw~D-SAV<Ue%S`UNlxvpW%k3
zyh0&GdyV77snlWgfj)NSfL3MNh2!h0IYR=C4D$By!O_aCu&QDUq9K9GSt$P656U?D
zN*riDdm$e24~^H#X;xD8_ybuQ&2)N%C%K};kJa{OSYVJtjM)lTmbSd0%r=47`;wPS
zb)_dweK_QmcDb`j8Jz2~d3kiOXR>i2OC{C<`_Y1(eON>O4hv~N>;(l>MT?C4v3D!<
zF32vvX1?QZ<%#5IuZ+-oaINxn+rSm%qhl<vI`B&&sxw0>iS4lGd|2HoQwGBr9&{VW
zcBVcjSv&hj4a47vIEPN#QSz{h(y8jh4O;8{tK>uu5s)<pAK}E2r|zjW_9Ej<G9{45
zPk$_7Sy(`pG~{UxygUTF9@)ygu0jf%k~UtPFYOP38>s7s+_>BZs74Z7Yi>o>7+sv~
zpr)JY>5h-KT*>gi6KcI72GI~$bNGL#ddsM&{x|GbLPDf}lyrAXx0G}V3_XB=bV-+V
zgGh&Th#(~~G&6KdBaC!6L-(`!pXZ#l&P!gfX0N@!amRIi?m2347xRrb6}mNp)X5L5
zSRow!-@wUbQxea8bq$m9;^zyRhX#Y^lkJC^Ro1u3QXJSBntZ|x)ov)XU_QK$`Mq6u
z$*yQxH;#kVlU=cn_Q3HiOy99`%R8@D5N*;8{%LGc*ZU%)>?`Ong?v73v6{QnaxE{6
zk3-md4U_^7W}H$Wcl<8|>&P4f(?1U_<MmPO^$8FiJpHk9`*ID#(yvuU`fWQ5^3l|&
z_Zc}Pv{~Rg{BB>!QmghW1luF*j=I(`W8P9exu?MYks`5cJSDR1{Ml-QItLD1RKFSN
zAvX5{%&gH$nucTK9rDrn^Qt4@U~(GQ4DApi<o6SDS%-W04)au1#+E(Qv=MlB+ttir
z67Y!~_Dl*(TU`yn|B3tSwrSJQJ`Eh|$2K=N*KyepL+o+hiQ)GwY&##W*SU>V4tOUK
zg1u<*y&r`jEWdC+eEqcM<*8-UKhOQ$qU^QR+5eXXuva#pD?w^`e_$gO%Zoa9Md<~f
zaOY;{M4F>{;E<UkE22T*dAGCF0&jp$6lEKTmQc#7JW460za*rS<D--*qx`leN?muI
z(x|q(sgnSh8v|l@tkKe~n17CZ>F4oA(q#w<K|L9C(D2K#hA47e6j{|8pI5zCZ4iai
zT5b=<fwTO|hF5~d`WcM;ti5|*h%AKFo5QH3WK{7TN$2^>kIv;XR|jbRW2aMXizV;~
z$^Q+QdwP0a@CPuRofIEKe=!?;Oc_RTB5-(=jkrbykxgvjQ@GNyq=Oa2EE-6ecI#!+
zQcFw_<R0%RURq@Sck$qFmI&@HrqVBfa%mDgMbxD>`Xr3&1Tm$sy4X&f4u97-^G`=5
zmov>ZoJzptQ(oju7q(ejq%qXez(@sJ$EprP&6BvitL**EoBe_fhV#V%1KE69rjloC
z{2uX9p;#<>Box{E)ww`KL1lR1tYRXdQv-!X?0@|N+c4pp=)<@i4q(RYisGoxEc}oN
z7zC+j<A96stiIbVS`tY}xr(TYY{5_~bNF!i=$N$n&lO#m&K6zYIZ6&W1-R>h04eq`
zmf-Gv4QnC4OGi4G-(}CxW)F4e8d2@}ZWRJ|Aw8&jg-!TPk2#?q%-PN&Thep6TTYI@
z*Idi;zwd%WPUMjBl?h1Ab68hgnj6Pm=L4&nw_>2ID)#&?7Di}4K+Pv|Ima<f78_c?
zs~RFzO~X`=5>sz>gJusBYKV}1{C&WR(x|SHl8M9~s)@D;@gQP^&=b?V0sAJ}qb);2
zTbSRo9cjxGzHCeepHa7f=DR4M>wvT&aZpX<h^Lzkl{el#w52fNQ#Zq{e@~9`;kW0F
zI*@GXM(Ij{+Wwg%P$gQ4++$Q^g}?(S;-~!Ma$Lqdt{-j-#|}`O7-HvHX<JpMJhgD^
zZ%<p^F#^nRt{E8!lji9ta56&4Y6WC$W?X@)s#qzrVlA#c-CrWEyvzuQe{GMR0IdY3
z;B$5Tyg?k5B_>;rya1PpinI`Ak>IWrE|4uX<%^pAUp{f|+N26MCxzuCF=pdv11$8v
zwLiM-iXEzlkE+ytC}?XsTGkc@sjUej=#};^TE)25se-p+RfmOgl7RGJ=XDMcFMKvI
z;Sb)D-`2b>47J45s!IN}@pdrsJ|?06>a*mis42Zwr4{uM2Dy{m>l*J6c^hT)_x*oW
zty70&19(|1T|W2VD}IP8He=264P3t`9r;c7D%5JMLA#Kv*%h2S7#17->F2ic-9`Vh
zb}OshQ^}m!#M6Ps9lcb^0!G(QAbs)8A7#eh+RAoL5Hw!UIF;(BiDE{6l`OBfP>{GH
zgpfHf&mWd++*v4i-hdl}VWjw_fO73cYASf1xQVE0SzV(?gyubimG#RUHw|oaER{5&
zJ_Pd7>m`Gh|I+T&DOajB{NFrsr(c_fj0O!D2i|PrAAIwC&CfJhg4uX3k9yWKip0*s
z^bSrB9<&F-ZL(n}OCPP2fify+g#Ub`sK!P1cy<T=Dl&;ElPCm}=H$N_zrf{=`;lgz
zQV^gc4Q1pC09L(*?HasU&_xPH@T!8M5^Fxse<u>7BLJ`!%6IB~K5EBzVCfFUl_Yhw
ziz@2&7($GJ@Ya6HFUuM1DG@mA73e-SPAdYP)IF*zBUHnx2R9h2W5VUdvvJ7-dvPu1
z!^dCgQrW(!<ZwUC5u;czKOBY3o!i{6(s<u7?}o4OmVZvd5_?ElN>3de!n=sik6REC
zNd&rrK_;7P^H8((=D^$AK!$U+|0#sPZK5!NFee7-#qf3s$ujTt&(EtGUk$>8wEJP$
znfGLxr)&KEo1p8#DX7r8H&M*wejXsu-`Iz&C&V&x7;qvjg9@je0iDX}$MV$e7b-(t
zgd8J}&6v7wwL2G>erOcQ?rUf;=?o+~LI$*h8G<{v-vyPZ;YF3Nx6lmN`jqd=zHe7I
z-FtkkdKt*=>)K9-$0r;qb%AVIVu8$K2i%y3^xm9=Qn>_tnWI-ay9@b&(U~DEA)mTF
zlJE+sfqk}FuYuCuhtqN)t6z#WO0}pzF$L@S^+L?MhQ4*I_a}0r&C+(dw8x#N7Vwmr
z0oPjhONQmy8`TGzfi90{8%PTN>X%em?SC$L=5!-nf}KH+Z*QJgN7S=;9-58$;;OmL
z2s_7n;tDT2q-?g{12vnLY9z%zbh(^_BlkzAausj=mG8@J|MpEOcjGGlTvf|mnUe*T
zr{i+5P7gO_RRb&5X2%uejL!Z#wY_0HhG^;#F#r5NZI^c7k}T)GrA^b`dMfO>yl9at
zY1=ujNy`ra8-%73so(;wfJd^!Nc(8V|8HH?e+SNjxZqmTqQi;mtwplKlop#x*{^=L
z=nLyW=j1KaQ{G0i0^IHDqO~QuLR#N{fAMWR58)!U^{DuoV+c1<T`eWPmai%)g={s}
zeI-XJns@H?x*aNe4}Txd%vfvya0b|oHdPw~ZQ=eFRz-_)VNujI>J!w=*xZCflF*MK
z!vdCWYBuWVP(n?F0`FGuml&M9RB4{M&ZCz7pPqthBB!Pw-`NZ#yN$%-+hGt4yuj*o
z&qb#H_+g*N&-Hqz9<$9G@Y?n>Fkla9D=uZX#a?bIKz06ui;!mUe`V&9nHa5oOMPj>
zr}9EYRR}*DJUzsho>ox(6iKadgWm>OB0cCz%I2LzC4B7!LAB?&vUr!k67?j&)K{bs
z9}`S4r~9a|G`IaXy1(p~0NZ?VF#cg$S(q^jhV{Aa#;{nj+=5V*f^qsZ;{&r3+3T-=
z(MWJOj3B1$4StA9#hL<*67CY}?PxEpB_Ako|0%hZF)s+fCna8;dEHWw0(;?*n$`y}
z36iZ5!JG5CkdLPC03{JA9Z}x@jk)SEm=Ha+=Bi(;7(&E2%u_Nf<b%V}i7lsbucRBT
z%=Q=`OgXz)F6M#KyFY0KPpypcW``<8sse|^?Ow|Y1s*jS7XD|_IkW`^_)YwN@Un<v
zZZY$|=n-=dW<fxA?u!yY1?J$-Wp|uo>5Q5^P0T_V$GQ!8-YrtuheCDf-cfE=eTWdZ
z2dn^ZMxRtoG&L|P;?^M@?7bIkY)BVMM&yDHpg2p6lkZjhf$RJMXt*p85=8zNZ<TUf
z)*+z)AkYYXNi!$vPEv)m5Y#LurWS%-kz;;=4lEp(;*mI7Qm-7}y>bcMKRRYz5)Xja
zGfLz?ONAq!oS=sNc;LqR|E(Sv`Wvl?t}h_zeDNs}p6Fs(MM|45e-;zs6VR_HRjs4B
zc?KDhd&D%X03&q<D1ksfL@$<d^oxQ`e|%&%L8!KBB3Y%Ix_B?3k#KQfiiOi)9(e(_
zQ#p|=SJPAkn5LJ|A=lVFw;orHopkyei;xgONJyQ-F3`3iC1!A;V2OsF0$qjde*l}p
zwViiDp+*TR(L4)k%uupzCV^V5_7&%4^WX9D((0jG;LT-d`yij2z2<Qpmj-Q$_U?sv
z8mL%^<<bE;dW(6XDvxF^72WiT6LgOQ6|vd!5wD>eM=XpqKC0^pq|VF3H2dG8zH&+f
za8tOi`Bqw+cfZI|qFBW7st|uXK1ao>e7Q0-<rgx7n8rVPE$o`K)^fe85%k8x1>izh
z0bOYSzAyKHf*6baZ6owm7^$FqsE@E_KM*5M(e0$;)8I3kE9J?PZ@nq3h9W}m1e0I(
zc&)MU@)iZuQqG}iv^F!Xn3d<1B>*1A;ybK0Dvx-ubS4Y;jhD_^?ody%^v|5(XXnzV
zboiC+o%nR2>8Is;g)bA($3Iy1OpWHMAtl^g?5U9MVpAyESQ~iA&(@+cDjW}&-kY@F
zrVpoKHW)QqyKwX&&-YUrSa#7O{9Tn#AD6M;H1&hPI>jw|-(5L2za;CN_0V^7Ag#FL
z^D;NiGA@kiE2hZXkBB5U=H4wZZ(jEr@|q@jI00HsWQD@GO6DUT3T9myA*bhk<jxet
zB@KL~Q~<k5ne)m-glY9pM^&qKI^GEmQWB@~zJ~0`s8PHZ3FUJWT7X1<q$n{Py^B9@
zCqrYFT$KM9?zl0K)OQN+7eTl*y>WL}I4f&U;`X`Q5D2tC7se*43jPl6i#<amjQg)S
z9KkFk3;6Z#M>!dN$4Y_^hL64_Y?tP2CBHQSTIC}F)1xmNOR3q3Xd19P$+G(gd4;+Y
z+&$BA-LY`bqRV~qf7ko;dLcrRG&)9@^|prTvo&%?o$HCN_PM3p9(Q{^P5}GQn6W=E
z3C;-0aArRZPt%_m`>IKiyD!7{n0CgBTp?K+U88yRSw_bhSdrLfMX3pY4?AyO^!nF1
zIpO2wNHoSZ+b1_;sg1TlW}7dKKg8NEF)MGe|LL)zF3#$A;h~0P;bM$3=WnG6Zq?e2
zpz8#WGuLg^=8Z605bbBwxwYRE5Bs}0P@T`*4}#A-OD<MW>OY}Nx(W(mC3f_iydBO>
z#(LZ5a~C#d%<(n|*~uwhJIkv7VO#)3j3jC_oX;HzYFra%AML(o3ia3sb#WJxIm7(b
z{MFGZnz%AR8~;M?pSu)$OOp%5<yCYQfZ2e-L!jb<kvAKi76FXl1C4*KoyBJE70Hm!
zUu}pXt9`D+x#;8ayh6$rO=rpG=tecE&E%IrqjxOvj&kmWC0dY+IhS4E1Dq4h5kS;z
zIXO&hflYfTt+^}~0JXX^7jOejPUPL76u|-zBLrSvO+1OI#B&rL{Q|NgY?NYo8fE?w
z=!btSQ8KZFwqS}Hc~FlPy4ahZDR>M=<5{HBmaBLncM)#kcyKh9-o(*l#!vXo&Bp(P
z5$^jkXM-L!Ww`9=1l!YskTm79QYkn5L3kbOk+W>X>sml*!ag)~U2C|uq(Hu&yLGBQ
z^705$rn{)oIj8WYag)wEj-SeRht@r5#`uE(2`rx1L?YU=OE|**5(vrTMc0=U9!IDl
z%m^t<xm%0{Wi4t=U*lVpEv)mKgP@&)NQG9Ge+f^2BWe?kzGq{Ry8cTiPo5OawC9QN
z7^ZV?YKJr`FS;~ap3itip8{S1$~pWe_jvv^^T4g6Q&&e<sqIIz-N!U3bVNZQ+>ZIy
z@z&J)4&^TE*yAKjqW=>8m=|X?*J#)yms_QI+({1a47*S3{AG%6`AgI`S{4shq}(wu
zY==(zS`NXG_G2AD(Ku`Hv?{1$Gv<~#Wt>?2-2bi&A_60Zb;h04mu!2R%|ph08YW?A
z3Dfu*$p$jWX@MwCtda>zBme2Wf~KMq8g}j~#Nmvr`n=QE2qEF3MYSxEV949j%TtuF
za_Db#>AlJ-`&yq}*cGrjv=&<ItsG%R!YfuoYQFm{!f9f1pLg@QygCN=O<gK*I;_Z&
zmn=mso(`Km@`czV!eW`1Bx`nDRV>98M1sR>AG=aAnMWmeUd7hiIexF3wKMN_tqJVH
zjuMnJh(Ywne`AiVCo+~_nc&1^lKd1~76mE*geU+Er3a{TZF-GmlPuQuu*!hSsVzQF
zTQClF)YW9x$p2ZY{Q-TI?irVjH5&wYZtqbKDIY=K%o(xVD<65rt4rc*x#z~;*jM7~
zwI67FCvI5x$NY;{!J7MyU!7KvP%o#v<L0}4*x630TyrfQJ@M;9iX)!hkT}7in;Ptc
zc^of74hOIz2l-lFiGAUu&zz;X?02vT)KyqP1WRJ5H#vyIpOhJF%kOzGpT+44AeqV=
zeV@o3LoWG?)48<D8Fp{Gf5zJ9xzcgpVsY$49_?)-SpgeGVmS7?Ql)AQ0C`CCG~}Al
zz(dEj_eS>Z4zBa?4dI<5TE{%PfGACp@Hc{Ux&(BNeZH1)IEc1y8JS6@1e9<wej)9W
zUdoH=S@|Mus)~kBKiSi=q*G)G)|5t1&=6+|0K%eqlz%y-ujv$46aLsiBgpZE<e98d
zZ;Cn$zyA;m@v37>G3p89=0o9yZ@S$5u#(Q@raufuV%plvSRh>OG;+$hIomn3S-?IO
z#8AglF?J-?oX%PqDE{vX_i|faoNa#A@2u(~<6j%4>Zb}IQ<}W#&b^<Eeey{ybz4kz
z(m4u30D-=FoeG`{+#nN9l=@Pi!-BDP4>h@}Fxat~&jaQa?aq2NdB(X=wIEVM9e~e^
ze_w9j6AjzwoE?<B`_1XpC%)5F5)Pg*-v5^q<BbTAJPm4S8FbE*{H;MBqP8~rF-f{f
zd_bCkj?|Bh`~Y9sm0EeZWZm;tY;#jL-gd&XA;%0TRlz8E=SfKjRIr==Ov>H$Gx`HU
zIZQxM@%E^&;ZlIWK{;CRDriBQ|0TI{`hpp!W5ew4a>W`M7KNySU%Uz>YUxZv3gpEa
z)+4EXRsPGItgh-0NXNUazT?3U&wz_xPmVx}7Q6ozQ&4<xm6h+5Hv<E4o+jf!kD?(;
z#FT`(zR60e5rP*?ibpRFcou&_dGJDtT4m%p(7noRe@NPiEd)8irYVJF(YjG3*OkzM
z)9@XBbOXBfYA0yKy8CZ1UH4BU3><h4xvfz{u1c9>I;5pSF3Mm0RA!_y9f{DzKRuH|
zo0?;Peqbt1eSInh(bZ-UqQ*XyKAv1R%Qj!bN%#S+vIh@PjmZ{Ly2h9E*z`z;=`XxF
zOt`lvqSKaqc%W1V(g?HR`c)u5nlAO(8<AyVvg>hQrBX5~<KC1HK)iTm-*+!C@%Z~-
zgnpKOK8{*Sa;*$s3_BPCpU1wOMoH_7JB<?S&8VLB%YK&A5mq$-g1TQUuE6_u%m5jH
z$O-qu?`XF(HR`3jXk#MBy>Vz(5dWkP355YS><uv=j}%}WT=Ufk!PYR_J<s@YCMRg{
zB42ZE5jx?@C&D~WLdl-=mvMf2?mZaiZsWE~Ejrk?-9u|!YcLy10{ep}l-K(Yjo%)5
zsfF3<bsvvx$@jP4KG~!G+hkJt(rcL4CE!-6z8D#dJck+_T03O%V&v&UJwLN@MK2Ag
zu}?sdr?WN6PHp9Vb|PDpY6|jt>XI6RiQWK2y&ex`KNun*nB@>DNnjc6dc;Gy%My8r
z5R%>jNqtakuJgS0vgHdM^r@?XR#JB~FnGh>#D;t{=b!y26SG&9CXNmhbfG@BkX`~r
zn}b)I+G}z9<hTSiS(ZUecgE|x^4b00(`(VW;^c2q#rmXgaK>y6oT>7`4F)L~4DrL5
zvppEyJltv<kzc#Jh5LNsXm=CstbhfV^2z<FjH*Sx=9Kh<>(iOHMXj_nKIW}4#}x?(
z9PGl&oZ(^`SJ0VnGKmg0Q5HtJO$hdX0<b?4mc0GmOP2FtpK_^%{yW|Bo9DIe7x=cH
z)3wbpq`BW-r@eF^vB-I$v_$%vr6B?kZO^(ms#F2Ti43la;#z1NxVyJ|v@I9POd>$S
zBjkvyP6QOtr_EA#nTB;fTjajAJNz)7v<un&V^Eq362Opq{e@SoEL7uh7wCO_zO20G
zo42&X8;|fu8^TV3qG<TnMy*ElFl5<R&t_ZwR77J1seDDBW2uOM*3Lp*hiqutx{H%v
zk`E~^4|&K?60{;Wq3G40NdM#=DE)WhXDeoI*7E_)Buc!;$y{@+@jy1PawV7@B><z=
z0d1&3@0VjYQf^IIKv&ZB3k*T3Hj$&?^MINSQ$1hDXDI}z`+p9*eSDcTQb*NyXLqp=
zL1_0UVOajr(cQ}Y=T@q88Jb$u(^wbD+hb1{Qy7UDkM*`;Lu!J58Q}AR0MgET8`=ki
z4wNwA3d7B_AG2JyXUFC3-BbyxEqH#eBv8Z`&k?LkAM0t7G#^MZ58A3>LBb>;>V=Hq
zUJ>m;%Cv9?zg5o)0w1)2Gx7F74jOTpbc?b;Sg2w+5q%lai@jVH_;xw>dgLl9nyN;0
zf%#ChL1n4G74kkv9j*#D7MYb+aZ#R=oMYok05S9B+6bsnsZaY>ueX#XUWSve70_F7
zS9RIKk<1nhJ976{0mV0!VZ1Xyws7?f=&&)ZDM(&l+bk#1z-mS!{F-rWr}i0#<GV^_
z!ud1_6qs7zY*M&`G1P9yX%yJ%c2j&j>3MI)HNo2;8i7rLKOS}S?vGi>?Wsux!S)dL
zS8q~|zft>YqEw~ZQ39V45dA+gLf<AuY7gjd+O}LVuB)Ql>H7zcllOkZ{Q_G({jD{J
zXu@Rp=8%U?-B3nx6WCKj-ShvMV+sMa@E(T9Xo)nW*L6m{HQJe?4?^>N1D_OLXY1v#
z&yl0BR5_Zp)1T>fTAV16ldTaub$nBSq#(rfP)efh(S>GBJpVExv(aH#Agkx1(lLRB
z*Y>3;E7K6lPg7hFs_0hemsF6wI48pxQmCJWBW*79yJXBhS5UxdraehIl^l8qj6fx3
z-xWxA2LQ}VnM0+~-xVCyfEvfZrw<9i=VLczcTC3kxvS!N+@r(oFNW1BxmK0hi)&E;
z1WGOD9Pe!BG~qho(iwNhdhf+Ld%7GI%6C6IOZacw*qSRMcUS;sS{x@HbSLOjbxoO>
zGG?GLH$>RnHa46iSYubG=CY?QE`K~u2QLvnx_sMLJBYwxMfXS-H__Tz<-wy6EZYpJ
zo?ybi!7v_~|LVD0aVY;M5Lh*g>l2gqnNQ7~ggM!f!$3A{UoUkO1d67APs3KDiSF3)
z`N`V-sOKIW_ci>-UPWx5+fEome%^U1o+bXvEt_eitKXBS!gu^nMaTJEq{(>1!IIh6
zKBxiA5u|s&)tvhZfm>oKq`tCg{9~&?vxOZ8?mo&4$YP3lYl|e|hJhxH-AbfcCE!Mq
z8Z`4cW{Ec|doO#r-cm#&t<{RmBCY5$RtjNm`zc6NPbbh}V^r95X;mR}(u?awZ(VSk
z&XFPEO-b8z4uMax_)UW2q?axEQo9a6NsEPn81)LN%HHi%St-{(Hap^Dw85S=Q1z|-
z%(H3l8G^Lk!}p5=>7aDHHta|1QP0iqHaZW5f4470(wskvKz)Z{uKq5>!z#Bgc+!kK
z8>A4GWsi0uPm`isK?zBF$?o<x7SV7(A2ke0NaefUSn#(X8K#|eJ2KXg6iF=M`lmK#
zd>+kc@Guz2acJmlQhx@&+XESQN`BdysayKR(TeM0(d>GQ*TffP|CKNK6QO_q(=?~b
zNofg8CsUjJ3Mtd4lGU9ANOQj>B^uFY7@76*t!~ItHPru3@d^rz+y#JLWLK<J^pD%}
z)bBfAu$!KHIXT(^^t@Bl`;{?|5rWg2NcIN-fVL_Qu{KypE15PB^`JS6%<R|TA<2mE
zpg!_PcV8mpU!Z)NnzcQfMtPdLsyz_BGTHm7Z9(l78dWkVGA6y1D_k2VG`uv$bOygs
zeR>hJ(AqkmRXsqgLR}W{7j3KF`IfAYZaPvUZuFm!<H-uN5fJcZ$Bxt%i{9^>A6dNX
z1kH-Mh7jfXK#{1Tl_ZP_o|Zavs7M5}>MFgQhrcc7xiwoAHQRf~A?Z+OH7Z?k4js{j
zQUbIc!UpzBT56J4qny1X@-N+8=fp{Co5@pdU}Rf*qY@rp5mYS4HK&m^TNOK*u^bX}
zym0M2mBp~kSknoqvjF9T<DN3CWm@h#*m6DPId-EzUY1$}Ex@;t&-udnn%UWNf!
zM$WLsy-Jmr<f2MwdQNQiPOO%yBY=d^_VFX5=&&6L-CdnT@&v#W<hVQ~B7Z~du*2ic
zt9iyOs^GJ0%o2Gg()T4xrh&b0!si1UwUDMLptE=_2v_bimPokY^~OuT$FzN2MJ5MX
zbD)yuPDrjs7p<}jQG47{yHO$BEH2uBlvr7tp=kj82>u{n`wr2R=0Wcq%iYG38$pEB
zX6Qb?A_YF7q^P>6*O1~`>7L4-LM!;ifyVks9{BS0yFC%6`+vPd@JLCgAMlS4(Vxa^
z_XaEAjpgkw$z_56h#1D=MHL;~DrzCYQ`W-iZui5Ri;Ks2dU&w=!H=)M=6SDSo<Uyd
zj*B(WkM*u+WuT)Ac^UD0h1Tk*i}b=@(u@@~ZSE)DO%aP4)D-yJ;>k(qE8A-8I%Q8Q
zXfN83-$uX9u#?Zzs-pOZFj<o~`w;J!YrZxh*)~tTl}Zrs$+zTz-ErA0NM&T@Ws@?d
zdbB|cdWn6BVL*~|^8-7jGtIq%6iT<<Js60UHzCD>d3(u@2#=Zf<j2-frqeL2b?v^9
zI1xOz<|`g9kMv7Ycb>+Lr=ALmWj%jIfFc7%5nHh8;3WDkHS#Hab3uP0?@ZNEd>Q?u
z86GI#zo)U>jmA{3AxTmqUOl~C=lJXHx1-%$n-I(>1p7auJiut2>pGn9mBayVMK3Dv
z^ww#QI8A_JUd?=+l~x1mjKsb99dD)=|6gy>tt#@`=xi}ThWS4RF8p3+w)D78q6{6D
zlS8qX*MLBjRF8{gH<f7xP63}@fHTa)NBHD0o$@J|+ox(zKPO<;-E}QaC{k0d$x&QC
zJ$}^?fHM7K_~$Pys`|)Wk`U*Hcxw+d$}zT{(mmR0IoDF|wiQKcy%DKdY`I2(Prj06
zIAlZ*fIlOe`B8jyz?}jxDc{5>m<M;O5!UwVhh>1X4HGfVK6oS6`C%_L@<dFX3AaQ3
zg#Yo(lWHj7yxk+>{o>XX=Do+)X~a$|_$>2oY>^`RtaJ2fs@&AqD8lVdhl|6mCav(F
zOUF?RKC*eqdL6Z6yyw_x9AE;uOoclDZTeVd>!`SyszEoaHyF#Vlpb|}Sm)cMUx{eJ
z_Yd~$H*J%RyY1RbD5pXhb3>GpB@|N~yX{ft3*7A_jzXFp)!}{?sY`^}hL>uKr2$iZ
zw}JT`bHi0pMs~-$l)1Kx=ce52PQ(t7ewRclsX%`am3eSaMg)3@`0JiYOUWkn?c7oC
zZcD-5hrCGnL*<lO!k6S@owJ@wi$AA3wEa|%H%uI<8_6)rsrpV>Hghscujs6wu><uc
z6M#M{X5j6OU-f$+{aqm(z<4L`@H!9PV{npy%c9bED<^s{{qJ?-_LW=No>2Hn(C4{K
zP<xLvOd?x=pu5jGWYufPj#Vzs>-YHxrw1Jimt^L(At6%kO6Ya<5rDA^mrO|+<>aUX
zT$s#ja(Gd3T?W@CYk9lL_ui{_F^NpP7@*S^oxZ{;Ob9m_W<B*UxWfk8R3tRMf}8xC
zpZQlY-i+8u_Mp;9zX_IS721|JEr~up!M5>H^_MsKl0kDkdFXd0)lY|m#;Y5}`U1;)
zlMq^Dz!>80Vb1O!JM5UgT(-k+46-B5^UCPTZaH$KGu!*tcKTbx)zo-oC%SW8=v!3L
zMra90<Vyx^y;=~T;DpFkqquDlNSn(mau4&r13HuIl<p3A?0zC&Ep;sl#=tI0$BBri
zeZ{Bb>xcAN9Z~D{;|*EL6n2%0j-rk%-N(YSO;SnC!+D^Fy*YdJKlzI$#%KH2*`J+l
z;vnCvoq94tYvr4HBc*nR;2s=$!j_nykB;Ga3rWH_>#lu;DTxsIXA?<E0cHOjuUtMc
za*VkSR>_icsPfP05>$X}qPldi6u6~F$*ZIYF(<+fVJ2q0q#bHCDweA;X7Xhfpqr<`
zs98V@J^-0!Jxwtrk~4BMSppxxCb0%8M@d_9u8`5_(Uc29id~6F;oeLR+V%^2M_{4m
zj7pAR;7)60`1%HlZxHZoeZwq>CgQ+A?ei>&&Tek)=y}DBTK92n>8IRxf`d^@G&#*$
zw>3-|0R!SB8FT>L^&Z+`{I?=CxrH{LfsS0|iuPo4QMutEHZU?%>faEE5j6?Rwa;U>
zjyrI@_w--C$R2IOuflus2ddm@ei<d;v<ChCvX66lLvk_V(SKP_o~9)~Pwzs7B!tPZ
z?;7D-6{;}&p@n)Ir_<lS@S#}?T3vsT`ncW3F#mzBc=;Jbp6`tLxd@!x=fz=vkj9U{
zBO1>>6fbs-P211uME9N_$#@E7Qg3Qok_B2ilad<b48SdRIiU-e=_((RInx$0P0CgN
zE6p-JTve7n0XG?A5{j5r@fur;*LnY4yBY1rh}avJRLJMP_eic#py$$Z!X|Vw=rB}-
zRO7jDG>=`6au2Ntjq+T4MxsTQUYiZ8z~<FPa0xnWPwdd=S82qI7G-6}Y7#mmtFvwH
z(na9^Nc0Nv+5!ld_HNM@t*1_G`|Q?$gE=^7&d{E-TQ{x_7U{0Eot&_mtP0>|*Xcl{
z%RXWEE_q6wLhIi}w_YxH<4VC@BHguU*V>AGRr+b)GjO><N|XF|dtJWMF{$o0+U6B{
zUAK^eJI?JOY+CvqVW)CV=JI#xfq7%}4J3Oby^+FAla4A+Wd|K}GN>q*Ry*|4pE~}f
zI4uu6aVg02fRKe+UC5lgJCNViBf*E2lX0sG)d;a--0AWKj%+&CG<xjmmT>1j{U*3~
zA>6okowHLcY$psw_mWlZGfMqrf39+l+i~Dw{Dd0AC~UebF!z&>#q8Ovj2CQy{BMV#
zcRrz(>q@OjIsp;|cmBzEB-Ug45x&k|(<fX%b5;1Z^tpT0t(H@J9E?u&sY4VLcl9&X
zpfgrl3<I!mP^LVPO77_IX(MlPI0zFRZ&*f)bI|e>Q5<aL0FWum0^OIxXE!}!jnJPo
zd2ecK%>Qa6su2=nil=rFp|ZaiO4XpiN~g>VlNYH~U<q)83WDmvbwjc4T%4AY8oBSm
zjY9~p1O2QTjPfl4`A4n@E2;sZG^pjSSS7LYuvwPSlcjC2RZ?{q-jUA1+(D0s8%3I9
z|6U~HO$@;>o5gOh2Zke5zRcS~mg<7(a@97m|7*0Biz_bMuWDaJ81w9#rN8}dUnQ?u
z?R8~^bdNR*)ga}o$}`z+|D}>);Z~N^Xs?3??P8f7K$9sQE>r89yxx3?KFK$GcB=`3
z{?{jI5NBv&;|O9L(%$@a5<$}MiqG$^^|N%pAj+}f*w(%d@CYh(KEs)$m+^NLu*mx;
zK&alv{V}s`n|%~<S}uvt=wd&Pb|WRJ_~r=Ug{EetdU0b8z)I6!*uw$yMSQ-f>u)+}
zL><9d_&hlzgXga}fJl)IWBLArf4BSXf9XZf$c@4&2f3QllCIy+;8E50_zrMUMyBbK
z5OgbZ#D0ZA_;5nCDv=Z8&Eb5XKK+n5i))Ws^ZBHT?Aw7>gH5c_4h&MrFbzd&DIF9o
zDIx$r4UZ9S$7pQs0{D(zh6RW;8tHsg;cWPzDylO4XYq-39Dw&{-5~Zd(ajSOM4N+Q
zODHq|&kmx<JHY0ME`PV(X+s@Gmk^sh|7_G|FF@T&=}o6kyyX^>{&`Lr${hCNnNrFq
z`48YQ4X?kUL&I}*zs7m2FDfMIq!+~N#NZA0mICl8L0c+s`vgErE+c1EoBTdn%ss|-
zb6EIwNtZgq0Lgs-@K6;Ncs(!X9dYD;0I1On`tU~R)R&r|Rsy$!6svw;av;Mk3Q()s
z-?M4q0yAqQSPXAvLex}F8wJk*?QODxS%Ppb8qY2xsJc~14bL4lI7Eq=7zc#va>l6^
zk0D5r-Sr8B7{pk-&WE|)%aBZ>ho2MLs?s|;WzxvU@eYwJBT3%{<_vrP#l;=qT=ss<
z$e@KkOP1!-+%48(bYCBrlz*r(aBjb#+u`w{nZ2GIT`h|nm>$3;ivZ=)%!5Z^EBTqx
zXR;Xi{yjys^KOhtNNCT_RnOO&HN;R~4ATpygm~c<CtcbUu;T>zaM}lwo+25%a)z=0
z9wJDUzS{2n{ELZ1SR0G<<^aRfWK6qq2T~*RHjEIZ9gwSX)1wi-PGuupF$7e^nMr{}
z&)dF(;g4g8+8b)U*i`+uODleTY@pwQq6t6CIv}`duLOIFKT;GjAb)@NjF7Q=_v}z>
z+F(gVC1=`L?~5OP#tmj*SoRIdr{;0gjRJIJd?)HCG1Yt(9r_S#p-hG^JiG5SS9)tP
z+hSNkiExlYTy0J>L|h5wfSy3agU%Y-@F?aLH9W57hxBW}_s)MMlc4lAINmjM&3HcT
z>^-vj5OX_o(if$-LZ+zX#zuHSfSopX#BwQ&osd{LC3Af)`I~%fzkc*gHoz(EdKrO%
z4jS^V%)!_b2YMAQ-B6;kb9u1M!V5SarS*4@*%|^s_bNk5cElyQF$bs~Qd4SR@4U<3
zzGu6|rY+Wmem6Lb6~q)8DZ@wFx5b?%AvUa^I{%0>%r7TXGK30dhL-ZtKd}7|oT8M!
zhzgh!;$ra~V*37@>9Bzh$TGo0hT}7{4X9H{`XJY5D^aP+WLE<2#(zf;Y}LU^E8187
zqz45;5W$SQ=dxo6j^8o-SGn<@)z4g#wQrAi=z(AGZ@&RN{@rvn+`;Yv$Ye4Sn8y2$
zkjC`{F+<6Dm!Yzx{O*I{Q<l(Wml63ajFr3e&ih`tXpix9)R(3}*X&rjXShR_QAh@N
zreg;z0iWV8^GP{I61t2r0P%5lmx0|!%ii~ioFE440r;c;7O$+s#f-iXiTeTh^0fjg
z)7W;mI@bqPJivwtJ8`Y;_a}2Zn~4d*LW-3jR{Ja6A@P#^EEw$BQ4hKKxa{BT2tbHF
zqgHVKw*6<O<w=rG_vQSVwkP9dqWU8v5toJT1=sx#ERyySc|Lk?uKRtZ>2fR!E!+-{
z)@G~N=95_$8k4d9f2VZ9)mSnUrhBZoi`q|;*D*99s27e3YvQcr!P6K#7($7IAD>;8
zE)^G~8{Hk9w=>mrJa6x`6wG8s;S7Euf~NMlC{|*(I3xi?pS|-xK*n@{=wly)hu9sT
z_Q?L30K^f}^#IW!ydvtW2Z%6kaaM0wnb(<!#jnruo+yH^D&nDUL=+Yx5qyGfDjv^@
z<2CO%?nFQeQX0=bReR2b98UlcvH3^3?Rwx`qx)}0Rqlug0gl8->3UE(!@Q<9Q{1yr
z`<Xd^37{7i0hA<Bg-o-Dg6d~{B@pG%jPJU&iUo3x#`8gvPL5q6c2hS=i@{rKqf3U%
z;n4A+krT2KqD9q|z%vy3BlLx)dX3Ue{FQDto>yN2r9Bl~AdDoT`?!roECiLE$yLWa
zy%IFR8YEc~G77lY@!9`q>UlNsc^O2L*Y?j~BH0XlrKNR9aU0yud^*t6;1%&2CO#{f
zIRwsN`-S+_+m5vu_IIxr05|x%A3ycDP{}h|JW^|=P_S{O9e3LhR@Ikeef2~MYPo>X
z0B`6=P!q)~*CInQ<sikcmYXzT(m*eXz%j;mgDEUydseQG&75ky?-R)dLk8`A%>rp2
z8T@7nV-c7ZAiMKZFX)1zLdsEv5{T|J1b(^$6tEp<)E2dxRjLE0jV-Rr)0(eO`)aE1
z$y8cP9OBCM0#0<tPa(K`TSL_Zy+9p*=TqtrUunC=Gy-tka7;r~hIq=ys)A``<Rx=f
z57uC1F0I`Ze>!rc&%8*z4eJ0T0H^5<2^Sr44EIgWYMOS+8`o$U<*PX+c!qICv~E7e
zQcSBlAN}Ta+dXUy{)TBpVz^mOi>uT2qNjtF3Crn&6Lzu>I{v8vMqunP$#&FVmaU(F
z8OsAz%X-<L%_AB??hLt(Bjaa+>%;4qhGMg^kF+7xE|LvQ_wksbaRxL{pKSA%ArVb9
zcn5}s-ymc9IV6BFLH?NHsW6r+Mpy<yxf=L+Mps|p<Iw&r-KvI~IKvtyXY<B4k6Ko%
z_nGAyW|yK1rC!b+nXIkz_*Dj6aRTA)Y}AgZLe0<ad;rP^eI7e7+~=))m*Q8p<TByT
zUNI1faKknQ7B2Kxy<^&N(Sp}BW`CD%l?ivkTwI;LsL)t!V=I6TMCCW5lJElnFGhiI
zWvfZ&|HrcT#Q4XNODlz<>Dl)ue8dh<-lz0(!L@PddPuwbz^?Pg{RX9xbdk@LQIg>b
z%!t49ACL!4<z{^iSHqI4@wAGL@quA7XH-iQ>QQp(seb-6?NccZKb{fKjqr&hZUrdy
zBl6hpNL>JdVEK8p1o4wb{qLZOsPIu>Gg>^)hq(Fn&)id)%tRV<>hDLbu2zj(SRG71
z5`aXF6-~qXxI?Liu_{?79}i;7S+>JsiH;#E=D{KCq}xIkJs4u=e`GE5>hoez2_bT;
zd+%;|M0&r$EdPd)qd9Sa9KdRB&)iq6cbg}w=ag2kH=$=;Sr{Gd0{EBNo;mE+HPa%;
z&oCO?*yvYDcgR@eW=cc;wihlc?ghR6y*y_`J+KA(Cfk|BXc4E6Ak}6w50GZ6y_YWk
z8TLRT!n}8Fcqo#b;KIbdP+jZHL=`iZXs?<qny)#^mdaQp!(3@2xGMgvYMV5Nn#+xf
zfyXo@-7_qlaL(K=IX;a{ul@o6<nZUdneDEcu++A$Tz_LLbwNSs3B-$$8IAwDr7sC~
z1hL3>RwBkpUp0A2&bcpdJobBUuf=jF@N&)yw6`bf|KvRijaxk<7DWHQAYI_!QPn#e
zQeO}YK9iwAEyJ+V@SXPG+CY2`Ab`P~E3S5YiH8J81t5zIns&*g5#tZhjKBYuvdmbT
zcTc@QSYWz_6q%QVlwkG4vS7}VgcK2nwKXhtHMHEhb<d2uaby3?0k*LFNnT96170^i
z5@ot5-Za6ZH$=fvf0-B&3_LEcL=Rl8EigYf(n<x|=(2^pu{2n9^?<#5iIwgz$=W&T
zmAj;=#}Np>sF<rD(PByn0Nk)2fm%7i!M&M%h}gw3uRUxCWtg<`r|`Hh;kHHq6i1MX
zu)M&3(p5xgNRixf<|T=&UEo4^1Ky(L%WxvHo44$z8zb_gH3|qQD8i#3P^g$(HBT~O
zR>r?1z3Ix;ErUzcvdV`qKXZI4YHKI6`b+I@V%IBy+PH@i`HUZ|Ucdn8c&jb`FHpb3
z_*@i@ZApE!9T#1bwz_nnQ~9&{Gw$}|_UT8vH@KILOb}d)27rmR^lkd_7XuAew%+7}
z^U{xDWfSH64>jh<`dO1<%K@gAumOg-_0}E+q2_~>Mea?VXAi|+PB-8kGfhKY)>gVE
zb#;wmr2hacyFyMmC0tsM|1ki~jeS^n&o5`#>w0a28*C^l35`?Fv%4%x!p9Ymy+F~f
z7*R|7J5th#ia@EaR&4<3bF6>N7=7&;rH*bGPt%v^voi5^CEGP+GMBk`s6Uu%r7CP{
zfBuJpGiC#+Mtm4<PN8C+A#rl5DF{S3s5Dhy1fc~y_OKaGd=hGSoiB_#6zoS~H85xP
ztDkdWY@N&sazb09NsKDI@Os<KzEB+U^Z=m7a9v~lLE8T;c?4zxVtxJhe>L%4Q93Qg
zrY>c%eoeK^T4*;e9GaN$dOm9YT%opT)hMmY=`C(Rvy29(8D@C|SIuWYW~slITtP(w
zy#X~uA=s~n+TU=NU1n!Ui*)$UQ3FuQ%juubzzC;7Ej))P!u5)|5C8G+FhE3_?G-_c
zJop|$@gpX|Kh@g+_fLj6UCj#h$mnS3bp%NBZMn^)VjStc8$L++B{^<Y+KruDCOUEg
zDF>Bw^P2Wz4zrMOnq$v57NAEzXvG<#)}my=8kUMT%g|w{sTz0lOS=pU#jEU91+B#%
zI0+^hU;F_l&GmT%068MFw6dPiOoqJ&9gSI>7Ep-O3QhnxeiHxkZ1F{fv7~?jl%$W-
zKeQ%@!RjKmvb%#MA?Q!^(<frHDjf^j7@t%3ijw%u%X<-v6tOMLM>-9@<J6I7GCzvi
z=_Xy;wXD79U$NI%^QiQ%!v_;R|5cIn+Y6q3mfWRWs;g0Y3j`&=EB&sRZ}gJ<i2LX<
z_XdW{tFFdNMP|7{O?oP2sndrPXCIK10BQOiBu3D0TMAA-s)$qZ$?t*0qNlZ;ozy3}
znu`g4p5g_^<)@QQje%8POLm01R`hEeZUKZfiJoV0pZin-EIij#{9=1oO5=3bpIxn3
zyh;_+!Q-=m7N&~@Q)njF7|QcPC2!pB3s1#L-nc)$AuLLUX@|7^UD5}?N-D|$%ToC#
z?qeS5sQ#(VINAK^H)aVSv2mdhj{Y9GF3@tOFJF%VsLIzHwio#4UV!9IVml*2sXrPE
zr@sfrWT0{4r|jQnheSf0amvuQp3yVP?>gq3ceyT<c>K-BICmbG<d?7LC&Y%lr0OkZ
zMBMQFVbijyzGMpXX74F}HEGld!dpTDrr+%Tep~xiJyZNqdHUr$RL9WctRxn66hiM`
zfT+$j?&E5hONdnl_c9g7qm<v(n~$TqGl@Y@OXVf+SqtSeU$K-#<r~w<go@+Kgy3n?
zGcid|M`=fS%P{itYG~)vd_qn29!H5WWLe`(inZt=l?iogP$pbG4NtpOP~ohoMD*TN
zk0UxuZ+AbB=VXfTyN``5>DsiE`CT2gciv6f2V5=Kn*oDNYz@}`wGKXc-f&bFmt)i2
z*ZUo^`13lN8=RIV+<p`M75{jX_v9nVevELu$?&ZGdGDm3w(e4mI_zZbPGR>uy@37?
zzHkDD1D(0f#R|D>UmZMW#Jrh1GqjR7MP%T2<wUpQT7kjr>obTlEW)2q5}RZdh1u#`
znfNAa^V^YPpekgXOzu40dgpqD@%0$&4`_lnQU3uAyiCVf$&(dyDQ{N*B)R?Vnzh~j
z%GoH7S=8KJ*kb}SyBq3b<;+W}*%k1K1D$t88+^mM>K6+8#6!v+k+uhd5g5^IqwfL{
z=rGxVx%jrg%6oLAXVhHTn(~BVR+xLaf9D-Sdz>ZT9)mKJkguner4u2BliR!3@42?S
z9_ki{$M?sl`aYrvo-+Y%u#8SRPw`23kwkLEGAt?09k8p?Gz8lGR>epOaK8e-DNFW5
zksVhe1}d&4Q<fa(SW#1~<SPU0$|_iMBoxhY%j%l#iNbYEIQgaZsZx=m!-`i``X~87
zgX?^|A08`y)zp|5=<q9YWb!;N0bE!}Lf=#D@E(Ri=m1S6XLg>eq0W<vN-hXOqcok@
zNQrahrXS%#(;js=>0bIy-1|`IeYYPLd%(_i;L&#7DB%K#-e7k7aatq0x}(qpMDDD}
zN^NtYFmL&Nvqk)>h5~u7*l3<3Qa5o1e+T0;Y65bw8k57`q8T6hJ{jUo=GPJPI^Z^_
zQc*ET_U|~WNbyQ5=RP)KG^cN$Yd`W})tI1EMf=u1$hP`caC5{8F|$5G|H9IV?7BZL
zo=v;DJld^_)xzjT)>-ka*<|#)`g|o?|99=(RyoRi+3nExqG@;yG^lR+85v}sWWE-s
z--4YMr}7@pi4zpvncs+Uy|k@M4c8Pj0?9qk{b>o;NG7H(BS!J;!<3g???vbEGJfSo
z<8@q1uf?%HK<Na>df6EMlzj=`mTc-(m6IfAP`|6!xT_zLef=TRljWjW_JjY5A;(|4
z+IrSJ>`D4%!gxL}yI1>3=WJZqrpJf*mio+M{NemF=RM<Mfp|EaQQyzgsEw4jtJKVC
z-PLaxI1&+5)Ww&%S8ZkCn_96YoIVmMyTsK0dblY18b~@$!kcBkQ2uNai%KFkGHn7K
zCAW$WTH>KcS0mo*--Ol=-uHgX8L9SX(KlzIcB#?%;INz{wm09-rfSEZd#59_6god>
z68*So_E-{rp%Z)9lzp2<()!<^<yX#Tl-ZL2on0u0fA&U|c5NK17wpw`8`%twj;GSs
zT*}IyIAxRN_K19AZf_`7mS<Ul>D|KImikMv&5}vb<&Me84>Jk$?b}~_-+AV^-mwf_
zZo;e7Wj<`Ro+n)oG}2rj=?5Xm7z`!L0zT=l{t?T=ru`vt<eF&D>poWnw8Dr#S~e!>
zZnpA>R4IB;hkQPozl;!Sqjin5`femlHRpEa=7B1TOc|^u)8)m_QLNKJTl8GAy;~wW
zV;Ks};Olo9+7=@vBw=4<R2JDq?)c^D9&k}%_V3_?Fih>Jk&HohcGC$lzn&ZWB5rUf
zke3W@Q62=O&x-_Jt=k>fz1(AzpAwTf=Dngto3JUSgTB{@uzbX2G1-8%Uo9=NTm+&m
z)4hfj{|;ZyhzOJBqan4^<cT7^#>YfUT6FG|P)cv%Zk>FE^^G`<QY3$IO*D;Ub9Pae
zLN4AhAc=f3ovK8SiY-k39ag6(Hm$6y{Jf@d_W}(%?@iG1-BS8Tn2>+BP(6t68|s&B
zn0pY`ejY<YkC#*ICQ18_ypmp6udLH*z}|HEWk$a^6A=Vlog8(F^bV}_WH9cl_YKQ@
zDeL&YTk5_WK9*xa*(%NGDH$>JWdjjw#nasTcCrusa3que5E#UzwkB%~x*2pHQLUes
zVLSHg$5TWRps-ZZ?=Ujxr6%|ku1m@Kxc0Zu;L$;c3wJ5@8X?oZQyA;!=VX*LUQ&|V
zi>s1y**8B%xyvM>Xp6ZgCpKX^%}jv%S17&IMjT5FbwN?;cEW+)yy5rEWsiZCCQd=Z
z7}mOIOXD&gF?f{tsMcNQn;`F9QgNo}In_soZzHa8xT@!@#fcq-9be5Fk>(N_bjN+2
z=A|yPx06rA9aepSf<>eHd`{S#W2*z;{1C(rUVk^=)&4PFv_-E@fzm{5M3WD#nVqBG
z2qe_>;I#kC|6>4Q6Vnll#9aM-QYcs32`nWV_jg?XsR_?zQ4y^z%udXM={5}WxX4Ze
zJ=}xY0q50U_o>6M(Pr<;hoDM9P;3f9_z^<7GU`pb!n)NR3~isc7A6`2)%{LYAj<l7
zI~VjYXFIcM{3Ez*?frOl6y2_-a^x#k*b7^N;cP>P!S7*_!tZlIrYY26AihbKdyZgn
zKL86<ERSjb)#cAZacrflnP|0lFZ;soRGE=*pbB5eX7m+H^7bn2_)wfb7;`<~s5!yg
z!dwoMBCUK#XS|Of=bnX)8d*Lt!YVY>rbp2;3^zw92YcGwFndS}F*Fk1HX05)0LJbd
z6E|L%UZEDNT^B07%Rf>pVV1<AluP$Uh&eGo>Z3t!-v1_DU{#x?uVwV^vkWfeDrp}K
zX4L0kR)xtMA;>Lt9pywS+PZ~ZWEaY*!>|i^(>55@!=IQeKmX^Wj|=RHR~vU?V!$>`
z=yJoLV1`*Y6Hq&GEJJk-=RX_`Ms1iUmv7h;=5WMXmft;`hxnFKdnJBR`j|YrVI?|f
zZ+ezB?dX{|=_~bor<Uib&r{%xvHQg$`Il)bTTw=ys!ZgxFq{6XUO)MJ>QkcC-(ohd
z4FPe0tYNQ$iK0tntT9;{FmT6Xs*TrbISMvrC%LZhKU9Wrtl}5jH)~%P4t|!fx79G{
zm?rpT-od8fQQ3x*NYt6IcEl7|vPniQM=i}%rf=*jZ-^O(Ss!E8@2gL_?@P&U=25w?
zqi8!)!k@t1{(X|+Ey$v8FDh04nq+8GbG=^h@Co)K;coN^uYdI>a7w~<e18gf6}Ama
z!;8S3=0Jc9{58LTpcKb|MR1HE=h}Zcv6(joh8HpWbaO&I3BAyb7S>ZmGZCTG_h3WA
zsUXNNo?QomTSz$Y2Grv{VOuUPl1a|I;P@bUoH$P0h*LJ4KRI`wW}2VatM<ntuljbk
zM(MA+eK!yO{yqfel=|cRS#5(zyiGH7w6StW{cC)i$`Swd?fuG0&_@G!X`0pUyQ?V0
z2x(@Agx9x^YYEbd`Sdma0gq|$s#HQzlm!-%D4Xo;k3;Ls@wfJEZm$4YYvcp1G&is9
zonPmbAEzgV8Wn_F^Ti86F(o;f4_FQ=fA2vS)J=b$KikCrKHCE)GaeH4JEpQ)T@O(+
z!EEUA7*jZ1k3alI-a^!GN=>LlKqoKyZ*dm9vdt4_{d=!syYV@E^TGHG+XC<3=VOpR
z+B>zK_MPt!WIdDx-IgsfrW?D}gp0_OR|o~Hr&jx0wE`nbCL<YxWc)o1C4=DafUR1l
zAO^%_!dFc>(x}(aRr+uZO7%z)uk{13WNV%j+D1}mfNC62)h)miG(i%el{M<5Eus!S
zTil`Zf>{?;RNHhsKLpbM9)kS_cnC?F=K;dTb%aJ5+aHaaDTL<<zQ;`)%r6#)&V0kx
za3F=<oB()c7Y@#;=U+Z-Ary=Zbq^ZGl*><1pJfH+VD7uyyBV3`cpV$sfIxE~KYO0W
zUy#G{fQ=iBy#jiduNWVeMShqR4!JpiBGjdECWIkp2@g<nv2?e7f1J$FPRYJU;duV%
zC3NYI7Z(5aQ21iT>|v%E`mzk-KkvU~6S$QY8_YK2YFD9K+AVB|V^ufHMC*@QTUmf|
zvMkc~`Tog#!+GX-Cv5a5{&)3iLcuA>c?D%~0KK&4*>B5f0GpZ~cL$Wj;ZzShiq-7~
z#r}6SK|8Y4Lw`xEb7sYY9=C%YYjbGly{xYO4_{va6;<1=jfgUUbV#Q(NDmDHA|28x
z4bluLT|>8Y3sM5ojnvSsba%XTcl~?b_xslWo%5gbuQh8gbitmzpXcuTey$5iBfMP<
z2RvEV(%EQu@Yr<Sro$L(okTdaM{f<Cg!v3tdDv8akGe~}_<33O)6Ewj*`QluAyd=S
zO$Z}g2iTuFbRuTW9cL|)`|PKi?908j$GwC3wg({Q3a33BVVyoQ>9X1#4@QBbK#u#O
z%PUMgUdKFLPu1J=VUR@dF17$Nt1)ipDw~fB7NnXuaznZd^Xr&*-KnG7F1<2i^VB{J
zs2veckBQ7zu6t8|@SP|H9^T7Lks^yZWvkCoh#De+ic6MgixcP(OJ>;wM>%vqd;sum
zZw@$O+~t`n@ngj26XzZ@xrHL~ZbqN(M|((@OKJ96kcIyeJBW+%GJT8xlQ+yha%bi>
zwWpwlAb8%oz6W5^U7uZIM84M$0zwu!>rq+#duqpB9nnMbXM6;gps2RH=C(mH_NR5x
z$M3U2y?MP7CeINMn(<7xE0$~Qt?6xCmqlF9x{6Uq4ip6MJ1FiuE+g8mBO0T48_!cm
zz0B|63SdrR)m(c^C$d3>X&K`&@qNs|h$6yG)tZ%zzw;XIyd#`XxyD}h0Exx&lwCus
z?*@j(-fi15T`=5?{B(hvkwbg_4$y?u>-9(Gwo~Q<=GHalukO6kcUWb*ls1om=h76&
zTi;w#VHgQ^uV3c&cU-JExY5S&SH}g9t;dADD^|cwBR}>-&O<aa$A+98hlmQ*FWvY0
z-uGTg`JPK1oQWKr%>l4h8<#TK=Q1seY?`+5m4^00?$ganlS@~vSG4ZOp;AKB{RXJe
z#jUK{iNd<kn~YIQ;PrgxL@q01YPUtM8V`Vp9o@kbNL8%0>mS0h78<LXD@=f%N@nCZ
z=cRr4C1594;N2cNNY5yY-uINgzEJqU(z342ynD82|MNJQ^mo$JhSFmm<k%{=AS0^D
z4IUfO8l2MZygsC%xRLf8amtEcA1#Oj!}Y2_yNX7<o0Cq?YM=X@Z25%DbFLkhatz=7
zgV|9Z*Ju=AToYm!>Ypd}*Db+UnWqbbC!V&rxuQ*>)+p1qw@<8^%cZ$l=XvzCaEfJq
zst(vvgLM^R>&Vl?@{=&VBlh;F=hB&D<jPwkTnV_9nu2GJa4Y421o~a4@_udsKuebK
zqZ9-6@8QY*hRezNWwq#CHKQ~Ro#=(CsEf($lMAeV+l(JcLvrTxs82#(+x=kM5E0uH
z&utdZeTILt{rqkAeSfx%(ia%+SXE@8!w8{Lw#cH$m6K?F{ljMQflsBDoAOJn$+nyH
zw$!8-%lE^}8P(z0r;OQOlfZ2kx^0QFNckGSsx3zAC~o2?Ea8E?N=1u7?gOk2^p~K6
zlq`T4Q?5Yk@3JxNb1-cehnT=ccbl}WLT<Ac!CoETpX{@fY?toS#vkPeLAvZ)u5UT8
zJ%Ime8n}$i0N`NdkNCvdjx2KQe;H(O9mI%s4;V7_x2caKNB1LYTVNT+L+G%xuGZuv
zAZ|TDa2)o7lGfc4VrwaAls!3hZzDRb$U2t@P4Wpzc@&z#m5agjb>qoTZjJK_VZ_}?
zTJ*g)8hO#C=-bkyNa(&|Jpphs<QAZ3rnq-;bX#^bz+n`B>q`Yh7vgJvF(+|5e^(5U
zkc=q}y5!HO&4pf9nt#DYyrBB!;#M9y+abxI6`@>A0NAItmQZcKAg?RHImjc`bf)C%
zK(VSpRFIQM3n7~MO}qKr>&hA{2%vgwu=6k2TtN}a#;-X7Qx4|5k5I*2AwUoBEv(oC
z8#X}=E$9X)I<mYc9>`r|5NtYiEKj_bue`&owd`5MwY8vtk89zp<hDIgSUy*X2oc^l
zil{KxfzmFo0SA)@1!f;~?{McFwI*1^`L&>u3TqTe;O&*SiY(*$T`KLR=}Om7B`qk=
z>>3d6*&{o|fId%)O|kgw_xiAKpfjgQYQ5d$E6AMIqEWmK9OUCLss;_FcPrg@DQ)o4
zj?le0jjY3+f<?^8a1J)f+!$Q_y1hhVtoxk=yEQ)1&c4u&pwxL?S?Z>weWho!n(4Ad
z@>-7Ob>(|s>{is*96u^~-g+Lv1&bx1s~>U$7l_OI5n(fD35qe_eiAWPR+x?@_7wLl
zuW@yBxA4_M_4U!y;}L6QRLAPFBLv_ZVKug#psqtu(l#CMK~TpRiBqYgE(zK^4W3%M
zwV2;G#)QX~R8)rS;Ljnc=Q^mor`zA(`f||3Qp8FG6tor4R0JXTL)xGUC`v}c)uGxy
zyPs2Bx4)t8^p#~Q*JX>qg`g+_afA$NPbIezw)qevOY?N+Ar`}CB9?4Fh&f-91-rk}
zEV9r8!$3PCr9;6ny4~%l<q8N<++r<k2E?^S-`pVfrpOp{*&IFK`KutcoshI~%nRnv
zH;Vl3JjDK8(8=3JeILLAQ(+9Ns;KC<7cONE-F~WI(uK06+Z^FF19rV_hI)eOkd73H
z7%FPj9-P&oQI+5%K3bVjgxj?8n>0z5`B`e~;Cv1{=;SnYX%~S3U~;mNMvi?%+e8s}
zBNj*>t#C|CS=NM-5b1k5E880%ai{W#c#~eR!RG8}+B`EW8hv&7LYV>U22%4(1J>;C
z5YGY$#C1n&Bl`t4`(p%r&-I{|jyiLZD1ifStU0GB@fbEnNS2O+n`#d}dZ@P9QV=g>
zc(}fe3GAJ^#oM`?s&C~9Bd)ds5=qftAlzdaM8bY7kw5~Y=xkIQ7JL~of}Lvplulc=
z8Fiic7W>5`#=w4rDcRBlSjPc%Qk^0b!fmjC@vInRe5sSeoQXs*vATko!p)=XFeba?
zY45RsY|}tDKY_}~kU~y+%+aQdWlaUf=)4`07?1MROx%b^<n380-}?6MlUX)J?;doR
zFmCd(JiBZR)^qLmZhG5<-&Dubvsdl9-uJG)yX4bFvCZh}s4e#3O=0dqvw}KFhJa$L
zLlhfKB^yLx8&)Y3s`ZxJGBfX2&kCIE{AQ8s$4N=sZHdf&WCUMOl(i>H;?$?COwR&b
zuv1^@r3VmtL|7>O1R_LurRp*ap~FHHik{OYuX(*8?a$@K&g!#*>_a)8f$Pi?-vmq+
z#aG;{u|2Lyma=8D25~c3E_E1(m<5>8o6~mwFnJqA8OhkYaHl7wLCGm(f1`Xo-R~SL
zB|otZ-I<LfJ3%-5YB6XFLsotZ@c_7AvI&r!wA#b^z4tTX#qIbehCI@kJy!`t2pug+
zeytuNOQB*oo$)WAQD05wT<I39%@$XOS2IePeKZGH(VX5Xe}Z)5SM$M=PR)0iH>0ZF
zL4K2OBbJ`agfCRIbm~+1W%4<72-aHK&T~nJTODPUU$&~Y4Cni*O|O=*Bq>wVV|OMA
zGmq+BFD^eUcE9%Oc)c-(M}V0`HTt5#9a-`>A&<?`qs8p^r7edN*RocV?Ovy>WQ{1E
zwc&h@<a))KA*$-My<H!Cs6ws3?{$c8AGkO7t=G6|Tjt;oj<}1EN~D_zzjDf!-O--s
z53^%ayQ5|5zGW6Yy#w*5mQQ2HzmC40wAZ^dpO|JW3EZ=_S&U}sLn*$!G3m5)icVJD
zRQ543zqTEObaJgqvGvX{$r;Iipfn(~xX+kQVUDh;Lkp@snI}ZBzo;H}n$PF|HI0%^
zMU3pWl!-x65%Vp}3&?=r+q#e8kM-uV_GkUJZQ-DiE9gg~_c;)aQRKx>{>YqN!%c5)
z)B`ZSceu|1*a(k^*=_>aNz)z)G1=`dSO3t>S3BIic)1G&(=ErnUll*ut_~VqU1y%Y
zMpQiv?dov5#_BuJkl+~16yRfe<#geULjt?#;S<&<!$Kt~9EKNioh)*XZnBOnNtxyn
zKQpx-6r|f(wH>3i#f~~o6)zcrUE3KRupDuDM|2*3jn=}VC5GumPQrHHaP@^cjVX_c
zgMJfbuMcDS=w9y)c)XJP=8Vc;R{q1Qe7qKGt{OeqMZpt@M8<1~Fj(962U~5%J{Dc)
z*c5W<VTAI-prTR@>s6id_saWD)A7gMvEEF0#a&pJ8J}Mo{n>x)w*28iGNoVZHqvq+
zjSu}K@LfaH)o9&8z>9z5jP$~8%ueREau{j9A)i$QU~?I*d`_o2e-5KAz0$e%lF>7n
zHN2a$f0%Ms7P3>V2(f4A++hKGz1kEsELWVsr~l)c^$Nzd=Y6kugJw+U_M%Eomtnk7
zF2(Cw_%tqN$Tsr;$rKj~d5AgVz86h6D7AO_Jim549Pdupv$O3f_3q)b>fIu(tRSaI
z@CzvctU8I@%z`qYwK;o>%E?X^@6>aYep$fV!Z=?yrq;Uektgb8O8VxVPjkRZzfY;?
zyd|VqVKM~=z=ZX=ICN_h%%rrQ7V|LIjo$1(0Jtdn7D;){gnrX6pC;AUyl%VhzKIGm
zD_~8!Q;hClpQwkYNUcF;_J`CK%nsW<_E(eIESQRsQO&zZ1njItfu+jn0k3-X2=6BA
zM_dn0scyK)GosP)Bd42pj+|H<1<A4`%)6(*i+jTD3cQLL-9AT&6`(Z2ebBw*021Km
z>P0EYCuac-IR`m#^M8Ni1Nsm#6ToZH&DUzSX{2Gz)TbLO)(GtqEEVLH`<ygiivNU{
zbj2Rg3S`KRV4Dm@&}xKWXFta>LZ#;@EYfpWwiQ5NHs~7RjF@SmvEa#PjY-E^0g;R_
zNmR%qk9fJa`@Bt2kFxzWUe)jTwVFc$WgX)8jD^n`N^%AvKy<iTFOhB$1`-h&`N@bq
ziTxqx@WN<5@Z%qfhd+_lJ*l+Hv1GFZJx8Ceo>obkZ*6a$p(2`gAX-|PYidv6BLB`<
z+NJQ~9j@hq412U~@s!JC6R>}Et#uom4WN{3V--D@5KXAw$oiq|txfWjWPg+7JU}8?
zmtRh1sU6<U6qJY=(qitneX4I+Hu!Pg*bGs0F#Y;&X2hL=MXRQ3YG_te>k%R^G%1>g
zzFyz<a1c>a)?|eZ)iJh?JW-`TuG^;uZRu6Pm~sNPSNmpv_+U5xVG-{bfv7jJI2phG
zrq^TUCFHavuah=;Hb~kWG|b6xARK*}YmhZpIBv94Q85;M?&gye=|9(@cRkZ9l4SDz
znx$7O05tj5cb(UV=I#t&%eb{M@v_hku+*;1_rE<;dYDkk=LxyO40P>RN*(5{X@L*c
zl-SB`12AhN96^C*%iRh@n30}r?)`FORS7h9CZ~E!`n&{Q^Yz}k0`!*$)y^q42qq>0
zt@dFh`IePG@78R*NNm4-Tb_R}CQNLvH5awK?mLn<iHoLTNm9si^VXMt-6GhfCjHFL
zsG6kalDwWRY);Viq5tnt0ElA|kU(FMl3TFAS+wn5H_5*4sNzU?%~eAvg=@IcEZNi<
z!L-s$yiT%s&(i!XWuDW9uBioJibYVH(la3vcJrIp)L}8vhO?9EdcTDsQfHp1v@e{|
zoO5{6sX>ywEU*Mk{N0#4cmzLMBn2{f4qfKw8g1v~Ac~GuoqoKY#C_tCZz)8yiDeCC
zc?0Bz7iCUk>TS|#GvfS-7IaT|GgPj-J7h;+de;C*njvt9OlebIES6uK>-?q*lfj<D
zFiw8OzVxfw)e5_TM=q8-_UsCe#l26-gJiK_yu!wYTYs|Woj5ef_$s%PK`TWER_nV*
z@O)~#bh#Fly$#?etP%zSPk`iOkmtzheWc0|10N~v8QEs$MYOENV)QBflk2$Hb%*|4
z(4^k)IzIQ!ef1~yU@Z|o9&ktROzzLK&I`e|GeO-!!e8@BI!TaS`$`v2oRkZ@E5>FH
z>+E&V<{!@FVv8$QiJPUPhb6C1ciV(syg9hwW-;q4$mC=(h-AH3AE#DT&L=8=?ntj<
zoRoTUrpQ3$zLv=Ldh~IBROJSsKAEppkS#pi8HJaFPl=XQ^1fLtyiP#*xH@n51AlGN
zt<wc^&oaAu%vah&VDz(b@^JgK31fOwS0BO|`KwmKfFCz|fI>cU-e|J9HXSe9xJN|m
z*vZU7vWwWuaKDL$sMBrmR1;S|)DD7bauna%j~*`z%Hk611Il5!biKH^5^JT@&M<R)
zhQo{xK$W7WsQ2F{O5(krp*X@Unvuj!&X8&rh#pTRyf4E+9D({Xl{Bajs+QN8ffMv9
z7Yo-b4%vh@yB(iy=?9X#D0QD8(xzi})zEFP9nL8aUQO4D9byvE7cPg;5Tgu<(s2Ej
za7rTc%OAf~rLmtKs}wZ)aU0=F?0B!D8Kl7eQq$YC$T!;303b#Pr{dh&`&n{5plls&
z70CtMu_#%A%}xXuNc%twaj1!!p;pE$Y|~f!uo3)yA=FQWg+blP#ngN$k~bgynM?=q
z%TX!<Eb#8{NOrUg)CzNK{K18>xuK_1fC0Rd{`u+HnY3)zyq*u!;&%}Z-8DDFXdh`U
z0&}NlhZ*(MZReOi!N#dDe%qR?pa|^?YA)@ZL}OjZdA*iTTo=hc!z%s?K&EoFd1!F^
zT->5GuSEADW{e&~^h0w;3obb5vV(N8n+Zf>ivqEepHMm=uV3Wt_Vnhua6bs>j<he{
zn|lTOQ|4Goldr!X`5lrdnyxjH@W7w6Py2&0>l-GeM=VEct%-_0sbxj0i5NA;8ie;i
zg*b#P0n^_qk{hGCiVP<Y0DnZjP*VJ+{hfMT>pM*9#(OynNLV*!1bEDE%WkJmI5sl-
za;QG+29#2<it>>(Ptqec;DXnQwV$2@jjA7ACY)ye4s{jr5Rqb9aKKPJu1oU<5Iqug
zfGSF)2_*_w(iC(wEK;fR=C1AOuKyZJDT||OFKD5xs7?B6Jn5*4Gv|FJj8RNhNEX9L
zv6Y33(a0@@?d0jw{s|mpjd;WqRI=)+;cdr1_~U8WD;v?e0KZk$%sI+oU@izp%u)#V
z4St6sQ>ap0p^Y-7)mGgaOkVi)Nx@@1S7CD2?Ypx6Jvv-4-ocqkJ=FnOzm!oLj8vm6
zeYJp+q29IBTb+beXs<{#Rcz8c`=ZohbKhf;`4#rFG2<PhkT<dgE3J<Krr*^l1@d`E
z(p`_?f$fhKNUP<KIX$1<MJewDQQO2*S+Q{+cm>$0uNXqk*qpTXF@w=Eqp3<dg{j0E
zuW4KO$X#2r^m2u}OVJnYiT6j1jOsmV7H^=vtG<49p3z4xuh+)IT69M~W|AK*_nr5l
z!-}ATqt+&!7e*$pwqi;D*v>8ue!8r3YSr%4^C)y3pdN$<`b0TgWIAD@_$;|^)9&yk
zM)TdiKqDHZUe_MBY!2|u$h0n(rdaRb`@DYp!WfsTqp4In0<z$8WAL~S>fV)XO#G-F
z^5Bx9`Nd(Q)T;+^`6s5=%JZ9+iE^w@Am$zUsMOtri%0><(w%Jye$(r>?#CifWqO<7
zKAS0bdK+*_tB2h`J)BA@zu)zmu;5rzDF#F;$pIUj9L+RU|8Hgo!e;T`%%(hL-xXWr
zo4BI6AD?L1i$=?#80WGG53th~AyN(EfMi`#Jx5WTO9>J>IB{hs6AX2C$_C>Cr1{x$
zBbTEZ5v1M|(I(eElU!smLb+5O{h_;o@G6c=V&711#q1()>7bRXPMWY_XBKGaeWsX0
zUC=I4F5U{@v~Esf{(Quui=`{U5;9s$e|7UqjGBOmCfeehqTp&Qw+}s$>uMNg-PtFX
z7xUD6^z~&GhUL4Q4JWg*HjFdZgl&gBEvnm<vWKdI@vW@@j~uO!WA~BhPQclnICc=v
zX!N>H^aX%xEC~62yra}f8PkJ3+DSGdv9AsO*c7E6<GpX<v@>^nlQ{w?39k5Eb9t+#
z<P7xe%7%V!2bd-A(cb)&{%6Vy85=ye1GMn!y^Jkm#@<ztSjPo-f;SfkedunhIXYCN
zG?FLTSdh)pLqAvO+1ve^*0`xZA<v`5vAoWDplZG?^v=c}4vM0c8vrL;oG4tkgGO2d
zTkUuC+Dx_c_Px|9VI)`H7f@;5!x9?8o0p0~#qo+W@;TM~(*j1-{4SGDbN?tW5+XDn
zP+la#tu82GI8bvC?9No~9+0?)*R(D1Rbbd+Awlie?pG*MP`YiOk$g!pV|)qa+6|OO
zRevli6vX_Rp`mRgfq~b(-AKAq$k0X?Oe6(a-WOA^ayCCWJ@Ww(&-<nu3E?O4v!v^V
z&<`1jcJ;a&3HN+zsn!xFuJ|M_D{Xy!&(2mRx6Y?E<G`R@y_%F$`7IT7pfpN_4s^Ra
zAkSC27?_-TK*j&pE$P;nxU!>%byD(+XfT3iNA;Mown<$|06724&g*;LzXDIYAwS&8
zoRmM9J)J8(o!cC-gkGon_3KsWg-c?b?*0L4u>2w>39mW8Z`4<|g(vB|PR>O2_ydBN
zt*hWAN$r?*H%+gdSJ}x*BiT1xIi^+;OKs^u?`rW|?OnsJwK^1?ndkw{eLyt4rZC{r
zf*QtWB`;Prg=2K^(Z_4L=4xh<ZGMlP3=&Cy%f$)f#gEfA3AYd6_MXPNb$XB#iH*Ar
zKl)Y&PTpR<i`msnjWeb**_d_gX*^PzWR5g)_R*4pA-ejob7<_&Cj}`*rH+}uFk6sp
z&=JsIKO_v4t;PkLn1c776lZKSye}49|IP?ZVi5d;5o%sjfT8KYB#J-~9W0jUDF=u9
zWelZuG$Jb*Wk%A<8wyvO+!?Ic9x)+^tbcBMKNnTFpOD%oVP*<omPsE~3#x4D&s~5B
z>tP|upQ{R4WKvbNBtPfb<Y5kF5X47%)rR8auROpRn2!w$uCT7FJ2w5(L8|Ai_@bdh
zUtkV3q)wWeZpZ6QLIC?ksdX0p3OO>YXN*fj7VkxXNUk*#1D6m#!&F&~s`>p~K7!N%
zZjuwscM@}f(Y{Yn7WFI6>eeIsm%{UIO@sBjwWbi$MBfbwn;=0V`;^0v*`3HE1JU0$
zxa4%xi0PTi@yujX9DBfjSypb4M;eY2JJ?hzk7{@Z(VlAPgmQK{*r5OVF1~kDEO;c&
zVnDW@cTWT5&(AD58|b{AK}qMJE_pEU{uy3zLN>k~RUg{<!-wIjM`2lhBic0z0c@^z
zZzXYCT34ix9(cFrX%L4U+D&Z2vdkiCY5_wnEW;dkD5HRovQeB0eH!77_MLc|p6QRP
z5*jsn#b$PSaQ5?D!g=ospJ~L^q<|Z1wU79y=9O33ldO_XW(6aZ_Y8P~;nMo7y}x5^
z2ODh4b8Np$@tH5tl%3&0?R*(zq|ZMvykSV4e4UH}J~Kg#_Dou)L{IToLXjw|#G(3<
zW_vEQhrty-M)fA~+uq_Mub_;>Qdosisssu_awC_&%VHqyM?5>!9j@$~A!jguvc&*7
zRW)89Z_AFt50>^U@d2^iP2$IFR6I1g)w)D8oyUq0VHSL?dHHJgSw%=vmh<E?gwkf^
zbB~VhDpK-SUi=!#g8_mqX`U%6pt7=P^&Rw@fOr0Yz?3MO7N3O^>s7*w3)3IZC2fma
zf4!cHUccMzk8IrxoI^Zyj;={z8vm%TN?=RaFIQQMhu+V}qsUO*I%-+hunV;M9;u8L
zzk5Nd!n-(pqBu_5oEhEwSfYbhUob;SeyuPlIdz=f2DAFyv>I-<4=6ox)b(|`jw<EP
zqogycX}BqPmxfn}mEO`(%SFbhy8Mi4ut?1VTV<@Y>)n-rcmC3bY>!nqpWasx#|7tm
zNQf$U#b|Gt{1T{sZe*gD9DLo{t$L(OHg_i{^Yp>_B*=nvoENp)XkY-rWLaui=EBG_
zZ#%>&^n{Phdr|T02D&9>MAV+s^_Ne#nH5CTzks3SirHTRd;EugC+5Ax&jNZsp^yZG
z(NYXW4S$+7Pv+z#@c8xpei#cCMZ~-bI1IYtfF=9e>_Ge-8S?Z#PItyllCGUhwdvkj
zIRL=;0&xuhot6+Ax&bJP6A?36y#d0Kk-N>3C5|^X*CugOO~^iG`w7pLq?*kmpWC2h
zB(a2jk7js~-&{PSS!vs#Ws`aJAc`L=kd%!5_2xz8L>IVd#8$9`d>ct?M@I933>g;O
zuH?@<Z-vY`0OM;%G%w{~iFh|diH>a$BslnH#4?W4$^VJcQtQ-j!TmHoJ7b1xw6Mr>
z-N^;4s4Aqv1Q1C|)IH<cNnbHk<40ARlI{BhMYpLrFjD?7%@@!rz@noSuT6(2F{3Ha
zXK6Z*=aU$G5(<mJs<Xy3OEF6$^Qv<yeJgtE_Y156F=Szb_swoc-XY;zat+-n!WR=I
z=V5UgtRXf-xE+bzqfPM10{M;Y=I`ckBq=^h^4O}@&cT<KpK<5jIQH>k$Y?Js9?V>K
zVOLfTBrUcMtBQHD!g{`JdQv^Opr1xy@dS4$`BH3a-Xtc=2e4SI{w2sun?01MO1zPz
z*|Mw7&)=1cXDP^Ir$(FwX#;}pK67%wWv5<DIW7L}8lzbekuu?NZD)+v290HA05nHS
z)|x>`{T7c<*Mb@y6bq14BqF>B6~9fY|3!<hFFwlGK*tIr&WUKZ=0!(B6AN$e$1dZe
z<*uBe`aSz>9RR6KQ@0C&MlEuK{PxHVr6Be4>%iHUG-qQ}=ITI}ud%Nxcn3pG{o`&;
ze&&3_LQrk!%vpnFvRlc7xK$IcP;AH#Em1QBUx;*-XNh7!(KFsQ?^mfIT>FZBK@Y5g
z1vuGiknSS)xBLE%<^HX?Z}oDpk=^cS_v;f9YUoc`t>s0fYHYOIUSKHAMUQ)x#rCdE
zAvflkh<xoP&<dkTRIOeBXh-Kq89i$`^6(@Rfg$E#9zt5V8=&+Ouf<hC&9LdJmn1tV
zXp|S7Z$$L168c2Pb+xAOV!Gr|sJy(GS@-!MCT(+44N%optOxwmdyTucTOgLD$pj!H
zc^^h7MGqsx0*nMzm*?<Rph`6Bf%ZAB{E_vlI~({!Il@Hq5ecQ28=vc$mOl`H=gLn!
znfjEl!E^>9=Q)@^FB%X`qFb`ByXXs8j1Z0GyRNgn4C!T|NgVU_Bh`aQC+CZ9t!J4s
zSZ6eO%hQA+$uAd#fjTZf6ub`9=E2{yJl_5jw}y%N8CC&ttAltSoUX(vDK;A!mBFar
zC4llpc&@qzDUy6EDMMBQ7DQ|0xWs7c7m_0W%7PMk$aIP0M!K<l3U$IMW=FjoiY^`v
z0fQiGSk<pk9QwB%F3p*~@XP?3zG;%se%ARWs~rP#+Nd0Z;hFihwxie6m9MXW5bc60
zc{gshV~Je`CdU$n+qH^TImY0#2Hev)j<F2Xv9E>e?`E)Ov`H+TlKxPA3_i$owCj(`
zm&2noO>;GiaDI<r^axPRIxMlFCBsVDv#pMTs3Z-@(l|b7P{-DxWt}N{N2({dezNWM
z1k*BvoUHMWx-06iR(f&NP4IokV9_*wVczW2+*eWdQ7K^(mNoebjpHPrk?*#{1jAFV
zSc`u4WH4rYy^)(~k*W^Q=27xoeVw98F>P>43`hb#mp$qUmj+aPyy{U|et+;jox^;h
zLGNL?3t^Dh#~Mp$u<rRujbPHJLYUhuaAYb)?x-LX$|aU}w>0&Y;V29Blune6w`Q~&
zS3^v7lx&m7y-p%e<nYdH!DSdz1f(svqP-Txch%;p>g}Y50m<ud!?zf6Ei{QM#5`0}
z$_jF#x9KUen|cCFMd|Hs<H0|6m;4zb=$R9nh>f4byPGf9-N!=NOn7nCRL7NRm7dRG
zeoh*Q1*ulz1Wu3vn$38arM1H&S@6`$F;=rkgbgoJkm1tsPQ|SA6uEg9NY@9rl;AC#
zVzMiCK1mdI-Y>BSe!4=SkYy;WO&Y*rNhCY;x@8}fnyA=d9}j(1{VK}kWj4<)qLJYb
zyum!G*0EsQ)x#&FqNW_uGn%d@S+XHUB>X2nEbwi_CCn;}^F7sf2P#ZM`HF4%6!93)
zMqtm#ySZ<dBOVd!#;E#n5kWs|jEW8fvSIr3@0nnvAA6Jr^cWWS^FBCh&7A=~HkV#n
z8>bzz3zvokfXsGw>gLffE+mg*zKnuU$}j>;mHz%CQ9O&KvPGZ1x8L$NBhQf9C+mgd
zC%&TB%@UP|m0jrk3Nq(<tNZRJ1zB!K0O4+bzIqQ+I9TrGgH0+B=b9<cd6K%+p~603
z<f%s&X#Y8W>JC>GoHWM<?Jm#QlzvK*j;*`Qe;_`>u1gYAh~rtSYOhoq;H{`@b@e8#
z8Fmz~kv~IEwd!2ojz0Ch;qv(^$P#6If?_hQg_DL5aKdbGGeC-#)ZK7fWuNQ8X*YE>
zCfI9oO8ES0@*}XlK$`jVq~xj#Z{@<<lE1OFaFN8@KT}oI0@T5EL5NEM(w3{Z1>HmH
z5sMB<17-}YtVk0C0V)x-o2=$fx<ga7%4)hSZJ#+xmW>$|(Ig9l*k@od5)riMtfA=y
z)x$*F0BE0Wj$fSvUfNNDIYR@hdH$IWNQpIxCL=x@>zhuj5UX@90aU2~U(rZ+h+wN3
zrq-kn^vARbAOUJAEUF-P)(q4-$}$}+FsoN}jdJLy`?SELL>}lxYxjr_t5+`frk^LD
ztDRV0jTY>Wu`4?y*n-biQAilK;5y8j&(y}~;g7NIY}W@*&<n~v2D}G?RzV#G<go1u
z;n3G_v?6AQz3fw->aiTFF?zK63$Kmb$Bb{(r%bdiYbr!$i;qsYfVrKj>mZ7+1aC}@
zvE+tvt|&Wou77@p&vxi&xtfxoN;bdXm<o6|_=q*2iq?k1ikC+)pk|DrdHokjr5fYi
z_qyW4Vo^MDF`OFZbDis7eHH<Ep9__;TMdNV)wabxnJ)i+YaC`6kuRRgb~NEjI{83{
zL>R=GV1_DQ+Km(PQc3iH>{Cx#g`V`QPElpF7mY!*I(1iC;j@aF0+W$EJ-L*MRWs~7
zb}w>Lt_ivF_+M3cJ>4xY!>r0*oP2d5ijHxzfs8f^5uHV<b5x6Z2~~=CUfN8lGaRQ<
zxTA^cu_g^=;=1i-Q{r+SR7?$UFonXR2)`E$R7FqMu&%eAu+tHzwUTOtv6gbDJ+{n$
zQ@HkNStmRCES9pJvYPftnkOBN-GgGQB1jtjQ}yVp=mSrZ3Xe%4m|SE|M_wf(Ui2hy
z`!$kd9sUs}PGdLDNPiuH=?9<Zu-v>C9SINZUp_vVFo{Ze4`08;TozguY4~frRiB+s
zjt~uo%Gnk&XsBmWZ~`NgdyBe8KmO;)YDn$!_ER(@_N7qNi%L(;%cGPUS-oD9o9kJ#
zM@;LE1f_t#tWWW9UknOOt?v`~7xj_<-NgC3=MDxze^;}ir89*(4?R|=7+D$yhfDff
zQ@uCmmO2?Cm=9Ma9dR7~+AQ*lg+LrzA>Kkz?xc|eBg?4Uu&@}bz)@H*;yHnY`NV?B
zunC!{b?iVdJJ~dgHOe!d%=er$yb^VcVvn22G^$B2ylC1d<V_er+FML+-Ro{eyS=qd
z<wIGOrUNSEt-&nxss?44&-aH9Idd?@xLIwr?&V+Xcs>Fdx!O&_a-@(P#sL{Hf`{)^
z&hBCwmc!OgdRZ!5Gm5&FvI&+OLL7+kB|Z2;`YYZA6D?g?e9YtLk~-u8(|!=a=EmAR
z$%ntx`pD#2FXHEWul(xhUf6(5lPmcfSz{f!@nV)}c7pP`W)3NmpM2^fF1^MbDzqAM
zW5mdo9fPIb4Aw-`U+Jd1>+b_ZapbtQ?ji$ke~SdX8mMFf<<wk%q#uL=EsR&Po-wKq
zH)<?g#$w%$zoQ*WH7{{$I4p0YwVSLoxgM6?Yttcu3x~qDCEME5EdZ6YXs^|KfjMkB
zfh4ju)Fl(fg1R0@=VF!%mOV9*k?XHqhx>H)^P6|yj?`ODl%AWIR8~gIKMOG%q{<Q*
zK>Vsi^@WJnP{XppzO=$qPoE@t)Wziw>p#}ZTV}$1QghVyZ{x*x3BvjU`s}HX!-FKP
z10@c?k?d)-UVm2vY4cHq0LA(E;Bt&`JOk$S6?eH@2C)Q93>QYZB{qctqU};VM%j{U
z39-S(h71A~Kep&hJqGpD78x=Ag!*IW^QVf^0j}u6jCH;;?be<I3Of0XRlZW1(a>HB
zY1Xi{1?(i%oHYu1Siv#$`LD#5V^R~H&GK8K>8QIyZN*T&_W)HGSYxaEJAw+s1Udzw
z3AHMzd*wvHxa|O$xEO_7%c<XWrRi^+x1%;HvCCFR>IE<#z36bw%`B!8GmSch!WpYk
zRcKUG1@!o1jE5T2xe;01`;2p7-dwqhaJ!&~SdOztOe9VY@3To_zIuY(jSd~qD<Dd^
zB$pE2#Vyo$Kcl9YCrD1Cmu?kuF?7sOR=3@p7e!^w`jeARGu?HWldhS5E>&I^Hx|9n
zWGs-4?6IzAy@7A+-M&s$*thN2rCPHy`l#<15|L=Q(@E$9yZmorDgmsV_3bZ1Q2I#L
z$_zOJo!#%a;KAIstP`N<m31-Z^v^LOMHZl9t&R;YLHd|=9Z4yDiXzTP{R-U|i6;74
z9IETus!c+E7-u3@u-CWH6Ml|`uuks)?mc7z0rQ@l_96>S?IZ<Rxf^Xdi22Y!aJ3HK
z>$eV0ieFj;GB9vYwK+Oce+%LlzqRQu&WaTS5UsY8j5#ibWSTT=|GeUWy%PlE8idE+
z;fHjGUXH{;SPo>K-Uq37=X^P4gywC$4xhx#E%bS`@HgAWd*qqemcf`4r2lL4H8Q_S
z%5)s3uRwyqIuvZseAs}gDblbn-z}AVzS}53JZiIJyG@z&DueE*M^hR;OdGWO99e8;
zpLe<k0V!7orFG@pH=bxRIn!DSl&M9Xs1YMMbKOd1a$WhyTstqS7w_P!FUi4IHtr|u
zYI<4*=8Tno`dCQ&b-%z5c2{V1WdiMzZT%ux&|G%3z7j+d|ISNu*qg*HqWa`^71Q^N
zPHo!I?y4cyUm`ZH&B3L;NB?S8KbNFLiuAs#;17KjjId~2UqHYW%$ks>he<wQQ#M~m
zAOsydPsi0j+`-(IQSwkhvkm~Noj)71t4^kIxQvPDGs@~x#N@Ky9)|d3Sug)o6~%95
zh0RfV%Qd*ENQ8=pku0NCzAp?K%l6T}CT23gplYYfx_{A;Op$0^)=2k2bBs?Z-OC1D
zy??P2FJh4ObiZAQ8KUeno~@u2o>z|KvO60eE?g?EZIvTbt6ZhSq9EPIA_gw755~H#
z_gcu{ccOc!)dOgw6q?y67;awyEx3<AL`AX(reRo`2cNP7uGpSV*(|bucnF3akk{S6
z3+mQFqhHuDfcmof2qB5s!TwmIanhY^qA{@Gj2qxC)SOhD4{mp+<pg~nJLg?@zYam1
zz8*gj5>>g+JEq48*Q~oTS{zy}AGz+`5wY3Anp<{RMt!hX7qPiaPJbd1tz{O`iR`%d
z?cR{Q)V8%EjO-ADZPm;9Mw^w+9$MXR#=mt{kb9D!qkQ5WJ4WZ3B5X^{%>Jn*;w9><
z)#*l7?_@$3X%C;A7sMrS*7ossoMDT4)+7IbrC6E&W$J~{uC4`XTvo#xA<gw9YDp1k
zVaRe5<aTgv&U=O>V&ZYl1!nGheNp&#D)mmvup3m$Y@{lV^@7{HL?^|+uzlbMvt($s
zk)plH6g%17$6Vu7F{XNnP>?bQ^Q<xRR!`Vlsrn3I6-Sg3QeVe~zElROoDzpud|pp<
zS)EzO{V2>GT&{C6o7vxvJXkiR>3{C}2TrD>4hayUHjMcRMxs=V@TW&_((io675l)y
zwSbF_*D9vN!pJWrz4U(nGTy?L$J`P3>{v4{I)b0=!(0P-vmb5cG0=wxqf}LZW~m!j
z6mq`3=~7hIRw_f^%RAj_a#eh`k)lf`-(7Ou`vAfewT&0Yn*z-)@rQs#f|Vk>M3NHn
z)*Wrh&GY%D&(?@aCZ6Zt<Y0P#j+W}FfRB&{<-bRhYtzWvwChO&W~ptM^`u}QvbQIM
zy@F^N`9l0I=^d&Dv70V+A7+KSUP(>c185KNZ{EuiS$E<9;nbSVI>=aYm;dmm#rBe$
zURh^PeVBxKI!TJ!Z;XNgwz9`)?oBVfC$_KiTp7RbraIC}*P8qJ`PZgDU>Mva8l)nX
z+ndMjs~*jw%q!09Lw{Qx0*4iN_%7A{6=fJog0R3KqPtmID@pA}15W6ho3m<~NNtAz
z?5bcxnJ?mXDF>Wq@_tNI#W*G0AoIRV2jY|@L31|ks2O~k`|>=*z1bGf$BVLcXO;yr
z6TuAvx~nk3{KSfw1ytNyb2kPTjv&E~DLcc8*Lgt+evrw;Cu|(oZR1RbuP5bosD%#l
z8Av949r6pS;@470(6&TYxoX&8H6FWQALXn2Bh9N2+i{lxWJJX_Q5o4etdLcH`C{)m
z+EB?zNH9<8<&R7f3#`w5OD9?%N9AZT61So)cO{4uxTpL>!J<zw*%Vu5yT5S3szs%T
zX&$<~Xe7%5&G?nSzqU1gJ3s-6PjRkS&JgaoFDu6r^x$nkT|;YHuGb=*>Z^(4x0jq2
z$~sw-MDx{YtUo>74M(LOpBzMNM)JQ@6PgJ=%6{Ww#EL!*XhbP~@{kd-)Rar9?e5Lu
z%G}@GfU5=|4gGI369jv*Gp{0a5q~QeQ6DG7HpU;TiYGYVGl>|<5??$WXFl9N4Yi<E
zb6f^3;wj?SCj|ZAbNRVjG2Qt@r80vKaVwnjWJCxuAnP`Gd1Ry$<ed*0i|K^q?&dg@
zUp9_weRN^>^2zA2g8-w@cG&a!Ufvdqp!D6R=s~1|_{k}?yV5!>OAU`Kjx3PssQRyU
z52)0|bmjoxey2kTJ1LiIamlT@tTo#gH&}tP=!^;AB?y+YJpMi1hVu-uhYkx2%|&(j
zgs4Qe_qb4*RbnEn+1t3SczXdu!x%v;7Z_Pp9i4={{gkiT+xUh4z++XT7giXp`XyBN
zr0{UM)`+<CtjYIl4Db#{9|<QRf?mO+lkUJh4zjYFfN~7HYCG)akO?B8EvZxO&6F*%
zwViGa-r*~6a9Ln{SofeHkg%y^wlGrf4c8r=t}Kg*xzHo)y>hz+N_2u$HEDOiIVALG
z;lzI@huDbe;xna14Z^8^|4#+v@6$OYHe@L2fvgZEZ(MBPXv^vZ{CElS_G8k&{|v`p
zafSs&EoSPUQXdHTCn?J@FcGWe<+)oT_6-+Y(g7({1HuMe@duxOtq>^_{IHf<!`<LN
zV_0SIdzVqoPd*1E0bft}6q^cu$mv<@KgWFJ{`F@pVeKgu?z^hlb4%Afyjx}1PJ5;L
z+0FGE)}A&0dL9tcz>y?b;k(N5H2o8sSsSTZ61xT}x3tvo`SJsD;NkYks~@{oY2df-
z0K^@Bis--GzHuc0Z4RJ0OVRx_8poLsd^RG;w*JaZPU|#LSm$5QfrQv&pc}Fb3?4M<
z4wEv}8TrhOXHCHmYyg27MxM9c5p-CjeJ_Tb82*~1MgKZ1_3xpxuOK`lr$v|P+U%Ru
z?EcxM#-C!${Zc|}*niI<!pElVXmqCUD*rVBn<U{U8Fg~Cc;w^Oe1AGP4<D!*5n;%>
zgSVUKBR-wyx;|V?yP>6P*SPuuVOn!<sVL+czIzVrPfrL42v1LWF_zN*rj*^!L~c;+
zo~zp~no;fwC}N-lF`!;9DT{rqa1&x;2nP|oQ&VdXpTTDi4S0cu{3npiQ?k-nQmy@^
z@lmg%&|dak%j4|w9M!DgtaiU5th$VuPB4|ndMisi?y6t+5&Ncd2afj$$9}`1YJW<4
zyxn8tb=;8VHG&~swLLwIKCDwb6+xVT>ejmkzC?EvJgAA+1t{mOZAvgD0Zh+#NCb54
zZfwi-Y`=~J_U7x$y)oGqcAu7VYyr-@1pwM-OObCCsea?D+~abHN8;;eGmqP%PmskX
z7bmvX>s(#;&BVo(jzIl3pN9n%Io3qYWdUuZuheknu||Nmy{XY@yU8qBf9m0SJx0dm
zr;jnxBUK082MM-+zI-<m;X~)L@6*B3mvwYr>nRp4N-GjI0E=!B;2qvb174Is*QRnV
zZO`eScjQLGLfsn8J_0y{iSEc1$MfcNLy>LgY9M3+&JJ=cmG5gu&!rXt;L#s~f|=Sv
zhyea>r%r$Zgl~`OA7mtctamjiZI|Xw#N>QpML&i~=+hr3KK#87>WHh&$=h=nt@i6N
zb8~xV`ylCx{^)p}Nj3mnuzMBYJK)Q3!T<X?{#o(AuSJOf--HhK4>1%h06r{361b3#
zG=+mg0bt1g_)N$Gkb6HvC%z#8cV!%*1}+#D`jYT3{?CsUEoDc6#(OJeM5H$}cox{e
zC8WBH8V(HdpPyPB2B@K`sv`BKPl=06m0j#WYMVq>VW}Wvd@%6J)ynYq*46}|x&9>_
z|NP*ugYt%T=JztD=>1CT-J#$WfUM9Y`gq-$1(Z3Sn(#IC{?uAc4zRYIkJU!Yp>yK;
zov^jtueILpGMk`nw|nF#YB;j$);SNdRLiUTLx+SDN9o~Tr}6*$y8DN#WxKVmFuQ>S
z#>2vFAID{{v#2|OckAE+I3Bb;B6uV;ohK8o)0_Wt2W3UFGffykf5O0i4%xpuYU3RV
zHPvr+b=l7HEDHnY<eJ@P4eaRy+o8<{-;>gk|6F7E&+mRBeFcEh?N2xQ4=<(-crjxw
z#|s*@tIZqplXM0$eQtNke4if6AnQH<enSA|b>A?85{~lLe6{SmyxQ%&7Di9Jt*)Uh
zNe)iM-i?0+?1d@9_xgWC_Wy3zw;A|%9V3YGUGJ5*Ei5!TE%wFHECSia7BSf7ubwE9
z?IyP)yH$biEDKtFwlA~oHvsUZ8$kY^2*B*sK-vKFqMt3|El*jJ=OuOuhy?}!<@i5K
z52a;CFxtyMFFVYyk5><YMRT^DuX6!NNZkM+;Z$pYZS?25N?;mFdrI&9&DrA3cDB7u
zKeqEQ&vXM&Zk_&$IlUm^?#%?q@E(8!^FIsnJ1QiSx8Wgh46n79uLxHPl(G*mmwkQR
zR)b#ajIDORHuyCK`61?t1mYFkE&9Fy)HS#2W{lC?{z$%V{8=~0)Eqxpl!%nR#*zeF
z9H!L&?^69E<=-?AgK%t$FfQ7kW2U4W1B=mY)EV3uuOu?dV>MBf>sT^@+Te$NTtTJz
z8$)p4Ff0_1R0m$?zm)5zbgc1@i!a7&;df5{-&X0HasYZ_L{B7<34R0vzxUZNZ!<uo
zI3(`E#8@x;(+_O|q4H%}Je`80u;-<-+}p-HK>!1Aq22#EGpAH6QaYk-UV<I%Odi6Y
z?Elm@_|M7)NT{?kv2Er65Z@u7J5KPTE(woS!XB{g-U`)gWrIztjZmp!;Na=DPQ9J&
z8sUvKmOaw}+$i8D{(l~L7(h}uh~`@Z@dCrS^#DOyGvEV9f3`i|)<#RASJ4~;La8fr
z1aSJ!2m>&2cD-b7&VbVj+ncHUu4^1dTeX)LsUrgjk1=rhpKXdVsN*Ha>a0b!i|1iu
z;id-vDin7Cu-c0Q%;h%gR2lS8$)If=<l>i;pxB9<^%zlC;F(%z?)QI7VDhjP;A?2*
zgxlUj9k*FJ=vvc)JOM@)2b;<2|6<?(<=c1<Na;(CEzzf&7!m;Xik9fZyKgmaM@ub>
zfNR*qa{Am^Ib$5b&-5X&IVBeN7)l)9W)3K74}R9e-IW`SR5Z7LBu^AqFjtJD2;BT#
z{IBLuTBkxRwqxA{kb65A6veLPVyfDq=&gQJ16+k=y?syn>Fsv%lU?Cjk}2Tg2+T?s
zL(_FMi#?tfJ7hYK$s7V5Y=}Y{cO|q_io(0*ZY2<N0iep<Hy_Qil~3ojKXje9<*kSF
zNqkl<gGC{w@9r5CwE+(lUU~}$ar@7YKPlX<-UI&D<!o!%2>|*kxdRxL$;8n&e=&+a
z0O&<cmk0CB1q$hN7Jx)oSWgxw!hP-D@0>s;>CFn)FC4a2{d9Cg?S}v@xSN5`P5#4T
zfda{NsWum{@1qwlWbHk!wW83;`}&g}lIEA_#2XHYW0r7jwx;@DwRtQ6y-^*|VY{$k
zpbnrpSQ$W%PCX2_xjHqUSCjNzwM!h%|N0)2X3+=OlYK?u3$?R`A6?MK+PC7-=vvkM
zPFoG=ybHfqEdyWUHrjyr1&`aGlLaUMd=aeNQMI6enN*9c9l!uYRJ{_5H;iAW`(fgQ
z0(L<v9RKf%PicuLg9ZkH5{}8iHYs5!%w-L^f&kXB0{6R`u#gQefVJ0drQQD!VBd5J
zMcHViQj?W3N8Bjrh}el1Jaq8BJ+GVw8HSKLh6?VLRu84qg?=hMfO|1T{!KhL0A^xF
zAd-*+n9@rHEcFtuj1P4OolM%U+<)<{z(bu<#?}~~0e&Stxp(4+ssiD+{d2%n9%<;>
z@-Xv~DY46gF!PImGQ$UebYPyp3mOHUV~Ckmd;C%QI+ySh7f=-|<e<aEq9$53J3;C+
zvbAqmeaUOuhgA<8NzgGi&I&A2ecJ^W-TePmbU`1y>+Kd7BG_6#=hriKES4LHqE-H=
z4NQ0m_=5;t0P=A$M)dKsyCs$>oX4lei0=(OIhT1PEp5-;)|_>@3*ZkGcK)cV3+epz
z{ON-pg6tLlQBb(dj>zcNz4P|uv}ft-NZv(fRjJc00Q+fN-)Weq%xQbX4nVFf1+#1^
z@cfX8r)$g-_H3%1(izP3zMS2PiZ8RRjW_Y#fH8~+B%c{4%iXC2DCn?+wJ~<kLGpH$
z|NBDjexPLRcqx(DTBHI^@vCeFkSJ49D#m1)^T`%~IDNaU?a{*x;6c>lT&OhZO>@JD
z+KEA4dh>?k5jYt@dIKpEmt=Gp2&%W49XnlJQU9t)3NoQ)`3r#etNq1)?(c?@n25y=
zqR(w}<WG4j)vjIu#Ncx-A-^U0mz8JrfZx<a<3)S5>IMhCY|w^>s=sq279qW4n)n_)
zX+iJ*7<gYKq6qy$R=S0r0Jx=^*7)V^X?q=M_mUfL*A|?u<Cy^xw?>>2T~9z_*9)ru
zABlw^%AkTnslH3a)MYS6k?^`cfBC8c^<(Pk7#5BT;P6sE2hxLRi{TC`on~G9-IuLY
z3;EKH5+oykYcC){xSRnD2pLL*OjE>PIImw~VPWwud0**E@e~6gaav#JonAz%fb0HW
zmop|Y#b2jS&Tx@4l|zOs0kkv?aOQN2$Ax3EfsgwfR$)cmm$#Xa9-=M(PA-01EnJaa
z+})F23f`VyTf&5@f2=zKUeL9!Qb`8=aE^SgRIK5B(*4>F@PUVqcSrRiBmCPL#kKuz
zw>UPZv?+@>I5t)5c^dzZdvieSC>rH`6YkSz-~=FgHX>7a{Vo=Rw-4<ART$Kgy@k64
zII<_rGJ0<VWPRW3y<fKP<E)T9mL`Vqx$<<GzTjE#D%S`JzhfTCb6~(Df(B|`Jq#|{
zG73lm%$B{@C(_&9<+iqY^K@JJ8nH;jNZ`)tDA;d=deDkUg>13Xq`~r!#J>XHww$Zr
zHjYJ+y%c>ok4LenU-oI%HBU8@ZxEBFd4u{E9zdb6mXd6CoP@5vTY=r7<d_DcgDc*Q
zQC%ru)%`_LxedUZRn~3`36)~m6zK#JO3{`$n|!x@8-kRs_j&Wr)Uu(#f8X$TNV0x@
zyiGJmEjTw==|y6JFXNOjNC8qVv$)p=j>MU-aj1{5ae3^Q7H&_-iT(hXBs5D(ZeRTA
zJ%A{KhOO(p$a}_9a{fPUj~8s#1N;E3r&rBxamfcjpEb|BaFyOK?RAQ*3$d-e2h3GQ
z=6a_W^^uv%fxu8$<@JC}T)7Iq@Xba%m||($i(G`1`BYqCV=z+clUAq~LrhDyU1|O)
z`T8ZI{6^<DYPN>=)}@sn&H;1;$sy*&XVo`n+fo1kru&9u{O3e<jj%FcD2s`Jt>ygd
z@G`(iw9y}5!U5&jTklMxK8nJ1Ia(4v{lq;<Sg8o8t<6SIc^{g19NRd#yFEp7($zqi
z*F|};iyUvO-qIPOdd-Ny-pnVXVHdn$1W4%rxNPsf0*sa6qpY~BwG%3FS2G+&2ys-4
z{*^JYYc21GKvzp-m%Inq#J}96HiX4<0UIye1eCHAlu;fW%0?|s5Rs_kukJ&C)k|N`
z%qga@Q=Rf#?N=}R+}Ton47C1rIcNPb9c(Lan%gE1lTz=!xDRlr382(tw?9(HySW!4
z>W%vA6cBYOI1cp2+5LMFXO>fx&?rpro8L?(p<1p$D4>aJW4L<u`+X_ebO0^2RtC0p
zH_u;aSs(!iDZh9_)zS3)=wu+MlyVSI`0tkPD_v}YVPMB$rxP`W3hwGN2ml4Ij*GCi
zjxl{srMea6UmuFuA0~2tEOEB-4&e)+`NLdYnxdTc#J8Bp*>}Iw`$9NU`Jf>nm>3_{
zXAGg^cO<<4u~OWWE~xUEm@)ACyhfmt01-|U`JAI$FLST@C65yKsrI1|cgZ21@stzd
zQ+IzG9}rl?K<RwR^JuoI;-$!Pq`L;y{WVslmGP_WyA1A+cb5kM05j1MIUtklFU#D3
zAW^zP2DwOn33xp^w0xuR|HwswP)$w5pE7Gd#X=e{02LAO69<&jsDY+5Pw>k!J!u^~
ziMR`&MOP>`oyc$U_3TG496el=FJ!N2`JR9W;IStc&+>Adx0;5PbkV;7pY$bHmThI?
zQeYy2miWIQ9RSc_bSquuxc0iJ<tVCW__agK!5l!+Hw><qHz{}kw#t50)!}k7CldF-
zo!wU`9?Muo0%rIKgg#QO@0(AE_#9TmsXjvJ&gnbo6!U}(<SH0PUB-+4Tl&Qm3&<Tj
zoUvF#Tjhe1n4R{#V@Tn5GM}iiR6VPpqK{Deq&qD4@)?3rF!BQwdHbvHOaGwb5;E7B
zj;`$MT7XnJr;mOB;6n-}-bspXy<BuNAaUB8`e}VmDdXl-xlFUsM{il@9<SL$fA<r|
zP^A|UlaDHEHA>>I*Y1WZ#L?~CBdOD&JQ)$~>x0?p$>@l>#SR3ws{oH_#)q3$Ad_pg
zE{uc2&i=<ojqq9I<(sb<CPNh7=YNvo3(N+Sk{tNn=5|%SP{J0n2J%M28_w}9{rEC(
zEW|zziBKRk`-O{6A+qou3QQ@?x~K#p!Tt|pZygqO_qB@)jD*w-BQS(?nG7H$LpM?q
zf`X(<BOxUXLl508A}EN2f`a4>jg+Xg$N&P;9U^%4;QOBQKF{}@-}SpL|B(K~-fQ3M
zzVEeG_}v=&2O)hR3YSvIZ@Rj$73gC^g#!1+Y?zdb4}5lanXwN+*Lj9>9U1CNIZ;ej
zUjV~qJ0*l%`&6hp_saSK>y0ZGs7F&MO7VKY>~PVk&iW`Xo<IKj8112_W}ua#@kb<e
zx4#gVru`pKmxsbzal(hQ<%h}egzrO%eAdw(C*KMhasOYB7m~+rU#{t^ZEOKWD8C#C
zxr|i)+YYA>KZ|cSO++Basuj8yk*XaFVdod%%u@L~!vspZpMT2Z-iODo{CVEbw(AVb
zlsp<9?*QCs@uL9@(D3yF?d(lTT=FHvAK%&S%>XUj5d=8A0N?(g%I4Gd8h~xe*t#Y3
zlK{Lje4$<7a@A@wwbc6VOd2>n)8m*h{;5|MF3U^iKOa3@((0!)Wx84B8_!IrVv-ab
zH<sLCH1oz1(06}?!kwS)f>6nPO$lxyL1hWSU$r1z>I8Kdw}&TUWNr`F7jFxDW^4ub
zJb7{E;$S|V<2lIfZ~xj@k#gmbBsxc^m7hS;^4QxK+XWiVpiN7^Z=C^Lww6KVn2;a(
zsXHxv)STE(RAu|=7pXH|;7)@>>nN-=*u-r=D9s(XH7j#C2ZwVJQjmp4xtt2I2e`91
z?8{q)ny9V1JClo_3Ul|Lsb4l|JIj*aGI|Wi8&4Le>RUBJFD@8JH+Fl!(8_*@!B5|O
zBG{I3__^Pgp^j5A9EzVIWDW$J4>#3o&qdNhILt47J49XQ8$U=8-3ORLihXL{PtS(T
z25~gkj!&^4pmQw63w)RTt+4cugy%H*5g;N)f8l}ADa)EERS6FQ+4A9Bqv4fijf`iE
zW>?613gcy%Yn&#A-3p=ON+q^}Nc2NWX?HgN&*7C9=q8@5CwBSv8`s(6JR4NC3<K!L
z&B0L#vwmtt|DaeTY<n=DG2;p*ra|ga^ReeRv@nd+Q0Gw#kr_Hf=sf`N-{(7WTCt{$
zUKn2e;=Ujvl4sz=Z5a0XM23-M??CIo?y3yoJy*x?*m)?tnHf1ZGMBlZ`dESxPA*4i
z`IKj?T>S^5#-$rfcyGv2p~X%>9~ffdWKX;=zlar}L#Kf$EMi*)nxB?lRj}oPrb<li
z9fORt9UbmV!Ig4!raeB%AKN3=l)wZ~%)N+rDT^w$a+Qb$_fZa>=Te9fEE&6_f3@u=
z9E898P7j&2iW(;q(puG#5(6{w6Ub_eKO^eBF-?HJuj$_EEwr0DSgk1bJPW0{(FM8c
zt(_^%Te#7BL%mX+*EAsTGEb>rZsB#ef90R&xTmRCmE1;y!kWlmb9!}y6Q4jVO%_D3
z*&oY-v@PvB+S5$U#l)#5b(^>DY8#)ZW<-FuxBCb#r%MO^sMW?Sr?QgYoM!Uy`3KLd
z`3WNmuWE<wUgXX<7~?@RK?Sdss?NK4Eaewu8wM?X%-8}BEEkk{7A~9Asl+lrpgM}m
zH!-istQYiJ1JezbL2JuPl<`X-!~H?}1pMSA;P+PC&8@x>S|IgPO4%{B4pW+UAezuO
z+L^ZN7~QWl^rmaQf7hf`yKVS>X#P!sG%B^VxXDvRcqG*#nM!r{KNwISxZD{vuAeRl
za;`H`4(c=*7lm@Ba=f(hq_))R+`F-dr^)mCFMxR9p&KM~QV)^>G}28C3<7`ikrDh>
zh(wUYr&mUTM~(oFRr`rX%E?rBLs;{=zlC_u-ZY8eye-X*MDn4t`ot!BlW3An#r5;j
zO5w|!%22%qn^@lSDjf0|R@B|%Z$UY$6rF@`Jc4naj?oHn10xwY*Oj;i(n_yJ-+eL5
z*twR?cjRI`T=U!JfyZCBjuL)nDs`5q8pA2XO2Ehfkca%JF=KxHk8>hY?kpT|97Kjt
zdgQ^+d=ePXS~rZ5v<&;_5ADK%+ZO^)@Sdj?fWA84Uq4^0P7Yp+Br(uj8mY*Sq5}IU
zeY-$64Z-M&4<eyK3gIHCCG$UbK%TkC3>45_;si&J>e^hgjwe{z?`l3?e|K=Wdwc^K
zF4!x95XXe7>J|HNlybHmxGgz@<YN9Fx|SZ2hZO(S2$-o0Q#Z&_GUf@q(%MG57dKKY
z;ESmXZz0)RrsA=8{|TO+5>uR4s3d|PTeU@~lGn*Ab@A}HuyJ7lLHaN?R^kOnO4k4n
zT){Uw_<kVW%1mO!)IBy#i1X64dgua8{^m(hx|zKddLZ|$ZWy-`rP8_l!uB4_5vznV
zKur~NQiz)+?3L6xt;Uueg~cQt20J%?K2Pq=jv88`l{v5wt%`24R8nK&bOtjEcPh`B
zlPi>vt<uw_Y83rG0jG0-Low{%r?Wt?0!Rnj!!qO|X%F?X01I?VPdE7=*>gdxi>b{-
z34NrX92!3F?ZES7{Kav8Ig&fhMdV>`rQ0{Dl}bg?Ph^6u=5{?0@}7|Ik3VZMh#4kl
z7>r1!j{9tEe=Ww))&+Hik<8XEP!Ur(b;*NvDuQDRDQcbE7Y(7$oS=?f_aI)lH`kgD
zthRg9scT9p<G}&<5kk;WvJO<(*e`U0DJ97${at~7hA$3c4=!CW-4pryg?6rep6Joa
z+jvlnzdOn7P=@#W%70S~Wz)|JH|EpMu?;y>APqRN(yh1x+6Kk0f>2wz{#1NxL_M|G
zXiE~^AF#Ii)0@O-a{ys=*Xmy0Y>0BMMOwG|hWwW&OC>L^tkYr$g;yT|ZMp}+k#kW@
zY0a@R3*V32rf({Zy1Ua#63^$I>`T5bpzv|yHdk4n(bk*igOR8?*i7BEXBmLOb-ecf
zWbZY!DlJ>&T{y9;gi#WK6_vPi<scbL`|UYgc4z)h=zFp}3Dou-Q}U;@4XTSI6|`t#
z2#Z27)Db{aX<#i6g;a4-2qxUMOxGnjfa0z@jbGoNbRWnJBC%FV1xnL5I+?)Vo1J`V
z1e^-yrMxqV6!MLz6sBYR%y{Sm4W0`8;}7rdl2xkxN*JzUHGz*{cMiGZm-|K;kV+<c
zhc$reVwcVR*YUwY90e0l#)fwbU1LLEBWx5tQjY3rCIzNPJuZMH3g5fIBcdIKJFL!^
z4Q$GmbT1A&n5GC32=3I5pt-=xrK~oT{Uo61&jkecyN2CilMadw@_Uh2yf1jHjD8Yz
zo*YFKg)6wq;6{Mb;G6>sCU#fKY)ReUh{Gk5VKqavey&}i5G$$|W6u0%9BJWFIb(gJ
z`^}X4thFC11OdeybC5odCNCLTXmGi9j1Jvi4CUTok}>&`H|s|%n9(AD8WM`iXSB&4
z7n@?NSjL&KiIo^oZoh|Q<6I0Y{>>XCh!9b&>M=NGKyh*(xJ_DRzi)rOS3firBQoP*
z!ICG^C!WJ7%dc-TRq?u6b3voBTZwOMpLdk!qkd!5|A57(%1tgsN~neYLif+MXilJl
zN!+~n|Drbke=2SqhLr_n?gM$!_u|>4%P#3UUC_;qf;920f8`_|gIWySii*BJ$dlSX
z?+xPNH;|XE&i7^8L=_X(hoiQBHY?B(yI~nw|B4ckas}ki&x!rk5}wLR>I<(*h)3IN
zA%*}gNBwB|4#b{E?ngP3$irH&YU@Cu?8vB?pPGjt8>>N1k_Ehr=LnGKck-70Bhg&a
z<nWeyygv0!D_g9uM})iWeljb=Eix<^wo>j|zux4V+xZmEc$Ju3Fa^dg;qufM(A#ga
zQ0lRYA=V8oP2d<R>(?)wB#4%=>39F*nz7^@qrbtS<V?_yfFvi5<(r7%Tq>&`(jX{e
z&M{Ztb?sfRF=?ZiwiE%jX7MLGwiE7kGsgBMMUwzexJzrTzUAP+@=>bU|Ho<E&Ji?1
z{HzZQY_&R63Sm@yH(~~q3DdU^`%+r@Of2ke@`O{826!X1F1?GpAXVH{)C?#5Dx%0G
zg?}CyazHzV;<Vi89dNr8u|0g~OVuN9$>-`9y52tL_5pHkl1(YwzY;+ujz`2E^o~5k
zbFl%5t8%{O)oErST6V7^RJi{gBmXX6?T}cp;D2tD|6YqJ2%z=yzT;qy`@er(t@0Dw
ze<ESRXM*Xt!J=(rP-^|R>iS<w0fpE&$-lSHUsYGv3J8oTqan5u|M#zRU{!qoDHH#1
z@=5~(_5z&=lv?%Q&++fK@Qm8h{?GjL|NLJr3I?PpE6Qj1|MR*3ehR0bfv~LpPmP0c
zi^^GY=au6+t-xAQb03@o??EF5{QaT3w6UiW`v1NxrKgb)sQttKoS#`@PD{5z^WlBa
zO|&r5XrzenJscw7DQAu?!v76p`8Nobg<Q9ui_YoyJY5OOFW||&!v>5$!--%5j9>xu
zJSe)ZP53!uG&yAC@d>PNd7Q>{G?b-2fa!H?nhYc1LWg)-_Wcj0Dpu=d{YJRU(IXzY
zUjzRfnU6I3dc6lbrCwi0`m3nz;M4MZKT(8Vrj`QkL&>#N<n)Sy{U|o#cR=<zA-&lK
zU#l50dmGp=GHe5sLsM|PngTish)Lu9pBDbUup;GYutD8*q<2%YMIF+P1D_Uu;v72N
zTB`AE&SCVoUzXUt&IhQ)oUqSaz+k<qdVDy`WYW!Rca|mXPnSVE*b&8u6c6gZmrzE*
zF}e}I0H&{v0QR{bl07Y*TmnaDfkdvhKUeBwsx+LZQIN2}d9^LpChMF5U$X`VJ$raJ
zK9`!w6DJrxyuNv)JAh;JL-E(CSrG0ATq|<UQZi&LHjmrCsw>fdk!1mYq%$!ngECq+
z4J5pExnP1l0cz}V`E={(iWAqi{sB>guf-JnvOU_B>Abw3>^-aFHSP;J?r}fZjKMx%
z^0dt5{rmkD5Sfz$YjM8=mwlk|-aqen5C}2LV2#|RmEK8km}>BOJv-k(a2)ub94<J&
zt{E043TG#QK781lfhTZd?^oV@%(CHU6RTd>1VL)T8VJ~r!4M(SR2Ti_vtLhl0o^}B
z2Zto_ZxSZ#JV7`cG4p*r0X!?c%Ko5_YQHz^#I+G*0GVFYf6gqhZ{yRyVEDhbl3}i%
z+;ozrl21P$Y!3;!b$7Lo?jZDtXHt+C$x2<I2H7Z|J7QnY;^=v9mL&#yT;{Pzd6u=d
zl46UMV;D;M7=C3po&ZQu_gP!25B8+J?t+wOb*GWJ{|`c@v=0)wI-5M7jz&O#tbqo^
zj{yox18{7htPd#YD}X+{Kp{q%_>~2`x-cH_&J!MgeSaEl<oY3cXR`?+=H_f?J@6kC
zbp#5Kt+Vv!i$BUc8VEc|-manaA)a;k@-nfHMA0CE{EIk|QXe6!=*yRFZi;v_xSMYF
z#4bXuzE}9;7w_+akDpBm_!8Gdt@^#EWfAMW2Xw%4z&PEZ6x)d*-vJj_8wKb80OTM^
z)Dx&(h#CZix^YMpw7v{<uddytuOB_OB!{|^xnWf+@6-7>h##l|7t7eyOwuca*PW`x
zk!#@&-p-dxqjF(^2|%45SW8f(aRtsGD^cCwiuF(hiV-SO74HitakN1id;o1_|M-FB
zehIaXGOtK7Z%y~d!Hl<Ahb1DtQhPa~>hSGyi`}<WFFZh5l1s5wYGzscz4pGq_H%yd
zos-E&Yh|-iC|zcG?$oPXxoOtCSsS!3--%gCvq~k?jmIy2%x3N+zmCw#kbOq&E6XF%
z1m+w}R@;5nY`8xl?l?N=u{_*&!EXb5TQnMZQNc;EWc~SijNZcQO@Ktc5IjD+AI*&j
z;RA#N&V&k!ZJ>C3fNM)&1EhJ)iO8CVy%q<tz#nr-h&v^4=d&!^xkx1x@(iH5mhHm@
zREWv`jCMl3!NTjMm;F8aUFJ*R8@@9GfW8>s()|rs&w2GQqJ1E&-<i1;0JCX)VDL5W
zeBmzOfL#5L_CzbLuSwU|?3hs+e3UoV4B>VNCnH&=UrS{VIKaZfalUvqiUADkK$b|q
zMC<PQ_wN7`*6ww`R8}EI@;?H?^>3^1ckoP{j;++<y_YN>b@VUeLqD7oD^YyVye+Nm
z=9@Gb2*$j;`>u|{?Dz3yJL=It%zjdS!axX@R)nMA%Fu_uho0PR$QI18_RK=_29DCX
zC5R|3@J!+Pf;pgQdxetGYG2`rLY+g<kSmdsM~a&JZHCp266E$ZAai@7xR)#<tq!ct
z<onELrYXvsk=${Lu#cUN3~m>k0XRIK>CeVO@U3VY`T{!iKzC>f^M~rdmSh(JH|1MB
z8PdHn$~(-!R1uX!9&;EiW*Xsp)-~C|rCB%CrawoLkEb_B^7B?0#|PIxThk{k^$$U!
zohHk(3fXyh8zPV_Jr$9<b&NJvRD&uoO<kM|83UE>Z<k&eF!S<`Zu}u_D(p@t$ZeTM
zeZYrg|NR>v5T6^qQjqxKrCw6j(7{O@!v~f@%=`z3VM1WneFNE|cSO>_)Pdu}`N)Hf
zJHeVTm?1D@fALf6rC=VSAcrur*?G}C2v3P0n1J*R^suo1VJD=<q>l{G_nx63D3uFN
z<orSrZc8Lh<YGW?DdRoc?hr)R07~TJsuqm40*-$%nK0cMdwm7ReFQIX04^y{Iz0Q)
z>ihgc4Hh3%(grEzzWVRX8l{#)@CN$S_&8U9VaAT$r`BkrjI+vdf9)G3-LupS1zXN%
zAGKMdj-dAB^t|ZEBWtkT+<}Hz=QEb_`ARY!h{YsF%fRwcJ-7EF?@1{|ROC`*3_(m2
z())pR8^6H(irj#M=#O%_;(vh&jqo7x8LmIHTZ_+=wWC3Y!a~r5TY9Sip2(>5hOO|=
zkLz3E7ITV)T=^*Bt&S*Z5b1_LfnDS5|6|Bpi?KL=!(zpBf(4t%XzMsRHPP9Dhrb;6
zyM52&>mtai-ie9sz8V;U2Z>@DBjo17!e0WbNaDU}bi+&Nq0+Ms(UKZ1*^r(m;**a+
zozoV*yJI|Vh*C6h0B&_8aWM}sgU?*vnl{_yvYCsuF&W5q{%{c-Yp+X(zp#8tq!p=t
zm#W<4{T3g9KkD**raUEeb!+P8p1=2?Jel&;M2R<v`l7~4Dgf!qQ|`j{-<$led#z38
z_&%i7&X<wi@@ap@9VfxHMPJ}ZyY+`{-3jhfQMgz;8~1j7BS&h4g0MakIEVrghd579
zK5z^xav%~8$;C8D6%zux)<N=UsPa;W&5ra0XoY=?f;-PMDLY<zy^RO6#%V-C{EJ|4
z7bI1ZFq%?2UT^ge@b^UeS?P+%_(x53`PxR>U0{gukHpJZwME*-iVh=#;Z+1T7p6fx
zd=;;-Zun~<Z@12gOG$f}o|9&?Bi;Djz!Fk4ck2&D3uEGT^yPA7xr`%n%gm5ZFNNi~
z-&q#pL9#0Zp%(6ZATxi&?D{bxHq*A1O$jo5dI!7ea-6s57e$LM5)v~xN%+Q}jCYk>
zTjvr?Lf=e{-HwVLnD{gB^-?gwtWoYTQl8cv+(fa-k@-`olXVtAy!L~OXMOr4YOhIh
zuQmS(SO~1~0Hfpb2yUjU5lT&m^5=vXg@5h<PR9-SIb1<*@aCN9oYWnFD?Z-eDH7Fu
zRCX896~&)t%NtLD=RSWb>E_6<jDee7OLh59H?P9`6fEblZ{D-9{l4ROgnte_yDd{l
z@m)URf@g;B2A1sJ&3xmmqc9f{H;2^E`1aHrhEeC-@5jT~=A|7?KNo8`#p`!uA79wx
zD0p4beDZ-?a*xe-1*ZMi$#gy!K)`3*BWM4n7xr90_iGhB=oGIbe|Zqhse#kN=#~a?
z5GS?<H&Z%A3NuhyDY0O9-FP_Ez)5+N;%ad6EDUqW7~K5HyKoXRn#WU@KZ@E7@pMjo
zVI@ZX0Q^MbJ}ZS0O!FZ7`76$R>aXvUx@`1WTsuOrnrs4F(bpY@dkK28Gq72ft{V8~
z1)upY>-UHA%ZBDzp7mXuKt^jV5vo_(kN7!9(Z~I43x9}8BrB$WG&(spZ`aWf&aIb%
z03k2MG?f2^ZwYAxX_j%CeJaJF5ePj5;o$0_;2()j_lWz+GASJ=*gQ`pMRWJ$%LI1<
z!~5eBomsfKC%nl(*(;(d>{IuXJm5h|yPq*v!jqz6FnCLxSYCkh5NS9}$doSt{C4-Y
z<!jJe#kuZB+AEU<$vk>0d%ET~{X9KRUrP;dwS8rcxXaLY$i`E$4Jx+nz6Z+~d_kQ+
z9ilmn`6(`%WBIzPUJ4SfT;Mv0ykUH0Cx4&58y7I@C<dYcF0?GrB{1v9PvR}t;lp=5
zSOe;Gj>AM>$S({c1X-x~HT1z)aeKWohXrJBk~6cCL_^93-BYSvWyX8=Q_Xs=HWYr5
zYh_qZqr6tzQaX2F%Fbu|3*AOYgPj43Rg*N)j@*JdL1Uruej%ps*PD*U8}%>_>Ej>f
zq}Upip|1%_K4tJ0#h8=y$G&<}8sMY&Mt#E^3?$AWVC5HIhl~l=Yyl&9i{J^C-dmTC
zfz{5*k{#iX*q7OSF7<1%uutG>IA^3h!3$dt1m2GXA3*~t&nVikXN_q#S#J>3cgcPO
zc>$hAV)~4T(q`HvRg&xVcNi<^eN)^f06_2{jOf(bn44;mBH?@iG9%?#jQ0+ZB*qbb
z#8&mrLTjIAK3(fi^;57}NM0QGda5Jg$UC7}w=;v`sCxy(;JBhEbl^GrgB79?7<*lT
z@YIc;lb_e9q)_P25VD>m5Fmaf7DqK~pXS0@oLvA#cqv*4p&)H+KsB&o{G35@pxdhL
zNJAm5xg-6O=UfH$sMUxl`MqAAqhzjhbnyBskvmC_N<wSws88ez%YceD@!$O#L#F{p
z&DH8a#dwG3np(>Ei6=qF_vQKCOzZA`|DX1%FHIz)+vyzLzLVSl=gw0<K)PJ~Uh}z&
z@KB#}QGn|sVhE=+EbN!wHzlZcb<njckity}_QxbekY1UgJ%DTq<lf3zlkP8x4^<j6
zpyBcnVH5S%%Jtjo7Gv*dZNR}Wzx9{{5hfR9TagB0Qa@7k%X*m0)X(%DD{@&u-hjg8
z-!f6_SLgGzHv~laBAv){n$R_ds$63?^7PSR!ovqSJ@|k`3;mw&tjyUUI1Yh)okz>y
z_%DSur~EV_gqoyPY8E7l?w#y`4Pm^Nl@{Jfm1)B7FPBv+w6hc-J0iJd$-`Oj$~=nB
zpNnHBKrr|Q9<p&wIKmx^yh^W_qF3+zM3=9zsN#t`sJ$Gk1;|QyAGqf0GevC1vpfOz
z#1(Bkvsx(<KDkuOd&1>iL~wR7oNI5wKIOL=1i9s-qvzl_<CTf|9o~6jGv)18)hr-#
z0%SdtXAY>_Q27vBIGXl^X8niP?*0CGsrb1*pnVqn2;}4+WU|S@Gh8vY;M7a0P|0bX
zohQKfNpVmLYLw2TQCUJMnhVGj-Esm^vD{-fc?ypTAm@Oa{11EJZ@e=Q=?FgEqjo06
zJZL<)j61ceB}}I3S3M>VSOKE*1rvK6ry>ngZgd`z7e20N+;aoQ{;;rU%cn_Y)dor2
zB4oHRu?a^J5sA&XXZMe83O3$X*#)kz#qoDe4!i{=*OZ##u%TooOtTt{j7j7C1?<M$
z<GN<PK<B=ll=oXc<=5JH%`3x4T$-hH?a2B5(Did`6p%|wTJwF4Sev;v+|76wRL|3R
zcG)Q>08{u*#GEH3EXqb~I-*)!+hD4r-iJrPP*dY0eJ~yB3{2HkzM-WK4S#@qXME9v
z4mEqrPy?rb2KeC;27&Xng-gCA&~+K?A;Q1lB+hB-wyr;+D1rpR_2=@VYLNY@0u!3y
zTv%FLN0%rtp1<f?n2Gc=mYa*Es~-k7I5r>J*sg_jwUxU?Utrr?Q$&g49nXMqNph>s
zoGX6<=Y3?3U-bd*)<m5d76AEWx27&9ileyA?-cW7RFWCj8=3j=Zv7Z^>TB3!T?MDk
zr#m*e^nJNf;#+ydVz(a!sp!AZj%^qLe{jY2*UrzP>hljo7&6>tfmx+BalawDK?3fh
zD6?f|m#8D>*l;IVLn;IAy9j)-QNv)|dDlX1wBJ<vBB&9Ft2Js!u-=bjl;U2006&BZ
z)f;{ASY|jx&+vwZ?q-YFXjcROql!%T2WdYK(Ic~6t`~iY>u1-pLF`0rP3hkP#K|+j
zkW9>H&h0zv={Be___#{jgID*Z$=|nrJ$cFG_>_GpflV_KUmz?l0{j++AXC5!vp`!s
z<{UpT&){D7v3y276D!EhdI;r&M{0aBF8Qb+M7z!e16;oECz+!afYKtnj5?wlI)j+<
zkT0~5dj1|yfo`tC_wefbNjHKO>ZVF1R<{WVFHU{N;+UKHtsP3zGL-I?{tMoUUyYL9
z)N$MZW4Oxf?(NCn8?aor81xlPehG>Q_(nO3->5=`EbghBjw#CM=*>xMYmQFdPu5yt
zoA#w}<(|BZ^Z~G)d%gVuF&Eb`fJORx#KhLSmhp+SUe&&&1@13$J)?oY_#Urw$3m<B
z8;~5RHgTm>=oQtLgJb&%|1#jBf&rkw7PA^@S)Zn6=lFuE3~aG{)2)MTCWU~1;2WI=
zjjcvAR4J^qFEU@YT&Y8~?z|n4iSe6a+{R3gc(Tu1g?rk|A_`<ofe$L957`M{03;d`
z%U|PE^@k?BS`chR(MWZ2oqWHosMm&mjkYoSgB$o(CnqVdHP*weL0+MFEBBjAPx1qP
zl~B+jcD%<q#>;>ooezVpr=!_}KEu#@33GNtoKuleO`?z}*)F_bc2Q<MW>p9!bE;h7
z)SXPs14Y>LdB)NF!><fq-R2W6f{$g;&d@9`D=KlDOAkN>e=++OFB232m32T?8*qX1
zKee_rE15urqzDmFHCs`LnZ^YEK8jZSRQjsip7<=D4Rknmehe6yv=n^mK+=r5JD9C;
z$ChkLw0DI6mX-qsA13^uC#7qlOel16AySCO`_6rL3yvX%cdkUd+3pSwip`NoL^2{D
z<O+5>fUU@WA-7a#n!iDx@AwsCmp{@mA6AexAm5C5hgytTQ0NuS7h+g4-6OTU4jd{%
z@kvl-4`7bWe3`!UgwdW&boV10_wy3#&Nv3Qw-e6w$^g@9Ty`GvJo$b233yR$d>skt
z;ySg`fFtc5&2h^$yMUcew)T%07XcJ`XDo}&_X<<-T?;`S!1zO08eaHW?X^L)JLvKq
zmTKG&O+vJ;Re!k+PQgjSnYd5?<>2&8#F1~r{{j!^>Fg8Ws?wT+k7Vsr<dGo6f#>q(
z9DMCIlm3tB%tPtpozICCzv&L|SHVaHELk%#iWov^hw#>q?5k)Te|Dek^jW?gdbYXa
z4fVULS&<5v<?#Mk*$5eg`-g3$K(}%z*_nb1=nB|}r1GoloTE^gu!Zw^3h3JV!o?2t
zV)^dn6>{{4_g};60=gb4)FP%du#0g6FNVxogM6ld&Bc(-I8rP!w+WZES*%sZhRHov
z(q>+x$E}uwf+CCqCFK@h;`eZAm6x!P+gi^!M#yRVI~^$E-%azTQA+*7n{_{K-jJMf
zopZ6u?OH&$wE3s3O&<ehjSuG@d-Rc?{e9gFhwQ$ZRumKaKG%23o%nq#8DQ}ATl{HC
z*hU08e)6{F?CHeybNnr`YcUr`Eexh5gCeUyIZ+lq%rz9NKn^3N2{)8vt8A<9FlrUv
z3}a>CR>2?zQ#H4-88hh~7%}waHa03ri6XWgS{5`DRVxrVhH$E$U>&>!z&DLDQf!QM
zC%%9%>`<xccV*Orl99RXm}?)p(4r8;mXpiz7f;jp<zM!CpDw=0v2y5{1pbob+m!{h
zNkYkmKfpN<*8~EAy<%}7o%uG?)Om1+59e;)Gw~f|^O;NhG+Je?b#PNW%|18vklkhX
z!Pi)6GX$rlc(?gFh^Wl%(R%qej(&r*2}yfxEwJHDmrbf2?OXQmAW4d>yeP2)NW4Am
z4|^+=Qw#O6{-AHzqw#<l8O{=+6=8~tKWjgw$y{sx?RFE2(am}?7|yCl;HG_^C+1{(
zW;PAj22MD=A3JXM;vhCCe%E_JP{~}rAJH};Wc14_A1bV}weH?8*@?cYfbbXn@TFjW
z2grmuE+tg}BV_`eT7RDMC7z%}$rZuKRkS!?DHRFNW+8|`u&xMGv@Ej-f;|c4jC-md
zN2UVDUqE}~C6&yW=QFxLq9uR5Y2q`rtV|lP0j#mzS=hMMbIuSB$IYj1A3-K{?*S|6
z0DOGdvyv3yWw3;U2Q_e(%#bb1eMKrry|idIl@R=f9o-1v=oWV9HgyILv}*UxSGe^1
z@7l`d_BktVNi4F<_oS1@11u^9Z}y|G@r{>OtnCFQaF+Bb&@5q}-{_M|_)P!yyV$hB
z>w65u--F565Fjwq?lD{S5H#Te-4#O8J_?EZ;*Xf`^hb{{)Hy)KBdD143TvYTG{cIg
z8huUZQBAYtx7w^%0F*&iUH)8y6Kxf-BIi=0%nyu#)&Mph9bcNq^c|d0&qRlfTOlS5
zPrU^{OSgW=16n)Z?YHILv%rrqC}hpU&tT>|*M{Y$CE{oGi?gAd(F7KsXGA2VO1>*q
z5q-(hxP68Hj|~tcbW4+##l_0P<zB$yJfn4h5Uq*;rzqwHD=ifZ&J!t+Nvi^9BSJjP
zq`~<Mtj1uY<>dP_G|4(936itVeR8j9Q3Xn3iU)D-^o~uynRi)M(~zwLCT`}_XAE^m
z#&rwf1a~*ZHeH1~0Zt!Dni2LIPC*twwxVB$&A-KxvSb8Hb{eIhBjbXZsql>O)?F#M
zywf-7WAT#99^V+mPr|9^IxD)PuRegZiHi(IQnJ~hVIA$|ZdoGk3c8lmB{Jcv4C-EE
zESY<V8CC_LI6Na?6{-1gboSN{(6g=bD9rZ<0K4z~yN|o4(00Y!`gh!%8h(t|I55KA
zMFu55;kcd$QUb#eI=tG-nC8gzFSHKluuD8=`Jl}1wBMRx6aTxZCl1?XF@Y~OBnx5)
zKJBnb*xl%iFeERQFSYnDS(-(tF3pUW`Rj}n5DxwYb;O+)>L(W=&m#e-uj0p*i$)}@
zh0*4<FmvctQd+hgp)n*CvRZBEW-Uxbp!r?0FV|=6T`cJ>5>G@d+LmLqB~EzeCsotq
zV=%pOnqJXc&;Eqv7w6o$cu2?aYZLL0VTs)bR0@=qQoR=jF7)fXPeWu!%9}C{B-YAx
zN^C^hPBqkp*>i>zu^|Q!lP=u%FXe9LJkhTE)AyuOT+JfZRR?#kAz`aeN6*}JR25B9
zv%z+moM9zf(yaEC-5+5iG-^$7j}dQ>N%PAlKg>6*QO3H13jPEd;YRN`+KK+jd6y2C
z?riS<XY*divBD^QQG>`g!G;0pN?;AoI5Se~YP~((eLb*TwHC2Yb(DW3&brf%Y9%Q+
z$H`bs#_AG;awHazJ`Dp5Ab+?|q^|W>D2!Hp+H{sh0q(Nh`7J;CB7!dylQVOxWFi%r
zvY~`M)Bf`L9MWA~oPDY_Qt3I?nwUF@uGw{y>zjZ=U>OT33J>#uIWA^5Es}8y<r1lV
zNBE<^>y~Cz6=hTUC%y_jOhPeW=hgCU!XXtusRmuXxvUQI4<ONJ!>>$fAT^|snwtVG
zUEe@4!ZW~zUaC=VKL#pl3@|4RD_zIojpHBteYD&v6FrO@HD3ut)!C1mS9eaF6GHn^
zTB45)KVv2FMEs~HYHRlxvEc!u6?p9T5;8UvA*+c^)NTJVskylqsa{P>Kbj%SCb@Zp
zoXT4kB4hhm-sg*#*Yn!aQm>4He{mq>4_$G9dRmbvN&PDWC6ZfkSP0OIL{NvHRS_qp
zY);W1V#bpaapel>@tpT$DyL1>48Nsi&1pb30EJ{TOt}ZyU(ci|5B8$u&4QU8DCZ@l
za~OvS-M^9V%9S*CY1D<yz13nabNM-2wWSUftHdBq3aX-}-J7*}X9?@eW5TCIX3bo9
z<6g*&i&J@akGS7n7k9idJNjhfMlL_eIzIEP$R)mez*bhGSo*TrGEBS!IM^+%sO!w2
zBzjB?zC&eqMqm8gyQMXYpM*}e9fw31-*AnSvHckEfc8x@`7WW}u=g{MpBW2fhY@&M
z+cW#;th*m;W?k>z66U;{Bo!h0@iEc}m)5nbq&7;J4isXVW1aKJX*<R|!uJ+1R7aYf
zuAe=^`Weu;`j;e%mODdIFd9Nl3hjWaj9+?HX2ly#5r!2g86Y8v+F(Y*aV+H0T|YP)
z$U^BzQcU5$t@3^b5#@iT!G<R_kxMiTWT9Q}Tj7jBJd=p1;d2$7(wy7nh=;Cs(w^|0
zk3v@>^2=b$hM$cEnj2@LFK2DmjXliHM5K@NyK}vuEy_@T*L%>46@{TUhOj|Jv<<e?
ziBC9269maBu8iK2yyU!C{7!ITz~Sr3fA<1t#w)zMK#@j4)LKb<#N+WTT3ROq{!EeW
zLTUXEVYYr=TG(=@w4$i+u<SQ^69`RdNtHNMnSz><nD%H3a0{<C1B&8J&$9G$QY`_p
zrlwIX627lfOcIYoJ?u(U#I0)LV_k5`n7MZOX&yV|c9h5*+nfx={v~4F-`kqAz&93!
znL5?S#|bekH}USN&`*9o@r;Uk6@&#tT^>>SiBOLgDgkuwtAY>}Be4!O&QBJ|#5RV)
zjOeR%x2G-55Dth`?`(_32RD$~n=ROuR$FNxZ>}=>9B`<ifCiYB&x<H3Ic~XRgeAqI
z=WsSTuHio7)8Eje!JAzkt@7VjC`$W~><z{nY|-zd-aoj8)=V@B%Ml0J(cPK`;T%;a
zw;u0VCRaez-%)so;)Ay1nk>q_pL_)w*i3>&Cc37QyP9<NUDLjfMba0M$!Wz0&&8a^
zGsKxGGV!_2SCc+Ciygc8Lm)}_^yPt`eC=bD?859tG&qzcfM`}dWq=8Lw*m&Mk0<-3
zw|!V48D<R*F}+HD2gJWF<x2dKfC0}ks8cd@v;NNYruf`te3thpv?qcLO8<4}hxxOs
z)+Zy)Cl8!9Rjg&uLT6o*s<?)<GKJd}_)1QcUVa60b1m`=YgR!iq^0uCsZd%+@a##6
z<o=ZkbmfQ<WPtIiy^UN0P6B(wAgJ<CSmSw@9;W(vMpv9e7r%TMe}jaZ<c4sI#jMdb
zm?-$U;jwV;fZ)DOu16ZvtLo06P?8Qr$HgDPFjrwFn}U1qIBy|BX;B+NVfH8>an0X(
z`}MX-CL+|8X+N*v5k@ayxAWls>#qDH=PNm?&WVJwPK5$M?ecGXl7ng`L^-))igJhb
zG%yg<xbR&X-4H(tAzO{xse3dNqUC2A%<GOvH6(YXhaRNA-j`o2WX;Q500orX!zG}=
zmXFV^0$18ZJA5`c$+cz!AGk7N+R@%*@eK;`nGF1M%+yRa9mGiMD->Bk&TJYVpysRv
zP6~tK(060!#CD=O*>T+J*abicT5;oe4WE*$-V`_Lkk5RA%Q1fc*versdv2~bLI(Ea
zz>JD)fs=LnSkCG}c|<2uR51YpSC7yvYynS|wqgY#EO3Yeg41ob=v2YuAU?|%gug`v
zN)^v?6G{)?C6H<u&nn}5e`4~9b)=m35#i{)=}#?!KQk4Whw@gwx9Q?scuIb*)vr4j
z7?wEUgIvRHgCoezE|PQU9JbZ@b&zZZyYOI)Pqe)@0^YHSAw~?|b5@)sW#6_cOA+Af
z4cDcaXOlh17J|*AZ*?D*EVm&tD8qT9q9_atDh{%SXo+D8R3Qw{sH`~zwSjaO3!5!`
zB-6S~T*w+vjC=70=7<dV9Ol`+ps!tl?;)YRnlv(l4e@5cksg*c#aHgZ`L(O<b@Ki9
z?sPO}IhW<=7{g}-eEG@KN}R?R$%Q6qy$d%u*5OCdG6y*>LK@Dq@@YSx-V+O<_J0$&
zd4hWz?%&?jKp;PI43LU1pEZBsHi)u%l$?C|o*J)zvT4rT9C;1Tg$0?ob@ztOjS$5h
zyvSNM03j@wo3e-QNy#u!h`BO9IW<4l0o6E}MX-jzQvhNaNp!MlZi^-5CMTgxd<Gcn
zhkob;BALsnp;Z<VLc_I-39ARll?)@m1zqtw8yX|i%IZ`6i0Tb=KFA`}Wosrz@UB$3
zjx!U49hZQsE3uwFc09{c9+jw8VvbWeILrR%OdE3>0u^NommOt9yh2ALw$1~8yD*=r
zILbn4l(d7VKsN=^SxaL*&m>RAoxVM06L||0I>>;!QcF5AL)2{6_$1Rpv6jI{eJ0!y
z!EPPvJ8EG(>k2Dp8BbjnanGA=yg|-xVjU~m`%;xh?Ox%pJa&hUj8U?m!zIO=Rk!ER
z^0n=sA+%TaG>jF231;9Lx<ipx`U|+K3$-rNGB(XsHfp-L{e`@k;M0ZOdK~ezH%}RJ
zV?M*WEQ{VZN9dFJ5|&M=;eH=E471*iWY>t{)^KY^I%{~eB73uCfF!KFtwQM#XT!<`
zo$~GF{8lq-ooE4nQ802UH?jbVq;6C16f!`AiQ&`if5I3XYe^0LK{iSzf;xC`p^)E*
zyP`#)%$MrL0e8BCmG>?Cj3x0%*58+CP38wkS{YpD)YhFsIw7<@RQkie>lX4v*diIo
zp>waMCu=zw*(uj;E|0b_&BjBj5beLW0>9i+y)$M`CmVqhE;$x4LkS?S31ukOBhw?;
z14eu7MaUGoq2_wZXHQhTEgS2^tmvmi47{0vU8XOyZjgVQcaA4JLEsnXJ{c(GBcm0o
z;)lOFXz0O<vZ}_h+Og4lCtEy!%TOdH+W`l)$7$Qfk5F$mklfGXP}f!rWvst*ba1TQ
zdQJ6+dzb}F8h>fLDu3g?e2aX`&2PS&o0ZLP=p8MNag>6ic?u2kwr0THqGjA}f@wln
zFJXOFc0s*VCQ|Dq$Lw=yYi@pRAn`h_bKl;&C-<kspA*VCx54DjBPp^iGZgs9g}vyV
z&IQ087;ybEA*kPP_<=vs_qP4Tzf>fI7^{2D;Lal|29$EwPX%x4@)Pen8v#Nz7?yH+
zsuu><5r^AM9Sj{X6}@|%D&{-E63TQ24f-AN?2quO=$jUPNwa4kox$is&Idf`Y_;gk
z5MtO;*c!g)_3djwZbkQ*2W%h|Ru@&Y1TEd<ad83$pRfSr{$sLL3K33O3QNh9OMIx%
z4FO1)me^R^yy92>1`6B!IKh!{>JZwm?xSSAFm3!aY7~zJd&yPX`0fo-45PKUK@_Mb
z8&w#gK2G~Af`P8AeOSH*7rNoK=9s73U{uCENsD?>&Kvl)18-*8=&d#JfpteYgVDzd
zDglF0s4KnXR63_~zEssUpTi&lneF=kR25e5LnVE87A@;CLy%c^tQa8^Y`AOHdWN92
zWgs4pJJ>ka^Vo&DNp>UB^3hgqXCsa^la@2cj~{8C<kBj5p}qz2j@)prT)q-}GuB;i
zgx3Q_=cpI8_~;Y{=MiA=osy!zFc?N|fwWiw`VUsElO)9aQ+H>uB|D~oOLKrqGoY%2
z1ie-3N<z^}MiSGOhX_XR?W~M_Mx(v!O*_={5r*F5)0T=j<2D*Jh#KA&k=Qe>R0E7{
zv`MLS*TREB1#mW^Ob`9iUR$&t*bRgRs24{EV~iBLVv?NW3Z$(&Bpx~fscquS0Il06
z3s%zB<h`G9E>2396fDSFZ*2F9#46kBuw+y+prc%4q2cu7XqyAhy|Q=X_7By}LD4k6
z@J+zOlgM*`uq^j+*Hym4F8}-*()c4_3Hwmydixx+ia}4`u;3-<EEG(8RFu@j(<+;x
z&&g$<RZD-Z6dSc5*;bM_o;#u1Klfdq;2JS3z6;zHWmm-N9@f$duQs~eTU&61u#86>
z_lSg0>{HZvb~U#n>~Ba!2(?CSyma})t;?md6_BB?Wj!xa9_9rj<|l^cyg4*Wve5qw
zCTtL%_sdN1yt57$^$8z9gSp@>XFyva#b%!OMbR)$S#L@xovzVrdATvt4Jju?dRQcb
zKp~V6tsuL9Z}{rjIcj2gO7qQdKYH?O1y1E7^Qd3*<H+gYMWO|ZAa{SZjcE~#Hd{_I
zHFtlXa-}5=V%nR0WwHID#zlBi+pae>yj5aRJYYk7L1^zlqR!J#X475I{4<$~xR1M9
zmu3L7g0=)Z5pDcsnFtnoSETNk2%@V}P1-v?N7i5#o6%8NkQ3h$Q8NZAK((mkCqsqu
zHw8Md9vi0PAf@Z2yh1M4FfdNA((`dtUW<%IOK>%DzWMa-8kzswy+}lFx^I}x9?yDv
zXN$wsf;$^NuV;Q7BMVlu5As@hyR?Fvg#wu(IUHw=lGR#jV50TCE;EcYo3gpGWBwVt
z;QfPhZpQF>f4cO7X~q@f%9i=OpMJp8+O_C^`eX@Sf^nvju>V@g6c#883I0{fP$6Zz
zyFH&~u5JRmq7APSkE92NE#MPhu&tT_GZpWAtb8@FOJpa6CWKgo@<Ufe3<WgY{0>1n
zjU;}1b-Ax5gS$hFlVL6wyD=-h6$mX=Q81|G@kdL0J|DmaTWkZ5;AhO^y>CG0chW?g
z3Gq`|UUK4uAo-{czJxY<0RM>b(EVSy(9#^*11OgwFVEIlSHa=ALxae=@MEQmK5kT;
zU1;!Nl!Aa_An^gj3&&XFP(n)F1{n_VKR?cB<#)J8F>4B8QDPj16r{WGXbf$$ZHL41
zq<`OgDfwBjK!5HDE<5Uoq$5&AF`P=>B={N<*xuC^?+~J@8zHKCJtPRd1a&tE5+alf
zs;divErZFF)34prER*MHfiBz)6L!&qS!&`%71W3<84S$uzgR6XbN87R0_*0}U8wH2
z(Uvma!ba&94a(H4>m6iQC60u;JZ)z%>7}|!rPn|&eBt*Ez;>XF)t>ZZSp<Qiax6v;
zlA=&XYpraGJK(8CH^^Y%1(Ca)Mq0=KCT?{ziWtj3(}!(`m>9Cvjer@Et;vM*neIKi
z7S3fPzObtBhO6EoXW7~_7)@av8)DqOQO`%brNct2pwVdE;B@LEbfbzWPeTsW+D^Ze
zY}>k7$g=}uE7AXbe}zTnrWg9S4s?>4kl_xbaep2%k!a@qe4N?A=w+xYEU()bW^2r@
z#DbeMj(DuVXN^apg1XYfZ2Adu7NL;Da9`(M$H)EIr}ClC!l_yAj`F#WoMsZKgq~tW
z<z@4y=GzAvX9~I1X<0B?37U-75nJzgE?i5KkHF$`Y9FdGd;l9E8|Y#E-W|sEz_Hku
z9HdKvas@KpLlu!`$Mn+9GuE9!IEz-rMW8vqcWd=l-gqV3!JPjBZtl%qLGRYgvBROq
z8JCbe2j;x@+cwPC*WB^#xOu{|^%lw;#t|;3qF~)1iIX=$LAoTB7SC@hvRX7AV3u@y
zg99>1Li9PQ7m(!&sJCgD{8Bsb^NHUhaxSiPRvJ;de)G<+RMae7kt)!g6XTM<=F7_(
z$%vt(z25P0>e&}D9J~5s7@dYIwMCbIZw5t;;)?Ci^QbTqF1g-Jkxm)w@?IejL^yL+
zL>e|SWCLm0Wl)5h6*<SZDYNC%Hwx>A-c$S1$7;;IrNhZARB<5hJCD7)C3#r9LB;hv
z+El*`=HE6V8W5mWA*mXorG?%BB{XupumiGW*Z*#J@KlNXAaGIYPki7)Al_?A&2!2f
zd#G36&*I#luGw#w@>{T<g|wf=eGO349)#8xQoCRDa|cvs8W`33kAFON{hm>`vC;fo
z%G{&dr}gv6o~1~?QMRkhqnlq54Z(KXdlb&AV=RUJENcC}v$Zg86$$pWTVKUST+f$E
zyKg@T%dHT8V7)zq+3xnrrKNCdnn~U^yV#NCL1@31tFyAq0L%iVKHl744s`_=l<5|z
zrLtcQ4B-6~3>K0LZ<yznlsYB3TTGhz4n0@zUx2tyee5_)xH9rgnX=v!bS%o<cYJg6
zO@!^bGRs2Pg#~FxRtNgwIM=u4^`5Is4K2DgI_}?IsrOf0^AGd+RVhF5s5i=E^I7a`
zlKq>U@h<-B>el<EjAwHgJ8}jn<{E<L0Oe{kYe7{O!T8ZpqAZ#5e(u2bec$egbCQCk
z^p5LaQi-;=^NeYlni}41b7gF^-RqCZ5o6qYy3hCKDBQPY;iaqDPW5^T(niqsM*g23
zLUMhGRG4s(5vpC4UZ;MrfBRmi(S#4?JO3|FvoF=Yq3gO$uSWMRb1dTrhZkJmtdp);
z3I>huhwLM(k2)xe`Xda-{M+__fgUlh=I=~M986L^_UtFrnD1Zto;q27`kJ585lE3X
z!>>7~u?a(Af#KyGg%r{he)fHt;q3yNpxvNirHbXDrAl$xxPfXgie<V=PSV7cvfV5o
zwSIQmuPc56sD7X0LfwV#3f+K_`|WazXd}Bcv><Y|EynY(`Ci}3PVbl)tCJBA_;qAu
zE6O&)WeaEtH`ei1{v0_6(NYinNJ)^DT1%N#Zcf7_LVXc#lI%;CA&L8YC1PtXT0ov$
zl;&n#?s52qt=+S{`cRHrWmBM&gs9C(#ZBgMyAovm>YQKG`S&2hTez<|BIcD5#@#@(
zUs{2?|BR2AV}{gvi;b1w>AjZAfjvYu*kn;Wog4xEhHAG9oWG*6E{&iCMI8CHDejMl
z2_h`#tZmF)(MMq;y(GcWzML!k<3nUPPOkWUVIkKD-}dwa?RuM^m%Bq9o(#ysEfBOh
z6Yrer6C^q${3RbRNQXO`TKcydXTY}~tW8;W9<~h|Zc{ngKLN#;@C*KYK;gFoe4fF0
zw7tx!-aijH==5&_HH_$DP{W8LQz3drJw;4Ts!l;<%D)?&7*!CWN*m5!0kg<}lQmyC
za$y%S(CdW#!YxAvxKPs9YNv@8F5fluVdU_HU<;elZ(!0zXGK-L1u;Sxaa895Yd_Y7
zvf>a`P2qpuD$g<pYY`VO_{~GnH!t(i@2Aq%H2!U}mm@}cyCX(2j9aDMdoI4H1saZ2
zK5enR*<pXEg9{|ia^PU_`hq*DB>S+nRR=KrKbX2*tw_~c-@hQOez|3kY^JlSlg(kD
z85c*ZBZB2_iWqur)3n{a`x_b9!MdkoA#c9fm~1EeNhTnm*t0I~>ck#?9K($f$8b)m
zB5eEuHb?NL{S^CB>koWEC0AO?I4~I*5EI1l%IK_60-i8{5@AAsp;HNN-q?1?ik!!~
z0%y0$%NQNK%#2aIdxmjsYQ>~g^Ak9!l0<^eOS>tq-^{V#gjGilLSvBO08Xx8rbMgv
zeJ)<v2^!=8<FYV@I1cjWF~HSmt#r4ylssK6DB;mOgFRP4ZSLBQ+w&tfS%AxBk3jKP
z)L9YBXg)gO2rs&4RC#<&=Q4bZ@Obol_`dWkJ^)KeGC*mrJ*f~jwCc6y4&lp(j6nDs
zipXQ-Ih4=fVq!t>wFN#Q!QET3;MISj^}1&Wa6kYs(v`5UeJK44mq>{Z&xs)-&?Btg
z)4_aCEzWSISeyyp2%$>v|LIiTiWRY@8Q2`z6f@0k;Ggr`ENXw{_PkB7#~2pkH|7d|
zf>Z+CP#nx4ktkrYdOrcG*d5t~6=vslK9oZ?TsQ@zbt0XPFuuH#g6eBRYhi@a57f+h
zgm1X+m3+VP!_s^G>F;!;q_<jJ%oEsq_-8zt+TQz|O|<Dsw|U>3eBSOejlv^_|6vwg
zN(5l<3A*CuDMc*<LEi2<hAB90{k)v*E;NM<P!o%iH+o6JeT|)#rN1*KY%cG&{lND~
zUUlaD&AJ!N+1P2!d%S4*A09j}6F*Mt#-;$d*xXpKXq}{AiLI2)-pbe-Fn){EhA8y$
zdi4IP-=g4fP7tJ^6<U+pv-{AP35jxf^n6O>=dV5bZ<<P6tkHunACr9#_AX46$TAft
zG`IV~&Gj3$&I-|<FFfMC(75HV6QXrko~v>Vc$lRKq=Lw2rqdm`4ncZvaE*bYqpNKk
z2!$|KL^y38Cr>Z$d*uSmQyxe*=$o0DVMujtOt_Q0I2vlHomt8*<OPhe#VBAmAQXR8
zEop$d?wXBjrCmJq#YwBRU(4T6B-B3yeXR`5tcrvu(yp3(0L|6<wj>_VMZz+maxhz{
zq}}?*ds5~}lUw5)JLi{tOAREhMgzk7tw8yQ`76XR*T6-_tc^R&gR6X?d4`jC9kz8n
zf%e%TP6_#>i`Er*e@&Eg74RR8Dud64vmRd6_!(qLF512GO4hpbW4I~1K^oV0RRZ($
z6KQBIk?-yiDW?ai)u}Z<cYGmf*&8O&8Zc13P-ldkQopMicJZWxo0?N9Cx6-FYts(F
zd~qx~L=M=+Bw#AwlT2GuC*ap;jWd=cBw-6y2orJ|gf`-qe9Of)^ftZYIiX^!eF_q)
zpOxkEix>AK9!tXnJcuWOL1<Ehwf!LOk_TU^j11V(z#5ScutpA8KN8f9r+yz{uAl-s
zwsvMQ1ugIMOIdV|Clx!cP)DQ!<;5g=`>S6o$j^tK`Obtl!^?EfCF*0U7Ma5Us4m{X
zc$@_}+z==D|Kni=?!QHd8(@>!vixQF?{k4`hcgjs#&n|yad)QFUi<W|cYu#lHfQV>
zPd~B%{OLI=<8NR!r<@1RowtUS8AI5Z_jNeXieb7zD+6V|AFa?VV9y@?TGTwvcOHWu
zy5t`wDhsYb0{qZ_bA%6+g3yHizKusfM2;4-@0lyForyl%_ks&sZ2Pf<^3-cSMkqh=
zyje^H{^XoIz_e@|k55Cv7e=B6dqShy5zwh&zneebfOqoB7jFgJ1cAW*2FmLRrQu;8
z47em6p>Lyq`lMXNB%mZVdiPK@@L&a)9LJAa_NtTFI{P<EUTYu60eE-3HS!g_$mpBk
z&$#+5+Y@d%ft;u<5-9PtA_J(;4UUV|3B+ef;qj;4@j$x|g17nReYT%Bx<^{W>6$^Q
z&6^`YB@cO+KU&9D083W6Vgya8mHwkwPboVoHgY!sK>h%{c|}jZvu`>)JYv>qB6NQQ
z&c3>*Z2n_$s;e3i;EfO6p^Q}m!H@8c&gWH?pPJsH;4Om+qX2=+vQF`|-$*mIqEIz9
z;QTm0&pq1m19(dxN3cD42~>l-sjz5cTt+gR7qr_uPIl-u*~k>=bN9`E|Jy>Esatnj
zXVO|$>MypkP$RYJe$!Uww^KLl`GaaDAM~6bsrOnhU5AZ^9r4=5ZPuKt@X_|V4h+e;
zpuJ(a$$RIsyNfryRyBMD`){Or)!Ovf^@FlYyPE5_!OJDsbGy@@_WxI%REk_vNFVUh
zFNISv%@7*if^1xzg@sK%SIWy1SV>&1nuo1dhW-$GHG&dl6JI~jq{xud9*aCT08-3a
zhtbLcUr;=B+V1(lM0E`^eF(}8>j?&e142C+-w=o^I_1FcOsF}SCrYaxO*m4!`G=})
zicn-nsD$zdyPq!v8+%;W$>v)N>I-gEsk>AFPmQ4A{<%D9_xawc{aUqJ9o)9?$1Ag>
zG^~r-GKcry_gs3#BNnGz#od%oXdSTKF?W>ca(6g%cXg;J#)MFna4rUt{pRj9YGA?Q
z&;bj43-a>wU*$20HzD4lRZW6@b`HENFG8m^MPn=y*uMQIfs^hVXu6OXX&h1&6QPe(
z5>gRD6YTOMga%Do1_!_c-iK(1cDyY%Zw^=k^kvG3+=X}49jHmwSmqy~LBk?)5-E3w
z#Ec#Ie;D}G5L!c11|A6KJ2@dj@wGmn?E7v8?@=|V<6HZ+T;T<}Jk)%<d>CzBHUawa
z{j6m)uHbm%4{A;M#(_cP8(MQ};pB&WUYXOov5wR7g3nswS1HqqZgdga=)|%~kN1J%
zL5(?{JN{r@U1_TUuF;|!JJqvE7Rd}TQ|AFsq)XV{k1a*7685@)(cj~l{31CYLEUGH
ztTcA2`8ui2pL<_HTTjxv_V#b=s1W+)R2Uf+G${E=sCS8wkFJWw19WJz4{o>?bd=>3
zdX(>WZ`rhy+5wUc0O)9gqs*^GZZJakfA#g%QCUUbwu+#1cXtZ{(j^_z2+|!Q-Hmjo
zl2QU9-5?*`A4p0!NK2P=z5VI$kNfVs?>Yv<q414!IA`y**P3h2xuV$T1fog#$H)M!
zWz(3ZgI=(4Z(3ROZXy%dF~Qtkp$R`_Ixpyzvure0fIKQ{nPG+7I>P@H^h6tg^srMH
z7~UL!4vrfl{{oB0Z2Y<ppp7d^^7Y5cn%4z#+iVSIS}!TSAQ$y(M*0k!gwb@;&+`1O
z3jYwW3g<6NhBAYg022gWVNYDa0t7I3!0vlnY5amP@HUTJ85LDES}j_o8~em$4-ijz
zHhqMj{e}->B<;Z5Nb{r|!{l@YJkTa!IomJfK+J2mSKe~2156BWW>Y&GyoO_684VEQ
z5QkVIN(+Bj24`)#_r-xlG=0_g*_gs+-700{uy#)%6Kw$5<>dKG11sRgSm`<UYyTte
zh(lX<b{FAMC5&h~^d#mOh<tpnAutnVdd1Iu*2S^9Xt6>^PF{d*tp`jGqUn~!!0^)`
zpcqvp^DfUG|6g)0IYb^8MMm7VCH3nJ!=!D33KYLGQ6TbVC8b0f2|yjMtPi4!bO2we
z{wu*-7`bRj8J{?Vt%+sw{dj<Lu{)H=5(0u#U!C3JTnw4u6j&3QCZE>=X|jyqmqBPc
z024!<)=gFowEeC&Y_nl98T{NgeJ#Yne3A&w<7^G<*zUkeQP(dC29lNml>|1Qv);6a
z^JV8bkTX^OR0%dcd2-zTsVD;-SmxB~0J&QDt5y&_*I%n_!E!HuL|S>6E$I`4(Wh$|
zIl;7be_|Y+I*}M4fFuP0W+wCk=PsED61eOX%})Uzi?1FSmhwldeDuX1Lsa6;lJ`0U
zv8Ntc21bmX2L{iq6wf^QUDlNzQ6&h=ciIUhV+tP2!yH-_`KYoLb1s-K<RONL-Xt6*
z!QMa>H57$BTDTA#vhX&?!SZM^%t3*z0tgM~V4muLbG4Biow=a1;R4c$|A<CTfZIF0
zVIWLQ0_$9=!pR}=pdzou1=o5QsW^t}p%do3$)u7*q6+&NCr*+ZjSxf_6j3Y$Iw{}v
zSP*6U>)<jMVvQ!$sWY(;O1D9xYcUxs3r1<V!v!#T2F#>{M**T>QavKO<S*9Rx5PA-
ztR6vwS;F%|g<6K<?zP_=Dy;c}4k@q|)a7E}w_!%&8z}FAUR-tRWi(BMB@&OE${Px>
z`ZvVEY1vSk2>sg)i9!sZMIebskO(Q}z0Fm~LY2f8x4iWJq{zdhQ-~1CnDiBsfUp-R
zN?1Qga=_=w&xtsEjlKYz({_c-gH=`W_m?1a`C438I*tJgHZexegN7}ITVVB{oGIOD
zS6seArdf(TA02WS2qN2LXCAC+F%3ou<c62bRx;P~_+YG-Ng7ahSK2K$YUgvqL!55z
zfi}agNf}1<?1LSx|0tj^OuHLq%xrWCsD?ft{XKxTa&tS-5=|N<i-+)Q&xuP;;3Ujt
zrp*p?V&|iOn&=B2_Y@bPYb*B`WW!7d09;}R)};HJ@HybcrokpW!X5DZsc?QPQd}p=
z6O0tsDeYdEr~CmByR+&X&>iSA%O*HSMoO>ytr{@#22hRA$*sFI=gUHr>?m7rAnus4
zQ7arteefokvXv1XKVO4ejD)3ct0`JIAktz6oWO)nI2I$KCOB0dmrP}&Xsrg47~
zIsniMl&m-0(NBU14~hHt)UGUmH6HtCEnKL|!3beCh0jrd=S1_v{S7{T2*+HmR8;lL
zS3u`JllO*Kf{_Y0q76$9QxOw-B+rtZ6X)><;10ypou2)pG+%M2Qf?*xT7y@tc!vcK
z!-&M%eivY&hLGiRfPIAvvw1K{m~w@!Q_vivR$=UTnulW_fP_OoQ!)6avjdTi{aKs9
z8ovsv>5E{mM?t$`GMWw(eTcvECD|y&zhJh(Z0A_+6egd0vTU@;&EyeY(mK;;55c6Y
zxo8{NKZqpJzk$+VZfWS@J@-ZX#p5;5)ff?QqIuK?OU;0|c*z%ND3WEbiwd%;4_lp@
z4Z+U;e*YfyqE(h)8SKTi)gM^2R&j(`wLe1c3Mp0v;)PZuUc#GT?^^rir5Kp_qq5_v
z)G2A<NJ!8VnHQmJray3<)Z{}sJ4vprwPYlTg~YDUb}e^FHt{#RAzVXDwWFt<@V~-e
z0EWp+_d$+c2(ZN_?1ha=Xr6$%wog|YcS(s;21(swy$fl<yN?x38M_PAg3Y^V0v%`V
z)fM2@pet)33n6c_?97n2x^*mXPB!`^FZlt)tT&wCm`Q4G{0wA8Am6465J%?uMcEzb
zK#Qv6^us}gjgThP2})b}+}yP1X~xzEqiC$KXFY%uyckWyrO-F$(1o@LI2-$`bQv6*
z@<ZV_Kxb}y_d=X$Eyke+kP(|;9*NjRFpat$SV{GsfA;66Pbyx7$s0}!)%S=OJgcY$
zlJrHGR*L?J%CD?p^Rs5MAIzj;LlfI8M{ermDPZ)42+P#*pD%!)+{Y{j{1OcJF*yc}
z2u-m|f2=q)e#|Ad#^2QLJ$P@m?#p3G^c0Rg$wvWoEL5yuAt{)#Bb}K{AKT=ORG?zf
z9%~Ft=S2uhdxNUW|8ZjCxdX(3DS<M2A}lFEBYe^RaoQwRg%hHjEutw6v(lk6cQ8k*
zNDd`TBwV3NV9_w;0T$O>0x+lcCBTBe*&vf}0Xoo1A38#3p!A5^c+P}ClL&5Yw8~WG
z0w(1L?O^Q)mu_P8;$JG&sSWWJ;a6F}{2CuUkT*7V4+*{sE&xY$4;X&E?7)c)=e|{x
zrMk&|6LYg?soO&5ymbsJm`324p@YZSDi|wtB0HV`n&1YVii7LQ6;m7&NzO>gfKam2
zL}R>IWGCdo1ikI<JbjTsS?D;J+h`FHNy?u(fy;#3z<My~F!-;EAM`7mr{!0&lrzE5
zgny0aFS$sU>`wnh0n;2ncJ(1%=00V&@1dW>b0ne<1vP;;8JmP=`sk_rx`Y+=VFZ?^
zDc8xBH8lk`-XcgR-0r&BG{wnaMysPhMuiJA-BSh9gj^K%<`*Z%#t9DyA}3ig=SSk2
z>oJf*;7~GU0W3<weR5~}wq6!pqycS_V0Ih*Np8SC#vvZ?*p*0N-z5WC;__y@R!ai$
z743M!MSBFGPB;Aqo3Z%fIA!gT(QxZ=r~djU^m*Uan9IJfabS2%f3h)YtJv>o#~qXA
z3%!WFP>Ap`IeG`!D^=cMhxF$J7bq98BgZ(N0bD1!GV9L3ySK`IUD)CMPGezql^>Vo
zaI_CyT_4J5!bNzb&!v&e<x1E*pSCW$(Y*Vk(=$)ITu^ezOdEb^8RGbHxcHTRxJ4M4
zy7YyuONnG~p9Gqy>6cLa%~^p-h}V~YT_<2#6Z(n+u-)Q~9Im?SEc)PSaMyMLy^brO
z5H}{Nkrns!RiapXd!?T1qy~*)HUEh*`MJoTRTq~yJ~SQ!jVsRw6ogT?6$f-QGcPcL
zd7_oc%XI1@fe|8}FE}hFH#@Jd<12m_=EZWV#$I{FWk`m!HI_>8X9bA?39Ce2E}+w-
zzY9u@e6`{%npA3R_;zaH8OTsGkv=b}LxY(qkj=%}OcWt+jZQ#eA_W^XffPQ#)>Iln
zwd9>&o}zRs1oZ;O=ktw@ZIrcCQ<n~my6RIrA{5#Fiw6XUo&-Z_Tm`g;1V}s)_BgJ<
zCr+gk79NjQ-@Su{$4W+Q#!w?1h=~uZxs-sB6M<>t@yn6R;a7Vf0|T3!DP6R1ie>8c
z$#en+L8MPB`KiuH4ynbvg7Y9O0pPrPs?f3f4&-)RBY3Dy*CMH57^)F}nPlb1#x@D;
z1LQFOag;JVTdc1>4feWkDOSzNvGZTVU2kel@2+*Mvp7kG6MEzWMOQOJg0g}hhXGGG
zfe$-~XY?nY@2M%639sO%H>;BRW8C(HvoQwF3E`$dMQxYGTV%VtYrnZBkuhi;dIppF
z=Q<poYTp`(h$!^8KJhKoU6=SO@1##R)BRQv#sy=BfpqeQ#zV@mL#sdf6$c;`^@!dx
zC})w@y&KJCUdRJQ)F{9bFz*1uW-iTs=fF`rBCyMfV~BVNwElvFg=Y7F)XN$iQfECq
zjCum<P@Q=-Oaa0!u;^^b#R;=@?HbJ2ox@Ts6?cSENvK#vD#D4zJwjn7_*PQI6V7Mu
zUT}M8@WY_!TMib57lwb;>2ERgxi0=)a%;+%UyAUibBiS4(8V0Ff#`ZSNP-YoHWBn4
zG-bHh*}4se(_UJA^eq4Sy)3Q`%M8=Vm&pw!#H0-J3~^W9{g7scvX#TWyTCL~c&amh
z$8?WdUmbL?T>f^j@?ofo6-)6II9OnEjP`g&7xZ*K>hdrKD2$^Rik$Ui-E#)$M1~x?
z!sYJh1pNGp+MtDhkUXEeAO^+Uz62SC7?j`mS3>=h5n^#hkIWzKD9&LR7+T*%IxTZ-
zc}CJaYVeh>^j><1aS(_L66!XdH_P}Xfx3PMsF~=%-h7+Kp+B$HwIw&mL4uw?6?#>Z
zjfkNG<LUgAsHO55W;ptO6V9{Y^}_9Z5P0!f+(db|zg4Z~FGwd~g$?59qzW7=Z#p1u
z0o_yXP>eBUkFN(bfO?nt*(0=dtS(0sDnm%SD=EToG65Jof}M9EMqTcdD>;K4<ErCz
zYg@r?#;py*9Kgs9Om)E?T?XqvN#T`7F0(*>Ej<IQu-lMne=uEQ9?gJdabUVaT_o!p
zrraHQD(2D4haZCWg?#c0UZc?d1t&85Ul5RT@@G@@cvj%vG%dXA9NThp1ZpAJu&c~d
zxM+==9GChB1mS&`!shiZ1JeAAma5IH5BHv1JuK-T1n%{^NK6iZQo%ulsPfd{xu(F3
zcRSx<omqaIZ!AWWTjS|kkkRwsabdjv1SZ1*Rnj6Op{GbPi?F<YBf=_uj@Q=*lzlwQ
zz-bqHwLo?Nv+{8IRWz9kvGZ^sw^q(R4!S7%D#7Hjc@65Ac@c9%FNwvg7iG0uJvW8H
z(+*#pK4PRh!>R{){YsNECW1Q{Utp?{bgSPlo6|fJHjyLqqTG2;x8KA0q2S=zX0Klc
zhiT}k!8^_Kn^e=-D(@kNeZ~1KX9yGw7$wC^_lc!5MN&WL1=ZS|>xssQ4_ijaxyTkY
z?$U3=7n&j<T`m>E()!w0_UmaD&oBedckck<EU%wghdSA^v*$cXtH6Jus@8Pndf|dp
zFXe{)j66Cnq53yW>YjNqR^9*R4MEB7ij)paTD;XlX}hetA9+{sH;yxZ^nc(4Mt-o6
z1~cFAH#CmgHty<fpwFW7K(_&g&KfzY>#|fx4B1I*9qgN6s7I)V_X#vn<BcZ(jI1*-
z@azzSa*=VeaJV;4j-MK=hoGFgbbRVTAxaKTswC<hZ2pCp(4448c*~5wr4Yue^IQ)}
zx!_>2NoEP6+c)nYPwF%p&zEG@HAttP9X&(>O|IxANlXUX>2!M~LHBEabujp5NIm$p
zvjcLO0S`5x<UB-SR>KlljXC=b#vw$*Oln{Ay!JrdXSp*n^yFaOnJh{1G_I<bg}j!k
z2hM>h8-KKUTip>4JH4as4nq`QcV=2CUKtRA*-;uSxx&U_drdC;vl1SM9%9|+gcr{)
z<^Ua*1c;k;-PrKrzyPJn-ym>)7fvb9ew4E7!2vx{O9)PHRmXt#UJzWpJ4!wc%-?@d
zxIi^F1UZ_C1_IC2r`@lD?*wgTUZ4=sFnKmBLx((R?dT1MLB^Q}ppN+yvK2g;OFfY=
zA)Aw&0%s6p`(ymYbNg+3UK;!bf*s&)yV_u!{HN1&TS(G(pRLQS<TRK_q7u41`j08a
z*f%`Wo=J1(M_{SHfN+uhRf07c`MYh8^>pfu)rAmA3mW6lo^NnJrS?7jGKVW|1?(|d
zJy}9&qz&k4aG%7fJa^hNGV(f%d2h=fC$AD*gjrrSDov8F$U_S4Cwvi(^u+jp%3WFD
zIA%wcHp&)I1tN41l~;@m<9U0%<0N5TU6@3hp&s2n9{Vlv)F<S|(I$MoG1EVl>4a<W
z2Q!a|vE6X&aL!TqL;8=6l$N}ia}|VEpI)G@sO+UEcqiWsc;lw|%Q~5~3UbzM2$}&r
zioQ~4qwD&z9O(ji`b+f$O!;Y?1~cK^u;`~vZch=G^R$yRmv^JV|3)6{bq*j)c*X;w
zO_}U8Xy0cq3<?xiQ(-x^EIZVUk(S6W^I+&t(*r**>D~jF(V}z43O^4p9!Z#oZ8P7S
z4VuSZ|BJ}0UGzNa`3(G5LZ#a|#avZXe0g#8d;G3qQVhS9Z-Hpe9RtDdISA9z;1N)(
zKyEnEta@ZZBy#&u=Wbu|k!<6pefNUEg9<+S-5VjOKYTZ%^b>NGKjNmSS?!c!VydJ<
zG80IvRLqFPqnI43s1doUKdHoeoH4~heuXeH3L+c)F8k1a-<o}`3T*HiFZLuZvKBK7
zzF4|eXq4=f6%<eHU1%S!*t}&Wh|$-_mmBe#su7@~(H+NSSWe(uwN$}3$DgaP=WH16
ze5~57W?HPJMyfp#E+xU^9L$<FmDX>?rcD{%8&K)Ff?_x`{#hAsKy?`y;|}i|0Jzq7
zFz{tuE*UL8rwpd%`ml=EnQex{H=BV}TCKBO5ozyvr{Kq9VmkJ_0|N0PUg|#-oqy=;
zASru@Gq94MruMq*R!ar<q)J$ey|7+mcmLMf3az`s%Ap>{__#*L?kN}N%C)0)BZ~-M
zU^EAJpDg&h(`asue7Aam)Vr<JXRxQa@{ws759=6Aw3&a?bNO(Z{owKu;jv{ll{P_)
zq5jESK_A_n!gTX7<!OA1NM9igw>^~Ns1=_Xv}A4zdWDGyH-Lp`Ttk|!hXf}^i3lI?
z+TUOFoE^`2Eyks%bXz`ZVsycbaUC~Nk#!*rSZo)U!&eg;Nmbjh4%571OzI<(48v_3
zz;!sKT^ppd!Ngb3SM)x>Yfr+KSIw_wQ+s4wB2vnfn<$3Qng`kBMNn%u&c;HWt&<9Y
z8!!*@M(GS~8n3>O3E9iDcZxPqtfVgXpkJJev>6xibw^*3_^~lo-8W}kTmM|<w68x9
z0f8k7<o{)B^)`OWH;|#Vo284^*6*13oFdw7;|*D{9iD44q2M;KZiWO?1d{2)U$eRB
z?MRQ}FOUNdHoG#j3Z32awd&kfVKM1!DF_Yg5C)O<cS^HlJnJkHY80NEv&jC>VK3`=
z@$v4cw90&!xA+?DeYBobbxKHvRB8uX3?l?kV!|iWV0K#YgGoN{fv_i-^62r0b@f+E
z;BMS{#7GhBt+$Xnn5D*fHuB(#+ycAioscj6@JUPdKIq%3QZ~S;wrhPLT&{+5Kc=?>
z!=;sG3P<u06F6}D`Kj`JIV>R-sP0jtt><Qi4PXq3_XDmBZ9jF7lULlX$N`1a!;~a~
z#C|z*IMyi_ovXCI%vpo7YP9x+3T{uP_E)Xg>uVC<rVkmg*@{bRHL}irAjyBe_lXob
zA{<gO1q?CQ{2spFKVfhl=owDMB0K-h>BIZ@ne%Wi`c)@gWXZVSE=os-x%?e6XY)Yk
zAeZE5vUjh5({PRC_wS%I+Vfi@S^n+8Ur`u#2?3eBpm7(SGsN`nftcWaH%rReO=sNC
z@-c1e&)Mo^2@y7yKKbfd7U&3apI_!d*LKTwSclkVM*d=f$XO94kNTriVX)#T9bUnU
z&yZpkzCro71V4%S2Re)eqfv`5U~6(Ov0@MenfLeqpme-7fQSF4LTwXI&~`OspUfkF
zRXoUj7bAp1V!y79z=!b+nLlLwP?gcTGA~0wR72%%bM<)8Qly3@T2E+jSM>!Mav6k>
z)k2F2ajN)1O3D`=xgNiN3pQQ#l$98NkxR4yZU4ie_ww%7xJ&M~%O3XO8fxN;c7I(&
z7E)j8u<v#4Ap<ku!QR`ccGGQwzb`y@O>lbc^p^l6l37}733<v!V>3~gTqd(S(5#Gv
zzZ+WDpGfH#C(Lm^;6ia}_aTOI81B#TVsvaEQyP#!pC~yk=`-!8Xz!(tXJ%%Sw?SvL
z*C`ni_{z!ZgDd&j0vklZ1Crr_qp~3%s>!Z;23vhfRB#eR0!Sz%{o(gosxKxjHK^Aj
zQ&AGjH5gfX15!~RE_+42^l)d*8C}DeaIFp`N)?#TT-=X!+(b9tP?*$F+CU(0-mk^R
zS@$6)zr-C@ed_mul!8-T0fBduyt{KN^U%jK?inY8>p~ayCaC8RlLe~fBB;K`SdC&J
zfC)+N8>c|uv@7;@QIvf+A!Rbm(fQa2;pI?np0IBvwis=Ll9)Ni%gu#REzzqV%5M33
z4*Ij+(Z&W75|l`lgPGF)t5Rp17q)ZuGm-OwgP3M%C8?BMM+Pqm82DQ`3>W^0+{}&~
z+=FavIej3gPwbwz^*APTK@u4mQ%HYzoM*z^m--7g=NpR#AP0bq@bhEpWHGE7#;i8q
z_>{J6yH9bOJpSQB?_ZfJ=j=3o=ODfAv;Ep|=jIa~0cUre6oPR-=OEWBeOor>gSI9_
zvTmH}&3txWtGI!ei(xw+_dZ*K%i9NvW|sqo_f8!K`V)SL=~9L3dSIW6{W3xXZ-r2)
z2=Cv<X9#{I6>8#$9URljC>HobZLi08wupCV0Jjb&(5Qlh4PR}Q`Xj_DqR2UwIuyLW
zyFEK|npAcKYt@`+D);V+oKm$p#?~OCoQz1Kpl<DNq{Rcp8<wIs+Vv_XIbCINMATEy
zGPXiYyg>%;?v7+foj3{bpw@!B%=5LT`a3;oh)o}Lh{ES7B3pINhwJG!x4~$spBm)`
z<NVx*i1<v==8q-DgbzJOc9S{a5Ut%yC1~+jkb`4V$<^uH{#fl@)3$l=_vI?O7m%2y
zlgE7Oj7@C>MUVT3yIsQz&w$B%HnYmmp}bT93b3YbNXg%@IosXQ@OH<GIVemI){^ld
zLdweP0#L~gB<8S4bq8sybd&LDkL^F)%-e;WC&c-fU4T?<8=vPoQ)e^!rmSiC?!bNV
zY=_10O5Pm<SM~-#4g2L<QRi?`0<K!VRk1Z;n?1hki7SH;<E_^1nMpB=YSNnQeT~z&
z_@rEK)#Y(N@fn5Dz6-Dm0^3(G5sjVsn$-khF$5ESD!N@IaawUkg_Dsk(6|~d$3?X<
zQxFZAF_lSc`To3KaJy4JBx=kU`J}_wM63BEsbWv_(do^~7Yh3|EQwMzM%%XQ$%;nm
zO@WZ8o<}J~=KVxY5gQ>kFA_zko~=&DXN$}JAT$3H5p9C@*gctKS|gc$imMdu+u7n4
zZ-~Z`iRP<=w2@D4SQ3fmVxM{rzX&OBy&>}@8j17!lgF0sSjG10eU;JAa*Tmz+u_fQ
zx#GsTpb|V^an;a_pExH=3{(<d*!GX|`Dkg&-3(#}mk@r0OK>2DTD4-5fr9DO``VRF
z?)0pRVhP$_9J&!-7nJ(>BmSC!Ep_M~LpQzJ5fHG>(eUca;3FLg*X<Br#L--p`c~P;
zGb5TVYwrv`H?Fl3maB<jtgbMxT|F=>bN1DHsc9DJ$_7Q>ZIEJ9g}#eJt{x<dOiK4s
zHyK;9Pz_p`__p<!V1T9HH*tP4x3!m#OyYle`2M2z)3(ME{CR~c{qUE#mLVu1e#${7
z{FKu0Did1a4i%{^PI*h!G}NIVhp2l<9M?mR6-6D48fX-G%rnNGVrbh;kl3Fid?24<
zh`Od>i6TERTY$u~@Gko}Go)kkB`)MFq?>jl4&a+1nZOqXF=BVSB~<T`Bfr?tL?yAK
zG0TqI07d&wWT`-4`S5eczE?kPe?1eO!Sf-cqKMYTT*M-g`LJ%n6kKWixT;Sd1>d<-
znul_YYMuu-m8uVZRjQC`6Vgj~Va2r8&@qHo<loGXdcQ-AQ}RMnuk>sDhdeYHUoGWB
zujwW?*kW?;eA*DC7n-m;)o>Pjq-UnVb3~opD63^vo<I+$8CCBBbo_tn3WV=Ei4r=6
zF=pTCBd<p3NugK!sk$3rzTfeLI*z9w818?Z6_FC5a)q%ymXhzGkh<;J-+pWXSz?IP
zBe+aI_a~)%mCccixuj^8|0Zi;5|m&-`;q)UYzMr;)Z&7>zcl1E&cg@}PMlakz$Ci<
z+Me-^Gm#lNaij+~uVL?C#jSfI5L@2!^t+y*2x6S*-L~>#W4P;Msf3UK40kjS74+pl
z-*{c8=?E3^8mvC3YuA+=C^e+0`ZLv4OIp2dTOXtF#8i{TS&!UPW<~o-ku?%>UN@jG
zA&jpIUA-kl*+|HLB=ylcV)vR@N?*SARf8<^-A<Gf8~%?l(c6{NBP#Zu+e&;My|PWo
z(RgE=xNyFHp@oKU;gdcZvbP>t-#Dj|`LipsgZiw5J@6|brjFcGIgYU{!KGE%n1!XD
zrMA}HuGX*HopvJz#$EP@ZWvgICP&$+L-A0p`ED=*Oj}B}uAQ?^Gn9g_W8<;yWtwT>
z^PQtMJMa1of}?%w)NfR?P22k57+6T1a6BVleg61H!Xnur`)bg#F+ZB;u-t<;D?MEQ
z;+=gPcBh4?i_>#hG^rE?C+>MK!`r_PUF=sd)=}?+01^o@O2}3ZyPu<b*-GOJJwYXt
zLR=Pk4C@5*$Yg<a(*8+m^F7LH^q%N3dXYFnmZga(^HVU*b{|9libZq&pl0eg-mU5f
z5AWf~PG7wYM%PWVa{u?sVLr}8nFY61$VS{jFHVvTNTB&`0fOd9f*(^UOVyKCRz(ho
zSKa#Nf`z`gd|cLwjw{`Chqw-I1PUXJCHMqxIw45>@wsSPUL?V|iqZbECN@*~g0icb
z5J%w$tr6F%<t*XmMb_r>ix)D6H*?bh+Fi)6NhwSPi#W|Fdwtl#P<Ke(E8e{(krn8s
zOA)a~a{RG_%+X1QgpbI2(#Ps9)qqeHX&lnoWA|QPqFlAFRRQWaq_eKckUWTpeel^;
zU;(-8$00B}Z&LPI#kuif2=c=>UTb!>L*@Ks^(o`L&O;@fbSt5t)P5qfU&yRJMGHQm
z%Y8)2J1J?7Qc-46suz<dO65nx-<aNkdHJEZF7e;$MuaPX4^R`*VL~rkGO6$XHEwW!
zl2Y$=y0G)yq2!XXH@SKVv7Q=h?g@s_&q&&Lcrv+3ffGBnXYzAxss8zxbx!l|l8A~l
zc#<`f8OjoGTjp=;^~|JMSVChZ8klRH@ZO(o`c@1Zx4x^u?a@qT-7gTmvnnAZEW<5a
zvS9jLW};V1O?z&pa=6goT&5?W!(>_pH~8JX?Pjser1y`2DSC7sV%Cx-mQdNMm7afg
z1M5JVW~4q#NR7s$bxmXomJ1s8XfNO1?AvvQvITh^(AG>*(`iyTbu1Q~lEI=aTlKGX
zV%Xk_blA4e=R9{Q?Kk;WFS<@<Yi0UJzu&V2LRuDB^t~GXJ>HuJJkRlxVs%M{5)T32
zTfp)%XQ?jfP?#UjHDKx>d)*6r@cX6|H!}zx3Wu}7hv&u(6K4%fyPrkAy=|tKG4Ou$
z2uoH@QtZtto^OT1^qxSkNo*vTNf|lWFT5Y<t3%tlB*03LqdBGFTQ<CPN6D!)o#MLD
zP*YPgKAr2Tk*qx=Kyqro#6qIO0#|kTmUB)SZ#iGXCcWiq+7A5s3)#d(`<5daMbSII
z2j-4#7zZ*nX;n+U%x*;-Ty3OjOqrF9Mb=Ym-7E=KjID>c4Vh}A)KMquB*glT;IE8t
zu=4g<Jb2wd_kQ}mt9Pm7%WB!`W-ax;4g#m$26XZ`HP(F0_d&-@g(bJd_x?27UW&Lb
zl4Po1z|vs9y)CNtJsv|LYzk``V$D6PyTac()`kx5?2*!cN?J;X_n1ea)4J;#&&a0A
z3Gsk~(mva>Rma4@iDlvxYA2Jmi%^~96cRhgN}y4W8E#ys?(IeMmawoN@&(yr`Ialu
z;*7bob3GE)Cw;$$_%brx_2)(Wns{~~z3FFG%R2-)6Q&c0uLkJM44!@2apW)#X_uFZ
ze>3$m@143~>3hh0Wwo(qmx*~F*G_ESDJx!|$U1Gwh>K9(eRZB2q^{3HQY2Ysj^yrR
zCMm0YH}Z-IgEPL*W)DA`W+0Wb@Cts-4}_=CVd5Ybl4$tAtW{_1HJWEc5OT*5uEXIN
zlQP`ddfj!ckw4$E4)xLomsR@-1zB>d%1QU1Y=xrk3{6w{l!|T1sE+VY=`MWkj`%2t
z)_Q6pJcqK%<Tz@2#R=uO{*)FamAYroYFoQMcR4u!CtYZ&kRzHS!FA8p7xJNPRe-mx
zzGdH5cqyi3$IsSq?_O(WFU+R=Z!LT-L&>Ikv-YXvFM2@f(201tmW^HgzB9C%ip*A9
ze$U|TY&ZlYv~1rBOPwySKO%&_eq2w~KV16(8MB_sPBO5WS`Z!;Pa>40OJzxuX<T(K
z*G2@`AvOV%XblDvhzeEUx5#3*AFsDxK1F7vbKX%}6Udcdw)iTmEqt~sOJ=9EYN~L|
zA6Xi8FiNleLz*AIk8nwrp*NvOMwm9+MqN*(736j20;fDN5!h_y&r<N$JLw5d;lkdw
z>}V_sE8SwBdVk!-ILEyM_3fX#PBg1V{6UIpsL03aIicCN;i8gIpXUa}SN>^yist!K
z7yjHPWQrFZ@rspp-irpmoPrC<dS-?K2G!27JBd2hh@eT4*1rU{WF8h65v1wU&&usb
zGy<Y}`gJtLXILfZxBea}1Tx?p@8OV-`4{Q<#NZjpN@#5uiE$m}7oD?j7Vf`iiHG6-
zoPL9(xAlT49uZMHEbBPh(#1NaC@=(`DJ7r$Nzh49GgZOf^9s))YcJ9mt2@_zud44c
zQc&9U;QG7O-qe~R9|sM4_>!c1t$1VkCC1JCg#>||u8%=u`x&TexUQN)yQvgTAKxlq
zAh)*uc};b{uq86hwTx<wtw{=HK*z{WKjCLTz}b~~O6S^5GCKiBa?*BIX1{N0u}ycH
z(xBI?$J!|;8vZEx-8r(FG-{G+A4Gl1u@9xAc;Eb#Iy7EOE__xBpLOm2lMr##nQ`W8
zInacec3(AZ#OTY*dh`nsYrAvctQ03qLi2Kfkz39Gb&YK_)GoF^<hc2c_ds_t!N+ny
zV3e4kruTM+3m_L1-U3diRi%SSB*$5?>Ov$vbJ)Aj9v|{2`~_lWGy#lYvpZ;^_jf6x
zNf|MYmsD5gOdt~LR8e|urm%mw8qjaZdz2}e^BWaG9gwk6LuC97r*UQam3zMa?#2$0
z#8gSsH498|4aY9a99QY<sLM4`b%8_ml<?b%>3P<=bPM{X3Vp=MPt@3y@vOc^w0D%E
zGrP&W6=AA3yzC^nmPvn-rxBZOb8SpSOM?0O>6X@+3jD)SnCKoBE-Mv8N~1Z)4yn8P
zoq*D^c3vg<>OrSd)Iz$J5gNg+(k7)HI~-qyH95X9yfSU@pd>2<6CO{!2*)O`mD`NG
zlt`0lGh745{iXE}uZ;2<$K`y!&P3KJgebaha=0%3X`g5!4lNFY#xvw;rYS?&1l5U@
z`}Fxm2P|g_79sC<+V2k?9~|S>%_f5su;Ul*dC|q@S<v4+2=ALl4~^Cg)C8KN6DXf@
zwTQGFQZ7r(kizFrfB<Tw3(BmT=Ui!yf2*wT@Q>yDidk(K5f-qPZZ~-k4OAWFMUDb6
zTzW#C4(1x8s#I#5@6MqM^B?fZ#^5keVyS|>mNeBD7{AEUYsY;pY;79*jgDYH(UvGP
zLqswb(S%B;fJz9D_8QUbQ|%mU5Cl77AYkGJ+eOSOFAI{X*{t~E(pTe)WU<_1wFeMO
zJemDY!%))M94$jtYV9Y$8s66z9ePc;j((Aj?7zshtMZX)e7)jJv#*`D2qjjj{)LSW
zhk;WU1w9}(x*7(Z^6O4Rq#riPjA?nWonGm9;1j7w;W`!fD#rWZdTf7B$dSb|>uj7F
zR)yNzoY$Q>!kAc5Y_y3STI_26Z!s6`6k(3&e^}oIO%UD9J|HSjCwy6mkAKs*U<k4k
zval+PUf~j<_ug8cBdS8)NKDOevBLe}jcoKI3Gxs>kF{|L*DY%A@i^H#vN<p1D_x+n
zc{;}bG=TEx-78A%-0z%;E>HiW3p0A~J8>6DW)WjbOm1a~(OLWXi5Z&6(k1mr!s-LX
zn8l*_=bwp}$<EkW6c9-Ev(>)qXfyvo<*BYq6!*y@%{MytHha%RSB~fSvmfuPhEIwn
zNPXPnTKtr{2uxY?OD#V#o7Y;F74Z?5CDvzc$ZM9cFgr&Lia?nMQc(HFZ8#57TE6KS
zdYuzZn-4S7)&7X3^Pa4|FNWIM{dtZPUn;b5GM@G4D??qi|IXdOFD}l<L<P!B5oNf$
zn(MkO<_$?Vn%Nc9D>@0>WwIS2_0>0}5}O(^zS@H4@>r_ELCRQ2Wedlj2W2_E$~b9o
zS!IZ31a=Ic)>HuOfy@xN!w`iq)A_Otq-FK=wQP<&8y49RpI$nSb+2u^+qW-cb=GMh
zdp^j?bKQa~bJNt9-^$Az@5J&kUzEbgHj^M>9<X2M&gZ6Ezgs3STPL<0C%7*dGBtW@
za;2p(W4ck)dwxvnuQAo{Ax+v<{Sey!Qff1$WzQUitFK5mIr^?)mtST5-N>V@E6e9U
z_$-%a)T_NTUK~o4ag6kuMd(Z`6fA<ktwaK<1f}_C$*jM*YaZ$2NUat{QB9r0oJVQ`
z`CrSwROu(={Ze_and84>dQHWfkBp^aN2{+bHaMZXS?-}Gg12omu(1gzrKOvfh$2?P
zswS-QQtD0QY2LRpa@+E?cZ2J0q|49fuTROA*0_3o7oKs=G>DW5R<sp!pb`y5v?h5j
ztB;vSCq$cOsB2G<?e;s4M;=oNGj?p?Pay?Nw;Z_o4t1DE_Neyqyr8)37%O1q*3r&_
zk*`_`>Ubzbc63j{d<<ppvl7F_ElcLG-AGgI^*^{HW9l?7zXi{$Zs?!+jvvRliVnv(
zlA58#a-*P}VMWE3d@i_y?l%Z!ndUuI8E!dqo!4?G^_JXKy@k53V_aeA05})7Xac~w
zMkKK4fmV~s@h^HHKK`&k5muQ)se;dZXIlbc(#I-<V3n<k{Hgz`-m}VA>8YzQTe`<R
zG;e3$KM+%tw5u%2i>r(r%fXb$(r?})$3N0&(Z`WUV!u3)S*SZ%iENUSD>~icDQf$*
zJ$|0g_w6sC#KybCs_m9Fku}{(Ay?z%TM2`&9gOP>81qG?<Bp9ZTeN26)#W?yZds$X
zsw;O)nEoW!H|Oh0s(yW|YSd%$)bmF!GL`m$f^Xa2-k!^Al$#y{syNT8xBUoZKtf~-
za#{9n%@(IJt0c|VByW}ad&u^;^lQCD-8T$&{bj&au!L}0b<V%Xg)D~Wo40wJ>r2BL
z^%K+Dy7D!>1M^*j7LGWAPd!>gXMUzx^;n%2v9f*u*R31^*y8Nw?9j~Lo&rhuqmXq-
zu%fx{wmfMHLFAJVBeKXLmDW&dF5IMTPxW8*_YJPMz`xxdPXP?m%qBN2fK(SZ03zo6
zuHnP=0kwF--Bf++Boae1M#1Dhly7T;GIHj&HE4nBaI3Jbz?Y(Sd0RMdIcQ06Cp($u
z<m;G1eZRm2lB{=+<s|$>`ylbfqNC4+_jLdo3sJX)`Kz7vQR?Qsq<-v4ITd)cw`6Q_
zedCdz#+Hskd0ocFwUl&;JQOwreiMIQ^8V@l?T&QoM(ieQs!8iZ7?}iOdKA^?U2rPf
z;D8<T-HL}R`R|Uwr+gJ5h;F=gkKy}UVjp;2{<c53lTIepWa48?#Ql_$!M9_tN+pqB
zzC>ein?-OD>8x55-85KAu7wowK|MfZ9!!{?VgCKQKWV#x^y==n*cUkci&QaI!#%-W
zCw=%{m+WX?uHhjpg85(;BZS?9gP_x0zZwBSFli1pA>wA~@=t;2;`5&nYHItv>~Ra@
zns=o9?tHLr1bW2;I#|9nu;$q8DZ%#NtW7uU(by^^7_B-qy_eJcNX=gw%R*Z{Ej#(?
zxH%f_s{+dJ4n|i;69po5x>%K7nC)*fViPGVnc?rxDn6VMa!$=#&9TRMFRXdB+v0OY
z^(n3uK%CRC3on7DysSEbd&*VxvIC=0cbby-ZHUGkKN$>dCOgGs0xN=#2;5Z$>hLz*
zKeQ7)ygj2bf%`YWyxzz8?wO|O{1`N4qV$n=7<}T^bpD>z=UcgU6ssR}#yt6YlecZQ
zh84yJzhlgj$?r4xEnnxhGTt#oZInI3UJz0G_}ry}wdQgK`wUQ$E?7;IELwEGMKrCY
z1^mgdq?@0UTgVOYe2RYJF#Fjo#KXFWt_kpz=01Va)=rU$MqeHVjvJA{o>ffEXO6#h
z0S{q-n2tl3k+Bo7OFrY>zhN?zMJtyWn`cu$;CYOV9C`czKm%(4ZDxCa(X0~6y8lyf
zF+&Rd9{zhaxJn92S@|A*5ASP?%Pwq6f#CKZ(A8+w2hwwIM-R<K%Q;<kxQDyL5A!gB
z0M~ZfvJ^rGqez<nq_StfdSeJ7r{LcQRT%@@RZ#7r)PK7wx}DMn2DT~95cc<J)<l<L
zo=Tn59kVs3Ew6LRrW;e44TcljQ4K)beyqjpiTx@>oM926p-H$~;X0Q;A5cwbak+??
zrehlg9pGdgx!5b73uW)Q@x~BE#Prv^mfQT`?VN#Of)*{8-_HJ%+tb1YRbrW4z~Hbk
z(Yb;l&M1PmRRngA4)O-BmaN17)J@k|Moe_!e_-ZLWd-M-2os+3oe_W@6#b^^|MYnO
z{yP6Y`WFdAgd>T0g_s_b)rv>~Lk7VMfpv5L-NN60*wDcCo}(t$xBv2=|K%r)@?h_8
zbev%2AL#YpJ`NvXD+y{Ap5gaU-2cx7UV;nocwJNb{0~<L7y57S;=kV?d_asexPY{>
zVK4^V|NBjF)8DZIFnQ+rN{H0|KNlE<z3kuHQAhvd#{=7WU~h)RlG_ydf4%?@F}MJU
purHR#|NZE|&HsJKGt%()7`#f^d1!<EDv!V)IVmN{a&e=;{{tah1?vC+

literal 0
HcmV?d00001

diff --git a/torch/ao/sparsity/experimental/pruner/images/prune_4.png b/torch/ao/sparsity/experimental/pruner/images/prune_4.png
new file mode 100644
index 0000000000000000000000000000000000000000..fe7586edc13ced8d7d5c14d1be287c47e7fab400
GIT binary patch
literal 132748
zcmZsD1z1#F_x;eF(h@^=3kXOzNOyO4cXx+`qLhSmN;i_yNQZPtr!@R8==*-(^ZS4E
zJmU;A_s-mN&OUpuwe}h#733sP5D5@LAP|a_q^J@I1g!!BLCV3y0<U~9P)-AZkS#4m
zL=>b%M8FD8_GXqgrXY}HWQsbRhVlSTrnZulMIgK+LMy^VNiw<?G(=HQYyvc<bPR$}
zPnfA<XKgf%nv1IFcrB>9Frt+qs+Ac>CE2wqn}*I_7_#Y*ewp`rYb1-^{6YC}sMQ@b
zw!12o-&n{A@=Ws4^bv)(errV#J&D>M0PBy%lH8XjGw0|y4*D+m=<ewc1^)g}{4QxP
zYyDBx1xA>g1Y|<kjStB!fVM^jI==8JVjuy1R<h@&)GzK9L*E-P2tcn;T4TWeS=5bB
zw>WG64sSH@iVb8!5Klb_DR@RaS%Y|`nILh--pzy=r~w%;%@P51O_m8VGTr}ndP+s>
z$ldKWriaKH19xTO)A!+m>mf$SkC@pSz2tNk*Qrp0X?RoiRzT*goswCdb>$l@XSyT~
ze#H>U^s(AV9t~xI&?4<XM5O98!5VRupsG7ap;!{*LteE+?6nhWDT#sbKG{mBxR4F9
zV*>>jl~&nkKUTOiX;=wN2HnUum9P!yL^$USgOF{FCMT`l0aDp;vg54Py=G?Va>c|9
zl$ac*_VA-P#YXWp>#(q*5n{JWL>XfD04ut+@IJ$+RD{w(3=Ni}5v+|#l!xh^8hRfR
zB?c*-Uny7_hq$7SdEgz|Mzk$XWFxCpi-3MIUQsccThi!WJjqs&E{^o96x18x>-Rf>
zI?!EXEUJgI&N&pcLc>4EhtVR{hkJ@97GKx$*^v&#D(o_Xe<$El6mG8#Tst@xU!A<Q
z^warFC)dLAk&8_Z8?6ml6bcm{h9NtE5$Y}EoZ#Yw<_yAPk$4vbTE8F;IwW<inSr?U
zV+6F`3qjJW7i6RfXd_6!NU(z-3m@4ERdq82%^od;9%BfPvDV(8!#J(_i#;+7WY?P<
zLtAw}9*+daqOA{y%5py(T|mv-6L3z7mT*Kt#50sZ$Qa?VCgVvVQ_ru9dw>t)CB9j+
z*w{6o9r7mhad~$s3)8P+tn%c<I(UEfDBlmKkqG4MCn+x%>%FY)umyjiCJurVbnA~{
zcO+j4@fH-Yc?0kDJ47pv$Iet$L%`vYG5F{(Y(8#HScq1HrW31~#N3L<!V9&U#`@mZ
zSEb4fdHN&!>(gBS5%($F4*p#U85>eJ5N|M1--uzX29bebU!~VRnVV=NocheWhbfK~
z#x{XW!hZ6HaDmMEFV#s-X;x{dX`{teKC6>Ho4xCAljPn-)`(s+aQg7VYw92@d*Tfa
z_Y7l$Hk-vSuDk~K{lmdSXUi^UE5v9C=NA_oem}%~Ay=}w7awewUFy_Vj3N2Ij52N0
zo50W!mA9a;C<`MUz7jD&WC$c0>O=8>@JC1P>X1Z7!W5!=4=P~@+S7q82*5W$rGd>i
zz}G;T4wN;ZpGEi`aG#C(DrkM>RRf4AcyEPChwos8atHEAh&c#M5faCOF4B!$B~FOJ
z)eVOr+Ld5IK`bRUk;ot#M3aEu|0WMhGq6;o4y@W=(Qml%+6mQC%#Iu_5p@IQ6ro3f
zpg24|n{&ru63#SN(SnSJcvzS^H+09x1wkuL?Q`?E*7<8Mh}O8e9OiL5WCH@c7wmnX
zSV&@mdyFhKsHnb~8B%KCt3^&3pd9n-_smzy?UT!O<G)6752AmKDc^Ms>|&fYJpJ&a
zZtIhGSE_fUKSTW~#}Im<C4xu-W$;A$$=;G;z4#8lfDjJ<Wkt`5x`i4u^6h8F@t21^
zDn^`+JdUi6P>#%1k+af1Z}+JZCBrBrlt8qp-Q=6^qNJ>)nx*<A)g=$|h^t}eV)n&R
zslOyrZZvMNoid#&o?@S}J28uWG?G2Z?N)+LwoJ}XwoQipPAScspH?hVn-8n-qG)cC
zd(z1Y<pS%1_@XQ<EG#svqSpeYN<2}#v6nx#h5(I&f5OEQ^%bfup{$U$@Pd%_=cR(y
z{2#wK&kei?__EqU^uz>&`~{E3O)Pk<oj739YdGj%S#wlav`u`r(6p|qrLw@bO0gsw
zYs#A{k{z!b-?BLSYHq1+K{DM|v{d$O%yBAmYT$=vX^qVGrx9n}r1Dt1_vM1_ZHH=0
zFPh@<@xCQ>8Xn`H;Vs~O!8`C0<<sUX*Q?Vj`bpf1)5_$l>x*`8a$j_VeIs$c@E-3+
z*nsI4TN>LXyFH5l3M+mWF&SaCh2(Goy$C-ZG6y^-IzcktK1YMy@j!wpp4ofjhQ-Q}
z<2DSt4TD$n7E$Bkxys@hA{SvdXq#w^_)Y#hM8=b|ov(j-G@YoeNB1sK6;NSNDZQ(u
zqJGOG`=Ow-V6LF#ZG5`hd)6Vy;cl~%p|}sN2ES@%OLf$^_b5sR7iZs0=oq&3wDh#h
zbWM{D7?t<vMc<~X$dC-a9aK#nPSsU2Q_C$i(@@o3sw&X9)rc;g(a5TLFfg`$XH?Ts
zFtXKH7RWeLH&!?QJCkUMso$<zH-Cx4`*Ul&Z`?iX?I3Dl@R#8JoUYX*^4*<H8n?xj
zS)Le{A@}zt8k?0L<z7+ODF-Uo{tz)QaQ=Fxv(2-u{;e$R3yMDf-OzCbQzla*(}K=n
z({59oOUB;(%1!TZXY7&Ms@rhJA@L#qLhH}_+VI@)G2`4mM)|O9no*d{;Y|IOmR1uz
z4n3luYCnJd+-tSJ*}3k%X86s$Cv%K)l5w!Gb2~md$h>UdzO4Ts_rU$oe3unpNkmFm
zW!t?DiU#R*qz2Rpsu_5#Y<1Lu&%k;LrenDVg9bv!zgNs{yxlmxxo%Y>WFi<g6#C)f
zNX)6e&|)FQ!ZRY+BK$y+LDa_m4PGRBL*K&FLf^%1<Rz+1LCQww<hdbRj*vE5w&oUl
zPR#ARF+iiGB0SktC^3#YUcD2hxKo&zs}|RUYfMBQSriRZwK$tnb-8Y{PStnMnf{u`
z&ZTGP%SrhO``Rc43nmrSzO+XcUQ)xW6T2kcB(frIO?poqC)LyN<gsLIT3h3=_z&D$
z!o`ZkuWR{ZlT$y{@s)Yo%+5|3FZ6lF(G$BHw;Ws*t&hsGq$lvM_KTK9wnV~BApD~J
z)jT0^X5^XDa)@l#dlSRNAKo7JsD#d-LAiqt3$G5JWeRIj-tUu%olwhGB7#Lh_rf)b
zd(CxiW-f8=W3|9W_tt*@uR+;?rH>LLGNZf<<0%`Ga&l}LuUwT%CKa_9Y(MUEd0uQV
zX(rn=6DG!|&1~;=#@|FE^7hC_g~J5GV8ie!Cw(D&U(Bw`cCq#<+%md=j~<!%&Xk8g
zL!i|&>^g><op{7#bde3|Lkeq&XWi!hb<ILFS(MvI&d9)rvs<0CM*6L`k7;ZbhHV3P
zpReQnqppgYG-@=|e4GOxgJh9TLf1kWqZe_@bkA!=6`a~8^S_p!pBX)@oe4BQ=sY+t
zCk;_D%Cx51>4@u4X}TGEj-6)EyQpE6d$c%BexFw@Ri!NXS(e#wqw_&xu7G2bBZ|Xz
zbA0o{x@?+B`_?nVV(B6(A*yvRc7|E&(J<KHTS{!_OKZ*#HgrqpmNgz{WsDfwR`j-X
zgLRnoI5xgbjGhZ0^nLt_=Iy?=y4<D_B@x;3uDfTJo|YHSo3>b6a%hzm9?tUFU2gK_
zc<)+&vZ~uRJ!D*h`%U}1^@`(u=H+efZ!VH!ZUyJDo0S`p7!k@^$%C)+<vfbI+P+r4
zd#j1zA79?RM7`g-TZj`s>y0A6kw_)ZC(iKydZe*hbx38YynRx-9{-V=bD58ItGo7H
zu6*66C?40}E?2)^n9rKWWuUxX@EAE-Ts`C8-edC6Eo|NKJQ+L6Y$vfgu_|ia@GW|%
zKH3bM#we4hfY#;q<2c{_WxZ&PTi&N*-<s&MK0WNnwP1%}$Jd_WeS47+#M|l~eTZ?G
zvN+SG|GVQ#Xp`i(|Gw>+74Ip_`D%{@Li{2BmVn>2=);Ed?cJC2FQ%hInW}v5-altC
zemY+F%;Fv}=QDfrMn6toJKxhctHV7aKI|FIyf6PfQ$6ErkJHZK)4ZP1*SvMua0qvF
z_Vdz@<`?V!-1YeO`UiR1tjr6!>$f*L4?Pbu-)0IcE^kh6Zt~vXS3D|%f>DNtpjU=b
zK|4htmX*d>9yM2-eOysjzXORC0mCWJD*YBvVPM87d*%=54*pr2C@`1JcloPsgMg-H
zG|t+7_N65iomU>1WasA>tsD>mlb{Y<O?~~Nu(mel1%t|Z2#fSmo?yCA_)+AKA}Kq+
zzn_S$z$s?Pw-2855z~+#g9Ls*Lc;qeKs!hAwE^5C%2Y$jOjZ^|2Rw%dK|>ILV1Q>3
zz?~oB<-gCxA!tBQ&#yy*K;f1k=)cFv0gq3&1mOO3&Ywr9q%aU1@E0a<_sE9)=V)k^
zY^Z;pLq4A=q%0yO1stSo>||<c=WJo`aw%%k1-yXZAgSpL0^v|T-65ouD2{>8pRrWY
zaM6%`%VTVB%VcO`Z)D2kVe0@K4FbLK-~pc6nz|T*J#204oOwL>$exej0iHj-%uEJ;
zKE%bEk4!^W0W4zgWD4eFVqsz-<3|L8!Ec;Q%y^VU#s8iT{KZFR;o{=J!_4gN?#|@S
z&SdXo&dkcq&CSfh#>~dX2pqxa>}ls>=)q{`O#bIX{`nkHQ)go*O9vNAdpq#c=NcN>
zySng^kv)CUzqdcvY3gD5pKr2r{yQvSfXq+tFtakTF#r46z^QMZUgc4+^f0y26t%Pk
z<_!1_ehzMyH_ylafA9R~8~^J}jsKj<%F6Qkzfb+IxBl;`s?MfPBKEexce?QZXTAQO
z{NHc>J@F0m)6oBGD*nv#^Q*u_^CP}t{&&&%5p`XvHGy>`v=o&GcrgfwvZosaHSiD3
zpGV+11eL!fwH+}CBnXlc6;knl*w27#z!IzL8{BR>RCZKk%?2BcaPLJG7DjMObkV_z
zVzFiy8pz=(uml9*u^4oL%!au4(y@QA_xU!p)!+UsarZN?FPHtn{=@l;x!vec(a}d<
zXY;0#l1Eit*O7L)_uph3)sP`XI?!35|Hp#`IrPJ}|3lyX&0dAwN=KkwhU@hBW#d+Q
zNvzhp!)j-|oFr5l-tYhWWESX8-<Z}PAC4XgDd3qS;+O9a-R<JIx5m`1S#ByGZ<sH;
zkeF!_oqF-j8}wSeEHlcAO4PRTa+K)v4tT->NDbieApg&UU<NrlrE=T-Z|%$dn$e~>
zBHkvBX^n5ge3y%d%MS<3TCU*&_vi0|B|RUmx0fc1l$U(YhG%x0w59^oKAD6_(+k!p
z|Bqn;V-GYFUh#jt^j{KqI6=ES8x?47f4HbPJg8rGV>cUe`0+*Fy?MVnns>J_ck%9Q
zbeRqlI%(@-M&CUc4$ZCkpzh;K7K4D>`y>BlE9*3^i)akGTH*O}<NtpMPve@Th4g>C
zU0wptKRRrG^mDz|G>W8L{Al0y>#+HVpUb2loy+IS;j&NQVSJc-`;+OCVWmgo>38}i
zY;~PgD*4R)RDHke#doxec^Gu2uC+uq|NDDhkwA98y$05f!wk_~em{ATsd*^F{ii23
z2HoV@db~i>4>{i9`_n<@Ltr-Dh<IEIkpf~BilXhR#R>lJ)c;wJYeA3`%uJSIu6W}Q
zd4X0k;ZTFG@&fl?G1=jWJ+fhQ?=OK#OyY_{!aV38E2wqUme$Mbh7d1I2KD=K{;vpo
zn)+q*{v-ubJjO$QdH(A>t)i7fk=|^j&1mK}hXlH4hU$o3ZCY0!4nHp8e1bx)jQ<l8
ze*;A@m@EN1N0^Vq?<$A)r2EBEaY=cxkmWWoUKJeb86D@?+oQ(^`}!F@??$(S`C5bt
z@xreN`)-QQv2PU*f@r649%7X^V760#_0N?S#|Ua{e{YQ`i{2!g*6FLCYFKOXB>t$0
zPPa&oLlrOZ(D0Je<~{Gj)%ubg??JLxK%&ug%_zTeNorkGqfv<T?qM|)%G2&Dl!w+R
zH8?tP&af_TTqJoohU3U(;DYq_{n=VO|7ZJT<D(m7+>Sop$)hAieom_!0)jeD10~N~
zcHJnUPLH?z0mFbWk?d+l-`~~$c41j*Sph86@5Vm*<}8n&nXunAlD5mk%ti_BL};E)
zT+%}Qjt@x~hAs4b5W3Fypmv53dSF6^nW#P43=6U0l)Rd;Vm*#mPMJ~u)9Z29{n@I`
zH-1QzUV2wdgYM)i)uKOk*`8(rYz|+U4Mtuu_BtJ)Za!*%)SG*}nARSJ?)!9+!qNBl
z(l6kBE~eDW<+J$hwuU|s<gt}uip|d$$8q=CHg2Y(I`u@6qN9V)hQ4I^?aJ}|qIlZ9
zVHMGB<nFA$!8=S%J7hC8=A)nc`usP1B*V1r8pYU`_Uo7Z%I%li%6)HtH__q<v@t%8
z1}O-M-^V{*p>Y{>fO@RPvW03A6?DRJXO9s>W4nyw`P72v<UVo^pD;KRhOR>*s2-q*
zlB)>E&o)126O;&0R|lHF-3n9Nl@XWO5FSS^1?AFOG;?NiMjUL&vfaSn@rc9w013oy
zudLqfsP$4$Q&WJ){jd~Yx^CIen_R_eQvJdS--HIf=^L7Wj;@k)EpGy8i^zzs`;x0&
zACWtQ>%2|7^TSK~met=&?)tScLUwyg&Go?czILK)HWex~h;+VOa&JarZW+^c=!CHY
zRwvW;8`>VbQ7QCiL>V@{W#7w;UR}@CP~64)%jF{+JrBd$epK4`MIYZYtf&nGk;O4y
z+w>e+79!9Z3Vzp{xJ#QI6xG%<<#rqWiFOON^;2TB2Zy>Ze+KFP4X_S$$xu-9q6SpU
z<HK#SjqC2@0LE;!il)oB*pi|I(Wck=P9iT-fRb(fa!E8Ol`N)2S&qA>^?X8xk~ypv
zrkl8&r$IwL^G#!^2Y4K51=jigV#b<a6$VG6BbJzNzN~gyYqu@~cP2|30ehq`qgcP)
z&uX0Aa)LtEpD<0ns=U|8e0Q>FmsnIhD?e3BTB?NisO3j15KM=Q&ZASy9uL2uzQew>
z$(}aXxKybD>VT|Enceg5oJWV~PnIC@RT-w4I}kK@vymWDJTh6o%D9mKJO?b$=~#h+
zg_ZA?@?~&cVKfXwv86u;T8r1)EiDQoF(&VUvMKGmQTquYOsGrAE4_kNXjBuLE5E*H
zH4f8CQV}i`<n+1TOgo%ZmUrd)>5;qO@<Vp1P%+n~RCszLM@70YfDyVMm3Ke;vhnh#
zH?`Kv7+MSe)e4k5uuetMXwbORA~wVmsF}tQUpQuTe%MwIP&Ik>pT*}Ze99gm;03mm
z_~b78({{>^#V{~j?J*KTY{J#|QS|-Bny+J1)suX0zfLv3`FAkLfp0M)FzM*az9u!V
zMbW!Rv`>mD)O8`{$6maUflB7%4?l8X1q;#~O~nl;6c1ng=EXBQIJs<p)YXSj{;BAJ
zkcw8|S=AD;CiC(e{f+O0xsew#@tfJFTF6sCN2=1rLE^p7>fVJwU5E`^6iBK8?}ijO
zHXXVh>>akleSiLADS=dE@p$!Ht#5uN#vz(art=851lOZo!zygJswV4cFl%>oDgRSV
zOw;#k^agS)7Hp;r8}#b$dZ0+vYn)=0Fw@_~C=4J<l{}}2z)#=+aD+Z>G}G2+B&LS!
z^bE|1x=F~iciNR^{}y>uQgmvS-guIV6TI(5(3EIon^WrguHB|rzrJKG0$H!Ds$WC<
z_VVb6bHRq)8HEl`g&T+Ry@#DV-$in77f8L7`3iL}8w&Nc;PRfXcpw=3^Sdn9k_u}M
zCOE7TkIU{;(}SR-kPdoboo=(+Y67QsMY2L?ddzFq?r>5T7XoX{OvT<GK;^Gk-+Cv{
zuSkZvwvi;m<mi3a_$5iU7OVt$EM5prvA^_`BLZOs146@kaL>ko{QJZA_MpB=+o2mn
zf<{|^5T*cP<*W8zJXQDtx|&QyAqoNVW=sUF=p)>Bflfw8Y6A(MlCrvVMhmkH!AY}W
z(Gx0}k?ZZO3LKb*(E#>_-@8*XWmsUf7=;h5(O@*cyAuf%yIY`;H=CyEsx3uvE;@)%
zV$M$#Q_B?+dTu0I;m5+rR<(($ZR!O5uD+HK)Lgs6=A{M)N27fVGf^PhLj2GQPV?Zb
z>T267Z+0QCD!_T;B=QgNBLG2Yy0q?>dSYm`T*co2*RfpD{&1rwi9Krn&oRh>;$nh}
zpt~sg@*UH(1sOsZW*jQh<zgC7JuFUL8A|j5+%yzs#L~3)*m_v}kG~h}7reAMi+{rD
zP=boSWQj)pJ{w|>$Ay`fvEf3SCR9EWiy&jBjcaXS@pMEx15&*=Rekq+tj`@XENaYw
zsjbkC7t20pvAEGGZ+Dr$VBXN4Q?pUC6*N$I&Q+eR9d7iT_9H14jAB%qE44cbJk`S%
zjV~UK#uKBK^MbmQv7hMY8e23tuYnq^(f)37+V59>+Hc``Ptv^!m9#n5s}N1rKSkaG
z1l>)jiCb;>EvzV^x*JpGidMP60~(M1ADAIXPM!dh1Ie!#s<@_pHKnF4n_?_UklaWC
zQ<dmAy!*2Q1?HvL&AfHRMbdXQDQrhWOT4!wVub-j4Jd@K75YAFoOw7xof`3_s9xrk
zhx3Q@2YwBB>v=ZJb)OSXTpaQpKzxTix^nGsNE`C~?>ts-@!8Eh21+;Wn-4fCS=Rd~
z?ni4I%^f{YZ})5B)4;XFb7@CKFO*CH(0NwT`de$_m+I|^j$hDyLhH*X(5BD_-%poQ
zv`y@}fx%@l21c0;CHaMK{%JKtL&{WKv#;J>mh11(ouI27t|Nu}o1_WZf|}Qjj<*FK
zHG}Q9D8EU6ExS0!=OUTAa#Yvz%-@@>%I$>~(z6T+dZ|3ksshO*Oki>MfyjMP(X{1o
zsky|uUc<B~Mx8#*j@11t5ycyDIl7}le~Sn#@rLYL4nzH-(>zu6K0GP+N93rNOF-Uz
zRWJgC@)nN1&xwkii_Yzm`?9NfmT%ol0EEJ3ZMJ!yyu4yBw}6B2u~v;yD8im_BYG|c
z#StJRM#$xd+oPih`1YO8(Z%wo<;{obR*A(%KT55hk9o?oyYMDbg@OeNU}f=m;xZu%
z#YqLT0V<<>SNM=&5=V(?mqzG8YN%CU*_N6Ro!!(p5U}l3OH~$UtBhT1kQ?JQ1fZpH
zth44MqpgIX9EBKgjn+c8^Tuky%1M}H5qqD^Hs%b1QCwaqEVf6PT_RjKw=%a~3A2ka
zHf|)<n1-&o_Y)Y6L`3VoF55j4FH|8iYW{;$r^EtaLd5QNODd*8Us*1mnFF^|o^fZD
z(LWge6%V8)CLK))jVbgTg)tI@<p7-#PF6}R83TdY$fsXmm3|HKLwsp;mz}Yk>L_O?
zw^{^j$1KM+qxJ3{afDbPY3f4NU>2q#bJx=Je`@g%_h42@5JPxB7(JTo@bjzXk5_^x
z5g6ozshs1+h*P*D@ejniv3|Jo$>-YV4GaP=bCM|2)zY2;6Nh*}Xz)@HKp7<5M<`yW
zX`6j8b-6SG&_&rCr1ttk-5~xyJfQ}hK$#O%{n-1`z9BcBf1VG(;OTVN)7ti~nCLEb
zx+j_hhL2ccf@?aj5hiyj(8~(ydib_xjpJQ0J8E}4FHVhtVVLl<v;cTx$_vrDC>V`j
zgKKQp&_=jR5@cj6z$9#Q$5=b5bTQEivtSt_Eo89|_q^Pv@u27zK_tQ?ICKnk+V-ur
z&0Fc#Z;uE4FEx?=R#Y}&u#7NFeu_$YQmd%0vK;@liLSX+kJezuQB3xLEC)I&W{BFl
zIYY~F*WpvnU$Ci(ZBp4RNW^}XHy?s1D>4F9f$c@dKB6yb$}xQa6qzjhIw37rjkM|I
zBdrFeq@(mG?bRY>QY25X1eW`FY_{k*e|+VDgm(7^u*H%P98sPcUALT&#xJ07mg7#<
z^$wTY|5@`fji+eM#?yD5Qk9#yEf(u&z5Kby`P0K#S6$J9mZrMJLA^2w6+-Hwq*FM{
zYnPUq)@}))2f21CD|kH|Wk`}uCDVt6S^zDHH5pS(!-p>d_@USULa7=>nNxWmIjg~r
z^BRU>M)e5K-Ye)2W+BoCnXJu8&ujdFJitL%p<3!4RZ;%;f{%8)RC1pR6VEUdO8>3Q
z!vTV+A>X&N{11Wi8Fb|x^TD|Y_?8idfu=O&bq(4>07t#C+p}%LfQ`Gqx%d&o0!{C`
z5(t-^jM$(ggEiw)yR}`Yg+aFhL(2hhTWZ=MA|YMo_B$tP%V=k!SNQHE^NZ!Wyu%9J
z-sEq1#S1sK@xP`%e|rJ}gqxEG-}S~jtFWg{Q!Kz{xg&`ku33%!?^f4*rjOAB7!rgz
z<=D^)87ziDAN?P$r+R{3RcVBu3becKP0PB^F=jZOB!&%jZJ~X52Y-cB^&~vJ<_t?P
z<iFi#F9WSb?K5j=2hirJwk8ADiteT1P3StQcnAdaBnro3YgMt;a+|N66gE8ejc@z?
zZ{^c382wni1sHS}ry*Lr&yf}Z?l6+x$9ntb<bcZQ!D*0jo7iiIG+f^)mq95}yDt9U
znj-X!3hqdN9KH0ZBO@q=uKW5uk880u00wo^m~>r+Gx&X*nzRAjrm++_QQ*hu0fllW
zG7HVK_1<_9Pyur6Xo2-;>-G9(`ynR8;C3@jpKhcZ3Lj7*$h}b<g?95NqZA;}SXu#+
zm#kvA&(6f$ax7fmgQLB{$%NpvC&S#Dce)#d#H7Kzq*a{(F!Ydq)Mv~B_J^;e0pia)
zo$4CMtV%g1K%TLitu#!XqxN4iWqhWE0HbAt1Zc2fjnVgH+PHuZ4O!^#_5n%2_5SwK
z{i98Fq*j3g{ZZR(IqymU)MQYQ2EDdzZ3-q#T@I_a#5z@|D&oeZZT(V#6Jj}uzW;sg
zg8$<^M>bu#Dm0Ye)oS=ul<AV|%v5%V4NjmX--;Hndy|1V*RBU7jKWQ<)HmKot@TY4
zl~t*IkO<xE;`*FQ`=sP5r?2NrWuGTGSrDWwAGDFkg!V%Ez2E1Snvce2eUu$FS${JE
z0cs!vd`k9Jmgm-oriFM{=yGbVMTcN*?}J)bUom`9*6xFCV0Ij~KRL&l%{2d>x855O
z%U25ykOeP#IVgJG{QAPN1?Q2J;SfogYT>9z%GnP9`kbQVIR*j%Y62N@>HN}5p~G6X
zFoDAypx0JK$i0Ki?JWT5IZIwF!Nn|AGp;P2Nkr(3dftX%f<&a?{Wjdr@gqP%&@;;S
zm`fQO6}USN#x<;rJM2!k`lmja1P64l9MhLKe9i*0(tWyh`7eEs)zE5socu}u^Kpqb
z@5@6yET!c}XLDwgDBKy6(fOB*pNDRq=vH(|JT5M_Kfpe{t12Uijj|p?18g9#_bS?d
z%9xQbf8>u!>n=K3D}RGj-p^7kwE@5<j~;!auImr%=DUs;paPTS6M&Cj2zPr*!R^}v
zOH$1FvW21`{%9;G7}uJP=r*aa1F32~^i%O}ZQ1EEItx-t@r0^c<MSBj$iP49d*v{M
z#t&M~M&u;1Pz0mpPl>(v$E&?l4^+!I|Jl>vWq1?AoGzwE;F@!J@1SM)cCv#PrKmH2
z6ne5!*v&*v+`aYD{_x?dPoU{Z##r}5mkI6iitUl{5F5{BT8xa!heO>yk<-mF@I+@@
zoeZ9WJ)jz9ruCDECD^Pi#uM;s(aJ;@%(_K@aYtuxh(v?S6pksv9z>)s9mBNAGKS>z
ziNkSNRsGHreAbc7pT{&Wl%*17UdE`>;J%pWw&bcFdmB8Z5>L#x*tng!r>1F&?F<o*
z8b{~HDV!g*qBZp#geI{dBmtpQIqpp?_mdvny|wlsw!sDffy=?fmsXfx{98OP0t9hF
z5D|Yp<@6Kf26K4OS{?=MJk5GT#W_F)xYz;m$wHlqZP#|arQG14A4mxPIS1$qIfsp<
zs|`ElH6uK`nHUj?uiI#D(b2m5J$}!bFRAPM=DV&2iN3|CUWtGaMe;-!5U_vz!PWiY
z!yC_PZG%9VSKbq!qnUV+J_vs#<?(yCHmB6@hy8lB7HvJ00_du_p@aeFWW-3FHa+S6
zLPjgQi$%)R!%ZR4I1N-6_%raLxc6r#eI(W_j{^6<<WVQ65?Bdm(kFnCc&re+0{NfL
z5vC20e41X+Z>4$&lr`e=Qu9HI7v}YSk9(HbvWGDQ1H2#aFLjT8wk}a;`FtKf>S<94
z{z2xU@fV&0)E)!aso3!7LCw!f)xo?f0`|msFOoF(1`@3`94cYqG)g$3Q^ZREM!`l(
zXg)sCMS%EiRMLoPnx8?1-VbvvtI*>)(mMgWG6R!==CM;L*jaNg%jZ;z+Kx?YIqap~
z&U~%)m+tQXnOn!vvLLO^B;}Q3Jq7<fth*tRAr2ZlZQD%Mwz@Hykn2MVd66o#R`*$r
zE&*yyI{(DFLB7%xAb_qf5RjH)xqfO?1@vdz^}|Gj2zEd#Q6|u=AmV@Hc>U0Jd#L<L
zYB?swe$j~PXPeSvJ-{lcpf8sU!-<kjAgN76IhP{jU-YUL)h!%hgTYWk|0ISCFc|Wa
zz+P$1q<}h(6mxWbnxF^qqsf05N)4HUB{FtZ+}P}k&oSNuuh)A{yJf&rqOZAAJ@Av*
zG}qN42IB=*LteCKs9P|w`@IV&CKwc3WNx8pA%GKwhk67kpSKi#>4~03d*o=>We$4y
zW+ZjyaS})W(-zmOMyM@4nWTr<X>D-!a6CH}M)n}Z=qhUTfh)260Z0v!C&HgSNHeg$
z<n=l+8;!4rc6ap+xbM%7M~B8y>!C1LqC*baH~lK`>y5#m*Plc)>o%G5y<BQdkk{k<
zbnfAodmMM?^2DCY3ElUr`sm`ic`@oO%g70{ChJ};j|)@}sMg@oeT}U8*6{9aJb{Fm
zBn424y?aFZQ#Fky^BUoJ3w)qiysQ60XazX<5HV*!S^6RP6`hXH`?pG<yqoUs3TkQO
zzXjM8y)aq9kVp&zH;u?Qry+V!*2q|~1tBxaNZ!FiN_mfRAs&+?0TQ|!yF`(>YUKR&
zwEf6xXcCellw}72Qz~ZhT{zM>tdxJrODdzpVo^bb^P+GREXVK@Bca?Gtda-^tHYm`
z%%Z&^2ru45wP}310#M(cyHxC^#xc4zF{KKHb*%qpihA-ZvSWTxtOjW}FeL^gQ_By3
zALcpK*1VSAMv-htiZF+3Joywc8fe#O{8xelDPGhnUGE{)At|CmqhKGfky3Dim=UaR
znRuZAu?1D8<XOKAEGFwX9I~!xEeLuUgpha&2%g8OMCrwPi*$d99}Bcoj38wQqBN}<
zs>wMiZ*=$SagtwM4;zKw0y+$4i&_n+3g?}!<eCPJq97m5xrT{k3wy0#WEPqtJJ6Tl
zGn>ehV%z{7GyvjVib8-xASlBA{jorGhfy?RLEg~YfuMIYkdcAdv3W9TK_<X3?MNk)
z=8-i;w<sL#Cl~~h*ydzDz32h$CC75k_scYyJ_6bZWfEHgg$>WRzecIUeV*`RWb}J2
z(kb2?%)_Iy`k$vCZR<X0y-mDNJc0Usa`Gas{&F9kMVOM8p^WOeY7|F-s516P4A=&Z
z2I6LS#&d^1)f5+MZ~k3zAj9T|AX+m}z#-OkzflYdiH*q%5{~15#=+HqmNdg2y{MAw
zY=v{zfyW*&d-q=YgS9SDQ_~OK(sbPvW$<=GVn`1dU4QS*$d_ra<$(~{qS$6Sl0?=T
zC^%Sgm&es66pV|4k<g$K9{Y}HU*YJ0^>?5)2uuC0P<5U_UM(!7HABD^lU}9KFDBZG
z#CPBt#6*Nx$wI+4DKiRJb>kl>$PPp@U9@o@@AdJ49H|nPm0|l-@dby`1tk|OzhtP4
zb#!~wPN^yXm`msAU^4j!4s2ir0)nfK`R`?ae_i7$rw&94DN+giJ`S<!;=*W{6S|1}
zu1j>5c)s87wIo;4$sknC9k&t2Ft}i<Lyf`W5hY!<9%(AWyTs)9!LQ_yF2s>cx&dKx
zTV2;x)m1iWpSy8ML+ArqDkVByAE9$h9iN~4BFQBTsW9@((hvvBq!^hSKG7clZ%(lY
z`qaroQ)KV3x$17TVvwU4@pk1V)-0#%efn$1&vtdInEaQE&h*?cZW{cWMZ`~|MAI&r
zd)Cw5Vt~<?r0L7@AYq!h;E{kXj$y$}<@bM@b%9=@*-FBp0iOYUmPYqkgWCtFr#0Ox
z@b9zpa>riWVdZ1fV@PF{O4EO*Po+;Hn3EyUuPv5T+I6%g1(%up8s~xUM#K+Z2s5!(
z+(RCC^9~P+938Ep-C)3JBT<6RF+n1q+o1O0X0Iev+95ZpKIM_Ls8Se#)rLa1w&_;T
z4|)*$FpZziWK)>r_pcuk2p<5Qw<cT0g&dPECJT<lXRM;_W;co>53$@L!9J8BsOx>*
zCD0ZX&S$&ZxA_yFaiAdB=*Kzptg@}1_fAQQ0Yt1JaQ~IV$rWgGAO{=)09O)<i0_7Y
zux~uzG-sN2&7|98QHmy2Q;H~Xr3_d8MxzXiT%}N3iS9N(O-~CU1>gaC6_-F4?-nR@
zItVF15v!reA^R1cqLAn^Oo!Ah74;?tu2eE2knC5E>Oean(W*Aa*JI;{gWDUp4Ksvs
zau@8|b)|6nK_*0g-%ug~T6^HJkF!xrJ*uv94m~VIg<(Eb!p;Nim*W9BL>eA)$fHJy
z4@fyh#C2*le@G-4JtVR%DJmW&3Ab}+7RFej?{D=AARXI>2PF0YBHp{wl7{F6U%1P}
zLN-){4zCyv1WR{CwJO*Mvv-{bkhp}Le9{<nT+2Q+W_g|8VFn@l@syyAuOS^lIhgWA
zZ+R`DX1b-ZqhlZLZ>`^^SZH%Ip@ni{9EE~zY+!{%1D%r<R`J@rcZ<HfIvfII|Gbyt
ztFI5a8cvhRRKU$o2#Qh#K0qdwVk927e9nSNzyQ%Ps0Z^kDZSBwh&auHPjq~M%_EmE
zrS_Y@NR&+iGy(0rCd)aO*Aw$jx6BA*pWQx0)#Bf+hxfGgl5HmN-FL=1AAiaFE7fNf
zHlt4CbYJ1Io5dUfvKeeSr5<7*bPxzKB)b}&E`!(oH_)+rYea~Kr4Nt27z)TaTHo9?
zidVh|3;T4!;O3F3^wWq=gXgpAF_V2*zFC<7{hXS{Z1B2)R5bt{>7-y3p)IjlLeT@!
zSKJ*s^TXTZ$fQ4qAgVE$+)H)kV<2GP5D#^v2lN~C%#zh|v0jy`ph1wU6o;2hJ-3Mk
zso=rcvhP`h+?!_|!D8f)>a(o_8)Ryce`v_FfC#iapf^cEGp=4XY-IP>ObfdS79>pI
zrV;vOv6)6`rHEp?#We#sF$E7iyB~wkj){O>swg==v<1&PuIF<i)`}W}TK<OSTfR{t
zKxlESa-4Q@!i%jHz~IyA(62~?9|4>+dsNj&Oh66W2ulXj!hgN-qPCZ`1~XL%8Y&J`
zb43OjJ0cfd#IztUN{&orr{YNs8hBY6-gb97h+{+*C6kLux3~^8$0`DSrSw308oO_R
zF6{V1{U|=UL5Os3fR^h&5PYjTuj`|Erst$d>Lt)3bi)ZG-?P^8{TB-GkperftM&6!
zA_)(|jiM3aTlQE-k-v+#t(y~9egqndjI1lP8-q{ndPi;2V6^Xy%RNXT-+>&ED`ZKk
zQkmBQ8HR^vQq;+3)*XQ%y(5b7aZG$Ic7-6M>uvffg%hdYaNQlrXJepjX0CkAW2egI
zS9_6n(L)JSZ3>WU>ALQt&|5g>4dTPJ$-#zGtna5H1yrSm(|X>vwZ~FTN8(f}5zI9Q
zbqjQgeXqOL5R*owFVNoy7gNTj{FOlhKaqhUQ{SCO%1toyU_k0LyB&OjH!4@J(31;^
z{!}-gZu(EmPNG6YMy_^ubw*YnQjll9``Rs+yjx8WphYnaxXIM@#yQlnJgIr_<s>nv
zQ*q>UTs}1@ysj!3yHLlP*4#CZ^?Pbc0?qD??$49DP6L$X?ngi6B;RSq<?0Qbu2cMs
z@?*xKvr81iaTNafnv`%65GD@eUa!HsPe@V!N@vEQXWT|wU7;w5*!$Vqe0%;wJ_8-&
zaO0i4-8El4Nr%{4+v}=X$E2#UNSWj{<Y~;B-bnAbz46@e7n{qD6$ZG7zFmAsr0cM#
zggf1o{Za{jfV4cwT{Vjm{)=}e!Nb`r1BG+81K}ivhI%~BsCA9vqL!G~>)e7mRZ33B
z#H{f*L&}#c37xo4-O-LlOL-S>Gv8?T2}dcj^NN;weoBy+i{O*+?vN~X`_zyy9A;~V
zYx_G2tn!Z(wt)))S-Xl9gTT_v4s>rTaY^Yed6llkCbi7|%<q`R6LH04>68v?pzyyQ
z)SuMd1hVvL!v_Oe#i81;Wq7;=F*XoJpAjGlM5<wS-+)6|_8M*Cc4`|q)o@MDzv&u3
zX-o8R4V18t{y<ZmpbsXEltExOe`O2k=FG)7K#}1P+Hs$F$1I~YkRcg*l8GkurpQ#^
zb*4vnI71`%ex|fLz}&SpLBd`veCn(-Lud3Zy(s^N)M@TyymLp_@^MfeTd<bySqSNe
zgn)(*3FJL&5Lqt>xuS(vxN;&#{}1^>hNuXXR@ZS9L+O@^c~OOB$-z+CURqV!NpstB
zHFTMe7{!0W53j@Pacn@T@p)ST4&MwGg&+_Awwu5dXCl<4b1_^)!{YnuYsZd9qhC2D
z$PhZqezyk}7mW{u4zWh*YN?&W;h>UkXbAGPN>L<Nk;w!me0TbwYuw_GN4c7QJ5Ui3
zukK&An%%LN%bJ*U$0D7EyG=Kt`<vaI|02M`XA8x3yaea185d34T2Job=S&pxSzkT{
zyK6iMi9edXStEg#AqqNg3FtOVx;H0eD_nBfas6v8N{$pv#i29R^Gf|jst0yX)pgTE
zF_5{##+nl(Gsq8g?KS&oTfl5ocdSjLksU=J1^c-j$ic6`^wMm73TQYiB4NlUKsRy9
zpd1}ySV1y_fl0+{x!bT)ntG*+cUmM(&#TBxZXOV;QXC>NDHPel`N7lhAy^+A3UNK7
zy><jPN_HA8Pllm#gcnG&@cf0`_NU7}wjVa_VdZ|kH0sNj=rAgb2s7$=#z=3-z|l?b
zvTfxcCyJR%-yZA^#>!ueOOWVRRoD|B&CUM{3BSSyG8h$p-D)~<)+h<Rb4bdN*H4P=
zyV#o<jYLWy-}>H1;=fNJlJJ5ak{UuY0~&qmH3_L5y!E5=DBl-HVNoG>AYB%`*qoIT
z^rG&x$Vgx%&<@pL$`y~Dz@s?PU^?OC=z0{!dhLiip#kpxlxWZ*7c|#&z4dVvp=hJq
zw?DEZy0@wEX162*8dk04TPu>#=PHfYsp=8()J4Thq~U~X-Rb>|4J$ze&LIul-NdxG
zGjF;TR{cWpoQzV|x{V}Gz-GghhTmSn4g+2*5^&XJ%oElp2bU2%W2tI!P#|)r%w3^s
zgRmrdB~j3O6N1VWBTXo~e<YPjMo3Rm6%hJn#!5DjA)-YLadEMzEt6fBs4SR1Qa)*s
zutv7aZTAfCYo?tafPQb$_L+c?c28)xlnySnqwXu3@;tB`P_3D?e`ReNxt^oFKK23{
z-HQOm+HbpBL#p2`E}_d2PFSx0=`|>Xv){B=e&hSbLLda$1PdZOv7%e`yA%#JN@_FE
zM7Hu5^ul<b3TniJ`(k5rCS2&?|I}4(-p=$s*GnY2m9Er-f76?jVuJyU@;~T9MDVS*
zD;!LY%v<eRJsdGsG&f9%{Lhl-A2=rH{;IG5lM4{cgiis%r9x_`N@|%54?&TF0)@i}
z9KXSbQ?@xX;Ly@M^n)g0iY5T@rXE-mYO@Nd(!$O*Xx#o{K;@OM!oDP`r3yq2^oV+)
zvich+XEv)X7SnmhyhrGo8ebAf;W10~7*#5r2>5*(UIM5Film>CA-_9xQ3^SIhjC#W
z`DtyS(9012;M_n^bT)LJD-<lU!fGH42E;y~B#fO$O*IruS^isu$RRx`R1_XOa|+Mo
zI&0y?_+qO^xX~eUGIts~#R`dUizhhY-ga^%({Qdf?|7dR^5fFw*cVpl#GD!BK;}5D
z$8t4&L**TkI@W8Q0xHbU*q2;#OL>S(&A!V#05G*AvU5v)#xtrCpciTgEn82HpC!W%
zWL3z*#>f}c#aZ@KFAZ@2HtAWQccBB{VA8olW^k~RL!SbMkS+{mvq8psl|h<^ETH|B
zVnS*r-zcgme*DUYL%i^n&CGl(pxz0#l6RDMJ#}!2eU61L0x7{u5O9PmKzV#65y1K{
zxDOybmItn?DsgNYej`#Lu~c&Rr^q4^gIZ+^slutA_d&j$taz~rm)3u?_#yNTLTr*%
z*BM>)dtX*-+nMqV-t`!EoyUiJ4tQ-d+?uF=$q|o42Yd?8Q3tGrpB_B6iTHf=lG}nM
zSTAT&I6Hw-Ny*Oo_~$DobCfYkjz$>3zXVyM+x?tTPSFimDe!b>%5}Bx^sA&hm%@<+
zpbKjLZAIciK8kk0W4}bOjRm3%TS)|sh(~2p9Fjn!Frn?^t3|*vXfdDNJB3FMa5*T|
zk{{2Xiilyr6*%d))a>rg5S1K?tEZt119(Tm(x)rBWM+T;1zspYAc{A8;_!t7-ZdZ2
zU*=4JgB#=Q$(aEpJ3_(v8eobkf(RkspJd)AQ^<$UKv%DvN1R7r7J%sb_l9N93ZY=`
z^*grfRBgL)WC4aKEVetHCleN6s4Az;QDPwQYiUlG4TxWEzkhDE31S6)Ju}aDrZ)Nh
zWVVb*3n<wETrp{M(Qko<_rn}~iq}!+jlDlG)~yi0sn?4_@T#9wnXQijXgt~h?j<{b
zkWIa&yNLJTKLqC7=RCN!>v_yL(C8fl$0Emb<GB)hOi%((l5F~^U5cujW9_^qnC%U3
z*4Lu<MjgV#(YoV6yR2qYZ$J1sLWd(jXOw5u6-tyyC72PHw@2;Et;bzR1m-2q+at2(
z6ZRp}MTutZ>MsM|V9@p3#lJa?0NiBmNI4UL1a^7+4Xqhy2UQikbe`&ME!zV;l4af<
ze;SK|$PmzuIGYequsd>)yhzD_$+j_?sY%iu&+oE%xo4z!q#d1CG>Tf-hgLY0b6DW<
zE<L@1r>sir4>S;f419aN8cw36!x>#m5cDZ~y}Q?nk#tyx&-6Q%JK&3J0z4CqDQa4l
zIBOZ4s#Y+{`^2ZmA2H}WcS5)9M1V;23N(KJfV)FUd6U&(FCiO2>Ik65x%gjwBS~+r
zVQT@rbZpQ+ivJi5k)So8Nm-wy^~>%T@~?gLX?<_b#~B2Oc-?Gj$8_y(GyQHXpJepr
zc>WtnsTARDkph?9Nhw*G$lJdK(o76gk~}K#sAG(4z^YWtP~A^<m>rBd5wH#vHus`$
z=A`vdQo?EtK(CXw$O&lSlq|>>Msonl#_ClPZqYX=fYN@Gv#zAtj#Pk{2Ppk_#+o6v
z^u+*fgP)$;&;Ep`b{0dUP?+&3d<t^s*W%|r2%xB;SN?!Wr+!@gWh0=sDL|np9VJQ-
z{|ZARImFR%=uE%vH5T>*8pxKrX3&mqO04qnxNfSECr;mKzcc{y7SBV?-w6Ey?jU>s
zRP|=^xYu2L7n8~`M=u8j7-YZ$5^;pTfId`3k~FIHIM6-)%sUduVkl=e+^}CgxCnSL
zasp{I1>RR!BF%JC!aF~TkD!}}NO$I#2~5HVwqVdb`Dv0$vV4DkXd*F6aRRz*dp=L@
z(MkA}-u4&_y4dADw{T!{DwxAV8~Xf^<(gW8fs9r}Z;%S?Llr4=!#@JN$&@Mzs18EA
zbe5mqEyh})S!Qs}8bH+pHEtSzZCd9~bru0lK=tebz{C}BcZhXHac_mZkaGYcuW;lD
zfI{wyUBO;e8hLs;-e3tFZ5!5%G*WeV!RwV^G^@KZKxdaLGkM~Jx)KVRH@^=`I%$wN
z3rG9`F=d3e5+Xf|OL(*SsLh#ozcSbxaA~?a9|G35)+dQ<5l~x)_;|ma01fUi11;4*
z3@{KBf>_S^<g_reF>p_0bKBQ&KkkH^8CnXrh7}H7LUH4t(J+n;N?avXtyGb!HppT6
zleu3ZK_-xp7P{wnKLCD{scL(G`pAhDDxkUg0kN}-%Hw<Eayi0#^fh5IfqGI37R`uK
zuvaAD9bK(NJaIZHDt(GLBTy@>MyE%8H)9n@<CNR|?p0FBiAl~N5KxIdRv?C<)dc#t
zn)88{ce8)#6Js3<ffp=HVDkjb_Ex>IAQVl<9&Gi(6iG%>gb>6NkYugIcQd!w|L_r&
zFc58A)z02nj);|TD82BsV5(qd3sB?_0Xay05Nk2~3PtH<2`+=4LP3N*Oy|UoiMv_|
zBHpPH;}##*v?0qzMq0`#dV~>y!zR(&xfo06w`byo=<#Uvwxu(E_ZR9tUVv}Pjk)za
z34?AEW%;wj%N!*v;@k1(oHtJ%fbKq}s^0iyydCbnCTUl{TW6gsH)R#hPqkR6ee&Ua
zK(}DbffN_w?ty_f+Wh1^qBubWNxXRuOq&bPb!_`K6|VbBoLaeq0x&D<bfP;UZoVIF
zsl=kBLIHkkS)lVnng=?;xC8yI;yzSC%P1fY*#*Mjj13d?+VQ0ULG6FPr_b@B6L1u{
zycN-6^>Y-1z7z*nowDcwwN!*z<K^=PjQ|G(IBtC~w=xh4;XyQ;Z@5GMTAA7dpQ82I
zb_Pg4fcrli6HvJn9!OPw0^GkI=v)_1Av&tIJ=|E=0}VziPBbHJ%AQs$(N8sio$rb?
zFG4~D8F4rCYY;kE1QGuVtX`TY0cwOz`wA46x%*9Ke?=Wf$}M~i>$2KKP4kRRH66#E
z^rlx|?0s*@fjLgNw?5w)k7DjDrA4Rf?kO=13O<LQ-cA%RoHTFOECDibb|8k}F&<zq
z$D}baGcZr=2u{;+id^vA&an2{1`flrYkm(DkXIOxDVTJn*fb8%$WI-G#V7CPclwHb
zplBYM4ZY8??yp<0EBWrvzAtJHm`twPfgE!9l$C+4A484<vv<5P@NNm{foWVu`InKf
zGpPLZz0{>F2CH@a?!nb}>Asi+vgcHn6_9!>yk&}kNl&1G&ZZQhu+0TR)z=27aaE3=
zrh>uyb^oXlgrwkRgEs%i2R)+{4~(GJeWn}JRL$OH;Db2q^G%YDODfu$(y}A<a15Bc
z<cAjwaR2lJF)$L63-lPIeYQt3S{ppQsw5fqIJxzLLT>>P^oo7t^*1D;^`54#cA!A&
zv3Xtr73~_xgcC9?Db>4WGr>J;ozW@5X)zLh!yGdzgj&wb!YZCscPG`&N0i4ExJlA8
z#*uuh?Kivhqep7rMFioodr<I$9O;;b^su~B>?yo+;(X(ZF3FfU^>k`l8!Y!~o~)hq
zzZem6)_`(g#mt%v51YrlMM$dS8dGn%PO!7~9!cwOtFj*h5}ht-Yb4q<ipdY14k&uX
zxM%C>^%L+Z=^}PdBMhVx0#t0V>iZonN5D_y<#=`?L@KuG5BP)b0T5XpMNc8ao?Hr_
zLZen1nDC0`vDN#sPBxV0b=e(5YW)Myt3Ad1Cc<UieN&ghv3d0MX(`I<m)t($&a`X7
zrvS`!<OiDJNIyx^#T1$I^}zSZzTM}G1Cv9utRK$ReDMa<?nwz^ecL%(=O_Qjg2_8s
zwkGK&tD#M5B00C@XGF3|rcgA9Rz=f)N*>h)`yLsBPYiq9gU(1c$59Y4cD5HpTqlMO
z%6~=c#8B6+dT?x5UgHeZPU(@4{H*fRoCQXvH!ndoeeGkzE5Ju3>xsR2zSGip7jOZv
ztY2;mwp_ym2sRYlptE399!UYNU|xgTaNzecQ{uC`&;6-vWN36wuu9#clg#B#Zv5_t
zJlbCKTfi{j3=Gf?zgE?+5Q8qo*n;DaLns7j>_qJ|(f^YBR7Q~>aS>vCHC%m1aUr;R
z)TQ+X5GG=!MCN}16+?Sw#SGz88!uz}vCD&^Tmo>1zuJ2XwD4b96gP0xw@naceBcre
z#pCFfAx&U&44^@$%TuCyM<G=DZYz#ZNDEgM$ZJeZ0|ZYrW;h7c1#5W-Y%Sg;)s?m$
zl0{ON%3SctTPws+3_6a`1*RuONcQRC5Tyn3-wSuM_W0RTG}L@mAk9U)FieC@d?I8h
zpAoAt83G9qnzfdYG>bC*A3SS|jm{#50MzV}qopxbj55_CM+}Jknci|xk4bbNg5G-e
zlZH0{R-2_^z#SpWE}&Ag4E$(^s!6T>dG$=mKsjBX`|P>^8%c?I%|~!23`fc-U~Et@
zi%lW23z0UxnwxvBj99`zH0CR`4&V5uh9L6Znp_%EKLTpi#MepV-k5Qsd$Y>lMbmgm
z?}@EWoYx|$=^~u~5wB6;@p{yXL$S<yy80(TnlSU`4*<K6oDmZ(7CIfeW<!k>;Y$G6
z;eo{qO+cSj4WCG#pc;Io@flD>OOC~*g%GJe<jI(EL9owh;Q(|bvQBppC@AR&fJ|L|
zaKM0ojr9ol_*B;m(4);38yxKb%;MN^`zpFezOMCbbRMX_)b0R$X$F2M(!*HQjd=*5
zB3Gb49=E0yyEoM|0_+P7=tL!5Hnc{-S**PpppeL5z*DeLYi+*H9V`U=d;%7%h5ZK|
zgo4a^8N2|f>6nx+R1&H{nvECo9q<5tn95h0Qf3r}kpceijG=K)L*e1_tb@D!-BcAg
z{N5si;+<?3Kd5OMt(LdHis7f{<Y1SN*Cbd_oqKKIX2x-Wu`g%>HqP#k25?yh$EAY}
zvJCBar;OG(qkbAS9$ZP3y83{vyU&U!FbORadbv-UY$$&cvxh^s?3$Y{KIA*#?pTFI
z#>-As1B>ajmDSCEmnt9$NBS7M^t~_szvCYt@)2;syt4lG#5@5m$x((YEH<J4^xLko
za)cBx3(V#V(v(pwECszzu8gDL1gq#);pAO3JY{f+2TaU$<`7nxBORVH5f;{60LM{`
zYA>TkcopFdMa;HJ%7RdoV6L%Eh;$tkpN#M(@Ou}RhWtgK23bj+06p7AF<f1hZI#T6
zN^#aNr!5O-8Rg7#d7Z*yN;=H|s-0i*SdZngC%X-xg<*gyA1b<_fZ9k@crrh8<f2ts
z13T;q1H{=%ebG_B+C|wo21drck7fPjO#FXTy@x;5Z~Xsn931=DggEw|3E9Wq$4JMF
zV<jn)y^nM3?3GQ3jL<St<{?Cskt8EqM##wcUEZJD_jCLF{s1?;Ua#xAp3mpwaerLb
zB#EF*S}t4)-X+d-<?2<NU<v<6Xn=;GR#?H>rmAGdHv;m*weJ_`5&a*uA7szcS|`QO
za6VVMnOk)ZCPy?DmPD1Mxf2=cH^0y%*i&h6CDE;Wb2^hzB*l}d#|X(-K`IE6I)OQ}
zTY=OiK7#Jlhx`ak`41G^=VRDsz%Q&(c<^o)f2BWtB^yzU>|nF-6kUTw#t}I?aE0xs
z<zMqizoZ`?-A936KU1O+Jp+)&+N>z)zG!LbXX_P!kMuHi)vl7tMyh@rzW^2P4A?+l
zNDm?Z2pZ*<upN2ZTLUHY2XKO~U!T6iYya%`vBYfgTrd!V?nH)-9jCj-%vF_N#Gv34
zWS(btYw9XxV9a8UHl`lWK@q`C3qg>Gic(-H?pr}@V~|lDP3CYSndZ9xc1B0&F9@Mo
zXnStQ324&DEr`QDfpXsgq9oaQ*K>9NRN|9{Zg9zjM3%C?FYg~F3Kcu_GXgWt&wqkN
zfnA1?_L2kbEGz<&JqawXAAz5A>Am|%QSjs4#lC-_;~c$o&Ygon;(nP}R>;RnB~?c;
zw{-_1;S-g=Q=0#NgOYy-ZSz|V6!MZ3)j6AH=N|=&_YrOYGB}g&<5&cMfoFGf5*;tL
zIN8v92AE(@!7Ba@2tXIkL0R|VeFj03uhPT5RLe~atCu#k3oz%&_Q_gSwag@t?5SZP
z=`ft3;H&{&+*ir2K5RP($Dnw^IrpL5dEi;`m*mT4>7CfKRmlR>6`pc;St(#^^F`~r
zh~zIb6*;v4FQytYdnGOML(t_{vXHcIUI%$1=UU(PYuoon0Q%zh_)_mYI9^s{p=4IO
zHzoBGphXcA*6u;|UY<9dnmb94HHbb3o2|B|7T;LbO=6vQCH^?L{FntZYDnNHsd0Sp
zqzFi5?#53|*DItR=)a?v?5R)YTz>XOto9b5D_Va7#>m#iVCS}L)=f)+m(uyh+a0U6
z|F4uFCxUVak^3OD{e9j+@HIiAFS#vYCUgn)pa^MyaVc^91cRX>+|@E1%OQ0&k5X7T
z;IICmrAsj^eCgsx6>XeUu6<v5BL{{v=&eFYCx!6z1fLV-Qr~x=(0hU3qQ;#KZucgu
zH|VZ{7qP+Qjt9dH3-|70z<2Ci+BaDYDI`_b!$(vOEeohS$DonYd%eKZxZD=7UGI{x
z=`ezk_9b&>;~~-3sQs4N6AY9J!`s}tx|d(PyNgCu!)#nuRF36)2mh^`$3MXo{S+9W
zfAyv#K4wcj8;gL2B#OVu<&wLbd2VGo5b<V}<ps5(V<pV&`~9}M1q!qATXC_VSe#6B
zH5<IapDk+eykV{D^aR+&tErHZ%@f3>Jj`kiwGpSlso?(S=B`uz$t_+>b4^#W|G-sb
z7z7!~1sW{LwKQ9QhADoAV&|K`p8v#HzW`bveI~DeCSCGWytWzrt$1tyLKvD(i<@}L
zUMtmg(sah$P4WK3#J31;0=y%cK8{|}ZM8L#0&gahFftcUKd{AyO=z~R{&PiT7-_6S
zz)w-YzZbg|E(s<?2DK#wTDY*}XxP!0;`8zW)&W(e8KK)&3A_aUNuyx0bNk~>{|nGf
znZKTiLk|V^277@S<JV)%kNbM=j4y*lE>8_xV5=26haH2$vzX<Ofq&!D*uk%LES{t}
z`@|V;$`)~Hg<;9L_Cu)uZ~c|0*O&E+JjG&r{01Z&I*9J#63j<@xhRS|i&xkT69B=d
zEiPQvhjrdND$+%0+l8Nkj(e2H0#POP_DX5`52(!l@z99q5FlKEu+ka=^E-}0I_=i@
zRb!DFb6U_YIZR<NqzbMXQ`MFEhzp!#iE0aRykCd%?8E#kqIdEhy$d@ul_5#;DO?9)
z9a@a=F(oMQ2?`#K={umGRndEc?qLMc%36t|Q-^op2>7pRT=es%Sk{y%u=QRfxsu;m
zqqF08dt5AZUp>^lLV;)~f<95lzPf|vFcM63LYZ-iS8`c8GJbeLr*O<RdTN^ZuNKk>
z{_nE`7-Z+k8mK@lCgEUgHx4M0>uEw!j-9Vlj8pFaXsu2xZt>zj6dnN(i}QG1toz>%
zVp_|9FGp8KPRWJg8b?_Kk<iqk*!UyC+tpqlKJk_kg`=!Vo}{<jl+gi?XdVfEjiywc
z$bZ8P5p9ABujyAF;ec!Pv_3_K^XK0AK_>#O?AhWzrzB!*6o`xOIMgSR-2i`)E$+)D
zp00MX`cyxbKRgy0<8AoEhfsVMa!lX(*cZ9bQOL5g7dCx^V%)xF+e1z3JK!)Yn(JEE
zpRXqlxAQUK8dgD0!2e_cV8c#`NeD!%fL6!5*3h%v0r?}~m>GzSl2l}|Q+(^-f92_0
z#P|3p%(n|*T&@~mUSv33bjJPbK3fOp&O{+f?B!M9s^N4Wy+Ik4yukQEX{|u$&Aw7O
zn{%t&S*B$~-TTG&Pl)5Ph%_)HuwAm|6a^ZN#D%Wr*cr5#uh?N><ttL>Y4wQM7ED0>
zt&f4%5&XsvD%1oh5x8)i`n+OaI#F+t&&S+&7QBFTO|EIYEOV{MVju2;O*9F<o{+NU
zZFH)VG`{+>@Y;6I$l&*j9wcui3v)>SwM!CU;q>F#pL@jsG!!A8&-(+IT;|X>-M(y6
z`iY($v8v?0t}QDuH_8A@qcwV`XatN^OmBZL1dCmrlRNVBBe0B)K;;_Dj3D8U#<fmV
z&{HxoD$V3HT<^q@y(P1Qr$a0tY?;U{b|=Fzt(xOUrT;2uAf?<?KH^vh2dGsq#pHr=
zgPU%?Z$~!Eg~ct{aKd;-?9XPh1s%QeaKCx-Mt=&Yp?3Nm77|AQ?u~I5<F<I`82eZY
z#vD0L;+#DMl`{9Gk0vY#$r#=H)$J;zw)@g42II@S&gk7q9#Z1&figjY^;t}7kai=H
z;EZ(#`Q~vua{g_kq%%wCQ%UHxaMLTaxb=I9+by(G0-ps()Ffxbj?}8X)nM_dz{S{X
z>o`H<u#5Qdu8cLB(wA;~$#H0RRFSxL?ZpJWs7_T+NKMlPI3!jV!2r5uLDKr*t9cW$
zCU+9W8&B4ii<@&!uITCGFbF|;vPxDI=?|}PEED4GP*1yV^u9KH`~&g(e&Tkcva`MW
zxl1>8MY7u4G*uFEf-fT)w5qjhxoyC4>-~x%<f*x@{i$6t?VekQY_;)HsP4u7Cu^k5
zOQi5KGTVS!I+42WPs9Jsb4x^^+{!<|A=j$JJ|s)`wIAPwu-_Q7L6CIf{?gZ}36*9D
znp(lrf0p4c=s3*cU`F3Mc|s8IsKmu+7!HB?dBevV>aXkAVNRh0-i^`g&evd`AhHTZ
z`wr{rkHF)+OW>HC_*QOu>&>~*cjPkIx1O@fnn26rN{YVT$Lbn=qr0QXJju;?fgmX@
zR@L%ZdIv-|-IrwZGT08=T8KuLs@6xiZ#VG1Znqhxn#rh)TWEB|&aH|fWGcsivLTe0
zhV$72Mal8s-`_WOQ905M87J8H3xSN{`111B3GTO-uu-z>$=lQgreWNa)9E5cYoheh
zgjK9$v>tzmSk6aGju@(WOA2Ybtas*7^C$cW-%_joEI7o1YErL)?dPn#XT&E3=v`Ts
zi<7|1EVi+NBL`m){hH|P<4fXs-nGp${v5lq#eDY)q>BDwM}SThBcjspCm8}K9W^$a
zC#EJebs0-BoCeXCymlLR%wm|(fN$LGd^z+WX|%zFjsd$H42D9_<(PKsb8k~-d8;xP
zLE7<4IYC96UbPlUw$NF$^IPN(uY)s?2T{)_yP-4@g8RibL9><bPRSPMUO*M?M6(#M
zOfpddavm(}ZcWv~i5b4#J8DOknsyYnkS1!zkz#BaCUHXnr`gwH8J|n8g={6hM_+Ze
zlahby>J&zPKU1UfT2Aq1NDnk?Q~LFl|I~#$azsKTP<zM&>crbB^de9>q1acPofuY;
z3dk2}XZ&iTi_x5{5>B5aY6ZNY$*<MrS5&r|7q88U3ry!SvY2-%(Bfk6=|fuim`O2H
z;6+rlSwdd9xTpWPk9El`M0Z)6f4_Uo>BYNrjYxz7;sJxJP%!Nj2sJ7<m}uy6!q-G7
zowPYo@71?_Xo!>tNLac2r<~URIjr6YO=&Q+g;_}g>jYDBi<`H;i2wcBaAJ2n)Ic>V
zR6jcDVPr|3C2SzxnAoRRvHU{961F29M&@1=;_n({wQ7oYljbkn7|E<<gWVK*2;E90
zNmX<i>LMba)Ts41Ql)e){Tya_qB6o?&Uv2!b%1R&%Fm!^zBFX@sWN-_xQSRQWN(V0
z&W$r0sf5Su<5cQp0mWHJs~}JvDLqvfyk2eazO1LrTh{bxYLp7|?Nk)4%6@A2ZJKJ(
z*V3=oUC@xUff#_3(>W20l8GYjeRR3wnd@&%l{2TINUb2OuDP0&-W!<8Zan+LzDHWF
z!q$m;5J*#<b9@3cYzqu`wZnVo8BGzXZTc$Cv1Io`#e&A67~-E9inK(}J|Br&Mu-De
zb-Hh_uie==2<Tx<c0XGJ`URT^d5Hbe=Bi=Np{5r*GmURj)6c;{ggpHg2j^CLs8JfE
zxKC~!JvHN$?4s#<siVEWDu3UO(!+vWHC4_f9Bm|(qIvhND<SPp5?RG-R*`Ri6X-;~
zEqJ{C50gOB5kc2eA!7aq236e6+(&6=k(^f;kHXJRs{a&2Xb|)mVI@42`*N_@O2NdC
z1T$~Y(GWs7YVR0pRy7RvCUbRNM+hgVbzSvZ?!6GQ8x(t)H1TOSNZc{Wk>>qek{{8;
z4IBmLs?r8Wq9+q8vpK3ddafT@3!&E{aJ@xm&PQF^m8`m=QTbVoK|8It>FHH+COn*i
zKw+^phUlhPp}aHcpcpn3Z!H~lr6TdidZcVQbcCYk6OeLl1J3*;c8{B}5m72p#P2Y~
zD@2p(xG$eTJi_{6sd0zO!S|Egb8nyRRE-&n(!UR1^bip_XIkQKl_GwmawciVNl3*+
zI)!s8csrUQaPCwUOqdl=3Ey10F~szBP+|L@<X}aSO33=D#`iHWPZ{IM+EUPqGby~W
zcB6|U)>Tsx=dRMD)!CKgo){!sEk%<{hI%oz>&{V@xwpB??$?;TmxMUi5UPDMefLI+
z`ZX3?+lF$?SHbHrEm6NeVo1sDB=+T8U=6c&cH0ecp!|(v6^Oj15uSrWlNaVw{80P3
zxDegOG_ne2$W>VX+@Ij}R0S5n%dCl?Eaq;E!9<3b_CFr&F8WDuv$L=8zQDe(ff(Wq
zUpTY_3%(s={)K3Z$+Rz9>^*2h6r+p5OQe3H!^vV2Qa>e&^FPag`Z%4V25=q@+>^Fk
z|F}d}N=r!b#RW?5Uq02c*jA3|z*YNCSyK%#QW%<-hI=D+Ft84TXvb2af4yRsu>Pq|
zEjr%UI{-2M!BZZy;Z3zAe3`AVr5phgELvWhJGs-dkYhD!;Yb>@ybOw5`00zbtyYm$
zx)7p26)^l)x?a{^IubRYV<Zbb-KX7#c-P%R;6k8npEqxysk_diJl;uVf17ukWN`h^
zhC-x9RU_w*q>?+?waHz%7AjdwyIwaIsIw^m@@roivVyDv&6OTOtqbcm;IMAr?~@5c
ze3j0ABCBUTc*fYpTEPz*{&L<suBG(7thGTp!wQQ@mRO1~%s6a-Gh5$`q%fF`9riL+
z9W^PH;v?w+W~Ux$1_5^7$McHflS{FUcsr6c{L8^gQY_S&yZZA*YBtKUtVHG+{m}J&
zwGS3t&dE2piE<<0Zr~?h5qp~8<@<b1N8R(^NNqudhMRj_vwSiq>Em=|{7XMyVjIur
zfT#Tx*U*$uCn7EgCMq-Gv;$!uCjJsBI)NEthe-mv@H%Z6QTRSNgyY+TCO*qlUktm%
z0liq8IOQVUhFYx~5LO*bL8OPuy*ju5>gWL-0ytb)$V&y(&f=(G$EL)GJ;U}>u7*UA
zD9SH)#Ef9-FdY~XMurJeUG95K6D5l>1-9o7=BT%cxv)jq!e1Ch?A`F#@{Sj+sp|Gs
zDz8tj_Id-6&jRBD%!i|##@NyFt}qplW?}F(N?Q6x2e0E$$Wg%~RU+{zL_&EZGJZni
zHb#!ZUB#v<l3{?_fZ=dt%dHgCkP>vGI^7J)&7>*Dz2hpzB65mv)QqJu)~-_LncKoJ
z8I}4B+!srL8*pH4;rx}z{JRG#&JapDuts}-fH!|NE{kC8BMSe6ls=JOO5$4f@l~Gm
zBQYVC3NcIMJ7gSpJ?mjAs`T@1A=BoEbE-Y>FzCVdav&6zYm5Bw0SjF5T>-0z(>>S=
ztPf6(e8ardJ;rYX7$tAjh)gJ=1Entn9;+l53QH(8?=kUOD&+W(RCoJ8e0ptgdrxuF
zF^M8BCjC6jXDX(}o!da?`79UfUXK)cG|h2Bu)(e;y-ciEcXSr}kC?31n!5w6&Uxk8
zPByAWfzl5zn~+W>tD;t(U-jGms8^{kOMkdhO3z974Nz^DzN9x4Fh?HOhho=!tdM@%
z&t%rdBxW<12NFcyW>8155<Ny#mCQL#C#qvRVjUA6rdtLsOOxS0TtP>UD0{v;q8>~@
zU_=fo_${l5_rhw$q(%h&waqmMTv(`axj#N2vgPzwCjKa8K4{S1>Cq;iq{Ew<6B_!T
zb+F*F?EM;+LYA*X85XK{6sxZ(m0}JUo!J;dy(KI=V29kb2v&E{_xTS_xt`8n+`jXe
z_}@b)lYd_e>4cz=BaCDhQ)Ae_j!ie$Q%owpwQ*tX40r5g&7rm%qG0mwc}!KtT*2@*
z2}TM1!;n-hSuHg?2)A5iSpUW0FdF=11bSD$1eqzws6B19%4TpE%qaD%UY)lA>-X{r
z>;jX^fl?Dtc@e66`x2I#K7>`lxdY8N6vN#p5V*$fY4YlXaRwLr^qQ-Surl1&g$ljd
z^1omIC@pgoF+e34gYE8iDy}Kczi*D%`Ss1sBvNX=#aM&`*goiGK7^6!=v1A?=)P8E
zD>WB)*=U&MM#+AZKHf9P+j;KeX-75>nJ4=4=Pq-7FJvS!zVfpc=X8hkt3>^)t`%0f
z*p2TYzVOdNSPEtlA!Q-GO^(L=;g^h(%BC}W_uGC~wkb9nq#8$eabW4YkA?JcMeVPE
zT5-!eN+>(~iGLX%NGf^)kl8-WA7e)ZlnEq9ZSvusC?<z=sl<g~EaAzJK#X)fT!?h2
z{j}IJuMnx>Lw+9u^3wD$dLJ~8qp-y8aZ!MdK>ow3I&!W_NT_pUyxRKAwFDE{Cw~&=
z9k6(uaHulbyo+ZK18PiWZBK&e8k<<4tlVjoKOJ#YVR!?c^953ats9-f%7RLxV-TKE
z8WB-y6w23U-LqH38oZ-c=CVsvO!U<FjD|DLBp<1)weALYU0f!$GzEcx7pHsbNgAmK
zfCrjWD7$z<wtDEu*?Hw=)ixW8tOZO&nGEF%W1vEiOkP`FIhxiDt8G-4Md{ARu&N2L
zYl*(Vg>)@pjVYi^=;FxR9HQqd4vmb27X9e{9E9|}J8XVH4u6-@(GSg9V0GQCp%bw#
zL9O^n#Y@wX!J~e#<_2Wa>d}NJnuc1>%J|8){ONFxG0c7p-j;Tx0JA2<Tp%A!qAc~y
zZSoWxuljb-gu(|O*oc<6JC5!_a=5VEXqpmkcy9U|$u(=|Stpj%j&PycTrGyAbRuTd
z8}&EbtIYJQUqJQjS$?AzK@3bh{7lpO13H-tcS6#Wy#}|DVm`+jM}5kLn-Vyw7_m-~
z*mLzxojYWvQh27S7r8<#({jJA=05=-oR*K1a)YytkOP?~TLLkJ+nG&U2^Qt}_%(D5
z1y+Q&kqrX3r&!b)7_*QNdh|Qf!o=*>jZ6lN{6o8|l#9%R1pXC`L<oHY5yzoWIx%8z
z20jV;Lqj<JBY^8IJ{za(iGmenD&q!gX1oO<Qu({}!&cw%mjK9i0jC0uO+`vwCQ@UT
zR}>h$XT)WK{h&&A-0)|Om5!g56~!#a=taJEDw}XLnqw(^N%%;x6kB)tH<yw>Abx-P
z6SBoQd1V2qQ3YWs3bYN1vHsUcg|2sP=lNncmd8BZ_W*nh*V2D>g16*yuj%y7JUJ6Y
z%V85LecZA7XF)dcF1)jnBQ<&s5mtKcO<kXej`}>{vrArHuwl7@n$`}XgGspxjHwC?
z9>woPNW<I4uKRc~!x-5^pU^7)f`#pf*2%AhHmn32P3dy_Wo~Vv|J=Sy)xZjCPAL5S
zS(W*R@?Y1nuv>pP1Eu}VwAlJa-Gx-~$13OO(^&V&8&#q&h{E_4T!SQ70qfTFGU{QJ
zEjfkxVFbx?5=8+=H_9<5rzK`C7)I>!+-V4z8v?WW)<~3t7_kE{L@^kojeTuqK$U=_
zu|KAJOI|`w2pJ?cq?!8pRYsvX(I4%VmyyyV;s`ON3ounY4ei@j&S|bl&CWOpT+T3%
zWe`cSCF)tgYo_W!s{%FdxXv`e)raW8Qs=~bp;ox%^<J|7etJiRL-#{@d?~$(<eKax
zf4j&c=dBw>qM03tvwTZk@ej@tj$)dpsp~;8_X2w;S!D%{GMlA1x<fTMn3k_>CEn=Z
z!sRv=ShDVA!9-Hg<&WGQFm}8<!hh)bOz-H9`B8=en-bi`<}eH(iMM`P3-U{jP_Yxj
z?wCEn`QY~kk6f<y{vZ`?G<Uh;H)I5`@Pqh&h@q=cCcKRAqtliO)n#@)E@&1ptuk`y
z=*Xu|Q~LYMI}Bd@B-gxL+f#lrj9kaUHekL4Y<U6$zQZlJi7=awL7<V|205pQ3I!eO
z)9D>71p+1v{g&l2NhEv2VN<mza3Nk?sWdC{BSeoe0#k)MA=m0Vh_eKkptq3RywzBY
z!SIq3FOv7Swt_mV3nCi6CEAuPh8!iSNcZd=S(?5bc94}<7Moc-ZUf8Bw1%!x`ZL0k
ztr;oF9d_R=5F0cY4YiOP-Kz7)zxbYg@kI><bu=GF&9yk)lN}Cn?~9VAk<gbcQW|mx
z(n4w5(^PAZ-TQ<8?hks(6M^}pGS^Tdga?<&-L3a(`-ugt#9PIwz?-8D1C0?fX2cTs
zN3hBF7QF>$AkPUyf5tJ5=;QLPW?C#o_|Hw&<;F<za#$3(PZlQ5xsV+9^Y|i)?CNJf
zD2rGo(vF8)?FmC?_R5u-L^P|sY-tzD^;xEZh`vW^bh^m|HC{$RHoFmn{^X=gM;0fo
zODgOVQ%-bl4FWT=P&fV4)T^6ufMWN(Y1@~vRWY;O91v%SCXC{$zh<VLFfzmfyV7sT
zL57EvyE>Gnus(HxIK5-xsYBeZpf+ItKpV#(cC$0XF{9bW(YV7ejc)b(owmi5zDp=8
zS<m(2mO<6$0MwonKZ%o$nr<Kc*<fqbKfdf_mn3JS%=EP5u*(zI6Q1Jh1<e8xm21r@
zBcHyq@LL87uo~_zifGEVy?jEP*50`4q?vJ?x{Yuu3<ffCRs%jOCGgsiQw4<|*o-*%
z|L<p^L<Zv&^UuwcCG1ju`Lypor$B>|`jS|r&z$v;^(mZZ>de`lLiX#Y>)(eM#0xe+
z0;0X(l0iMghx=9+z86MALe#)L{7mHIoMx*`=F(j{5y#GCHr4dpqtehU(WK01LQ19q
zGPm9tb_1rU?J@(0#uEjuq$xUQU5tG_0^`#dyuK!nC(VV2qD#B)>mftgc!gL{zZ-0G
z_R*(LVGKW=z;wrIZ#BMYBXW+N$$XtW<L=wnW>BW02Sws>p3Bn~rYbi-T4eJ0KWp&r
zBBGgJ2O9jXThKVW%n&?TUlP0gFfE7}*$}=SPzma?jE;g@;#5qtJ0BAS24&K7%`_XM
zTmj$9;(yky-X}ri8gh<Q9z<}8n4#Y@3?q`()Z9%^y5kx<eQky=ibEc_A#g#ksNx@J
zB5~PD>o!(XM{4pe&@b{ix4uXn=_j~H|Cy}z#ZZXQ&!m5d0hY<?Ol0YOsQv;h4bLR{
zMBR-!BxX7|H1-(7c^9);)*!RY`qYss8AS>oyqqQG*<4vlV%2rTZv)qT4cgAAbLmDF
zs_L`u5be9%pGeCom-!ZSoo4mY+gHHc^-8ZosUA?HHB!&eqh$KVR18JdMG<&v7fO#;
zw?JY;Rv0(om;O2ra;A06BkhPtq{d%#k<*Me<$?58G>#(QL76xx4DT^%1kapubav?C
zDGCx%{@%Rac%4@iM9*4%%tY+DFp?F?@miL}4V|+kfw(r#oZ1f_fe2iDT6`*byxkJC
z51Nr%>Kq&+)6d6NTsubeiLZ0vQ~4kyF@k&G#xH70{>h~i4}qCileH5*Gn57rl=NNP
z89}cBfJw^0>dEL8Cw}tWCXw(!#z!+$qx`d}kRkNJzUefPszs&I=zF)$iU{_WNcKdV
zz+@?H*A=Y#(@maKYm7E0RffY{_qLFq(D-IDZ(x}nuh^}c4;w9}9j&wW;Y8<2oRH5z
z%~_Sz%w}T@XXMQ^_1}Y=2h~qybrVG@UYTU_f6^E!^5!X5eMUCYiAKG|d9#w*U@)-n
zLg@vrg%KBqMMX8%B-|vD<`1-jYrs=AJIw7k8WBAOd8+C#Z8kCRqg!oH$eE-F+$SDE
z)umjvh~51g*nTG6tVQMS1(G>u64$adJn!fgL&Y=Z9X6Bhxq8}%D{%318IoTY=4PKd
zaIIT<i<qPio0-^tMSH4~%u)T2@Zq?TSr_ZuW?BR)65i*X(j(tBoq3EoCJu51qD-qB
zQ|qnwlCela`5jGR{jSuLu9z$Ne)q(rVxRrL1`DWb`?7o#!f9z6@W1d9xN-bSZ{?Ny
zqP_j2fu7csY=aeJ0<y#2<C>-#q$_NCV`fz;8?hy?QkTawOBWBn<Xr?M=d#1|+7j8N
zM&$kwO0+jCXq-0*NXIe-hc8abfsLwv^MbupMoD1&s1lSK0{9(tS?{&WIr6wRD}sbM
z7uQ@sBJ~JDt4z$)nHkSQ7zQ$tk!7R-Pyw9x?^hX`Rc&>%EnLij%xQN)3>b3rglCAs
zlI|~K?Q^pZRhYJrXY+}Am7@fQ@!Q<xX)*LBQ9~Oip5}Lb&bhH~%C2*>yu*xOmxuw;
z2+e&`$RTxCFzd!{fYjpgf}*o7ojD0*)Lc==u2r}uLG2YB(rM=N#gK4@fU)@o!?I)3
z<GHJy?J_G;q=7n5$fkM)=lGQFZ`$}6RZFL5Bct{yHEsaAne7EgBD^3C{l(y%H&+VV
zR$hTI2Z<9pPeZ>YZg93`qn^%qcT<bA`f{+2FqU4ug_3eR*VZS<F=v%mDn(q#S~VJV
zJ|{C2!OI9-D8FzLlbuyeZlo&Jrec_xavM5&79xCxtolq=2R0;@E?AA|gyl>k>(70~
zgE4{W>_Y^*wR2<Dx@fj)k}F>LA-l}YX4+Y{K@=r$U7$H3q`gwhZ$TT=tTC&8(ljwD
zZzu22{|oEM?aAK6;H75^#!88nt&2_EmaM!^9OyL?>(1a#S?!oBwf^0$miKD4ZH>Z`
zCPAIqr}0%L<)va`(!d_9xR@=rxd+K*wF^u3_)hP2OI+=X>NItH*D?h}NTRvN)P)B3
ziem5Pw7*%gbrqv7Wzx|9Ru%?~eEvF4Vc+VaUEOjD)#EOiFcZ|M^OsOJjB{D{X{LI;
z*a=MJ>tN4d5Ap%O(HKObmbtX6JYk1+3kqlGWxPW8%mx>Z`KTKYr8|aOqOenNT}K1k
zJ4yr0!kpiXbrm)H(RBBIb%pJrZ)J1eYkniGGq%O&Es+2Xe-AB$QovHhYLgU2KfohP
zCWNpC&7wsbMeJMDODaOeNr}j#A>W<{Ni`+PxSxsb&%h(k&lN}MH}c|#{oj42Nv5>6
z=k_0X!n<%Vz8)d9Y+d%7&{--4ZDgyE{JPpM=`r%y-4r9J5Y43!>vN^|$w-o`H%(Zl
z;;3^vo+Rwc6$HPsJgVD~JlfVMZm(CaI_Er^RcJh>+<*K;X*w4xeMl!hBkx?G>Nw+T
za+<rk8(O(_4Qp_Rm|6N>k7G6I$8Ns`l7*Zkw|>Ujlz#UA9d$8-f5V)W*vZ7TKzf$d
z`P9n9sdm^%qY5dxL9GREh2+!11Pi#tll+Q25Y!DVH-B&Dj(XlYVLR>ewH-dqePr(q
zkR|dbKi?Dc@>+&>KeZ{&OGb9OFc$Y!TReM?j(r#_r(ZgU?TvItIAXNS+#7fTL8nL6
z7;%nwq3BMD#EkG|i|qE6JrZ9_ag>4IRr1jO=G8GyfWY-qVCw8-Vo6fMpZ)&Pdv6yo
z&W4OlV{fG{bjRGqLVNBI$op@_w;^dfpu@TTXy#A881Be#{)&FTxT*(i<8~(QU@5A0
zW_oVx->Fa(2zR)w^79q+HhC7gQbb876PcHJ{0DL+y=i;I)ArU&dm0me+BU}?`ON=A
z18wv)#)RWTt+!i|7j-G7OxK9W{2BcU?tMaib!`+Zz9&2`{Ng+3(zeBGsmfWfMtr3;
zG<;|U-ZK<ilPv<{Bao%q7yNZ2{&yM$3~YANzZJyBZxppX5r$@Y8`}*uu4?N%Ci7lm
zo#3;JRv;D4VnY+&GUXSwdyEwRu>pd_@Ir~adFe=v`0ny#{Wy|_M6FK2_Y~2|vC3W=
zbbM1{Ml!|meL$=5*~g%hN2J?&jtqE9GNVIwMTbsA6KN9Z&VR$3S%ndRMC*VnSoI6Z
zY&_|`SmY49$EpNmKL=|i^Ip&4l^4qFsmZB&b1{Q7gam_T*S`66rYAbSQ5M|%wJVlw
zp{qLj@y*T0230PJV-H?S2S^n!25W2z8Wo%6VmkhQPjkr9&oo9mcy9t`U>}A3;jO%U
z(o9pr>ZGCDV6|N5CgDi@5%OkErvrITo6FQ4;~a>G7nvEvKk03KvOI%6xW4cpNz?yn
z{1*%pSD^DJ>o~}KNM?Zloiw#La=h|mIc_WZRM~e=UunKcReXxHe;nsNrm`#cGF(#`
zP8MmKUctAR;YZtqRS@stL3ogCOZgL@^Qm#CN-XBH>MYfQ5JA2J^NkOKc}R~iLpK%K
zex$~B2g#e<O~Xy88K-_X96x_wj9;8j-;LhA<SZ2{G3C8aR$bG706Mcus+j)@Hs~S0
zw#wi_=8bPnT4A*#9<J-nIIein%-!UalB|Ed;f>hLGwUkL4`i2L{#({WD1@5vp8sGx
znGawpHhA*?YAk5z>~>BQay>c+P|<$mQm2k1-#A=?7e#UNWl7IsB|;g(Y;!^-N)8du
z43@b*_vYq(r>=02Uok$!BFZEnidTQi?CmVi^~D{^)Gx)%b0X`dA<fMyJ|XUyl11RQ
zwgfHqcyV*2kF7Ml{xEPh_XVAV{+Zzr4D)GWdsz0<;k$(q{-1PJ44usU<mR;e@!M=8
zAhIdGwKA<UVR3+Mam~g-O;=kwCRacJ%DUL9TQZAa^^G0%W%~lDj7cKA1u^no3O$jU
z)jfztUKgiENwXj<oyhx@2T?j^3;*~EI+zkg(ZO_3E_hKOWRyiC*XM<nbhj(XP=2G9
zbc_qn7Uu@c`N?s#oQfj{o_D)HL8ul&5o1T~OQP_HO<rV2Hyis^JSP$y`QJx#L5Gs0
zRCM!E$|bLtbx-D-`YB5uahiRjdDN6(0fFb0J%R%>mL>+QT0SPJ(=DXFGb$wJ4uU!M
zow}-PA^9(czMTY?_lwpmPl|=DG<!TS$S5itm6I2Rug2-jHBeBblUCrBDnYHv+{M=P
zU45C(9|HorOEHgZmL#C6J!q%0MM637o`}03cx>#5VA}g9L(k2+u=<QXnE1v;4v2sZ
z1zc~pE0aZA&0|b_LPFr-baERjgplFJL5{6yqvx+Rc~r<{5y;JL9f`U&Cbr<%<+Dk2
zALKV&i6D6k_iuquqMB<@V+gv;d0S6U{bREt&_>b%@B?nyZ@;6t;d9p-sRCy|zK<#E
z+2fRi+Hnv!8V-?T0?J%8>K)sgfTQv*7o<}xJpAS9$l22psYz*pHp(C>JoFFb9W`sT
z{ksMo1gSc*v3uYITH0lzszvB4yJ8U`n_)HdSb1+wUELbpME5iHb$nwIKlcE`2ia_5
zD`nzTyB<$x2fPY(Xya+(`-Z%^i0ibgeb;>b-$`zqT|`QI!dQ1U8IZZJLx4eU?3g&|
zi7D$Kq4QS$2<i+b#@>|>*)ybAYhU|sE@f=+T_<Yg=;1E{;6vuLR~Cc}>uR_a@qD+o
zWyBLW6QZ#`guD8|A&+Q^ZW0M=PRWwVNGxX4C0P&S0-Zvo-4{egj(T{!z#cVW<Pb`u
zWoSF&UiTvYUH{pi(<n}T4erOCnawfXv9yz~NrvF^iw_mUx?1tu%|=d3niB$0vtJh1
z`=9VmNkOr<S`P1E4O-&l@Vma=>`k^m8MUuU2r5BMYR&AxptQgUZ8GCm0F5!hvIw@x
zn;78>jC!<p31r|d1r2>Br!0k=57!mTxNkIudzX3N_(t~-m?<C8zk7Md{!}mQoeR<Z
zV-6L4C|92$lpstyA~AyI0Lg;85H4V!J?40yW8aYd^T=MnM=&EQV~E)V{Yk+m%V&XS
z5li61VK_hI9UeHzb$%fDO4d&HBp;e=A{JdxUg}%cf9Ho|v0s}0<j3;09`U|-sb4z4
z7O9mw8N$@t@v%PhT%z_(#_Cx;z#0>Lrv1Ht%2X@)SU*JT{O&23){nFiB(kwSU~IcO
zS;F;HGMV;hjLV}>yK$rZ*qD)5P-CQ^oZ?Xvx8iPZ0FUE(v+4TEox**RGYMu16aPQ^
zf<^4{f3|}EZdNZox8xi>xpMXrN?-!i-1w~*&N=^<*wXCXFB2#ft&cn@qH^DP+U833
zmJNMOMm-Oz%X<vMe9xJ9dUvU0A4p3GC-K8KZ6qYy2E`X8LfMMM=?WgrvXp*}-zi_T
z#(EbX^8K!x_%h^dkr^T%ej3=zm=qMnNs{y=y2C!sweN&Q9O1ThD#4dEY*8?1r#`Pd
z?Ct;ypb#`z7Yk#qdp4>TO{s7@Xeld!VNXS2mq^edNn<&fyljiFEp>pEL$H3pWG32e
zEr;)&Le6XbOq|hVRk`QVu2O@GiMFb9_+w`()w!X=veotAtIIIFk|e$*HPR+U=#2?u
zfS^Hva}jK5f}Fg@m+m0b0%glzl*=$?xJ2$E;lm*GEFq7JeVM?UMAl)xP0H%&;+t%0
zMs3s*1-!pNXy8UfquY3EB-zOIKQGHGCXcBLx;Q9Qla$^wu-pv*zE?%K$s>-KH|Asy
z1$?VuH6|F7je?KO86|5bO|ffx91S!pa(|Qp<111I$`I<JF;4GYHc9M)9MU$Tui_t+
zn(|p>)c;#`FEtHZv2);v9pV@Md{gjO+e3;bY!B<#pu!Ck5X5oN^KiZY4+_t*v3qF|
z?zv%E;}*^kAYJ&SXWi0g=uh4!)GaXXMxQ??qNEg~r6#5+MNSzMmIOj6n66&m?Y>1q
zR$BD&aeSj3mcnnBTh=3?`bG5qypzkQd&l5bdPX6CSZvk&4V*!)7<SFS>B|E4<HX0D
z#p*<Vgh)v$9JV6`AJP$|^cK>t(h}qrXu3lv?_gv(8mt)emwa){CFcuyTWt?TkA*Oi
zs7=K5NUHF{%WXclScbpT`r-eW`H`oGAWOl&sqI;_H-`b^QMN@h*KW(BXNCTG^pBwT
z9PRF_=uej4Z>W%DW7rowLtJpP@`l2(ACt>xZ#DX)Sw^({Z1mPBDY3KQ>wodxNKd}r
zcGKFawxIzIjWdkNC{*EJ;iJSc$Xa<tH0}+0x$|C%M6C`~UCJW{fjHT2<(`Wn&E1x`
zZN=E#kD{j|AqqK-_P<vQBnuYRvK*=Eox-4J-Aeq99ICSJ*Ef`54TLp2igS!R<bgP8
zwuS|c39R?kK0&7Bl^MhcVodSzzI}&ag7hJaH_Z`gL5GMVNAe;H|42db-CoG6+hr@e
zw$<h-ScAYfy*F&*`TPA{B&0M*(cR_w_iT7-%D-8P;mj02tl*HPpPYuUKHV2iGgu#b
zF5mfo-XrrfJ6GSQ6tld4uzwf{58NrkZXZ!e%rg}^&uY$3dEZw^2{7nTP4+rc{c5)U
z1Da(Hm0R96Hm=%9jsKp)@WES9HO|1MAyGmg==mK%zo0+@Gcmx3SPL$@iuYw3Rv}p^
zmHCL3j8i#IN4ja)y3qXRZ6-r3Sy=To+F!<}gex43w-_Hx<B9svF$BGST@n)8p($CU
zzq6nV;=fnIzraK|_CVP14e3QIp18d`w37&_?2@M@b_hn(a7x@BvdU=!4ygy(>py-E
zV<MrGa}J$E4-I(!Y?!3<w2Z!iR2bMLKO&=e*K(cNV1c<@p~r{%fH^p}AWh8FU>GT>
zWk2;z<dI3^w&p*x=e-4Hea~{Q-5^I+f=??ycV>f?>9!pw3(Rs~bW;57S?@G`_*3Kk
z*SXg>ju5B?hai*mdX}xIMx&74ujBF6HQf{h2-=18LMv#y*gw@*Z)Y!Ipu?-L{{9yR
zpGp%I%wyR#ojf$qDj^|iVt2Qo+31Fku7!1maqA^&yQJ(>5kUW4S3k?9?jVTAa_ljI
z(sO`!A}U6MuPvSRL%v-jiCBbXT4mmEfAF4nj`iHPEc1B^Z+PPTp@KrFWDkzzi?Ad%
z;p7j_dEjv=4<wpYj)9WQxLr&u?&M(J^fB$#wL<+bx>k@-jrg<9!vYq`vgW(*2++Z)
zqcH?!a6cZe<$zFZ?<d6B%Xk|}0sML!XI)}Qw|rV;BSQV??C*Eeu(u#l5odb#Zr6h8
zp$_rAD6UaFxTVASlYsgrCUE}!PB)9W-p0xeoOg0@{@)0*g@d9><NW2zI|V+?GODpe
zgg2r_QCE+?Ej&#yb@iOafR1PCsESgf{5)@WBWo|>j#}D4{0Q$aZNqmga7njgF`|T%
zaC*BG^_YTuKk~)GxlYI4#B1S4!h%`e%7F_h5?e&;IRjy^=9E|CJnC;KHp(~T76RrK
z=`0?;G2Z82es*JJ=XA35vpeJ+pGo)S1a_D8h2Ov5!T%9QznG$e6G|~0dK|?y{DzAH
z`S;3XeC;IWBfiyI9w+27_7vzRINz*0?>D5%y8EMeJU;@G`sKtXyZAJ~5-*N5yPub?
zRaNp%-4Z5bV8kFt6d;YGm5F-f22rKJY7;crAwZ;<P(hRJi>Q~}54o=)4WiKlXv%pa
zAHr$`Oq6I{hq8}?_qi}Lc|D(%yg`W;)4gOT>GDtXv7L#El@9s;QnIgs;YUeF#%@$(
z07Tej-X)EnfVIc?-DA7xf4qZ~9=?~f^m8G%SC7x5Q9R;2gwcyQf=qxa-iJe>*KZ|h
zB^Gm?6?@WsrKi#>=IoOVzE?f6Pq&!ZfrSgf%dL>l7q_7jsUnc0tXzR$+L?%KoOI8E
z`iD_wD3c5lMnwe|KIchg!Hxr_myrb{f*6p#9_KxBrrqbt??t^>B>gE-+PaSAKsLLq
z2hHy#>0eJL()+pb=YOE@9S+Dk7;3d^Gpjr33~M>WD8vu)&qw@XFvs(}odraP2*U*W
z>>8rNr|5BTI|Hxf`s)=D&wwusB0>R19YPIaFrjCc6E2MkT6!%a-{%!1<%EewLQC`_
zvi#}StOcfywT1hh^VY*G)Uu^q8Y1K76vULua3S1ixiz5{TISU|ZDoDtYG#dQk8GVw
z5#X9~p)g+E<gY`7^hr0FtTUGolNHoL<ywnwZ#fN_Yu%c}?DADhBO25-T1;#@;-^y5
zN6@Clhah5*1UDgcGLr!z->0`UdU_^Er|+?ywxOAnHo&w%6vYP>>;+S^%v>-9HZ#nB
zHEHO}v53=KL;d4cj$-@9(_mub^?KYb$Z1?l#06j46JYkx8~_L8CpFcAH5!%b==h*U
z`m$8)c%a4`v^f2z)hyDsiL1_71JAUVd;dMjDoMzC6q5WQSAM*UHvQp(WfS>B%7?^k
zoZtEpq*eX^g`9@ko>g4#6pV<+$06VmiZKd3rgzu0H*0{T^?x&9<rAJVnm`fjQKfy;
z%Xvu7G_>+De^}0^Gi_5do|#4#7q~Y0qL`j8l@3Ly6;C-Gzm%Z$%Ic82pD7NzD(!Ui
zhf|A`LdJ@CmxM-)qUF?E;sBkcE}}~=RHH;|HkjvKSROF^9grpas(P*ka-|p^3TvVk
zZ(!+qi~85|9k`L0nl-B)<ye-`*SNhoPb$vYSA#&A*~@GA#_s~9@xBcGlvVsow33?s
zFN&?Z<)hs<ga#2@QNnx_UWnZEKs5vP(b;mA+;<@7?yhz37UQD1jFQKITCyomi_I4|
zyu^L0)TDB5tvfaN#8!9<pH(uwL!F1G;^hr?G&cUV|G}6CiRvmak=k$fuIdY1;kOvb
zGM%Ez6pM;y?zTQG0CoZ1y9Zf`k$|n|nPBacS1CovBN$=6kY3tl&feT3eluLLGq=QL
zwv435LzOMfU2}<23?;}aZ~oLkEFr9(H3yws2o`f`X4ezo;U_>AEV|J2eS*nzO`CEs
z=zpnX7KL#NxzCUtcysveVHkOap%8;tj6A@AL&97`8;a(4PDJaRea){_w{g_yUPv%B
zVf=Y*g+(bQq|6IMTbU)&)oX{e6dzH_bIxP|bNK`GS9P9>_LMtH9ju3vCk+<QOqi_W
zxEo*>F{&m!#tC7u8zW7F5;c%pufnpzD<k*|oAIoOxUMT?0mbNYXjtd=+(=7m3~I-b
ziG+=~I{l^-l?IxhPRTmFPp+A3t8N;oHg|dN9sJMPMSJ-LYQ&&<6JEcwlObs#QA|Yn
zrgDx;fUp^Mr<xCpmmxw>{tmq3XWl@8M!pAiTNV`2Zl^z>Oly9be3{?G-_DrZizAaR
ziJQmzbfhE$*Oj(GE3}_(p=>5BmIIt6?=IuK{*q1zcWUE2wVAY`1Vkwaf;kGJ<8C6B
zOr=@tO>T_$$5bPcaFEDwMd2@i(5w)oxA+-fmp~2QaJH`?bwMv;+=SF9bt&a3qLjW3
z%`;$MT!{@%pM}RjU38E(M1;O|ViaBkvUUy~k#97rq^d{VcUgXS+=O^?2`~{=VthP7
zW@AjrD^#^5g86eS<f7-UfKjl=PoIBLaZW5kgI#Cp=4SF3MlFt?S?dAeE4|IpbHrTu
zN|%T~=?KM_0x|EOZ9hK~@f)%J!8mfIoPW!(^g>+7%WBm|<$q2bvZ!Eg%PFG{*I1N~
zhkaW2bg$A1H!*jBRU7zLy|8uX=1_oTrqx!iKY<k)@>&`kppQSWm;R9);%3J`W}jTJ
z>NYtw@_9{8@+T4nyNn^QiQ`Kn1iPY8<kx6SLIp7`{Ju1bU<j7TFvAA52?1#-YtcYJ
ztaVY46bgLzB|NzgL9#?Cn#yE_>Rhymmiq`UAUVCvPt2B2)s3<v_rv}xpA{;_@RLh$
zp}t(+dSIX7|IL+!{}#$+P#8uELafP|n)vvgP$psXsA<A~2E}w@zpI`m{{aq&g7-1h
z?3JCtPq$%^^kGwS5iupyYD?b)y+}GeYH<#PS?aCdf1T|?T}qE-o(3P?J$0S=H=6kk
zT`eHQ%LZ?sxAD1e<`%ei;aZhAt42elU+88<^$WJ-C?tI6HV8+0jP!`1c^h&`@K<V=
zI6$+&WrrPa!QFH>u@ryE^P`0P{{P?PZ1_Jf>J|_~Sv~7{b~%=%RLX3-%%zF8mDy!L
zG34Zy?3eyB0$MR~5kd#M{q%E=VHMcl)S>EVSPo>6nrIxIr~Q{r55`kYd*xbHFM{8s
z3LDIA;H}FOCvd24#q3WSsL|vY6pe#^dw_aWR&E4nol*q%<uxU`7PDoj?z?3i44;|l
zs|tXbV@Az8<6{D{loQ3Mag~}?O1<&M=}vc8>EtDL8UlLe`vAn<Y*hxbz4(OE(hS2S
z>4=z^CHE$DJ&y%J;wgY``h5FU{E=`{`$I}Bd`mBCf9>ao5A<^SBghs%`Gwsf$;;4I
z)jp?PS;{P(O^-G~;PJfUe|Z#x%c*p2jH|e#uau5$^0o(Usij~k@`rm^sbngwt6bEs
z0tJDkQYb(u9C$2mZqP95&3JoTt2g%GT)tMBg##)v-i2a3)3D0%07F8M_61H1tu*k@
zBWDn(>V;5>f6#o>P?KWbTb8L+<sAmBb_gqOyYL2NXCX;f2?*vh=yzPgWD$Jx@1I4m
zmw~5PGN-AUN}8koG*w1u3YDk)cgop-wBq&V5zxiuVkjikAcXZmk{+odW=RM4`Q0QQ
zT^Nr3XVS>h!=~Z+k%oA!=j`)~PEL^oT~O4}b{cdtQqtg&o|Mk_e)vs#-K1jLiVnZb
zD76ha{x%$~)zki1I=YP}Ti<A4i4o}dQg4V$G2;x*N`RU=kBywu4yoCtILO&pu%a;;
zb#9pjJ64%bR@VY>#4y7D{ZC%2GdB`AH#~nDHBJfex_UZSIr`{>P0;QCEZxo@$vsSf
zvOrW=N@icQ(TUIvH84h9a~zE{bKh|iz@_Uf7l5>dsyoO!;0ejp@26b|Pn{>TFp$}v
zdzO&S$}il2`7d#6HJ(A_Yj<Nocj5({6gi-f!uV0kt#1sQOC@xks@)8sqa|^be?Dcd
z`*kVSmpkx|sDh)mW=P{^M;_^6xq>i5)T8Va80ioGr!>HGC~sGjP9G=Hf3@~%L#3X@
zs{6ucM;~H3LS_d3>n5I1CIJ&Mr&YBEqNfwg{Yr5HneO=8xkqE~WF_?KjNd#Hqj1KA
zbmzq$2|M<LA-J<FR-J*htg=}UPd!4nD0^dTpjwt=@Hk1h$k;sj);|(!mM{rTr6zDo
zD8Hvxo$7xs<3I72$8W#Rs|>Nnwew2@g7*gYL2b1Nv1(%hmgMVQRuRG=m`xambUp{k
z2<5k}uQg0_8;gi3SEE0g6Lm(C5GN=Nn3MwXQ-e-RnwML=UK^Eisur?MC}4fFE<$qG
zffqWE%10%n$Ebnfc2ZBBOO=IESwD1(>rRd3V~Y`T>gUn;*a@dmT;bN-NxzBI+IQGK
zo_O%%!QsXJljrNtx4w)s&7LWbUtY(n`tMyxar<QOA%)9_D2>`zln#lYd43hI{R?C%
z{f6JHD*joO@)K@?R3I-gD4@&qU#N4e>-4E+B+qjB*mcO!SC+s;>xKVBa*JIcC797z
zkIn80ayUM~T#_zh+A4JAeM352mS)Qu#(%)LMN-2&Q@rY4E6JU6^nSEUMPu1<3BjBn
z1rxEmv;)PJbT#xHz=S!Xy@^XTlh`xQWhk68-J`0`VcssOJVmBY56Sc&m1f*b&4i-g
zY0_kCR7zDgUg>6vH!_Ys?lMo5w%s_EBo0`gyV8UdrZgVsg<vCLckCh4fGcfx``t&x
zY#sZ!O6M2DFS6^j)-TE0;*f0jus0)aL*tNZl%>O`XOuJ9rb+f@b{f}G8lAQVoVPHJ
z^+t!S4@$gU@yhq(gXYYAwZ}taXJ_)dW(K;IE8|<3FfHh*jd%OBeU~>D?*0Qn!kv1(
zJkh6XD@6f<TtoVKRU<5Hz8ge)xPcij{^6z8;{|MjqK{Mke!l3BWohFfag`w!xyw|k
zPx0h?+C6qHuRz|3O1<^kY_r<>!vL1OX0ByI@5RgM!2SA;KK{aM(#2Hwa7<QGrkq~)
zarB2<e0p;loR`r$L7#8=Zm<BfV$@%dVMv^9Avg5<Ua@`4by^xv7W+qO;%}zVDqf$s
zEVkXJ^=}7Fus!v<`fH+qFR#p0bbO^NDrnjhz|g-pLBvR6NI(tw-jVFhe{alHLRY|_
zoaT0EfN6;+3vygs9l6^#d2;n$A!8HBXP8meG5-1qRob3Kv4{<KJ5|`FJNqS}sS;+r
zRuKH*srdej4Cr#>gP|XNL*!YjZ|^Ru-IibZa$j&4U8^up62!g|BEaKI&leHcUCdx*
zVxXhc=^;glQX=#)v3bJa9UWB7N_^RT&n<VRFWMw|1uVV<xK?h)ljz_x^pY~K^0vwQ
zJchy1*sSH(3qdEhp-@6NH{>qw+}6kCHN#KxRhtdnG_pUruk6cJ`X`E94CHO(hr_#D
zzPG?f8u5E!6<vApwUMDQ9MVADNM%X0C40&JqB*~{rMKyL*(qgatY5B0(4qgyyRnNI
z$=~j0i|s=0@(&;1SH3<UcQvCgYkO3qgUKlI2C~g*DSKv_`G-FUsOoKR-gtNH_9yiR
zDT+7oso>>Bt)nzMI+X`T4a@9JR|Z}8lWBj|3)gG8d)7AUxYSm=nawk}HGdoenRs+B
zummr9(zvHQZZlII*i<zTKlUVTyV#s&$icojE_pG<(sLDVFK)%Jox|l%1=TWS_fdp5
zc!&?uZm}n_qhC7;W5x-8>*GR(+D>7KB%)N_`Xn{P{_G9V9dap|<y8J>Hvhf^ZJMOw
zNH>_rh<$&Awy!n6pH+;#?x=#b&CW2O9`8PGuJ^7tof#$TIasZU5Ph8(&ie+sYvEn@
zeA1xO+&+{n=4<?Gax1|k+K6G%_o4IT0UUNYP`ja)v(A_1wpC<_Vq27+Y()Hv4i*p2
zUg3PYrwTxN!&91C3)UCMyH;z{k3;(cmu)@;-prT1;_*PG50fHhxvpW*_ut!_W{>i3
zK<YzZUIz$G*!)Q}drAfclPkk7kzWi8txg(A+TQI=nypq5KL;C-rFAo+zXcYrKBW9}
zdFNTCqBLW6&+A{C=Wp7-jA)u@7I!Yq%3EpT_o;jII($I<9hOO@?^Q?|&Y<nk@nV{G
zZ<+~5o*sYs+8!rK2)!Q{I0BO#rG;p9#}Xq%XzE(!M6`*05CW%Np7-#RmN9e6Hb^rp
zm9kLNuoN9PZSN6=A4u1OP_6P8&N3@+L0X0GG8|Kjwh98uWGto0Ym+68;mcf-?nfCz
z+Ce;}dQ~t@;$DRpMwR+$Mk<hYTrObK>=#;JGqZ*J8YVYO)SINL9edvL{QvlR>!_%{
zKl~SlL57l{L2BrZ0YQ-NZiG>h8W2f=k?!seL0U-(NdZAhN>UIhK~j*AR#NzFKKFjt
zy6djH?ppq*o|!p&pMCazzn|yzV6J?t<2TFe-;#=~DR^?)O%mf0mEjRhbl(v5r`cW~
zIt>JtFgTBpi#I=z@7NlcR{h=?<|WkJcM~fYOpQ)PaU9F8Vi^$Bg_bRx08@X@(DJ-6
zfxotSF*0&VX+BkBwcbm5)Mmc^*?mc-9@^jR9we){%e8qvF@HS1Ub7f|C~tCqryvJ=
zAp-fWR94eok5pN~v4%A1C1iS49;nO6n|1O>#Zy>yw-jT_v$_1^)9b*?-bzeZy)$S?
zkS>1!>8wqW?HxZ@?cdl+7hrn{e`NtM3{g;`U)XWvD?D|`Gk}fr4D;hsD5g_6N;{c3
z2qPd7BA`!Y8os~Q3vzXSytzE`)hD&-$`rCx5S#ysqgZC4t<NSU9%<4Z5Yn*D&!~6Y
z;&E_?OAG4BK1B3Ku@ZjqT3p^xc7q({0Z4MUS0Lf9z;|0^Hu5TLt7~rI)rM}awXOJ%
zgIT!^5;PQacvdH6e&|Qo5l(UL$jFgd)^<0ZN>gretP*-naIB0pXKO4y+RXdf7(LtV
z&Ky|twmg#P^QzXOoS0l(?S*{Dm-iYcujXF;IQru<3KD!*7&k@gw)H$4-<RHx*-PT$
zd(TIJ%FugGi(46;tC(c1RtKVpd0f!)wr%jGc-NqPxoo^gWMXziaw}EMD>1<&;v#*B
zho>J;SI*m-J^QGv2LTc-hxlXzua0<mP&*4v?^><l1XU`V>{IO^xLNA%;|6=M7?G`4
zo>U&~S}}HQeddZbjr0rYt!$0zQy~FNj_hKeAuqvLJ^4IuKwCktCZd8-6=)<;p~7L5
z@LIL-Zd2}-+|{QKIob|&*&hfs9N|_YYXYYPA({97#{#Hbp28P?4NLd9lm4@@aTTb{
zlo-vlPG(#q@x#432XTr?4}<WeWF>@?S5#JNl~*ivyPh2p;`Hx-agiy4lJj*rruO0l
zo8t=)J%^t3^oBX;;qBG@MNt>HSUJh*eVa^pK@>v?4ePgwj<(IR$-qB^;ASglF4kBD
zWp*kDA?msQ@;yyL<czH;`Iqa=1uNqaod59MW9a|FnDCRn#s{#?UO&-tvk<bIC~77N
z*g$QCd11>&5zhx8B2Y$xFJehU7AM+Y{JU#vhXW;;eOBYx^&;i(B#&vV4??Gm+EAWl
zq@J*Q8)#=@l?AykzHP0{(JQv1T(X8;(S;peHy9iF<^wwaLD37I$!Kn_%+$i@11uRE
zimmk@b|Ok7;+EV!vD-q%&sbNJ(0B&o`RAQ92G0)HKFb#nc`C4F{kHnt^IS^zj2fNo
zI?!y{hW;GROSvn(1X;dypqM@MY+G)({>f;u`Om|z0w0S#{u|fxL^(+^9`KOeR`Rg$
z*FD6qBM>I+NM|Pz4RNLoBbZ)gw38L45f-7VKrO>>V@Y@vxS6n?Omg>!JnT*<{&W-9
z#+lXy5EQ=0=^h>Cd6D|<mP1}$K0~V+M4SA~zO4DI*`;vEFUO?NU+%x|@ZhKxtcFh9
zKeKRFx6~n1(qm*ok!lU%c|#A6KaWcl*PL0;k|#}C#>>5f!dQbyuvv0*cPShCIUiz*
zIUx<_Yj?0s@!{LCnowr9ig;Mm&nZIEO7}3lWOh9&`IlU#pH|J35Jn*#WXGu%xz#-?
za6$7p7MCrsz&ni|eg{A$#(pxrsjpPo7j%6m^V#GQq(QMuE$xaH;E&Fu%+^|^0+HSH
z?Qt<(O9y9vbTx%4)`JjIi9&c+U$ZX6o3tou)ejjh_ipj}>SRRwXypTx3d8<g>Oob@
zHwEaE!_TZCsRz=3T*sW3t#(%|b$qmf>9AdIf-x{AS+MupgPs-!n~A#Zq*5<0+zt57
z$Elr<@DaB;xdgC-q`&GwJ>C$f6P%2BPp<aJ&A%bX8y}-5Q2~s*pOXA4NspR7lp8(p
zQ*EWei6cF~ArVCd^&bm*y*FGogWc5%GD%D&>r(A{g&18LM*Yo{TrV<=`+52Cpp{UM
z)OvPA^v%0p7|!7pddLp4gtC)A@zL)wsSBxCXvhP-xjEFP33$rG<Rn>_=xCit)80Q3
z5@6+WIxxc**g7y&aOME;-MS32%MPzx=W`~Be)Y6YGKQccx?d8xdP&igY46E>>S}#|
z0f}&&Ogj$T8b5|LMwCEMQ05I*YA!AaDHWs)z-&h%Xy(rmQNhP_dr8C~f!BuXbo5yH
zNqIm(X+WAUg^pPg9-N*&o5GpGO}PB@cKc0A`#9CfG|@`PeZOg07-^b;XDh7u1Q@Vk
z!a?^D_Yve$KkK~DmojsrR*24)k&jqd;Ty%32&pwCAC+16KxNEya%Z@2c&0jm==8dz
z*`^HQwO$%9xFnt47KSLmED`1t(u#U=?c$2xF&*@H&hxx%Nf$+PAd`>cq2i(YnrtYy
zcr&|Nj^@8(awoJxiT|V`)QRHcQ5eCWB>rg3W~5c%f%NZCxgb5}!+^$aR}1wFW;H>Q
zBswMI#%+^|Qp)y?_5}j&G_OAU?r={6no`%KtBu?v1Ljw<t2hdY2YB)W;+jeX8x@n0
zL9q05mkE9FCWHlhsLBDMgfvR5FeFEDUAaz<rN4&u*TIR8n8lEjzvtww66a{TS&iA9
zr{pwGM))@*Ay}k54CC>{i%R|EJW>sf1OzSy6R{dhgX|<S8kPt5(zMMsGapSa-J=+_
zlAatnD-!Pe(5p?!YJ*&Sld0fkvDxL_h7hf}6O`0B*MqK6I=3Cbrb-4WcXJ>FF#Q>U
zF+Zxk;JEe7rh&=LW>jJ$?r8o`d8xE%A~l=5!E4?H5^-044fFo5Kf;_CCAUhYi}J&v
zR)vQ|lCF+d%`g0`Q`0W(omTHJ4V|TLsr~-xx0`(5fpSaj--F-ceUwT7^#$z)>*C|D
zuZtf^SMB?EH~*+Lts23<p4@AzxSIfOO&{_r?f-Ralx}X#M`Mn{kl`q|oi6v0C)*>d
zUA^n%RNDj1JpUB2B<TG&B4sZv9pikS1=kq4-aEK@Fk@vKt!Gzz+s+tcnq@!=(N^rF
zF5A~+|B0+cgH*wMwf^lvzprz8ec`6C!FeU*N?(#CzN{4`v)(O8OrgQ8n74{Id;Txy
zum&_yy0J=QB@NF!>wzI)g=0MbMUnrSTABsau*ytw9C|GEKIRg3b&}gs`<-gtLY?-H
z*I)GN*=1Efvsav{G65g1jAdX`hbP0HRPlIH>C0n|84s>#xEpUF%w7{=1tB9pR#%3l
z1<R3%4Xm&A__V2IYb*ItZ9Szn$R$rol=}oN^B{cn^q%}QIpk*F#0Ft88vH#aj87Rd
zM)quRVVm_Vq_s`8^U0cA`W5RE6+JDAWjg8LXu9LdUG+01i_y8P*xZ>GDbMeDU#EEw
zzWY)8+8zuf6m*q=l6(fJ33yi?ZjjM<5xK{ava;Xv)!G?*?L8`m@QqyRF^YE{URRHu
zDv!T0TFz4|{Ja#%8S1Jv=1&*e4U<p|)2K3ge%S=RiST&R&(s%R7#|BxDAjvsOjR~G
zyLXE_(r;en%inhrGc=rHOI)5|RpoU2di3MEf@mR&#VF$TMPZv;f4`nx#&%luR~m?n
zor+i%z|;z?^e1sg#%Xd>X`mZL_nxiHz~?;xNGdUt+xjdEpw8a*Iop{Hy(rd^FQBt3
zJAF-k@aoVt+hCrh_XoEv8)qDdJrj?orYV(p|ER6pt@?*_<C^O&2N!MEk9RYMXz|j8
zZrNIWXq4;&BRxu^-k3(K7@iq!?Q&B(L|2lQCJm>YNr?7}uRX!oZ-vk8Ie&8kiMGnt
z<WGPpn}v~i%(Xkc{-}AXy6R8B1ETb)?bN3YYGE28dQ+k$a#+~W)WLSl>}uTFbGG-e
zcknYLd({;$aR-^O!=hCJrk{Auf|j6Tak}xAdbo7Y$k(5e&|n=7qZBLM2oZ>FD%C5a
zg<)I(?AEjvU_H9iFK>4E5Gsepk+dWVD2m|>rK1rE9J%{dD|sp}&J-V=W?;sJV*uOg
ze}yD0St4q;;|t~OqWs+bl4M0^<no<G29Vl}wr|rBpzaJ=CqI5x7#<fFlE!8jgv6+{
zQh=7tU!$xrg3l~cqxlOUyf$KG&-j}dFsUe8MS)Y&J;+-fK<}=l9XRi8{(7|F8A!G?
zy;`D_p*NwI`a8983t((wD=R0309x)Qij(CNcjkL?S007pJ>-{YRZs)zF>`w3%d$#h
z_hzAT;x7ONuqumc81d@lSsUL(ZLo@r=lSm10ot&45h{ahLItOq_|@F!FoHqO?mkdi
zC*NGum)STTMi-f=)@A}^<_&6$h_?nXGubD&1i%a9Ag6I9=S$2dZqkk-w?JTo8d=3s
z5)-G;P1C0cx}i5-9hb@~fA9>O0=PdUg`uUXc_PP827`Uij*DDrz_cnfP4*@!CiuzJ
z>+p&A9?{A^_(DkGP%k}A)=zmFi+~vyj`RhYo5~)PX##u?nYEJ`W`dZ*`SSq?K_}GO
zGBq?V;<tfPS=x6)8&aOQj*H#aF}+ie(&pUiBsJ#O|DksVT@?TVAy=|~fJ#Wn3w5Y@
zi|03q#^7ymvZUB7ZQ&KRzRe^fwT5jS=vfsbL2j!4rfst%dyXD6JZ?<)lEvu0ZJU@I
zCyUV?dCmTe1T2mKn0g*JXByuG*Yz(|*{xZ3o_`_#%947?*>Qia0obgVL7QcD)E5rn
zVy2+2v{$efEQQ@7J@q<wP5{rJH;pOHOHCa{Nl^r<FTBGQ>ePuTuHc{=UZ-U5=220!
zGfgkXycUWi=*6WrwSXJkAkUrZN5l$;6(1dZ2j?eeMhkI1-he+9=Cf}rs_j+jU6o)}
z-OXZsn<j}E3W17dNYD}JpV2vV&tt|IBqw*wR(XT~M3+|S5@0Exv#P|^e44*Q{!_mA
zIW!Y*G4R?y8-!gy0ik&7!L?{U#Ycr+2Xu7EJbocHK5JMdOzOgH$MM8Jd`esaybAv5
z-6{;3NTg(6bp{Npm_|GWTnMGX3sAj+@!ov|;<FOJ-|XV6<>0u0j-o{`IO{SPVB_CC
z=nI-GMBYq?*5JHyZ(Xx;QvJ>FBNFD32&}OafSPs$*oDWBTc%?5vg-C)SLpJ^jlw#U
zSe6Eji*#hbShGkk0Q9d!(Qj_EoJns%ki@YrNtxApq4e0UJ_b%h1ig>s;vY*Q=a>a2
zHkQu($7<F$GvwWIs?MXQyJE>Ltx78QmCb|d0a=oj_1POjt59rr)o=KH79Xa~d~KXJ
zQ_n`t)(`SnX3Rh63zaZQ-IiOYqR1lrH_q9+I&t8$7msx$gLbX6E&mQMlfw6o#>8f=
zQ->)-%+QnkwRK>7#V(8r5}D;QWgg1W+j&xWl?@3FtL5#R1-5Y!&SK%ZO{DYD5mtVd
zU`|rJMDM~(9T*K+RvdFlO59@prlvud#*oyeF?aIlwE8+sZQI(;13VDc?btKqa=Eoz
zgBbdIzZ<?j)oE9H|LRhl|73TrRcxxPG|z_z<0gS)(zO10{xzR3AndZtDt?e$ADcE0
zlzwcR>%KDU5j!<dG%MiN6f#kgw^QwDbGBK1<viPJgUzbUFpEjJzg3oD&-wc>cVO7F
z(<-^Ribuhs2L6jpft_2G>wI_Or8iNcn*3&K9c}jmGdYd!^6Y)p<G_>bgpF#id5h&A
zqkmXB-(-UH6oIr$u&4-946%4vXhn!NJ;Jw~S}+t|93P{>oV7o*<-1k?d+DqTkICBE
zjw~$gu~OJ;^PVhGNG~B-Z`HdH#IZQGV#;4Xp#z$@5Benokd+LhtjEDm89CX}sla1P
zd&DIGaH_ssZM%aR3@)A6wJIbkt+y(4Zd%`uZxwdagjsWt6Bmg*44BzRw^9C-?JIs<
z2es4i`3ZP;Z{8SSy_}UCm;jN+?(4=+k%9_pt*vVmY!o!Cta4=vZOwEp$9VrHZYzgr
zNFE!UIZ+fE$dtUT&4(d?eg6GiQPx~9n_%&q{M?GKTPg*qz#hq9p!;Dq0F{T3yUF?d
z=<(*T)1@+7*LFt?t-O4|eJE>kIV|$o-Al&%AZu^S+xV+%`sh2aTB^&G_pD_;ex<|y
zBcTln-q+xv;eDThW`=gopmnTw{~8`NU%pHi(h=WC+}_5#EzN7+tf+npGUR6CRhV6x
z8W8G;Iq3jZ!6E!PTrqmWQ8-T?zJM5EXS=cb=~}<A`!~b4x!-kAUyX$hy|`$L5E;^p
zXy$=(mfQ40eqAWe;@_3^qpx!P|IR+x$Ov1ObJ5tC$nax2@DGfIrC+g_oV5wVC_V2D
zNg<pn<w0<(krlIYJ~g56wjzo2Wc^clsKO&^&4A&x>GBoV^ef#}0onDk=ZMwn3HMtU
zlIu!pft7#NvWAZakIdRgcH_~~gwIfKL3GZ_EI4wD;Oal15-88Jl01)%y!o{0A0Q{g
zZtrTlc}k=(U9>zMSI6G4qOqd#!ce*@T#oYwi<rlP#B?0Z*mZ|d8~(5_;#Q#G=;XTC
zEoE#qY4pMCA`GS~Bx25>CJ<OWcx;Y~osO2^%+<~Qu8u1Hw^Q<g#3!N`e=qNc$Zo~N
zKTQ5jan_=(hfLGD<NDtoFT>Ok!x7`Og(*i4SzkIuTpl?gQ(5F{dFi8TY8|_*K*R5@
z$O9UD+9~+%_xu}EGG#yQN%Rm_HK0$RXG8gnf0H|>IRp`+B@!`r5*s_0O#Ap|N?f%V
za=xoEefyvy67Z&KM@s}Rs1~niltcTK(Si3`05h>`%V8!@R=@ecE{JrN@2Al^`llF3
zY%LY5$L(pIE?>4!e^Xwhb(Tu<0Wn%n<9PGxtt_37<r<Os3KU*{`w=)3uA8&g^K1Dd
zJ{w&xWco#R5u{)1pN!pkIX6zQ*R0|rHs<xd*4Sp@?F$mW$ngArJ|YDhu=gI5im)7h
zzA4kmUd8EcdLgoLa9MK1^T+kW(~o;BMqA2n>k|z1=BlI8-egli6bhce_rn)VGh;2B
zimZy0^euocT@2EYQe2!BPDBSyz%(&nnB2hjtamG>c~8^EIupdwU|uf>z~=z7>Uqa$
zuOQ2>7szM%g!-g$9w7Q6K`!rxJQ~j+Ipw$E9rL}|nL308)zE)0X0o$-rsui{#TH(?
zXRq9;0r^2?_|T2ttU4(xm4vkN+vgmZ^Je#BXcL<_<Hq@I8%@0YThqMn+9u}zFz^2W
z)D#wWc67x1jab6i<k}FdeCujNr&yQaLI_JBYRuMz3Swv=7%DOi@!V&_ttS^wUJe-+
zYdZ{g2uAw%+K#gFbIj7g<kW;#Jh&V3eK{9Od7;Rw1^NXbAb1%=Yh(H)v7j7U#-_%B
zJ_GicI6mX_b7+n@UYzt_4U$p%?Tkmr4DVh1l>tm@9z%`x7-ah3$^1Z)az`>U7O8dq
zGz3mQ&^V)76()Ov-Df?)`b!cW7c>a4CN9={-0#z6n1%S|6`mktm(q7LNLTAl+c1s+
zMC~A<;Gi-{BD_{h$WMsF3qy(?{fY8C5#}6cQKWJz9+9#wdd|4p6_E?O9cOq=@!fPY
z5VyE0OyjzPmz9g(A^P#lS%A!1Jv*B57IRx0;J%}LU2m3J?sl`AKg$TJ$QeRutVZ+@
z0ydYv$$M6u%v4d8Ae)eyBhw?t^?qXa(sTR>&HcxHOsfgD-g(~Z(T$myGugmh3Sy^Z
zL6igN#!yL<4rIF%(#M3KCGowu$@aa0%OU$&$k}IcOfy%1Ic&Wk<Da4i)qnx`aG-^u
zoxqY`b@Dpyma7T>W59zR;a|NarOToTS9{rceoVL>@tlM5dA;4Wh-We!Yjz3H<YKp3
zzSq3>x_m{G*y~<y7r$LV7_GNiFgjUvb40v?y&x+Lv8V;<F>@e;f|VCS8}JstgloR&
z<r5H_p#W#WK;CIU$wJ^76n~NpklZ4Z%kq1}Xk<8x`qNuclxanN*yyzBSiyF2VWQ@D
zeq2rsFB>x3acCf<F$TPZ6|jhg^vQ`Ex+pBUsF3Z?i(+u4_rg&WP_WNXQC_dpDEBL;
zlTVG4x4s*SRVdg=v!muuJgfXcuWp{=Yq(p^9(@D^8>#l_ag$ioHM48LVQRjSy_IG_
z<UaA)Ez&hAlVIGnrLlm&^)wZ|Dww_bcaL?$x^ZWfJB-L4Nl+EPs*Mv43JJFw71e(|
z;L+vz*!z7lX&yHWmzi5v3h$V^S;6D{c8ffW1NU}_4Uo*lV+@&!F|kgJTh}erbHpX{
zO?aiZ6)1(@iobuxdpZw*-Rr^6R%sFb!VfEA2q^kr>lLIOO2$A3pBZ><>I$h~<Dl0e
zY2PPJ8VGSOz^gj0B^bl(=q*D-4wvJ4Va>Yi9nuC_^>L*tR3J$=8AKRmHmP-t%mh)m
zPp1xK8auM1gV3LK)K%_G5s1kku$kv*JROzfj*$^+xuafgC(PO6u2xce&b?1>au||)
z&X9@=aR-2u#@ok_xjWX;kBM&F12Gl}7`^$dY~J__E8ocT>zn@Rgziz!SLZsGV8dD=
z&boGAz4IB<kbO48I}f+L0v>ieEM<P!87h%T3|W^PFi;~Mm|+4)d!+1!Yq74EtJG{b
zie&cg%ECf2i}sP+v>%XrGX74v{#W>b^yX9#3#O9q^ehmj-WQ>s|1|&Y<!d2nfmdYr
zDq6rM-C>=IIE$CSGGMP;EB~cOAyVtvlV?PR&n}Dc_f7aisry^uDJ|?;J&G<}BCVi?
z_UT`t7m`}|)8wL|fNH_xc_CA|ZCA~%%1o^09=wCJ&hd34ylGcU9?1I-hp8=x88;`t
z<vi@&OvA4#^Dt8{;uAI#)Ynb^6CLv<vgJX`rHEMQ`o=eclSo?8S;{E=&<@%4)m}*7
zF&?%oa-0KAyh~|Hl4D?D%!wszGRX=@*WPYYiM9v<NA;)c#_Hou)=VreZwPC8KRrGV
z?8EBfaK(1%E$*)x#J!IBeT%cNj+ZdFe(!~eRDz)E85Yx`beyBx(&$+m*vEeyu4-Tr
z=M_+id$6w?KIYd10s)k7k<BxBQNgyhppVxj+mMOk<z%ZN?d!D9db=^3P<1Gly8<hQ
zMXb-Gaf4)sb|}ujk2$}Ba`AV<6~ckW`H@AM&{ICiE^3<Abd7Zl^LO5nt02ZG*zrxg
z*q}hmTFO9C`e^Yx(^_I`_PSJ%%xoLCQTl`BWxh}H+r8%C50^r>bJL>5nWQ?Ly}@la
z>Ww|(Tb}E$-_-DmQ_Pbv)hml!%yCH&mu>!VXnM%t)Kd(*Cu()w{O3{gho2kdoE9rt
zPk^ko4#dt2Vzx72F+($N7}vc+te)3&-9j>%`0DV8wS6`ZA^-iCI4+7i+#!ve?PFfO
zU#d7&(>H(-m9<6)^VJGq5{JE;%kR6U-ykK%O{B)8R;^Fn+<2`_NkgM&#LgrVJ%W%~
z`>2cD%8{JQ!JIODiQX!^H`+M4tdGZkMjrIjM@W4hiY`aRc2y;x@hs||&(mR}JScDK
z!a(Cd3Wd|Ck*h5YHJmcf6uK~X-fKMuQo~yDwIoG_@zQ@6JN3FS>GZj=5MW=`B7qu4
zVO!~};SRrGTx90DNyRK&dfkKYD+&1$F038_54#1eG$PY?0j*M{)@sm{YF04|f&|_h
zC%$AhK7>4cTl(M!>HLBL3ye&S%rhOf{K4R&SvAbNNhjLZ98?zG_T^xqp^pYq>`JmY
z>PmxLyQ3GG>jnarpMER+Lj4Xuq34qYJ;Flgg&T2i69Zr)`Sl6YD3-}t7|~H%K#eu|
z8;zi5N~m{~rMG$qv}bYX!Y(c|Rk>MdIFR9p+XGfOdF4scv04gow)^X+pctdZV|fny
z8GALrl$zia?m5?sdRmQzPE+giyn{ItJ7gG>ux7C4$JkV1O%1a<lTy+s!YA^}U7o0+
z`Xr_~u<Qcziu1G3h%|3TMj#%0Vb8^KIXbVB&FtDGz8EQuXL5#6C?*j4cj;3QMlhD#
zy5Ccu7j?hSvbfvlu#fJ!HL%wuHb!W}r<}Je{1V)S(>4-q#bi0e6Za4e58uqlxoi)c
zS#uvKgAAv3{fHxU#=)3qreKty!v54{n_hX9;u*t9?2675qxBsZH{J%bleFhdN1Qz+
zkAL<YNNrePMGQwet7pg?n7lgbjHj_2CWIR+cY_MMepl>9&J}t!d{q+gv_e1e7y4Lh
z^f9cykJ4>tI&MLN9^u`HmkCriOBN9i1cv_&)DgZueR&1vf$5>{7WBv^S!4y(8EFcI
zL*?z3y{H1x0=KxSC=edu(U)VpKS)*S$2El%lazmj=oJd^`KAaBJ>RV+qq?;-m#SS<
zR--Q)!(%M5Q#39x>d@QmWigF!ne<-a!vpfl7F{*C$3MLq{OTBshs5ZKhZBL;Pg#!0
zR)sBb^SA_}^=84TxTG@*Es;RIVdXD({hqZ@;R65fke9;dQ!RGBnAMPjP`yMJC^k-T
zuUDlp;<4UjWUGtjwQB&6pUQHCp8RtEVa!HM_aEA!ogZ$Ds|{r@T-*BtqlR@+(*>Qy
zw#-l7JosTZU6Sp52TeX-GglstoRUm>5a+)Rp{TaUg=FLgc*YOJpef@J81h1e7~`}!
zBfVWUew~sMQs+h$mti9>k52m6Kr$zneeBLk6RW5GEaON9t`H8QIYg(T;eC{w{_4@g
zN9F)_$;inXb)Uj0<Q-=((`(|1e7T(@_Or&sZ5M$I-6CAu9r=m6b^TKY1H2V%8>a4e
z;w7k5#7A08-0#n+oc%3Tr;+cEOf9r=3MWild}=hH?^}QL9`!%!ua7AvXr>2ion@n;
z-RLKcsKyA1L@kA!mm1x9Vy*%h(bH(4kl6g#aU)+6bi1#6Qv?@+!4Pc^Kwg-As<0^R
zWz{2_KoA#Ck!B0kh8iY1Obn&41<FGiC&m>B5T;hI5|>iVV5KlQbsB}`0}J=s)o_EX
zFzo<5*?AF!k<h1;84Br_3KHE5P|Klz6~5f6(s*ZhVGm4i*AQ5o2nrHyB+_k-!IqL3
z;cTGh^f+VU)YzVS#^R}ty$!_+4_We(%Ufdep4~JeTSsHEvO*j-HjL;5MRhE+hbwcV
zJ>_VxXmKt+C7%?{(9({p&D$RWtBDj-OWn75OEy2RVAInKd6N-_zgjS+fKn74KfW`>
z^(09gemBm0v~*um344sN*?8NRLWEEJ*HQlU<%S{L{K500&$2<pgRC^63`%;9AuQ1H
z7%SW}NO|VKx(1Di`I7SzlCJxZ|DA<CH5K2|fIIWh;h5x~!k&0{_6rbpMGyj%E{g~c
z7sA4YW}<`nwQb#gV_)h*1LCYJe_1_%!lfjPQkwMsGMd*SxX6+xyDm?n#W2R{fIDS%
zEIlUVtN{0_v_qA%)v=l5m1nt1FQ4_@ClAT?A&-vSrgO67=TG47V?;-4<L2Rz?7w>V
zp;#uA-1!Y$2jJG{zx=&B{gZCUQ%do3H%a#ZPWj(TO{o)$cu^VwFFTpxX>6)bxdl&0
zk5D|67m2GaRJ%`-ek`b!AqWQ)*k#>r{e2iV*t!38_P-5^0cn|18G^<O-ee|OT5Q&!
zV)KvP2vw#J(Z36)(kCLV<e>~xgt0<3DU3-NX(O3U55I%3lzE5c4PU5MfFjpQAVI!<
z1l&#kFKL=ygn|5Nw>iTQ!EhAmGhWKMEa(%G6iGHE<6;lW4#h7|UotA-P7sJ}UM4)b
z@)EKtWia(x=72=tYm(&9C^(bNe?3VTQdx?`vcQ9DQm-^m@=T8Jh7mySqq7)n@}li=
z2@7kh{S-?O$L63gB0VL|<$#J7QaM74#s3VGc0ZDg!JQ)Ey5RNNvVpyPLDWG`0^KIk
z`p(S8%T1jm8ArShH=TJa9r|sz^x>k23WBYZrjj|mf^r{y`Pva_D|Z*k!M#MXj60<q
z*Og#$p0~<uH8Xw-ttO(zGnYcpUve~2pA@kAs@c)oBk9+360JL9=IvK692-4BKv>e|
z;t3ABQOY+a49~xq?vW1$d~vvAIb{7Vd~5heg8pN%Em@+sJQ+>9PdiXGJf9llk#XOd
z*oe@?gz9|dH9jHGBJEx=7kY<E+Xnm)=DoQ`ZKHbEV>ya`<ZD;}7p1Q6r*oSBhj3eM
zips@uG$gKYU$R<Z*mLOt|7}9#ug+YF=RlMEF>gbP%3^LLjNk|)wOvNh%-E9h;C#}x
zCBz0B01agT0rA4xxzFskuOQTU!gPHJ)_5|WodH*pX7w78@WYoTMic%Bn%4+RyDkCJ
zrxQCghVY(!SH|U%zpx_ZLNe^wBhv1&P;`nr350r(ZLu_@F5FJE6=)foEt}Pg77zCt
zQlC7B-d|~{AzX;Tjb(=sLwzXxq2n3^A~eWqmvReLrhAMqf&Dczq;KB{!Ey}xh&>TW
zAo)*2NG?s6!krKCwuQFXr&*<yoTpi%_=&^VX^OK`jAvhg?Jg{b6a}~x;;(7E{wA5B
zh0U@5$;$?F`p33nkKtsP?Mw)dmIH_ACmUES-#p~dzu$FI)fCI4R$ng$`N#k?Bu8n2
z??TY)@sC_>X+ohF)}?n?!kRg9biEgz1nL>2KF~Y(LDqNk(i|p*OEe#wf-}1O@ETGe
zZ|tZSBlpiVcEN)$2(~ngQTi7Tnk?}J-%e`5nA1=LWxH*h;{4@l$As6CZ)Kj|{6kKZ
ziPzJwwNkOe{ahy~eX0$cKKu@mzX)^JNXsi5dPNzbNC4S)%-U!ZuP)1PxuvPe_wbp>
zwA%C@UHLjgm$bIWOEy!7B-_l?)9M-H-DKO85!Na0b?lI@8_`w+>|=fu@08?<B@1P?
zf*Shnr%Q|(qE22>f2{=v{3bXLRJ89K&*!uOn`Y@)z0ZkJ!TwZ*!a|oX-oxRb_+kXr
zd*uu=LQp#sD|mn$PN`v7uNerVgwjEoJbvl@H34yudK{?}sU3I}Y98MDqY@rrshrW`
z3P}pjKvZU7T?$r&8?M=o2KtBQi@-GgN{%wgz}xO#KHuOC2(5`xlXhvpl3Zx1k|p4E
z1vyut^tOcXRZp!;g}L{7-wT%@%Hcmqix7AQmlPheuKiZj>F03aqwwoZJkw$qZ#h@F
z&zx8S>(iGR{o;Yn=R|qGwaS8+2yU-phr~abu)@Zvfsn)CE*!UY?=)TZp>CXWp)}li
zW7qVtA&C`~{p>lkL*;oI8xPJCwq0*4ISMD+`mL$I)Yu^Y<{3w7%7IYxieSEsVYk$x
zCtn#AGPl5uB+KYSuu(W0f3_+{;Z0lWw0_}_%rrsV(tipFcY$w9z0A%Rzt;61g%r~f
zu=M-}ajsO>8v1rR8<Rd@$tTS13#0>Bh;JwnmwdENxL*13JW>U5o`u7_p|2H9+AU(D
zyX?14p*CTS6u0aSLp-NZL_MhM`a)gUJ*P^?Y?LX8!d%?x_|x!i2c3yQ$Ub{hDpaje
z=0cC*t>#M78pcbn#Jp`39Pw^ml6SLY{#E|2;xA#pCr)8^X7?R}*abGjI+<F0L%TO`
z@3fz{i-AtE{puU@@E|bS+R6oY3PX~DxLv>y<5TY9^uoJt5mNe%p9xTg(Ng(%xRd6<
z3*c7Kzk@@REc+9W2cU^RXI|F~T*K_S?qy1$gXNQShQn%K%b^B%jk&bA<V`CC0nD{9
zYQ-RwWiAF+>a$VLKLmYY*lTzZM20H~?rM#a^HRlUH&&k0R<^KFg)66>^H>f$^C9_N
zXjz_dh7lQK<$Str5`c$%B6<A-tna`3*<)zvGP4!2)(JC%8M-AjaKzDeNKjciWmFDc
z#BYefr9`(F6zal$45L&Yq9Ze02L<l2v2J{;XkaIcLqT^Y6X@<$3mjG$w7xoEph<p0
z$R1_eocV|@t|c(flh5fFq0stO^Yq>PeL8{Avkk;zK(*p4>y<0H&XnH-b#Vc~)<yaD
zmT}qG4+GU=iS>!>JSO#3!&mw)75??was~cd0jsF8(yhPOa=;8F*pjn!sEBx;cWrbk
z{jjv)6pfFK|Hy7vYlC8uwjVB3=~S86hW_2-4y3KTQIBEiX%Q{r9Cmm|xEP5r0y?a?
z;uY6k2a?`vDH1zk`Q#1n!ry1_-4-h^hJBmY6pT28i$LkxJIFmBlXc`ZdR9GmX4>%>
z*p>29!r?_Ruyl|p02&kMw|D*<-+wAtmaz*4(0>itSmQz8lw09E^HSXe1h+ECcN)+-
z`4qRIsuYa7c*4+nZ;!K?+z4^2jd*mFZBRO4H;TP5Y7frW6?ZOv+kuSY;e@&Z#lEc5
zGOAkFkpc#TykOxo;W4FCz_kQUo33g~!jm3pv~uX?S(a@ct)!1dN*E?h$-^|7H?#m>
zy(dA+n!LG|jsUwi!9g^xOYmD{&|odxoQoVa05qnAqLk{RPPT;UMQsIZK_Y7e1Xw5O
zE|z@c+jz0?J|b~6F>YZ$;m|E1#pTr;4cwgam$gKraZgOYMo~g&KV4s%ReXt@l%3Vy
zVfKttS7JKQ(hsblnu|~n?dG3!(N~~_M1~>Jl;NaxVG@v(j5C&pq%M5bluOVldh>FZ
z+`Hyq1ebc|3FL8@hfpQe_Thi`B{vR<<eypn?A``%uc(b$(j4V7BgGGaizI1!>KGme
z*cx|mTNe6jPE0QWb_=sG!l)*0t<~ou#|Kf1Uly~?UPMp(MLgd@M2z``%g<mviOr%^
zo`mwdoa_p?Zt#i5A0Edf3s?EQAA{+peBVe_<2yX}Y5!WUO~j&RErwU4u<cN-S*6~L
zlnudgkR5wr{tsfj0D`aXRo4yjNDBiG=x%o<;Zmuv3pkt+(JOxY@BGe1j2%XprmGeu
zRTxqbR5P<P!9<uosRCuOax93lnhpAnPbhxUB_L}}5JTQg**?mZaMPC9nJR3k{~jm%
zf*%*pGKjjHRDq+vjxLlEcYS7uO_8Sn#%xh|x7IiO^`oJi`~q;4@!4T~Z*SQK#K*Yg
zOm_2%ddF~z=s&@dkPLQ<kZ#%eRUbqptDhA^kF%saWb>tcaTN03;2EslB5OcKsM_eX
z9c<QjnQ~^MBigJ~vufkqZv!$beo71_jKEvtK~Bgc;Q%!pg5?LA1G=l3wZpLvhmIDt
z!QCX>D*4}dEaEBgoR(>WjM%(577YU{GQyB3VRN}6;S6T_nwk((cJwrmDuzOP&f}Xo
z<K#Uqypai6y5|ObN8+K#jg_Z8qvAy-RU6WQ0-3MrzjBO*J62HowUYg1G2$tlxZb-3
z7FL@+D25a1Uz3TTxl2VEN4SzLuco*rA0sq<SUGj%=t@*1(?dtcQR&`8U?royfpMyC
zy3tYpJXF-;;4^aRwRTqjbIjD$B}rbH$;q7KoslJZmOxAh(X5ZC9S;t_Egs|`9gb#h
zsJ)LKMTbVXewofu{Rf6O2)V7F!p&@?BJMrX?EDjHsC(Tsh%_%?zFiY@^J<oZHDEz>
z2r~C-UOiUJY<;B^JFn22gC%4VH{#j*YEI+c$9tUY!?;u!`7iT>U4F>w3gULXY(nq#
zS(4#@y(8%gM~*PpYH}u5K>_i;69KF&!iTP7AT-bRLg=JP%*18b^Kdx4Pn^Wht3}A#
zP+K&;g;2At^wz=0z1{47RiP8jdre8m#h)T36Dv|<I^>J3LYCDpUhx`iPUd-Td3KG?
z`_1@mNX+e)+xZ4OFD9|rfwXAnh>uHsjYS%pVv}h3Q7_~a3E%r8+Qfl0(HikQ9q}X>
z2{*xX519VC>$w<>A?vdkIGNPOMlbUJZgFbILyyT>GFp6*;<B~)>@NNu)%n$vegtPx
zMKVWCWRahjacbh4mFG;SQKIXaPGx4lsIbW|9P0Ls7Hhs%E2nC<OkUV}QEW>$`RYt~
z03Ar|JKME8CjN7<;=6Cz7AlK=<(g06E}3k#n_}hMVp*M(dC^^(@B8tqZ5(c&G`fqj
zo2I`(<eU1xsz+puj4y=qB<2QDx9D74JPVpV^OQM-t#**IX96EvE)rYn>~~pgs*V24
z3!V^gSfqklBd649Zp*Vp|C2_brQlj;A%Wg$9%np{II<($z&JB1CRjS7&0K#D&qGE{
z#}oeQd7Afr&Zm(aj@RMICT~9=I&senco&D|zhoULab^jGtB+HAUp+YcHMl6;f#k@U
zk8S^%_4Mk+gv3-AkY~uO8*zRwMFsLqKSL(aggX1%?;cv|K>KLWRcO8vYRn!8WhHf}
zZhe_mdLWkgoD$<6Fz-q~7L+QyPQ}08Q!Y_BQ?Fy}(61LTR=)<dve+t$ZfVqz($vA~
z|5%gY6|<SSkx6(o_N>@rBVTt)bl)wO3DYf^R~}~9=<~~#MErR>Gklv9^ZxmzQogZd
zsexq7B9dgB$>fRqgZa=_;aAT+C`cUUuHz(|^pKnj!|Gej0>U}{n@C-xlXLBN&M9pN
zM@jPmyrTcja~P!kwD&e8$6EXQ9zx(w8Jw|m_MnXlCvb8vWS^&=X|avW$0|b^Su8Ip
zSdQ(&@@M(G_%g+Dd-N<$qV|q(o3N;jSSqmHYbDbK=@VO_v!te6L#C(kxk@GjE55@q
zXSa_`JRrQ3TWg*n*!6~aRv(P+zo%@+LGxj1nwjMU4-lOsBIeXqzLvuY_orAahHC}d
zPVPrsSW>6wqD+KELf8#F+g+qh89P&Q1*{@0oRk*Q3D6&M`Hd!|*>h_Dc9}5csHbb(
z3J(+MFRG1e#0}<@xHUxPq85(sgcAx>s(IPYy9Ko0{1pW%l{wUbl5itfR<g61R9xaq
zYFF@Lv)DVUf4x2t=Yb%vDAGpf&d{*@S?w^g%}b!x%90O5L<K!N3qsShUaDX{vT6__
zq1d2E=<<Qx;}?2bN>|xQyZye_IXEPp+HUIT4`(IKcTIs7>mJVTUf==od056qG2|`y
zNg+H#$RkHB_KcL5EKfJj+bX#Dl!@N(`LYBD6`3)2+`GGbZZfNJuD#7`y_u{vb>v*L
zH!_=PUYVmfUB23y7nbw)X-%FRMjVR^eM*UFwjRJ4F6>c5|F)9hlgVzpGgLe5T`7$^
zt}*_}A2E?*QgwTD4{R^{kkuc#okZQ(YbT>Z`{?He#H%BU+pjwq$d><mkHI1QAy^^g
z);QI;(R$3MEM44Z?XxzZX!_fxOGkk<R<awdFSjNRr4Lx8F-s5Q9S)8*PV|KzJd7EX
z*+A4VHz<@X{Y{-HG~m42g0?F)%hB>ujjGn*1VSIAafdA%3w{$4Ake%_EXWe7o+K%1
ze%eOLiW~gV>)ImBDke*efD37t@-C1)r`e1X?3W&QODw<ab0f7C(h%pg;SMBfep8TI
zeaDk@MYMq4#jHI@8cnOv%g@HbsMF4vP#n-FWle~rMDV3@d)f<fEGFIRJwKYZPai8E
ziMJvu@pe3AO0h|6-IG5jm7~IaExw+mNHc_w-oM3J9aw0K!!p(D;+g(QyHI>Fh!#U|
zgnQKRcudKBGPW3&{$DQw7-OpYP*(IsqS2mto0l}(6%XP>^DLXR@I8fB1ea4@zKe`c
zS-Y*aV2XDlnr1dnNXtr}FL<ddJv?6YyYFAw6M_?%fVb&c2dSyVO9>JKCgI+yIiF;l
z$^Hm@j8&sV;$AC=k`^F2a7fYmJkuSYGm@&}^86)IjN-QuQF@`=lWEYQmxCF}2Px-e
zVq3o(5DWkvPh&2N2Rd{O?}iXYgO2k((d4ZE1U00Pgb3L~TzT^1=%qBS0%&XRBb=X6
zi!=C&SgiZ(*m$ua^dWbsU_CYR_-NHSuHp4})6w~)?fWBmXow(7GuZAIUZ&d9cEzEn
zcCJhe#3V>AYo_F8mCD}8e)t%K?~0n|=cI6*h)@g(a^it~fVCBAsm}JF2j?Cdo#vi}
zpn)xQXTbQ=s2~CTpHk{ESQxgFeEXG=ZfQH_9qPT1r^02Smz6X`pd5glEb+1WxXQRJ
z{l>gF7;i6%ctp$h)Rwbd)+RCEBJ7XfZ-#!$3w>dB^(0J#F+Mm@AXSg{nF9@zlyA>9
zvlt0>dToK)n2k^WA+CcQ*!m9?^v#9S2f5R28vs16&n@*&>sV81l@Ay6CPeykRQ*Kr
zoOVA><n-5T@|D|za*SdgAXOK4H2297IIMDUx-z3`1^P5xxd@L)k-?eQDO&2^Q5_Gz
z<N0(bdOxpv;RMnOC+X~C!KUCzB33LN;USRTeXl<)B$9&^%-?HmxKp?yGgfFpMgcc;
z<YP}siCPMiI|dG^NTI<|*%-177cB!B(jJ!!W_I@VZ?Fn=J;gGcX+qqWJNIm0sFB_J
zU^5cKL!8^<2n^G<OR3y1NJkhc<j#}<>Yd#E<T~;~**}U`=Q%j-J1e0Q8x~vI=KFN-
zocZQu(ZrriAxPm2eKNAqrS6$*DQ`DeH?+8yI0)-)c|lTP0ck;cS|)6Fm^6t8MSHL!
zvbg-A&o%v=>)W*5o&~I=S@9Bx8)IsRua`3V7->Se2YHWh<ttp5Uk6bR-a%)0L+`OA
z=%Qb`63Z+8lfvWTJ7)be{TO(dVvYR!8wr3I%hJgQWRrgyi8T7<*JsZnosEiLj*;)Q
zPfRDH!`>9g(*6AO&+q4fk`rG`B{COj&FywO-3T10`fTn|?B-f5t@bA-N+fD0+hZ6p
zEY*qW>r#1vTPw30;$pW@C#jl<j+3SamO_OZM=_a00Ah)Q>*KUcUs+8?Z*X%}B?|o4
zNRz8TED^@HxKMOaKf#ZGDp6<!?PSd?-9w2smOE>I*(+!W9oM4{8OG&0dXY{FO~XoS
zuW;PWBgd9m@SB_WyAN4Dcdq^2vXQr5awRQ4-8o%lH0=3qn3D78Zd0FY&R%N(sahK1
zy?}+XG_-pd&zd35*S_(7*9`i{OgOa}jl4Jhu%?BDt|46)<rBMTi$th~jTG74amuOY
zc#7e%@P@pg3W0{K*1{Rk@pi8Engde75fK|D7c_+b-1*@<x%{e#ev8J)S~VAyKH)Bh
zuP6`ANZ1E07mWZ+MH<bUn{?lnpGHvCpQhATBa|^TL)F$wat}fz)(O$CR_E<`w+iz=
z*hz(^@>Z`~r+9lLdjXNkkFya<o|`uH2<5v>bgtSCe%r|2l4{aJPP-9xZVc{J4ozIG
z=Dww;jflx^+)`cKbBL9&=NeAb`o){mhw^r@QA9g|P2w~zdUL^zOV^_w>_2j(YHQK`
zDqm0`#^jUY=Hh+PDPA)8`y(^Uv!&t>sW%t`FAolO$Xuq>J76d+7t<#3sDh=umR|9Q
zgZa%d(Xlt=uSw|_t9n_I_{Xdc+H?043pR(nou!}dCQM_9w#fwZ9LxqF&;`yKL3xnP
zg0G;Jdu$@z<$B~lXgfEfA#c~B+msR$w9p<*F=8{KoX2*Cee~c_Q|!s>>+KtB$3N7j
z8YruV$r-};7pf3OQmDtUe|$SiJ*wlXXV0J7iGpxXg5W4EHRnV6HF8=LJqx{ge`*Pq
z6Y{P+B3QeFl=3<b^kHHZoPcA`Ii4(pM~O;<kM`aGRR~tB#W$yJm2MgX+&KSxckAX5
zrXfh0<UjbrHXvj4c23M7B!&eYxvF<Z4@=LnJhV2>ibVgH-OCYAt5Q9&R}y`ZhmVp>
z7&X~lKp$QhEf8-bEOIa`l0lmohv{P-!}9tdsRs{@+qBn^`lB!SFmfQt@ON`zkEIZZ
z@8;5pvx(xYv7j&6U|^^3o?F0UxzM=E!n0j;O&CU54`TTW+l@bm^T7GJ+^tWTKfz+0
zSWjd0xSKW}1N;0{k@>>!yM4r~i3Uy#mQrSGVT@xEgZ%6xUDm3hdNwkMRmu(#rh4b(
zE&-d~!d?46s^JX#k+$Pie#88KtaA4pMniL~M$yt6Sy01KW?ynB!I;ny#2P2_15kmr
zFMFMbRl4sQONC#4p-!t(DU%bby-FELyeJZiOvdJQa(r<<vsp1VcnTJATCTUZ{~MiX
zm|R)G7hmI`nR1oRd?0&cv9YiJI`SUmCtIipt%OgERTC-@H8aD2pVL`3pR_2Z8xle(
z=y8s)(0epX3bo}kK%gcB9vsZS_~-0}XD;O46p1MB%-n<bsrBzv!=*Q+V#RAEqyFG(
zaEAN9qqvXp3HTS8j}z!6gliI+T6c5}a^SmNSro68b80oPF+!T*NAvuM3I7n*YZKn3
zcovm8dk>CS{|CR@bk1ZfhiAgEm>=-agi3hH3CkZM&DW4RCu7aH=<~iOHC8`@E<=>U
z;7a4{#iG}t?yNvn%KGe&zhebRG7PNDMMZw|5o=-1(;g6?;-4z;vy4-KSZRMoSrREK
zp>29Ie_{@DF3)n0Y3UbTgqemGd^V)ip!?bw#*LQj^p+ueKh@lFfxLy|OSju>6|JU!
z{7eKhhWV?k3n7KVt}8uipGTXEYVy1?loa#$oFgWE^Y2&R1La1a)xv8aFMvZ76R-#`
zJ-$2HGX4??3!pj!rW}wlq`G5tuRgd+qwFp@&J)f5x{oxV7s*Zi%^2(^KC*tBa)yC7
zT8KM&HU_0->7*Jg)ki%2^`i+}@^|7R<7e++>Bhfs8|82QXObe8LT@M{LY?F49fgN}
zG;v7ua}gjgefqk}0Urprv~Mol)TCUN7F%lXKp$+(YE+Z*VW)10Wr{Y#odOVN@}wjm
zexeeOXPscM1r5=JfT<_IMfpVqW?`iRzpr~XjTY9D(+gqpIqxm`5e-W(>8^^sxw7(q
zSFZnmUilfx!3oSPu8{!O<gsaP^jX?`;hhe{+~k|pfsxq%zlJ(E(ZhK*@_e0@4*HeI
zV<|69$0RoBk0=_ooeKYdFP_9qBDo_Z2_zY;@OuJl*56qfDd^mJej<R#fS&%r&E7zv
z;4Tzlq!y_`PYEd{i%Nie?)Ds+*H;T8*i`;3CP%QM5lm-CPAJ+#5`1X%-vBMdV08(2
zFW0GCjN6A-(m}g8Xv8$eZrb12tQ8FJ_8eHj)x~7a-S<hKi2}a+(?sdNH5Jt#c-C1%
zXjZorJy`jSj0WYw?~@RJOZDILWe1+GB!OfvZYB`nO;YIAM{BEOfu!nrn#=>|d83+`
zP<C3x!f~M;6PDj$ITvP718B>0poRMCT0Q^28s?L5ER6URC$;x|(+PePUTbQ_apvqO
zK-=Q_cKs3f>1rpELqp|2f|>Q)cfX^N-_@<VF^&<AJ^)DLMp{NS&6Ms2hXb%#!dvB=
z8KQRw0$a^Qr)VTo<l99aMX@H-eEd6C!O7raM_s8;8oLh+8rJXH4mEa~6-ta)FbJdd
zR}KPi;=!-9kH7OL^}qXCPik>H@Mz>7+V&I>N6LKa=_7GjxH0{4v}ih~2K^$pYM+pR
zvS{FGd$s?op9Ap)Z1U`yrXM-*`vR(vRW<VisSFp#FQhrf#q^+pmlpcQ{y#-nS_GDY
z*wa@)7ud{<sEbvfb!uAJ0U7A;Ek8xjLRCZt$#tbHH}V79KLWg@O6yz7sEmhCvwGjp
zKBaqXUtau{p=9sBd&ZE6g<-co1)YeeV=@<x`6sbInOX->*ma?m6SCj~k4f(3K464^
zCDP(&oYr5UJ_#d=d@u-9!44OD9Ub$8L)#wYg$68{Z#DkAXSX1yKgiV_bKR$E)l)hC
zz<EG9!PWXI3#hqy`;gqLbG`R#5|-1gAQXYR*XYJ>LK`^(c?IP?a8P)`3bAVsBhp3I
z*e_$r>p1fK;B7IQx^L3)@8*kK%JLoCJ%c}E2jM%vK;&Z$u%CQ!`L`Sh8P5ZfPRrtu
z#uyEWn@0;z^&1NmOj1zfr22RxqP9$S>!mbNLLhl;Pf{3kLug_exQLw}u-*J4ID{bx
z-2B_Wdc-=yjaiIp(pU>SFOSBa=LcSSJNjr#{%-IX5uKg^+hzyztHYPy>3>hzwZqb<
zHVGS*ZhkQw%-WjMT$gTq%j9m`K?Op7Yid)MVE(!Rmcb(Os9%XU)8IzIcdaNF7NgRw
z$NGgQjKGa;$`#-dI;z^ww`xJI3ZOC%Tim25jY#gbrLbz!^RXCdKn)*&^#TDMlnV-n
z-^<>9*YN|y*YaNU%U+RwBWvCVwiKd3)Nj7D;D)XL`$KeYo~Y7e>x&<%yreuuP8|T#
zbOY<&Wj3NR^RjF`->P7Z4&uulqO%1y!A}OF|9yal;u7#MdqPD_pJv(Y)Q`!;`W%(K
zI`ro!w*UXSOUy<tCzc|+AfC4o1ZvAz9at6$yRE57ZdcZ`L}R&T|G%E=a+pQ&A?Wqf
zU_Xm4?Y@SR)5!+3RRK+<hNlV7+*_@iyNH0!F$9AH>(pa2uHsFRJ+30|b$zO)|FHmE
zgz_KUQAAk2a{1}l&I%W44zX)pozn;#1fEp;H!AUU+0GxhEAzTMfPt<N?Dgh==xw^e
zF*!ay<*(GEFb&_o*^l_a28$Yjm0Wit#RbA_vi6#yR>|WpicSDtOyE{bi1@<)p9>z$
ziZa4rVOY9SK}~C-JJ%FQqFxGLxgL2QAR=}bfn~I%ie=hG)^f{)LS}Vt_`;;){97j_
zb(+VR+cz+@&2`*OD>1xt;L)5>ECcCN0&3~+{C_(*>6(3$J->CWM#>YKSX+&_nSH;g
zAutT^K56%WjL@zT+wH2R_vi!(4iD<{2jx0s|4kh5#t4R`$(OO2L_56uC>G**^;0mg
z7TE8my;RkE!>vC4nKCT-c&ndeMmtYRVx?31`tm4v156|GO+RWNeNS=#PWSfrO|QVC
z^YcxjOpNs{roc;A&=c|F(8{!+bF7hvoBVbro?L>{PkvG^BjRa$YkD&#Ujm^s%#uIT
z40nKOEa3@*$MdN2!)^2HIY6SDx{h%#`QOaGtylokB{c~uxlp$GK3m+rfKT}waMy&>
z3?QG#*DwHzA=aD!%yAz?`Z!&DU%Z<70n7tYq$j{wqUxBaEjG(0d}s9jb|vtzg*I-D
zFM&W?-R~N_Rn|#TX*Wa;;N&y7*whVt^hfI7m;bFw>^C=%<^{AygZbmaSl|`r0(`eS
z)7h?lw2A}SKnQ4EE%ATYd&{UQ*Dh+9lud7tl-hKIfOLb1APUmmA<~jcBP9)zBBdbR
zNJ^)qNS8E-lyt*)-JJ70-}}7(zklBtXN)r(=YV_P_jRo`*IaYW^;<j>`}TANN;M$B
zWLR6HOo4HJK<be9$($9CkJdDT`yl{c0D4_paIo761r0H#mZ_(dHbBRQ1VXMu;wM#~
zqeOap3@3MLtHXk@%t-egk__B1C;0V8Neo<`i!Mj-*}h{;eDUqT<<V0RrbYwhC2vnr
zFrL5f)uz$$&nMhdFAIw;cN<R(VXtei%Jiy#$Y6grcm7@f2&n#VLj#FyCWj28uO7%l
zPvC40O=p%{$2V(&&E?$Q8&A$;59_4SCOPOv`oU-@9W}1Bs>1fnjycx-Dy%FSuMfcU
zj0HwyuF$ph!KQ?qWayxuPx-C*m-Y)85DJW78}3kYZGqFa;I#vtvK=@Jb>gQBZ>96f
z-a5CUlLq2*V4E|;5VB~cT<p*z1raf@#Td>8ZiOyOR_y}ys~%bt4gZT`#T$ZC5E-Fi
zze+H^IH0l<ExIG5=Q^fwVPt}xHl-MdRia+2y?8AP*Co8S1srXHb1D_<ZULI|7!&ln
zp^z+%vKTwH4x$X63_4*@m3s({j|WH0HZ?)>gEIBlf6kN(KZ*q*TWxMxK9~dd>|p8|
zRa3TuESYMbu=pt6gph;3=+f(?VIEg2o`C)lS@sy1C7ioa)cf5JkRVw@jArdI)GVYG
zUHTHt2!Dr%&{FS}1J9$*qd)+99#Ymohq4EL2=~1i8i89^qMp5Q`_?JN{OO~aTrv(^
zj{n3xgGebb0ZVbu_i~x0oo*Ex3mpKy?Z8qmy0gA8{`(b;W$h=i{T_N~fT8STOW0EO
z`)3v4^nJNWsQ!^s3^oj-#XaA!lDtR(;K9a!5IjK|@`5t7wx2pV`cOs>&wY72xg8&d
zl-)ZZIByGOOf-jfx*#RgXo>T@f&1+^b~FUd>SgLF|2`c~=;?H%Aq=76z3q3*>97es
z4%W%JB#Rd}A(LQ9uqCuZrdHYH=q>urxSoSuN62R2Ppm*Jo|ikd%TbPd{zTbD-}Zam
z{^aLQpER9_Tjbpf^k<T>fJ<p4+@45Yg5(F*adu$WZ|p00Kv3F8J%`3I5<OcU&Ik*X
z;`-ZTB9m!H!17*&*0L8G#SOawc)$Tn*F@QYb=oG31(Oz6G<4=19d_il3h%{$Tp`9n
zgpunhJ8&(!k^nb+8_cOxN#2|d5!gg0^8-4Cly>qaOYZ=TqVq2fN8f&oEcpenG`1Y1
zd!fskDgKK;TFozWKfVeB0%?$|gPS}@QJ}X81j)!~y$tH1Mvua#?miQ0byRx|xE;Hu
zv$glGy+{`+whC}H?W=we6y3l*45rY?jZueQjczR08CY5tsm-R168q^K*mhAE?`|Bv
z1ZOrsIHHLZx}H@q6qp;F<&?bgpyk;hp6Hfw^W`dcLQ}ci`Qw3q({7d=@NHTJ1C#ih
zL`%8G3jWyWQ_m4vp;p0PSs0M^)I$|oJ#%AF-cE&^A5VeULoYGFR#;@3WaRT&KdP3-
z_=BU|AhOndJ7#@Q`@TLJ(`uhv2WE}YLi`;!!r-jgfl*7!{*vuK^>TSGt|yyo0K2`w
zKnopz2G>;;Xac*1%Vm$-X|8_&SDN&{qKT63qKBWRS^l}sZ{I;j*k8@*>lOH4jBh#E
z?5Znp5Tm~)MdA@@s16gdq))Q-UaK{@(>|GmiGVR#dZt-`S6@JbiD1NA%?$!jwn8f@
za-;^MZoghz={dHdo1x~X1t8U+WXa5mo8Ei_p;Y=1!ac`@MpwNg+Q$CVRM|!skpN!#
z$mZU?g_{G?epEbYQ2-5g15T-FvY1@L%i|Qm4K6FK<AYqr`hVLxF#qv~P*TaE!6sUY
zC%{P`%6^t^S3R`_eySR*pAm!+MKvr0V1s&$D?GCRKme>KiEb5NURZJmkXGP4{GO9!
zcyDC{pbpAJHfW<xF_>)*Ub+F70Vsv2NOx<CV-nU<=Pxb!QbjUtaQU-P2GNuymO|;b
z0Hld}S_8@NmvR@*Kl2KK!$3}{uTcnHK2RsYfB$_am)9<q=n~Z)jEn87{b~8Gn%r&0
z00kC7dMMAlc_KCKJS08=X4>Z+MRiPqh;6}eTXZc^$8&`paj3x2);1ofIQiw-eJ{;t
zbIm8uWw-Y5ha^yCULk5Sf1B}3w~PY7&RFC?Ymg2emr&H3vKME2(Y}<;Ur}}V;{!n|
z3b*9&g(*Hsv9I5tBl!XL)s?z@_wPYmIWax?_v7eNA{((((npK&9(3|Jzn|O!De}#l
zv^C%Sl=WU>;cMuUA%h!qhXz@hAawvZ!KHNn>Y24;@B|a-_nPw952rwuOuD*UmmJdf
zAM(<YoO#v9$rqg;w?|>)^hS6uPI!;7TWBK=m`8=DU_?2bewcF%U_YPQu9>I4TFO@I
z;iaCScb>@a|DM=FM0t7%UZ3R-Ry;^+M!jM&FV|9DE?u~_d>3ryt<mfi!*yB9e(nZL
zM)`}UB9x;u;)I_Qn~~}a&;8_OuHc;ywb2!=-;=3OA!qocICh(5dms5_dH~s6E`ei2
z+6Uls)jJ6Mbs!^y-lq6Jw{aChGMcb>ldOOsG<#-f=%gc48%%a>C!0m_EF}^!@EZxa
z{mfQ4uFSaVxBOuJ#j$eI_$_YR03FqOuSG4cFy{NFKbTdz%C_*qZngw>!=kGW^2ISc
z5{iv4k{4Ti4ZlBy_X`HoCG3LuNRZJQ>8XJJ5ejgQdgOeT5yeoP1-nJ2^qmLB?tZ>Q
z$%2{WVo#lVXkfIf&%M}thT7@E!l{0JHspKH|HTPN(T;bmzAY)D?{t>WkaC?j{;#T@
z9L4v1B%{ArbqiFlyg!)%2k=qAepX1%o~D9&IOz70$z*E4S>CEhvwE+kSlNP58BRt7
zN3u0v_@t^{?PNxKwHwA>LMCnPbV2(zQZMEX1Ir(lEy`|o0%SN0t&H*g)AB*FL`r;X
zBqv$%Z$hMW!Ooks=KDvut5!>|7n)?dWOg<dLul~aGnT|1qigl0Kf$$+Jm)VA(TXgV
zdpI@UEMjrJQ&%vD)%w|S>K_Opo;W=nM4l0=doNk?Ye-fh&v6@A+fIVvElT^P4kck|
zdVUALHV=myFhF<k^~N}$xWz(@>}3X(q@uu0F(T-F5ef#SqtrJz${%8cqFp@Of8Q+%
z1dhuFAva97-&rB|ur?sSo>USAlc9O0l|HD$1Ug48+&+8Z+8MO!Du>U<t?%Y1VR9({
z)p(55lag_{!fZ0d+m6H6p_^VRNx)NdknO+}%AT12QF#t71mPv`Ws3PwXADJ@m0jyH
zi4h;7E8sBExdynHi>qB<nmzHbSJ<@#d{KU1V|3868-Y7Rn+#0VZ=P1aWn|8}f3C9a
z6S}H7cp24^0@Ex3GlZgOC`?9@-HtwVwZF0dkc+z6qFvDlzcQCKS7Iy*j#F_;sws;L
zP+$D@*igzV0~F^1JB6yAM+OS8Q`y4X;uel#=jVrwoRZtx*L1b*AWun5O2mt=nA&)A
zePOq3G<)ZM;k$os_m5y$Jb~=$uJ4U!M<&>F2monX9k!etgTC;P&^X`Ehu?6bP7u{8
z?4tyw0VfbeV_FXFww#iu@h0lNdlFI0elRHNux8(o@}6_Ak^NkJ!pTA?N^&vqF0V4x
z$GXiY5XQCG!Mf{Y0_cTQ2oCp%iQ7M-OuDQSPpF#27*a`ot=VP^uGaUD9=)Obs`-w)
zU?A3QVXeSk09+KS9nA!Sw?U^5Ix6;_=JXe9@z&sBDgDOyyXRMvf`PyD?sfa#p&Hxh
zfYY-zbI2%ajYMwBz3Ct=R6#z&G4gV6<J+TOA#A}|+diPq<f~cvK0D3iyw5rV2t?Ej
zH!_+?<JquoQ=c#YeR4?HOl>0|0~1Q0Ak-mZWD6v_9<SUY+#RW>w_QsRHSP<+aKOU$
zJ(r1Tm0rxtopHP=P=INcmlt)%(1U{0x~SV;Ar_1Z3lM1&xuYz}e8{Zw9RDzRXsm|D
zm?2d}`Q*nKu~_o<h_d7ydvb9@x4C`f2k)DU`JpJu>(jjV2SOQNe-#^6jA<E(qkNNq
zFgcpKtvR+A=Ci5`faNhkzm>59m^)&;Q_p`r%iQWsn~G3}6#!R~=q~TK;SD$z)Duk3
zNts&!AN_76)(6%kb&z#T@9Y~vfmDPbVTzRKZ_>vFAbl{4{>l4X0qIN3SpvH2HB@`F
zC<SA9i2||g?$iKcK_Zl>MmvlLQ>R-T|1kYdfPBqIc9|T#H@YXRAg9SAT0U*$zvmM?
z!eu8GB*km}G;OaeKU-xl^H2H+l%jzim5H}ExPWb@$emzf@TByE0zPAc?E`HCsR(0o
z{{+-jYL;?9jYzL?v83ec3_`PX>50zN%{S*0%~47m^XKC=6#!;_C1pkNN8%~#p$j*s
zM%0i_$;icEfSOB3yKo0&`&jhcbrjBHWEoT&s!&=1e@j3$Z2_&pwrlellNn<hQ*7>Y
zJef{4CMy6cWQ?Vb8Fwa7r!1+9s!69hU0bLLaW-+vy59&Au<4_khEzLJYH~saS2b`+
z4-R@RdY5gMbcLEkb>|Jq$!@t7-EwqK{N?X^--iCr&_sny^2{w)n?pxP0ly9$d5ibV
z%3N2v1uFk}91V#3a2fQ@5x!w_;DGTmyW!~cu5?(Ymlhk#A2=$@b*(lBaK=sPE`(JC
zx}Jlih{ukNZ=X5`AcMGuBNdDw+L-_ib)_ZE{nwQ2{lB8UjonC?6Q9k4hSmIi{bKxa
z{2sK#p(Z73avJXqitYw>!bq5bamsJn$829yBw9o;NZF$10VCK(i2kj7t5wEguak60
zOvMqdn~&!cqK>e*9~g$QJtP4v(hb2PlmPBhh9@Q)vSG_^OzH@f)>910Q)$~O69Z5d
zUll&UKqg=tXw?B;RUo3o2O!CyqJ&_$WAma*<N4$HV<96;OaB-C)X?4sLn~yAEPsLZ
zc)C=Wln>Dp)gJK|FM}DeDJarK>X=EeHXkIyA>LH``Cnn;aqHQzP}G2>XvSVJvO~jw
zxxP3aV1Z*JI!+b?`}m$4&embRr`}e6odF8_vkbjIe0C6wg(o853?05EYIz>Y5ANx=
zVyU7DQgZl0wlhJlSk2Ou#;!6P!sjCMf?i!;d*5CK_YED-IM7@@f69cixgs(K<ieWb
z+8m617Wn5gca3s$?#N;YJ_=OCa4)fc3fQ{Ahe*<XEiWO2$4<<lG5J-RY(?<j=|?!q
zTyP7J@akIuYnE-q<3t0usjW!CUwL__m4lx;O6t}=WF1zLcnV||P~2lV`q+*gPCD+u
zp&Qx|!2bv<80$>xhsIJxSqY8z;V|)pYfmUt1(EOEkKqL!Q=>5({39|yG+HLC9Y6+j
zaDDF<_+G<3Hxa=%NR~Yw_k@dlNAk>UKjWZQbsZAWCbHHVEwX96t5R4WAx)FY^F!Ww
zB6+o;Y{tQB|5#JXcwYA;!|{jb|4^eZ?xApGo)S8MH8ACOWs!K5Ka+_Hz|7hHg9qzz
z{rw5qwB?PZ{X0<-@wDSIJxH<9dZ1kT8LL~y7OouhwNBNf5;X~jgWnN*KfZf0-Kn3a
z`|#Bhf3KEa6v3<}7%_(y!n@48wct+Q*vpC1(Jz+XAxbjFMkwEb=teaGmhDAB$b-}x
z=!!MvU53;anX9_!@t83b6~<O^jKJ{6)bV07pf^&^g|lg|!OeoV6toYG%Of=K%@x_B
zLcC~PegZLd+GuJ$DZsDwcaQ^f=dX`z$|+E@0`k|v(I=64IN_H`K@6Q)5X}GH&_f5)
zl{gMHDBU<GA^ugo`mCK}1LHL&H}9mQ?bm=O_{cU+2mad0pv-7fMYSO+NqI~GO8`D@
zVqi%^7H0y7^7BL|+)t4Jqt{54Wxpx-YR1x(TYQMEDTtQCsO`XLk+B<!T{-)zPwtY4
zgX=LW4pRcwl@tlZY+7h;KsZSUNvqF0DwJWBaFyRgfcyB{f{Gu(gI(91>;a$woAj66
zI|SgSy@iz&-10&=Bq$1}lcwREbTD#>Sv|Y28htyexFmQ5TK|R_Bp6Ph%A!~N3BV}A
z6_+uCwaJWk(9^z>Vuvy&YSYon;#<caWW$b#ur-M~%uGg>+pxL&A?Csstck?ZqWZ}$
z+XZ$7pcuExWa_|(RWFwYpED@FfFvf{r9^&TC!?}XGpa|%f%!FB)_$MS^(!o4#yQcF
zzFKJlinXP1?vlsfvOP1o5gBDf0osfD9KHg8*xOopuZ$=5QfKScfSdanGV0QIcm&BN
zQH8{kYG?9uYc2rtK7`kl{KE&KBa^5Ga`&6<30m?nQZ1Sf9jn>N=f-0f5`wpS%QA3=
zLRT%`R^ORK-?0_J|2XSok@{)mQ0+FHgJjh;gvt%(o~Ropdf$m_lb72x51ty-Qdu+y
zjqlf<m8Bne{*qY*oJXNzo^h;dm5Maii?1@NwcH%?J&I_^=v~HL4l%NDq!{%l+>I^^
zShO;3pPR7ORRt#jWC9g9eoe+=dY1LcoIAcdZCUB9A~_+DYU*8CoXnj-ZIesCcZ$vg
z@T{Nu-(2~jk1UlB6Wi;!I|aCTO6kM<YEX!6e^@&jp2z#&ZxjW6BYXUxE1(1)0x)LR
z95Ts@R*i~N+^QJ<EN^B0++T^-O`bofkwpEYEZZvNT2z6y4a_DME86*?H1E@?;-6<N
zC&RedVu_#lpmt44eb^92@HZGn>&=s#;X1BkGW_#*qoWuDE-!pf%VW{9lb<cYjaj{x
zyC@E^0#k~(tl6%fIfnZs?C8wn{txm7oa*Eh?Omw~RcR^uPd=a9dNKvoN&jIw04fTF
z@OEY%VT9Xkt<x0fzui+5e%MRpJ{xOSyBKVO?l^6oD@RGpK_6zrc_SNPEG_dxeb%2%
zrwpzYq~2$AF+EWMxmgp8Cfb-aR)Fjo#LivL1MAgguv4Wd|J}li9G5#8B^_92r16%R
z=lSDzN`JK_%<@q-cuYG&AJbfcCRbN5{6y;;&3&XNkJ0o)5vD>V+OTJ$z}*9_)lW}@
z-zE}#(H7ITKujJtAh*9x`MsmuS+3e4e}j()l^7UWV{FjjdVz%kdkHf>^}H)@0Y3Bq
zp2+V^xIk5zKn}JG!9Q;c;*}8UwYcNY;*S9@$20SKJ*`qhQ-`r99yHi^id0qB3W?*r
z0t(}N$R}t6%1k4lbJXHVe}q3K+6;h9w@R@614QAimQI`j@vMXxMHg^vP^efv>b0C6
z`-2{UroTb|lj3eR$);`vu_2&RrCBuSC54hanecGSZ4io~;%XbGki7&)DB{E8r;+R`
zIWl_FW2+cqFFbn>d&s5%nZbuLO7bpG@uXz87wQ+EB^NxpJ&7am#{qYkdQV;GAHp}%
z=V;sl|7qQVfarpX0^I0|3S-EMvSOIeNEN+`2e5oX5D4#W&^aN^#!Pt=7NlMSiT-0)
zJ6axFC)Ej*55orQL5DLq_4#mHkj*#HN-{~Qhfo?sTbG|FUP^vvm(xGX@LV}j4BFj5
z<z+V+e#?J{M~96-&r5RFEKiCz=Da@CprF|RGa+yhS8x{2r8>)8qN**X(J<jM^oB+Z
z@S!-=)df0Gfcrs&rfob`Y&sF#Nqj;>*H~GZ^Dlmv$)td@gsYIlG+Dlan`xB&D0mA|
z%pC}T1vk-a7{OAo7eLwgg+D=Kw_@UrIB;mT4Qyy#XnTUCABsAd<0wJGt=%Lw?Z{Qa
zm!@Cq{x}JM><~G;hj+<VofOc>^1n>!2lUfDQ5~*lUhFuDAT_Nqo_jJ^JIwHL#3{1L
zjDYXuXdZRnXAombhS^?HD>pbv)7S2*t36G4$~W*JHmV__dg5D|Ptm_^UHRMB;fe5r
z%h$`HVLna(-s=J=&A949FQOZ?*#E8BX9!-P%LMV0)==pOsazWQULC27(#o7hem#ep
zHhwi=$DV7)U4K9S`;XDx>3<^Ky>u&WJjT+BdWhpV3vhG7Z5w+b!I}1jNTVN(CK;Lu
z@RBp5$A)V&=+kPY1A9coV+1)=qP4u64-fJI%bF^C;?c5XM65zf)s6N(zPX{Xl|kpV
z`_!!nfW>P%T>iZmEjbijQjOmCiMqZ0!$15-bj^>VwA%FJ14bVIJOLjmSwGO71z|bg
z)wx(AZJr!I$?dwG69rc+FUYSup)3QqEUqyGbC$KB0Pb)-rF%HfwKwx}O|ee6&dgrU
z)c&dQ&Ktt*zqt6$HmVJPM;zcJ-2#20YGD)8oi8@j@}09wi0S2(cXdJ5RX2dVC<AqL
z&+MPprlVp9s3^0ve`@lKIvJxFkR5shvtu>uv(j~0B>AJo_qPC5Q9XaTCb-7+e64)B
ze|a?dF5NNcCrkk<)0|qbQ%vB=0#*}s*;v!xAnp#TAKb>9feFH|P3;-YnbTZUBS@K<
z`oW|Kpl^uRg{Fv)zDVSgqC}gS0z<&mx*09~q$WdPWA{hAam}e}{=3LQ{Esy*N!p3m
ze&z?p->+WU41{lV2LoNfPq@B__U*y0hn)^4fo9zlWYGt&_QL3UCVAzO3fjN23oqvq
z3oZTw!|nu20X@zvdg?uUeuCa}aBN6yD)b&EW2AlsZ*kyJ6s7fT`wo@ms<Vv=6Ah&U
zQ?agFF8t|Td@%=m|9gu383%#V<x$TrXp|7dEvqNZH;u1<EFhA_t|RKgm3u_eKTL@^
zZWB=`e8f&^do%(nQL5U#X1%(HX$vqOIVWt=0}kb2964Wp)jMEK%awhQ>CPW>YER~#
zd!hG~@7kp^7Q{0BnUt223hYJr6|mi&WLbYC6v}`*Z4<~ClyH<Zt4p5fYibmk=^XN~
z>WvTo{c3mp0fVpNr}Lf`$z$Eukmy(WJvp6jlusXs(bm6ci6kB9^a&@46r}YzrD?a&
zMq}ROlAd65VNQJNe}9fj$9ROfOBD6A^vyZbvx2`?WiaBMv#XPNL1j@Y28u*sZ3eTP
zw?R?8Kj;uL7BBWD#%@1Z4xOLz;s)6^P;!LMQ78q9|M5=lbQ^i5nV(tkZ+3q18ci4Y
zy}d3QZIPBXz7HdQOv4c)KIfvH?eS4;1`yA7U``{Ua47T8bU7hR#hQ{YjrKN`Sfky?
z?i3_rPXG#?!EqyB?*l&@0o%#4l9_>*8&Rsuw^#<Ek3q-hP7d22W)#qYTSyG#r~&$n
zknr~yKXEsS@RcYu`LApXo&hS3fQ>OAzu@-Aiv5rp$ZN05Ql)W--2^}f;{ME=&Ef_o
z4+HkYx6_D!aMFM*LOBoG<>3a1<{5n7({>38ze0=>zVEA_hwi#aY0;{<;1{?apf${y
zMT@ubChtA?0oJim-CanY<OX$&T}N;oAaU(Lvo^6%xj5>@z5@ViIJI2Nx8IKh1(pOf
zIMwr4JMIKkTc8Yi?l*1eU%Ck*HTVQlY6L(mWM2xDE=IO&&@u?`YX4-4+734}r8tS!
zjeS(agHR)xS>ht!3pQKJZJ;rLWECAA_rSdCS6b&m@WCCIpQTV@ehZ($4~$f_fz(O-
z=hDTuxFbFaDO_@RSE2B9Se|AIT1><I(J^2^XgNYR^ohJ4@jre078SY}SOV}*<R2H}
zZ63W__WrOwgz{DBqGBv>BhIS#@9E;VNanpDR$4%XR<`X<p#_{9>eAAC!p*D^pnoX=
zHkf^*dh<IA>JvQ*KqkbE34-k_=eR-_Sj0Ok@%HN~$|JdF02J2kHfa(vxq&F@1wG^u
zjeL?P7#jb67O6hkzo0TlV?_Z=8}%JPo1>P>xu2(cT-T;^(6@1YqjpdpiTow->#qjv
zS_sEj50#nREAa5AK|gA)$6CC*g5b722l!pHaE^-5H1PKc0EojvVu&wOd>|dDxOo_)
z`;X9t3b|iyYSs8Tpp}+!57r^RU*ET?{kh2Oe<+Dym_Jk#R`+o=1j-@r5FRC_Sbkuw
z_!2?JLnlD-M_jqvRRRU>40m1bimYVL@T$lHOYtHmnfth#y=2!JwK9O1r_+**MBS__
zq8wT$!BMjEX9WK7sjzmSqMGrz-0eY?4FN4oP%u$SG@>c#nMLx`eF1}}S(|qaZZkSi
zvdArfBEJT7D91e<0##b$jDdT26=?ArmJqGFDa=d+Uv_xB0nK-rsF#PquqZLW%06(>
zrBjD&azgz9(C8yKr&*3)_|8@gS)`9|-u?O*im56hC@}P`sue!+7fGL8k~Oma>6GFS
zpePWm?t_x8C`jS?SiR80w28JP3y#%pP4TBsms!5+AAM94W8$vC%}zIMJa6`bu4i=q
z%l}Y-|K6?t1T7Ita4o5RwbqM6XC7*m|L4E_d%gevpa1{n`DL~foPy+K2T_H#1^Hhp
zxP8z1sC;T}Uj!R12o;2E=g<Bvi~ryEqJr+v(HEcu#G~G}2^yN^pieylWqEaBT_b={
ze&mDJ9vDd+^S)I<{U0y%$6F*4#+!@g8ylm`CDON-WH97?ATb$9_u5yO@wwQB_AQiY
zXc=e(D&qe<2mAv>k<E=v)T{mP!T3Q81NG0Mcq5=Dnx{YkVFElxE1|e0U37?n^XCEx
zUF-l?T9ah><~y`(JJoI4S}yMpLn)0OZ5ldlcE9{zHvQiRt&yPRUeE;8=LfA#1vu+_
zrl%E!rA^bIb*41~2!k!jn*+%Uxd7wUeNbY~Kr6yP^vH#}=En)8UODclhZ*2^8u)`N
zRPx($F8hD~g5T6=*XzF5%sqEG9wVeqZ=ytZ%1Dj8hNC3TU(A4J1YL~nXM0KQT&dB8
zHD};IIR%WgUqYVb&BX@wx$-W#GMRi(O`LL7(C{A()4%t{ARGy;7vR}@By8HBF1vj%
z<zo=gUPa)}I|xBT9|E=*KJadhAirhoiLCd}M7}qjkCnP{4%R!-{@8|I*fBxo#cgN)
zpVV`!SP7P@P+Y7&1%1k!spDUs4nSSDdFNpsJmMGx;VKXie<IXKjJ%vKekL`IGQX+v
z50F#(tjXi8S%L~)I~gxjc#t2~m;LX9fTO&PL^AJ!8X<end=66QaY9wjAeE7MFW}7W
z0Y?6UOCi8fWOQDSVOpoeKc;?F5w=lif5in9H$T9!wD?^%k?;?`|F_c`DAh+VH4A_I
z&fH+wo`eMGII<NX>YpC54Yp(xqG3K%C`R#tA~58VcKkl$k|AJmT*R;$QVvUiuWy~v
zr26Fl`jsOD2-qet>of+If|9nN*@#P;<VL3fDp%F}x<DTZ8UWy2<pwn<iwikkfl_@q
z7@x)-Sdlx#+<&_J&;an({?b4>*qhl>p!k3NZYzY;Y|Oj6%+nwh*aFFFEOdJ962`uO
z?!4*-SjJWb$s&(I3=dA|wXGb`)WUGy0r12M!rXS3r+ZkYdZaH!9>c0IK<z#KIb&22
zx+78sFosIoz;385w^(X0^ln4!*9F^Fm&q^vp%Q%~*MDS7phl<kf3$A9wt4vS@+C<k
zYX^1RZqtP=D10h_(xJg}HfB_!B%wq!-sTZ`U~lvF5jWJird!>=LtzW=_z9LA92q*k
zqB6ARv?yNTbJ7CH?CLa<L)IK!v3pOnJ4dS&>j+Y4az=xzkhn|7e@o%3(4Z*zF={Yy
zSK!64Rg}g_MjKQ$VLim%V3sR542t);J$>;&IV(_lbG>_0{bqgZVHuypNiJN)6|fF&
zK>s!ayv$4_Ph0U{T2^Nn6v+~W##dP$u1{NhRJ)V4U(<ZWc^}a}1VnN#u{o=L5M>i)
z_W>5q8kCP}rB5S)PuGm5o*i)_C%)umnfEHY5vErQW+Ca{t6wKgq(``nd+K+_!4e7=
z3h+$pi@RN$K_UXHA7B%(S2ssSH^lzChF*J}@Hdcb>Ev}YtP#RB20*iM40d5;4K1%H
zo8dw2F}qkfk`$Tbtpjtae{U$N4AQFwpj!9}>P`N&ohG)_e|`0<fY9ntp`tR8K~+LW
zDRTh~&2%popk%!Qp5D`{PdJHPhZ7G2Izoi97U*176SStutv;GGY*mC81L2bZSW1BQ
z{XS7(E1<p{p`B2W>JqHGOXVJ)c&NI%ce6}$^8#qSil81y>{W;v=0of=bP~`Tj=BP|
z6L%sX0u6?5Hga0rK#Qvas5E?f{(hf-D***^GVmDgwLg)+EjxAb{HxT?0NXzU^m1Fk
z=eGthx?Nz!pr~xjPN7tX(uW7%kHKm($c+5Wc7cY7yFWTf8IWCeX=<AUO=hi}#~&JQ
ze6;ipX^|k3@vPico3NomBfBX+#PN$re>+Z2V^NJ(c!z^pWrqfsr4?w%(m8TCN#IFd
z{_RnGTc}Xnwg=xVV(z$DIL*&ZpDr5|g49kRh{9wn$~#H0;})P^EF7h_ile-PkU%f!
zoYun7`)KU3GESZ2f<H#rQ7-5%(w5s(0&MJz%LPP;A0zMavz6(CI+Lxf$6Gdi;hx;z
zX~<i>hugHIY+SVoD3UO~K)Hnz#Np~r61~7q{CU}*c_2joFodE;>9wlO?agiktk5s;
zf8O=qhnLW*!mGf0m`<N-Gh`;dRtW3`$<0bx@~W!1yz5?hrRKl`FVG(4x$K^Ly-bES
zaLA@E&T#YTVa1c)C)vF*F47YQ4_HgYFubV$#D&HmQsTIJx?H^?c)ZfWn51tCZmI5A
zFRf@v{U0{7!LFwv#ueTKY}$!sQJ|RSKL!*jn!;mEn#T$9)UO#h*kC2+kq%z;p#ebk
z#u9n~mXL{s`h(mpKK*{23FZU(C*l?Dc%o=iH^mO#h00G-3IxZ7Xb3H#)JA}93D`&o
zN@LUwPl&YE)vizi$S6&i_NtdPIL9)l&D&Y51Ivb;p&x9wwukB+?OI2?hr}H?LQ4XH
z7s?fG8-<(+CHcEhI?#pE0q1pnJ(LbI=V6Fi+A`Y1bfEEu7LCk+DTkMk!DaT>BCY$M
z@z8(>PoAO!CxsG7tQQea*!b~U@#S!Ng9N-lJ33ZoA3)ySsjT&y>`{$^&%?X5c#Jpc
z5~Q0;(*TVZK`Yp%x7h<mx^Ij>7@o;)JaN>>k3dpnaUz&}XXP%E(CUT9cBEF%b;S?X
zXix~h$_90lJ{`&3`>x;ComQQjvakDn1&K1$C@#XXP^P5y@p$*8?u^RjsR$l50eunA
z7xD@4uhPXwy_Cwx7%)!`RY!W$rq1FZq~apnv$jg){D2@V684y@+6dFM=WF+3n=NyO
zP-)c>@@11I-WGhwbZa(77rHrphv#?!#2gMl-B;^7>)(PWvkLkNRp4<pS#fT=e!2$j
z!OUW)PSTvq>AJ1RuJQdDRLnjM?9iZ4Br?K^x0pzRrgdvqF^Szq?rShfU<DZE#E}NP
z*AZMn@4!M~XugXh7|V4j850+O56x8KAiXA3q1icdv>IYu-L$Q{rAj4Zs%u8-7LqwQ
zT9UQc4s*(sUMQVC^LhuJedaVac`2~~o$1%<`$b(%S-wBnZ*a1A_rLP2z0;EJzZ3ua
zP=6(Qc)b_KU^~Kx)Rezs7MQ&oyx*ugzdmyS^eAXq0-)d+eVXU!Q;>A#SB^XUtY99n
zd|W<e5`G<4znEw6OW3R<h}ZT0n5KDvUiTJh^QoM9L#ra1DoS6(%T1+giQd!8n{xLP
z>5)W>s}}AX=a*m33r9a7({?BE4sg)Pl078&agSg1p|qU}POLfy+A6`QBH<fpmKaj>
zC+dm}a*iz+`?z$Kl*nlcG|q3%+IeoahBn>3@O;*{$0{05cAR&28ZX4Ug)2{X8Yf)S
zxaSXqN*kkgUw=zhMjN@-&O^v`kS*_b{9O6lj-Rn1I9}UXPoTC+PjmM81c)a^iq$U-
z0cQL@2e_ofj{buC+GtT4^0X>2F+zV(WxZ4IR)Z0t1hq`lFGvrx^D+*;rA7(dcj=)G
zS$X^Gx9oNmawH*}(P<lLg+$nbTPLdE{2S{B;`iVDJw)<u!(?7@@9(h6w`y=ZUH1hP
z?C#dzDOU4@_Df2<>BE=ie)Z|69n6*jcT!kbzG_s<`%%sUp+Ft`6$Dqw#rB}CGob}Q
zRhDc>2R&oQoYX>AlR?nRlm=|BTEyI158YI*6a?UB#|KPF7<z<$#;y9fBHo8)ChAK}
zi~QZmNoL<vwPoNU3D*v(Jairz%0f!*+x1!(O><rFS-x&hSOp?}=U;jP77+n?)S%3t
z`xdB3k>Oc)5g=6%TIT>To?iAD;P@vR7jYdXu^!Bnke@{L8_&H0+d;Bj!{%A{nl7Iq
zG&G%z%9$)i%9&~}Ls12=^Or@)K6)&@QvgdlR}=Kme%6-bHDAo%T+LJRZ4QA{)#M2j
zM9IxV<Jm{-YCiyHpLB#<S~wIWMO$V9>lmm|1aAO(?7_5f<ehVv$3%#O7fsXbRT7Yz
zN)VMuQ0uQ#E3mx1T9-Jflki%&@TlExpJ>F-vHkL<YJK)T#xA|K9nFdyO@p2HR@BL2
z!+baDs?U*i)!z6svB~2-<6rW&eiClaE<h)tG1||^J3Ftx$wriV@InP~e)4TyT>N7}
zf3Orwq&FJhPJ3H^v~kzF+%uesScwYP!z}H7&0%Q|lC!Go9{m_nZ}5*c-ysOu<|VmT
zI!VjM4=~14U5)KxcM<C6D*NY-&A;hO<VvJ%5Y+Mq4W*zX-^g!hA%F(9+f;w7aa7o7
zFV4ZYiI7-|I*iTv;`_H1!>Z8y=^)lV=Y&&?^c6S+trAOq{bPKuHa{S+4yY-3R8+^?
z7^^~*YyW1?4yoqX2f(-J-#`6jL|6Ef23hZ*sUi<-L#6psZC~V%=srgTzl!_@BvC`x
z+BH$ie`GJbaVGN9ZH0UjKQ$pfox==g<iExH%r|+zF5Y~rv_bEqh_Ae^&WeWRKk%sK
znQ4MdCY%NQ0av<3_MQxF5#M>4;)YpnDSltP1P3OQ1l(S*>bRv=GBB+E=pFBle;RMp
zXr|J6#K){t&QC9LS>ve{k7nANs@oYFCBZs|i@(dt=yaDeYRLNwy5K3W{P`PUFng_*
zajvdxEtB^!I7T9XnEAMJ%;j&IoIqpK#oX>O_v`OZ&O;evv$2H5z<j}Z8H)m;d2UaY
z+q4K7BUyVw=1t`MX{5eqC{kbxfOCrs2>0pZ)>q<6?SHru9D18QQ_jRde!3_#oMIt!
zQ_QRF4klLbRjbWATr~aA%x~Vii!al1<*tcBU2S48DbEqdFgI6O9I2xa_PTX~n?bQd
z;7_{<6XsB^#8G}7SCr6F=rxcdZJg}P&B)`FGYE7=qGA#yDqKC6r%EuRaUZPp(fKSO
z0<D7j=6kPU_uIm~ov&Abw;I`vJT8QewI#g@mn4PdbM-k+@MPao_<JNjskHdAZju&j
zIXR0%eNx*!ureZ!*XZ$M21}7WxL1rzvRLA3DZm8~VqdW1h+;3dGJxNzRYVd10JeSI
zO$L>DC#U4k1nJkt0Bvq{;*alzc*~+l+*>5-mZ)W_+LX7q<tD!3&yMNr!77Q)lE|h3
z@6||N+dV#Y|0%JBwjc(EmFh95JrD-NJ+yakpo^9kU*6aDL!=C^&f0(mncLypUYCp3
z;!8)c{D|}GcA9Z@)I?icIMxJQXQbn^FqpC#39?Bv&fetxl2(5i!`RHdvpuVZmd@BL
z?dL*s{v$+0Cm5}PINpt9Lb3J^Yx0S;{e68$^Q7f_H4z?c<L%J)DIj$<*x3JKH~)Ga
zW!*x|v;`)>t`<9AY=T*stv_U%9_r5<9<EBkOv)<vk$lfRxU;d%$=~ZrLK8Z!xQ`Gd
z-v2$WQZ0id*a46#&f6UG_jXxOsqYTnbltD{3N>uv?U$0&>Yh93O3O2N;wK?056?q=
zc@MbOt|LBw9eNW&e%@zzT+#7+Zr8`%^iBg?C2h;6-)1xy+BbpBVzkBHGK&fmacMo1
zwww;vIx7)4ZPBUd6EvC<ZU=AUd(HYfp3k{m&>zdw@I+P^aQPRZ6b*r2VN9)Kh{lsh
z!WKJSZFOa4FPOhIDzgerSkRao)2tZlYQ%7t9_tYy>s~aUhBx<^RW2=^!|-P0Gsj)D
zBFVTn>NlE}C+Wh^dqBt$c3)E#^^tD`IVEFxkfM6G(g}jh8z^_eUyrLa{Nn#Z&e>F{
z*-b?uUUSV29?sFDOnskmPoDyJ5M)(gwzLbh_uZ5(WZMShx8i#$V(_o@n77tL<lt<@
z*7*6C{q!uhY;i=2P3!TZPRhafMcW_bnhNJV+7vURsk$YKP~PCT_3^SAna@w;a5)Kc
z)&Ks)kX>7(!eCOrRi~G<w_8O+#K1+rl;0A)8A+-|$wePuV_~N7ZuyP}C<afI!CD2r
zouG^EikUf9x6N0L!VeW&qxy!NGcINsas;*|D%51q_gd|K*2-I8JlerFJ&oCT9d>Xc
zrxaUetTd6)LozB4U-(K#T10y)C$+;cZi^LRdVeT+a~4D^Ax8ohnbnrGnR`0mn^GFX
zLd_Oi`6@!PCEb%vhQ68!U`ELivQY~WFuT#9_dLvlxj;4YG1NJgxX*o&rG6FP4Shgw
zbCuupU@QAlty0@xsdK}A|NC~zynaRR@+-E4MPvT*k@PYJ5#-BXw-+NOWNU?s0<}jm
zoF8FEC~;g>rXLkCz}BWrw7Oe$7XsrgOuE}(JPlZMw!<E~@hY^kGsRciQOft;^S=UT
zesv8g{0YO3(30iPqrU6(BdkK+<~k0i%P+TKSjLzVMJ_KwHm8Xlyhs`2Ncam;e`+C%
z(!H1)Ab|HW!9NXWP;dFt%=@>;6d`&ih{0==4{82WfU~`zI1yb#VeE&>^<%S|x?Wot
zq6v^zzg=cNDXD+So^qr}m;0U_<9&UQvFp1>>g&{Zgz?=2^ejqo)NJ&Cz4?ba4Bb=+
znV8bBZoEf(zsI%|X#0f2)@bw3&?IYCML)G3y$^FbNSUjRYve;7k9<L!pd?Z+;o=tn
zRO8LR<+yv1Dh_lPIqo(TG=MxZj_r@7*wDJ?P_PNkp`#-Lj(=icUcNy`FGH$>%8Qg~
z{0@cj(W5k~w{ZM0Q9MP_>34_+yfKTI%pZRQ^*znSz4G4moSy+Dnsr)BR3Fyh)H$JK
z4Y|6;f~3E&8EF+&!3ituVQ>alg|c}|k6`1*(>Vyepz{c<hKW4D^H(8L+d<zY`oZ_O
zG5##H_@`WA{En8-s@AJ3E@vujROwGH(k)*uL-y-qJiBW5s2|VQtv}Mtz}ceMNibj;
zH>kC>n}xgIi+7tsJ#K)fEYv^d{P>FPcj3j;?3YGvBQxVPP9eN-r)*#aF&EIywRGY(
z6TBG+&g7&JTA0dIEo&hjDTx2A!BWZF?vQfV{cJU9U3T3FkQDBVGVgVWNwOk1->PKw
zbnbYZObz)6R0=)JoZNCi>*?Qums6yq5R(kwSnvByW=;0as^GVe#Qfn^Bm4%a(&?QO
zpWTzVhwml@4Yc!D^q)4@M@7*Y<9Y;XXj(SQe0qpA-mxFV?PWr0?_`iolEEt4`(p0k
zvJ`(!qZ}`>acym5MkIaHhxQ3Mp=Kg`iX9lD_|iXG5p|U|p6kl2ZcOD=v+r})|D&_`
z<uH&9y8l#QwzF7B5oofYz5u*sxBXDMxB?eFU2&?eXy3zcf4I;DNCAWvaNj9dhhDWG
zy%fQI%SJ2L62j&2yHLzHYyb9bD-(QoV0@$$ndXF*9bXJy?L~dhhuG{#weMYG$?1sW
z6RY>7J}Nq|qlajU#n8Kb^K$#<Jux^{?Cu@Qlt3JNCIaSFMaS8q0k5N0XN~56og!FD
zmtUR1xw#3rBt`5>N95l4O35QN$%K6EoRzEZ8JFLKidVNiE2sIU#fyrA#zj&b_D5C<
zir={n+Bec?WaCPL3D0_1&$OJ>G=o2Vx(cCradSIQ(WZ#DyG$pg9?ha<vUO*>Ts}1F
zSnoL2?<)PstuWcMxe;0lBf9zZAME-b&LU$$$a~~{i)0%jkmM*aTX`9IgI7?Q=;(Oq
zIY!aVbU>>}3@+_<ZVzTwt90KLk#8J3e`ic`iI1Q2dSSRmRQcmwORIHGrNev^LlWKc
z^KRU*oG7REU!EN)-%C_}x)nM)p8_&yuJLTZVMpR1WoU(qXKJ~CxRu<wMxtUOoAQ-!
zC=N?noxYaEs~Sg2y9tr9#8|ab(BW$c-F}~oDA?+%)a1Eo5I*_rZ8&<<Mb#=cvW=f|
z2=bnwMR3puCuc@kV~<!y8<QF*W1d#ta8()}iJxsz*!pNTJJ2SqDC{hs3VmT9@M`E5
zTjSB&bbnsUS&;q|q22>q68{O70wu6v&OX+dy_zn)nl91p;wSuZRlMUA9@kTl3m3Y7
z8>l}{+|z&pH4;pvdOIGp6u#G~-6NM(MgAPYH)o`-Lz4CLTweMt#P>8rT_yR4+Wqa*
z#%()?O~a!Ctzw3B#m&Bo`AA$Ie-Gm}>XhKPnEN41D1ZAxT6L__66V$NSj{<N=uTHl
zBx>pXI{1XZ@Rr&hab-yU1=RhJGalPX^4UmI-HVJM%||rvMLKL5{tY0_=2d7d;w49Z
zC2hS@Z#8JLExMx)dJ@*8)?M#alJ(mysj?#OeXix{0`^I3d>K=6)miaLjh-iKA_Sk;
zv5ih;n3;(B(8g^VwiC^_(|xbc=778>X8+lB9g<GX(l3M@fvoBB!x&(l9m77}QqTG;
zRVuH_lj-D#*0&{M>#yCfHo+F>-z3)W>U7xi-spj1h$2X8wx<A~M<{(4(W3l(05O^=
z$h6es7m^p1l6^R^tRjx8nyk}9kVMzdRXlh*ZCJ*z6}o3d@|&f353_`hrL%1<#A3&!
zb$chXW_xh%*?G0Qo5i{bxm8_+Mh}O?CFk2%csX&zf9;@!B}xPs9+$Jvhs2FzEXM1k
z2D=uYSGr6<L#LJknC$)%@u^#S+J)xL?O}+EQ>eCMW$$ta`$mhNYDQ@+^E-I1NpcLH
zm+gFaCBY9*`5;oH$%{SqY2xVHmEP!?T_adzd6i=D8;lrkwl5(@<S<^R@-l2{#^ke_
zEvm+DA|7{%VLWTlX&W=Mi85yn_mJ_9iAdRC>{<LK_{Mz~E3OKEB2xnP`CIq4Wpnq}
z#z7sQM;8?4&Nr#eXQ?UV&jcELFKT=Xi_||O#%{nfD@%k;(&no|RVDa?B}~Pz`J{38
z2zqj<_{qwi(<A3#8q%4yqrC!?_=jqRX-JdrwvACL+Vd@$Ttd0$dkad#(}$)w@#WB0
z-KGDm!WMDuV2*|Jk)CQD48V?rG^pW+fM_#Xz2+qtPV~iu^}V75$}X7FenX>^^QMDZ
zM(KvNA^v-r$nTu@xY!BQt*REDXbV=wvF;4_B65hLw`dj2W5ef|?pD)u9$S&gKOv<W
z*Yd7qwI_Jh1>#@X>2DfP7ue-!0w%#HcZwbfiHSntp|tvZ#(cZAAoeTrsc<D8L8uXU
zQTF(Ilg{V!^*J%c{BZ5d1q{jB3iZhY<4T1<tcu|;{+`TR^e+x{;kE}02J0kHLV71f
zz=pB@Z2}DX1AsyBx8)foKHsUjfY@mB8h`2ckaG(cPvXEZAGO}?6>6sr0);PS0ik{$
zz0st%>A@y>_V@7*$C=}i6jwkYH!88jnZ3kcfM{DZ*nOj=%c_z$P<%!$?m+G=UOX6j
z@>4$S#}=uaV8zb}jTeWK`YGr4yLSPv|5u{o+0(-6C|o##8CDQFJS=~{!Y~^1GIRuL
ziYU}>S5xBDi%!U22{e^@t+IP}kSKJXGIGCGAjEB=;haAkbUzRM9Cw1UvIO@2Rc@Xj
znHfJd!<9TAj^1e>C<g10%`lD3<!Uchgy^>&;_^S6&2HZazEdH;#?!gGs5|yPc8=+T
zyC+`Z70B1xz93)g+z1y?Lyt!~*e~SGIEvfN_UKjr(52E%lJ8kkekdnu*y39bM$=(X
zoeg#66hp(IzCYimSACULX3fp4^digk__{f9G*^3g1?cuo`lhEhlid>4&0s?A4O)=d
zXLQ;<X5W2gYs|J@dKA29aficti);K7?~CIj$?Ijw^s6wkJNf;mJ}x-tLny7!cvaDb
z)j`c{j5%_1Idro{|L7rM7=b(e#Osbig*GR(;;%hHPI4;C@sYV=tE}GU3MnsK+umR=
z?$b-|16RSzHty^v19?RQ!GBB#e^&E=9T7na`<Ii^SK5#xi}E*aTK|)+srOv32vBpQ
zm7axf^q|;GG031hEv3H(?Xpz|S+w>Y$-n;i5sZ~2u;&^Yw;W9r2;vsqa82~v$izW{
zjFmK}+mOvc=5wUw%RjK3l=GrEOvOL0+zHe8*UeqYy}Np%hf6F~X$y1e1;<uQk?&^u
zV&$n*!nDnEe>WfIaOcno>t>X<rcxlw#aW>6S~o0-Y{wvva}3~GHEafV8z@`vNpwtF
z9X$fepSL>e>VUb#&m~I_?+c}k2&6cLCd4J3ke%pZ=!t@zNO>eCHH_8lCAZmVu8Abg
z^Rqj?CjIylr^fXW2%fqdZTudX;$Nnynb~*wZxI$IEk}_Q-9p%?%NJdOO|d+h+PUBM
z6G~^tQHJ;p|Jd-#KI@q*$}uR7(llxqcQ3n1n?FnY7~Nx<u=(LzG<*CFpxe3U0%Evt
z&P#7ZcB+acqHQV-ew=?Q77*BNnpXhNvQvA2v4VWq*5G09pKr`tL?LO$+7sZ2ftW-E
zK)!8=3KdkX$ryLSJpPivV%s=hSeu_H1Uc*ACs5BOk;l6cN#GBl)HLn5miF<yy4w4D
z2|BngMd9CnA8j5>GX4fDhNr{c1#o2PN2PL`S$PxbIKS#Qwf5T7oBtrfa1?{tYi7Rv
zI0;RYvj};Obxo%kPAtnfYQ-?2_iH~fUeH@M`Gtc%k0FT~k;y~9*B{lZSSO25Oi~9=
zngyD!k4keB!Vx(fBrH|F2ZFwpIT6wAPO6@%``)Q)$iDQlN`$$OVfIq78cge+BVA9a
z2TZbMo(U5k<j2B3ADHgc3DtF%KlGTt_?5J2diZVe2A7YQa%xC*M?2^#omlWh_0ak9
zn^%I<f|~)ZhUSKfb<oskjd_`Yq!+0cI>E}XRf(&yQXf|jkIl+*I-JHsR~IUwVfQGV
z5U6&$nQ6Y7@iv#d=eeWV$>%l6Ad`a|a+FFYaNi>BUET+)C_V8b!`l;E0AVwY$6BJd
zC$`_uFXWO21WEC5+;LX7KP(i>)NNLI_FvRApXU803JPxfyV-obS(iH3r5GFB`X&Ag
z2kj$mo1SMTO-!uq@dC?<XMszs<7fdEs{VHgJMFK5RmP0uy6})f4#gP*dsmU1WM}Y#
zt`iUjFkTl&|4nq2`vyC31l_{ii#7(vE;92k)?p0$4*JiVg`_x&jstDbf-1glq#XrF
zyc2xepT$7I&Kq*s7fk;*cN1?E1r^z+PPsLmkhKJM5^GU%CbwkG^&Q?xDlEg*0mf1N
zF{;-*Ck^%Z0><uD3MrcQB1F1~*XdWmdNQ&^?iKBiG>PgvMwLWYqrv@p=K7Y!SrP9m
zmA#hkUeSh7#+_tWKVuqrEBsQ^x+uKq_-m>7CyMD&$^C8p@fYLD?9-%0o<_AZ-|uBR
zn1&z-C7{r%eMcZx#ff`qM=`1h+y5o=_ebsJzd#lNHP2J1=6QNpxDEcmg7ygh_wRPA
zZUjN^oz4r$47V^jLcf0gX6H_p%y6cXzWbbOKG|1o9?z4VD@nycAN-ecr1#!ZS(J#b
zJ>ftPY{{14$$b2(&}5Nz9LvH<HR}c4^C%$bbt#Sw3fX6B=qci%FpkW=D>U+zm)&~g
z1j~ZV<n6c5Kd`}`9LZpB=SZ{2zn`VTg?V&8TGbf5e6a)XA&~ZD&5e4_X7NZElo1!8
zEhYJ#xcgfVeIaooFHxa`@5RjLF`D}~6s2|d1;>%%whzl9;6~<Zp>hZBPR+iW<C38k
z=G>e{-;jwacLhCuGu3C9*dh0X!v0q`vCS5AQg4c^#MwEMe;rrXSR^u3%&LBmZm>TT
zy?Q94=(=?s_HAre6Heg^#3fB~Smp8eC6~kMvxBOmBB8-{uX@MG^i45;jXH<0(!gwT
z!oW3Z;WuD`f(6!w`?qB(h2OtF8xM&l_&;S(pB4NC(jQZbOk%bwN$Hn=N|4PkkkmIt
z_{K`X^A}<v`0M8l=)*NsA@sht0v3~R&nkND;cP_2VT3hEVF$G3?3Y@;M_T!|N~)%b
z{MTut;fSoqs>%JN;i=ZvfnbeV(4&VcuvqT*L<(1Q_@S_ENK`&l4W~JPv;Co!>|qb%
zng|>=*oUSk%|5|+J;98fa=f5(@B*CqE#1ZyXlU0mQ9rM;mcQ=j8&a}CvO6CXZG#rD
zKa&aasxga7^;IN(Lc?lp;s3M53_~~TK1E?I0SXHJkoL1vAGNN+QuFu=>>5V25B!S_
zPM!LyhZ%nZnX_$hO)UU<efK-+zSN^4WbQ<6j)tB;Vu4{z2$f%8gwj%gi%U_qt?ed&
zM=upd%46R(;J@3$zR7Om$B0f!WgoE|<{T?!I3BfE^XW=Jz{UV>RIaC8PgN8ccCnZX
zM1nXwxJKiTJ>R}4y+w~L^>iwMz4e1iM}^=}<0+!miiOhBYE;Y#H=GcaSR~FN)LF>g
z?9m^5b+)k@$J<kB)mcj>)ey{q+*O{w>hOA(VS3`NO2EAeG*dlws})k^wrGTM#opBJ
z;z3Wwu^T212-*Jd4Y_rjh^Z3|bkbb5(qwnygy%sMw)zKWR6+4$O7t<>xQ=}(>Z4CF
zsR`+>t#aNzT>fRM-~P$ryeA0liz&?MWb(ljS(Nops{?7BhiumXE_mIz`<}SJo5((x
z*z<Ir(eLUQ<tgprsp=p*PF3;(5~F9>Xe7_gqm>?{RNa|gUv%BB9Npwdf`>z&XPn!{
zq>Ad5*Q`@FV-{T^K6{o^r|Y=5mf*5n%jHJK^~S>q@pU~x!_Rh6<I!70Y9SP(t%`Zt
z>qV*B7qsfy$Ogb*K+VFGsQbM6wws^<Jw&$Ql4_9aK%}^Xydz$u%5o1yV1MJ}Z*zae
zOPQdG!^T~2Zcf_T-bvy|4<gvk`zQ}S{dtt`C@<ybv}mHiqX@^D0V6;Ss;>oVc(qk_
zKJ5Rl|KfsMYMtS(nD@f!ql|p392z-lXFAR^wywVIP{IvH_JId&4*IDa9@-ma_f<$O
z-c;2_$@IOHjUXluHFnO755{)q87G0UnUix23^M`1d&H5+72uyh(q{hCyYFNi^DK*_
z($YV~bj#fKMJG>K=QCJBu&0fKwqPG=HVHPxl&AR~xz@A?B=A2`HEejKva7JH5#lt!
zTPl|)X8Y8tm_G6fuj0~kB=uMPX;f;Ble79f@ooB7ga9Yw^qA%?VE~*@%X&(Uq-4Gv
z5aIQ%SGccfDdefJ!BJ-))`pj<WO_ZF9!N)-kil#Hrt@57WldvRXRL~-XD!^dJ6oix
zvv{LC+S_)p*f_iFq*0i}$Un6;`z$>~#oDzkA<o`bLhTL}1<5k0mfTH_S31$wxGRyw
zo>~kyhb>wh6<gfe#OaYnY7gNNeu6@)Xlmabtt6+wu<NOuN44cWbzosbE}=&Io^6{|
za@mh1JUQx)p9IXNp(FMY?fOdZ6P&zXqP=}Blcznh;z7wO6I^9-{T;3<)6dd5x!AU3
zqno+*8b-XGua;V*ScJYWY#&Lh9^<aR2ba)y>LQ+~Y>|ek9_vie8tJA@KX9{qsoYRP
z9@72d3Ov7vP~rFAarPprM9x;;ANc&ZPu*F3OK!^;3ql}NaJaPkHuId`z&v2N$!|il
z;}PnodumEa2AT?a9xB%mZl|QMfi>+D;V#cdE+MC(yaw_xi+Xhuh6AFBSHq@hZ69PV
zvCSwsyK;-6ChNN9G1fTd;F0*g6io*SxqIXJ8lyUk3&+PO;k9HX*<McnpArS}FO=7a
z!m`PzO}|EvT%DLc{Q;vbxkuEgNy+>K^t_ZUpA%ddE1RDzmhh=FnJ~avacZxX=terK
zJbKmA<7%%LX|5O585@pF)P4=T{W@6i_W$APt%IWc-uH2VrE6)BZct+BZcrKogr#9=
z2|)zuUb>_NB}GC(N~A<-B&C!FDd|SKzGr=X-ZQ`X%{b#fy0FhV&pG#f-B%zfjaue7
z^ap+e6>VnC;qHqTDO|MQu0{hN59mlMWOvOgeY)-o=Ju3kzyD7Ck^m9mz9tEBQZ7K}
zU`Mv8z!m*ItTfxep55TDxqtNinpmEfq{g#vJp@ZE4u{g$0soQOfA4XLSG2GqPPO(q
z;B{VpyfktqBjGzq8ZhSZMuk{q?ei%9Sv2OvclXj^*B4~#O%!E&%Gvuc(DJ_;3iZH$
zMH~75tD)i*LJk+1&4z~`%4|NOSx+{F6!}cqJ&uU=hPb`6#Zm1w34bOMH(JL-u)x=m
zU{I9CN~w56e$>&YIDk=s-V@<Ld>tKyYU<R4b)mMHiyR6b?(v6JoITiQXKrR$>AInu
zW<}=LbHA@a^vlggo?isSnb?h8iBwr?x=_9bCN7EIm55&v^L#NwWvU40lOBnfO{wr!
zo=4T>IjE1*j02Lx(vAwc(BdNOabXF%)eKPnw0p|r&7xn06+@BX1P^N%MegST=XZBI
zwZ+_USgJ49223Nb=MG{74s9#X;pl1}=>~vN>bS0|sx9f^yWVmUrm-Kjm-FQjcv*I?
z>^20Azo|MC$me(E%9kGCqYqtAnAnrpX%=E~%=-7Ry<oo<DK;!;KgQwt(f0@Cv(r?;
z^aipppD27^@sBlIFFDJASu(dro$S>ju}*rNDE9wq)&JjfZtaju^^~f^q3DZ@x_I>G
z#XynN&6BAJJiGTh<ofwsEa;MY+I9LcZ|MftdyaifVi>VhMb5d1c@ujgbxP90ogmu!
zSEBHI_pN+oedxByCrCjEGi@@_C=RP4hqd7Sf+fF|1oP)RPohL1MUdUXz7IFJcjDa7
z|5)Dq(Zrs)m|xYV?;*aX5<UVVeeI<lk1naLyKU0kinHt$NdpSAP1?X9vbNQ+p+=q5
zJw;`J!pN)|X(Tm29~BXMPSW}!6+gaw4b{%maK>N`0a<c};IRAb;K2q4VlaTYbC$|}
z!-#eQ=x%cO?vBnjy)veaUA7&)El-8=t4i<*g}Je^(LGd-W>iV!cp1C-CS6zIrx{QQ
z(hQTIKg>l<X48&5I^TOm4_J%*#Ay}{+8f6W>uzHCB6t~ZVwq(*6+91^$7@YCzHIEG
zh>-F!8xwBu_@6JVFtDtDzAXCP@vrZG(kM(gA!+z;qYI)IAZy)l?!RUs`p%y}M|rma
z#nH;8K2~{@lOEkq93)-A3jB%oLQ@RY8&Zj+h=^J(t{V6`!k+yu@H!lg#r8FeScPva
z6Fc6cw)W-Y%HY5zIblrAud0Mos(e5ja`U*-@3BdZd&eTjVnZfNh0SQiH?_<GrPAg<
zyia?5_%>gCNT<bO!FLi;A9W&o$@=krKaHDAn&lA&vD9%ii)~!crwT`P_YUPeeqbU|
zE311Od_56t$3qbN4lbaCpN<ktn)~+nXr#|=(tIPvrQoAY4T~VEG+rB84eLT5O}kH4
zhO55(?tDI>dYx55=ynA$zGOzJxw7n}<8RXRTI5+(FO)RVnFgQYS@#b1LCD!K1AYYH
z`gC#Ujlsa{sjzK~je;byw+ONdwLDCd^9xlN;hr);$FDQNv=Z};mNqK`Qx$3-_)zk*
zxIo9ZrYQHhv5aF+Kz!sx<DO4GcpBwIa*ucjFMHMoKH2*>JjDMW`Zm7a{13j<#n~1G
z{qgn*fx+d!g@B+lbT(@EEKo%xP01izm2PVA*<)!gh4Rh^o{PLS(8Z#B#6_F7fpUm6
zfAWqX2#}MWnJKB*lDH<6b%@75&1M<H!Pr|)Oe%Xi`=UaGV_a;=r&}ncHvfuM4+Y1w
zQEqGNL-v4DdX{|+uj3x%pj9LU+2>734a(iHn-57>0RZ>iK$+!mG7IuOi1W=C*Q0Ly
zo5^}8KPASO!JRgsuPwp8tfL$9y*H|Q2Opo&M<3BkHo?FrJNN`U$e&dUR*x@W0EZAH
z5gs53|78Ce0t~-wz6Tx4Zw_g`?JKgd{0zD%36|t#>cN?PWnQa8R4dZ7dZiOIW3uo2
zH{1B~A=-mFHJPhEqi=$rRG1&b%c?9Ca;vlvpWC#4$m{|p{m>;{Sa=z9)LKdtTm{gK
zY0q2cRNzRb-IpzDTpmwpGhw+A+gow-T4<})PyRJa{QPmni$5lYYg~rkz5pufm?UPg
z*-reC?~uPmO5b#*VXUq%HGM-9f5U0d-(^gt)5?4Q)}oviJ!!O{;(w%_3KnReDis>@
z7uAU=<kGClb~ImFDj!Y;pJwi>>3ZF|CU>QE#A!nu_BbC8d(&js4$JuW#OdQBQat5x
zz|3}dVO=+e+)|^;vTgE_-)$##czFEuMfFJnCwUdhV|AyUg@xZq&cB;pU55%{h#*+I
z-q5FYfO1JVR7<&jdW+wl9|sqh@@{Cby)3BmQh<9+0QnhJtVs~nI|Zr=L38+?X3m!T
zuZ{M{1^5~%d}gdoy5mhu_TBAJb*DrXf0PN`5(AzcRR)7OlsqZZ^x8`c{^_|w{Bwnq
z=hj&%Ye+i{s^U}b)ao%=k6o+xrJQM_+WsVQh4}sNet2o+#^;r*xGx2~u-KNDL(nOd
zJ}=0AVf}ve>oWb`IwDHaZk>d);+(30j)TMhWP1a+RyFbUyyr_h<TKNFmU!}T)t>%Y
z>LAmJ-gp>P)%WRyl-qoB`QxpckKw+TcG<SVvx7}lj+^aB&hJzZJt}3JJ}Auu*3|6^
zr(9NLmHavj)a5jcsySZ@NS}@xL?qrr4klFY&hhb_>G84X>rqX4Gk7%JP^Sv)&eT`N
zP0Hpt&p8Z+2E|jZVOeL|;Jhg3Q?2}QH<kcy-f6tq{<!j!#kEfELxCqgA70HyB2}n7
z(wnL-Hd*ww>LmOItBA*yL3F6~$b<@X!%WU~j$r7`v1hW8dBet^J)A6jv7%~c*YOKO
zF9y0n^W?UeH$%H#3Il6_j*qWPz5iAInYf`1PLEOS4OAOo2WjY3ZIRF<+xVjUJY!zy
z%ZIo+-Rh1~3qB%Xsa^0#6oKtSVKW+O?)as5DT?65yPfe*H`d2Y_Te|x75jnG4SQ7c
z-CKTNrVt8Ops4XTDV=0r;~VbYp8HW|?zdJ;@BZa*=Ce#0Ql0kBm2n#mQ~%r=3b4*7
zYHz3uSi38A(aj_qakFuM3nNhYhf|k?nHRj#J#=Zb@W<krV&h=zXN05sADV=Xw?R9O
zd0muoiCe4;{S8OO$9D9IG}u{d54!HDnLNo5D|9pJ-*is(qEr|^bD!^h8?;Rr$Lyg~
zj+w!_Iv=p3BVci)Y~Mpf!$~~&@!{o<M{-$+F1uW7#c#*G2Tpc|`00jGe#W6{5Bc|n
z&&O}x)lJ{^x)q5Wm-C1ypHrLoeA3(@-_2GXHhVb>=JOpA)E3L0#-&$K67lNS{(lP@
za}3|`{DOaR<z$pAFOdDOzqkwO`{N)Y4&>FS0g|Rlh$3^4{C4eX2kX%@;+rvsZzYD}
z-xcuo-r`cIW3wxkqbV>n)-ur=KIW2r5p{V-<s~lY!DZs(j+rvT^IOu`)idrCpf~PT
zvNXne3~H2$%ki+Jq(SBNHE|9j%-*i>7bdf>kh3V`nPj$S^XTUXo+PR&G`I_l4vh#J
z6x&`1yxN?9hbYG(RL&q@Tye?n|0Vc6qDP)9HaHww#YYkNiKAVe&AhSmY9WX{dLJba
z_twss8uH!DeWhDHkD0>$H0&qK%jWWMyIyiV^#Nknrqh=mHTi>1wkE6w<%|^%UB9ii
zIr#9^%SA-^SDmTt$cjrl<jAqsTD1|5<8$vZXa%9DSO9nB!yzxI;S+jvjbgcl-$j#Z
z)bs0egmTO`>Tfd#kl|7UJ}A~@>m<&HPsSAM@_Z3y&#;S^FBgo})vNtzNj^XvxULkK
zIp&~=as6(+&)pL>0(+Nq8HeTz?MX{Hc}w+`Yx#|4S==GVoew$ZK(%$PW~*BY8BCk^
zrBRQ$=rXY;A=`D15qpLo%90T$k*}+_vdAZ124_*~demaZN)~7E6ET)qvM4du{gv-~
zJXrzT5y^ebP6ifd3g^3qtBO4I;(IZFzja*V#uquuf>G>PEaV$HCnL~e(lGg&^(}a{
zd08kTNr{3zBH%g6yjdCZm-DB^n0<oXpT)|9l53~EZUv}#k)MQjQu~s1N#4CE5p3Cr
zh)uv!h`8osG3eovu;~8kJevLU^xxKTi2|_F470)af2|fA4PEn%UF*Scv~GbW3!34<
zLW3$kMITGiOFM`lQjx#sqX&;a&$MVp;G!POm+wh8>q)Cp{nX#NINd2-^o9+HSII5Z
zrHWGh2bzpaTQyt0CM%dZHuNbs#~4JHG?FmgV-Zy;cFa(SO^H{Hfs?A~^4Q2)U1WZj
z=7=H9pt;~#Sl=MQNuyPz#qT-+jzB%nYs^^9Cb|(sJNp`98d(>eoU+x=r;Jxfqspo$
z&0iI<A)dNm(pzAyc6<}_wI=D*Q}uIe3$`N|D;fWG9AeujYxaI2VPCnnw@=y0JOgvA
zs`4nnZEJ!WN*c{%{Px`_eV;b%?<%)Y>p6TBx>TH=?bU{fFVR@;(G=|YJ8gu(*lp)r
z+p;{y@b3$8(Np!#B(E)oCvMZPX~gObhu)oPaT?4DT}3K4zqxX?HJ;pEOYJuM4LBiS
zBR)YP03u7jEdC(?Hc+caiv@D%lpF<zfC%EvE;ZG(R1o-xPtF@*%-E9>`YyjmN63Q_
zqifbuq)2(jJGb#A6FuBsSDlMzhmLstYM<LW4g4K&^k%E~hKtwpg-;@W@)L3#h%%&H
zlawXwa&`Cgq3q$yc$r0NcGudyaw2$$uB^FpV%G*rQ<PyJV?$&sDR{dY#xkV$2i#NY
z5@X**sZ@EVtl;)j31VWa5q~;?=;)1`V2#qbTPmLKn1g^Hk;F<e+heX@Jt20K4Nt<I
zi5u5e!+FT-kvs%{Ivi+9;ddGy^tn+|;FLcua?Yc`_i2iv#(101g-dnswEy?T$KUEx
z4^o7;Qb+uZ9jY?{4z0@ezIhP020_y$qx-@3eNQ7r(6$X@zt@7STpGsC7b==JH%xw=
z1U20xW<9JnLM4|ri}f23cxJZUOLZ&tBnp^Gc7rp5wgsC!;Pu8}4DCA<(qEsKeE$!I
z`oj#B6UjONnQjAEEX%#Xr876nEMsDm*f|mVnBL(W7xUv$im%d17VPOB3uT9<u~g7x
z>PFteN|Db~*T}FG8WGz+XQ;Wuz}w|))<kiZOK9J#;G@?*(4iMuDkuQmZk0OARdHie
zQDUY~IhCrsEe{nFfL~J(*T9<Ps@(mS2`+h^<U?f+NAp_c4@<OPO|`~)3B2lvGkdDY
zGg!?V<h@ojIbKyonw;Srv-h;_0_th63yUB`=CpBW$MzNTM2Bd2&lZcgJC(AIz0H%O
z_xd%Z%2#jrM){uF85sLGywbJix|Lw8d8ynt!Dr^!<j%VhW;Rewyyg)6nz1j~l$cO#
z5}W21@-X+5_!)hy<BOZ~wHHr)I%rA1gSn5<fD#nuYr$1%4tMvmeqST%+;-Nk#s5dS
zn|8{xCm{&iI&I3bCqsm}EHPfOW66U2Am#A2`>TF0*5$-T0Y<5uRBO{6PdKNSHCpN^
z_0%_8Nj#&5?J|+!>F0Z+^;K#8dtn9}W*Uhs-m3PvpA5sL`&^7y?Q%)k{yeZhcd~h2
zIYTy8J*yLDu*E{DZLb-IVfG&fqKXK0Y!iLb4g7a(Q-(euQ2l&AofG+6DY)}!JrP(?
zr!Od0PdiQ9UMJ4&vgx-!V@dC+lJ$uo{n~=@MEvs$yeLtd?@eR5<YS|hQMkePSjm&C
zBsW;TJS@g#s4F8vzNOtIG>6yN`f0y6jl#R#e?52MB5fU5lYaA9&gAxu;QFmQziOjX
z?CT@0w`s{$8LI4gquh3aU7kL<o>Y(jBBhc2YWJE)r=IA?PCs(V*(yq^*2vqtDG`1n
z$Cf<Qzq$3cTe>_dCCMN@=kIp?-x0m!B#kYpX1DR7Rq_W3%GE?r2l8il0dPhy5-HB=
z=R~A~7w-Ht6WDk%m^WUP{^nGaN^0oFVoF89<KK9<q?hyIsU$_c2}4R#`%`y0q&k*l
zt?3)*Kd*c!l3~cL?-^QE@$a}KIdm|{&7m6Sl(3v;q<<rD{=-W(!sC5z$+7<!F2$+r
zsryPm=eJ%hET_sMYkoz$>4AxAP?^EmLGx8ky7L5udE<qWK%{x}pP(3Gd|^+E-;!4w
zG{Kzr+HHeZw*OpxAyZVj<(5usfEf*8TXrVrAf(C*@j9DXbz7F&BnMhR_UX_BaA?f@
z(xOU!2Jl%m%e0;+Bj(R~*`5Aay$XV7Q|+R?SKgibQjfAek&+spK795q>bRgrv^gj}
zkK3&Kr+em4QA{6$HpgX;udh6(o%Lkm&c1yXb0N!E-ly9sJjt23kl1~#^4DX;i#n<?
zR$SV=;mNx$t#{kIKy}u@R#hIJLw=m<*@=!%xRsEOb-yf%!H=t@9xJ<$k<iG{C|c<>
z?a}!#qpcwXe;4@Y4)E1*voqgyL#VY&FrHV<cn(*w>&#GyCn0t&I8rMtkuHPNtI{??
zj2E&xQ_mHVcJqMjGmM<EA~hgMD!u$$BA0U*PhIif_&|~gAm5S$ipK@uQJ&zV5f@sN
zaizuDaZz48HG1QJN8kVUp>Dgd$J2^?{S3dgK{ZBKp91V5q+Zw_Pju{-B0i;h)HW7V
zpU*}|o&=YD86+#a6Nq>sH%;uL&$2k^7yc$<?h=?3i%}S;m*!miroS4G@{>D*G9^VP
zMrXYo*Sq1Ai>t$8xzt+yN<7g9_1X$SPyDIime^$a+0(dQxpkpdR|%pLpR?@=ao0M<
zi_rZT9yz}H^{He*dtcNs^DB`RJgSFmXMkQOS^xRR>Y&AO1NbJQm?*NJUGaxJaFy)`
zL-XOfE5@`uUv#y(C<Z=KX?@U*<vP=R%@2sebg@3vz?8zPI|+U=`LhMJU3Q+K2LC%K
z`<kM#5^Wz|()&jr34TZA9a&wCaG)bAul?tba;q~q(o_w(X9MBI;zM-NkK64+`Z4UD
zzr22WU3Q8=hIYy#wKhuzm*ZY3`AR9W`6%l&*q?Bw>ZTz&TFwJde6_c+Uy)ZU!>g*+
zW3Aeq%xwbAJh~{%ZAZBk?Qpgu1LvPBc6~K*;o*CZLvi5PP?9p`JC`Twviac)>Uvf{
z9XJZ9kIhk>{V0-rdEF4qmU!V*Q>nWC))H=IGgTD!^&H#SRw^Mmx}41Eb>&x(m2i(D
zDC1rkmlV6ffXX{ZEkqG_K-v0fgXqG;tgFVn%f{Nj2fnrYT}hgZ=C;?WMo|;Zt0LSQ
zVMeGBm-8xFKdoy^98IM=)p5F4K<u9j`BUp@<Pkp`XU;0WRV!c2bEnftfsMs$4xdHK
ziz=%?u=Ge!b>(q<%d`)5D5}^f1Cr4n+R}XE5je66<&8xY1$ug@G5A&(YC0%L_A3dz
z-<dRM>QZGmQl(+Cdh4%@xj1K^+J=$9lSjXmhUp#*1EL5ON6cF{slIYflw^tTL)QD_
z?FijR)dHDhdt16z>mr+*V;2w36AX&E<%p&C<(p2WH`H|cc;uoYxh1yJ?jc*HaguNA
zu$n8Cs<CFjbia)K7(5>I?raBgv2xO_LX>s5?I0(0YxFREP{j(IL@h@k(Bp*(p{Sw?
zmNZ`uU`AA!84lrxVi1CDMpR4l@qip18y7w81u93(rEfqmaz4$|*x>9bzsZzaMC{Mv
zQ3t&s(F9#%tlj?d)aGaz1#id~k?-C|S*fhn>M>><7mrURnXTxn^qh>dI!&sHAojV+
zsulXTW&C3#;=?!;ryCDS<NHlS=o2JO{^5YnP^1Bg2E&j4ir|5WJBrA>Ga%|nog#qV
z8}j|(IZAJ*&tqVJ*D=Qc^&h~F(Sr~PD<YBWAjBZstr{Gl25BEqr)ehY(d)OqSNiu+
zJhW&LvGfR<XAS&Q6tV!t7nENC9!=l2S3lhVAw~JS)L*LqDnewd84evzL?2)`jiF?m
zPAW*<a}fGCjWSZ%ND-x?_Imd3A8IHr+@jsNb9=kv?UVPv$e8UM>}GP6le+$BiHEvq
zo<FQ9%#jqI5R?RQB%X%j@tGLqGy~MIH9Ett5Q#N{%ohpH<G(()9A)29S_%!!X8G^^
z)f~#H4{GV3l^UCSd~EPoXl+M-O)~|Kdyj<)9sKcP-Up#MVpjlovPMOMpj^;0U21yU
zQR4>Gs{!1u@PEG30+?WMJJ5hJU=h(#1HaJEFu%X+RlvC3&-BUYoyRPw&tlQ_LPvR9
zDAv2pwS#C3hD2cHQUyHdqkw|AY4h0DtIaa#oLw4N|5~Gr5iA3bA7@%e(zN%yWL4Um
z|Eg4?om2C{aDoDj1R82clBpb~Ugoo5MFK@5DZLj8K8y7VWUW}E{KA0r^<x}$fbkX3
zr!i2PI2L>uf4qqxHBf&!a*6s)h8rM~VflwZJ{T2reSri%&EuHqDC@g&pa>>4xckE7
z!ylI6MI2S#4R@svL^1N7(AGvG=8!y-G~l*B+tR89CNqye5-DI3rvpbOIn<F8x^=qw
z>wdQHYL+$7o4<czUc@(Up@5WyF*L6hXbSeSg)%k=8Yt&eLYA)9((|Or$M2k`KKcFW
za9Heffgn1)itfft{nIl5!MCGaP=U8YMO8cAeO2A}E_mx);49ySCg<y6^n6f+DCajp
zU~ni}1qfDh2Q05BAo%~>L_uyw*kKFU07Sr9;h8-Ay?snrA5iF%03U?iZXH17?f~rB
zU(hf-n*&g<Cf&a0cvEMAkCEc>8f-jsxYL%`^IRMQ^^lML<ro&{?_b2cWpTEO<t{FL
z;b{<@gY#Ti!LB&V9uFn7eX3^XRu&zC6D`jG9#U@yS1BQQt{+)KNQOkSn3W<UXx$px
z2s3%|@NBZ7LlL|Zz#lrWY}#bk3X%>&rzB9iW$Vc*YrZ?>O`9KXE`iUc#qQq^0rL#R
z6!8zg)mDOUw(CUp)c`6C6}2+@2(rf+H)C2#Dj5lTQBKcvpb_3J(i;?gw>7%Q0>xdh
z^Bz&5LKY}RVJ*q(X5RX!bxF`9xehru4|Sb0PM8MBS-g>)5{$+fnErV;+V=I<XI7kh
zAW-3uSI(!DsT8^wcbmjsDm-Ck_*8{y2*Osn;`{ukD0t{85Ev5_{p`I;@3VelEb7Q1
zuva(DjqTDOx1X=&F=oIKF(%IkY3f9{cy<2#23K@dFUOk*kj`|W?-ySW#)+&0!1`mC
zTaVBt(U000&MbcocHJ&I17W|T?qMvmhVC2?XtkPmo|_g)qfHh`6c1SubO~tLwgXD)
zUT_r1sqPF{3kI}{A0$K2@m7@iMEZ_H{2<t~s|>#77|PjuEw}TQ%adIb;ua3s9uFo#
z^v0Y<(cLZgc8eDSEi6iY-}g4!?>e$2Ch(O4QfA{C1MzfXz95Lu`I0XP6kZy-YC4+N
zeBk{wLMJuYjI~UU3&E+4x-(C5fa~gCq2JpW@bbq}zwnMj|0r<4V9*v{gxsRH`~(gi
zz%3!dF2lGurTFt@=7~>f=ZTFb0}$Szb)=jEYX^~FlNOL)VQANn#-Ija?}VfBI66QU
z4mGe)B|J^^1r)CiU_7!yx9lBPVgAL>>Nlb2q`d#m{cL9HEtKuc88~?6RHGz3Q_n-?
zk*yP|-L}ZNiau+cpn(olhSV$&g8Ybgxp#eeN(@x%mYF~&KVwep#l#i_BuU?}U>Sgt
z9C&Me143SA7J>PM#|S$fT4$~}IFj2z5vYH>6T7pktIgjUxC+YGeSc%}r^XmQoGjj{
z0ws^sA3_aKiq0zF+0fv_rBL+9K6jcMeNaU{Ox-h~-tRs-OR^9;wQPWK3&?Vnf$<_B
zI=_K4@?GL)qqx<&hq?uUwdbJB!+%KcWzYcL=vkcanX~q9T+O&xlCAtPI=f4{e|GKR
za8hSdr=ZNDeJ(aiFe(<QKTChj1C@BYB{(nL0X?JfjSsK5s2@FbQs;p$5R=YbMhO>D
z@58=_$Nnrj)Z>=z8&DIU0}G!UL{|_l2&#jNCMFGBQ*vNvGEQb`jP3J&f&Bz1+0P_f
zh44p^$0LJ!tIwXEkLBI4Sy!J8YyI5?L@7qTZ)XUq4^X3TSBDxcB`DGUBFL{P&stkh
zCUowg1^dVSmf3I`kjj=$St1m6b#PXv562>L3-p!cA;4Ttft{?psT2pC3kDDFbl`NX
z0e=c&kfQYQR(8V76A8#Jy$Sh!6{Ft@r|*~QO0jeU8;rH?Wnk5|noA%(@+U^dln9(K
zuTMk{gB?2J+MTc3lDp{Xk|@3fB+*HXCYTa1Axz3^N7ZkoO!;|9T9y8Q1e9lcd0-YW
z_enbtu<xwF;NL4W2~M{|0WtZ|G9qUqiwLoaU#4i^!-<jc`|GZheGVsNg;k*W6iwV&
z!fRhexkT^*7aG)s1^Z>7j|AI5obLIs!sDddpX<wyKU~A&=buLFN6)yN&jQPr>!3)1
z-&{o_m=wEPR3L4RTH2@TyOt@CM9X9tzv46huyj7jwOr~Q8-lZv9dhkZ{(pDhLv(u_
zb{*MsTST`Z!t`E>c8(-_lJueKpo#5_WD2TR5eVlOz~B+EW)pZ>^7z=ZSw>59AXBsH
z#a=G_3TJN!xY1>yq~tL)Te#N)Z!_~#_H7s_V_Y%SD?s{&EpV_<8<1+9QTiHCF)D~A
zE<iO9tv0aCDX`r2eWaN#B^c<zvJx!&KRzkf(aP2wh}Lo`(?YNy5a<S|SO_+7XB#1A
zR0GWIM5r41wg#jPNsnq&y}{<|4D;bw%}JRwlt{2d>c{xgoRS_Ls%hpQUuOC?vxj35
zs%x^9?cX~WkN4`A$?;!AsqW#LDFMRe?S_+-qbM@i$>+}71e9Abf~9ik2#>#@18f*&
zCQo$fIbGlah`ParLhI-K`jO;g3uOKua^mlbe6Z*J5SdR(8z&&^z928Qa)$1r<9)5M
z-{7(Q<dB{EQ7a;t?ZiMR3#i7(`~~y&i=^)haGrc&{VK>f@KLtfU{RLS^-F9Vk1;M9
zeGGk|!p|yKmYA-bgY%@)Fx4j(tg>Rx0#qa@Kc<{_I(f2u>zNa=^%^$tDVK1vr4H_r
z>?IqA85r47fU~&5Td>?S-kIBg@Xd#Ja2w;g{JgE^b6BulVMQV8o8-HKE2B2y%@0yk
zZSsM%^{o0M#cn}ZJ|gr?Fz|JbTYv~AMTOk~T?&dv+5#D)y0ANVOp4F(rm;jdKt1VI
z2<Ew#=&ZZa=}m$8y>t@!WJxg6OZkx_bxF;leb8yq?PGTZ)r58Lfjps3glt0;##XD|
zR}LK7IvGCCj9Rouf54nk)9-jQ9d2cIYW5zPsZUcBz=+8iF**O$|MJW@*_BZ$Q&i9C
zxdQT5NOanfl@TLCZB>9J4wC-hHS$Rr&hr3p_)FeUmqme#oKW5h<goREoX3@UZfZ*^
z5CW$~z&~uDuNxB<iMjUors_C~vHv6$HKrHwSJcUyY_jzr4wxaHd1-KuHKeTuV2w>t
z#1^uYI*M4DcI(}YU6LWoz@;V+j^{z8KIIo0nz#^6rNOKIcf4l6ItH`vm$)GgeR$yZ
zWOYnY^U37|i1XWBo^lK`D8#=<ImBTMe$aP+KVx?n+;EB?*7O|UwvqzB@RRQ*BEWz)
zT3!u3<74+%$>_rN)CPdrwbvW}T*5~^b;)Rf80IK0WdiKF>-{5Ooz-YR&*EN4Xk_N0
zatMI?xq7R#h)QA(CMz_jRFKlwiJ7iKX{<aG$dRvq&gdh=l>2w@_wV|IDy---1h*BL
zhE`7FD?DBmYOBi;V7rw)3)Ici7lw=bZG-6Hm)CJ;+#tdw<dd^7|HlVZ2GOqse+L|$
z-^N_EsM1m2v^=TxD0QFSfPr0t#m=Nqq=2nGNQp+W4u`nA0{ZM%G7Yh9O9p}VCx52R
zTosTn+nAV3S^1`^Uv3Al{Su^M2Op-Tl{<V?L>rDy9QOdf8$L9pb(IG0V2-xrS@7nW
zWqxq*zx&y3mGLc}G%i6LJhoI2Y$XnqjTdd?XQ-xe94ZMSj4GR8q|cs=K^b@)ysJd3
zb45Wl={cF73hx^0T33+vWL-iLc+P78TS?~1{3KDl3Ql#lm{E>3Y5dgJoS=lJgQ!-k
zf6$*{fexGzQ=Ybt-KNxxe^a%`Ym5K34+?Q?d=6UB35e*_KS2|b`Qz+=PqdJ&8TecT
zna`lFaKG5;uP!mHE9Y$MWYwvh&6VuSalRcnT4>!8cvLdGsC6q*?&53+^B6LQ^HAte
zX<V46kCTOz*v;XJ&ANwIx*jPXc~t?cL`;02P40AFE7vXl**>8G@nfL@nCqXc!588-
z_8{qWl0KAtpYoC;9(X_|pEQlAvzBOyRtv!EK3L@@gx+eP(rk6C2Y>(Tcak~5mO2!^
z`y>wRs4>#2{Jcf5D1@NRNu{-_`^5rpTC~#n6yC+uo2FqHhWCKZGJ===6YV;xQdyAE
zV5LP#Hmkr#Ho<?sOpuNjA^>%CLFE<G?uyoewM%9cuy?A9v#2w_56-uL2BNaq8`wy1
z)9gI66Xr4GXm^}V_8Tg=IB?CMV~P-9=>t);bYKa@#U_`t9c?HY<q{exCeRY>ZdN`I
z)sEHKB0;^xG@0=c)hR%Z;&Ft2h!_5DS;t47eD_*-y0V}5fWn}`=<t?wX%{%EZ=C)}
zH#FhS25tgq)s`v{aC99igIXr{fs!jYh8<1Nsm-JbQFTNIIAzhhw}CwtMU(f=U6(TX
zVzU}j14N<gmYz6ZzhTT1)y;h;B*`5HUD~e#(|Wfo|AkKPpNArNTvgYgnh)#p^zd*3
z<qR?9JK=?cP*|IXFNqP1jpVCKSvajAh$gM^SM%B!?fO1gliTnpo;qYWs{WBh!jeg%
z7rU2rgsJZ2aQ#!uQ}l2VI_m^6=n;Ab{YO~KhLnY59p*5m)T-2}*f?4kgk5JVL?kZL
zH8cw&ETU+W5coxXNJ{D|#HC!Ao^UbikGHY-UhxQ%VwIqsb;|~H9I`qflmwrOi5Kbh
z(3E2{39NKDS#tj90!Ha7sC%Cipa;YR0|9RbFvj0g#Iqdd2l291Hy2x97SrwcKT#U&
z>s6l-E&|TNG3VFlht$i|uTbHRnc$hu$@kVIeQp-*zN5+8;Ecjo!t7|Dp<%MV_Ap1p
zu!q_CdlYPg3i{*}nh7=DbjlpDuL}SM^=@sWJ6Lgfq<|*DNQFM{x5X<KjWUF3P3nT<
z244Kd&B*G*6o|J?AB8{(GEY%%Y4}`}E@SoStwW>!U&NGo>sr&@tdHf|grauhTsQ+j
z4`^0y<H{soMdSzQ2s4gW)=tYEAi#*8=g#}eP`@9XojUlo^dOCtM{F~Sh<?-J-fSMw
zYAI1sNkkqGNZx=Lge7zvi-?iGs-iXdVK@!~_3d&u2gEH#ON(}Gp%^n(kN;iZI;c*`
zN%$uvIM0gNLk<)IFmrpV>9LmaY`pgu+T0u+a5<b6Fl|y8<9l0xUCAN{Iy`8ZA`1py
zJ7=Pt8bhW=0tI&iY~)Dj7y|!3O8OGA*p>SD<k|*jp(un_%CyNxap<h3sCzW@!Z`q8
znQ&5ja#xxp2nZqqr1v?hLfV&+Oau=#zHU8O0rrQa+L9Y=VuCa}7b#xZdE+PSN#BL!
ztbm}BDhl}6kI1}&QbiAXfJw;ZSx0p)LQ$k)L??nw!*|QV5ru2OU8=hR=$T$K-IWEs
zQCrP=1i&VSFuwyU-Ctd{i2I8#jdZYl(ySZCIPj|7y0a#j5X4&zZ+w3-8<>IeZxP%E
z&lgELf<ev&!=8seVdAfb2Zea|q96NoF0w~OcX{8Bc-V~OY48;pUcdVf50ODXVa$s~
z)yvWHIk=f<OIeNsaUDWS@Y%G^_{rC{CDoj3F<laB163LW8Y}`yctfN5T5Q$9tPc}4
zxBER1?V-1mRI8m487qL_^4{Ldt1*KHEk@?CmL^Os!|YjaC!&)LysSw{1-={N`WNTG
zQ~1JR_5D(o<+fTNLZ_wdM>Dt)aMTAJ51#kh4|7mTu}A;`G^voIG;2akP;O4lvg);S
znC`P-1#I6uQ&Xc)3`5c|RxyF<pPw?Sh~S>K$2F}#=~=>Y1K9|hT`e_HX<<53n#H_i
zZh<%Zjkp@U;C6hKIh^u*cXnz_-Bg+;IL}{2CJs~an0(LH4&kB+)dCHc=#Vz$W$S(C
zx`#;&y@&n$d703xet&qO{ke7GWX43N^1a*Oc4Y@0o5AKssB$s1nPh>x;Qt;R5ssDW
z<k%Lf0QRtPMp2DyM4T}SDZ}u51bWnp;z`^XxO4GR(=tK#=Zk1#oSW=xsx&4P+F9-Y
z)@7mUa;c4&7wbh8txU4RdvK^Z^f9bdU>sHo>QQI0@w4(=r0^Ui*Dw7SHk#Ak<(-HR
z4b0gLRPkyviK|ECHYv5OpkNDOy@FAaj1DMut%dmVvu7;!uj%@IGw}X(MW=ELTjI6J
z#AnCh$k=dZsSa&)n+Y&WwyI!e`VEZe$Ob?A{Us>)E_iu1$#la~%;Y^%|5vi=qk1ox
z&b}T1p35;^rUyY2g<pApO_he>Id`HD`}NuGzLYkD4L!(X<UKaZ`&R+U03hZ)w%0Fi
z6xPO$W0h<gfF}ln()*b7jWzLTe1uS3$Xmi@QLb=eY=hUo310!6hOJJf*g9|6hKYr8
z@mDB`eZ@vhS3*|noj4mDj0mCbSB#M2+5*a?vg0z>5mo2`)X(L9BrWW3S(R(;Z*;CN
z`G__rvN(Z9Tg9RQft03f)gJ?UQ5t6kr}A*n&vO8T<+W;nW7-cv9cXywj8D#jxjdtW
zZyxjb4gj$E78ZLE2az1FosZYQxH=q(PK~E@)~z%~886<=H>b=Or4e>au6z-p0(sk<
z4lc4Yx!$@*jo6IN7>lvDjIjwy^O<Eb;L8p^1b5g`LZcrK=*Ml)1Gww0^4xx!(50^c
zP0B#Pj1Dg*1;KNIasZ17R)=KYAhS4=sV#vt=~eK!qvh)jGMy@3Q;jf^ZvAthbNRN?
zaPapRI?qKlWwKZ!=<79ps8$RfM2p}E4-n%+x8x`%+O|{{N+W2#d+^53|2??+;XoNZ
zzLrz;+50JOe~)mRNA*iOXAc9?{+~-a$-If`SB`IuHW5J-<GrEP7IEpx@$^#Tm$ty-
zhQcGtSMgQ~5z2fm3jzQGc1^d*9a?~U_IU>EB8Q>yfmN&cvhSy2SFet(h)e3X7=gTo
zJh{nFKPnj1F3ju}8E%V<!-Dtk*jTvF13Ku=QG`(h4djD*X4kRu8@!~or*+%aqK%x@
zMTA2V7mv%~<>@OR5Ar}H<l$?~=X~CoNvvCmdF_(RxPM;`mBxMkLCGDlcJd?i0lC4t
z&@z;fXW^?Ckcdjeog79tAKydmt6yyS`L)0B9~R#Kx97g#Ca0vz7lHoZr32;^2$Ym=
z;5HVh!TafG`B#Q4s&T_V;}!4?j6zX;#_q&H=-5hG(HM6$TChdf8#VxhPNy2Dw$S&0
z+a3qA=6KSZMu%9#zId@C%72wuXn<oq@*0~#lWTulD_co^8U=-6q-wh_+}<#i?tr(M
zrl-cHJI5h%F`%Yo-$sD0{jPI4iA$Vfl`B~n-{RgQel!o)OO<ud0KtZ$4?=Cv2M8pH
z<JFUmkJ9$@-~avI{FtMun5I5dzVPJd<e0*dy6!IoG9;yTP8uo=<Cb2M7A+hP4>Y-A
zAT}7t^mG+^E#0~X0=&r#O`M9XuwEz)Mj`0bzhcvl)3=2fN7=TZ!cfC;dD|jB9+a!+
z<dvMN#RP%yX`faEncUSkRB8hSp<sVDt22?!Ks97RROh1=gC3bz%(+!q5q6Co5t(E9
zdEZrm_9}|ZzwU`_9YQ+u1uCq*^1b%W0ZHpVRXdW)Ym|8&f#Y}D><8n1U_^NC@p@~8
z?UH6sK_2DRmpkmfK_V|P0@(wl+kB1j5B%I0JTH_CX~`(VV+I)-CY2#WH$QXzty!^(
z?C2WSTx&-jFid9p(;B~OsQmEn8sfv995nk^87ebzLVWQ2K~68H{of1yc~mHFUbp?Z
zZDPq=W9%Z{&VDk45Cd3n4}<7}kniDkH(ck+U-s|%p1tu{?DY)BzZd)PC5>e-m?@h4
zWDP$VeVY5>jn4`LZs!OpVViRkZ{iPJXq!~SCL5K+*b7qbH+43$SbfU&*#dKYR}ZLp
zGDN!a+;yHa`-eLpKyE=Mj&{@YjTHC^!>!a0?%eog>rVh|Jrjp>q^<x9&Ep`7FAUc*
zV0`ek#m8_VOCR1Kdb9}uL><k157`Y~^Pg&(5x^H%z!<#xWMq!g@cTQR_)#AzQDLK`
zWRzcBkrR7d!9p1j;jXe6kyheddRWM#Lh|l6qHppSj-~~d&b+%2feRq1%fig3W+BJ^
zEdqf!RG#|-Z>Gp3?xb)3L`Y23{3`Q)Mv6prZC6YTYmA+?dbG3N2X|g+GZY?5%;szy
zz_Bm8_&OeG_vw(@FvH_xmus|7WtB*xxVX<0^j%$*E~kZL?aS5=0}h)p$lnLLHg8A-
zp7ZoS8vQkn&%h6GG_5ad@~nQPunLZ0KbO0cOP`N{tD~~5-@m!YEXi<2-Yj9vT{h67
z4M1RTJ)Af%3A-t(an2BfR7d6NGT<Iiviyf50GBz18j$1Jq-&+a^=b3TuSyM58h!%F
zZhR$Dg>r+o5S9vvBF;?s0Bsv#cl)w{axa#WD18lD4nW}fiuYx=ODA%pFYCA?N<lVU
z*YEqyM8TfYkx~=wa4ArAT<?jOG*Cuzxt0(^dR11qMTxRI0@~KUQx_9Mn0R^UqDJCh
zIy%|Q_(LN}ch;JyyC8>b-dGB*g_G^R1NhTpp!y$@_5uj8gXAs#j75LTqQdXq3W&xk
zrhwBVQKv(K?ouYRuGJl~aL$T9h@GUs2T<i?C1LycF-0Zmq(((L?g;s#@uSA}t3q9V
z76JiNIL3fb&6D(dbVJP?7*q<{BsQ6a)$Y9LYGk$<jjr?0ZY0(yDCd>rV`eBdT+bK9
zS0x1ud){`}bx#IWsO15tv<EE?01l&HUfvwWszFkih9g~_Huk`234IZxfK)}w4+f$?
z()zEhQU_O(#lr_X01$FiM-M<-!xvLH+xKbp$4Cm}Nw~jR_Bjggy3PF?(7Dg{ti2Zy
zz=65P(!hLk2u8qNjQ0Q}*ZS&Nez2-#mbbb?S{8A`>P=2b!0qXOS}rvr-&GoYbmswe
zRP>_&FeoXb3T$Vr>_71NoJ>lK!S7lsKt%0Vi_40oLn+HIO7v65g+iGj-WozOGf!TY
zwN7CjAz3KjAz~egrWFrq*6oQPzwE+X67X4Tbp+J?Jo!7XOH;6dFn$c+cCyAhHp8NE
z_2L69zVuEq<1J=&Wm^$V8Or84bWLmKWhs@F*TjCmr@+OW?qf0=y>`G5Cg_>k7&)0{
z!GEP4Vk=wkD!jY%GI!=c@*sWdE4WR+)xB+{l2dB6bwzZNxnyN3c6EoIt{-vXGY!bR
z_DV9MYJXC#=)d3nHfcj-pxP`>v>NMNL0U1JUHthcyn7H%RDP6sb~uJ{Mr@307ko?R
zA9{$5hI{Wgzd}+41;$~b2N<vQ3%8x^Q&)*&<feRLS1LgEr(c(jgPt8?nJf@7UVA6w
z9aq;cSwHSHSNBAs{6T=wr#Un$Oj_aJ=|;Q{8$sC%H;AEcR8QT60~&rNhS?g3eADJh
zj2^<yJdnkVgp_oh0bkj|+DWg`i9ep*ubth4c$$a2aXay)*--kAvk~P=>R`yb1awmF
zw+Ct5j}bWob8t>{^iY?q-!rcDIZ6JD=d~ngIN|B8kEmb&oO_8U4XZlPZo>$7Wk#hY
zLKp}VhAq1(lkO_ZC=*5$ykvcNk&%tqaiE*x>p1()Ycp^xUO%?x?wW`s40Xm|Lkq*3
zfshzL^QP`ntr=fP9=zDJB82hdC($~aLJ!InbPK-eq2o@-$1BVhT$js*;gh|qgBLJd
z3=pqTj=zz`Cc&;w327MQDs7Ey?8q?ME%70vD9+V60s}TMGEK%tVIwfSC*i1m<sHs`
z5p4(Fi)RpuC0D2oOIgYD&y)u1j23h__%1@$_;^^xGWeD&%R0`}ZLb0m(M%7l;PlUW
zn0cE@mOol6*qU=OxnV^b3s>PK@Xk97kTAU!pq%M;aH+O>gMq~}l^=`iR;_a&lA9Ow
zXBq>Gz<mfkz7b{fy^+LuMvt}u6daytxYgke_mMVKTd|d7X012!TpJUWd6vr{_=d~L
z=Mxr;=gYJxJHC~4sDt972<^_($*Nr#gYj<{|28FjrIuXwM4HNU%4yAP=&Dc{o`wEE
zf!^u;i@`dv(YWQoFvrXa{;xzndA@^^3et<GK0K9=hLv@#%}F#$;fs@hb*gQkTvJbC
z^4m|u5tJ17YthhfLZfb$R6>)LXvf&i8C#6>UK{I%l6YbeOLUU&mPG$*F8MbJ@Z|tw
zlA*%FI#56fv!i{3Bb05jkGfLw6CkQCy+;Ky`Dg7WWxR6KtUgY3L1&A*e?m&a?84r^
zIYsNHjl3H<3FggvGrT4pDc3is(Ms_7cPNTdWh#6nz;@8H=p?B~@W4U183&><fI;2W
zzDHf&-2K~$U*YcM>E2%3{TGwg*bc8nr_bmMB7XMy+*Wl(8kAE9{*^En|4~!z^JX$q
zVRK0j$tx}0^?kCfBV>oVHcBz4mO?4*yfcHo&WK#rI#R@fb6^W(%{yP6BfV;lF<I$t
zrfLFe8;aiV&o<vCKw*7#@e8;LP=h+N1B=DUpEZo}jPp#j$Q|xks!`7m2By@W%|g%<
zC^ZyZU+vBJ3^R{#e9F<3Z8~w(v*iSewWY<r>B#I`)4RL+Fzd3QFZIe>nXZ~Nx%2H+
zCqx8QQ)U`<O-JT;1t@_qSRT{mSc)G=V$<Y(U2B=Dx+RhKsh46Cti|}J6ywh9(r>^r
zFfQ9prDxx_aCh;yo&68G`2P-hi`iRWX~dJB9NIJ^-jVmSJR+3;Olkm&gst<^mwtKv
zX%3?!;#5F!^AYR^tZLFwMUxo)QC+cBdDop-pZ82=t)Hrf<t}0hBKN~ax(zWC{{%+R
zAdpP*${jp`gwsykNir1}ZTES*Tt>@`_=oWH9Lpb3J;1S$m%et8Y6n1ft_zq>=6)?1
zmM@SETOJKUC@{Sy&zKBqMoijAIG|1Wj+%-z@S*=ejh@43j$MI}(@pom2y!b?kU5ko
z2j`x^q?>i%RRXLdvlXv0QxD6n+VkjwfsRIlz+IUqBE-5QvT<a{;g|=9ULblO5Da#m
zKc|Qa?Bd+RA;7u!H*QA|c@+Q6Lnb1*;jydM%wbfte2nu;34NDxiC4#wFRww4nC}&{
z^gCFqb&r|m1kD6R`+UAac)%N)FlL$$v4bAc2GJWG&20Ue1>#DyP8gn9B3+Uk)Vy8~
zt0N>geuMag?W`E_L>>`{Aw5U|pSm;?!oTOv$_1MtIvCh~8YW<KN1p<9jQ0qEW89a4
z{<9W1#hq~XyK+jh1KxM~bio(Z_$>L(%U>4(*cE}taCbXtZ=^8lyHYg6-Cx@}&J{?K
zsKvwrWPK}$-I#to78nnE4Qq#8#o+5}LfeKrqLl&`--|BCgmzV5Suv49J>y-Y)M$~y
zkq7qiutR8}s>nd<HHzPdA&PkZZpb>(YN5t8lz`=@(gi}=2F5eCgf+2dCM_Sm5w?cI
zpo~O1BdV4XlbBHksZ#f+wJ*<5*vgeB`VK<(Qx1~z24Z5Tp~@ru`Fno%(K~hpBze6{
zTgd%fO-P;V-}Zn)Mu<DR8{RV7-ruudWk~}9;-6%r2xtW7{XdxbuakLV2%sxqGAA_t
zxI`i)dn!66JuPjvs65lpJDP2Gt8!r#`1lh_Y_@R=z@ut8%Wsl25*h<-`^sj{IbpL!
zf}q>NUnFl!ZhV+O)*c2^i@Y!Gs$S$_pS{0i48jotEtYb+f7j-<`{nbx*eMxU&)l70
zGo=l@&q*ZwDIfiQgfs)wLRO5}qvjU!#Pl05oq&c)lO*S-l#J+_{zLs#?qHx{<dx)8
zFH#dBO%?T`>dz<2F)8A42uRG%am`}dA}?i*GlXE%v+wbA-UY>rg_$q8Sx^(do3*26
zmDXx9$tETU9+f|`;Gs788M>~JtfovDuPrSMrCA6YRxYNiYuUG$=30?73t#Sj4GU9<
z89~0nW019d*zVK8wbl%06nF@mJ_u+FaRAdXqDV&p1!Q#_ZS3}4pF8jHCswk-9A)AY
zHFIe;2%#ji<yp)Gz409Q{hT`~mYS?ocfkzKP58UB;7KQ8Tj@98+r;riN1BhfbU0`=
zh?S~4s<B%f#A>(makNS%ntr|Pre4DAlEFS1keI)3v~*W7o_To?z%oKFVLe{!oNuwh
z@DX@hm^x+afXg)p_!%4~O4T<FiBS5&S9ovfPsybpLx~#=U4)#`iFucFn6;N>y1qvF
zh;%_H#u?`Cmxs6WsF<ZPhT(jw$o%3WqA09Z$vG~&#P&X$##tG?(`I+k1v^4PqlgAG
zU0e*6&0|S5Kp4S4*;VZllO;=rS{^?DVC(IX=w72PYTd9qmfr+iP=ig|6g7?m{x<;|
z%F(6vy(YSg+^msztFd6p09uk1bB84#>Yj~io1QuCQ2G35gLgQ#%iDg@xY&@K2<Kmp
ztnSed=&VMK59_eHGfU3jb$5G)w`hD0Jv}eb&a9D%F4^f6+<2O+`6z$M6MiJ|K%mlv
zrbD;$jwZhB{+dr6yh<dLmkWopy7QvvL8l0At6ILQ^4%YC$mPlL5SWrw_v<qy7b<BS
zA3@?m`2JlccWzHKYYx}sZW|vh{Ck~Hb^pe9NzM_!7cpMsHg!Nr?x0n=^K8yhb*C{-
zn>sDRS9!17`ePQVbU+p@<dj?XA<TLgHKE-_hH&D{vknk$vN7Np7sBN>8q?jdy$(9o
z%*aA93R0X&^0RhiJO~9GVt$2L|B#dqu>QD^@L;yMy<J+FT$zj6Rc1+tr#flCPWjQQ
zh?hXnMjcCH&5wiOCAx|zG5i_04}YLbv-ibk8cd2z*eXty%dzzq^OF8W?o0c3zflz^
z(^C{K?yCHg0d=aKF`|8QJVah!Y*n7}X9bLO(lhYmeJeLAvt=(;?!XR@yi*RL>4X(u
zr^{Zu=`g;}<NxErEgzyvC0j`NH)jaeKCQRp1y$OYw3>k)2$a4m`4f%)j>f)>c>exc
z&Wl{yAQ!t(d>$W@MCvyE7Znc|(?v%fOdB~yA4p)`(>cT4jd}Sn1ip%&x5ym!vi&!8
zD0McNDI6?hnhBYUGuV(0LFz~sAGMDKUWC-+MshEIKcOeEihtDLVbSEyv!=_b;B*kp
z`~!bpj*bz3<}IQeFTPYS3=fMpj6{(l7wd0534XSfwx3gXjO?NoT57w;;M?GzZ2?vR
zc;Dk&vK$5kx)Z!98T8fOh(~#O1HSHsWUEH_$LBZ%KjZTI>OQrFV_)u}5_kMv>D*h<
zWGA5gPzs62x=zvIlgV!gXY$B$+NDyf_*4l39rBAdhre&mk^72IJd~xUd!%`JvOAjB
z7~(kbJu@Y~OSVhJo~cJC;s{2EnSFA<a%2rG_ziwahqrv)SC4>gBkQ??J$rHk>Zf&D
z%*6tcIC}Bid;gKxSfQB_53v5-LD0}yQdqd?Wa^w1mTwhjVOyOh#D9~<WlQz(w2P%#
zc$WwcWd|$VKK6eM$GXc4dufQ!gD{%EZ3IhJp_bu|Ke7EyXt(1$A5U9svRqBrcWc3(
z##vz*R>85`a<wlPYy0CrTA5;p(df3-af|Crg1N=JB0l58z)e9LVYq0L)d782Q5exG
zNjpZIRipxpIL#_dxr+05n{t?yhf)}g^FJ<T9R(A7{Ee0Ai*`Uuq2lolnNXS;lqa<L
zo=C`%2_bXt4yL44%%M!}t|Zj)|G<Dnc$eHoBaso&oqIY&&|_s964qsjyShWpGk2hT
z{qAFYI?Y8}ooWPyA)GlgluQ$f2hiJR8LI`PJZ>5=YyQmdX}6tqokp80lB@5@21<$S
zVVBKIbd%3=F<FQ*{%~hfDOI_;+(@|I<Eb(M(I??C+HC{}Rb$#xm~oZ5KJeXJdmdid
zsMBBQ8vemF*fZ0~zB5@{_Osadk#w3kIDmb5a8~Uwa6StOvBLT|WIb*_QDKhae8wg|
z)gUa+<9G{+lOxp>K00@yi4Th*X;bdD3VTu&^0}=U9%h$<SNE7Vv)*tWa42Dm9TTjc
z>5Zg&t^*#^4`{Vk_CI&g#Ujt`=k;XwTrc74;~!07Xp(T4jpYR0_v^vVb}L0|SZL>7
zVY$ktwq=}+b^G;rgpOqy<!$F$HZE{bb#D%e(^@&j-=#&ac@oCP2(?Lf1l*Wb%~;Lz
zSa(qv@tCaL<{SjyK1+0sIs6>lP2&KWkrl~RxoA)iUc12}JBr1lbQd4aLpXmN-Fkc}
z?&HZ{Tv<2RH{%TgB_L$^*SFDn0cCSU<~pdM?6eX<5B%3R%>MPZXlMimp7g}4^b3($
zm{^1-<ey8eXERo9nM*tQ`>5BbC#auLYfwv4Yq*l|nT(0WOMy6ar6?l8RgwkE;lkmN
zM|P%`vsEQ~(R|S|tL&r$sfAit1d;R)$h%Wr_i#@P=8=SP6HpibC|bS0Z@ihIOeWS1
z%=uQ1#+nqjGIvv`Gf^I?)B=czIKF1YdNF=CN2JN^&GlT>r#pFVx%=4uczaRSrA%of
z`~<e~%17}Yg(OPs;}n%@Gu!X|sy^bPVUQ|OFXJmwdSJHUvZfuW(@4Tv?-cXa60_p6
z=AO4o#7kkp+Fy6&de}^(=WE-pcr^R&LkONJb3!DJV4120v93eQ_XG2pW=9?#J;9+o
z<$x0q?P2qL;u}EgB;2Jz$quY!yFDDrp_kB+%*t}AX%`EMHtFbqAGO7S(Lz`5y!hy}
zJo8C$66JvqN#!;!gs#|f9Nb}MFu-j}w83g-lmqsTBp-@9*v}0=VO*o0(V2n2D5rq1
z#%V1h671d{`u@l+*_U|G<m--kl!-_AyR(b(qVF@_o(fPV6yk7|KRU8M@b+K~WmKd_
zw_o(Nn7X}LWbg#oZ$oRJ4Dm^H=svfhgLM5Agko1>S##gsmG#D*(Y`$Me$lSL%XEu>
zcZL{bTaNtyjvzWR*m6gGVlrCj4)Jy`CkOVPQDNrDpWPk%Nq=ul1kEJXoT%k=Jl1bn
zd0HbAC-&d-3jdG2?~cdvfBTN?dD)k}vdJh?_RK0XJJ}<IB$tfrEg?cBLMU5A*_(_)
zc4l@~wp7n?`u^_wb>GkLeqPT%&!5lh<@?RmmCJd4&d=vq@AvUOYJ;rr?7J4{gi-Z&
z{&dB&|BT^gn%wEF_V)I@km5&7W)`nC<Zcvz-Ab9CI6vO<B*0VFD~(|B{NK`CnU&`0
z+Wx9D0gn1s?DGT)4)ktwvNPDXj3uAO^4PXnnqisI48)GzRc<u3zRv{HpkyP^M#1+I
zgP-lwri|iO3AFBi8P-@j>;FElx4x}Yn%wUf?o#6QbRpqgW(gNUD`K>nIMI8>Rtb@Z
zTA#NMg%`i*JkhuMwX=Kaw>=|?hgXkg<9pD~%9HUd9Qf1p_?XaIi<?2Lx3o^MPeqpJ
z4w+eRq>)u~P(N3dABA~}sFQOR+2XzDN<C4{vK{+vHW(p^j#2v-!z<YARQd0=uAKA^
zU!L1b#o9Pft?CtVu(4vHs0jCx4`&~5IktYrQWh9ZGfwvlH6rGv642JC-q7L4rE}gZ
z@kLyvZS$j!n|!nlTp5h-Puy8VZ$*nnbr}BaHslgpEFD%;*OL0FXGt=FfXN}$6}|j!
zu06y8l}h07r*SY0S^qHo5Cm%bd(26`ZnktPPkS3Ac(NAf9Nh**Wh3cygcQQ6%6V8g
z!-cyD7}3Y;;$-|QJcY~TO}Z2rBJVnhf<={ow|X_E#?vc^ND>T3v#ZA@S<k6*?X}y?
z88-#8{yA?r&_N>=E=dw1m|N`Qh?z{Y4rj#^b8?_Ci2Nkly?Ax}ace5iGcpAU(N*->
zSjP#|@8S2$9ymFFa4jRxeI)dQ5ie0D!5b?GBZ^*Z%@WxRZmmi>LN}($*Vs}A%Xo6R
z^6K-9l<|B$-@_JQ<!&?{-YoHT%?(5KH*)ksPBpiqfPaaKT<RKnt93L;^p$!#d#n?q
zhbEf!QKb-jMsn29Xrza$&~7~IYNTA-v3Zv$!NbX?D18F^n;3Gd`UVi1igMN6{bAC2
zY;`R!o^wl7G_*&*dr9sUwn=%<+uSz!j^uYG)SSwi*dvrm1ZB}zDBjSMZz$wyR1osF
z8qEbLF!2Ukc=l_=-{iag;MV_6VL=7a!mRt5FnLRrn8Dc$Jhb|{&{t{$5%iH>?BgKz
zhzOi^Q_+w&b?et(_U{!ia?<uw;ot=y{@hLaV>P<1!mLeRD^7svE+Y`Y1dXQPJT!gr
zRCq^P<-IFaUxFaFh;88taR~ug<T(QF99xyznYw$wD6CQiaOPW=&q?xHQ!3*6_giZ@
zo@Ei=%K1HX*}9d094ES(lGNqcq0Bb1IOPsQzrdoz=61+-)L}4NIE9NDNTE7E_+zl+
zT|GeFdWOVb&ul&m_nBa7N0q1Rq1e^b>8iFqT*^iK4AGUErZQXd7UxJW8?&$GyVkwq
zDQS}!F%btCE9~c>1_cr!m-#F52H#nfK5pN)?xTq|?D5vyPAq0nNN|xVZBhzzw$nAP
zz=VI6Xb*l+kkt5P#aCiJFctl)1ylNZTar=2CAB|o^j3RSC07Xxe~-#>C5(A~V9)zU
zj?-9t)1gMy^pkc6P-G(IxgfkXP*m?YMU2P@w(ODwsjC}p*d7znjqL5y*7q2FQo=o(
zt|d~PIw(u>>K*&YR28k%OW_%`E}9(E!uHs*r!+2t<Vu<4W6Q#w-4~uHJ`8{Ka-dG~
zeq!X(&|30-6Dem4w*K)jNtIk_@);#&QS2|lyDhZtRr&J46dmZ9gyutz561G>96r;|
z7gEx0RW@c5n#7_g92reQI`2P`NLgm}3<;P~P@<*I=(G9p^B(IaO*v(PXjGYNj-nie
zh#*U7$yw=hsb(gk@h+CpGl7zdx2<E?=geNG$`yXPH*4HV5bHax`ruZrq@!m>Y70pV
zqplqcb=Y&!&hDX#DbgayLw`rlyEv84^r_%dDwEWPZZ#_CRL)FvZOyh^V5S;ov}Q1g
zO&uW+n50YlwBmEd7l#f<d-%#)phx3cY~?4z%e6Hqw24`fkPnSPsyBwPY=67@d;N>b
znNidGS1gcy&Cdl`YfY4WcKGQ!f5Q3EzCkZQkgsu{nhH<4g_z!!pgNswc=Ts7_sLNG
z_xDnt;^`e(0uxCXab|uD<Z805sV?=s;2{vZY}K3mS7n;`VUNf*+Z&ws7Zcb=>F$wB
zI8TwWoFtx!u#GQ0q_Ln_8oVCN{L`jA8h`Db@IeNB=uD}row1KLzg*&edeed&4lS1&
zcMv1JH&t9S0C<~Yp_bR$6PnrbVg*dX(NsGOL{&vyXGat5!UF6mcE7aZkl(v)rX{UO
zly;LqXW}dNYI|T#g8UE0Ah}+({h254(gcNV)5?cF%!j<MqCRucLYTZ{mvtB?P9l4}
zLX#qtcF|+1GVev0bKnulRjRAfoKM<_x7jLezlgFoZxY)km)~oqvQwKgAOG}#9a9=b
zo8c4jOTDLiD5J`7ezMv=jjqQRTa9Wr)vM9LcEh^Z_`_Yd3!a7Uq%Gl<*8}a3H7!kJ
z+{9#)bZOTbTNNAV#)g#|u6nq!`)c=OP_Nnk(3;%-Mdy0SW1=Iee*2Qb+nRm~f~WF*
zv}U<CjAvVa$M5`nyuj0UYgjsMt-2)PTkH=<ob}xPQySrCM>roB$0G~sARoVW?F=UY
zmSw1&KRY8f-db6TOrAr3Mg|{OfUX;9&OQ;ppL#onll1(p@!ku~`n!ol)wx5jikvt{
z-t)Z6@My`f{qQBaO#X@~NiH%QlJq>u6Gz8{Z;OwW$kKRaTwO|Vr%4vmH%%NnQNdq=
zXDvuu-i!MScnhw#p;}AlA`}_3x6u>k80Nm1C3o459sSkA=ADV3m%LP!>JqOIPpG}v
zjT7WYx5shhe7<w6_~2G~=VP&SarON)Wb!C~IEPKwl}6(wmOxZ916!r|Sgy!a*XK~(
zKJ;gnR=jbgs&m8ER#xjt?dea33Ymw4TwiQ^(nm;d=V`x7Fbd#(2PJf(Umep-#8Qi#
zxs%%gI#Pe5y!3<{o~HNvwWGJXG0FU~DxzQB4Q+4#t|jE(vYg`JcmW+$8Q)?rzlY8;
z$?VZPb-<*Ll;?yB)PP!cZWX+`D1%jhZ)f3I-VE9q!&+G0&7X08#QOR>brQ$;Ohw<r
zM3-&S;F|XztAj3KYujea>&7XxsKo|HyE?nKB=3Hdu6+H+vAPj&{w!?{Gxs(3EcPf}
z`}4a~7PYclk3>dxESorPX0rDdw25@S5H)dh|I&Y`A1%7n`Vv3@lN4jtlH{Q@S=p+$
zi92&;KA5<0(TvMEnD)n9HzmTYD{F&N=Lz_G9^%kQ3mc!U^6LG2y(FX)$Bl}3b&Z86
zEQ67$#Y@Cx`+Kvq4G`oqUZ>xi#K<Oq#PlnTG9N`&>t7nZYt1{|%>CjMX8B3YwV6-S
z`@>v!LNrg5K5Fijwerc5e_ZHP3iCa@>HTe?S?a)wg=ln``=B*yUhj?YzZ~L!0Ur^E
z$R<+y;;e315iZCQy>+|b_-Xhh?7Qdjmm+a7dwE3{8Rv^qu|^c1Y6MWnCB54Y+U}hG
zVyXVJhtBg85v8C&q?-@#$*V`Y45YWJdW})n-2^5G1v3;z0Sf(HJIo-wBC&k1$PsvO
zHbXCx&PC4Vj@ZMk@cdzEjsCxfQadkM|6HeAwIy!*oTcPHLBT@VHXihzF2j~U0HIBm
zx@euhUM>^nImT^R9Wb!BeBejb852RYLxt6*r;1N(ym4R>vqL5LTE8?^RAtG&zUz$_
zQsmnDOQ@!E^jC2A8G=S#B*CiBv~Fd`W`d^4Kj#u<R#IwgPGo;;NDZ<8`;g2g_JPO3
zd2V-u@khffMvlsNWmP#9uY<s>jJ<-WE<R_X5It?q*Y`LkDfH8?Lm97Z(>yrXC=xy_
zVvgmO%MfohZ54>l+p6Sm8$RR389c)m9ubXu{`}>a+xMHOa7zS-LvS&qw5?y?Q&9ZE
z)m07FC{nFB-uteiGy4~}>J9sliO8&F(A#fPVWWzXkN&X`ors0_xm>XRkA<)y@|YQS
zF7bNmKX(a5K&u<LF42~uMt63YMIlUL>w5xgv3_@>fT0`h<`;*6(Zvfe?}E-Q&^z8v
zeA`tINA??V=#VbS7P$Y}DZb}`G!?N2b{LqS6*au-+*h(+R)`gf3Qngas5DR@vHQar
zZpzI-rx$tEmfooypRj0GR__g2-7Pt6AC!SHolCE+=JHu`=^3+G=<vK4|2J%==;%hn
zQC2$SDQ}R})Jt;p^fq+pu%(aYi3Ngqx)z(B>T3?JnpvP|_>aodX$=T;)E99xjOlhZ
zGLy_N{29s}Tra%%;A`nu{1WWgS%kK1yn5rI=^LJ(>o|pjjNxYv9*NqtV;--0%(vre
zc5O`A0|_%;8lS<Ey5?!+IcqAL2-mp1?w?8%D6g%6xxc*DH;}Q^_co_)DmXG)1{6fW
zQYM3y(4*&J=0dniHRl@XhIvt_%oJAvOdbi7<Y$#+U^^7Md>jn!))cOzCcCoCHjSsF
zVa#@e&x20ngGSVOJAtBkERp$$0M6HJWMAUZ4zDOF-aP6x`pTgrj;;GRF8s1{MrEUR
z!QQRPZm%o}D$ClfW425QkCz%WRb!=GPqg$auRn{;qqRtaK|r}Ne74W842uo0Z@HIz
zJ>eI>jhxTSFF@3DdPkw}dkR()VAWevx$HsRa!0rXnT(7E(p!f%@7+TjH+Dg35Je5T
z*bER)n**Pf74$gn2`@Pp*SyXUyDqQy@H8;k$Elu<2TEB;kmu(48>rZt25(?nUXY~t
z80u<S$$Rqf9*`&^fXe-~qL^YAO`gwdKJtq64WupdIJXtxaSHLiB|C**Qsu|>7wQhC
zcJiLa2vwzieDL&bI!}}>x7FLvFh@iGz7+Y!IL^m&;^o*xDcNuPPp|(24$5Fl>oUyO
zIhp!4@bz+vkQ@NETY(c{PT$S0yq9@d_TmSCGCx^s>5mc|G<+jhp{fy7O9_C&NP_N1
z1W?FK#+fsut(V`ir7X)%6uQjRKiCRQ3hfv6pS(@sd2Z{#HfqN;BJ3^W4^4_lh5Itr
z#vZ}!Ex&CQ6_Z?)?v|;olw;Ax*!Exc2Vw6y&HUc==KT~MPJn+KhiupR*ksdF_F5X{
zjOK&f+@brK8vP?4$VahZuXASl-v4#D;&NJR!2<y5Yi7PPvatX5b&&?%y(T`nTKGS_
z87VK05HRsnZk{eIa$o=TE3*K3>ThNZ9gtP~x34$Ia}eiUJ>><TUe`ZY_s{>YAq|K6
zr0=gl!{NWZ$)Q6E$ov}x-4UoY|NECCKfIfMyZWoi|L`U%=r+<+Od9<m{C~V9eZ=b;
zv=sFrKKS3h3_=2Nl5gt_iU0fkz~7`t!@I8;?sysg5BIB_$$;b#A+L(rkz2aKy#=EW
zd1C1=aQi_&^DO9Txv3bv1pJ~oC+Ru^bW{|7hCw{heP0y*S%9zXTOF(qLy=bESjJ0z
zo8OWx#HT(&Ki;ee%se!Fkp2h(MgUR7f=6~4jU%prT*LrQ;yFUO-@E#wK)h{%UC97{
zUizCV{7r~Pc5|peAMt5O>lbA=^YlRFZrbTh%dZifKr?_9^d?yWP)iJWgYsBX%}9qB
z(k=+&jRjpwoA9=UP6HKiMrYtxl8UI^eg$ERC&i5B{=xJ^Ryf}uw>~S8<JRZQ`vl7m
zB&sz#`0-6U<-KTz|AyzvdQ}YSP8nQ32;EECE8M634@k4G#-BH1KR8|vibMy%gvKyX
z4zB*;bLF4y!zPFGH=9)HKePY<ac~eEqD1c69ow(#58%9t4FSKKp`UTtf=8gRG}vuE
z4O;g#^b~c{tfTo(*^*Ub_Kbx5T<R-lk)O+##cEA}A?lN13Gb%^KDyvySY<(SaO0~O
z9Pe;2JKVb~gL(h;0{+`C4GZoD$c>&%X7@AF1&vdG?<xP65WCfaJThp?Q7UNJt^a31
z@o=iW=W~AoFdPaU|G6%t(7bO0I<K6~Al9^D5lLy#%`u3DJn6?slh@r#=WaiS#jK#>
zr8c<Yy}^Wx2q;}hF*{Dlxa)^99OcOQSCgd~_GH7nc1`-WGSVs80*LMmOBj3_#o*HE
zr^z84mP_2X6X4U5kWT|XAtfF;3scqj?YGf5$=n@z8>bnr5=vf&eX4tJ<d(2PF1xB$
z95sToMM2P!mKrs6%0d4CCQOg<#GQK)>&^jm++IHf=K8m54`CcOIke%}K#}=bUgn~k
z&&iOzmVu@}K`EKq*FVq$7XhQcJ438@rtS@1TXMFnKU&<Gj+za{f3D$$SP-(SH%}>K
ztLkv*WmewaL7oh+BD|pBy%9j%j()5yv-c$LHOr+t=9bPhCW%m81oop(;B@7%sle`!
zu?syH2es~~b*C2olwZE<#f4I7gW+JeW?{4!G2oiP9Zu<9!zOh%fqh<t-)Z5P6(;?Q
zgo87vq>}8-1s5<SExJahclZFgL+#@?X2{uzMhpdx2F>&_$UHQcH2hh;v(Kp-O8^nq
z)lty=ZKl)|oXz0#+lPG@4)OxiHjFN?yQ~@mv4UnAF>muD<*%{_VkGbX7<Gs8cqq)A
z;&cJAA*&Z&!-R;d@83gORA-}jG6<l*;a-`?Ok)sIT7QzSmrDq(avE%2^rxGY6V8`c
zXvv#X_k)HPZTSY9GUaXvH*fac;Pss+LrBBo@_6OjzkS0jhe>@;gyeaiy9dkFOe5zT
z8lmW+2U|ksMah)TvWC~W#uS!eY)?i}JLn)MM+QcvLO-3NR{x{J$S)5O-<hN>J2uh@
zy({!E{M?G}#_H$^nQkRcGMQ-XYUt@!0yFuYO8y%iSR7WkqOnJW?VJS~K0hDjmqQZ?
zzTUOG_o5<rm^Zc1NOTI`faUU$$Cy0)y;r%%Gco(+tFt^?-{#wr`YQ+wW|e%ykPxwE
znEofbrjgq>?t^d&9f$AUD$V(iKgva5gvv-}F7rnsLVjjnzpqlTHy7DpjV`yF)WJ@5
zWP^QPPG4C4HNsUEpViOZ*gnm!89GFNXiwxlzuO(fZzqLg!7rW0D&b;03Ua#5Fy&Uf
ziRvWk`T!%x=QB9Q#5V1*gz(1wLr^Vg2Dq33#5ynsT6`ppJ--Qb7lzH;yo8132+LF)
z;gmg>4|NH=vs4`QpJx>W1k%C$8*&a3o*q0QWKp)-co89y7GMN3oFdR<@gR^O0#khB
zr{^9Qr+LU5)Bux7CcGPZoma}v(VDr4x_YaH3~Z8K4XkwEs96f1e<amcalrN%CBVC$
zmbgKUP2!BNo$@l^m=C%%<$yxbI&!VRb6{!hU=I8X_0lWv(_NB)INzKvdS>t{Js2*`
zB|NQARv6md5xj`)EqMkHkOsU{p_f?mwTivq9Uyy7@3$oVg6{Caejdih(F7Mcuof?t
z=^Nw2prK5+9jmqeyPt$cUS%q_vsV48g@<efOz8t5cNiDsrYl-bQ8Sd?3u+1qH@!~t
zS1&Lf1?RPI0<#71oojHKKT^=vol!l9B~E_%JBs|B4>WCsGv6Q0`k{Zn{5R@x%1}Ad
zup$WzlNLD7`tQi<_^^F=0=>K8O6iXG)|Q$a2}R)hgK#8O&&h0-S%G8ctzBexr-tjX
z>JhMZNWo|BU;N~=>1i>+qrg?dnSKA1w!eV*-ad`1r&o3j)}{;Zf*B1IY4eXj1^_>!
zd*@00WJ(yQ9ti#bNASlMp|O)d^Lmb3Qs?XB4{r!;HP2`Z;ZV;J08jC1kdO<x%&rcb
zghoMSqs$68=3UD^oDw9F|LtvEg8FQd```Mb{~Md65&8A@3@E>(SXp%Z6CgQ;3%#E#
zcd-L2XCS$p8PW`ww5dHWcl4dA-(9c!P|#O_8H8!h1*WQ*v*`s>-Sf#!HY9?}^fghy
zRiG@P7PGk~+D4thjEte%5%To<ab3wsNFg^XLOTx5Ckl&)O+kg~G#<FC3eQ2}Y;#H&
zi_C*i(CN6KC42O}{&yaw!UFW^-B(yZXyrSR><T{!VE@J@6{ibfNnZ;mV`k{Pyu>u!
zUIFE75Yd31sg&n~lSwFRl)<6ZZxXbKrQh&d7qdx^lJg3=^N)%iK)>>5X(Z?CH-4K3
zCxR5Q=MrbbGRqMejd-~~k|=}yXo2FbRNjT#0@MabHyruQ<uTi&WT_i3W8^moW(riI
zZL2oe5(~js%>t-EmQ28pK~6Iti?<{2NlD>xxhzB7Z{v8BmozbRfR{5DQ6Y$wcLHmm
zQSpTbIf;<6foW)Zw5>?;1dRBV9+4Ij7EL?oeC9J7L*y8?xkryfB7?V&U|MwB%Yjdt
zxq`CHpmISFgNRrF45-YOLFAc+Q6%8c)!5Zx|3qU=vJ)_P?b4pB5<IDg1%tZ*^zb{n
z&kS@RzYwem+;$_|X?}L9oFOqEx7}>WoblO4n(}!SEOi*21px>kIHF}9yvsA<{uBl0
zvt7-oH{(ojcVp05|4$Pjal$eT&BVx<>SP*&&O~v)!!7sk30EI*UIHc#oe{0EWzY}C
zZwk&}h!yY+zag~i{}T8Hse*OndZycqVHXz+w{pfwrn}__J^kgswE%jDMtuqmCPJTl
z+Y5#tZr_K5rgJMKQjy3Go_X1MZ}?+@V8s*hJ)Lr3$BtMzD|H}i={nuHjiMHq^uCmr
z!vy?cEc`A<37Lt#=#@8PRPs7apK!>9s!m{kXZRE_oXda0k$rG#jyw=^^t?<@6ER1j
zh&jSzm=8nqBfYeTXnvq)@R`vA0sNJA3afQKqB@afRARA_rMGVjTmLer4q7R!#;;d=
z#NgcmkNr{81!oAF$vc?6&2>nUtH?`nlB+5P(@qU9^0<GBQQnR~BCnHp<afQs4*w1v
zx-7C_ctzT%_1|4~{fS67r68j)jmn)0iutj}DA>YLV?Kx=$n>A#ly_Y&{u5Ra%*{ai
zQ-q>&g|iEOLpeA=*~Zlj*kSq*91Kvm6kdn@c99aX^|7?!-=SA#K7(7@iuLkc+flfQ
zPlo#-ClU&!1rd}cF1tNQ3|PU$4V{xG<=^kgvk)TAVHWzrh!OPq@1KJP8QeqA#hPc~
z&$Y(((4MQMx-QB7+@qa-;{{N-e8UiJP9)ax`C+g0PCZydi6n{PwEIm60+#^}7JptO
zx2S~k&c(QhgqWkyhvfIE4+&lhtbgsVn;D{gKtsM6n!MK)yMekz3(dDhh<9}uRJ(Zt
zM~quuXAzW)*qOl0Z4`x#(0VdiA#}dI3J^Fr-p_P;oyN@sv}P;d&65Mu!L?mzqx$L7
z%mI%MLm9|G99dxb(;W)G>Vt>ILj+Qu1<eskiL!;0(9da4M36ErViQ~N<dUeH-W|E=
z9rn$8TE-8WkF8*zlX_2jTulWPm~y8acp~&vhY%b$LmI!}c?=w2a){;JVSM6*B8;rp
zEIzxT?vkby=m+93{?EuK{2}yuCIV$5hsn7|zCI8pc)NVPfP@Kf5xG9?rZSLgIHC|+
zK+l`evvaQgRzm=mQn)0g%Yp(EJGXMdUC_^ofRW5ya#PU5*#fdxOc=!y4(vF!K!Yxq
zy)nn^I6uY-$kyo2SH#LihUJE~Arf4E5JOqTi4PON0^I>|Brd;CI`J$r%}?~Nk42*I
zez7OyuR-I9gN7BjK_spHxV2(?8rI3ew)yJNO`}fn3#?l>gp?96oStzu@GRNZZW%aY
z&#p5S=tQKhLd@?H)wp$z?3021wd8A@UN<Q6v4TM8N9G3*!-J@2fcKZzVSofzX4D2-
zpG{#-(Kr1Vad*VUrVL*v{qSY$2Ase{b_0C43}rSooiFF$bfbcvBRt36z%5&r>@7$<
zi2wk*vpiPe01`hKB%a5ACR9ix8pd-3%}O8KXy#T<z;-P=S&wl;1`?+cSSf$J3Astj
z$H2T{I4*_*`Z1Nz%TFE(k51P8;W&skNwW!M+{~>tWk|jzGy!fY<=u^^$l0_-Dispu
za4&ErjtCLfRdr-8%6*~h<c0SXJ<=QAZ_up24`VcB?yU|vz4D9_--eA_^F=+JH5fyN
z7HK>?Q1wp5@fGn3DMcqe7EU6>H_iI2;5d`4=Ho7rExmjg@MYc2;~P>6lX*vLVw>O&
zZY&hWdFWbeNOjx@){w$ly?1Ko{<pOGTz-y|!GZT^07&d)94C+RLSS)!{&6a=Q$px`
zz@H<B>0$S|D4t_OeCs@vl%n>+TPf}jrt5?v4P)2`aH=EV7RJJ!fR&kgUqKTc!*l>-
zIA7ERG|hK}T%n3k8}lERyF+O(-mhbH`XK&0qfs7~2J12ZU-j<qP$R@4QOoG7_!eYY
z0de`!A<qV;JeEb~3ZmJA$p7rEqi|*nK78jV4?k~o2^&+;O5L6J;(mHe7*cR%i@Qrr
zS3?&EMFU(l>5rZF+sGlP6Xsv%gmdA4a<2+`7T<%wPD;lHkDnG>_~4Z2iHIH2C?&o`
zl4F|!WUQUlLb4GVBAR1-?~6(cC=jg&IU+qmm&70z`g|&V_dFT0V8IaDl|&95R_s<i
z_7{>F{20sGWEIu7NNf2ZyWIY$%|`AX#2VpHc9Vkanx{u{K9;4`q)dNY&i?<hAhCa1
z^}}95Jjw<-7(%nKGxjnoD|*B<Q$g)R+HoDKCM(-FUW4f5_LKm<_|>O0LFrU879rK0
zSJ+;6BPL%pj)6uN`u-0rOdt(3H#qlJ8%S6FLOC)ux2I;v470$*OJuG-LQFhjQ=haH
zUx)nodm60IcpO=wk48Vaf};&o!CQ3OZO*+Kd2~InvXQ~^BO?bft}wNjX`2^Y$&=MS
zY&hBr1#kboPgw|Y9N5oEWP`mY@u-F4XVsXZ(3%|eB}Np>oj7w>EU)Pp<9i6HX%+nG
z)YK`E)xW8WSdQCB09_RIRu<W;yvWI+mp?W}-noKQUzjaXUvc3$zXsEvyOI}p26?7&
zaLu)Gk>i|*Q}~L!bFAg1a%y_S;b$tji;odwkVyhB?9Qy%Anz<A!OOis1p_*GD?I}~
z;q^;wY@{x63Gu5E9JfI~0@SWv!aJ)MzS5j}_hI;%q4et2Q);dlym*3@6My>7S>$f9
z@N{n?NBjZe8Ulz2kO*xG2_+^X^O0_54jq4Z=S{zX(P_=W6yf$d&%HS%hIJ!JINoE~
zQ&Kh=axC!C%3cu5X0AYf;dIQ$b_PDd2lg_T8|SNixU@+5JtXt^{|op3cHuHjgS$Z(
zgc8y0*#d7I0lqI=t7z<@bl170&@_jP%YvXOW(DGjo%M=eKkwqC9w7P}81RlQNSVWY
zD-}kfD}i1ZqC%Gxv!73~hghJ1A2%vacT^1QsM`rcUZ-191LqA_{s~MdB9Z!0s)PYS
ziw?%5NQUh^jb7%5^yAl}O2QhnCB|&6Aao!{WD8OQj#qvg{~o-8?Nqg`)vli_83ffH
zA<zjEu!2zE!wNTG<>;2{{grLL{a;9h^>zB~H>FUb3<Ewc{vSgpnV^dknERC-ViuP^
z84>cE41BFT-N6#b-VU$0eOe!iCbSgXQuzpx!z^^5?&gi5Z<)DPCNtV3TP#B({r&9q
zZh;aKWMzn%Awa!l3}#aaqQ|q^-4kYrKHnCoLgKDM17k^Z1I(cnx&OoXAo^(Y;a|o8
z2&lHPiSfwWzr*UX8X8-L1vU;us_OWrRWA5R3msbBX*C+%ZjtO35x$;$bO80)2&}Vs
zq2QOtkD6Q%+l*8%QXxB<kin;?8a|U@S%?(?vuE{h;Scp+nTeq6_|gCHk0+AQq(WuW
zkD1!7Zc6AJYU!c**^^%KotIvyBif9oBMn^n-@R;y&Yv2J=kQdzO;*%SUl3xEgx)6L
zk7Pzdox`<Te;E+DBf3kAPy{Xg`{!2#;`EXDbX|<wi3g8!ue1m|<R>Kcgi!pcVN61o
zLLo5!VYosRMJU!sP)7@g2vSH(W^ZWyO#^pGLWsu92~=Og!I-x%sZf`sazeAE#wcKY
zZ#W(;WWK%OhP|@^PMr)ASI@(V=|8mfhjYXwHlRLA*b+fb{z<Sx>tmQqdxW^W4BR_9
zyI=N~;Auz&lbHztV0q1|<SQAF6-G<M^WvV~C`5FMK(~D}wfP96P|Z#Px;LNBCpM+|
zlCD`oq0STQ0l-hLydJv28dsketR{hNsX*`n7tZ285chx2I1{cUjzfb(GN+2j8||)7
zjD(TaDn@=gjk|6b-K_l9eon;6&2Pbbl`DXFt{VykOo(EVJ47cR61u3qLm@*-8Ywt)
zarP5k9hsBcl7_-+GZJH|N=>2Ao3RT}u%`o!*-`%fTv~xOOI9(~BZWyx2y|PJ@|Gu+
zl^c{{$e~L*jyUFPYD8J_VKgTAb7Tk6#0JVe1Bz~RhNv@J>6d?Ya@PGJ3ztdSXiPGe
z<jKHTV<5!xXEMQ`oKJyvW|4c#IuDD&mu_-1^WN1Z<X6dD7DAr7qX-yh2A9Sm1jTx!
zeI}%kA($xVm995LQioI?)*6=5#h@X3M~m{D1o2rMiog#A<Q@xU9W2Cfw|<zlqW2I0
zYA;1`Ca8R4b`5V7i{^E?MK5NnRt@#+lY$dmI&+l7w|n~FgYaHlK7^z3A#fWMLqB#R
zDt)1l4qZUH0<F@WUKf0LPra5}))Eur)rii3p6sB?P6vVFYHLZT*8Ba^O<P>E;Y68;
zr9>KBf=@(nMPdU$N3K^x%L~ID;$$3}Yp(zDazEs}{)2i-A=1K*Of-2>*zo5#N=989
zQ6NMFZlh%4T)hN9V&+~K)D$15&b}Cxc-C60OzEqbk;<NiME-;wwl}PQ?^9ncbG;hX
z=C=Le*X~MEYEYWfH<P^rmXWY*`h}>re04GqlV(+lX-KGrhTtgr)ICQ%rLXiyH=(sR
z$}UYqt8=Haa@0D8Yf)ywrNRO~_NA-7`UZsdwn4bGYKtIYVf(!^A&M}bw5@z9&hB~j
z%SqT4U7(t~JU`m-@ZnPG0^0tI+eSkb*M+iCy#TkWz;7l7+0IBY1Oja}c@}<Vd1`&c
zr(dDP`nY7&tAs0N9-sHY<&q@K43?qA&tR8MmV>2|9u?ZwD(4?!oe2E&2c&do6{Lir
z{#yDyQIkLHZrIHBJv3C|B@bspLg*J*%rT%ZRUfA5l>bS|mW<N-tT07YL@9HGtG<ly
zJ_#bqizGy1FJgxJ(R!N@60c10*|J(oJEYmAgh&6@&A4B6wQ(E5W$zNHjhhk7_l-re
z%2q8@$M!i6{R0p{2*S*|_b-GTxg{KK78MJhqJcUpNVq_t|MXOQ@HVP(>&<Y=%TX5J
zyK7j?>rvGmY;3t=ue$X#g=qR9wk!pt<B0Bo^>f=B%XHgh1dlZ>CjnL(wF$K4TeSW9
z^np3u{l$bi1;?>trN@v0`)X`9%gc-pIeJ+~UN|6^oH;|6I~pC5S_a0dM&Y{OOTm#d
zwZ`}ej?>ZV14)3NjJ?p}#c|1GXW@C(;R|Bnh|1Ue;N!}IFVR&e|3Qo55D|qqi|WD`
zsnd|h2-}2!AL)W>r3^W16(+m`eU&b#>Ch+LCLI^qi;05HsV``inwS5+w>Df<T+5hy
zw|Yz>q7+23KIY-F$WFLa#QEN^6~ewqlzn&ImeewqN*nhPfk7)Z$1j?KuGy$Yqd2xo
zp~|pAc`zuCeA^4bOQ!ioRSu=7{vz=}>&B9(+iEMuet>-1+}P<|Jp@z#fzVr8!}@aD
zzw$nH#l<+~C<E&Vk?-w~nAfnZ)LUX2o2ZV#hzZO*=-V`5SlW%dLUG^SjVJXxkw78p
zS>6aOLFvr}*u2Czueef0($W!;#uZHE=jY+S2|}1WqVKhXQx74I>CucSnp2zym97^x
zYATGV`>0NWQVprskb{TYnSKgyiEC`eq5cD1=+YtdW^{yNla4CE8rgBF<EN-F6~u{R
zqEvOK;jI_f<J`>V%CY@Gg@KY=4VDTNzn!Gbm_;-&tDgpsOe6Wxs+d;ZKUeJrd$DOg
z;^R&zgZk;L=(*AI+<^z!1Poi}n8nlXql^Vb9F1Xas2`)cP_2P+Od-}d(7-ZcG<>jQ
zX;w<QsKeQs{GCA{*R%iCPlu&7D3TsAkXNdnCih5%(=30539&GpkL6Q>iF(@uIYqdg
zRL53#Ao=(Zs7Qhx><J~NxwIIj-GuvWleTqr94%RI#&WIf=mVO?J_*jZ{{>a8A1eVu
zgWA30pTF>PaFdofj;`8_-SCsWzdFSC1JWid0E#wnpX8PjXtyU<KCX-$OK2yfNZoE~
zA-#_Xt{U4WsE9Q<*fr?{YtR~aT|0H&2q00pP&9t}Pz8gaeUSmBk_@>!=On;#EP}&&
zeE6Djs}mZddWn^yCbY`o_>I-ZT{bVB1SWE}_&KIXr>GFkfQ~clc-O7EY*>jVQX?8>
zyRWd7@-_K)$*ZJ+LH_m&s{~&<|B6_@swuulAIdj8R<_mBI)-E3(1qFEZ>r;Zum;nO
ze+X{#NPbmqla~Vd)jb_-^ku&<_L_YNc-utl6Mfx~xQfdlTL)iz`IgTD&86#@?oTOa
z=TOhxn+eYs>KILtCRUY37B~|+MnyD3OuzyFR~3?khM&r|-18)feWh>{rVWO?8^KWh
z24(33wtB?tHD)iv5)wm6>_f*a8y^0)9Kk@>=Fby8IVsCu0Dt;Qrz03hUEpXy1;his
zjo%RkL4@zIAUtGw^|M<Q<Cg;{lk|nuigh4TfrdA|ZXEPwX%?Q{AU+xLB+c2Lg0gZ3
zcUq;U{50rAS3hO`4fc}U{M<7TaBc<%$FeokU2zbPlf2ux>V4{CwGbccIXQG2d@PyP
z85<tZkn_XHAoalv_OtQ$*wWS)d*%L~V9is8$Y_Tf_(m|f;ly?#kRp<r&mw%2^=r16
zS%2&W#2X`U+O`mzTisp?{_6T<jpxlaLcm~5V>@7};7I}yJ^8C?AAg{o1DmbTQs)lf
z`BuPyA;2)H8}P|FqkYTUkrLo6R4vGH?x-41LfuP7T)gSF37roGb@xI30_VUQi!iw|
z6}Hm+0>tOH4pYiu19WVXBHbw<LG-c1&|3EEGpvJl=*A9H2*IoA?4$*pH`Jkab?ZEb
z1ewLvZ-Tz)wC4bNl}+Bkhx+|b^REb0!10*?g6@mWyajRa>Y7R>v_S7!7AD!*9&XLf
zLaa#$qM&hd39v7cTkKL&pr*3aE1i5-2=8hAT*3};4t!90(%K>ne_>`x&9+wNCj{4-
zgv134YXO9Y+|TED!Q4Q%o1|hSmeRU^PL||8Ud5WCf_g>o-s@*~7!W->H`WXZOsS>T
zpLMr~JMu3uMne+>#chXGd1Q;Kwpvv$K!0wmF3BF`#}7gQ{g98K$Xi}R6n$Rd(1&lS
zjsY5fR-qt@wWTglzOE1-hdQ{S7;p@%c;4FgDC4yoj#AhnX~n5aJ?kxGQi8RIb1_y>
zvmt_L=2eN^@a2`#F6__|`Z>-uu6QtdELT-rPraQXW$E8c1LO?L<3M|g@}t&_a&AaH
z0<2Y)thFPP;O99+IeC%{)89U{_8?3Ieb;lM+7Yd>uUj+rxk0E^-a3PVaNX&<;x8+Q
zF%Qa}MK!I*Y7mo}I=D;`M$r#O(OX5)U+RMR?O6|$v!h(SHF`3yYgumhuZ}$&$3VWt
zvnZaBih44-pVL|u$YqcAZRlxXn~jh~{*KD$uc$}Qbcw?RWA2^BNZi}kdu%w<AUEqM
z34x$il3wP)y=DfdU9U#z^2YWkLXs%gD0MAftsYGQ(S+tDb#orXtxkKU{+%SQg3`1t
zP*A*HYW8?g(Ix>%CMMedwiVd=#6s_Mtb?OVzU`VySn-Oo_l$HyMJZ#;gr%cjx+f7y
ztpXa#M%AKluikmWm3;~dAz;86ZL9xWyFlSTmDC3L<TeXnaJ72;r{dH@X?6nDpt<0r
zwMzc`0J`8_CrS&Y4@e6Zou&xy3YGtLI!WXp+{s}#hf@=HH$aA1u?y0ZMO8Y*cw9{{
zrPKrBxK`K&S4883TZ6s0X+7J1ne<m(()$*B;Z<}S>aJ+7wD0pHs8glIiYsJI?PS&o
zica-Il+s$OOBwnK$$V6*?=_@qN5(H!&iP-}fk9+SCw0v=!&4vy={NiXR38(KZ#&Z&
zpspT0+kgG5aWhi6%Js{p9l;=s5O@>AUsQBqZJsON_L}hs0{#X4fNDrrM%is}O2`n2
zM)!i;OvU+pDss|TRICRMZmkU{W~PlAqA!`6evNw0)RVLVCC76B0$7O+YF*%><>V?8
zV!jhda{+~DDA{&#J-uf$P&Ha@W%#4IdBcMwoN14SNIa~gYSFDY#>3I9lyV|5!T7;w
zYt^onEkOs2O8=IFfH`(S>`@LXX2id<gwf^OJSE!46XPAtJE(%K@zNqW*}oLDLRD&a
zp`6b{d&?HPr4E!e{`4|gpQK*G++)jUs~V#%d-dC@kBxs;H0aV040gD2w~l&0?L+E?
zo}_o3x)^h_KlaQspoyq=^2c`Kal3C?ZVBBlUjP=4JW}vvk2>W3=+bzIe!H~l<d%>*
zU=e)fw3%jA8{GEq4tb%|1tKKaQmGpbCL(P#E-hs30`9?F`#J4pkGGrC4G*oGuJim1
zPtE*t`FDtrCOX2!qfw+?vXwZ?08!1OicwDl-STO}!9=zuNvn&h?6|WCk4s&!+O^;l
zoxfa0zjIbXyDucq#lF-tFUr5DGS;x0?<(#uLgt=Q2xOK!B%Jk6#<^8xbWivSFYL!$
zj!X@w@P#Cli&me9-p<T=&hS<9jz(oc!c?AiB)`j!LaU;2tL9nJzTm85hqxiu@$L`y
zjQt1<#lnc0G&U?rv}W`SsOXj0sf@420JG!i53$-gzbqDI6V1)_3{C_i>e`9+Z#+7o
zHp6G?uj>l_{J}VK`4`9;|Jn}Q5qZSEle}G#Iap6qJZh@;{>1}g&6DT-j?Zv6t_$jv
zlyq5l&}H1(aQi<3hFe_qEMUEVo?{fok3Am?^0_<8qrt<+4orh;VXmZ@^sRR<(C2Ji
z*YNN>t+H;_$~k^!lS?_IK!4rhH54gEIU|@|sH=ohIElKTKrG`d?P1j@pUQOh6yV<z
zHt)?rboS20*QDaHW;mo(m)L%y=EELn(!`p48l4MH&5ae}&)4DEHo6YbI$bI%a9LZ>
z-7*HpEE{CruhQuQ>!IDYHIgC<_vhZ|l@zQl+%(l^<Yn<fb<M2vzd_egO>jQhO}I&_
z6Z_e*ah^%#y%)89?HG>pY(!<RVJe7QLj(01{(O|B>LmBD7EElpDb4$pu9);KDAs;{
zPcqjvn0=LeTC8<E|J`TWxriTde&%ILiTZQN=4w#$?Rp$MJx$g-@!Lw$O~es@%2TdB
zVp_?t2ph5WH*vQpd5TeAsl58k_%DXr1Wzd5W@x(A?$h!2S41yq-ZqpzO8JwLIY7zm
z4@f)EJ`p;lO>B}Wz=5BYaSjtL$Q5u~(bUo=_P%FC4S=y4izLaDYEC6sx^Jy=UhLW}
zf6lt2W{oNVj{MYnMCDBxkAz#Aa!ap?tu-dw*vhOE23yQ>5S6s1u`Nd1DXJp_TCDS^
z{mDb9v#^_R=4rGu-(|&I>IKAFMok1yRMkP#;)083Bieovbv9ZiIj_XUbrm>CjISfF
zm%S`ymoLkGc1`g2-JgIDVVLNZnXvt3LA$OCB%gm|h^5@d1Xt0nTZYtkRezSVj$*TI
zO4<uFrZo{)96@VWb+h5IKb|SDGx0Mg;H@HCW>n;0VZ-FP0zdJ5>62A~DTYWUq!w2(
zm+TdY>R>3y`AoJ*yVEV^Qe$Z(*>M?9H)K>e7GE@Mi<ZyOSgvOT#^4;8+D}lvI>$sr
z?3`gaS$F^wGu%%6UdtLAR|y^rv%3p62!nuEA8x$RxN)qN6|>N9fUmf&9BU!ommvg7
zW9?|3N<An>M-gms)ma1+rHWQlP$4YqOBO&Fs{0Okrn^mc9iDPgc!I1Ys=}Wz2xlJU
zk^5OEX1yt}eJ~U)z!dL6_r9vtW}t+C1|8d)HG{9j*b?U|fpbvwu5kyXu+<`W0v%Oe
zypy3Vj$0z0NUoN}vov*&>S!~ny}s3|aHdf|H#Jl71rXg7t@-R^BnZO!M!x&EUer`y
z8)+24<)m&&?9a}M<{`C?%gq&9X`-4K%r7a;Y;f^0y<LzyDKOW`8Glc9WxLDwFIs9*
z+`_(9*}8`naMi?K`}!HzHYV$OQ695#j)()i10su#IL0Llc3qvO?3_ew=i!&RvioS^
zY!@?=EC+NC#SY4QTc?bAlh9hZO!4J*y26R2kl88_xnhcNL+LgsY}+~dG?>iBTfMfX
zqMezv%aQ2-VSK}9M&_&csR{Ylv+_KVvMU=aM1bFIn1+au@EENfD7>;qX^_uo6OJav
zn4LDjTtO>+tZQ#MBpaTgBYB5}wewMrt&=Z)##m4@+%DroUxgY_l3aoPvomE3%bpiY
z=RCG)Y#2SlS(WigHmivEhm<7PW2MGlEcQquBYDo}Z)x0I>9_H9<XNh)!{(%P!>P5`
zAXM9>Q=$lYbmcPZS<2a1W5!q+Ji*PpwRvL_2Weap17exm*1inCleg_iSv*=PSX#Y+
zn6f8%PDEp;(^rn(VGgfs9amT=v^k|}PJ|U9pT2a%h|Crk-X{Cvy?EBkon!|Fvs`D`
z%+f2!Z1zfZ*lO1gTWgY|#Z=s(cz`1M=(T33$2Q9}t}Mu2>tg(?mHGix?P7}k$9q30
zbSg_~#0$C3m@>6|uqm#PKzWUlsY(rYbnI6x0=DVwxMF;oi}FC~cf>?jse$ZN!hd$E
zDvqkSw)NPR<|N}-LXr$sezby19s3P2?6caLKGBa9%nL0?AM2&$O`|iUi;9OZ;q*>v
z+t~7vj&TthWmP%TEt)tbbXAv`9xP{Z#ZB;rmtMo?q$EE(S73=UG!;QPNlhq5>ITti
zT3Ov`Mg-mXtKzr3uL=}4fUM`}Qtz!lKk4E*^;Y~ksbZFcf+A==pB0(+?CFIycEj{;
zEbM_Qdo#LMI4nX0F)l|=ahZB=WNQ2C@XO9>Q?_2mp{y!Mro8II>UMFM#5qXFr||bX
zwLv}qHqZ~j+?_4rQOHHJ*MVd|DCKH`UEFz8_W+3WkM*%(6N#X=WoMlHjC`;7_lR~>
zD};%Dnc^vPRk>CdiG5;;x-m}H8idSpy5I*>lC-fnYo6$al_{I!#fnr_Y=ufitGUlU
z*xqcee+S(z&kuK>8G+8M8-ZzSa$`#%h9);wO_?;<Ig$41pTw;2yx;F+QtrJl&vQ2y
ztsfI>O(89>MeSA?&rXU9If~+45tWcE0lCe<GVTgek}>S?>Trcyw@;kEc(;>n#5&AY
zR~pF8bwNh5%A!dSfJo{4S|neIXe^A3$4Kt240g1YOA5~_{)WqOerKpNd&N9HK=h&J
z&1dz4<h4qv6O%8iwLSc-X4458MN#6OY3mB+oK;f8nh!Y!4^3wG=B7oSW>PvW-$2Zn
zG8=+0riTZd28~RpnNR*jG6wlnPyMSTXR<^r8v|_%?OK~=@*SE`AHWkxf9!=U*GLb=
z=fbH<lO}`Wz`VJ6{ts?ioJgimqIM7E<S#yVA4nIzHJ)FHQeYI=)0ut1q<unDW&Kz^
zf<#dCk-C+f+E>kSqblLj-}u|P#_U2HGfg2K_%n4-Y;1Qkd3BvcvFZUzD`_U(oX!nr
zoRSz%bpFv2p<)-X{#fPGXOb^B4O6u8NT;P<V$NzL<YXon8qnty6p;`j^r&3Dv=cTe
z5i4MGv!GgO0(hhg7Bn~(bx`gy-Q{hr2svsW<tSE32suXLecH7<ahcf8IHbe9z{Xu(
zV~potq<xu7?D@obJ}o#?F@@vL`Sw7YTMt&a4icM^LOl<qXp4L6f}VWJ(U*fdZK5u1
z1y^+Xc6GDSYGS6In#x0jQyoYEcxCir5c1vk*fY0|o%!;0Si`BzEVlX@Q_r2@Y`-R&
zooiU_NT*;t_CS6z6u_Z66&sOn=VQ6Wd`o|yQ3%T$Lc6yexA?XiTRtNRKeuAIy^$F(
z_Y6^A?*i;#E!_CSqeCfG^W0#f=T@5g=-ByB!VQyyJsjbvtGT6*D<edkJ_ZU=gYwe2
zIv#Z`pQ9Y!E(9S`e+s0%ek!xZb5<>v7CC@W?@l4NuH-8VY@a)2ZM5eV+4I?|sRiC;
zxlPww&}E#lxrSLI2=fW#A<Cd{jbvw0oBWAA|A3}FzUUT)zv<bN;aVg0S=sX0aK;ZE
zgwB!RSjfIStm<G>eykhqA~)3XE0WOmL(tWN4+1m>mR)sWhk`iu?MWn?B8!`l&xn25
zGr0xD-WUCsacGn~^{@%nl6@YDXpC>2l2PLdCF76!_b&%4RERqP^HBqtDH*jIx3tAT
z@AyeOJ^FOZVo?MJ;?VS-vigAwLh{Izkbfv_33pp{>oc202PFSXa|$wXYT=>B6spLK
z0PHiW<l)9#VX|d6Y@)bQ+!eyu5^E!@o}53q#w=Z=ictu(awZL@4`<52A`p}wyZV^V
z(3BH2Xe0St_?jf!9B)FIFcIoSa_}sSxIajbg&4wsl>-KAbD}t_aqW*Nf0Vx3ddzm5
z2#VP3E<t$=Cd8`8i$G!$Lc;h8%J`O@prKyO9;jfG2@&xl@|Yu6Ye`$5s|xpsr&>AM
zL=Ha07+jey{~_nv;(OMLLi;W6PXj$3n}>~9_=m<6RD`po?qnX!Q!P|6m8vr%#Z?RB
zLGd-DXBYnh4`)%4bCX6O8R>b{>8=u6Bx{%#S9t-0$99!_(&h_0pd53=z!fLyeA)lu
zoOi(Jn+W;l`xl3)n;m}$?+x&*E=X2UFoW(z{o;0F*2@*=*T2*lzh@<KzdBWWlE>iy
zkCG7jkBkQo&Y|8d0R>#8AQwr%^onEn0G_L0OvxpQG39I~Xy*OSBP&#rL&_B6j?6y%
z7}U8RX3fG6o$+rm=eBmAq0dD!GktQvX_y2HXKv2BXPY5mxSG{G=#i0Wqio&S@=S;p
ziR>KM`F}8GaAVN!B-pwa$>u%Y5o%K^)5WS~OjY3Wd@~H?opKN${@t9v_yEVuj`4;d
z1S$FsKr&-)Smz61x#gsEmC^uY9F_qpJP_1&HuXc#b%+%EnBe!Q;1IVq48MyBn2l;m
z5p9*w<Mu4Nmlf}Vuhi2>MWld>hY|F6*eDfR6o~QDmHTI9d#){V)~0FZaC29nymnYz
zuhWfpco2kP`b&l4*eX`;yOX#|LD^3R57m@!@^PrmVcT<<CBte`ySjs;_;DgrS(#3b
z`um^{z%+_lKCJ?xg4DnSpUW~jb=rhj^(P~WI!&W!=LzxsNzeYAw=&G!0odB^gY6hY
zwPnt5t8rd_L%I$tt?ic069c{K^Gm3Mpiv5!`ACz1!`Kg^)MM6xsX<v`Y(ZCvgD#8q
z9kNIAOn6x8y(uz(tEYgQjnWO{*^p+RhIvqR{<c@GdWeZ%5u25b7B}giIDT%%YC$*Q
zx)D|3vY%{uU*wptRC^fPq~`y;UxK(+1R`a*hrz7z^G*SiKlS@S`~V$~u5yZX-4c=$
zvDZT2Yuhoqr+a0#e*7<-6K<JV%$o}u(RGMU6;bFrl6P>jl->vvQo>k^Y!<{jm#bR4
zFb}v;QQ-$+ZIa>8EK-W9b(V(U-_YDM;?F;6*zY@Q`e;?H1((~w_}TL$tR&Gc<)rM+
z%aOLq5<Xp#BpeZ3oQdaxDI%E!QRp(;F0>Y25QTc6>myS1GcPxPul;)0ID3oql}j?4
zzy13h^$tj<>%4bYH$U8X(lpFbm9>2guyY@r-!_zZ)P&MAe)93|Np5bUCMW$A)?)?w
z$RGl)Cm|6;Yv}<z#nPPIKg}XUnp0syn^>q?0PA64-L%Z=(@ed~{15&?s1Qm{IzCd$
z)Dgoz*}nBIQ;``xf?&>(e3N_rfEzkT<He~ee{&Jdr@a(fW!0F2zv**tsZldbNW#UA
z!-j0nN~`RvB{WQzwS^dywdOWPDpw<bX|UXcM<-dR@xWA&!U)j6v&n3jvlCqLL4L{z
zY3Z@cmKL#Jc;U0!+Br}52gn-CPK6Y{C{>aBHQ^Vlp>@k8QWo>Z-~p=+-pelztKXSg
zFJNYvm3#++6to11=Nvw}joy&r1q3C4J%46lpj*Ky%?K!9elA{$TAYtXJu5!$6fu<S
z%W|G4+sRB9)n*)En5p^bYCQ=Z2`ea$$dZu9jZDg+@-9j}HkxSXK=Eb<1W)1M6}8d)
z!BJIr!|d%*y_ca*7Hi~{9`D5k$Q?NYD6m-@d<*k#<Bv?<pKwhWc;+hCV^_WXQ6)Cp
zYw~S4iG1Nww`k6FNgX`F6)Bd>1Tlr|<6owl#DOiwnX`!mFPWQ^sj<b+PMbPWK?A56
zhq1p~moDAzy}9cO%IcyAi6RF%BHlhFA74D#Y>%xO2_|K@*y;B%vZmC!e!UGCW4E6{
zr`T$7wjtKgdV5?Ag%g52)`;4$Fuq#9SAQV&Y0*To=lz@14slVsD?KV_cZvvZ-g(CO
zcTx4NkA{`!{1<aA5prXv0J^D@M|Cx+R`oKClKt;x(->VA+g~5;6L_}t4TF;T&brQ`
zlkLQ~@lkV}(2msNqJ_w!lj%B?8bqdF>-(72=50z(R$qvpu+DJpm=tK8bQF)j376>j
zYEAO#&)<D@acI%zlj3?G+KlQ9(Op+c(Et|P;mH>6Vi?KP#lz1jnioHGwQhIf!f0wo
zEMdF(8Sm-4a54PH>)};Cub)MA&yl(w7z;=M7uj;Y`N@$)mU^GSFZ@_x&z6w>bhlq(
zbW=U=#Z+#kdaJO+zKt<iWHAtBm6>twQD3#K#T-_ZqCMm1ZDP)U$ggowJ)yl<={nSA
zZdE&})C~(T{gY;|jMn&|2Y(TJ+NbtO{zzShb#YB2?atWymNDS2-k^$!VP7nZw*KR5
z>AXX7MFJ}MH)LK))S#{FE4wFmy8YJDe3;lggNhfgIMn91g!LWXw7vGzOl3NC=R9S4
zN1*6|WYl!mZRe<V_c%A-3-M2><~M`Vd&6Gee`w}Z(ormWFceiYNB<#K^zFex;`EUc
zxy;Z8FsvQv8sttMEh9BPpfoT8++ylbIHU(9QCY<^Hg9+p!pcRpMAA?C+|g<Q-Xqd%
zu4u00c1(5x&pcT+vkk2RzHVU1=1E`q5$&0sj|=Rt*pAuUG2Nq>>96}=Zs8CfESEwX
zciME+qW}&K%OC8DmYH{1f<xRMijWwO&w22SE5Fv57wpd!2-aBTaUBEZb#s2})~?i=
zF3+^Sk;Tv}SM*B)mW<?%NhPi~dFvaqlpJK%`o69?B*$TvffQ@rOY4~-WYG@}$%=Pb
zvHI(*6xEs>cLQs5jgBKcDGdC4V{4XTeRQ;Uia6Mbay(6E)3(Dn=jhrC^MWL%iD;y)
z5|(Z$KSXQeNG~?PL!Q1pX#e@bOJl~2xhtVEt#531FWATW7?B@;I^-A->7Zaooim1+
zxQpa#f%nU%P+KTrln#onAZc^L?BPuc14p?%O!&*xn4SyM<Mu3)Fqi5?VCjNqJpRZR
z@>W)5K4tq>Hf4_Ka9aO@HlE&F2g=dgs)q`eIS<uKb2qxTBVOVpn;PuZAc66&2lk(r
zG5<QaloVTXQ^XTLLnfqx)YPA?O;J<iP<nf>3La;YU0UU^$DS~lG~=jZ$$8+!I3Yag
z7`u+=BDxHfS+iq(y5NC=n-5-F6z87oy0?hRYW`jvBOM!6lxMsg?|CTrLjSE^_r{MD
z*6vJmy2#)L-$vzO+Eun?o{6}_`kynux@GU>#+h%j@Rfaw5R!Abm01hX(nZe}nJS7e
zn=swBJ%y-`e!(o7|5u;5OBtK}CuE*8)}5N-WU}K{LmuNnd_DO=9y)sN6*r3CuL2H}
z{2hS$@@Vv2tlbs!)JPZE)(NXmq!WXkjg`e6<m`>^-tiY31{v}t9y*#WNYDE(NaqiS
z_uV;CA4yao-p9%28m5@=aHKMOJQ{7Vmj5-r?_H8-{?m6e=TU9xmcFQM!$@}ATYUx6
z`TAx7`~}N($XZ68!O{8Ug?tl8@Fem~JFzN_sLTnOM{8f>;iry|%eOs~vXm#WV$lr(
z%pUk6=ibn7-4trb-YJ;g6pX;y@DAZrJ==7Cr(LonYGuy5Xx?*N!;d1?qBZ<>`1PxM
z`>h9CJ3<->0m&{3^>w@?pG{SHmtgkvc!E!x47GJ3VX7txq1bsEQI>aM@)bGC+|@$u
z!XCzcA2n^kxdaLaIT4z7`?Tg-25em(*^&3-q#_O`#&V(8kL@+QynJ?t1S{4uhae{B
z^<u@VUkcx1g<=Wslh_Qfai)J?lEAv5y+_@I;}|BF=F{2z9X7^NPHf?dS-Q3@_0M&Y
z1g_|Q9|NgK=8)9#M;)^4?Zp0ey;SGY(L5?l1QL<q|BJV`j*7C4-bQH#sTp$U5*3jc
zlypE!KtfVPKqN(4q$Q*okPZb25l|69x<imsL_k2KloAl7OW;L4`{8|m-&x;TC;mJC
zu-53BnP;BozVE&FwXf^i1&JrlvbZN>|3NyYv!%R0j4~->>6K~51_Cm5i$i*3M7}tP
zZ6sP=+~xm;?Flfu;S=*w5{ci?e>@H>#pv}b)yQ2n5(p+9yro1gKr9fk{T#*4eSw#z
zkw5lVkHSYD8s@2+OlmaOR5i%WmoVT#echjim5wAl`Oom2uWbvtHGh6(Mf}aCC?N{=
zK!H{#O`V_a$HzlkjpyD=Iu)0zElk8$=&zD1KVT;LcEFOkZv4Vqr}?)HjX{x(#JD&s
z%(Two`<^HtA>~dNxM{+{_kj0cx-B_Yn?@r+G8r?PT8wM?U5MUz{<WpXM?MnQmWX?H
zDM{XWw2Y24x#p_H)s`?34~_*g6}uMk?tRYPC0}T@<X^uXB|DaBQi(`u2rNrY75Mhl
zeU%Yz33ALZ7Kfb)mHgo2*LJEpvskH^xanfuZb|Le7#btrg@HAGx*(6Y=<q1!kcS)E
zd8^12*{zZrS0>)wRn;iCP)tR}-1VlG=;Gf!g$qH~9cszaUJVS^ALT@sNn)N1cGm{H
zKt_|#3rzpLe(qeP=IMmo1hTko=Mxz5JD|Q>78h0aS<-vX5re61qZQoh<;Qp?=N5IG
z<n<;mmm`=Z()FF}q6*>uvL214gy|c)9ZS3|<RP{b_h%A&mZI%nXS%LlvTUK{9q)A_
z85&g3{lcy!8m+H1pgWt7zQ@nCJ!IIVre{GC$M8>E+~uua*{*c-`Mi(6ZSjPw?tojU
z6B&@iH?(oL*-;&X$*8xvm!qvz(B{8#2)VV}VWZ3em$Q!jx)Z~j+fjPU(~R-Cn!=D6
z-IM~O(k-gH%W9q-Nl)naHyn(bhu?*5c$YGH@SO~OBS9CS#>pOvVPulysU-d5TA+T&
zz-q~_C%h3b=~37|{UXlM-~PRo%LQn9e6*PTr;dnq;|8j1InV5Hq;3?A^9y)CsM^W!
zK4MR><HW}HM0vdVtr(dLLeJi-vYwlcqdu?Cvteqk9nD*_?k!el@?2;@<O)u$h5V!W
zUi8jB^zt+P&vu$HynTe#sCqah96H0`^z5Zmh-6`I6)o0;`N)~z|GM5WT|DD)Z#Px0
zVKHPJI!e-knM!0&F6Q3qP~zg;w-3FT{4VP5yOB8`pLTRx9Y6X6n-5Cy8k=wO9q|)5
z-*<16rVTr=5=pmRtAmrwhlVJLWxklT<NLI>Go@^T-7T|h(3?4}<uGt^NOer#vUSH+
zxGnt~rz+OB>fyzwUlj~a=F#V|@P|I{sup};!cX%iU@TU^MO}NN1dSV`k<0oYF8~UZ
z!uI`M4TJ$oMy#WR-}?MuNAkDQuwpAOuTYy`g4~b7@9NdEquD{<7B7^t*^@j!*yZ06
z5#htPGLmxtLB)WsP}59rAt2j)?JL)UyFC9-hHGhWQL5teRdkz+(my3hnDa<q<M1Yw
z(O~yp^CsxU(+3rrN)dl>tnU;$*0|c-zmOgeotJXdK3zW`HGZi%mfA+C8O8si**v9B
z<&3h%OfHZ4d!H(?I8y#tH?x>K4X1lA=}}c3s3Yi`DWV;Iz^xz&K4`hp4Ab<xYE-AO
zL|8I_y6)LV%K}BTU6>g6)FMB^+d6T;IGmWqn}&JSG-2zEl&{<E&+PpaG$J(GG<0~W
zx4dD`RjSW7a8C)~g~WO5-v>g+HvBo|XOE!f71xfeCnE3aLP^*B78JKH{hrNGc>XJ(
z-w$h#_s#Bmeq(=2rZ$OkWyLvrUurONSbALD|Guk}z&|?IdWFK77QO(z*_jYiAka;C
zc43DzR-RkSHd(n!iLmKnw83jeZ*iiSo{X$3c#G9j_qf3m5dZ96F}&kJ_ZfQRHTvAw
zGN5dHUE>~nuX*Lw^Zupr!z}CgfmP3f`ZxaLnZMNL&h~vM<?CD2_nf+Qa%Z#9+~4n6
zRRX2ci5+2i4~x46Qj7m+{gw03&|9Jr%s!%h&BJVor;~H}BOtWcRw6lUn?eB)L8LNx
zHaff(wdU?}lGf?#?Lta-2^knBtcN!7wZuQ`|4vW;qeDK&1wZ+5-mBWXOp;`ciQd!+
zM0k=3(hL*C8gB2iZEMafp+ox68eNs$MdgTsuY;;dWj%SeUZ2uQm~F{-tH7((X@q!c
zh=pze?{R<-Wl9S)pM-drTn)G|V3_GI^=eeJeK#4#%^n{)r%dxxUEK*7YlGu+&qok+
z5D{A3;zokC;yaY3ucWe`)kK`VC$2w{(SjfAuq1jrzcDNJV$RFvX9_=Vno&>i;BVXM
zxY1bq_%7Thu+9+U+tav{$lW6v4?0o?_(-z(f)hi!3HI^P*b_~Q-{Yl~)OVix_4_$+
z74#3DwLV62%)+#zvd#a+qe6}IwIG%w47D3KBZ|VJU#V}(MG3O}N>B4KLVtgOl=Xs|
z;}ay_c_PD7)x)<Ty-*aQx-C6w=sXUz&cU0lte^B~xI@n{G*ym&`KI>y{>CS`^I}QJ
z#PhE(PPI<!^7$02&)=ndp5bFnX1x)eTKR`3kN7Uh=Fq-9__gK+o;ifzIOpfQxvLG_
z!^v3cxa~9FwHv)I|71{9wH0Pp(xCF~J#<@Ox9k%LM-MRRhfC_ODC%ug4P%~D@+A?~
zpprV6tU1sVX$|b9`KIy*#q|hl=;3R}BLrIwQJ$b;PJ=g1t~!+^+Rs#{-9MxbRriV2
z&vM&xVA6Lkj`oYF;mZ%(r5Ko!bM5ZAA%gbrj`VQBJR=d6WRDHCHpYlv32b(Uja4SS
zvbG&Q*{CMxVlF%JLZB%r{^5+}>ec%~$!;9ViO&aXUcPELJ&WSYMNfObEU7tW^-#i*
zDtZ6TcEcoGRH}c#upCl&d`F}<*-r{QV9RT&6VWr%xK(JcH@qcnkJuycSo==3Mu2^v
znsSfnUEx`~lcZ$=Ht6zL=Tpc4V}c#^nX=##n11AfyJ{et{lajOKs2jrN@<tdb~yhr
zr%=OxqnPhK4Y3Oqf1D5N(p}8)YKk85<ege2;5X1*2}PdgxlA}`ZYBi(fP3^^DkNZF
z$dz?4ni^*@$TgoMZY%zbedftv^sBT@CK^j_lwBG05C5#N<vv{|unUrSq2KzCtT(2N
zDCr(jeER5oeViQXs0Jdf=LpYO@e&YCgqB(huR&1fV$|@-_7gp5B`gVrwUknyj<pMN
zX7pi8zSw^!?Zyd`eaY5I-2{nheU}%ALib>A37BsnY$ApGV}~WL{#UGy^Z+ftKSeHl
z+W@}m?ow-!8GL#mVzi_1uPu?HF!G>}<rhdDT|^+fXSiGUDzd{A*+66@)PAav7T)uP
zKhc`zHjZCaUwwR4g)<>%)Em}&VUp3mDG4H5?K#Wh7=5(rAHwIaCJ!OTYdV68t%IPQ
z#!37xn0|Uu5{0n?TZS4)gBsLl3*qw}k6(F(yeTyb-n1DTVS41u@&%c7-T2Kj8!lrA
z#VeC&`U`6OGN8h-jRnQv(MB+>^im_3J2=C{V6lp#Q)Z!>GlVNgy@GSu@JY|;L)ee<
zPcK9cC9TX1F9wFl(3~1C&j%g)W{O)WH;GozqE3U+Oqd8v_?3e8KFhT$snDXn-BARX
z2<W#i!|ol5Qlv%7-jD`F7RB-87*YVUgapu}<xMF5u;|nidQ{PKv}IZlS=khu(~c4B
znO_Jy)BitFuRh79-qKiz7Pqdi{Kg|vViEbfHIhX-WRKK?B@?-;NY<LF=zL4^%aINE
z;JN0J4(ulKTxfn286sCQj)-u%WN7~!8~MA}ePI~}Ayy&c4C}435+~@2R>B1G4ut2<
zO{F?H?fOCn^Id`%{|D#+0{;MYa=G;0@0XZm|7RIDD*&zh%wl?rFZ1Rjhr7q$Uwg>$
z^$jbqmuK1xhmh9s^j~=DAB~QFax$XUI7=FuxCwJHM?pO405FvU>J%B*iMJat@Ok#~
zZ&RldJ8<LIN1bU<nL0a{XL>iD=(VJ80fMG@gC=JJxH&hW4^jC+b}@@DCji72yZeie
zC%_NX4dg>E#YsZX(v{z6hDB<GYKAEmVUGX?1RvM=AIBDUc=4(5-0E4Hw;=2P8nl%;
zHL4^1g~|FIqV34Y5<juO3OeX&aDo^Ixq}S?cGF_Td7G{f^7B>5?otys8T<h%;~0Fe
z8}L@|_U-_VF4km_{*xet_}hE|4Yw3zP~*_Nx`p^2T{3d$rGcN0jXw>HOIQeqv(DN7
zoOKEjo-*?(+Cx;^HvkB$!M~#)dW>?7pgl&wvW&MHwvFvbgZ4b6p(`Pj++}c8Lv&!;
z@y|QmpuyG$g{MI0O&nwo{yOKOaq(ptbU!sfs-1vyorn8$htAA&7Gh@x{1M!F(3ZEK
zjo98m#yJ*0T&QmG89t-*)F(FoE~J|SwXi6cCgw`N!iG&vgmphlL0$u@(#ULh08+rw
zMqjA`As5;>8FOhRtNQ_OjpB6nn~^9Or#%SzuN1<Iyp;=`aY>q@IYe6=R_;J*@dksA
ziPJBOt&xHE&MO(vEjsB)WHv4H3sw@HyE~ZLR)Z!<*5IA4XWmy&MA<1SFhmY}onKE5
zjsea8%$IH;ipg{~qR)tY%^Sp96W|%cY{m*kE)8`P3Sfn$V<2R48X+uZ`>)npIdg*^
z*SkmPc?Zjrrag&w5eIO{Uk-M01ILmam%5qA-P+f`KkQE3yLTS!Gm4%?+J7mN{0{f9
zl9W!S75r*g8rR=Yy8sJU?@XFp_q9;XY$C5Cj3gCWTWJh5p^uIZ1V6Y&7}@@dJwI_e
zvzyaHyF<5a>l=^=D!{v=`NuAK?B&(IDMTOpv;t^p6N!8({3FLQf;|a}FBvRFrjzaf
zzctcl1US3Sz)|t~5NlkF7;=mPk@?fQ@t945JpXEp^^0L?(*K9Els^&CTT<qSh;3YE
zJMUeLjJ8c;-vwrFv<r>xu0#J^_;fX_9bJMbHpg7|*flQxff%yAKr8Bg@K<CCdAJA;
z;o!b(;_LiryeBeu6WVAT#7<)p>bt`D=DM2Z>Y<o`yA|Ljb2qu>#}V(<8ac`%Uuqs1
zhPfhnl#Azzg@!g$MH>1niONkt%Db|?D>8;9jteCQXHjzd=IW%2w;_a!Mv&Xhy$Whf
zThhXu5+ZIg1iElGN=@tLW0-i?3%(;CYmDA~di24~V4VFN7+9NuyjkE6+AOlj<>5=?
zNQKN)xciqfzB3-}Y)QELrY;@B7I30kAk;ye${{TO$aG~z9Se`E+-@nm9ZaU1K-``{
zTz@k3Ky{As2hIj_Vr4p?!3%js#78C-X?q%+5UU<GKpY}eol{MnLZ*&a{6^^Q-YeCg
z$BQfb*h?=-UzW4u`hw^r{U<%sj>1edT#ZCr<oHpTK@ND_5K-7|(9WRdlGq7R%8`%b
z;}JGYyv!`hAL`gSSG>Ck@RYmaMTdUz>uHksI`W-n2!D|K9`?{3vMj+-XaCS+{KsE(
zQPQ&8cA(`B*V|VhTRU6RyY!tKj;=3r&tAeAqMECXq%!}`K1Uqa7>r^vDDM6JXUYZK
z4+vPa^On^=y7uJEs|&AT*<sWpTo@306rH)s^pTfFm|HZ&;h2hTbMG;`^0=tfced*p
zaMn2WIRvG^9%w`xp|2nhg*Q~LA7%~3IhgRA0pc?cr>EC7`0mfhN&Yj^hzs#K@j||v
zP03pE3~0Zlf)4jkr?qx-;2^4L?^)MV((?m8dV693*swW+omT9qFqcI!qiU`#8Jb1+
z!f#IjI~o3WVnG|!jfk4&cp<L_!fcbeJ*}7t{9DbD_7R78va}I4;c5tkAiajKHn<<F
za1<j?ATMZqV_8XV2SF$(w_Ajxv!bMCya>KpTlPm`{xbCCsWIep9$~anDEFuB_q{?<
zsj)<F_;UmSe9Fh@+QCth;H&IgI4rlPWBqbK@S!<_HH}#qYM{5~36P}gM(hrnNZ}{S
z5fzjo5iOS-7QaJ?k^v*UI$(iE(itn=MeSfwuadAq$VK!nbiy0};BtXA8FFT7U@}>a
z96PhVm+56g2E!kmdk5ywbUmo$3EJc9XnUd()%B?GQ^D~<bI_l8gMz8c;gTMoUS<3$
za+l>@8-OUJNY;bo{uyd2E{OCM@eq2N?hw2o0ZPNp?A=$W)5zErr3mK>5LW%16EwI_
zBM(J5&6JZEtbkvk^);UKnm@=j;9T3)1fn80SKtvLrvxD?I{imfj6!zx%`%)tUFM&~
z3xBV1pYG%}ej%br1f>KczG`$+0IWO0PQai4RKn+w%ug-JNESJ*2N2UQcCv~*Z0ti2
zeVnr{M6{<?dfq1@HIHd(ongistv3I%_-9`19vjZ++Y<xgYi?S)xdwXIKo^0`P5OEJ
zJw37Wr3~!{Jai;-!jYPR?T|ew1T~l*{eDDk&Zeg1a+FGN!fltu(gsBHAza@H&^9_7
z>PTMcNJQkRa>aa9r-k14L!Mv*J+|VCLr;y%CsUJ%aN%&^RRMaEJGNS&12PIYT$jP8
zsv2>yiv8joEp<uHA==77;OIGDhu<^anJ;$qVi8HFBV|K-fF8-Yl(2CIG?N5mV{#eq
z0`@!(9ZS~^gACC7x=hx&@6Pf3m1(*n*JN@MsV9M?F6|AxgmF+;THd(#yIKUat-C<<
z3y;ziNe~W7S=Vb5m<LZBy0v1YwymN20OBLTpH@H5sfTL*{2%+!YIr^`E~-5~%BEA0
zhed(+S@;FL-$qct@OyVaLhB48nQBbcY#lp6;^Z2|d56#pQeeIUM0cCyU7!DRaOW;d
zhy+FvTMz62ey>e9oLsd;s#9^@7&XTau>H~2CX|T#BIHD#m{FD|?np{(_{+3N1ddwA
zy5S=dTOnfceC5mm_z*!pgUmn2Ed#tTLHK+7&;DZf7w^?Ugq}X*W$-?+KoGGkgA}T@
z<#1EW3_ONbKrUU&b~F7+x>}JD=2SXS$gRiztFq^(gGmuVRVBni{49w`)qwKKvD3c3
ze2V-nWgnte6yV{d=)fQH+PYsXV2t?dKW8f+xED+3ZAx;W=%_*=JCcTJgFnZ2mTCeV
zcW6{7!q(wtFAq$7193qWe8c&{YjZPqpmONqI<Pi<VQ?|mgrq?8=F5I?esKlsxpHzA
z6JC;Y0xTm8EPp|))cxohv~*WMGFkrk_p-hBOa$vVGVFVU5aZ&$CUQMgbB)Vv2Z*S-
zYe=5$^Pj&B=ICTRwc1;G-M;pVcjbjY8wtB0)RwbW?1fYH3Tdwd9X1)HfG<}n`xT@>
z@EJK%544?e#Bfbp;7SpSNjuo@H{w(2GXlQ;E)ckCK~wSu-dyQTo`x8#Aj81lCRYP{
zeGK9@j}tg+K08&{*<9;Ob-MXK)1CjVl(=}b%X7u~UmyGsKEIgdu?&(w$-zz#k5{`#
zwLH>KU+qab!BU!vkU?M7FR+^dsoU#n%Tc&gB@#;m5!EO>K%Ta=1PE&90Sud9Hd2e{
zWXlLNQSZ1cyV1cocW7!MB0rq=;1#%0YtcVuKWfCoSrc>OsPIjVg@ePIf-L0QjlTyE
z!LLMRCOec*#&_M)hNB%er2M6)Sz57Fv3=mS(|H*@(NK;^mS((jE2DrYke5VIu+=nO
z+$t18vxOS_Iuu-QUO_27`bSOk+hG#k6kMIKjr$9kE<hzhB|HF!net~uDX%R`5zNOR
zGp^;xKfo9%gRZD)u7J_S_p=aAna&RbCr7^^wi!Z^3lcHr5?ddk&Mdi}w^8h6CVPl5
zpdNs~9qLwNuf%(8Pm6MqD@*)(G$H3+0~_rr#LSA=MV6l-!_)g-!P3-b--32t-WY?l
z&JJ;eB+I{Ixp2@8x8(m006>QvgXMBM_}tzK7`D32b;VVJjc2=B0=Eky#kIxriE2t5
zkFMPW1tz;0A5xx;enr%@7rh^QUJlEAI*ULnys5nG6+4TNuK?A{*GdG`!Y<q<FfIJ4
zByjotAsH8Nk;o|5D34vw%U6Ie{4w|Z<<SX8gX9xdOv4mNV)Yh6PjZ-kt5&RP?4=q!
zI(7^4><ZA(f1*>^voyQ&PEerlSjeq4H~@X`K{Sg@l7$TXTD6HV`8oZIF^UOjUhZQ{
ziSKrSra?lTQH&G^8c~RVPCe@hl$yN=D|-Tzj&Zzw#iUgCcj%J<cVH_bJId;$U<-fX
zCjNA^YM9}=jEID$MPj3qLQnR9^xM+;FW=>38QS7lIK2L#Otq#>!<o3Ti4y$OSQ_TB
zb0o#2WNIVC#F_VgHBOBo=O9GdCo}J=L3iHeElu_SR)*L}NJnZH?L#1qmm6I=ieG2o
zZalhZv2%o6AfCpK!4B))x4R6<ZIaLLH1+dbIQW<_PV=@$F?)Y&+my^So9Jh-WboU!
zKM&jFrrTI}as*E=EDirM4b=oApQ8p<Dq`d#@jpgBvhbOA#J7Q=u1I!Q2!3O;r{yP5
z^@k`RMpTdgwJOeKhW+Ufp>M!(^i=-89?_GBN7t@}6mCyRU%Dd5D`FUQFns%ECHz+j
z5;lcnlgE#|$aau?iRZ#~5u}x7hd6f9<mXBOTtZNchb!fzLEJkblSWQ=F)kA?EjZD1
zku2nSq**rN*o8n$rdFxLYRGdc{Qn6o`mexI`8N_jM4%9@e?9iT8^Yq)e(|rXwJCD5
zjj@8&M;1LlkTm`OEqS<mWYyM;loJ+mWj`LkJ$TFy@6r9&=X&56e00@+sz>log2%P9
zG4u(!+j<mm7kd6@6!kZFUhmolw2(XU{V8%igCSRH;SnD&!FvM#b-oTUMs7B<+9j8x
z8+-uod6Dq=)sc^m1cVHONPRsHc#kSC5=-eKEshqU7y?^8q$_R!Kly>@@SY{Zf4#7)
zNZ_MO#5EoBks@RmdC&jTl8a+7I<+|U_fPFcZ~j=fZHmrK{0ShZ__})g8{JzwnYa-2
zr#_?k<{ytT3g7SFCy?u3*|@p}i|;Nn)V$e-^Ak0)BvgrDBUSl~*|GYA2^WG)APW2s
z-Z0J?!G7ev3?YU2E(F+7__kkYW4S;prIXf*eIS3TdCPx)G$Cj^Kh#mb9A+nwnP?%J
z(Pqd=b|JVO{$<s<8<G2F(q5+@unir@h$0bc_-;EkDrC5oB3T1c+5H+@(nRj@sc^-7
zvZK(qJMJXr4{6mh1d*#%%=)BZqJH;>Cy2*;+Ri5I;;jq)!PuZ2(vXQ~kAv>|z3?Os
z%5k#(Fu%XCtn41RV5FLg1mqeRraB4<4~NIOX!bHKZh;+=OEv~8vNxo5bR<6+)qt>Q
z0wy1imwBa^=1af6958e@w>02ESL&i#oP+M|)Z9VF)GJ5!hD->%y*%Jb!x3tkK}4Y}
zfHyj!oo-1=NmqrqqOJ}OCBoU$FExHkAQMbq*em9V#ISfGMDscU5>^Mqm<6oCV9s6g
zhDvi&S1^e09bCX{qNeBc%X0-mJ*z8}%hNI9L_c5uUUA!nfKNI1^nqy);@AN`&*@~S
zXhdYZ^7`+eU4X=*rlZ<E9GZa-oeN|Ih3<BX7j6b1<()|-SUsj~?18S#hRbr#qo6vG
z<`3BHeFk4ZYbaZyMwCwKx3Ba3-Bb7d6325vfvn>7;k;Nm-QM*X9~Wo`I}7DNMI^f?
zcjPClsSQ6ono?Z{pvLc+Wa6Cce1!pk*6jkIH3ojnu80=$cmLnRgx}rw4x_k0rBDMI
zYu9e(U8*_aMAbCgWgEQ~$WwQx{{EgiGHE1AtT98ZXlfu>j3TBa6Aw+7T<<g&H~7n|
z?E_%x97ah`C&19>JZGH)hNrE6wl4e~nl7?2AnEn%h(~gikLo;Jc|4PdWikRfxI{1;
z7n;T9;lnv%1G^%-VbOe+!o*6p9?bo7yzC5Mw%*|0b`7yCx3L?1@H0h|q|6^QdMjWC
zTn-}$g}RvCF|RKCGuFQh^v}2r=Wq?<Yd-pGf`v>MjpvW{Yd>;SdK6y)YZ_?Q#K1ab
zra^baLyJaYwzO<MWQD@_#pLaF+8Vf=T9>~|M(sses%ArB2fmga2CQm=K0lyG=gpxI
zQ0FeqCYA;e=h==2YJTsG9E?j3P_T#(Ud*_M|LTV)WuG9wYvP>QFdHN63~YgA1LtfD
z^5<cF1@-%2l{gBRiak)~0tr06=fwYJWJ2QNp`yzCL(?hPUX>+WjI0Y(rK6Ic&26A=
zjFXf_yj?!`Qk~4QMEu(*n+HClHD<9+A>DwCSOebfe*3Fs7_BmICaAkPX1#_DI|K{2
znH)YHwf#DgsKRNMb_R(mU-*NqMo{$3r@POFpI^X!wFgah(<0p}g6y_O(pEyNtHjkN
z`R*-`yO-6R8Lrla9U^Ej2!^?Reoz&^|Kisq%#<ZnR{ak>9(RIGtHyZdOZ)Y^B!_<h
zu#0Ql+SGdyhYs>}0;Y=-QY@?G+T-)l26SKyf~2A<Sw+{C7h5hl1y>HSzfIuc7x=jj
z`7JHR60x)_JyyfB^gUhCy2n(pm|fD*h-|fslu92X_KJz8Xl}5_J(4W#PHdRzQS03c
zv;H^N6A6w8e<zq4li}}U%grI+k&x^RrbRt?I?aVhm4{}RuDNR8Y#~260r+tx4EPZB
z5KHQfd)G_*uD4lDLRzoE+|^!8W2bQ@@i%yiB5+D&;Y3(CjQp_;D9!gEn<O}MsI!tG
zM%`jtBIpc%KCf573IPx9>z4<atM&#R3_ZSScv;qNf!}CBDY+8xPGOIarDgSRIql=>
zbCc_9HN64TRO7(E*Ol`8J`h{uz~$+3r-DTkW=K1O)p@TuQy;hkWj(O{UY;m16_io`
zHmm-#(Sq4f1BJ3ZQcOZ8Dmxf&^dmXnv;62)PVnFU0de}YVY$^=DB9k8gEnUr1SYIL
zcgDPL1B^8R=e)D4<ugNSzh}C>RA}DRbRI~Zlj?D7*Y16LIw<>XGVLS_2>q8a4#7ud
z0_ZZ>7wm@XjjMLAk!^KqeS`BUVN{`4p7q(RJaN$docr%=mZ7YmPY|h~$3^<SFXOtb
zzKCjdM+zqZ5P56Ro97p-T^2<}KVI9^6Z&XbD1-##0UZd&4OEvO9+~^;gHODXxFTv=
zqZjo$lLLnuJ*WhLAaiP|8WwxyI@7DV9|A>4F^o8iPax$UB2O3C($zoXZ&>e~eHZ>&
zpinFlTW;hH&@|z`fK8K!z=uP?Py3Zkn$}wWvF2bldyceBp_OcF`}(wmo`h3o_1h7V
zS!_9T{E8dd6%o_nWzHv?kLSPQF%Kh`jcr>@!FOjFeBYyKF3h|A?hr`@s}%a?qsELe
zLTF-E*KvVd;QJ@!@`=>y@4+*79>`Ui;ZknZo+7b@;9-aLB6}%vKqsGfto%4(N_m%&
zBqI5Tk$N{7R!kxT!SOI#Vd)q|D2}q?x>0hLx0SthXp#Qjo(fsSe1<GSVwr$Z4=g0i
z8@FPZuH2%0(+BN;uDN=+8R`SKNalZhPbUgxV(UOy0qdR<(8Uno6|qn#@rHb~`oD28
zmPq68gk7F4lB?vtAd6uz`p#rQM>t&|8U#i0o7Qi1^2F8~?0hA6k|+b)dHH`H`zm3(
zXaY)+nBt1hsX8OaYyL^Bc7Ui|1FY4z4*;*X7<!%a1I10}9O4g9gNkCC2Q@LdRf5Yc
zaglK|GAoSLB0#qlQ0wx;32X>Ic#l>^`}kW)CldV!Q<cWwKwmbuF~W9~nRz3bnS!$8
zV<ejs&@0Fd#YOY|f{OAY4T%Ap4L;d5wT{8L-AY(5$>)1YVW_?qyM4u<@3$Tk^&;z>
zAgeNRBAi)b1db&9%8f#|lYB&4Bg7tmi(bAuCWcv<g#yM6S~|)d^V41*znWH&x)X(9
z)|f`J<KwBoA)E6VpGsth%%)k_$i*Nn(-jB@jsnkik>Jo;0I8LBkLqLOKv6WLvb?ns
zwF`oF1|bXFqp^6<ot!_X8qk{Faf%J^Tb2!Nl8r}LPG_DO*EI-0*XrEWdHi@MnS{Qc
z^Wx<&k>#d^z*y{3^-;}ACI>;O2b3t;LL$68vxz+VD+R`&oo4Og_v~?c0*V81;Kwc_
z2m6diO7+_(UIY8M?N~d^+5-4W+~4o_Z!$E+#V3-b^(8d+9p$GLkYgK+Bpr>%IXDxN
zN6y^StwpX&8T#hWg7oX?bh@0-b9f%K+bG3w427os3EElJ@q=wR!@FuiGHyP4hoJ0Q
z?zS>fq+$^_sTE>bankF&{J}ylc&Os}U%o>?C#cD+P`7;#d}CHd8<aT{zsdz=v(5Eo
zrPO=@`?el@L|rosE5MC(7+RgbbZl>8d1FocM#Eh?kC;D|U!qM!$==ONhqMS3@(+J>
z>$BqCI7J<Wn72>wz@{D-ySM*-RE{8Xl#hl}F}`sk$HgkX&M}Z3zy6@+YJ3V=nkN6h
z@_-T~d;S@hVG7}E;5`+SNa67S>{4mdrsm{|;9CJT#y4-Elj8zeliQ5hJ%jUiH`x+`
zAt@BIzM9BKHiO#Wb73_>Qmp#<2lr(JL8`0dOa2?$s;))2OPQG0V3p6>CUX^RY0iTa
zu7DY>^|86G8jSVaZBZq;Xq}-LoZ#I%q6H13nh~`lfIeHJj0m?b{N3*tP@^q(jQM(y
zGXBx9|FDHuk@t8UxBPfl0uK*Amsx!Jg<hYURQEn4Lc`HqTLMNiGhA0GtY*RGpd12*
zV5k{AIPH!=<o!<R6ke`eVCix;5_)6LMao)9%GK`1xLy4L)(gz0a#xQKR-`Q{+kLSF
zK>hFSWOp_7W>(Ld-CKoZQhWfc3Fgpr>v!tF8pVcM&n@~Vk>P49BjFbW*SI}4m|9U;
zQrRbGrDz>%Wr=L18Ys5aSold6lMU_y2jg@AwUPf&w!89V+wJ4QLTY~i@xPDO9({Jv
zf2=69?hX=DVEP2RLkPx_kYGq508d}?pY!3Q_%!i8@XYF4r%*Pro!l_L=3L%+msSNf
zCE<*j?=ukya+m=xXfY(nl2H!2GiI?ToV4!cot~+Vq`Z2zQ*b=qehmCD9azn8D;0yM
z$Y5D10B(plV@x+_Gew5-2|=o|h5r1q_&-a!_r{kUjH`pFPG2eiu5tuLh>-l@N&P?h
z(2X4#_N8wj-5%<dKPkZ~Q-P%QTdf(iE`mFC<vIJ;_j_qmdYdIBq|+sSIlrCs?Z-<g
zE#%PCmQ=DueAJbGREZQv8TPl|O_B;Wf*of5Q~(|6uaW3q+?mJCZnu$#g>ZZt1I7;P
zlw|&w7ni$Q&+^-~#PNBA2qShy64Kjer#2Joc^}m4G0lDJy6Fw9=9{8KwytFMuEjes
z8(N$(cW(x@4$yA(h7H~nRqkw;mz2=O<pe^P1E(HS^!j;^U5-ER*WS6~oo+GIt{MJ?
z)gSm|>dMmhM>3oNEqjaiHtekhU8bml)2<uwPUEd~Kj#;{%@x*h6%Qc7f9$=OXEK-7
z{%Aqk1(M?gg~QMHeqW8{`7J!rHhD+vQE%tV-isfd!NtMODeqseTmAuJ;?S}sDBY0i
z4MY5AeT_;fLe!8oat~^=Oe;&i!gWRSjlI)W&S@x}+DD;$(R;y3grsGk;<E`{7-g#8
zi*2&lPpeN7_rDy77h=Q@;{%5-QjL3ob$TRmbop8iBcC^4nU&(gKhtKl60Z#(?`I$M
zHaoqkTV|<J@W#+!o(BOCWS28@a)fA>c{cU=Ro8~X5yDLFJ;axA0p}|*MUf&8GhoJ$
zGH@&BkB9hU^cB3g8+H%Hs?O=I8qme=un?lBoO$({T^h8;@bcm=#daearj*+uEe)(%
zDCT0;kqeI>!g+5{gVFat3XVVFy!Xt`c!m7#5M?S$>~?CdvbuQc`T!{TC?#|+u=_lf
zf2FY?`X{4X7}cMClf&=}c|4}`%TF$!N!%7u+L)$rNvb@|-*8b@wX5e|moS4M-+mU=
zPCF;TmYKG=nX^_1C9k|Med+G2m&V4`{ijH0Y!Um(xkTKIs1(16B<V-emp5B}^JPMB
zPleaUywP7z`nLw~4COdJmaf*kc>D7^kA*z9%BiN1fR{!D;q~ph^CA;IUH4m>VV{?`
z$gevuI7FSD8f2L>dzV@t!AfnxNQgmsaDA|EDXpPnm%VC2*`gzq|C`X_>j1$1)f7VH
z@Vh1+InP^%4>mH9O75I0lh*Hoc(rYvP|f+}A^)RYB@bcR)U9{e8bKB=2VWTWH_#(K
z>1C~L0Y7mj`%SjP>VQ8>98GD|U7D?%bgVwp@}svFLL}Q|c9&(!Bl3+iM<v?7_%|_u
zAwwNq?KRwGXF-UV&nu>&O4#rPV7~$hk44k^FTa{TwrMT@K)l9IW1f(~U~#rOR+het
z&8DkHr2o#K)jv+3-Y*g^rAj*v7cN7d5mUl^Z?Bz4jwxeYxN>=FhI|9c$WAGmg$qA}
zOR&fYiiM2Z?AEHk&XF#lw>&{R){oZQu(Z~8a^X|evofP~3ik7Xbi8K5b|H_lCT@x<
zoO*<~znA$(q2$#Gk*c1D+cSChMWH)6)Aq=w>Gb<W`Nx$y$+#?l11e_PqdqUYr4=~_
zb#Uf>HMKVSK<yO3r}UHp8+cS(iKr?nIhYB5i!%kwy#Y$p-e10?7jNR&5k%&L4V!9k
zCk$}mBe}>D(He)|y0E?iGz1B0HU9Te99L>!Pph$C8{DZe4IX03{Z#9ss;Kkpa<pQW
z?e04sai8{~KEPgl_qc9*2=pSeE^25L_qVQF#%S}4KD%wh-xu;2X1UgLc-4TZxN4cU
z!9asm&g+1@&$f&10_Q^K7|a<LmI8jADT8c7oWv=4e5CeIdbc0W!2Ig@04-Hp7k+<6
zAsczR%b#RbZgNP}C4M0cSCaARcr`p%vZcwhFc@N}W3t$eQ*3t4edK>Q3P(Q2=c@6n
z!Y+}(t~y)|*+~U3E1%2gpPV~6G7W{o)Sa<XtZr9w@Acv}usS<Q(wIEcLmoA}<9kyy
zMKhzd+x9OXZ%H?Av{4UZ_TD`=^KasVSVPgR;7t=s(NkS=x0Sk}=BM^3Ekz}om(VDL
z;XFP!-*Sej7FEzMa8ItmoXg~GTYbX3vazCtahCcpVI*Z&?)rAT!;S6dJDU?@P{=Wc
z_zd~sJ-A;}k-a=IaQ3&(R>qw@o!eHpP?JiQJ+2VJ`DC5@f1Yh^&3o@whYor_NbNgH
z`WYD4|Kk6bG$i;h-xLg)c=6BVSf_AXM0kx1LkAZhL3rw8jj`-4VPc`^lghVmNDn*S
zy?UGV-kWf$F@&k~d*PLo&zB{Sd8K;i68AZ|Mra&apGWyTihloQm%oM`eK;~y({+8*
z(XK#ZET(OrjQcC1bK;07VO(m}>M%Us?JZcdfd>4@p7=8|yDG!}cQjU<>~o2?Xdu@-
z_py!`aCP#s7Dla1=eqfJlx16kPi`NcL%Zquy^a)Unfh16NsXWK$j8^1=^4A>*HzbP
zq;aCIW!;BX!q|yU(~^GQEVM1VE{C;%A4TaSGv9Q7aBwkLATjG$-q1F|^`+VL;;k&&
zzr!%$=+P6tc9=#oU1@6xE-;ua-$gl6*8-gqbLOFopKo$~;uWv24Cm5U&Wq~ZL~QSU
z8LkMYY&_v2^S5(C2iC*t=C{zE-zO`5nm{()Ni9?&Ki=`kvT;a>l8Xr!8xP#I^@F^i
z{&1#=RhiP9Ft4Y%RXgb6>Z1}efMk4g=n$tzjU3#T{3M4-@&5iFuLYeb+W!0XxJ`$=
zjrC7zCA~36Wo>er%Dc1#M=zEQw%>}AGB8|0#oUrQ_PJS!W1;?quAy|C4gVjM8a9tu
z&5MEEfMe-G<Tq$iWg>hvtJv~tVyUe6KF(}x;8BMyC9n|qtLet0HD(;H3k)Uu$5*j=
znYaG~pr2694NGi8_q*dRJr8wwFUQ{Oii?%}M8<qD?enc&rqe`Ku`ThNN8d?TI*J9b
zX|UlQls@PrJ%Rm3=5+D<OyyVKvQFBN;Nap_S|&WwBymoq)=%@u>-}wSK9;O@^Hj}0
zyc%`(1|Ph7O>Tg-E+#B0T_xwPO!%7~1qru|`>V|?d5lif7=aIr3m-?zZSpM^M3h{e
zetWOW?j)yC;>t{KiU7baVofm@G7y3?T*>VZh{}Cu>aAO}40zwG2A1U9wu3p&A2Yc+
zv1gYd{mhbVXVC~38o3Zy{OP&lO407Mq8rzOiavg{KY9YlG{&kfnHOpBDv&^QuRvN$
z?};l7gHI{F4Flhs+E?M~6kP`3+@nh?(1jMVik1zHBF3?<pMJqiO%X<?;!2$GSa~u6
zI){=p9k!NFy@&4g2{udMPN2-SVl$eO;*wMc`fU%}Dn^w2CO@hVD;DPl?#odfkMSX%
z3swx3F?6BXqamdy4&Jo^LGj0to-hwwP~b~_loaG(@^=2p;(owDoCtqlM6sA`%=!dH
zk5zpc;6LP7@<;nI*0ACa`hH~jt_yb??d6g?;ua`dvWIq;c?WBxc{VDPM{{4>k~RRm
z`SKV>yfp=x;~rNX;X}0%GdLN}^Ofh)+cvF;{cAsS@%WBg93E$1=bTEJ;+{Rv$HbqA
z&U$cA#FeFmwhHGQX>pr>!A2P)Rla&l_;vk$y>{lbx_ogXGJzr7xFdn7`20?->3O-W
z3@GLF29H=7dag)CEU(~*P+xhDxAH7~U@{WFS;rBpzk(wNX1j<=OL2P`Uq`m@L9qTs
zZ%!Ji*b+y|Jc@}hw8bd5Dtg3`#6V(Iyy{Z-58TxgREqcY2~o#FkL8it)%qN7k6Y9v
zrCm#dvPFW%e9+`7qsFOBCyX5amCE+R#yx=cZ|e|KwPVy`y^2yQs7}RTw2k>_7PhZD
zUifTfIe3D`xXI+y^IER$6n6n}+%-Ei+d50nmScp1z~wSY+K}*WL^Azaq<TEwhuLfn
zO^dbpy4V@#$rXU+M-Z)En*v+kD%`!nfEj=Mc}w-?;;8NGrv?7@`V6-}nj1f?Z(95n
zC4r`Pn1c4YnTLot4*=0WOx_m<)@)!@Ym6_LoDb;s_>)VEFMPZ<M)FolhU&DR=G~+2
z7_@>edcbi2tU<Nl3$vMmg_bf+fH}^LDMiv-hYvh(?9zBwrZddBicc8}GdWfFyHYa5
zg(ywUuz|40+YzcrHbW=#RRn9mX&#pJK)MXg3^pU*FRSq`-l0<_d*O}YTsLaN7UPqX
zx~j|O8yvfl?){3=_YK}G$xiC=sNhhH_PvMT33cm6&}7l4j8fLQ%oTh`@6zYe8*05V
zj+BvG2m^saxZJ4jl{A;Q)vK}+(lsXQ31k)=qn~A!=Dm$zFiCJZ9s3iQs#B}JN!d{@
zN?8f^>_1O^z$^f``wQ=@vyam0-8lDoFaC4KRV!NFo3r}KJ(rdZ*s|WsY1lnq8#?>A
zvJXqSAoW$;*Wl^Fpda4wgp1l<>&yFI?NTy*IzMGUY2DjvGH%|#Q_u2*=>%{gZJ{-a
zoTG@@%D>oYEYZDnX%)RE7ONUIWxS`$I*Fe=VWuZb0IWpy3@TPUwAgy@WPVYFE`okf
z?k~6=Knb4%eUMixOf4MXQHc$7;W73jzCd%K$ru%`^~khCh^F?i&CmBeTJeG9Yu|Q+
z2b_Y<GFQP<uWmB3l9%$)^gB|Lw8FTkS^Vm3ip7H!j{QFdltQ<KQhYHTs3#&zmVFq*
zh*36Q+cKBDaix1TFv0}=c+?<5=}RVEtaDe#*$?5jx4-8n7&CX8KI!kJ?vPg{nw_2A
z2%pybSqJs4fcP0}vJJz*0Gpgiz~eU>Qy&|C>Y4HHsDlT2K{X59W8^!Udpv|DDsS5X
z)y#&>PtbyDR&Bn`ep`UN=VrXb!<W@z{U)WwZ1@lBW$l@Q_w>YY`e@;evb?X3cJrk(
z{;(MN|67cS(EPh}eETR16yyuu7^G2(kRh=hZG9QWn@pDN+`spO${bq@m93NST|DUT
zuB>TZb)pPCmLXrK65krUNFMm&rcB$!(6_F7-$LHwGl6dw%{?G{?e^yFDaMQM)|ss?
z8L9AYRCQTtFuV`nJ4ul76G-~pVclv1rfXU%dy1*N3S?^5^GZ`=QsHoFCechD`c*N~
zA0t_+_yi-&eQ^-jhNE#P(ojEZRR(hpa78X>=1=Qnq3bv|>*9I5@HThx$i^9^j(bZY
zlv}hI8aDzsD2P^^!-)obEe^wlZvf1~MZ)>BY8?M+q-FbjXl_MtH>Gus@v#7Nb*|KR
zhZl{G=QB<CewThMK(xCybuW3vWy!R!{p?SGvEHNY@E6?)t^^-K#eRw7aOb2d@9{F+
z2AoGCo~cPU#8UhH@m6@30N=vtowO;74%sEkW(LK~`shMGWE?0qrR8~c2ZbEa_A5B9
zzqQI?yh@Q!)pK`965kH6NDjf`U!eRr9H$Ia@<$Y`ODN0R$5jZ8#RbF7@KcU4N`rqX
zNm%9?4r~gd5y@2gdge^k{ZUFp2j|RT*{CdOQ)c>)c=0HS$}ae7#MU^fVZHPg5XvF3
z7{<~vyXjitV;M|!sHJr-rY@z1%kNv#Q5BeE@$eV6EkKuRsWlAd$(K`xkdZ!QdhcA$
zcin3~B&6pTO!_-fs3+AWw9<Y*qN;MzubW7Rasnsr94#EOu3XF<y2Kwj?UzD_*BLz>
zf@=m~UNvy3@$l>p<V~(tRSfHl8_%RUu^-Cbl-Ylsz7~RQUEIelFSS%iZP4*)oUw_V
z9?IEx?8dVdH{C%vei+ZYi3>`joSzwM+hg2IYe1v-vbHt``CP=61w7>Lefx|ip>QBz
zh@%(V*~1-LnlI(@T^BDWps|=cn1Ht|M)h5?k9Rb9Z;eQV9$Peur&?Pgjr}f`$LA!T
zrY_s-uF5pR706e8)>=?m;CJzvvpXqHf`!M~?;`DT;HChFC;9o4&qshS3CW@UR7&VE
zif$mzp&it4$8opCL&f<Y`Lz;Tg_n?$<$YWALT)>UEWHZPoQzAzlj0r)E6$1)BEIMn
zr4IjK@<js*pI6Z($z;!Z8pN4oAFfFH`dYTm_*DS^JIcMsJG6;d?#{Qj>%n~ObX}re
zNkJJ{v0;6`GA|N(7LrZ{nEjBl{rCXK@Qy5;l_ck)^Mt0_@%fp+u+OxCZQ}E6V#jIv
zAO&kF(d-$69vc5DSqOeftD004NS?kT6hSX}dL{9R=uUL0_%2(&Bfq)sue2IIJntF)
zt{IVJ)>Gq7^d2cCcD`JECPtYm=DIgFg4<q_!VI`bd%B%<vtVgMi(9>)Fa47~mlc&5
zrPua=T~)b;jvE^D8`l~lJzh`~vn~)G^q#D%^xpJ%=yd9Fe<Upji;D_x(d1^kY!$OY
zF>_7HO!S+*@`ZhF7U%ozVUgaOmS)_u)EjpS!?udJz};W}-!zSv1gHvmen-4_L4j7D
z9eKV&J^C(MWpW%2)%q^`HqCF%nJ3o>;$LwCzLsdOk;H=ESi=s6wx70$oANVr-Pij$
zo9Hv^Z~4OLLS(4kzWd@!Bj0lJl$cMKgSzC2)QcGbx`F0VCdz&U7H5u2J_lBBy*Zi;
z(U#L$eeITlmsRK6WS!|B-TSlOZWt-`G5(x4_OtU)Yksf-zpGJAnzK5#2>ZcY*AJWh
z`ruRP2?<Y}!C28pHu^M}s3C|q5^vhba+oL0iRIwN`?~X<<LfGynx0$^lj05M!>>06
z&)jekHFi1~s3_AaK(!{QN)t}s^Q~?2skSO>1u=A9#k(I_wyeu5PeBizqg>4q_e9=-
zOT0UMif|fr-+|gHW=VyOAmXd({-IDmpZ_J32Pb`zBdl=TiH_=-+L(%r#qE#=q0BvT
zgzG)L4+03a*9<57@~``T;KjD5crF4^YI>-^%l&M#$i0Bmuh2*!zyBzfLVA3Q?x=I{
z4z=EWPZxC>E984NF`g-(veqY@nkJ`$^WFFSlBU(3wteUPwh2oBMEliZv6wvaH<Wpl
zW?t(GN~-K<Z)%hAb`l-Gd5nU6mTZDmLvT|6OTc4pY%#Xb_R{RCbSPyP=hPM#KUEip
z(hVFx!szH8L4mJ<xTq|>E=QC7Xhq1zV>m?qC{C?wKPi2Y6CfMV6_9s1_D++##z}c-
zKi<-nV<ml5Ebn`E8ED)XjuU|o?>*DOc67Ypofc|yG20J@iCtr+o!%nWJFRv3xV3i*
zLo6dik%3poJL_Xv&D_ve(3<Hg0bO|8__=JQyWe39F<%fV+9EyWP)giQ<79jxA$I^S
zyFaYvjtH?W<}}8MiqOkhK!0g6Oy2BAx(7ijz~bjG%C@8uf5K_sI`M}o10fpGkU%~#
z=`X%9o*3T|BC(fc-;-PF+ufSi-178p>ikK}r=z(?1Vw&vHZkoZItL6cMXgsZ-<!_#
zCr+n~JlEf6wR**pJN4t?s=LbgQdd~Lz8w_E59)E&-02@D&EDX5L-t2lGeIiDVW3;0
zJZUNUi!Qol|G~rP%lUs@TY2}o1*+{U06zKN)YtTc`}I#3(PY)YAocM{(spyBXgQ!C
zaBY<^EktJTvk-5&kS}nHEJRk<6FZ1ncgWO<Gkj|M5GT1+a(*E)W1lO~V=XcI5y{o5
zn>O6(BLV(#_0v}QYH<=%k?7w~+{}Mk2rP6J)XP|LdvxL-gzfJZMEB$=NQ-}{70hs}
zC;Y@qR@7TLiXOYjV~#3A_>+^TB&S|s<{pZLOBAMbBLoXqd?v@!g|89|6n5tA``h=W
z>aQmFsWERkUtGx3Ybo^A`O?cO-TEQUWXrR!%`=%%>yOf-8=~UrZp=9PUB?{S9{hX5
z;>w=#Tb@3JQ}P16@*2HE=CyGuMN#fr^b0|bx>r9F&#fxw?AP#ZnVgq`b5uRBSeSL8
z*N{4sTBJ*EI~|8p#|KvI8#ksKRwZnByS=P?u#2Od^blOAb^D3a&3EBm6Oxo)kag-0
zJKZg3`P-L<#l!t0K1^@>zU1^*owlr+dTEnk71>Wg_X>yPD|+SM_S*bSchilnIqQnQ
z_1eEQojOvVvT3t;y0y(7OkQ>xP;6V-IG>s4CBM_eu#~KHA1*%L%eKs})8oKzdKp=K
z{^%dHXmtlT)@qG=BHkNS`R@@iF4ED9h79e0HKmD&X-fE!rdfJha9ggA^uG8h+#!eH
zXq9v__9Tid$bJ&f9l9@-Su^Co(e=1-uA4N3$c8?Il(g(6<-C=K)u{&7n0>DGQ{D1+
z@+?&8eoW(O6sICBTdH~Ycy0Z<vZUr>8)jf8_pc1fGqn0$Ym-q{X>P0}C<>0qWVh!M
z30hW&9H5O%CFxHBuy?|lV3p)7b>MGaVhV9LcF6rG6?>`gsd=oXLcV#dIvH2&K>S{7
z?!CW%K0eEjm+Qf!Z<(k3DEPbj&JyvcFWXRTwo+aEr^{)4w#`Rnz@45fEX1C*2w$?j
zHYCGq%%5nr-dL-QUAokCs)#X`hd<|p#g+vT5xV4(s*=(34|Lw#WHJlaiW1gv(}5iK
z>z`|&{8za!BqoxOzg+#9(PF@}#Swzxi@=!;kQ@Etj^1yVvvkJjrPmJkuMz(ol7BRt
zWhA}zaKFuTeZ^+M%sJGo=<%+=8GhS}u9>BBqh9%<@jjata0z(eA4yLAV@AV6BCY$Z
z-DYkecYxFaPw7%O3H2GY9q;Ldi0s3i7c?W{E1SVpe4Ka9)9L{2H+pqJA?z_3v&Mgh
zMCswde-{gmm}jcFs$_==zuwdw>eX;Cl?!&tbMXt)nsJU&v@c%d5q0hGJ{GDzJF7bx
z@43$P>X&o<jDO8Hbn&NVLH@GSQ$9Pnug$wzD5a;Z?^E|XOWvL<ccH#GOF~&QR3;hq
z+4;hBqo-cHzpLs^G|3Hy{HL4KoHbM!5YRQE*5B<$4F}0oeh4MX?YNCh`59K4q^uFV
z!a=E!-~58UD&e!k<fcopGyU5B*R@df0IJ!NMA`_6cbh7ayfqpFt~Ft0lHo2&Wa1@f
z9x9;EMy#k;J@&Yz@{()&{)(V;Mqa6HhSh!QSmA%nRi_FUA@z>i|KOd4SLupyv8alR
z)#;$%GmfLCFJF0fe&va7>iZ^Lk5eCb7JMIN;uja$7RAN(@cj`hb{zs%v}j+J2m4_j
z8Tq!q-4oIYV=A0H3R;wA-g~r`+~x73kt${;935Uo-g=^o1wjQZ-#&X({~WXz=k;LR
z2s2lF_X_NO_dZyCV4NZZe~7-=5NWSqNRx)gH2<6KY5Z?4I%=d8b&X?DN|$qc)t>5f
znY=OK+dCc^>n0SFn<oD;@F2onZQ!c)6?7)C$_Xy4W>wr7ZtP(YE8ZY0nViyw#-?zL
za^)T<?)Xt>x_H}gORV&CIXHH3n_(k7k$|Dj-|gk1f|Bb|Hs{FmBS^>e(6!inHJ)Le
z044)r#vyXjbh&=$b>8Bk5L($RVH^XAXZn6ktho3-o-I8!WM*wbChu>Dt`d=Od!1**
z)gP2jG1E<Cqk_{yfg%UFtF~E<5B121y9o9cPf4HI+VQiX)bJU31P79}0OMz#L1%Nm
zm<brrG4_XBwyX+Cd}O!0yv`Ac?l3p)He?%0Xu2JKs-1nN_H8Y;GvN4J31!-OR(J94
znK**lNWn>YM)Wdf2=$}4L1Rn)#)aQAzX(_nvX!U(1ld@7dCEz`hH6H~BPG30N9-p4
z0aFBX8}WUCV4#7)fVr_kHYe)Sg5yL*#^i?Blc=fba$<{p6?qeR3oR|@X)jqYaxx}R
zsgjS!P*T33SF(6-SvN$nTEM8mI769_CdEdscfMtxzosP~6!=~b+lAHHuNjhVQbZfc
zJoN1JP2hF4RAdD|qB!@HiDzCs!g2EWFPHb)d=F1o%#?|0co(D`tXuJA&-f{teJ|bv
zorpdVUxzd93-&^b)6=jZJZM*O3f%t)#QH5*NUAh^{GCU)Kp&dvb_I|$#ex0(wybp*
zOWa>fvqp8?`keb6{!X9#iQ()`llQ3m1Y+M*J}L2EOFSNg{fN(0FA_1L=%~|d0?w_i
zwdW%~Gja<T|K)IN!u9;P+$h%&DTNM@w(C@`>(uJ2{j1s>t1=((Oc$kO%d3p)oa?GR
zto9pT51g0esul<oQxZ%XILaQXM2S1~GHe-B6^@#I3k>Uj4cNI=I{$u`mY!UxcrSW?
zj~E(OHe~lb6Cwz1dLpih0;dL8rKR{+L)_?bthD?xNYccy^0xcWZI#}`zJLx>-YzO)
z>u9$&*_;FES#?jQlLZ(T*HW4T<?~-GV-=G-YI%2Q+_ed9t^T1m8P>vyGFt3}tg{EA
zI!5DaP|8}ZmvygjO#LIpzLnaEc^P`gpg3lCd@yYd8wK*8K6BNx{pL)>^78HX>u)jO
zJPR3XvKG0e7IGwT3frILDPsL*`5)J#7|U-G_LUP2LgS<wO0F+mz1siuf>28wwH4-X
zL~u*TjK<v1CyDbO3g&D+)(L3r?&f>31Yg0Nu2GZwWx6|O?@bQG6O{y9F6OX)vHfdL
z8d1r{UbO#lW|Q-PL~EAe3p`^CYlPSItbGT|tOv;i!;OgPlGPW3EawGxQgVB?qa>?A
zTcjRrVZrE6*m;xXysebUMmiP4<b>VrP;+lP{^LWx{kU>38!O$M|7_2|)xML7k{jLz
zhx{WXfonfZE$L35a>-Y}Hh;ts>qiLWl}%SNk-+V#K{R*&LKR2Gd=!nSZ8RV1#2D`F
zv@-A8Cce*B?G5fE6f|Rs;vH_A&f?gUST1aG3;9zK52}kct$SSisPO0zl@UG~EJ<N9
z?xfqYXTA_^Gg;W6JeM<wz>+>v3Q%Jrcynwy`0mq<`5I)VU@{J(>*jIR%V5uhB4x7*
z(-yO}6#9`Hi2WH$nPH4jHE!A4c{6qq6UU!Xw{5*dAxQDc?ZW+7(QQnA?Mn(<{lK5j
z^|oFcp7lvy(Y!D4mVI)kept#2s3q-3JByO3+WkS(9Dua}VKLxKZ(z&w>jb)9+Fv(g
zytf+eG;My~-F6xwqEDb?46)9zS#2I<S!bC^Exz_iRk=@K=yOWW_4m!*%G#<;HXoQI
zxqDZ(Grzs4P~kj|UIima)+R%7>#o=>nJ8!D;v27?mO|f)Ug3E&;R#A}>7Xw@3_$1P
zLBFG_+yVie&r3OGAfU70fCl*aiTAy=ev|DA=djf0TIX>6Z0YUB`<Zfsq&8ucUCE8x
zQHgK9ID%al8I2dsnZW|<tVP<BJ=3P+KPRa2vavhd(~=cqmZ_3?&0kjR|HzGN-WU|v
zd2ByU7*O<2`FO=oB$Dm7_F{X;))s#A^SN2CBF3(&q=saxbg>IN<YCEDch=b`e`^K`
zSCr^qBmHZY72u>(@K#V#<eC@SF#Z0guELjBV3NK4sQz;|v6%fnImN8a5*(k}jQbsE
zD@WR^w6s)3PA5#?L6l!Gv<Un4B1-tW<oK4I^ns*Hgh=4L*Q1m8v$}tsc%^9{%K
zZYwN4+Y;_~F=`yzYaOZmt1a=^!Cr1~d|`69b~|fCrr&|~<1Qp4S-Uxcg;nb6Q%7sl
z36YPn&lMO0Wo(;|<nZGZP-%PGh*|~ONDI59NcwaBk0mdOKRTI-dBHvAhb`M4Ej$?e
zga6LsrY&pa{;Sx?RP>kgv<bgG_~QmJOC8=Y7kEGXDk+62ho^CDTRy>P=9e{czZd(~
zyjXQHC||y`He1-^btZ6&zM|g*uGysxJ8p1H?E5tHaYgzE2N%zmogWPzKK$tDl;Wc#
zm85k0=7*jG(JMYJI0GEiH?Wbn3E03ZV_|U3fX(K9sIZ3C=?lL<f%aDYGYsMeZ(LCZ
zCIKs@C7_K2E(_W@RhT#2)3v{ocywwVa5E(T^{Sqopj&E>aPk<-eU;lHda-TO%KqIu
zJJfAH+uhyyOonfpA?S*zXDcT>F5L>e`svn(We;asXvtZaxJdIP@lVT=yRhwE+Us9-
zP7Oa#%Bajc-oVIw$eT;Ca9U(q=8I{n&&oe#JeX;Epn2EjTMuMz9X>c`A8>y3kT-YW
z@=p(9lI|(&df<9^n&qR36%G$O{yNIYxCF|s1)aH`-9JAHyzdBksN;f$BWvJe;A;8)
z8N4h~z=rclw>-%OXZ|od8%Xxm%(EBp2Obhy*zFj7+L`TOD)1D(O{KQ~n$j;Dp7%)U
z(4MEMV#B>7tevBzjiFd@z2Nx*k?0E*=MPF{Oj#a0>pAd9Akzug98;|Y0|dM~l|xp`
zb{JKx7dUa#_=1a#;;HoC{EHSXm*R6=u_<S7zAi9Wqq6s0i2{v;t#AW6*i;L01I8>5
zhQpoqTV8(+NVMi_%(z*1`0l5Ykk4G6_e;N9N=|+G>(uv}z6K|=9`G80>}y%<lD-{<
zj^DU@Jd@TxnCs-=$F%a*xtc|5p7(Jm_)M7C`6i6T#_#i{Yj<9+*weSq|LOA2TK9wX
zKVJ{_KOVDA_0@$urny2Jrmk@St%u~82tLX~7q}3u+`0AQl{nk)I=~$qwx^@Fg44Ai
zu&p8D7@7)RCNIlkaOhJL*G4yww_Sm*X;Z)Pxo~efx?w2;|I{t4?96L#vL90}e{wSD
z(!A+sfNR3CfSv5HBikSN*|@1>dL<sX`}f<r+%oyl`kfpPR42u=Pi1oV-!A6ZT6(~0
z9k9mO+Fmv}Z1$QMj?N{|RcGir@kC}uux~vwBcyF!^*n~h$MVhRx&Fy3Em5eN{Ij#+
z=$|rSgG%7I%c~bgwcvPH3Qz)uv6qQNRu`y(a8S@+vB8$jkcGL+e#dR&MF!V80yj<1
zpRTA|6j=VSTTXSGx}eAOGY(58q|O#N#Ko4e`rFEkG_C#Rf)C6aez-4Dmb0&`S@JBl
z?&niypTyqR8tROXo=)%r9(<dk_4>tw?XvG3TO?e}fR}~~0uKZL?Jfds`8s8;mNgMn
zc&axT9{tu5TYmQ{=&p1}UBMs$;akEYKlqZfL90xFCx>sX&YLNiobn*FVd6o@`yag=
z1G3itPklG}mKsy6DsT<3!}ISAmGk4-V|Bfsf9GL5_8NGhasL$Hx##`wHUu3rtK|Q5
z+cxEdz-{0W71n$F)IptD^y70J6*#qk$8`69SuGJZ*V16uhm=U*G3#rA#}W(4TQU1S
z2kCKW=1^EM!R$BiCgv@m?Ze(nm1Za|((aC&zTssGc+rm)FhTgrGns-X)gYr<2y($y
zVE-Au%osxn%Go*KQD9ID3x+aGLAe5Q3<OLJE(yGb9dtm&7sUW(gmp+3MD>EUjY%7B
z1uud`SF+$GFy)%ma$I3RSchbE7-*N-RX3lvpp!5#P{yd2Aya`cauCPzKk@4?9!ve)
RTxQPz1fH&bF6*2Ung9js;Jp9<

literal 0
HcmV?d00001


From 04108592a362848a5d3af4332f7628a14e312174 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Fri, 27 Aug 2021 11:53:27 -0700
Subject: [PATCH 305/530] New TLS to disable forward mode AD (#63117)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63117

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D30388097

Pulled By: albanD

fbshipit-source-id: f1bc777064645db1ff848bdd64af95bffb530984
---
 c10/core/AutogradState.cpp            |  6 ++++--
 c10/core/AutogradState.h              | 15 +++++++++++++--
 c10/core/GradMode.cpp                 |  1 -
 c10/core/GradMode.h                   | 14 ++++++++++++++
 c10/core/InferenceMode.h              | 10 ++++++----
 torch/csrc/autograd/autograd_meta.cpp |  6 ++++++
 6 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/c10/core/AutogradState.cpp b/c10/core/AutogradState.cpp
index 9684a76b78564..4667acb435193 100644
--- a/c10/core/AutogradState.cpp
+++ b/c10/core/AutogradState.cpp
@@ -4,8 +4,10 @@ namespace c10 {
 
 namespace {
 // By default, grad mode is enabled and inference mode is disabled
-thread_local AutogradState autograd_state_tls =
-    AutogradState(/* grad_mode */ true, /* inference_mode */ false);
+thread_local AutogradState autograd_state_tls = AutogradState(
+    /* grad_mode */ true,
+    /* inference_mode */ false,
+    /* fw_grad_mode */ true);
 } // namespace
 
 AutogradState& AutogradState::get_tls_state() {
diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h
index 1447594433fe4..a1d13a42891da 100644
--- a/c10/core/AutogradState.h
+++ b/c10/core/AutogradState.h
@@ -12,13 +12,19 @@ struct C10_API AutogradState {
   static AutogradState& get_tls_state();
   static void set_tls_state(AutogradState state);
 
-  AutogradState(bool grad_mode, bool inference_mode)
-      : grad_mode_(grad_mode), inference_mode_(inference_mode) {}
+  AutogradState(bool grad_mode, bool inference_mode, bool fw_grad_mode)
+      : grad_mode_(grad_mode),
+        inference_mode_(inference_mode),
+        fw_grad_mode_(fw_grad_mode) {}
 
   void set_grad_mode(bool enabled) {
     grad_mode_ = enabled;
   }
 
+  void set_fw_grad_mode(bool enabled) {
+    fw_grad_mode_ = enabled;
+  }
+
   void set_inference_mode(bool enabled) {
     inference_mode_ = enabled;
   }
@@ -27,6 +33,10 @@ struct C10_API AutogradState {
     return grad_mode_;
   }
 
+  bool get_fw_grad_mode() const {
+    return fw_grad_mode_;
+  }
+
   bool get_inference_mode() const {
     return inference_mode_;
   }
@@ -34,6 +44,7 @@ struct C10_API AutogradState {
  private:
   bool grad_mode_ : 1;
   bool inference_mode_ : 1;
+  bool fw_grad_mode_ : 1;
 };
 
 } // namespace c10
diff --git a/c10/core/GradMode.cpp b/c10/core/GradMode.cpp
index a5db198083b2b..c2ea8698732d7 100644
--- a/c10/core/GradMode.cpp
+++ b/c10/core/GradMode.cpp
@@ -1,4 +1,3 @@
-#include <c10/core/AutogradState.h>
 #include <c10/core/GradMode.h>
 
 #include <stdexcept>
diff --git a/c10/core/GradMode.h b/c10/core/GradMode.h
index 1168bb1ae67c3..d83ff6d0d0d3b 100644
--- a/c10/core/GradMode.h
+++ b/c10/core/GradMode.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <c10/core/AutogradState.h>
 #include <c10/macros/Macros.h>
 
 namespace c10 {
@@ -27,4 +28,17 @@ struct TORCH_API NoGradGuard : public AutoGradMode {
   NoGradGuard() : AutoGradMode(/*enabled=*/false) {}
 };
 
+// A RAII, thread local (!) guard that enables or disables forward grad mode
+// upon construction, and sets it back to the original value upon destruction.
+struct TORCH_API AutoFwGradMode {
+  AutoFwGradMode(bool enabled)
+      : prev_mode(AutogradState::get_tls_state().get_fw_grad_mode()) {
+    AutogradState::get_tls_state().set_fw_grad_mode(enabled);
+  }
+  ~AutoFwGradMode() {
+    AutogradState::get_tls_state().set_fw_grad_mode(prev_mode);
+  }
+  bool prev_mode;
+};
+
 } // namespace c10
diff --git a/c10/core/InferenceMode.h b/c10/core/InferenceMode.h
index 9748d6eccfb54..704c43b522c6d 100644
--- a/c10/core/InferenceMode.h
+++ b/c10/core/InferenceMode.h
@@ -53,10 +53,12 @@ struct TORCH_API InferenceMode {
   InferenceMode(bool enabled = true)
       : prev_mode(AutogradState::get_tls_state()),
         prev_keyset(c10::impl::tls_local_dispatch_key_set()) {
-    // Enabling inference mode means disabling grad mode
-    // And disabling inference mode means enabling grad mode
-    AutogradState::set_tls_state(
-        AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled));
+    // Enabling inference mode means disabling grad modes
+    // And disabling inference mode means enabling grad modes
+    AutogradState::set_tls_state(AutogradState(
+        /* grad_mode */ !enabled,
+        /* inference_mode */ enabled,
+        /* fw_grad_mode */ !enabled));
     DispatchKeySet included = enabled
         ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView)
         : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView);
diff --git a/torch/csrc/autograd/autograd_meta.cpp b/torch/csrc/autograd/autograd_meta.cpp
index 248847f66ca60..f35c122225831 100644
--- a/torch/csrc/autograd/autograd_meta.cpp
+++ b/torch/csrc/autograd/autograd_meta.cpp
@@ -183,6 +183,12 @@ void AutogradMeta::set_fw_grad(const Variable& new_grad_, const Variable& self,
 }
 
 const Variable& AutogradMeta::fw_grad(uint64_t level, const Variable& self) const {
+  // TLS that disables forward AD
+  // This is only used for custom Function implementation
+  if (!c10::AutogradState::get_tls_state().get_fw_grad_mode()) {
+    return ForwardGrad::undef_grad();
+  }
+
   // Ensure that concurent fw_grad() "reads" are thread safe
   std::lock_guard<std::mutex> lock(mutex_);
 

From 6a76ee04de5f10b76cc8f97cc254da43905d170b Mon Sep 17 00:00:00 2001
From: Marjan Fariborz <marjanf@fb.com>
Date: Fri, 27 Aug 2021 12:45:01 -0700
Subject: [PATCH 306/530] Adding alltoall_single collective to collective
 quantization API (#63154)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63154

The collective quantization API now supports alltoall, alltoall_single, and allscatter. The test is also included.
ghstack-source-id: 136856877

Test Plan: buck test mode/dev-nosan //caffe2/test/distributed/algorithms/quantization:DistQuantizationTests_nccl -- test_all_to_all_single_bfp16

Reviewed By: wanchaol

Differential Revision: D30255251

fbshipit-source-id: 856f4fa12de104689a03a0c8dc9e3ecfd41cad29
---
 .../quantization/test_quantization.py         | 61 +++++++++++++++++++
 .../algorithms/quantization/quantization.py   | 14 +++--
 2 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/test/distributed/algorithms/quantization/test_quantization.py b/test/distributed/algorithms/quantization/test_quantization.py
index 505f805b2cc10..e60539face11c 100644
--- a/test/distributed/algorithms/quantization/test_quantization.py
+++ b/test/distributed/algorithms/quantization/test_quantization.py
@@ -148,6 +148,46 @@ def test_all_to_all_bfp16(self):
                 dtype=torch.float32,
                 qtype=DQuantType.BFP16)
 
+        @requires_nccl()
+        @sandcastle_skip_if(BACKEND != "nccl", "Only nccl backend supports all_to_all_single_fp16")
+        @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
+        def test_all_to_all_single_fp16(self):
+            store = dist.FileStore(self.file_name, self.world_size)
+            dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='nccl')
+            device = torch.device(f"cuda:{self.rank}")
+            group = list(range(0, self.world_size))
+            group_id = dist.new_group(range(self.world_size))
+            rank_to_GPU = self._init_multigpu_helper()
+            self._test_all_to_all_single(
+                group,
+                group_id,
+                self.rank,
+                cuda=True,
+                rank_to_GPU=rank_to_GPU,
+                dtype=torch.float32,
+                qtype=DQuantType.FP16
+            )
+
+        @requires_nccl()
+        @sandcastle_skip_if(BACKEND != "nccl", "Only nccl backend supports all_to_all_single_bfp16")
+        @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
+        def test_all_to_all_single_bfp16(self):
+            store = dist.FileStore(self.file_name, self.world_size)
+            dist.init_process_group(store=store, rank=self.rank, world_size=self.world_size, backend='nccl')
+            device = torch.device(f"cuda:{self.rank}")
+            group = list(range(0, self.world_size))
+            group_id = dist.new_group(range(self.world_size))
+            rank_to_GPU = self._init_multigpu_helper()
+            self._test_all_to_all_single(
+                group,
+                group_id,
+                self.rank,
+                cuda=True,
+                rank_to_GPU=rank_to_GPU,
+                dtype=torch.float32,
+                qtype=DQuantType.BFP16
+            )
+
         def _test_all_gather(
                 self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float, qtype=None):
             for dest in group:
@@ -203,5 +243,26 @@ def _test_all_to_all(
                 for t1, t2 in zip(out_tensors, expected_tensors):
                     self.assertEqual(t1, t2)
 
+        def _test_all_to_all_single(
+            self, group, group_id, rank, cuda=False, rank_to_GPU=None, dtype=torch.float, qtype=DQuantType.FP16
+        ):
+            if group_id is not None:
+                size = len(group)
+                in_splits = [i + 1 for i in group]
+                out_splits = [rank + 1 for _ in group]
+                in_tensor = torch.ones([sum(in_splits), size], dtype=dtype) * rank
+                out_tensor = torch.ones([(rank + 1) * size, size], dtype=dtype)
+                expected_tensor = torch.cat(
+                    [torch.ones([rank + 1, size], dtype=dtype) * i for i in group]
+                )
+                if cuda:
+                    rank_to_GPU = rank_to_GPU[rank][0]
+                    in_tensor = in_tensor.cuda(rank_to_GPU)
+                    expected_tensor = expected_tensor.cuda(rank_to_GPU)
+                    out_tensor = out_tensor.cuda(rank_to_GPU)
+                    quantize_alltoall_single = quant.auto_quantize(dist.all_to_all_single, qtype, quant_loss=None)
+                    quantize_alltoall_single(out_tensor, in_tensor, out_splits=out_splits, in_splits=in_splits, group=group_id)
+                    self.assertEqual(out_tensor, expected_tensor)
+
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/algorithms/quantization/quantization.py b/torch/distributed/algorithms/quantization/quantization.py
index d58c58cad09e2..a5e9b4652a805 100644
--- a/torch/distributed/algorithms/quantization/quantization.py
+++ b/torch/distributed/algorithms/quantization/quantization.py
@@ -90,18 +90,14 @@ def auto_quantize(func, qtype, quant_loss=None):
     """
     This is a prototype API that automatically quantize the input tensors, choose the precision types, and
     pass other necessary arguments and then dequantizes the output.
-
     Currently it only supports:
         . FP16 and BFP16 quantization method supported for gloo and nccl backends
         . all_gather, all_to_all collective ops
-
     Note: BFP16 only supports 2D tensors.
-
     Args:
         func (callable): A function representing collective operations.
         qtype (QuantType): Quantization method
         quant_loss (float, optional): This can be used to improve accuracy in the dequantization.
-
     Returns:
         (callable): the same collective as func but enables automatic quantization/dequantization.
     """
@@ -129,6 +125,16 @@ def wrapper(*args, **kwargs):
             for i, t in enumerate(_dequantize_tensor_list(out_tensors, qtype, quant_loss=quant_loss)):
                 tensors[i] = t
 
+        elif (func == dist.all_to_all_single):
+            tensors = args[0]
+            out_splits = kwargs.get('out_splits', None)
+            in_splits = kwargs.get('in_splits', None)
+            # Quantizing the input/output tensor
+            input_tensors = _quantize_tensor(args[1], qtype)
+            out_tensors = _quantize_tensor(tensors, qtype)
+            dist.all_to_all_single(out_tensors, input_tensors, out_splits, in_splits, group=group)
+            for i, t in enumerate(_dequantize_tensor(out_tensors, qtype, quant_loss=quant_loss)):
+                tensors[i] = t
         else:
             raise RuntimeError(
                 f"The collective op {func} is not supported yet"

From 8fc1064b7f562ec5b5c6a5e6a0b59069101e6c87 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Fri, 27 Aug 2021 12:55:26 -0700
Subject: [PATCH 307/530] [PyTorch] Reduce code size of register_prim_ops.cpp
 (#61494)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61494

Creating a constexpr array and then looping over it is much cheaper than emitting a function call per item.
ghstack-source-id: 136639302

Test Plan:
fitsships

Buildsizebot some mobile apps to check size impact.

Reviewed By: dhruvbird, iseeyuan

Differential Revision: D29646977

fbshipit-source-id: 6144999f6acfc4e5dcd659845859702051344d88
---
 torch/csrc/jit/runtime/custom_operator.h     |    2 +-
 torch/csrc/jit/runtime/operator.h            |   13 +-
 torch/csrc/jit/runtime/register_ops_utils.h  |   95 +-
 torch/csrc/jit/runtime/register_prim_ops.cpp | 3074 +++++++++---------
 4 files changed, 1629 insertions(+), 1555 deletions(-)

diff --git a/torch/csrc/jit/runtime/custom_operator.h b/torch/csrc/jit/runtime/custom_operator.h
index 45ad6676376ce..e39789bfe9da3 100644
--- a/torch/csrc/jit/runtime/custom_operator.h
+++ b/torch/csrc/jit/runtime/custom_operator.h
@@ -19,7 +19,7 @@ struct TORCH_API RegisterOperators {
   /// Registers a vector of already created `Operator`s.
   /// The operator element is now optional to filter null ops. It's backward
   /// compatible and works for selective operator registration.
-  RegisterOperators(std::vector<c10::optional<Operator>> operators) {
+  explicit RegisterOperators(std::vector<c10::optional<Operator>> operators) {
     for (c10::optional<Operator>& o : operators) {
       if (o) {
         registerOperator(std::move(o.value()));
diff --git a/torch/csrc/jit/runtime/operator.h b/torch/csrc/jit/runtime/operator.h
index e243e8ff57f2d..ccdbfa03f5e8c 100644
--- a/torch/csrc/jit/runtime/operator.h
+++ b/torch/csrc/jit/runtime/operator.h
@@ -220,13 +220,24 @@ TORCH_API bool aliasAnalysisHasSpecialCaseFor(c10::Symbol sym);
 // string.
 template <typename Func>
 c10::optional<Operator> OperatorGenerator(
-    torch::detail::SelectiveStr<true> schema_str,
+    const char* schema_str,
     Func&& op,
     AliasAnalysisKind alias_analysis) {
   return c10::optional<Operator>(Operator(
       std::string(schema_str), std::forward<Func>(op), alias_analysis));
 }
 
+template <typename Func>
+c10::optional<Operator> OperatorGenerator(
+    torch::detail::SelectiveStr<true> schema_str,
+    Func&& op,
+    AliasAnalysisKind alias_analysis) {
+  return OperatorGenerator(
+      static_cast<const char*>(schema_str),
+      std::forward<Func>(op),
+      alias_analysis);
+}
+
 template <typename Func>
 c10::optional<Operator> OperatorGenerator(
     torch::detail::SelectiveStr<false> schema_str,
diff --git a/torch/csrc/jit/runtime/register_ops_utils.h b/torch/csrc/jit/runtime/register_ops_utils.h
index e068b7877aff1..5d00872d9ca7e 100644
--- a/torch/csrc/jit/runtime/register_ops_utils.h
+++ b/torch/csrc/jit/runtime/register_ops_utils.h
@@ -35,15 +35,15 @@
 
 namespace torch {
 namespace jit {
-inline c10::AliasAnalysisKind aliasAnalysisFromSchema() {
+constexpr inline c10::AliasAnalysisKind aliasAnalysisFromSchema() {
   return c10::AliasAnalysisKind::FROM_SCHEMA;
 }
 
-inline c10::AliasAnalysisKind aliasAnalysisConservative() {
+constexpr inline c10::AliasAnalysisKind aliasAnalysisConservative() {
   return c10::AliasAnalysisKind::CONSERVATIVE;
 }
 
-inline c10::AliasAnalysisKind aliasAnalysisSpecialCase() {
+constexpr inline c10::AliasAnalysisKind aliasAnalysisSpecialCase() {
   return c10::AliasAnalysisKind::INTERNAL_SPECIAL_CASE;
 }
 
@@ -430,9 +430,46 @@ void listCopyAndSort<at::Tensor>(Stack* stack);
 
 void listSetItem(Stack* stack);
 
+struct OperatorGeneratorArgs {
+  const char* schema_str;
+  bool isOperationCreator;
+  union {
+    void (*operation)(Stack*);
+    OperationCreator operationCreator;
+  };
+  AliasAnalysisKind aliasAnalysis;
+
+  explicit constexpr OperatorGeneratorArgs(
+      torch::detail::SelectiveStr<true> schema_str,
+      void (*op)(Stack*),
+      AliasAnalysisKind aa)
+      : schema_str(schema_str),
+        isOperationCreator(false),
+        operation(op),
+        aliasAnalysis(aa) {}
+
+  explicit constexpr OperatorGeneratorArgs(
+      torch::detail::SelectiveStr<true> schema_str,
+      OperationCreator opCreator,
+      AliasAnalysisKind aa)
+      : schema_str(schema_str),
+        isOperationCreator(true),
+        operationCreator(opCreator),
+        aliasAnalysis(aa) {}
+
+  template <typename... Args>
+  explicit constexpr OperatorGeneratorArgs(
+      torch::detail::SelectiveStr<false>,
+      Args...)
+      : schema_str(nullptr),
+        isOperationCreator(false),
+        operation(nullptr),
+        aliasAnalysis(AliasAnalysisKind::INTERNAL_SPECIAL_CASE) {}
+};
+
 #define DEFINE_GENERIC_BINARY_OP(                                             \
     aten_op, op, int_float_result, complex_result)                            \
-  OperatorGenerator(                                                          \
+  OperatorGeneratorArgs(                                                      \
       TORCH_SELECTIVE_SCHEMA(#aten_op                                         \
                              ".int_int(int a, int b) -> " #int_float_result), \
       [](Stack* stack) {                                                      \
@@ -441,7 +478,7 @@ void listSetItem(Stack* stack);
         push(stack, op);                                                      \
       },                                                                      \
       aliasAnalysisFromSchema()),                                             \
-      OperatorGenerator(                                                      \
+      OperatorGeneratorArgs(                                                  \
           TORCH_SELECTIVE_SCHEMA(                                             \
               #aten_op                                                        \
               ".float_float(float a, float b) -> " #int_float_result),        \
@@ -451,7 +488,7 @@ void listSetItem(Stack* stack);
             push(stack, op);                                                  \
           },                                                                  \
           aliasAnalysisFromSchema()),                                         \
-      OperatorGenerator(                                                      \
+      OperatorGeneratorArgs(                                                  \
           TORCH_SELECTIVE_SCHEMA(                                             \
               #aten_op                                                        \
               ".complex_complex(complex a, complex b) -> " #complex_result),  \
@@ -464,7 +501,7 @@ void listSetItem(Stack* stack);
 
 // define implementations for primitive number ops
 #define DEFINE_GENERIC_OP(aten_op, int_op, float_op, int_result, float_result) \
-  OperatorGenerator(                                                           \
+  OperatorGeneratorArgs(                                                       \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> " #int_result),   \
       [](Stack* stack) {                                                       \
         int64_t a, b;                                                          \
@@ -472,7 +509,7 @@ void listSetItem(Stack* stack);
         push(stack, int_op);                                                   \
       },                                                                       \
       aliasAnalysisFromSchema()),                                              \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA(                                              \
               #aten_op ".float(float a, float b) -> " #float_result),          \
           [](Stack* stack) {                                                   \
@@ -483,7 +520,7 @@ void listSetItem(Stack* stack);
           aliasAnalysisFromSchema())
 
 #define DEFINE_INT_FLOAT_OP(aten_op, op, result)                            \
-  OperatorGenerator(                                                        \
+  OperatorGeneratorArgs(                                                    \
       TORCH_SELECTIVE_SCHEMA(#aten_op                                       \
                              ".int_float(int a, float b) -> " #result),     \
       [](Stack* stack) {                                                    \
@@ -493,7 +530,7 @@ void listSetItem(Stack* stack);
         push(stack, op);                                                    \
       },                                                                    \
       aliasAnalysisFromSchema()),                                           \
-      OperatorGenerator(                                                    \
+      OperatorGeneratorArgs(                                                \
           TORCH_SELECTIVE_SCHEMA(#aten_op                                   \
                                  ".float_int(float a, int b) -> " #result), \
           [](Stack* stack) {                                                \
@@ -505,7 +542,7 @@ void listSetItem(Stack* stack);
           aliasAnalysisFromSchema())
 
 #define DEFINE_INT_OP(aten_op, op)                                  \
-  OperatorGenerator(                                                \
+  OperatorGeneratorArgs(                                            \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> int"), \
       [](Stack* stack) {                                            \
         int64_t a, b;                                               \
@@ -515,7 +552,7 @@ void listSetItem(Stack* stack);
       aliasAnalysisFromSchema())
 
 #define DEFINE_STR_CMP_OP(aten_op, op)                               \
-  OperatorGenerator(                                                 \
+  OperatorGeneratorArgs(                                             \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".str(str a, str b) -> bool"), \
       [](Stack* stack) {                                             \
         auto b = pop(stack).toStringRef();                           \
@@ -530,7 +567,7 @@ void listSetItem(Stack* stack);
 // in unintended implicit conversions
 #define DEFINE_SCALAR_BINARY_OP_AVOID_COLLISION_GENERIC(          \
     aten_op, int_op, float_op, result, string_val)                \
-  OperatorGenerator(                                              \
+  OperatorGeneratorArgs(                                          \
       TORCH_SELECTIVE_SCHEMA(#aten_op string_val                  \
                              "(Scalar a, Scalar b) -> " #result), \
       [](Stack* stack) {                                          \
@@ -586,7 +623,7 @@ void listSetItem(Stack* stack);
       DEFINE_STR_CMP_OP(aten_op, op)
 
 #define DEFINE_UNARY_INT_OP(aten_op, op, result)                  \
-  OperatorGenerator(                                              \
+  OperatorGeneratorArgs(                                          \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a) -> " #result), \
       [](Stack* stack) {                                          \
         int64_t a;                                                \
@@ -596,7 +633,7 @@ void listSetItem(Stack* stack);
       aliasAnalysisFromSchema())
 
 #define DEFINE_UNARY_FLOAT_OP(aten_op, op, result)                    \
-  OperatorGenerator(                                                  \
+  OperatorGeneratorArgs(                                              \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".float(float a) -> " #result), \
       [](Stack* stack) {                                              \
         double a;                                                     \
@@ -608,7 +645,7 @@ void listSetItem(Stack* stack);
 #define DEFINE_UNARY_OP(aten_op, op, int_result, float_result)            \
   DEFINE_UNARY_INT_OP(aten_op, op, int_result),                           \
       DEFINE_UNARY_FLOAT_OP(aten_op, op, float_result),                   \
-      OperatorGenerator(                                                  \
+      OperatorGeneratorArgs(                                              \
           TORCH_SELECTIVE_SCHEMA(#aten_op ".Scalar(Scalar a) -> Scalar"), \
           [](Stack* stack) {                                              \
             IValue x;                                                     \
@@ -623,7 +660,7 @@ void listSetItem(Stack* stack);
           },                                                              \
           aliasAnalysisFromSchema())
 #define DEFINE_BOOL_OP(aten_op, op)                                     \
-  OperatorGenerator(                                                    \
+  OperatorGeneratorArgs(                                                \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".bool(bool a, bool b) -> bool"), \
       [](Stack* stack) {                                                \
         bool a, b;                                                      \
@@ -632,7 +669,7 @@ void listSetItem(Stack* stack);
       },                                                                \
       aliasAnalysisFromSchema())
 #define DEFINE_STRING_OP(op_name, string_op, result)                    \
-  OperatorGenerator(                                                    \
+  OperatorGeneratorArgs(                                                \
       TORCH_SELECTIVE_SCHEMA(#op_name ".str(str a, str b) ->" #result), \
       [](Stack* stack) {                                                \
         auto b = pop(stack).toStringRef();                              \
@@ -646,7 +683,7 @@ void listSetItem(Stack* stack);
 //-----------------------------------------------------------------------------
 //-----------------------------------------------------------------------------
 #define DEFINE_UNARY_COMPLEX_OP(aten_op, op, result)                      \
-  OperatorGenerator(                                                      \
+  OperatorGeneratorArgs(                                                  \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".complex(complex a) -> " #result), \
       [](Stack* stack) {                                                  \
         c10::complex<double> a;                                           \
@@ -670,7 +707,7 @@ void listSetItem(Stack* stack);
   DEFINE_UNARY_INT_OP(aten_op, op, int_result),                           \
       DEFINE_UNARY_FLOAT_OP(aten_op, op, float_result),                   \
       DEFINE_UNARY_COMPLEX_OP(aten_op, op, complex_result),               \
-      OperatorGenerator(                                                  \
+      OperatorGeneratorArgs(                                              \
           TORCH_SELECTIVE_SCHEMA(#aten_op ".Scalar(Scalar a) -> Scalar"), \
           [](Stack* stack) {                                              \
             IValue x;                                                     \
@@ -700,7 +737,7 @@ void listSetItem(Stack* stack);
     int_result,                                                               \
     float_result,                                                             \
     complex_result)                                                           \
-  OperatorGenerator(                                                          \
+  OperatorGeneratorArgs(                                                      \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> " #int_result),  \
       [](Stack* stack) {                                                      \
         int64_t a, b;                                                         \
@@ -708,7 +745,7 @@ void listSetItem(Stack* stack);
         push(stack, int_op);                                                  \
       },                                                                      \
       aliasAnalysisFromSchema()),                                             \
-      OperatorGenerator(                                                      \
+      OperatorGeneratorArgs(                                                  \
           TORCH_SELECTIVE_SCHEMA(                                             \
               #aten_op ".complex(complex a, complex b) -> " #complex_result), \
           [](Stack* stack) {                                                  \
@@ -717,7 +754,7 @@ void listSetItem(Stack* stack);
             push(stack, complex_op);                                          \
           },                                                                  \
           aliasAnalysisFromSchema()),                                         \
-      OperatorGenerator(                                                      \
+      OperatorGeneratorArgs(                                                  \
           TORCH_SELECTIVE_SCHEMA(                                             \
               #aten_op ".float(float a, float b) -> " #float_result),         \
           [](Stack* stack) {                                                  \
@@ -728,7 +765,7 @@ void listSetItem(Stack* stack);
           aliasAnalysisFromSchema())
 
 #define DEFINE_INT_COMPLEX_OP(aten_op, op, result)                          \
-  OperatorGenerator(                                                        \
+  OperatorGeneratorArgs(                                                    \
       TORCH_SELECTIVE_SCHEMA(#aten_op                                       \
                              ".int_complex(int a, complex b) -> " #result), \
       [](Stack* stack) {                                                    \
@@ -738,7 +775,7 @@ void listSetItem(Stack* stack);
         push(stack, op);                                                    \
       },                                                                    \
       aliasAnalysisFromSchema()),                                           \
-      OperatorGenerator(                                                    \
+      OperatorGeneratorArgs(                                                \
           TORCH_SELECTIVE_SCHEMA(                                           \
               #aten_op ".complex_int(complex a, int b) -> " #result),       \
           [](Stack* stack) {                                                \
@@ -750,7 +787,7 @@ void listSetItem(Stack* stack);
           aliasAnalysisFromSchema())
 
 #define DEFINE_FLOAT_COMPLEX_OP(aten_op, op, result)                      \
-  OperatorGenerator(                                                      \
+  OperatorGeneratorArgs(                                                  \
       TORCH_SELECTIVE_SCHEMA(                                             \
           #aten_op ".float_complex(float a, complex b) -> " #result),     \
       [](Stack* stack) {                                                  \
@@ -760,7 +797,7 @@ void listSetItem(Stack* stack);
         push(stack, op);                                                  \
       },                                                                  \
       aliasAnalysisFromSchema()),                                         \
-      OperatorGenerator(                                                  \
+      OperatorGeneratorArgs(                                              \
           TORCH_SELECTIVE_SCHEMA(                                         \
               #aten_op ".complex_float(complex a, float b) -> " #result), \
           [](Stack* stack) {                                              \
@@ -773,7 +810,7 @@ void listSetItem(Stack* stack);
 
 #define DEFINE_SCALAR_BINARY_OP_WITH_COMPLEX_AVOID_COLLISION_GENERIC( \
     aten_op, int_op, float_op, complex_op, result, string_val)        \
-  OperatorGenerator(                                                  \
+  OperatorGeneratorArgs(                                              \
       TORCH_SELECTIVE_SCHEMA(#aten_op string_val                      \
                              "(Scalar a, Scalar b) -> " #result),     \
       [](Stack* stack) {                                              \
@@ -821,7 +858,7 @@ void listSetItem(Stack* stack);
 
 #define DEFINE_SCALAR_BINARY_OP_WITH_COMPLEX_WITHOUT_INT_COMPLEX_PAIR(     \
     aten_op, int_op, float_op, complex_op, result)                         \
-  OperatorGenerator(                                                       \
+  OperatorGeneratorArgs(                                                   \
       TORCH_SELECTIVE_SCHEMA(#aten_op "(Scalar a, Scalar b) -> " #result), \
       [](Stack* stack) {                                                   \
         IValue x, y;                                                       \
diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp
index 60458a0ae11e5..2953b686ee379 100644
--- a/torch/csrc/jit/runtime/register_prim_ops.cpp
+++ b/torch/csrc/jit/runtime/register_prim_ops.cpp
@@ -86,943 +86,858 @@ auto powWrapper(T a, U b) {
   return pow(a, b);
 }
 
-RegisterOperators reg(
-    {OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::str(t elem) -> str"),
-         [](Stack* stack) {
-           std::stringstream ss;
-           ss << pop(stack);
-           push(stack, ss.str());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::list(str t) -> str[]"),
-         [](Stack* stack) {
-           auto str = pop(stack).toStringRef();
-           c10::List<std::string> chars;
-           chars.reserve(str.size());
-           for (auto c : str) {
-             chars.push_back(std::string(1, c));
-           }
-           push(stack, std::move(chars));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::cpu(Tensor(a) self) -> Tensor(a|b)"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.cpu());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::layout(Tensor a) -> int"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.layout());
-         },
-         aliasAnalysisFromSchema()),
-     Operator(
-         prim::tolist,
-         // This operator has to be unschematized because the return type
-         // depends on the type hint and input. The implementation of this
-         // operator below is intended to be as close to the Python
-         // implementation in torch/csrc/utils/tensor_list.cpp as possible.
-         [](const Node* /*node*/) -> Operation {
-           return [](Stack* stack) {
-             // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-             int elem_ty_val;
-             // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-             int dim_val;
-             at::Tensor t;
-
-             pop(stack, elem_ty_val);
-             pop(stack, dim_val);
-             pop(stack, t);
-
-             // If the Tensor is not on the CPU, transfer it.
-             if (!t.device().is_cpu()) {
-               t = t.cpu();
-             }
-
-             // Rebuild the output type using elem_ty_val and dim_val. Start
-             // with the element type corresponding to elem_ty_val.
-             TypePtr out_ty;
-             if (elem_ty_val == 0) {
-               out_ty = IntType::get();
-             } else if (elem_ty_val == 1) {
-               out_ty = FloatType::get();
-             } else if (elem_ty_val == 2) {
-               out_ty = BoolType::get();
-             } else if (elem_ty_val == 3) {
-               out_ty = ComplexType::get();
-             } else {
-               TORCH_CHECK(
-                   false,
-                   "Unsupported element type for tolist; only int, float, complex and bool are supported");
-             }
-
-             // Check that type of the Tensor matches that of the annotation.
-             // Make an exception for the case in which the annotated type is
-             // float/complex and the Tensor data type is also float/complex;
-             // the elements will be casted to double/c10::complex<double>
-             // later.
-             TORCH_CHECK(
-                 (out_ty == FloatType::get() && t.is_floating_point()) ||
-                     (out_ty == ComplexType::get() && t.is_complex()) ||
-                     tryScalarTypeFromJitType(out_ty) == t.scalar_type(),
-                 "Output annotation element type and runtime tensor element type must match for tolist()");
-
-             // Check that the dimension of the Tensor matches that of the
-             // annotation.
-             TORCH_CHECK(
-                 dim_val == t.dim(),
-                 "Output annotation list dimension and runtime tensor dimension must match for tolist()");
-
-             // Wrap out_ty in a ListType dim times.
-             for (const auto i : c10::irange(dim_val)) {
-               (void)i; // Suppress unused variable warning
-               out_ty = ListType::create(out_ty);
-             }
-
-             int64_t dim = t.dim();
-             auto sizes = t.sizes();
-             auto strides = t.strides();
-             size_t element_size = t.element_size();
-             char* data = static_cast<char*>(t.data_ptr());
-             auto result = tensorToListRecursive(
-                 data,
-                 0,
-                 dim,
-                 out_ty,
-                 t.scalar_type(),
-                 sizes,
-                 strides,
-                 element_size);
-             push(stack, std::move(result));
-           };
-         },
-         aliasAnalysisSpecialCase()),
-     // only used internally in range() translation
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::__range_length(int lo, int hi, int step) -> int"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           int64_t lo, hi, step;
-           pop(stack, lo, hi, step);
-           // error handling when step_val = 0 during runtime
-           if (step == 0) {
-             throw std::runtime_error("range() arg 3 must not be zero");
-           }
-           if (step > 0 && lo < hi) {
-             push(stack, 1 + (hi - 1 - lo) / step);
-           } else if (step < 0 && lo > hi) {
-             push(stack, 1 + (lo - 1 - hi) / (0 - step));
-           } else {
-             push(stack, 0);
-           }
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::__derive_index(int index, int start, int step) -> int"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           int64_t index, start, step;
-           pop(stack, index, start, step);
-           push(stack, start + index * step);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::TupleUnpack(Any tup) -> ..."),
-         [](Stack* stack) { tupleUnpack(*stack); },
-         aliasAnalysisSpecialCase()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::unchecked_cast(t x) -> t"),
-         noop,
-         aliasAnalysisSpecialCase()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::IntImplicit(Tensor a) -> int"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           checkImplicitTensorToNum(a, /*to int*/ true);
-           push(stack, a.item<int64_t>());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::ComplexImplicit(Tensor a) -> complex"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           checkImplicitTensorToNum(a, /*to int*/ false);
-           push(stack, a.item<c10::complex<double>>());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::FloatImplicit(Tensor a) -> float"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           checkImplicitTensorToNum(a, /*to int*/ false);
-           push(stack, a.item<double>());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::ScalarImplicit(Tensor a) -> Scalar"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           checkImplicitTensorToNum(a, /*to int*/ false);
-           push(stack, a.item());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Bool.Tensor(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_nonzero());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Bool.int(int a) -> bool"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           int64_t i;
-           pop(stack, i);
-           push(stack, (bool)i);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Bool.float(float a) -> bool"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           double d;
-           pop(stack, d);
-           push(stack, (bool)d);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Int.Tensor(Tensor a) -> int"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.item<int64_t>());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Int.bool(bool a) -> int"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           bool b;
-           pop(stack, b);
-           push(stack, static_cast<int64_t>(b));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Int.float(float a) -> int"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           double d;
-           pop(stack, d);
-           push(stack, static_cast<int64_t>(d));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Int.Scalar(Scalar a) -> int"),
-         [](Stack* stack) {
-           IValue scalar;
-           pop(stack, scalar);
-           if (scalar.isInt()) {
-             push(stack, std::move(scalar));
-           } else {
-             // toScalar() needed to avoid strict type check in IValue::toInt.
-             push(stack, static_cast<int64_t>(scalar.toScalar().toInt()));
-           }
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Int.str(str a) -> int"),
-         [](Stack* stack) {
-           auto s = pop(stack).toString();
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           std::string::size_type sz;
-           int64_t val = static_cast<int64_t>(c10::stoll(s->string(), &sz));
-           if (sz == s->string().size()) {
-             push(stack, val);
-           } else {
-             std::stringstream error_str;
-             error_str << "invalid literal for int() "
-                       << "with base 10: '" << s->string() << "'";
-             throw std::runtime_error(error_str.str());
-           }
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Float.Tensor(Tensor a) -> float"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.item<double>());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Float.Scalar(Scalar a) -> float"),
-         [](Stack* stack) {
-           IValue scalar;
-           pop(stack, scalar);
-           if (scalar.isDouble()) {
-             push(stack, std::move(scalar));
-           } else if (scalar.isComplexDouble()) {
-             push(stack, scalar.toComplexDouble().real());
-           } else {
-             push(stack, static_cast<double>(scalar.toInt()));
-           }
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Float.int(int a) -> float"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           int64_t i;
-           pop(stack, i);
-           push(stack, (float)i);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Float.bool(bool a) -> float"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           bool b;
-           pop(stack, b);
-           push(stack, (float)b);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Float.str(str a) -> float"),
-         [](Stack* stack) {
-           auto s = pop(stack).toString();
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           std::string::size_type sz;
-           double b = c10::stod(s->string(), &sz);
-           if (sz == s->string().size()) {
-             push(stack, b);
-           } else {
-             std::stringstream error_str;
-             error_str << "could not convert string "
-                       << "to float: '" << s->string() << "'";
-             throw std::runtime_error(error_str.str());
-           }
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Complex.Scalar(Scalar a) -> complex"),
-         [](Stack* stack) {
-           IValue scalar;
-           pop(stack, scalar);
-           if (scalar.isComplexDouble()) {
-             push(stack, std::move(scalar));
-           } else if (scalar.isDouble()) {
-             push(stack, c10::complex<double>(scalar.toDouble(), 0));
-           } else {
-             push(stack, c10::complex<double>(scalar.toInt(), 0));
-           }
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::Complex.Tensor_Tensor(Tensor a, Tensor b) -> complex"),
-         [](Stack* stack) {
-           at::Tensor a, b;
-           pop(stack, a, b);
-           push(
-               stack, c10::complex<double>(a.item<double>(), b.item<double>()));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::format(str self, ...) -> str"),
-         [](Stack* stack) {
-           size_t num_inputs = pop(stack).toInt();
-           format(*stack, num_inputs);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::einsum.sublist(Tensor a, ...) -> Tensor"),
-         [](Stack* stack) {
-           size_t num_inputs = pop(stack).toInt();
-           einsum(*stack, num_inputs);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.Scalar(Scalar a) -> Tensor"),
-         [](Stack* stack) {
-           at::Scalar s;
-           pop(stack, s);
-           push(stack, at::scalar_to_tensor(s));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::RaiseException(str msg) -> ()"),
-         [](Stack* stack) { throw JITException(pop(stack).toStringRef()); },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Size(int[] sizes) -> int[]"),
-         [](Stack* stack) {},
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::size(Tensor self) -> int[]"),
-         [](Stack* stack) {
-           auto t = std::move(pop(stack)).toTensor();
-           pack(stack, t.sizes().vec());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::EnumName(AnyEnumType enum) -> str"),
-         [](Stack* stack) {
-           IValue e = pop(stack);
-           push(stack, e.toEnumHolder()->name());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::EnumValue.int(AnyEnumType enum) -> int"),
-         [](Stack* stack) {
-           IValue e = pop(stack);
-           push(stack, e.toEnumHolder()->value());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "prim::EnumValue.float(AnyEnumType enum) -> float"),
-         [](Stack* stack) {
-           IValue e = pop(stack);
-           push(stack, e.toEnumHolder()->value());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::EnumValue.str(AnyEnumType enum) -> str"),
-         [](Stack* stack) {
-           IValue e = pop(stack);
-           push(stack, e.toEnumHolder()->value());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         // note the compiler knows to type TupleIndex more accurately than it
-         // is listed here.
-         TORCH_SELECTIVE_SCHEMA("prim::TupleIndex(Any tup, int i) -> Any"),
-         [](Stack* stack) {
-           int64_t index = pop(stack).toInt();
-           auto tuple = pop(stack).toTuple();
-           auto norm_index = normalizeIndex(index, tuple->elements().size());
-           if (norm_index < 0 ||
-               norm_index > static_cast<int64_t>(tuple->elements().size())) {
-             throw std::out_of_range("Tuple list index out of range");
-           }
-           stack->emplace_back(tuple->elements()[norm_index]);
-         },
-         aliasAnalysisSpecialCase()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::ne.int_list(int[] a, int[] b) -> bool"),
-         listNe<int64_t>,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "prim::unchecked_unwrap_optional(t(a)? optional) -> t(a)"),
-         noop,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::device(Tensor a) -> Device"),
-         [](Stack* stack) { push(stack, pop(stack).toTensor().device()); },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::dtype(Tensor a) -> int"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, static_cast<int64_t>(a.scalar_type()));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::__not__(bool self) -> bool"),
-         [](Stack* stack) { push(stack, !pop(stack).toBool()); },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::__is__(t1 self, t2 obj) -> bool"),
-         [](Stack* stack) {
-           IValue self, obj;
-           pop(stack, self, obj);
-           push(stack, self.is(obj));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::__isnot__(t1 self, t2 obj) -> bool"),
-         [](Stack* stack) {
-           IValue self, obj;
-           pop(stack, self, obj);
-           push(stack, !self.is(obj));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::element_size(Tensor self) -> int"),
-         [](Stack* stack) {
-           at::Tensor arg = pop(stack).toTensor();
-           push(stack, arg.element_size());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::numel(Tensor self) -> int"),
-         [](Stack* stack) {
-           at::Tensor arg = pop(stack).toTensor();
-           push(stack, arg.numel());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::dim(Tensor self) -> int"),
-         [](Stack* stack) {
-           at::Tensor arg = pop(stack).toTensor();
-           push(stack, arg.dim());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::get_device(Tensor self) -> int"),
-         [](Stack* stack) {
-           RECORD_FUNCTION("get_device", std::vector<c10::IValue>());
-           auto result =
-               at::get_device((std::move(peek(stack, 0, 1))).toTensor());
-           drop(stack, 1);
-           pack(stack, result);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::storage_offset(Tensor self) -> int"),
-         [](Stack* stack) {
-           RECORD_FUNCTION("storage_offset", std::vector<c10::IValue>());
-           auto result =
-               ((std::move(peek(stack, 0, 1))).toTensor()).storage_offset();
-           drop(stack, 1);
-           pack(stack, result);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::is_contiguous(Tensor self) -> bool"),
-         [](Stack* stack) {
-           RECORD_FUNCTION("is_contiguous", std::vector<c10::IValue>());
-           auto result =
-               ((std::move(peek(stack, 0, 1))).toTensor()).is_contiguous();
-           drop(stack, 1);
-           pack(stack, result);
-         },
-         aliasAnalysisFromSchema()),
-     // these ops are generic over the list element type.
-     // CREATING GENERIC_LIST_OPS
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::select.t(t[](a) list, int idx) -> t(*)"),
-         listSelect,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::__getitem__.t(t[](a) list, int idx) -> t(*)"),
-         listSelect,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::append.t(t[](a!) self, t(c -> *) el) -> t[](a!)"),
-         listAppend,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::reverse.t(t[](a!) self) -> ()"),
-         listReverse,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::extend.t(t[](a!) self, t[] other) -> ()"),
-         listExtend,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::copy.t(t[](a) self) -> t[]"),
-         listCopy,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::_set_item.t(t [](a!) l, int idx, t(b -> *) el) -> t[](a!)"),
-         listSetItem,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::clear.t(t[](a!) self) -> ()"),
-         listClear,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::Delete.t(t[](a!) self, int idx) -> ()"),
-         listDelete,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::insert.t(t[](a!) self, int idx, t(b -> *) el) -> ()"),
-         listInsert,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::pop.t(t[](a!) self, int idx=-1) -> t(*)"),
-         listPop,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::add.t(t[] a, t[] b) -> t[]"),
-         listAdd,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::add_.t(t[](a!) self, t[] b) -> t[]"),
-         listInplaceAdd,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> t[]"),
-         listSlice,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::list.t(t[] l) -> t[]"),
-         listList,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::mul.left_t(t[] l, int n) -> t[]"),
-         listMulIntLeft,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::mul.right_(int n, t[] l) -> t[]"),
-         listMulIntRight,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::mul_.t(t[](a!) l, int n) -> t[](a!)"),
-         listMulIntLeftInPlace,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::len.t(t[] a) -> int"),
-         listLen,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::eq.int_list(int[] a, int[] b) -> bool"),
-         listEq<int64_t>,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::eq.device(Device a, Device b) -> bool"),
-         [](Stack* stack) {
-           auto a = pop(stack).toDevice();
-           auto b = pop(stack).toDevice();
-           push(stack, a == b);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::ne.device(Device a, Device b) -> bool"),
-         [](Stack* stack) {
-           auto a = pop(stack).toDevice();
-           auto b = pop(stack).toDevice();
-           push(stack, a != b);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::eq.bool(bool a, bool b) -> bool"),
-         [](Stack* stack) {
-           auto a = pop(stack);
-           auto b = pop(stack);
-           push(stack, a == b);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::ne.bool(bool a, bool b) -> bool"),
-         [](Stack* stack) {
-           auto a = pop(stack);
-           auto b = pop(stack);
-           push(stack, a != b);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::Uninitialized() -> Any"),
-         [](Stack* stack) { push(stack, IValue::uninitialized()); },
-         aliasAnalysisSpecialCase()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::Print(...) -> ()"),
-         [](Stack* stack) {
-           auto num_inputs = pop(stack).toInt();
-           std::stringstream ss;
-           bool first = true;
-           for (const IValue& i : last(stack, num_inputs)) {
-             if (!first)
-               ss << " ";
-             first = false;
-             ss << i;
-           }
-           drop(stack, num_inputs);
-           ss << std::endl;
-           auto* handler = getPrintHandler();
-           TORCH_INTERNAL_ASSERT(handler);
-           handler(ss.str());
-         },
-         aliasAnalysisSpecialCase()),
-     // This is an alternative to aten::cat op that takes variable number of
-     // parameters as input.
-     // Format:
-     //    prim::VarConcat(Tensors..., dim) -> Tensor
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::VarConcat(...) -> Tensor"),
-         [](Stack* stack) {
-           auto num_inputs = pop(stack).toInt();
-           auto dim = pop(stack).toInt();
-           std::vector<at::Tensor> inputs(num_inputs - 1);
-           for (int i = 0; i < num_inputs - 1; ++i) {
-             inputs[num_inputs - 2 - i] = pop(stack).toTensor();
-           }
-           push(stack, at::cat(inputs, dim));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::VarStack(...) -> Tensor"),
-         [](Stack* stack) {
-           auto num_inputs = pop(stack).toInt();
-           auto dim = pop(stack).toInt();
-           std::vector<at::Tensor> inputs(num_inputs - 1);
-           for (int i = 0; i < num_inputs - 1; ++i) {
-             inputs[num_inputs - 2 - i] = pop(stack).toTensor();
-           }
-           push(stack, at::stack(inputs, dim));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::eq.enum(AnyEnumType a, AnyEnumType b) -> bool"),
-         [](Stack* stack) {
-           IValue x = pop(stack);
-           IValue y = pop(stack);
-           push(stack, x == y);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::ne.enum(AnyEnumType a, AnyEnumType b) -> bool"),
-         [](Stack* stack) {
-           IValue x = pop(stack);
-           IValue y = pop(stack);
-           push(stack, x != y);
-         },
-         aliasAnalysisFromSchema()),
-     // We define aten::dequantize in both native_functions.yaml and here,
-     // however, aten::dequantize.any defined here overrides
-     // aten::dequantize.tensors in native_functions.yaml. The variants here
-     // are only for graph mode quantization, and they should be removed once
-     // we deprecate graph mode quantization, and use the variants in
-     // native_functions.yaml.
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::dequantize.tensor(Tensor qtensor) -> Tensor"),
-         [](Stack* stack) {
-           at::Tensor qtensor;
-           pop(stack, qtensor);
-           push(stack, at::dequantize(qtensor));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::dequantize.list(Tensor[] qtensors) -> Tensor[]"),
-         [](Stack* stack) {
-           auto qtensors = pop(stack).toTensorVector();
-           push(stack, at::dequantize(qtensors));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::dequantize.any(Any tensors) -> Any"),
-         [](Stack* stack) { dequantize(*stack); },
-         aliasAnalysisFromSchema()),
-     DEFINE_UNARY_OP_WITH_COMPLEX(aten::log, std::log(a), float, float),
-     DEFINE_STRING_OP(aten::add, a + b, str),
-     DEFINE_COMPARISON_OP_WITH_COMPLEX(aten::eq, a == b),
-     DEFINE_COMPARISON_OP_WITH_COMPLEX(aten::ne, a != b),
-     DEFINE_GENERIC_OP(
-         aten::polar,
-         c10::polar(static_cast<double>(a), static_cast<double>(b)),
-         c10::polar(static_cast<double>(a), static_cast<double>(b)),
-         complex,
-         complex),
-     DEFINE_INT_FLOAT_OP(
-         aten::polar,
-         c10::polar(static_cast<double>(a), static_cast<double>(b)),
-         complex),
-     DEFINE_SCALAR_BINARY_OP_AVOID_COLLISION(
-         aten::polar,
-         c10::polar(static_cast<double>(a), static_cast<double>(b)),
-         c10::polar(static_cast<double>(a), static_cast<double>(b)),
-         Scalar),
-     DEFINE_COMPARISON_OP(aten::lt, a < b),
-     DEFINE_COMPARISON_OP(aten::gt, a > b),
-     DEFINE_COMPARISON_OP(aten::le, a <= b),
-     DEFINE_COMPARISON_OP(aten::ge, a >= b),
-     DEFINE_BINARY_OP_WITH_COMPLEX(aten::add, a + b),
-     DEFINE_BINARY_OP_WITH_COMPLEX(aten::sub, a - b),
-     DEFINE_BINARY_OP_WITH_COMPLEX(aten::mul, a* b),
-     DEFINE_BOOL_OP(aten::__and__, a&& b),
-     DEFINE_BOOL_OP(aten::__or__, a || b),
-     DEFINE_BOOL_OP(aten::__xor__, a != b),
-     DEFINE_UNARY_OP(aten::round, round_to_even(a), float, float),
-     DEFINE_UNARY_OP(aten::floor, floor(a), int, int),
-     DEFINE_UNARY_OP(aten::ceil, ceil(a), int, int),
-     DEFINE_UNARY_OP_WITH_COMPLEX(aten::neg, -a, int, float),
-     DEFINE_UNARY_OP_WITH_COMPLEX(aten::exp, std::exp(a), float, float),
-     // Pass in two ops for handling int and float separately as % in C++ only
-     // works for int The modulus calculation is different between C++ and
-     // Python (on negative), we preserve the python behavior as it's more
-     // common and match python syntax, hence the conversion.
-     DEFINE_GENERIC_OP(
-         aten::remainder,
-         (b + (a % b)) % b,
-         fmod((b + fmod(a, b)), b),
-         int,
-         float),
-     DEFINE_INT_FLOAT_OP(aten::remainder, fmod((b + fmod(a, b)), b), float),
-     DEFINE_SCALAR_BINARY_OP(
-         aten::remainder,
-         (b + (a % b)) % b,
-         fmod((b + fmod(a, b)), b),
-         Scalar),
-     // NB: This is the python truediv operation
-     DEFINE_GENERIC_OP_WITH_COMPLEX(
-         aten::div,
-         static_cast<double>(a) / static_cast<double>(b),
-         a / b,
-         a / b,
-         float,
-         float,
-         complex),
-     DEFINE_SCALAR_BINARY_OP(
-         aten::div,
-         static_cast<double>(a) / static_cast<double>(b),
-         a / b,
-         float),
-     DEFINE_GENERIC_OP(
-         aten::floordiv,
-         floordiv(a, b),
-         std::floor(a / b),
-         int,
-         float),
-     DEFINE_INT_FLOAT_OP(aten::floordiv, std::floor(a / b), float),
-     DEFINE_SCALAR_BINARY_OP(
-         aten::floordiv,
-         floordiv(a, b),
-         std::floor(a / b),
-         Scalar),
-     // int ** int produces a float, because negative exponents produce float
-     // results
-     DEFINE_GENERIC_OP_WITH_COMPLEX(
-         aten::pow,
-         static_cast<double>(powWrapper(a, b)),
-         static_cast<double>(powWrapper(a, b)),
-         static_cast<c10::complex<double>>(pow(a, b)),
-         float,
-         float,
-         complex),
-     DEFINE_INT_FLOAT_OP(
-         aten::pow,
-         static_cast<double>(powWrapper(a, b)),
-         float),
-     DEFINE_FLOAT_COMPLEX_OP(aten::pow, pow(a, b), complex),
-     DEFINE_SCALAR_BINARY_OP_AVOID_COLLISION(
-         aten::pow,
-         static_cast<double>(pow(a, b)),
-         static_cast<double>(pow(a, b)),
-         float),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::pow.int_to_int(int a, int b) -> int"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           int64_t a, b;
-           pop(stack, a, b);
-           push(stack, powWrapper(a, b));
-         },
-         aliasAnalysisFromSchema()),
-     // min and max are in prim:: because there is a difference between
-     // the python builtin 'min' and 'torch.min'
-     DEFINE_BINARY_OP(prim::min, a < b ? a : b),
-     DEFINE_BINARY_OP(prim::max, a > b ? a : b),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::type(Device self) -> str"),
-         [](Stack* stack) {
-           auto d = pop(stack);
-           push(
-               stack,
-               DeviceTypeName(d.toDevice().type(), /* lower_case=*/true));
-         },
-         aliasAnalysisFromSchema()),
-     // tensor length op (size of 1st dimension)
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::len.Tensor(Tensor t) -> int"),
-         [](Stack* stack) {
-           at::Tensor t = pop(stack).toTensor();
-           if (t.dim() == 0) {
-             AT_ERROR("len() of a 0-d tensor");
-           }
-           push(stack, t.sizes()[0]);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::ord(str string) -> int"),
-         [](Stack* stack) {
-           auto string = pop(stack).toStringRef();
-           TORCH_CHECK(
-               string.size() == 1,
-               "String for ord() must be 1 character, found ",
-               string.size());
-           uint8_t ord = string.at(0);
-           push(stack, int64_t(ord));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::lower(str self) -> str"),
-         [](Stack* stack) {
-           auto string = pop(stack).toStringRef();
-           std::stringstream ss;
-           for (char c : string) {
-             ss << static_cast<char>(::tolower(c));
-           }
-           push(stack, ss.str());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::__contains__.int_list(int[] l, int item) -> bool"),
-         listContains<int64_t>,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::__contains__.str_list(str[] l, str item) -> bool"),
-         listContains<std::string>,
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::len.str(str s) -> int"),
-         [](Stack* stack) {
-           auto string = pop(stack).toStringRef();
-           push(stack, static_cast<int64_t>(string.size()));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::dict() -> Dict(str, Tensor)"),
-         [](Stack* stack) {
-           auto dict =
-               c10::impl::GenericDict(StringType::get(), TensorType::get());
-           push(stack, dict);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::__getitem__.str(str s, int index) -> str"),
-         [](Stack* stack) {
-           auto index = pop(stack).toInt();
-           auto string = pop(stack).toStringRef();
-           auto norm_index = normalizeIndex(index, string.size());
-           char c = string.at(norm_index);
-           push(stack, std::string(&c, 1));
-         },
-         aliasAnalysisFromSchema()),
+static const OperatorGeneratorArgs opGenArgs[] = {
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::str(t elem) -> str"),
+        [](Stack* stack) {
+          std::stringstream ss;
+          ss << pop(stack);
+          push(stack, ss.str());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::list(str t) -> str[]"),
+        [](Stack* stack) {
+          auto str = pop(stack).toStringRef();
+          c10::List<std::string> chars;
+          chars.reserve(str.size());
+          for (auto c : str) {
+            chars.push_back(std::string(1, c));
+          }
+          push(stack, std::move(chars));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::cpu(Tensor(a) self) -> Tensor(a|b)"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.cpu());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::layout(Tensor a) -> int"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.layout());
+        },
+        aliasAnalysisFromSchema()),
+
+    // only used internally in range() translation
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::__range_length(int lo, int hi, int step) -> int"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          int64_t lo, hi, step;
+          pop(stack, lo, hi, step);
+          // error handling when step_val = 0 during runtime
+          if (step == 0) {
+            throw std::runtime_error("range() arg 3 must not be zero");
+          }
+          if (step > 0 && lo < hi) {
+            push(stack, 1 + (hi - 1 - lo) / step);
+          } else if (step < 0 && lo > hi) {
+            push(stack, 1 + (lo - 1 - hi) / (0 - step));
+          } else {
+            push(stack, 0);
+          }
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::__derive_index(int index, int start, int step) -> int"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          int64_t index, start, step;
+          pop(stack, index, start, step);
+          push(stack, start + index * step);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::TupleUnpack(Any tup) -> ..."),
+        [](Stack* stack) { tupleUnpack(*stack); },
+        aliasAnalysisSpecialCase()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::unchecked_cast(t x) -> t"),
+        noop,
+        aliasAnalysisSpecialCase()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::IntImplicit(Tensor a) -> int"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          checkImplicitTensorToNum(a, /*to int*/ true);
+          push(stack, a.item<int64_t>());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::ComplexImplicit(Tensor a) -> complex"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          checkImplicitTensorToNum(a, /*to int*/ false);
+          push(stack, a.item<c10::complex<double>>());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::FloatImplicit(Tensor a) -> float"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          checkImplicitTensorToNum(a, /*to int*/ false);
+          push(stack, a.item<double>());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::ScalarImplicit(Tensor a) -> Scalar"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          checkImplicitTensorToNum(a, /*to int*/ false);
+          push(stack, a.item());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Bool.Tensor(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_nonzero());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Bool.int(int a) -> bool"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          int64_t i;
+          pop(stack, i);
+          push(stack, (bool)i);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Bool.float(float a) -> bool"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          double d;
+          pop(stack, d);
+          push(stack, (bool)d);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Int.Tensor(Tensor a) -> int"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.item<int64_t>());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Int.bool(bool a) -> int"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          bool b;
+          pop(stack, b);
+          push(stack, static_cast<int64_t>(b));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Int.float(float a) -> int"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          double d;
+          pop(stack, d);
+          push(stack, static_cast<int64_t>(d));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Int.Scalar(Scalar a) -> int"),
+        [](Stack* stack) {
+          IValue scalar;
+          pop(stack, scalar);
+          if (scalar.isInt()) {
+            push(stack, std::move(scalar));
+          } else {
+            // toScalar() needed to avoid strict type check in IValue::toInt.
+            push(stack, static_cast<int64_t>(scalar.toScalar().toInt()));
+          }
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Int.str(str a) -> int"),
+        [](Stack* stack) {
+          auto s = pop(stack).toString();
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          std::string::size_type sz;
+          int64_t val = static_cast<int64_t>(c10::stoll(s->string(), &sz));
+          if (sz == s->string().size()) {
+            push(stack, val);
+          } else {
+            std::stringstream error_str;
+            error_str << "invalid literal for int() "
+                      << "with base 10: '" << s->string() << "'";
+            throw std::runtime_error(error_str.str());
+          }
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Float.Tensor(Tensor a) -> float"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.item<double>());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Float.Scalar(Scalar a) -> float"),
+        [](Stack* stack) {
+          IValue scalar;
+          pop(stack, scalar);
+          if (scalar.isDouble()) {
+            push(stack, std::move(scalar));
+          } else if (scalar.isComplexDouble()) {
+            push(stack, scalar.toComplexDouble().real());
+          } else {
+            push(stack, static_cast<double>(scalar.toInt()));
+          }
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Float.int(int a) -> float"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          int64_t i;
+          pop(stack, i);
+          push(stack, (float)i);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Float.bool(bool a) -> float"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          bool b;
+          pop(stack, b);
+          push(stack, (float)b);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Float.str(str a) -> float"),
+        [](Stack* stack) {
+          auto s = pop(stack).toString();
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          std::string::size_type sz;
+          double b = c10::stod(s->string(), &sz);
+          if (sz == s->string().size()) {
+            push(stack, b);
+          } else {
+            std::stringstream error_str;
+            error_str << "could not convert string "
+                      << "to float: '" << s->string() << "'";
+            throw std::runtime_error(error_str.str());
+          }
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Complex.Scalar(Scalar a) -> complex"),
+        [](Stack* stack) {
+          IValue scalar;
+          pop(stack, scalar);
+          if (scalar.isComplexDouble()) {
+            push(stack, std::move(scalar));
+          } else if (scalar.isDouble()) {
+            push(stack, c10::complex<double>(scalar.toDouble(), 0));
+          } else {
+            push(stack, c10::complex<double>(scalar.toInt(), 0));
+          }
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::Complex.Tensor_Tensor(Tensor a, Tensor b) -> complex"),
+        [](Stack* stack) {
+          at::Tensor a, b;
+          pop(stack, a, b);
+          push(stack, c10::complex<double>(a.item<double>(), b.item<double>()));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::format(str self, ...) -> str"),
+        [](Stack* stack) {
+          size_t num_inputs = pop(stack).toInt();
+          format(*stack, num_inputs);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::einsum.sublist(Tensor a, ...) -> Tensor"),
+        [](Stack* stack) {
+          size_t num_inputs = pop(stack).toInt();
+          einsum(*stack, num_inputs);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.Scalar(Scalar a) -> Tensor"),
+        [](Stack* stack) {
+          at::Scalar s;
+          pop(stack, s);
+          push(stack, at::scalar_to_tensor(s));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::RaiseException(str msg) -> ()"),
+        [](Stack* stack) { throw JITException(pop(stack).toStringRef()); },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Size(int[] sizes) -> int[]"),
+        [](Stack* stack) {},
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::size(Tensor self) -> int[]"),
+        [](Stack* stack) {
+          auto t = std::move(pop(stack)).toTensor();
+          pack(stack, t.sizes().vec());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::EnumName(AnyEnumType enum) -> str"),
+        [](Stack* stack) {
+          IValue e = pop(stack);
+          push(stack, e.toEnumHolder()->name());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::EnumValue.int(AnyEnumType enum) -> int"),
+        [](Stack* stack) {
+          IValue e = pop(stack);
+          push(stack, e.toEnumHolder()->value());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "prim::EnumValue.float(AnyEnumType enum) -> float"),
+        [](Stack* stack) {
+          IValue e = pop(stack);
+          push(stack, e.toEnumHolder()->value());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::EnumValue.str(AnyEnumType enum) -> str"),
+        [](Stack* stack) {
+          IValue e = pop(stack);
+          push(stack, e.toEnumHolder()->value());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        // note the compiler knows to type TupleIndex more accurately than it
+        // is listed here.
+        TORCH_SELECTIVE_SCHEMA("prim::TupleIndex(Any tup, int i) -> Any"),
+        [](Stack* stack) {
+          int64_t index = pop(stack).toInt();
+          auto tuple = pop(stack).toTuple();
+          auto norm_index = normalizeIndex(index, tuple->elements().size());
+          if (norm_index < 0 ||
+              norm_index > static_cast<int64_t>(tuple->elements().size())) {
+            throw std::out_of_range("Tuple list index out of range");
+          }
+          stack->emplace_back(tuple->elements()[norm_index]);
+        },
+        aliasAnalysisSpecialCase()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::ne.int_list(int[] a, int[] b) -> bool"),
+        listNe<int64_t>,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "prim::unchecked_unwrap_optional(t(a)? optional) -> t(a)"),
+        noop,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::device(Tensor a) -> Device"),
+        [](Stack* stack) { push(stack, pop(stack).toTensor().device()); },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::dtype(Tensor a) -> int"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, static_cast<int64_t>(a.scalar_type()));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::__not__(bool self) -> bool"),
+        [](Stack* stack) { push(stack, !pop(stack).toBool()); },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::__is__(t1 self, t2 obj) -> bool"),
+        [](Stack* stack) {
+          IValue self, obj;
+          pop(stack, self, obj);
+          push(stack, self.is(obj));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::__isnot__(t1 self, t2 obj) -> bool"),
+        [](Stack* stack) {
+          IValue self, obj;
+          pop(stack, self, obj);
+          push(stack, !self.is(obj));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::element_size(Tensor self) -> int"),
+        [](Stack* stack) {
+          at::Tensor arg = pop(stack).toTensor();
+          push(stack, arg.element_size());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::numel(Tensor self) -> int"),
+        [](Stack* stack) {
+          at::Tensor arg = pop(stack).toTensor();
+          push(stack, arg.numel());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::dim(Tensor self) -> int"),
+        [](Stack* stack) {
+          at::Tensor arg = pop(stack).toTensor();
+          push(stack, arg.dim());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::get_device(Tensor self) -> int"),
+        [](Stack* stack) {
+          RECORD_FUNCTION("get_device", std::vector<c10::IValue>());
+          auto result =
+              at::get_device((std::move(peek(stack, 0, 1))).toTensor());
+          drop(stack, 1);
+          pack(stack, result);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::storage_offset(Tensor self) -> int"),
+        [](Stack* stack) {
+          RECORD_FUNCTION("storage_offset", std::vector<c10::IValue>());
+          auto result =
+              ((std::move(peek(stack, 0, 1))).toTensor()).storage_offset();
+          drop(stack, 1);
+          pack(stack, result);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::is_contiguous(Tensor self) -> bool"),
+        [](Stack* stack) {
+          RECORD_FUNCTION("is_contiguous", std::vector<c10::IValue>());
+          auto result =
+              ((std::move(peek(stack, 0, 1))).toTensor()).is_contiguous();
+          drop(stack, 1);
+          pack(stack, result);
+        },
+        aliasAnalysisFromSchema()),
+    // these ops are generic over the list element type.
+    // CREATING GENERIC_LIST_OPS
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::select.t(t[](a) list, int idx) -> t(*)"),
+        listSelect,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::__getitem__.t(t[](a) list, int idx) -> t(*)"),
+        listSelect,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::append.t(t[](a!) self, t(c -> *) el) -> t[](a!)"),
+        listAppend,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::reverse.t(t[](a!) self) -> ()"),
+        listReverse,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::extend.t(t[](a!) self, t[] other) -> ()"),
+        listExtend,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::copy.t(t[](a) self) -> t[]"),
+        listCopy,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::_set_item.t(t [](a!) l, int idx, t(b -> *) el) -> t[](a!)"),
+        listSetItem,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::clear.t(t[](a!) self) -> ()"),
+        listClear,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::Delete.t(t[](a!) self, int idx) -> ()"),
+        listDelete,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::insert.t(t[](a!) self, int idx, t(b -> *) el) -> ()"),
+        listInsert,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::pop.t(t[](a!) self, int idx=-1) -> t(*)"),
+        listPop,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::add.t(t[] a, t[] b) -> t[]"),
+        listAdd,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::add_.t(t[](a!) self, t[] b) -> t[]"),
+        listInplaceAdd,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> t[]"),
+        listSlice,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::list.t(t[] l) -> t[]"),
+        listList,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::mul.left_t(t[] l, int n) -> t[]"),
+        listMulIntLeft,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::mul.right_(int n, t[] l) -> t[]"),
+        listMulIntRight,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::mul_.t(t[](a!) l, int n) -> t[](a!)"),
+        listMulIntLeftInPlace,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::len.t(t[] a) -> int"),
+        listLen,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::eq.int_list(int[] a, int[] b) -> bool"),
+        listEq<int64_t>,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::eq.device(Device a, Device b) -> bool"),
+        [](Stack* stack) {
+          auto a = pop(stack).toDevice();
+          auto b = pop(stack).toDevice();
+          push(stack, a == b);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::ne.device(Device a, Device b) -> bool"),
+        [](Stack* stack) {
+          auto a = pop(stack).toDevice();
+          auto b = pop(stack).toDevice();
+          push(stack, a != b);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::eq.bool(bool a, bool b) -> bool"),
+        [](Stack* stack) {
+          auto a = pop(stack);
+          auto b = pop(stack);
+          push(stack, a == b);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::ne.bool(bool a, bool b) -> bool"),
+        [](Stack* stack) {
+          auto a = pop(stack);
+          auto b = pop(stack);
+          push(stack, a != b);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::Uninitialized() -> Any"),
+        [](Stack* stack) { push(stack, IValue::uninitialized()); },
+        aliasAnalysisSpecialCase()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::Print(...) -> ()"),
+        [](Stack* stack) {
+          auto num_inputs = pop(stack).toInt();
+          std::stringstream ss;
+          bool first = true;
+          for (const IValue& i : last(stack, num_inputs)) {
+            if (!first)
+              ss << " ";
+            first = false;
+            ss << i;
+          }
+          drop(stack, num_inputs);
+          ss << std::endl;
+          auto* handler = getPrintHandler();
+          TORCH_INTERNAL_ASSERT(handler);
+          handler(ss.str());
+        },
+        aliasAnalysisSpecialCase()),
+    // This is an alternative to aten::cat op that takes variable number of
+    // parameters as input.
+    // Format:
+    //    prim::VarConcat(Tensors..., dim) -> Tensor
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::VarConcat(...) -> Tensor"),
+        [](Stack* stack) {
+          auto num_inputs = pop(stack).toInt();
+          auto dim = pop(stack).toInt();
+          std::vector<at::Tensor> inputs(num_inputs - 1);
+          for (int i = 0; i < num_inputs - 1; ++i) {
+            inputs[num_inputs - 2 - i] = pop(stack).toTensor();
+          }
+          push(stack, at::cat(inputs, dim));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::VarStack(...) -> Tensor"),
+        [](Stack* stack) {
+          auto num_inputs = pop(stack).toInt();
+          auto dim = pop(stack).toInt();
+          std::vector<at::Tensor> inputs(num_inputs - 1);
+          for (int i = 0; i < num_inputs - 1; ++i) {
+            inputs[num_inputs - 2 - i] = pop(stack).toTensor();
+          }
+          push(stack, at::stack(inputs, dim));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::eq.enum(AnyEnumType a, AnyEnumType b) -> bool"),
+        [](Stack* stack) {
+          IValue x = pop(stack);
+          IValue y = pop(stack);
+          push(stack, x == y);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::ne.enum(AnyEnumType a, AnyEnumType b) -> bool"),
+        [](Stack* stack) {
+          IValue x = pop(stack);
+          IValue y = pop(stack);
+          push(stack, x != y);
+        },
+        aliasAnalysisFromSchema()),
+    // We define aten::dequantize in both native_functions.yaml and here,
+    // however, aten::dequantize.any defined here overrides
+    // aten::dequantize.tensors in native_functions.yaml. The variants here
+    // are only for graph mode quantization, and they should be removed once
+    // we deprecate graph mode quantization, and use the variants in
+    // native_functions.yaml.
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::dequantize.tensor(Tensor qtensor) -> Tensor"),
+        [](Stack* stack) {
+          at::Tensor qtensor;
+          pop(stack, qtensor);
+          push(stack, at::dequantize(qtensor));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::dequantize.list(Tensor[] qtensors) -> Tensor[]"),
+        [](Stack* stack) {
+          auto qtensors = pop(stack).toTensorVector();
+          push(stack, at::dequantize(qtensors));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::dequantize.any(Any tensors) -> Any"),
+        [](Stack* stack) { dequantize(*stack); },
+        aliasAnalysisFromSchema()),
+    DEFINE_UNARY_OP_WITH_COMPLEX(aten::log, std::log(a), float, float),
+    DEFINE_STRING_OP(aten::add, a + b, str),
+    DEFINE_COMPARISON_OP_WITH_COMPLEX(aten::eq, a == b),
+    DEFINE_COMPARISON_OP_WITH_COMPLEX(aten::ne, a != b),
+    DEFINE_GENERIC_OP(
+        aten::polar,
+        c10::polar(static_cast<double>(a), static_cast<double>(b)),
+        c10::polar(static_cast<double>(a), static_cast<double>(b)),
+        complex,
+        complex),
+    DEFINE_INT_FLOAT_OP(
+        aten::polar,
+        c10::polar(static_cast<double>(a), static_cast<double>(b)),
+        complex),
+    DEFINE_SCALAR_BINARY_OP_AVOID_COLLISION(
+        aten::polar,
+        c10::polar(static_cast<double>(a), static_cast<double>(b)),
+        c10::polar(static_cast<double>(a), static_cast<double>(b)),
+        Scalar),
+    DEFINE_COMPARISON_OP(aten::lt, a < b),
+    DEFINE_COMPARISON_OP(aten::gt, a > b),
+    DEFINE_COMPARISON_OP(aten::le, a <= b),
+    DEFINE_COMPARISON_OP(aten::ge, a >= b),
+    DEFINE_BINARY_OP_WITH_COMPLEX(aten::add, a + b),
+    DEFINE_BINARY_OP_WITH_COMPLEX(aten::sub, a - b),
+    DEFINE_BINARY_OP_WITH_COMPLEX(aten::mul, a* b),
+    DEFINE_BOOL_OP(aten::__and__, a&& b),
+    DEFINE_BOOL_OP(aten::__or__, a || b),
+    DEFINE_BOOL_OP(aten::__xor__, a != b),
+    DEFINE_UNARY_OP(aten::round, round_to_even(a), float, float),
+    DEFINE_UNARY_OP(aten::floor, floor(a), int, int),
+    DEFINE_UNARY_OP(aten::ceil, ceil(a), int, int),
+    DEFINE_UNARY_OP_WITH_COMPLEX(aten::neg, -a, int, float),
+    DEFINE_UNARY_OP_WITH_COMPLEX(aten::exp, std::exp(a), float, float),
+    // Pass in two ops for handling int and float separately as % in C++ only
+    // works for int The modulus calculation is different between C++ and
+    // Python (on negative), we preserve the python behavior as it's more
+    // common and match python syntax, hence the conversion.
+    DEFINE_GENERIC_OP(
+        aten::remainder,
+        (b + (a % b)) % b,
+        fmod((b + fmod(a, b)), b),
+        int,
+        float),
+    DEFINE_INT_FLOAT_OP(aten::remainder, fmod((b + fmod(a, b)), b), float),
+    DEFINE_SCALAR_BINARY_OP(
+        aten::remainder,
+        (b + (a % b)) % b,
+        fmod((b + fmod(a, b)), b),
+        Scalar),
+    // NB: This is the python truediv operation
+    DEFINE_GENERIC_OP_WITH_COMPLEX(
+        aten::div,
+        static_cast<double>(a) / static_cast<double>(b),
+        a / b,
+        a / b,
+        float,
+        float,
+        complex),
+    DEFINE_SCALAR_BINARY_OP(
+        aten::div,
+        static_cast<double>(a) / static_cast<double>(b),
+        a / b,
+        float),
+    DEFINE_GENERIC_OP(
+        aten::floordiv,
+        floordiv(a, b),
+        std::floor(a / b),
+        int,
+        float),
+    DEFINE_INT_FLOAT_OP(aten::floordiv, std::floor(a / b), float),
+    DEFINE_SCALAR_BINARY_OP(
+        aten::floordiv,
+        floordiv(a, b),
+        std::floor(a / b),
+        Scalar),
+    // int ** int produces a float, because negative exponents produce float
+    // results
+    DEFINE_GENERIC_OP_WITH_COMPLEX(
+        aten::pow,
+        static_cast<double>(powWrapper(a, b)),
+        static_cast<double>(powWrapper(a, b)),
+        static_cast<c10::complex<double>>(pow(a, b)),
+        float,
+        float,
+        complex),
+    DEFINE_INT_FLOAT_OP(
+        aten::pow,
+        static_cast<double>(powWrapper(a, b)),
+        float),
+    DEFINE_FLOAT_COMPLEX_OP(aten::pow, pow(a, b), complex),
+    DEFINE_SCALAR_BINARY_OP_AVOID_COLLISION(
+        aten::pow,
+        static_cast<double>(pow(a, b)),
+        static_cast<double>(pow(a, b)),
+        float),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::pow.int_to_int(int a, int b) -> int"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          int64_t a, b;
+          pop(stack, a, b);
+          push(stack, powWrapper(a, b));
+        },
+        aliasAnalysisFromSchema()),
+    // min and max are in prim:: because there is a difference between
+    // the python builtin 'min' and 'torch.min'
+    DEFINE_BINARY_OP(prim::min, a < b ? a : b),
+    DEFINE_BINARY_OP(prim::max, a > b ? a : b),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::type(Device self) -> str"),
+        [](Stack* stack) {
+          auto d = pop(stack);
+          push(
+              stack, DeviceTypeName(d.toDevice().type(), /* lower_case=*/true));
+        },
+        aliasAnalysisFromSchema()),
+    // tensor length op (size of 1st dimension)
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::len.Tensor(Tensor t) -> int"),
+        [](Stack* stack) {
+          at::Tensor t = pop(stack).toTensor();
+          if (t.dim() == 0) {
+            AT_ERROR("len() of a 0-d tensor");
+          }
+          push(stack, t.sizes()[0]);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::ord(str string) -> int"),
+        [](Stack* stack) {
+          auto string = pop(stack).toStringRef();
+          TORCH_CHECK(
+              string.size() == 1,
+              "String for ord() must be 1 character, found ",
+              string.size());
+          uint8_t ord = string.at(0);
+          push(stack, int64_t(ord));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::lower(str self) -> str"),
+        [](Stack* stack) {
+          auto string = pop(stack).toStringRef();
+          std::stringstream ss;
+          for (char c : string) {
+            ss << static_cast<char>(::tolower(c));
+          }
+          push(stack, ss.str());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::__contains__.int_list(int[] l, int item) -> bool"),
+        listContains<int64_t>,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::__contains__.str_list(str[] l, str item) -> bool"),
+        listContains<std::string>,
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::len.str(str s) -> int"),
+        [](Stack* stack) {
+          auto string = pop(stack).toStringRef();
+          push(stack, static_cast<int64_t>(string.size()));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::dict() -> Dict(str, Tensor)"),
+        [](Stack* stack) {
+          auto dict =
+              c10::impl::GenericDict(StringType::get(), TensorType::get());
+          push(stack, dict);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::__getitem__.str(str s, int index) -> str"),
+        [](Stack* stack) {
+          auto index = pop(stack).toInt();
+          auto string = pop(stack).toStringRef();
+          auto norm_index = normalizeIndex(index, string.size());
+          char c = string.at(norm_index);
+          push(stack, std::string(&c, 1));
+        },
+        aliasAnalysisFromSchema()),
 #define CREATE_COPY_OP(other_type, c_type)                               \
-  OperatorGenerator(                                                     \
+  OperatorGeneratorArgs(                                                 \
       TORCH_SELECTIVE_SCHEMA("aten::copy_." #other_type                  \
                              "(Tensor(a!) self, " #other_type            \
                              " other) -> Tensor(a!)"),                   \
@@ -1035,170 +950,168 @@ RegisterOperators reg(
       },                                                                 \
       aliasAnalysisFromSchema())
 
-     CREATE_COPY_OP(Tensor, at::Tensor),
-     CREATE_COPY_OP(int, int64_t),
-     CREATE_COPY_OP(float, double),
+    CREATE_COPY_OP(Tensor, at::Tensor),
+    CREATE_COPY_OP(int, int64_t),
+    CREATE_COPY_OP(float, double),
 #undef CREATE_COPY_OP
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::backward(Tensor self, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()"),
-         [](Stack* stack) {
-           bool create_graph = pop(stack).toBool();
-           auto retain_graph = pop(stack).toOptional<bool>();
-           IValue gradient_ivalue = pop(stack);
-           at::Tensor gradient = gradient_ivalue.isNone()
-               ? at::Tensor()
-               : gradient_ivalue.toTensor();
-           at::Tensor self = pop(stack).toTensor();
-           bool keep_graph = retain_graph ? retain_graph.value() : create_graph;
-           self.backward(gradient, keep_graph, create_graph);
-         },
-         aliasAnalysisConservative()),
-     //
-     // create a clone of these declarations with a _hacked_twin overload name
-     // and nullability scrubbed from TensorList arg types
-     // TOOD find out why this exists and how to do it without the hack
-     //
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"),
-         [](Stack* stack) {
-           auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
-           auto self = pop(stack).toTensor();
-           auto result = at::index(self, indices);
-           push(stack, std::move(result));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::_index_put_impl_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)"),
-         [](Stack* stack) {
-           auto unsafe = pop(stack).toBool();
-           auto accumulate = pop(stack).toBool();
-           auto values = pop(stack).toTensor();
-           auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
-           auto self = pop(stack).toTensor();
-           auto result =
-               at::_index_put_impl_(self, indices, values, accumulate, unsafe);
-           push(stack, std::move(result));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::index_put_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)"),
-         [](Stack* stack) {
-           auto accumulate = pop(stack).toBool();
-           auto values = pop(stack).toTensor();
-           auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
-           auto self = pop(stack).toTensor();
-           auto result = at::index_put_(self, indices, values, accumulate);
-           push(stack, std::move(result));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::index_put.hacked_twin(Tensor self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor"),
-         [](Stack* stack) {
-           auto accumulate = pop(stack).toBool();
-           auto values = pop(stack).toTensor();
-           auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
-           auto self = pop(stack).toTensor();
-           auto result = at::index_put_(self, indices, values, accumulate);
-           push(stack, std::move(result));
-         },
-         aliasAnalysisFromSchema()),
-     // reference function parse_to_conversion in python_arg_parsing.h
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::to.prim_Device(Tensor(a) self, Device? device, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           bool non_blocking;
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           bool copy;
-           pop(stack, non_blocking, copy);
-           c10::optional<at::ScalarType> scalarType =
-               pop(stack).toOptional<at::ScalarType>();
-           c10::optional<c10::Device> device =
-               pop(stack).toOptional<c10::Device>();
-           at::Tensor self = pop(stack).toTensor();
-           push(
-               stack,
-               to_dispatch(self, device, scalarType, non_blocking, copy));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::to.prim_dtype(Tensor(a) self, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           bool non_blocking;
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           bool copy;
-           pop(stack, non_blocking, copy);
-           c10::optional<at::ScalarType> scalarType =
-               pop(stack).toOptional<at::ScalarType>();
-           c10::optional<c10::Device> device = c10::nullopt;
-           at::Tensor self = pop(stack).toTensor();
-           push(
-               stack,
-               to_dispatch(self, device, scalarType, non_blocking, copy));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::is_cuda(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_cuda());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::is_xpu(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_xpu());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::data(Tensor(a) a) -> Tensor(a)"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, autograd::Variable(a).variable_data());
-         },
-         aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::backward(Tensor self, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()"),
+        [](Stack* stack) {
+          bool create_graph = pop(stack).toBool();
+          auto retain_graph = pop(stack).toOptional<bool>();
+          IValue gradient_ivalue = pop(stack);
+          at::Tensor gradient = gradient_ivalue.isNone()
+              ? at::Tensor()
+              : gradient_ivalue.toTensor();
+          at::Tensor self = pop(stack).toTensor();
+          bool keep_graph = retain_graph ? retain_graph.value() : create_graph;
+          self.backward(gradient, keep_graph, create_graph);
+        },
+        aliasAnalysisConservative()),
+    //
+    // create a clone of these declarations with a _hacked_twin overload name
+    // and nullability scrubbed from TensorList arg types
+    // TOOD find out why this exists and how to do it without the hack
+    //
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"),
+        [](Stack* stack) {
+          auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
+          auto self = pop(stack).toTensor();
+          auto result = at::index(self, indices);
+          push(stack, std::move(result));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::_index_put_impl_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)"),
+        [](Stack* stack) {
+          auto unsafe = pop(stack).toBool();
+          auto accumulate = pop(stack).toBool();
+          auto values = pop(stack).toTensor();
+          auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
+          auto self = pop(stack).toTensor();
+          auto result =
+              at::_index_put_impl_(self, indices, values, accumulate, unsafe);
+          push(stack, std::move(result));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::index_put_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)"),
+        [](Stack* stack) {
+          auto accumulate = pop(stack).toBool();
+          auto values = pop(stack).toTensor();
+          auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
+          auto self = pop(stack).toTensor();
+          auto result = at::index_put_(self, indices, values, accumulate);
+          push(stack, std::move(result));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::index_put.hacked_twin(Tensor self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor"),
+        [](Stack* stack) {
+          auto accumulate = pop(stack).toBool();
+          auto values = pop(stack).toTensor();
+          auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
+          auto self = pop(stack).toTensor();
+          auto result = at::index_put_(self, indices, values, accumulate);
+          push(stack, std::move(result));
+        },
+        aliasAnalysisFromSchema()),
+    // reference function parse_to_conversion in python_arg_parsing.h
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::to.prim_Device(Tensor(a) self, Device? device, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          bool non_blocking;
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          bool copy;
+          pop(stack, non_blocking, copy);
+          c10::optional<at::ScalarType> scalarType =
+              pop(stack).toOptional<at::ScalarType>();
+          c10::optional<c10::Device> device =
+              pop(stack).toOptional<c10::Device>();
+          at::Tensor self = pop(stack).toTensor();
+          push(
+              stack, to_dispatch(self, device, scalarType, non_blocking, copy));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::to.prim_dtype(Tensor(a) self, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          bool non_blocking;
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          bool copy;
+          pop(stack, non_blocking, copy);
+          c10::optional<at::ScalarType> scalarType =
+              pop(stack).toOptional<at::ScalarType>();
+          c10::optional<c10::Device> device = c10::nullopt;
+          at::Tensor self = pop(stack).toTensor();
+          push(
+              stack, to_dispatch(self, device, scalarType, non_blocking, copy));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::is_cuda(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_cuda());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::is_xpu(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_xpu());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::data(Tensor(a) a) -> Tensor(a)"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, autograd::Variable(a).variable_data());
+        },
+        aliasAnalysisFromSchema()),
 // these ops are not defined for Tensor
 #define CREATE_COMPARATOR_LIST_OPS_SPECIALIZED(decl_type, value_type)        \
-  OperatorGenerator(                                                         \
+  OperatorGeneratorArgs(                                                     \
       TORCH_SELECTIVE_SCHEMA("prim::min." decl_type "_list(" decl_type       \
                              "[] l, " decl_type "[] r) -> " decl_type "[]"), \
       minList<value_type>,                                                   \
       aliasAnalysisFromSchema()),                                            \
-      OperatorGenerator(                                                     \
+      OperatorGeneratorArgs(                                                 \
           TORCH_SELECTIVE_SCHEMA("prim::max." decl_type "_list(" decl_type   \
                                  "[] l, " decl_type "[] r) -> " decl_type    \
                                  "[]"),                                      \
           maxList<value_type>,                                               \
           aliasAnalysisFromSchema()),                                        \
-      OperatorGenerator(                                                     \
+      OperatorGeneratorArgs(                                                 \
           TORCH_SELECTIVE_SCHEMA("prim::min.self_" decl_type "(" decl_type   \
                                  "[] self) -> " decl_type),                  \
           listMin<value_type>,                                               \
           aliasAnalysisFromSchema()),                                        \
-      OperatorGenerator(                                                     \
+      OperatorGeneratorArgs(                                                 \
           TORCH_SELECTIVE_SCHEMA("prim::max.self_" decl_type "(" decl_type   \
                                  "[] self) -> " decl_type),                  \
           listMax<value_type>,                                               \
           aliasAnalysisFromSchema()),
-     CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("int", int64_t)
-         CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("float", double)
-             CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("bool", bool)
+    CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("int", int64_t)
+        CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("float", double)
+            CREATE_COMPARATOR_LIST_OPS_SPECIALIZED("bool", bool)
 #undef CREATE_COMPARATOR_LIST_OPS_SPECIALIZED
 // python string is methods return false if empty
 #define DEFINE_STRING_IS_OP(op_name, char_op)                          \
-  OperatorGenerator(                                                   \
+  OperatorGeneratorArgs(                                               \
       TORCH_SELECTIVE_SCHEMA(#op_name "(str self) -> bool"),           \
       [](Stack* stack) {                                               \
         auto string = pop(stack).toStringRef();                        \
@@ -1211,15 +1124,15 @@ RegisterOperators reg(
       },                                                               \
       aliasAnalysisFromSchema())
 
-                 DEFINE_STRING_IS_OP(aten::isdigit, ::isdigit),
-     DEFINE_STRING_IS_OP(aten::isspace, ::isspace),
-     DEFINE_STRING_IS_OP(aten::isalnum, ::isalnum),
-     DEFINE_STRING_IS_OP(aten::isalpha, ::isalpha),
-     DEFINE_STRING_IS_OP(aten::isdecimal, ::isdigit),
-     DEFINE_STRING_IS_OP(aten::isnumeric, ::isdigit),
+                DEFINE_STRING_IS_OP(aten::isdigit, ::isdigit),
+    DEFINE_STRING_IS_OP(aten::isspace, ::isspace),
+    DEFINE_STRING_IS_OP(aten::isalnum, ::isalnum),
+    DEFINE_STRING_IS_OP(aten::isalpha, ::isalpha),
+    DEFINE_STRING_IS_OP(aten::isdecimal, ::isdigit),
+    DEFINE_STRING_IS_OP(aten::isnumeric, ::isdigit),
 
 #define DEFINE_STRING_CHAR_MAP_OP(op_name, char_op)         \
-  OperatorGenerator(                                        \
+  OperatorGeneratorArgs(                                    \
       TORCH_SELECTIVE_SCHEMA(#op_name "(str self) -> str"), \
       [](Stack* stack) {                                    \
         auto string = pop(stack).toStringRef();             \
@@ -1231,14 +1144,121 @@ RegisterOperators reg(
       },                                                    \
       aliasAnalysisFromSchema())
 
-     DEFINE_STRING_CHAR_MAP_OP(aten::upper, ::toupper),
-     DEFINE_STRING_CHAR_MAP_OP(aten::swapcase, ([](char c) {
-                                 if (c == static_cast<char>(::toupper(c))) {
-                                   return static_cast<char>(::tolower(c));
-                                 } else {
-                                   return static_cast<char>(::toupper(c));
-                                 }
-                               }))});
+    DEFINE_STRING_CHAR_MAP_OP(aten::upper, ::toupper),
+    DEFINE_STRING_CHAR_MAP_OP(aten::swapcase, ([](char c) {
+                                if (c == static_cast<char>(::toupper(c))) {
+                                  return static_cast<char>(::tolower(c));
+                                } else {
+                                  return static_cast<char>(::toupper(c));
+                                }
+                              }))};
+
+static std::vector<c10::optional<Operator>> createOperators(
+    const OperatorGeneratorArgs* args,
+    int length) {
+  std::vector<c10::optional<Operator>> result;
+  result.reserve(length);
+  for (int ii = 0; ii < length; ++ii) {
+    if (args[ii].schema_str) {
+      if (args[ii].isOperationCreator) {
+        result.push_back(OperatorGenerator(
+            args[ii].schema_str,
+            args[ii].operationCreator,
+            args[ii].aliasAnalysis));
+      } else {
+        result.push_back(OperatorGenerator(
+            args[ii].schema_str, args[ii].operation, args[ii].aliasAnalysis));
+      }
+    }
+  }
+  return result;
+}
+
+RegisterOperators reg(([]() {
+  auto v = createOperators(opGenArgs, sizeof(opGenArgs) / sizeof(opGenArgs[0]));
+  v.push_back(Operator(
+      prim::tolist,
+      // This operator has to be unschematized because the return type
+      // depends on the type hint and input. The implementation of this
+      // operator below is intended to be as close to the Python
+      // implementation in torch/csrc/utils/tensor_list.cpp as possible.
+      [](const Node* /*node*/) -> Operation {
+        return [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          int elem_ty_val;
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          int dim_val;
+          at::Tensor t;
+
+          pop(stack, elem_ty_val);
+          pop(stack, dim_val);
+          pop(stack, t);
+
+          // If the Tensor is not on the CPU, transfer it.
+          if (!t.device().is_cpu()) {
+            t = t.cpu();
+          }
+
+          // Rebuild the output type using elem_ty_val and dim_val. Start
+          // with the element type corresponding to elem_ty_val.
+          TypePtr out_ty;
+          if (elem_ty_val == 0) {
+            out_ty = IntType::get();
+          } else if (elem_ty_val == 1) {
+            out_ty = FloatType::get();
+          } else if (elem_ty_val == 2) {
+            out_ty = BoolType::get();
+          } else if (elem_ty_val == 3) {
+            out_ty = ComplexType::get();
+          } else {
+            TORCH_CHECK(
+                false,
+                "Unsupported element type for tolist; only int, float, complex and bool are supported");
+          }
+
+          // Check that type of the Tensor matches that of the annotation.
+          // Make an exception for the case in which the annotated type is
+          // float/complex and the Tensor data type is also float/complex;
+          // the elements will be casted to double/c10::complex<double>
+          // later.
+          TORCH_CHECK(
+              (out_ty == FloatType::get() && t.is_floating_point()) ||
+                  (out_ty == ComplexType::get() && t.is_complex()) ||
+                  tryScalarTypeFromJitType(out_ty) == t.scalar_type(),
+              "Output annotation element type and runtime tensor element type must match for tolist()");
+
+          // Check that the dimension of the Tensor matches that of the
+          // annotation.
+          TORCH_CHECK(
+              dim_val == t.dim(),
+              "Output annotation list dimension and runtime tensor dimension must match for tolist()");
+
+          // Wrap out_ty in a ListType dim times.
+          for (const auto i : c10::irange(dim_val)) {
+            (void)i; // Suppress unused variable warning
+            out_ty = ListType::create(out_ty);
+          }
+
+          int64_t dim = t.dim();
+          auto sizes = t.sizes();
+          auto strides = t.strides();
+          size_t element_size = t.element_size();
+          char* data = static_cast<char*>(t.data_ptr());
+          auto result = tensorToListRecursive(
+              data,
+              0,
+              dim,
+              out_ty,
+              t.scalar_type(),
+              sizes,
+              strides,
+              element_size);
+          push(stack, std::move(result));
+        };
+      },
+      aliasAnalysisSpecialCase()));
+  return v;
+})());
 
 void dictSetItem(Stack* stack) {
   auto value = pop(stack);
@@ -1408,123 +1428,125 @@ void dictConstructFromList(Stack* stack) {
 }
 
 #define CREATE_DICT_OPS(key_type)                                              \
-  OperatorGenerator(                                                           \
+  OperatorGeneratorArgs(                                                       \
       TORCH_SELECTIVE_SCHEMA("aten::len.Dict_" key_type "(Dict(" key_type      \
                              ", t) self) -> int"),                             \
       dictLen,                                                                 \
       aliasAnalysisFromSchema()),                                              \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::keys." key_type "(Dict(" key_type      \
                                  ", t) self) -> " key_type "[](*)"),           \
           dictKeys,                                                            \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::values." key_type "(Dict(" key_type    \
                                  ", t) self) -> t[](*)"),                      \
           dictValues,                                                          \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::__getitem__.Dict_" key_type            \
                                  "(Dict(" key_type ", t) self, " key_type      \
                                  " key) -> t(*)"),                             \
           dictIndex,                                                           \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::get." key_type "(Dict(" key_type       \
                                  ", t) self, " key_type " key) -> t(*)?"),     \
           dictGet<false>,                                                      \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::get.default_" key_type                 \
                                  "(Dict(" key_type ", t) self, " key_type      \
                                  " key, t default_value) -> t(*)"),            \
           dictGet<true>,                                                       \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA(                                              \
               "aten::setdefault." key_type "(Dict(" key_type                   \
               ", t)(a!) self, " key_type                                       \
               "(b -> *) key, t(c -> *) default_value) -> t(*)"),               \
           dictSetDefault,                                                      \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::Delete.Dict_" key_type                 \
                                  "(Dict(" key_type ", t)(a!) self, " key_type  \
                                  " key) -> ()"),                               \
           dictDelete,                                                          \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::pop.Dict_" key_type "(Dict(" key_type  \
                                  ", t)(a!) self, " key_type " key) -> t(*)"),  \
           dictPop<false>,                                                      \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::pop.Dict_default_" key_type            \
                                  "(Dict(" key_type ", t)(a!) self, " key_type  \
                                  " key, t default_value) -> t(*)"),            \
           dictPop<true>,                                                       \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::popitem." key_type "(Dict(" key_type   \
                                  ", t)(a!) self) -> ((" key_type ", t))"),     \
           dictPopItem,                                                         \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::clear." key_type "(Dict(" key_type     \
                                  ", t)(a!) self) -> ()"),                      \
           dictClear,                                                           \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::update." key_type "(Dict(" key_type    \
                                  ", t)(a!) self, Dict(" key_type               \
                                  ", t)(a!) to_add) -> ()"),                    \
           dictUpdate,                                                          \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::items." key_type "(Dict(" key_type     \
                                  ", t) self) -> ((" key_type ", t)[])"),       \
           dictItems,                                                           \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::copy.Dict_" key_type "(Dict(" key_type \
                                  ", t)(a) self) -> Dict(" key_type ", t)"),    \
           dictCopy,                                                            \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::__contains__." key_type                \
                                  "(Dict(" key_type ", t) dict, " key_type      \
                                  " key) -> bool"),                             \
           dictContains,                                                        \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::_set_item." key_type "(Dict(" key_type \
                                  ", t)(a!) l, " key_type                       \
                                  "(b -> *) idx, t(c -> *) v) -> ()"),          \
           dictSetItem,                                                         \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::dict." key_type "((" key_type          \
                                  ", tVal)[] inputs) -> Dict(" key_type         \
                                  ", tVal)"),                                   \
           dictConstructFromList,                                               \
           aliasAnalysisFromSchema()),                                          \
-      OperatorGenerator(                                                       \
+      OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA("aten::dict.Dict_" key_type "(Dict(" key_type \
                                  ", t)(a) self) -> Dict(" key_type ", t)"),    \
           dictCopy,                                                            \
           aliasAnalysisFromSchema())
 
-RegisterOperators reg_dict_ops({
+static const OperatorGeneratorArgs dict_ops[] = {
     CREATE_DICT_OPS("str"),
     CREATE_DICT_OPS("int"),
     CREATE_DICT_OPS("bool"),
     CREATE_DICT_OPS("float"),
     CREATE_DICT_OPS("complex"),
     CREATE_DICT_OPS("Tensor"),
-});
+};
+RegisterOperators reg_dict_ops(
+    createOperators(dict_ops, sizeof(dict_ops) / sizeof(dict_ops[0])));
 
 // NOLINTNEXTLINE(clang-diagnostic-unused-function)
-c10::AliasAnalysisKind aliasAnalysisFromSchema() {
+constexpr c10::AliasAnalysisKind aliasAnalysisFromSchema() {
   return c10::AliasAnalysisKind::FROM_SCHEMA;
 }
 
@@ -2095,393 +2117,394 @@ TORCH_LIBRARY_IMPL(aten, CatchAll, m) {
       });
 }
 
+static const OperatorGeneratorArgs opGenArgs1[] = {
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::rangelist(int n) -> int[]"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          int64_t n;
+          pop(stack, n);
+          c10::List<int64_t> elems;
+          elems.reserve(n);
+          for (const auto i : c10::irange(n)) {
+            elems.push_back(i);
+          }
+          push(stack, std::move(elems));
+        },
+        aliasAnalysisFromSchema()),
+    // note: this op needs to share a name with the Scalar -> Tensor conversion
+    // because all _to_tensor conversion have to have the same operator namet
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.bool(bool a) -> Tensor"),
+        [](Stack* stack) {
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          bool b;
+          pop(stack, b);
+          push(stack, at::scalar_to_tensor(b));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::device(str a) -> Device"),
+        [](Stack* stack) {
+          push(stack, c10::Device(pop(stack).toStringRef()));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::percentFormat(str self, ...) -> str"),
+        [](Stack* stack) {
+          size_t num_inputs = pop(stack).toInt();
+          percentFormat(*stack, num_inputs);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::to.prim_other(Tensor(a) self, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"),
+        [](Stack* stack) {
+          at::Tensor self;
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          bool non_blocking;
+          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+          bool copy;
+          pop(stack, self, non_blocking, copy);
+          c10::optional<c10::Device> device = c10::nullopt;
+          c10::optional<at::ScalarType> scalarType = c10::nullopt;
+          push(
+              stack, to_dispatch(self, device, scalarType, non_blocking, copy));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::requires_grad(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.requires_grad());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::grad(Tensor a) -> Tensor(*)"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.grad());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::is_sparse(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_sparse());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::is_sparse_csr(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_sparse_csr());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::is_mkldnn(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_mkldnn());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::is_mlc(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_mlc());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::is_vulkan(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_vulkan());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::is_quantized(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_quantized());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::is_meta(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_meta());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::is_ort(Tensor a) -> bool"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_ort());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::name(Tensor a) -> str?"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          if (a.name() == "") {
+            push(stack, IValue());
+          } else {
+            push(stack, a.name());
+          }
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::index(Device self) -> int?"),
+        [](Stack* stack) {
+          auto d = pop(stack).toDevice();
+          if (d.has_index()) {
+            push(stack, d.index());
+          } else {
+            push(stack, IValue());
+          }
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        // TODO return generator object when torchscript supports RNG
+        // first-class
+        TORCH_SELECTIVE_SCHEMA("aten::manual_seed(int seed) -> ()"),
+        [](Stack* stack) { at::manual_seed(pop(stack).toInt()); },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("aten::cuda(Tensor(a) self) -> Tensor(a|b)"),
+        [](Stack* stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.cuda());
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::AutogradZero() -> Tensor"),
+        [](Stack* stack) { stack->emplace_back(at::Tensor()); },
+        aliasAnalysisSpecialCase()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "prim::ReductionSizes(int[] size, int[] red_axes, bool keepdim = False) -> int[]"),
+        [](Stack* stack) {
+          bool keepdim = pop(stack).toBool();
+          c10::List<int64_t> axes = pop(stack).toIntList();
+          c10::List<int64_t> size = pop(stack).toIntList();
+          if (keepdim) {
+            for (const auto& axis : axes) {
+              size.set(axis, 1);
+            }
+          } else {
+            int64_t index = 0;
+            auto iter = size.begin();
+            std::sort(axes.begin(), axes.end());
+            for (const auto& axis : axes) {
+              // move iter to the next axis
+              iter += axis - index;
+
+              // input iter points to axis and is updated to axis + 1
+              iter = size.erase(iter);
+
+              // update current index for iter
+              index = axis + 1;
+            }
+          }
+          push(stack, IValue(std::move(size)));
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::BroadcastSizes(...) -> int[]"),
+        [](Stack* stack) {
+          auto num_inputs = pop(stack).toInt();
+          std::vector<int64_t> size;
+          size.reserve(8);
+          for (const auto i : c10::irange(num_inputs)) {
+            size =
+                at::infer_size(size, peek(stack, i, num_inputs).toIntVector());
+          }
+          drop(stack, num_inputs);
+          push(stack, IValue(size));
+        },
+        aliasAnalysisSpecialCase()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::warn(str message, int stacklevel=2) -> ()"),
+        [](Stack* stack) {
+          TORCH_CHECK(false, "warn is implemented directly in the interpreter");
+        },
+        aliasAnalysisFromSchema()),
+
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "onnx::Reshape(Tensor input, Tensor shape) -> Tensor"),
+        [](Stack* stack) {
+          at::Tensor input, shape;
+          pop(stack, input, shape);
+          shape = shape.contiguous();
+          AT_ASSERT(shape.ndimension() == 1);
+          at::IntArrayRef shape_list(shape.data_ptr<int64_t>(), shape.size(0));
+          push(stack, input.reshape(shape_list));
+        },
+        aliasAnalysisSpecialCase()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("onnx::Shape(Tensor t) -> Tensor"),
+        [](Stack* stack) {
+          auto t = pop(stack).toTensor();
+          at::IntArrayRef sizes = t.sizes();
+          auto sizes_tensor = torch::empty(
+              {static_cast<int64_t>(sizes.size())}, at::dtype(at::kLong));
+          auto accessor = sizes_tensor.accessor<int64_t, 1>();
+          for (const auto i : c10::irange(sizes.size())) {
+            accessor[i] = sizes[i];
+          }
+          stack->emplace_back(sizes_tensor);
+        },
+        aliasAnalysisSpecialCase()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::AutogradAnyNonZero(...) -> bool"),
+        [](Stack* stack) {
+          auto num_inputs = pop(stack).toInt();
+          bool result = false;
+          for (const IValue& v : last(stack, num_inputs)) {
+            if (v.isTensor()) {
+              if (v.toTensor().defined()) {
+                result = true;
+                break;
+              }
+            } else if (v.isTensorList()) {
+              for (const at::Tensor& t : v.toTensorVector()) {
+                if (t.defined()) {
+                  result = true;
+                }
+              }
+              if (result) {
+                break;
+              }
+            } else {
+              TORCH_INTERNAL_ASSERT(false);
+            }
+          }
+          drop(stack, num_inputs);
+          stack->emplace_back(result);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::AutogradAllZero(...) -> bool"),
+        [](Stack* stack) {
+          auto num_inputs = pop(stack).toInt();
+          bool result = true;
+          for (const IValue& v : last(stack, num_inputs)) {
+            TORCH_INTERNAL_ASSERT(v.isTensor());
+            if (v.toTensor().defined()) {
+              result = false;
+              break;
+            }
+          }
+          drop(stack, num_inputs);
+          stack->emplace_back(result);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::AutogradAllNonZero(...) -> bool"),
+        [](Stack* stack) {
+          auto num_inputs = pop(stack).toInt();
+          bool result = true;
+          for (const IValue& v : last(stack, num_inputs)) {
+            TORCH_INTERNAL_ASSERT(v.isTensor());
+            if (!v.toTensor().defined()) {
+              result = false;
+              break;
+            }
+          }
+          drop(stack, num_inputs);
+          stack->emplace_back(result);
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA("prim::AutogradAdd(Any a, Any b) -> Any"),
+        [](Stack* stack) {
+          at::Tensor a, b;
+          pop(stack, a, b);
+          // NOLINTNEXTLINE(bugprone-branch-clone)
+          if (!a.defined() && !b.defined()) {
+            // undef + undef == undef
+            stack->emplace_back(a);
+          } else if (!a.defined()) {
+            stack->emplace_back(b);
+          } else if (!b.defined()) {
+            stack->emplace_back(a);
+          } else {
+            stack->emplace_back(a + b);
+          }
+        },
+        aliasAnalysisSpecialCase()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::_size_if_not_equal(int[] self_size, int[] other_size) -> int[]?"),
+        [](Stack* stack) {
+          IValue self_size, other_size;
+          pop(stack, self_size, other_size);
+          auto s = self_size.toIntVector();
+          auto o = other_size.toIntVector();
+          if (s == o) {
+            push(stack, IValue());
+          } else {
+            push(stack, s);
+          }
+        },
+        aliasAnalysisFromSchema()),
+    OperatorGeneratorArgs(
+        TORCH_SELECTIVE_SCHEMA(
+            "aten::_unwrap_optional(t(a)? optional) -> t(a)"),
+        [](Stack* stack) {
+          auto val = pop(stack);
+          TORCH_CHECK(!val.isNone(), "Unwrapping null optional");
+          push(stack, std::move(val));
+        },
+        aliasAnalysisFromSchema())};
+
 RegisterOperators reg1(
-    {OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::rangelist(int n) -> int[]"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           int64_t n;
-           pop(stack, n);
-           c10::List<int64_t> elems;
-           elems.reserve(n);
-           for (const auto i : c10::irange(n)) {
-             elems.push_back(i);
-           }
-           push(stack, std::move(elems));
-         },
-         aliasAnalysisFromSchema()),
-     // note: this op needs to share a name with the Scalar -> Tensor conversion
-     // because all _to_tensor conversion have to have the same operator namet
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.bool(bool a) -> Tensor"),
-         [](Stack* stack) {
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           bool b;
-           pop(stack, b);
-           push(stack, at::scalar_to_tensor(b));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::device(str a) -> Device"),
-         [](Stack* stack) {
-           push(stack, c10::Device(pop(stack).toStringRef()));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::percentFormat(str self, ...) -> str"),
-         [](Stack* stack) {
-           size_t num_inputs = pop(stack).toInt();
-           percentFormat(*stack, num_inputs);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::to.prim_other(Tensor(a) self, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"),
-         [](Stack* stack) {
-           at::Tensor self;
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           bool non_blocking;
-           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-           bool copy;
-           pop(stack, self, non_blocking, copy);
-           c10::optional<c10::Device> device = c10::nullopt;
-           c10::optional<at::ScalarType> scalarType = c10::nullopt;
-           push(
-               stack,
-               to_dispatch(self, device, scalarType, non_blocking, copy));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::requires_grad(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.requires_grad());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::grad(Tensor a) -> Tensor(*)"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.grad());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::is_sparse(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_sparse());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::is_sparse_csr(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_sparse_csr());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::is_mkldnn(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_mkldnn());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::is_mlc(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_mlc());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::is_vulkan(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_vulkan());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::is_quantized(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_quantized());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::is_meta(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_meta());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::is_ort(Tensor a) -> bool"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.is_ort());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::name(Tensor a) -> str?"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           if (a.name() == "") {
-             push(stack, IValue());
-           } else {
-             push(stack, a.name());
-           }
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::index(Device self) -> int?"),
-         [](Stack* stack) {
-           auto d = pop(stack).toDevice();
-           if (d.has_index()) {
-             push(stack, d.index());
-           } else {
-             push(stack, IValue());
-           }
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         // TODO return generator object when torchscript supports RNG
-         // first-class
-         TORCH_SELECTIVE_SCHEMA("aten::manual_seed(int seed) -> ()"),
-         [](Stack* stack) { at::manual_seed(pop(stack).toInt()); },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("aten::cuda(Tensor(a) self) -> Tensor(a|b)"),
-         [](Stack* stack) {
-           at::Tensor a;
-           pop(stack, a);
-           push(stack, a.cuda());
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::AutogradZero() -> Tensor"),
-         [](Stack* stack) { stack->emplace_back(at::Tensor()); },
-         aliasAnalysisSpecialCase()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "prim::ReductionSizes(int[] size, int[] red_axes, bool keepdim = False) -> int[]"),
-         [](Stack* stack) {
-           bool keepdim = pop(stack).toBool();
-           c10::List<int64_t> axes = pop(stack).toIntList();
-           c10::List<int64_t> size = pop(stack).toIntList();
-           if (keepdim) {
-             for (const auto& axis : axes) {
-               size.set(axis, 1);
-             }
-           } else {
-             int64_t index = 0;
-             auto iter = size.begin();
-             std::sort(axes.begin(), axes.end());
-             for (const auto& axis : axes) {
-               // move iter to the next axis
-               iter += axis - index;
-
-               // input iter points to axis and is updated to axis + 1
-               iter = size.erase(iter);
-
-               // update current index for iter
-               index = axis + 1;
-             }
-           }
-           push(stack, IValue(std::move(size)));
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::BroadcastSizes(...) -> int[]"),
-         [](Stack* stack) {
-           auto num_inputs = pop(stack).toInt();
-           std::vector<int64_t> size;
-           size.reserve(8);
-           for (const auto i : c10::irange(num_inputs)) {
-             size =
-                 at::infer_size(size, peek(stack, i, num_inputs).toIntVector());
-           }
-           drop(stack, num_inputs);
-           push(stack, IValue(size));
-         },
-         aliasAnalysisSpecialCase()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::warn(str message, int stacklevel=2) -> ()"),
-         [](Stack* stack) {
-           TORCH_CHECK(
-               false, "warn is implemented directly in the interpreter");
-         },
-         aliasAnalysisFromSchema()),
-
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "onnx::Reshape(Tensor input, Tensor shape) -> Tensor"),
-         [](Stack* stack) {
-           at::Tensor input, shape;
-           pop(stack, input, shape);
-           shape = shape.contiguous();
-           AT_ASSERT(shape.ndimension() == 1);
-           at::IntArrayRef shape_list(shape.data_ptr<int64_t>(), shape.size(0));
-           push(stack, input.reshape(shape_list));
-         },
-         aliasAnalysisSpecialCase()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("onnx::Shape(Tensor t) -> Tensor"),
-         [](Stack* stack) {
-           auto t = pop(stack).toTensor();
-           at::IntArrayRef sizes = t.sizes();
-           auto sizes_tensor = torch::empty(
-               {static_cast<int64_t>(sizes.size())}, at::dtype(at::kLong));
-           auto accessor = sizes_tensor.accessor<int64_t, 1>();
-           for (const auto i : c10::irange(sizes.size())) {
-             accessor[i] = sizes[i];
-           }
-           stack->emplace_back(sizes_tensor);
-         },
-         aliasAnalysisSpecialCase()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::AutogradAnyNonZero(...) -> bool"),
-         [](Stack* stack) {
-           auto num_inputs = pop(stack).toInt();
-           bool result = false;
-           for (const IValue& v : last(stack, num_inputs)) {
-             if (v.isTensor()) {
-               if (v.toTensor().defined()) {
-                 result = true;
-                 break;
-               }
-             } else if (v.isTensorList()) {
-               for (const at::Tensor& t : v.toTensorVector()) {
-                 if (t.defined()) {
-                   result = true;
-                 }
-               }
-               if (result) {
-                 break;
-               }
-             } else {
-               TORCH_INTERNAL_ASSERT(false);
-             }
-           }
-           drop(stack, num_inputs);
-           stack->emplace_back(result);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::AutogradAllZero(...) -> bool"),
-         [](Stack* stack) {
-           auto num_inputs = pop(stack).toInt();
-           bool result = true;
-           for (const IValue& v : last(stack, num_inputs)) {
-             TORCH_INTERNAL_ASSERT(v.isTensor());
-             if (v.toTensor().defined()) {
-               result = false;
-               break;
-             }
-           }
-           drop(stack, num_inputs);
-           stack->emplace_back(result);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::AutogradAllNonZero(...) -> bool"),
-         [](Stack* stack) {
-           auto num_inputs = pop(stack).toInt();
-           bool result = true;
-           for (const IValue& v : last(stack, num_inputs)) {
-             TORCH_INTERNAL_ASSERT(v.isTensor());
-             if (!v.toTensor().defined()) {
-               result = false;
-               break;
-             }
-           }
-           drop(stack, num_inputs);
-           stack->emplace_back(result);
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA("prim::AutogradAdd(Any a, Any b) -> Any"),
-         [](Stack* stack) {
-           at::Tensor a, b;
-           pop(stack, a, b);
-           // NOLINTNEXTLINE(bugprone-branch-clone)
-           if (!a.defined() && !b.defined()) {
-             // undef + undef == undef
-             stack->emplace_back(a);
-           } else if (!a.defined()) {
-             stack->emplace_back(b);
-           } else if (!b.defined()) {
-             stack->emplace_back(a);
-           } else {
-             stack->emplace_back(a + b);
-           }
-         },
-         aliasAnalysisSpecialCase()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::_size_if_not_equal(int[] self_size, int[] other_size) -> int[]?"),
-         [](Stack* stack) {
-           IValue self_size, other_size;
-           pop(stack, self_size, other_size);
-           auto s = self_size.toIntVector();
-           auto o = other_size.toIntVector();
-           if (s == o) {
-             push(stack, IValue());
-           } else {
-             push(stack, s);
-           }
-         },
-         aliasAnalysisFromSchema()),
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "aten::_unwrap_optional(t(a)? optional) -> t(a)"),
-         [](Stack* stack) {
-           auto val = pop(stack);
-           TORCH_CHECK(!val.isNone(), "Unwrapping null optional");
-           push(stack, std::move(val));
-         },
-         aliasAnalysisFromSchema())});
+    createOperators(opGenArgs1, sizeof(opGenArgs1) / sizeof(opGenArgs1[0])));
 
 void hashValue(Stack* stack) {
   auto value = pop(stack);
   push(stack, value.hash());
 }
 
-RegisterOperators reg2({
+static const OperatorGeneratorArgs opGenArgs2[] = {
     // registered as Any[] so that heterogenous tuples can be called with len()
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::len.any(Any[] a) -> int"),
         listLen,
         aliasAnalysisFromSchema()),
 
 // these ops have a specialized implementation for the list element type
 #define CREATE_SPECIALIZED_LIST_OPS(decl_type, value_type) \
-  OperatorGenerator(                                       \
+  OperatorGeneratorArgs(                                   \
       TORCH_SELECTIVE_SCHEMA(                              \
           "aten::remove." decl_type "(" decl_type          \
           "[](a!) self,                                                           \
         " decl_type " el) -> ()"),                         \
       listRemove<value_type>,                              \
       aliasAnalysisFromSchema()),                          \
-      OperatorGenerator(                                   \
+      OperatorGeneratorArgs(                               \
           TORCH_SELECTIVE_SCHEMA(                          \
               "aten::index.list_" decl_type "(" decl_type  \
               "[] self,                                                               \
         " decl_type " el) -> int"),                        \
           listIndex<value_type>,                           \
           aliasAnalysisFromSchema()),                      \
-      OperatorGenerator(                                   \
+      OperatorGeneratorArgs(                               \
           TORCH_SELECTIVE_SCHEMA(                          \
               "aten::count." decl_type "(" decl_type       \
               "[] self,                                                               \
@@ -2500,100 +2523,100 @@ RegisterOperators reg2({
 
     // `listContains<T>` is not implemented for non-primitive types
     // TODO: Add List[bool] once .to<c10::List<bool>> doesn't throw an error
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::__contains__.float_list(float[] l, float item) -> bool"),
         listContains<double>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::sort.int(int[](a!) self, bool reverse=False) -> ()"),
         listSort<int64_t>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::sort.float(float[](a!) self, bool reverse=False) -> ()"),
         listSort<double>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::sort.Tensor(Tensor[](a!) self, bool reverse=False) -> ()"),
         listSort<at::Tensor>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::sort.bool(bool[](a!) self, bool reverse=False) -> ()"),
         listSort<bool>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::sort.str(str[](a!) self, bool reverse=False) -> ()"),
         listSort<std::string>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::sorted.int(int[](a) input) -> (int[])"),
         listCopyAndSort<int64_t>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::sorted.float(float[](a) input) -> (float[])"),
         listCopyAndSort<double>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::sorted.Tensor(Tensor[](a) input) -> (Tensor[])"),
         listCopyAndSort<at::Tensor>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::sorted.bool(bool[](a) input) -> (bool[])"),
         listCopyAndSort<bool>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::sorted.str(str[](a) input) -> (str[])"),
         listCopyAndSort<std::string>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::eq.float_list(float[] a, float[] b) -> bool"),
         listEq<double>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::eq.Tensor_list(Tensor[] a, Tensor[] b) -> bool"),
         listEq<at::Tensor>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::eq.bool_list(bool[] a, bool[] b) -> bool"),
         listEq<bool>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::eq.str_list(str[] a, str[] b) -> bool"),
         listEq<std::string>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::ne.float_list(float[] a, float[] b) -> bool"),
         listNe<double>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::ne.Tensor_list(Tensor[] a, Tensor[] b) -> bool"),
         listNe<at::Tensor>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::ne.bool_list(bool[] a, bool[] b) -> bool"),
         listNe<bool>,
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::ne.str_list(str[] a, str[] b) -> bool"),
         listNe<std::string>,
         aliasAnalysisFromSchema()),
 
 #define DEFINE_CONVERT_BASE_OP(op_name, prefix, char_op) \
-  OperatorGenerator(                                     \
+  OperatorGeneratorArgs(                                 \
       TORCH_SELECTIVE_SCHEMA(#op_name "(int i) -> str"), \
       [](Stack* stack) {                                 \
         auto i = pop(stack).toInt();                     \
@@ -2610,7 +2633,7 @@ RegisterOperators reg2({
     DEFINE_CONVERT_BASE_OP(aten::hex, "x", std::hex),
     DEFINE_CONVERT_BASE_OP(aten::oct, "o", std::oct),
 
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::bin(int i) -> str"),
         [](Stack* stack) {
           auto i = pop(stack).toInt();
@@ -2630,7 +2653,7 @@ RegisterOperators reg2({
         },
         aliasAnalysisFromSchema()),
     // TODO: deprecate this in favor of aten::getelem
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "prim::StringIndex(str string, int index) -> str"),
         [](Stack* stack) {
@@ -2641,7 +2664,7 @@ RegisterOperators reg2({
           push(stack, std::string(&c, 1));
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::chr(int i) -> str"),
         [](Stack* stack) {
           auto i = pop(stack).toInt();
@@ -2659,7 +2682,7 @@ RegisterOperators reg2({
     // only used in loop unrolling, not exposed to end users
     DEFINE_INT_OP(aten::__round_to_zero_floordiv, a / b),
 
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::modf(float a) -> (float, float)"),
         [](Stack* stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -2671,7 +2694,7 @@ RegisterOperators reg2({
           push(stack, b, c);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::frexp(float a) -> (float, int)"),
         [](Stack* stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -2685,7 +2708,7 @@ RegisterOperators reg2({
           push(stack, m, e);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::ldexp(float x, int i) -> float"),
         [](Stack* stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -2785,7 +2808,7 @@ RegisterOperators reg2({
         float,
         float,
         float),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::abs(Tensor x) -> Tensor"),
         [](Stack* stack) {
           at::Tensor x;
@@ -2808,7 +2831,7 @@ RegisterOperators reg2({
         std::copysign(a, b),
         std::copysign(a, b),
         float),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::_tensor_to_list(Tensor self) -> int[]"),
         [](Stack* stack) {
           at::Tensor t;
@@ -2821,7 +2844,7 @@ RegisterOperators reg2({
           push(stack, std::move(elems));
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::_list_to_tensor(int[] self) -> Tensor"),
         [](Stack* stack) {
           c10::List<int64_t> l = pop(stack).toIntList();
@@ -2833,7 +2856,7 @@ RegisterOperators reg2({
           push(stack, std::move(t));
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::sum.int(int[] self) -> int"),
         [](Stack* stack) {
           c10::List<int64_t> l = pop(stack).toIntList();
@@ -2844,7 +2867,7 @@ RegisterOperators reg2({
           push(stack, sum);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::sum.float(float[] self) -> float"),
         [](Stack* stack) {
           c10::List<double> l = pop(stack).toDoubleList();
@@ -2855,7 +2878,7 @@ RegisterOperators reg2({
           push(stack, sum);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::sum.complex(complex[] self) -> complex"),
         [](Stack* stack) {
           c10::List<c10::complex<double>> l = pop(stack).toComplexDoubleList();
@@ -2866,7 +2889,7 @@ RegisterOperators reg2({
           push(stack, sum);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::sum.bool(bool[] self) -> int"),
         [](Stack* stack) {
           c10::List<bool> l = pop(stack).toBoolList();
@@ -2879,7 +2902,7 @@ RegisterOperators reg2({
           push(stack, sum);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::any.str(str[] self) -> bool"),
         [](Stack* stack) {
           auto l = pop(stack).toList();
@@ -2892,7 +2915,7 @@ RegisterOperators reg2({
           push(stack, false);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::any.int(int[] self) -> bool"),
         [](Stack* stack) {
           c10::List<int64_t> l = pop(stack).toIntList();
@@ -2905,7 +2928,7 @@ RegisterOperators reg2({
           push(stack, false);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::any.float(float[] self) -> bool"),
         [](Stack* stack) {
           c10::List<double> l = pop(stack).toDoubleList();
@@ -2918,7 +2941,7 @@ RegisterOperators reg2({
           push(stack, false);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::any.bool(bool[] self) -> bool"),
         [](Stack* stack) {
           c10::List<bool> l = pop(stack).toBoolList();
@@ -2931,7 +2954,7 @@ RegisterOperators reg2({
           push(stack, false);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::all.int(int[] self) -> bool"),
         [](Stack* stack) {
           c10::List<int64_t> l = pop(stack).toIntList();
@@ -2944,7 +2967,7 @@ RegisterOperators reg2({
           push(stack, true);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::all.float(float[] self) -> bool"),
         [](Stack* stack) {
           c10::List<double> l = pop(stack).toDoubleList();
@@ -2957,7 +2980,7 @@ RegisterOperators reg2({
           push(stack, true);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::all.bool(bool[] self) -> bool"),
         [](Stack* stack) {
           c10::List<bool> l = pop(stack).toBoolList();
@@ -2970,7 +2993,7 @@ RegisterOperators reg2({
           push(stack, true);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::divmod.int(int x, int y) -> (int, int)"),
         [](Stack* stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -2992,7 +3015,7 @@ RegisterOperators reg2({
               static_cast<int64_t>(divresult.rem));
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::divmod.float(float x, float y) -> (float, float)"),
         [](Stack* stack) {
@@ -3010,7 +3033,7 @@ RegisterOperators reg2({
           push(stack, (a - rem) / b, rem);
         },
         aliasAnalysisFromSchema()),
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::id(AnyClassType? x) -> int"),
         [](Stack* stack) {
           IValue a;
@@ -3024,7 +3047,7 @@ RegisterOperators reg2({
         aliasAnalysisFromSchema()),
 
 #define DEFINE_DIVMOD_MIXED_OP(type_a, type_b)                               \
-  OperatorGenerator(                                                         \
+  OperatorGeneratorArgs(                                                     \
       TORCH_SELECTIVE_SCHEMA("aten::divmod." #type_a "_" #type_b "(" #type_a \
                              " x," #type_b " y) -> (float, float)"),         \
       [](Stack* stack) {                                                     \
@@ -3044,13 +3067,13 @@ RegisterOperators reg2({
     DEFINE_DIVMOD_MIXED_OP(float, int),
 
 #undef DEFINE_DIVMOD_MIXED_OP
-    OperatorGenerator(
+    OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::hash.generic(t value) -> int"),
         hashValue,
         aliasAnalysisFromSchema()),
 
 #define DEFINE_COMPLEX_OP(type_a, type_b, actual_type_a, actual_type_b)       \
-  OperatorGenerator(                                                          \
+  OperatorGeneratorArgs(                                                      \
       TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_a "_" #type_b "(" #type_a \
                              " x," #type_b " y) -> complex"),                 \
       [](Stack* stack) {                                                      \
@@ -3064,7 +3087,7 @@ RegisterOperators reg2({
 
 #define DEFINE_COMPLEX_OP_WITH_TENSOR_ARG(                                    \
     type_a, type_b, actual_type_a, actual_type_b)                             \
-  OperatorGenerator(                                                          \
+  OperatorGeneratorArgs(                                                      \
       TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_a "_" #type_b "(" #type_a \
                              " x," #type_b " y) -> complex"),                 \
       [](Stack* stack) {                                                      \
@@ -3075,7 +3098,7 @@ RegisterOperators reg2({
         push(stack, comp);                                                    \
       },                                                                      \
       aliasAnalysisFromSchema()),                                             \
-      OperatorGenerator(                                                      \
+      OperatorGeneratorArgs(                                                  \
           TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_b "_" #type_a         \
                                  "(" #type_b " x," #type_a " y) -> complex"), \
           [](Stack* stack) {                                                  \
@@ -3099,7 +3122,10 @@ RegisterOperators reg2({
     DEFINE_COMPLEX_OP_WITH_TENSOR_ARG(Tensor, float, at::Tensor, double),
     DEFINE_COMPLEX_OP_WITH_TENSOR_ARG(Tensor, int, at::Tensor, int),
     DEFINE_COMPLEX_OP_WITH_TENSOR_ARG(Tensor, bool, at::Tensor, bool),
-});
+};
+
+RegisterOperators reg2(
+    createOperators(opGenArgs2, sizeof(opGenArgs2) / sizeof(opGenArgs2[0])));
 
 } // namespace
 } // namespace jit

From ff4569ae2939c3e81092fdf43c9d5f2f08453c42 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Fri, 27 Aug 2021 13:21:04 -0700
Subject: [PATCH 308/530] Sparse CUDA: rename files *.cu -> *.cpp (#63894)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63894

This PR introduces a few code structure changes. There is no need to use
.cu extension for pure c++ code without cuda. Moved
`s_addmm_out_csr_sparse_dense_cuda_worker` to a separate cpp file from
cu file.

cc nikitaved pearu cpuhrsch IvanYashchuk ngimel

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D30548771

Pulled By: cpuhrsch

fbshipit-source-id: 6f12d36e7e506d2fdbd57ef33eb73192177cd904
---
 aten/src/ATen/native/sparse/cuda/SoftMax.cu   |  2 +-
 .../native/sparse/cuda/SparseBlasLegacy.cpp   | 74 +++++++++++++++++++
 .../native/sparse/cuda/SparseBlasLegacy.h     | 18 +++++
 .../{SparseCUDABlas.cu => SparseCUDABlas.cpp} |  4 +-
 .../{SparseCUDABlas.cuh => SparseCUDABlas.h}  |  0
 .../sparse/cuda/SparseCUDATensorMath.cu       | 61 +--------------
 .../sparse/cuda/SparseCUDATensorMath.cuh      |  2 -
 .../native/sparse/cuda/SparseCsrTensorMath.cu |  3 +-
 .../ATen/native/sparse/cuda/SparseMatMul.cu   |  2 +-
 9 files changed, 100 insertions(+), 66 deletions(-)
 create mode 100644 aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp
 create mode 100644 aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.h
 rename aten/src/ATen/native/sparse/cuda/{SparseCUDABlas.cu => SparseCUDABlas.cpp} (99%)
 rename aten/src/ATen/native/sparse/cuda/{SparseCUDABlas.cuh => SparseCUDABlas.h} (100%)

diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu
index d5bc66b7fb23b..c55ea3b540b5a 100644
--- a/aten/src/ATen/native/sparse/cuda/SoftMax.cu
+++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu
@@ -11,7 +11,7 @@
 #include <ATen/cuda/CUDAApplyUtils.cuh>
 #include <ATen/cuda/detail/IndexUtils.cuh>
 #include <ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh>
-#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
+#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
 
 #include <THC/THCThrustAllocator.cuh>
 
diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp b/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp
new file mode 100644
index 0000000000000..b13e7fe595d8f
--- /dev/null
+++ b/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp
@@ -0,0 +1,74 @@
+/*
+Functions here use deprecated cuSPARSE API that was removed in CUDA 11.
+This file will be removed eventually.
+*/
+#include <ATen/Dispatch.h>
+#include <ATen/SparseTensorUtils.h>
+#include <ATen/native/LinearAlgebraUtils.h>
+#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
+#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
+
+namespace at {
+namespace native {
+
+void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, const Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, const Tensor& dense) {
+  TORCH_INTERNAL_ASSERT(nnz > 0);
+
+  // No half support, so we don't have to use CUDATypeConversion
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
+      values.scalar_type(), "addmm_sparse_cuda", [&] {
+        scalar_t cast_beta = beta.to<scalar_t>();
+        scalar_t cast_alpha = alpha.to<scalar_t>();
+        Tensor r__;
+        if (cast_beta == scalar_t(0)) {
+          r_.zero_();
+        } else if (!at::sparse::is_same_tensor(t, r_)) {
+          r_.copy_(t);
+        }
+        if (r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) {
+          r__ = r_;
+        } else {
+          // Note: This storage arrangement is preferred due to most of the CUDA kernels handle only contiguous tensors
+          r__ = r_.transpose(0, 1).clone(at::MemoryFormat::Contiguous);
+          r__.transpose_(0, 1);
+        }
+        TORCH_INTERNAL_ASSERT(r__.transpose(-1, -2).is_contiguous());
+        Tensor dense_;
+        char transpose_dense;
+        if (dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) {
+          transpose_dense = 'n';
+          dense_ = dense;
+        } else if (dense.stride(1) == 1 && dense.stride(0) == dense.size(1)) {
+          transpose_dense = 't';
+          dense_ = dense;
+        } else {
+          transpose_dense = 't';
+          dense_ = dense.contiguous();
+        }
+
+        sparse::cuda::csrmm2(
+          'n',
+          transpose_dense,
+          m,
+          n,
+          k,
+          nnz,
+          cast_alpha,
+          values.data_ptr<scalar_t>(),
+          crow_indices.data_ptr<int32_t>(),
+          col_indices.data_ptr<int32_t>(),
+          dense_.data_ptr<scalar_t>(),
+          (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)),
+          cast_beta,
+          r__.data_ptr<scalar_t>(),
+          r__.stride(1));
+
+        if (!at::sparse::is_same_tensor(r__, r_)) {
+          r_.copy_(r__);
+        }
+      }
+    );
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.h b/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.h
new file mode 100644
index 0000000000000..67eaffb13a75c
--- /dev/null
+++ b/aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <ATen/Tensor.h>
+#include <ATen/core/Scalar.h>
+
+/*
+Functions here use deprecated cuSPARSE API that was removed in CUDA 11.
+Here only 32-bit indices sparse indices are supported.
+This file will be removed eventually.
+*/
+
+namespace at {
+namespace native {
+
+void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, const Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, const Tensor& dense);
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp
similarity index 99%
rename from aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu
rename to aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp
index dd03e2bfeacbe..db0088a084c6d 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp
@@ -2,7 +2,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/util/Exception.h>
 #include <ATen/cuda/Exceptions.h>
-#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
+#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 
 #include <cusparse.h>
@@ -14,7 +14,7 @@
 // Using these APIs in any other systems will result in compile-time or run-time failures.
 // Their support will be extended in the next releases.
 
-#if defined(__CUDACC__) && (CUSPARSE_VERSION >= 11000 || (!defined(_MSC_VER) && CUSPARSE_VERSION >= 10301))
+#if defined(CUDART_VERSION) && (CUSPARSE_VERSION >= 11000 || (!defined(_MSC_VER) && CUSPARSE_VERSION >= 10301))
 #define IS_SPMM_AVAILABLE() 1
 #else
 #define IS_SPMM_AVAILABLE() 0
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.h
similarity index 100%
rename from aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh
rename to aten/src/ATen/native/sparse/cuda/SparseCUDABlas.h
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
index 511e69ef4b408..0331f5e4d932e 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
@@ -5,8 +5,9 @@
 #include <ATen/NativeFunctions.h>
 #include <ATen/SparseTensorUtils.h>
 #include <ATen/native/sparse/SparseTensorMath.h>
+#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
 #include <ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh>
-#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
+#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
 #include <ATen/cuda/CUDAApplyUtils.cuh>
 #include <ATen/cuda/CUDAUtils.h>
 #include <ATen/cuda/detail/IndexUtils.cuh>
@@ -50,64 +51,6 @@ namespace {
   }
 }
 
-void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& crow_indices, Tensor& col_indices, Tensor& values, const Tensor& dense) {
-  TORCH_INTERNAL_ASSERT(nnz > 0);
-
-  // No half support, so we don't have to use CUDATypeConversion
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
-      values.scalar_type(), "addmm_sparse_cuda", [&] {
-        scalar_t cast_beta = beta.to<scalar_t>();
-        scalar_t cast_alpha = alpha.to<scalar_t>();
-        Tensor r__;
-        if (cast_beta == scalar_t(0)) {
-          r_.zero_();
-        } else if (!is_same_tensor(t, r_)) {
-          r_.copy_(t);
-        }
-        if(r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) {
-          r__ = r_;
-        } else {
-          // Note: This storage arrangement is preferred due to most of the CUDA kernels handle only contiguous tensors
-          r__ = r_.transpose(0, 1).clone(at::MemoryFormat::Contiguous);
-          r__.transpose_(0, 1);
-        }
-        Tensor dense_;
-        char transpose_dense;
-        if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) {
-          transpose_dense = 'n';
-          dense_ = dense;
-        } else if(dense.stride(1) == 1 && dense.stride(0) == dense.size(1)) {
-          transpose_dense = 't';
-          dense_ = dense;
-        } else {
-          transpose_dense = 't';
-          dense_ = dense.contiguous();
-        }
-
-        sparse::cuda::csrmm2(
-          'n',
-          transpose_dense,
-          m,
-          n,
-          k,
-          nnz,
-          cast_alpha,
-          values.data_ptr<scalar_t>(),
-          crow_indices.data_ptr<int32_t>(),
-          col_indices.data_ptr<int32_t>(),
-          dense_.data_ptr<scalar_t>(),
-          (transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)),
-          cast_beta,
-          r__.data_ptr<scalar_t>(),
-          r__.stride(1));
-
-        if (!is_same_tensor(r__, r_)) {
-          r_.copy_(r__);
-        }
-      }
-    );
-}
-
 // NB: Deleted spaddcmul (aka addcmul_, but not actually wired up), spaddcdiv (not
 // wired at all)
 
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cuh
index 1a99e818e1bad..9448b2aa46b6c 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cuh
+++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cuh
@@ -6,8 +6,6 @@
 
 namespace at { namespace native {
 
-void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& crow_indices, Tensor& col_indices, Tensor& values, const Tensor& dense);
-
 void s_addmm_out_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& indices, Tensor& values, const Tensor& dense);
 
 }} // namespace at::native
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu
index ea765e076fb04..b21d892fcdf84 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu
@@ -19,7 +19,8 @@
 #include <ATen/cuda/CUDAUtils.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 
-#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
+#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
+#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
 #include <ATen/native/sparse/cuda/SparseCUDATensorMath.cuh>
 
 #include <thrust/device_ptr.h>
diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
index 2d041de6ea411..d5f31a1980bac 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
@@ -18,7 +18,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAUtils.h>
 #include <cusparse.h>
-#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
+#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 
 #include <thrust/device_vector.h>

From 18cb3fc91004ac9e551301748246aaa2a5a5dd04 Mon Sep 17 00:00:00 2001
From: Patrick Hu <patrickhu@fb.com>
Date: Fri, 27 Aug 2021 13:37:38 -0700
Subject: [PATCH 309/530] [FX] Validate data type of target on Node
 Construction (#64050)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64050

Test Plan: Imported from OSS

Reviewed By: jamesr66a

Differential Revision: D30585535

Pulled By: yqhu

fbshipit-source-id: 96778a87e75f510b4ef42f0e5cf76b35b7b2f331
---
 test/test_fx.py   |  6 ++++++
 torch/fx/graph.py |  9 ++++++++-
 torch/fx/node.py  | 14 ++++++++++----
 3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/test/test_fx.py b/test/test_fx.py
index 27f64e1cd1827..47873d7ef9b41 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -1280,6 +1280,12 @@ def test_wrong_topo(self):
         with self.assertRaisesRegex(RuntimeError, 'was used before it has been defined'):
             graph.lint()
 
+    def test_wrong_target_type(self):
+        graph : torch.fx.Graph = torch.fx.Graph()
+        with self.assertRaises(ValueError):
+            n = torch.fx.Node(graph=graph, name='foo', op='call_function', target='foo',
+                              args=(), kwargs={})
+
     def test_example_shape_prop(self):
         class TestCase(torch.nn.Module):
             def __init__(self):
diff --git a/torch/fx/graph.py b/torch/fx/graph.py
index 88c7b54a06ce4..1ee6f05f79809 100644
--- a/torch/fx/graph.py
+++ b/torch/fx/graph.py
@@ -1066,8 +1066,15 @@ def check_arg(arg : Node, n : Optional[Node] = None) -> None:
         # Check targets are legit
         if self.owning_module:
             for node in self.nodes:
+                if node.op == 'call_function':
+                    if not callable(node.target):
+                        raise ValueError(f'Node {node} target {node.target} has type {torch.typename(node.target)} but '
+                                         'a Callable is expected')
+                else:
+                    if not isinstance(node.target, str):
+                        raise ValueError(f'Node {node} target {node.target} has type {torch.typename(node.target)} but '
+                                         'a str is expected')
                 if node.op in ['get_attr', 'call_module']:
-                    assert isinstance(node.target, str)
                     target_atoms = node.target.split('.')
                     m_itr = self.owning_module
                     for i, atom in enumerate(target_atoms):
diff --git a/torch/fx/node.py b/torch/fx/node.py
index e00f25f47a2ee..8c4faf7d4fa27 100644
--- a/torch/fx/node.py
+++ b/torch/fx/node.py
@@ -114,13 +114,19 @@ class Node:
     """
     def __init__(self, graph: 'Graph', name: str, op: str, target: 'Target',
                  args: Tuple['Argument', ...], kwargs: Dict[str, 'Argument'],
-                 type : Optional[Any] = None) -> None:
+                 return_type : Optional[Any] = None) -> None:
         self.graph = graph
         self.name = name  # unique name of value being created
         assert op in ['placeholder', 'call_method', 'call_module', 'call_function', 'get_attr', 'output', 'root']
         self.op = op  # the kind of operation = placeholder|call_method|call_module|call_function|get_attr
-        if op in ['call_method', 'call_module']:
-            assert isinstance(target, str)
+        if op == 'call_function':
+            if not callable(target):
+                raise ValueError(f'Node [graph = {graph}, name = \'{name}\'] target {target} has type {torch.typename(target)} '
+                                 'but a Callable is expected')
+        else:
+            if not isinstance(target, str):
+                raise ValueError(f'Node [graph = {graph}, name = \'{name}\'] target {target} has type {torch.typename(target)} '
+                                 'but a str is expected')
         self.target = target  # for method/module/function, the name of the method/module/function/attr
         # being invoked, e.g add, layer1, or torch.add
 
@@ -146,7 +152,7 @@ def __init__(self, graph: 'Graph', name: str, op: str, target: 'Target',
         # generated function return type. (Note this is a special case. ``return``
         # does not produce a value, it's more of a notation. Thus, this value
         # describes the type of args[0] in the ``return`` node.
-        self.type : Optional[Any] = type
+        self.type : Optional[Any] = return_type
         self._prev = self
         self._next = self
         self._erased = False

From 358c46f99eea23ec86e4358a5d4253e4059e962c Mon Sep 17 00:00:00 2001
From: Vincent Phan <vincentphan@fb.com>
Date: Fri, 27 Aug 2021 13:51:38 -0700
Subject: [PATCH 310/530] Implement leaky relu op

Summary: Implemented leaky relu op as per: https://www.internalfb.com/tasks/?t=97492679

Test Plan:
buck build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 //xplat/caffe2:pt_vulkan_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_api_test
adb shell "/data/local/tmp/vulkan_api_test"

all tests pass, including new ones

Reviewed By: SS-JIA

Differential Revision: D30186225

fbshipit-source-id: fdb1f8f7b3a28b5504581822185c0475dcd53a3e
---
 .../ATen/native/vulkan/glsl/leaky_relu.glsl   |  28 +++++
 .../ATen/native/vulkan/glsl/leaky_relu_.glsl  |  27 ++++
 aten/src/ATen/native/vulkan/ops/Clamp.cpp     | 117 ++++++++++++++++++
 aten/src/ATen/test/vulkan_api_test.cpp        |  43 +++++++
 4 files changed, 215 insertions(+)
 create mode 100644 aten/src/ATen/native/vulkan/glsl/leaky_relu.glsl
 create mode 100644 aten/src/ATen/native/vulkan/glsl/leaky_relu_.glsl

diff --git a/aten/src/ATen/native/vulkan/glsl/leaky_relu.glsl b/aten/src/ATen/native/vulkan/glsl/leaky_relu.glsl
new file mode 100644
index 0000000000000..f947e78f1843d
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/leaky_relu.glsl
@@ -0,0 +1,28 @@
+#version 450 core
+#define PRECISION $precision
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0) uniform PRECISION restrict writeonly image3D   uOutput;
+layout(set = 0, binding = 1) uniform PRECISION                    sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict           Block {
+  ivec4 size;
+  float negative_slope;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (all(lessThan(pos, uBlock.size.xyz))) {
+    const vec4 inval = texelFetch(uInput, pos, 0);
+    const vec4 negative_values = vec4(lessThan(inval, vec4(0.0f)));
+    const vec4 positive_values = vec4(1.0) - negative_values;
+    const vec4 mask = negative_values * vec4(uBlock.negative_slope) + positive_values;
+    const vec4 outval = inval * mask;
+    imageStore(uOutput, pos, outval);
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/leaky_relu_.glsl b/aten/src/ATen/native/vulkan/glsl/leaky_relu_.glsl
new file mode 100644
index 0000000000000..345e66942c155
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/leaky_relu_.glsl
@@ -0,0 +1,27 @@
+#version 450 core
+#define PRECISION $precision
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput;
+layout(set = 0, binding = 1)          uniform PRECISION restrict Block {
+  ivec4 size;
+  float negative_slope;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (all(lessThan(pos, uBlock.size.xyz))) {
+    const vec4 inval = imageLoad(uOutput, pos);
+    const vec4 negative_values = vec4(lessThan(inval, vec4(0.0f)));
+    const vec4 positive_values = vec4(1.0) - negative_values;
+    const vec4 mask = negative_values * vec4(uBlock.negative_slope) + positive_values;
+    const vec4 outval = inval * mask;
+    imageStore(uOutput, pos, outval);
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp
index c6f046e84fd17..7982b0eda0d7a 100644
--- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp
@@ -404,6 +404,121 @@ Tensor& hardshrink_(
   return self;
 }
 
+Tensor leaky_relu(
+    const Tensor& self_arg,
+    const Scalar& negative_slope) {
+  api::Context* const context = api::context();
+
+  const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan();
+  const vTensor& v_self = convert(self);
+
+  vTensor v_output{
+    context,
+    v_self.sizes(),
+    v_self.options(),
+  };
+
+  api::Command::Pool& command_pool = context->command().pool;
+  api::Command::Buffer& command_buffer = command_pool.stream();
+  {
+    if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
+      const struct Block final {
+        uvec3 extents;
+        uint32_t _;
+        float negative_slope;
+      } block {
+        v_output.extents(),
+        0u,
+        negative_slope.to<float>(),
+      };
+
+      context->dispatch(
+          command_buffer,
+          {
+            VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+          },
+          VK_KERNEL(leaky_relu),
+          v_output.extents(),
+          context->gpu().adapter->local_work_group_size(),
+          // Write-only access bypasses synchronization but inserts appropriate
+          // barriers if necessary.
+          v_output.image(
+              command_buffer,
+              vTensor::Stage::Compute,
+              vTensor::Access::Write),
+          // Read-only access is implied on const tensors and triggers an async
+          // synchronization if necessary.
+          v_self.image(
+              command_buffer,
+              vTensor::Stage::Compute),
+          // Object lifetime is managed by the resource pool.
+          // It is OK not to keep track of the handle.
+          context->resource().pool.uniform(block).object);
+    }
+    else {
+      TORCH_CHECK(false, "Not implemented!");
+    }
+  }
+  command_pool.submit(context->gpu().queue, command_buffer);
+
+  return convert(v_output);
+}
+
+Tensor& leaky_relu_(
+    Tensor& self,
+    const Scalar& negative_slope) {
+  api::Context* const context = api::context();
+
+  TORCH_CHECK(
+      self.is_vulkan(),
+      "Vulkan: In-place leaky relu is only supported on Vulkan tensors.");
+
+  vTensor& v_self = convert(self);
+
+  api::Command::Pool& command_pool = context->command().pool;
+  api::Command::Buffer& command_buffer = command_pool.stream();
+  {
+    if C10_LIKELY(v_self.has_image()) {
+      const struct Block final {
+        uvec3 extents;
+        uint32_t _;
+        float negative_slope;
+      } block {
+        v_self.extents(),
+        0u,
+        negative_slope.to<float>(),
+      };
+
+      context->dispatch(
+          command_buffer,
+          {
+            VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+          },
+          VK_KERNEL(leaky_relu_),
+          v_self.extents(),
+          context->gpu().adapter->local_work_group_size(),
+          // Read-Write access triggers an async synchronization if necessory
+          // and inserts appropriate barriers if hazards are detected.
+          v_self.image(
+              command_buffer,
+              vTensor::Stage::Compute,
+              vTensor::Access::Read | vTensor::Access::Write),
+          // Object lifetime is managed by the resource pool.
+          // It is OK not to keep track of the handle.
+          context->resource().pool.uniform(block).object);
+    }
+    else {
+      TORCH_CHECK(false, "Not implemented!");
+    }
+  }
+  command_pool.submit(context->gpu().queue, command_buffer);
+
+  return self;
+}
+
 Tensor sigmoid(const Tensor& self) {
   return ops::activation(self, VK_KERNEL(sigmoid));
 }
@@ -433,6 +548,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
   m.impl(TORCH_SELECTIVE_NAME("aten::hardswish_"), hardswish_);
   m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh"), hardtanh);
   m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh_"), hardtanh_);
+  m.impl(TORCH_SELECTIVE_NAME("aten::leaky_relu"), leaky_relu);
+  m.impl(TORCH_SELECTIVE_NAME("aten::leaky_relu_"), leaky_relu_);
   m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid"), sigmoid);
   m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid_"), sigmoid_);
   m.impl(TORCH_SELECTIVE_NAME("aten::tanh"), tanh);
diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index 2873d3c0584c8..d4b466aa920f2 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -979,6 +979,49 @@ TEST(VulkanAPITest, hardshrink_) {
   }
 }
 
+TEST(VulkanAPITest, leaky_relu) {
+  if (!at::is_vulkan_available()) {
+    return;
+  }
+
+  for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) {
+    const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_vulkan = in_cpu.vulkan();
+
+    const auto out_cpu = at::leaky_relu(in_cpu, negative_slope);
+    const auto out_vulkan = at::leaky_relu(in_vulkan, negative_slope);
+
+    const auto check = almostEqual(out_cpu, out_vulkan.cpu());
+
+    if (!check) {
+      showRtol(out_cpu, out_vulkan.cpu());
+    }
+
+    ASSERT_TRUE(check);
+  }
+}
+
+TEST(VulkanAPITest, leaky_relu_) {
+  if (!at::is_vulkan_available()) {
+    return;
+  }
+
+  for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) {
+    auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
+    auto vulkan = cpu.vulkan();
+
+    at::leaky_relu_(cpu, negative_slope);
+    at::leaky_relu_(vulkan, negative_slope);
+
+    const auto check = almostEqual(cpu, vulkan.cpu());
+    if (!check) {
+      showRtol(cpu, vulkan.cpu());
+    }
+
+    ASSERT_TRUE(check);
+  }
+}
+
 TEST(VulkanAPITest, hardswish) {
   if (!at::is_vulkan_available()) {
     return;

From bdde898d9c26edc7f9bc37164cf5480c450d5189 Mon Sep 17 00:00:00 2001
From: Erjia Guan <erjia@fb.com>
Date: Fri, 27 Aug 2021 14:15:23 -0700
Subject: [PATCH 311/530] Add README to datapipes (#63982)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63982

Add a readme to `datapipes` for developer. This is can be a replacement of https://github.com/pytorch/pytorch/blob/master/torch/utils/data/datapipes_tutorial_dev_loaders.ipynb

After this PR is landed, the README.md will be added to PyTorch Wiki

Test Plan: Imported from OSS

Reviewed By: soulitzer

Differential Revision: D30554198

Pulled By: ejguan

fbshipit-source-id: 6091aae8ef915c7c1f00fbf45619c86c9558d308
---
 torch/utils/data/datapipes/README.md          | 103 ++++++++++
 .../data/datapipes_tutorial_dev_loaders.ipynb | 178 ------------------
 2 files changed, 103 insertions(+), 178 deletions(-)
 create mode 100644 torch/utils/data/datapipes/README.md
 delete mode 100644 torch/utils/data/datapipes_tutorial_dev_loaders.ipynb

diff --git a/torch/utils/data/datapipes/README.md b/torch/utils/data/datapipes/README.md
new file mode 100644
index 0000000000000..69cd56d3cfbd1
--- /dev/null
+++ b/torch/utils/data/datapipes/README.md
@@ -0,0 +1,103 @@
+The [`datapipes`](https://github.com/pytorch/pytorch/tree/master/torch/utils/data/datapipes) folder holds the implementation of the `IterDataPipe` and `MapDataPipe`.
+
+This document serves as an entry point for DataPipe implementation.
+
+## Implementing DataPipe
+For the sake of an example, let us implement an `IterDataPipe` to apply a callable over data under [`iter`](https://github.com/pytorch/pytorch/tree/master/torch/utils/data/datapipes/iter).
+For `MapDataPipe`, please take reference from files in [map](https://github.com/pytorch/pytorch/tree/master/torch/utils/data/datapipes/map) folder and implement the corresponding `__getitem__` method.
+
+### Naming
+The naming convention for DataPipe is Operation-er and with suffix of `IterDataPipe` because each DataPipe behaves like a container to apply the operation to data yielded from the source DataPipe.
+And, when importing the DataPipe into `iter` module under `datapipes`, each DataPipe will be aliased as Op-er without the suffix of `IterDataPipe`.
+Please check [`__init__.py`](https://github.com/pytorch/pytorch/blob/master/torch/utils/data/datapipes/iter/__init__.py) in `iter` module for how we aliasing each DataPipe class.
+Like the example of `IterDataPipe` to map a function, we are going to name it as `MapperIterDataPipe` and alias it as `iter.Mapper` under `datapipes`.
+
+### Constructor
+As DataSet now constructed by a stack of DataPipe-s, each DataPipe normally takes a source DataPipe as the first argument.
+```py
+class MapperIterDataPipe(IterDataPipe):
+    def __init__(self, dp, fn):
+        super().__init__()
+        self.dp = dp
+        self.fn = fn
+```
+Note: Avoid loading data from the source DataPipe in `__init__` function, in order to support lazy data loading and save memory.
+
+### Iterator
+For `IterDataPipe`, an `__iter__` function is needed to consume data from the source `IterDataPipe` then apply operation over the data before yield.
+```py
+class MapperIterDataPipe(IterDataPipe):
+    ...
+
+    def __iter__(self):
+        for d in self.dp:
+            yield self.fn(d)
+```
+
+### Length
+In the most common cases, as the example of `MapperIterDataPipe` above, the `__len__` method of DataPipe should return the length of source DataPipe.
+```py
+class MapperIterDataPipe(IterDataPipe):
+    ...
+
+    def __len__(self):
+        return len(self.dp)
+```
+Note that `__len__` method is optional for `IterDataPipe`.
+Like `CSVParserIterDataPipe` in the [Using DataPipe sector](#using-datapipe), `__len__` is not implemented because the size of each file streams is unknown for us before loading it.
+
+Besides, in some special cases, `__len__` method can be provided, but it would either return an integer length or raise Error depending on the arguments of DataPipe.
+And, the Error is required to be `TypeError` to support Python's build-in functions like `list(dp)`.
+Please check NOTE [ Lack of Default `__len__` in Python Abstract Base Classes ] for detailed reason in PyTorch.
+
+### Registering DataPipe with functional API
+Each DataPipe can be registered to support functional API using the decorator `functional_datapipe`.
+```py
+@functional_datapipe("map")
+class MapperIterDataPipe(IterDataPipe):
+    ...
+```
+Then, the stack of DataPipe can be constructed in functional-programming manner.
+```py
+>>> import torch.utils.data.datapipes as dp
+>>> datapipes1 = dp.iter.FileLoader(['a.file', 'b.file']).map(fn=decoder).shuffle().batch(2)
+
+>>> datapipes2 = dp.iter.FileLoader(['a.file', 'b.file'])
+>>> datapipes2 = dp.iter.Mapper(datapipes2)
+>>> datapipes2 = dp.iter.Shuffler(datapipes2)
+>>> datapipes2 = dp.iter.Batcher(datapipes2, 2)
+```
+In the above example, `datapipes1` and `datapipes2` represent the exact same stack of `IterDataPipe`-s.
+
+## Using DataPipe
+For example, we want to load data from CSV files with the following data pipeline:
+- List all csv files
+- Load csv files
+- Parse csv file and yield rows
+
+To support the above pipeline, `CSVParser` is registered as `parse_csv_files` to consume file streams and expand them as rows.
+```py
+@functional_datapipe("parse_csv_files")
+class CSVParserIterDataPipe(IterDataPipe):
+    def __init__(self, dp, **fmtparams):
+        self.dp = dp
+        self.fmtparams = fmtparams
+
+    def __iter__(self):
+        for filename, stream in self.dp:
+            reader = csv.reader(stream, **self.fmtparams)
+            for row in reader:
+                yield filename, row
+```
+Then, the pipeline can be assembled as following:
+```py
+>>> import torch.utils.data.datapipes as dp
+
+>>> FOLDER = 'path/2/csv/folder'
+>>> datapipe = dp.iter.FileLister([FOLDER]).filter(fn=lambda filename: filename.endswith('.csv'))
+>>> datapipe = dp.iter.FileLoader(datapipe, mode='rt')
+>>> datapipe = datapipe.parse_csv_files(delimiter=' ')
+
+>>> for d in datapipe: # Start loading data
+...     pass
+```
diff --git a/torch/utils/data/datapipes_tutorial_dev_loaders.ipynb b/torch/utils/data/datapipes_tutorial_dev_loaders.ipynb
deleted file mode 100644
index 0a9b834a86862..0000000000000
--- a/torch/utils/data/datapipes_tutorial_dev_loaders.ipynb
+++ /dev/null
@@ -1,178 +0,0 @@
-{
- "metadata": {
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.10"
-  },
-  "orig_nbformat": 2,
-  "kernelspec": {
-   "name": "python3610jvsc74a57bd0eb5e09632d6ea1cbf3eb9da7e37b7cf581db5ed13074b21cc44e159dc62acdab",
-   "display_name": "Python 3.6.10 64-bit ('dataloader': conda)"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2,
- "cells": [
-  {
-   "source": [
-    "## DataPipes development tutorial. Loaders DataPipes."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "source": [
-    "As DataSet now constructed by stacking `DataPipe`-s it is recommended to keep `DataPipe` functionality as primitive as possible. For example loading data from CSV file will look like sequence of DataPipes: ListFiles FileLoader CSVParser.\n",
-    "\n"
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "source": [
-    "`ExampleListFilesDataPipe` scans all files in `root` folder and yields full file names. Avoid loading entire list in `__init__` function to save memory."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import csv\n",
-    "import io\n",
-    "import os\n",
-    "\n",
-    "from torch.utils.data import IterDataPipe, functional_datapipe\n",
-    "\n",
-    "\n",
-    "class ExampleListFilesDataPipe(IterDataPipe):\n",
-    "    def __init__(self, *, root):\n",
-    "        self.root = root\n",
-    "\n",
-    "    def __iter__(self):\n",
-    "        for (dirpath, dirnames, filenames) in os.walk(self.root):\n",
-    "            for file_name in filenames:\n",
-    "                yield os.path.join(dirpath, file_name)"
-   ]
-  },
-  {
-   "source": [
-    "`ExampleFileLoaderDataPipe` registered as `load_files_as_string` consumes file names from source_datapipe and yields file names and file lines."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "@functional_datapipe('load_files_as_string')\n",
-    "class ExampleFileLoaderDataPipe(IterDataPipe):\n",
-    "    def __init__(self, source_datapipe):\n",
-    "        self.source_datapipe = source_datapipe\n",
-    "\n",
-    "    def __iter__(self):\n",
-    "        for file_name in self.source_datapipe:\n",
-    "            with open(file_name) as file:\n",
-    "                lines = file.read()\n",
-    "                yield (file_name, lines)\n"
-   ]
-  },
-  {
-   "source": [
-    "`ExampleCSVParserDataPipe` registered as `parse_csv_files` consumes file lines and expands them as CSV rows."
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "@functional_datapipe('parse_csv_files')\n",
-    "class ExampleCSVParserDataPipe(IterDataPipe):\n",
-    "    def __init__(self, source_datapipe):\n",
-    "        self.source_datapipe = source_datapipe\n",
-    "\n",
-    "    def __iter__(self):\n",
-    "        for file_name, lines in self.source_datapipe:\n",
-    "            reader = csv.reader(io.StringIO(lines))\n",
-    "            for row in reader:\n",
-    "                yield [file_name] + row\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "['/home/vitaly/dataset/data/datapipes/load/iter/test/example_2.csv', '10', \" 'foo'\"]\n['/home/vitaly/dataset/data/datapipes/load/iter/test/example_2.csv', '11', \" 'bar'\"]\n['/home/vitaly/dataset/data/datapipes/load/iter/test/example_1.csv', '12', \" 'aaaa'\"]\n['/home/vitaly/dataset/data/datapipes/load/iter/test/example_1.csv', '13', \" 'bbbb'\"]\n"
-     ]
-    }
-   ],
-   "source": [
-    "FOLDER = 'define your folder with csv files here'\n",
-    "FOLDER = '/home/vitaly/dataset/data'\n",
-    "dp = ExampleListFilesDataPipe(root = FOLDER).filter(lambda filename: filename.endswith('.csv')).load_files_as_string().parse_csv_files()\n",
-    "\n",
-    "for data in dp:\n",
-    "    print(data)"
-   ]
-  },
-  {
-   "source": [
-    "This approach allows to replace any DataPipe to get different functionality. For example you can pick individual files.\n"
-   ],
-   "cell_type": "markdown",
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "['/home/vitaly/dataset/data/datapipes/load/iter/test/example_1.csv', '12', \" 'aaaa'\"]\n['/home/vitaly/dataset/data/datapipes/load/iter/test/example_1.csv', '13', \" 'bbbb'\"]\n"
-     ]
-    }
-   ],
-   "source": [
-    "FILE = 'define your file with csv data here'\n",
-    "FILE = '/home/vitaly/dataset/data/datapipes/load/iter/test/example_1.csv'\n",
-    "dp = ExampleFileLoaderDataPipe([FILE]).parse_csv_files()\n",
-    "\n",
-    "for data in dp:\n",
-    "    print(data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ]
-}
\ No newline at end of file

From 8406dba65af414f4ac9bb569ca8d70752611e4ba Mon Sep 17 00:00:00 2001
From: Jessica Choi <jsc@fb.com>
Date: Fri, 27 Aug 2021 14:46:31 -0700
Subject: [PATCH 312/530] Removing references to ProcessGroupAgent in comments
 (#64051)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64051

cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23

Test Plan: Imported from OSS

Reviewed By: mrshenli

Differential Revision: D30587076

Pulled By: jaceyca

fbshipit-source-id: 414cb95faad0b4da0eaf2956c0668af057f93574
---
 torch/csrc/distributed/rpc/message.h                        | 6 +++---
 torch/testing/_internal/dist_utils.py                       | 2 --
 .../testing/_internal/distributed/rpc/dist_autograd_test.py | 3 +--
 .../distributed/rpc/faulty_rpc_agent_test_fixture.py        | 2 --
 torch/testing/_internal/distributed/rpc/rpc_test.py         | 2 +-
 5 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/torch/csrc/distributed/rpc/message.h b/torch/csrc/distributed/rpc/message.h
index 93eff094243f8..17a7808912b11 100644
--- a/torch/csrc/distributed/rpc/message.h
+++ b/torch/csrc/distributed/rpc/message.h
@@ -101,9 +101,9 @@ enum MessageType {
 //        can then serialize and send tensors chunck-by-chunk, in the streaming
 //        fashion.
 //    type (MessageType): type of the message.
-//    id (int64_t): message id, this is used by ProcessGroupAgent to match
-//                  request and response. Other implementation can ignore it
-//                  if they have their own ways to do matching.
+//    id (int64_t): message id, this is used to match request and response.
+//               Other implementation can ignore it if they have their own
+//               ways to do matching.
 //
 // Layers above ``RpcAgent`` only converts ScriptCall, ScriptResp, PythonCall,
 // and PythonResp into a Message, and it is up to the RpcAgent
diff --git a/torch/testing/_internal/dist_utils.py b/torch/testing/_internal/dist_utils.py
index bdb21a7941c17..284a541444cdd 100644
--- a/torch/testing/_internal/dist_utils.py
+++ b/torch/testing/_internal/dist_utils.py
@@ -171,8 +171,6 @@ def wait_until_owners_and_forks_on_rank(
 
 def initialize_pg(init_method, rank: int, world_size: int) -> None:
     # This is for tests using `dist.barrier`.
-    # For `RpcAgent` other than `ProcessGroupAgent`,
-    # no `_default_pg` is initialized.
     if not dist.is_initialized():
         dist.init_process_group(
             backend="gloo",
diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
index e50c30d4974b7..017a61b7debf5 100644
--- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
+++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
@@ -1463,8 +1463,7 @@ def test_backward_node_failure_python_udf(self):
                     dist_autograd.backward(context_id, [res.sum()])
 
                 # Mark rank 0 is done in the store, since the RPC framework on
-                # some nodes might be broken at this point (listenLoop() in
-                # ProcessGroupAgent might've exited).
+                # some nodes might be broken at this point.
                 store.set('test_backward_node_failure_python_udf_rank0_done', "True")
             else:
                 # Wait for backward to finish on rank 0.
diff --git a/torch/testing/_internal/distributed/rpc/faulty_rpc_agent_test_fixture.py b/torch/testing/_internal/distributed/rpc/faulty_rpc_agent_test_fixture.py
index ae151137a4705..24f7ab81c5594 100644
--- a/torch/testing/_internal/distributed/rpc/faulty_rpc_agent_test_fixture.py
+++ b/torch/testing/_internal/distributed/rpc/faulty_rpc_agent_test_fixture.py
@@ -50,8 +50,6 @@ def setup_fault_injection(self, faulty_messages, messages_to_delay):
 
     def get_shutdown_error_regex(self):
         error_regexes = [
-            "Encountered exception in ProcessGroupAgent::enqueueSend",
-            "Encountered exception in ProcessGroupAgent::listenLoop()",
             "Exception in thread pool task",
             "Connection reset by peer",
             "Connection closed by peer"
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index ae57ea5f40f8c..1a44ef6e63b65 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -2897,7 +2897,7 @@ def test_handle_send_exceptions(self):
         )
         rpc._set_rpc_timeout(10)
         # This barrier is needed to ensure that some workers do not exit before
-        # others have been brought up, for non ProcessGroupAgent backends.
+        # others have been brought up.
         initialize_pg(self.file_init_method, self.rank, self.world_size)
         dist.barrier()
         if self.rank == 1:

From 90a6498a1288a4248b4cfe603949fd5b2e60dc0f Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Fri, 27 Aug 2021 14:59:08 -0700
Subject: [PATCH 313/530] Add autograd not implemented boxed fallback (#63458)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63458

See description and discussion from https://github.com/pytorch/pytorch/pull/62450

Test Plan: Imported from OSS

Reviewed By: heitorschueroff

Differential Revision: D30518572

Pulled By: soulitzer

fbshipit-source-id: 3b1504d49abb84560ae17077f0dec335749c9882
---
 test/cpp/api/autograd.cpp                     | 257 ++++++++++++++++++
 tools/build_variables.bzl                     |   1 +
 torch/csrc/api/include/torch/autograd.h       |   1 +
 .../autograd_not_implemented_fallback.cpp     | 189 +++++++++++++
 .../autograd_not_implemented_fallback.h       |  11 +
 torch/csrc/autograd/function.h                |   8 +
 6 files changed, 467 insertions(+)
 create mode 100644 torch/csrc/autograd/autograd_not_implemented_fallback.cpp
 create mode 100644 torch/csrc/autograd/autograd_not_implemented_fallback.h

diff --git a/test/cpp/api/autograd.cpp b/test/cpp/api/autograd.cpp
index 80d892d5195c9..edb73f90852a2 100644
--- a/test/cpp/api/autograd.cpp
+++ b/test/cpp/api/autograd.cpp
@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
+#include <ATen/core/boxing/impl/test_helpers.h>
 
 #include <torch/torch.h>
+#include <ATen/core/op_registration/op_registration.h>
 
 #include <torch/csrc/autograd/functions/basic_ops.h>
 
@@ -869,6 +871,261 @@ TEST(CustomAutogradTest, BackwardWithCreateGraphWarns) {
   }
 }
 
+/**
+ * Tests for AutogradNotImplementedFallback
+ * - Check that we created the NotImplemented kernel when inputs require grad
+ *   but when no inputs require grad, we should not create this node
+ * - check_inplace logic
+ * - view ops (TODO: not an official view yet, update this once InplaceOrView kernel is landed)
+ * - TODO: Tests for NDEBUG checks?
+ * - tensorlist input and output
+ * - multiple outputs / non-tensor output
+ * - rebase_history vs set_history
+ */
+namespace {
+
+torch::Tensor inplace_op(const torch::Tensor& self, const torch::Tensor& other) {
+  return self.add_(other);
+}
+
+std::tuple<torch::Tensor, torch::Tensor> two_arg_inplace_op(const torch::Tensor& self, const torch::Tensor& other) {
+  other.add_(self);
+  self.add_(other);
+  return std::tuple<torch::Tensor, torch::Tensor>(self, other);
+}
+
+std::tuple<torch::Tensor, torch::Tensor> two_pairs_of_view_op(const torch::Tensor& self, const torch::Tensor& other) {
+  // This is not allowed. We test below that this calling into the boxed kernel will raise an error
+  auto self_view = self.view(-1);
+  auto other_view = other.view(-1);
+  return std::tuple<torch::Tensor, torch::Tensor>(self_view, other_view);
+}
+
+int64_t ret_single_non_tensor(const torch::Tensor& self, const torch::Tensor& other) {
+  return 12;
+}
+
+torch::Tensor opt_op(const torch::Tensor& self, const c10::optional<at::Tensor>& other) {
+  if (other.has_value()) {
+      return self + other.value();
+  } else {
+      return self.clone();
+  }
+}
+
+torch::Tensor my_custom_op(const torch::Tensor& self, const torch::Tensor& other) {
+  return self + other;
+}
+
+std::tuple<torch::Tensor, torch::Tensor, int64_t> ret_tuple_non_tensor(const torch::Tensor& self, const torch::Tensor& other) {
+  auto a = self - other;
+  auto b = self + other;
+  return std::tuple<torch::Tensor, torch::Tensor, int64_t>(a, b, 12);
+}
+
+torch::Tensor view_op(const torch::Tensor& self, const torch::Tensor& other) {
+  return self.view(-1);
+}
+
+std::vector<at::Tensor> ret_tensor_vector(const torch::Tensor& self, const torch::Tensor& other) {
+  std::vector<at::Tensor> out;
+  out.push_back(self + other);
+  out.push_back(self - other);
+  return out;
+}
+
+torch::Tensor tensorlist_op(const torch::Tensor& self, at::TensorList other) {
+  const auto& res = self.clone();
+  for (const auto& t : other) {
+      res.add_(t);
+  }
+  return res;
+}
+
+#define REGISTER_TEST_OP(name, schema, fn)                                               \
+  auto m = MAKE_TORCH_LIBRARY(_test);                                                    \
+  m.def(schema);                                                                         \
+  auto m_autograd = MAKE_TORCH_LIBRARY_IMPL(_test, Autograd);                            \
+  auto m_cpu = MAKE_TORCH_LIBRARY_IMPL(_test, CPU);                                      \
+  m_cpu.impl(name, c10::DispatchKey::CPU, TORCH_FN(fn));                                 \
+  m_autograd.impl(name, c10::DispatchKey::Autograd, autogradNotImplementedFallback());
+
+template <typename F>
+void assertBasicChecks(F op) {
+  auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
+  auto b = torch::tensor({1.}, {torch::kFloat32});
+  auto c = torch::tensor({1.}, {torch::kFloat32});
+
+  // If any inputs require grad,
+  auto out1 = op(a, b);
+  ASSERT_THROWS_WITH(out1.backward(), "is not implemented");
+
+  // # Should not have grad_fn if none require grad
+  auto out2 = op(b, c);
+  ASSERT_THROWS_WITH(out2.backward(), "element 0 of tensors does not require grad and does not have a grad_fn");
+
+  // TODO: Forward AD Tests?
+}
+
+} // namespace
+
+TEST(TestAutogradNotImplementedFallback, RetSingleNonTensor) {
+  REGISTER_TEST_OP("ret_single_non_tensor", "_test::ret_single_non_tensor(Tensor self, Tensor other) -> int", ret_single_non_tensor);
+  auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_single_non_tensor", "");
+  auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
+    return callOpUnboxed<int64_t, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
+  };
+
+  auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
+  auto b = torch::tensor({1.}, {torch::kFloat32});
+
+  ASSERT_EQ(op(a, b), ret_single_non_tensor(a, b));
+}
+
+TEST(TestAutogradNotImplementedFallback, DoubleViewOP) {
+  REGISTER_TEST_OP("two_pairs_of_view_op", "_test::two_pairs_of_view_op(Tensor(a) self, Tensor(b) other) -> (Tensor(a), Tensor(b))", two_pairs_of_view_op);
+  auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::two_pairs_of_view_op", "");
+  auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
+    return callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
+  };
+  auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
+  auto b = torch::tensor({1.}, {torch::kFloat32});
+  ASSERT_THROWS_WITH(op(a, b),
+    "Expected only a single output in the operator schema to have a non-write alias annotation");
+}
+
+TEST(TestAutogradNotImplementedFallback, InplaceOp) {
+  REGISTER_TEST_OP("inplace_op", "_test::inplace_op(Tensor(a!) self, Tensor other) -> Tensor(a!)", inplace_op);
+  auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::inplace_op", "");
+  auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
+    return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
+  };
+
+  auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
+  auto b = torch::tensor({1.}, {torch::kFloat32});
+
+  // Check in-place
+  ASSERT_THROWS_WITH(op(a, b),
+    "a leaf Variable that requires grad is being used in an in-place operation");
+  op(b, a);
+  a = a.clone();
+  b = b.clone();
+  auto c = op(a, b);
+  ASSERT_TRUE(torch::allclose(c, inplace_op(a, b)));
+
+  // Test in-place on view
+  auto base = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true).clone();
+  auto view = base.view(-1);
+  auto t = torch::tensor({1.}, {torch::kFloat32});
+
+  torch::Tensor v_nograd;
+  {
+    c10::NoGradGuard guard;
+    v_nograd = base.view(-1);
+    op(v_nograd, t);
+  }
+
+  ASSERT_THROWS_WITH(op(v_nograd, t), "A view was created in no_grad mode");
+  ASSERT_EQ(op(view, t).unsafeGetTensorImpl(), view.unsafeGetTensorImpl());
+
+  // TODO: once we have InplaceOrView kernel, renable this since version counter would actually
+  // be incremented
+  // ASSERT_THAT(op(view, t).grad_fn()->name(), ::testing::HasSubstr("AsStridedBackward"));
+}
+
+TEST(TestAutogradNotImplementedFallback, DoubleInplaceOp) {
+  REGISTER_TEST_OP("two_arg_inplace_op", "_test::two_arg_inplace_op(Tensor(a!) self, Tensor(b!) other) -> (Tensor(a!), Tensor(b!))", two_arg_inplace_op);
+  auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::two_arg_inplace_op", "");
+  auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
+    return callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
+  };
+  auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
+  auto b = torch::tensor({1.}, {torch::kFloat32});
+
+  // Both are modified in-place!
+  ASSERT_THROWS_WITH(op(a, b),
+    "a leaf Variable that requires grad is being used in an in-place operation");
+  ASSERT_THROWS_WITH(op(b, a),
+    "a leaf Variable that requires grad is being used in an in-place operation");
+}
+
+TEST(TestAutogradNotImplementedFallback, OptOp) {
+  REGISTER_TEST_OP("opt_op", "_test::opt_op(Tensor self, Tensor? other) -> Tensor", opt_op);
+  auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::opt_op", "");
+  auto op = [&](const torch::Tensor& _1, const c10::optional<torch::Tensor>& _2) {
+    return callOpUnboxed<torch::Tensor, const torch::Tensor&, const c10::optional<torch::Tensor>&>(opHandle, _1, _2);
+  };
+
+  auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
+  auto b = torch::tensor({1.}, {torch::kFloat32});
+
+  ASSERT_TRUE(torch::allclose(op(a, b), opt_op(a, b)));
+  ASSERT_TRUE(torch::allclose(op(a, {}), opt_op(a, {})));
+}
+
+TEST(TestAutogradNotImplementedFallback, OutOfPlaceAddition) {
+  REGISTER_TEST_OP("my_custom_op", "_test::my_custom_op(Tensor self, Tensor other) -> Tensor", my_custom_op);
+  auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::my_custom_op", "");
+  auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
+    return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
+  };
+
+  assertBasicChecks(op);
+}
+
+TEST(TestAutogradNotImplementedFallback, RetTupleNonTensor) {
+  REGISTER_TEST_OP("ret_tuple_non_tensor", "_test::ret_tuple_non_tensor(Tensor self, Tensor other) -> (Tensor, Tensor, int)", ret_tuple_non_tensor);
+  auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_tuple_non_tensor", "");
+  auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
+    torch::Tensor out0;
+    torch::Tensor out1;
+    int64_t out2;
+    auto out = callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor, int64_t>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
+    std::tie(out0, out1, out2) = std::move(out);
+    return out0;
+  };
+
+  assertBasicChecks(op);
+}
+
+TEST(TestAutogradNotImplementedFallback, ViewOp) {
+  REGISTER_TEST_OP("view_op", "_test::view_op(Tensor(a) self, Tensor other) -> Tensor(a)", view_op);
+  auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::view_op", "");
+  auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
+    return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
+  };
+  assertBasicChecks(op);
+}
+
+TEST(TestAutogradNotImplementedFallback, RetTensorVector) {
+  REGISTER_TEST_OP("ret_tensor_vector", "_test::ret_tensor_vector(Tensor self, Tensor other) -> Tensor[]", ret_tensor_vector);
+  auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_tensor_vector", "");
+  auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
+    return callOpUnboxed<std::vector<at::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2)[0];
+  };
+  assertBasicChecks(op);
+}
+
+TEST(TestAutogradNotImplementedFallback, TensorlistOp) {
+  REGISTER_TEST_OP("tensorlist_op", "_test::tensorlist_op(Tensor self, Tensor[] other) -> Tensor", tensorlist_op);
+  auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::tensorlist_op", "");
+  auto op = [&](torch::Tensor _1, at::TensorList _2) {
+    return callOpUnboxed<torch::Tensor, const torch::Tensor&, at::TensorList>(opHandle, _1, _2);
+  };
+
+  auto a = torch::tensor({1.}, {torch::kFloat32});
+  auto b = torch::tensor({1.}, {torch::kFloat32});
+  auto c = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
+  std::vector<torch::Tensor> vec = {b, c};
+  auto out = op(a, vec);
+
+  ASSERT_THROWS_WITH(torch::autograd::grad({out}, {vec[0]}), "One of the differentiated Tensors does not require grad");
+  ASSERT_THROWS_WITH(torch::autograd::grad({out}, {vec[1]}), "is not implemented");
+
+  ASSERT_TRUE(at::allclose(op(a, vec), tensorlist_op(a, vec)));
+}
+
+
 // TODO add these tests if needed
 // test_once_differentiable
 // test_sparse_backward
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 650830b3143f0..b2a1016118d28 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -128,6 +128,7 @@ libtorch_edge_profiler_sources = libtorch_profiler_sources + [
 core_trainer_sources = [
     "torch/csrc/autograd/anomaly_mode.cpp",
     "torch/csrc/autograd/autograd.cpp",
+    "torch/csrc/autograd/autograd_not_implemented_fallback.cpp",
     "torch/csrc/autograd/cpp_hook.cpp",
     "torch/csrc/autograd/custom_function.cpp",
     "torch/csrc/autograd/engine.cpp",
diff --git a/torch/csrc/api/include/torch/autograd.h b/torch/csrc/api/include/torch/autograd.h
index 83aa102de0128..809fbe8bd3350 100644
--- a/torch/csrc/api/include/torch/autograd.h
+++ b/torch/csrc/api/include/torch/autograd.h
@@ -2,3 +2,4 @@
 
 #include <torch/csrc/autograd/autograd.h>
 #include <torch/csrc/autograd/custom_function.h>
+#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp
new file mode 100644
index 0000000000000..ab9cb49ec63a7
--- /dev/null
+++ b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp
@@ -0,0 +1,189 @@
+#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
+
+#include <c10/util/irange.h>
+
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <ATen/core/ivalue.h>
+
+#include <torch/csrc/autograd/autograd.h>
+#include <torch/csrc/autograd/function.h>
+#include <torch/csrc/autograd/functions/basic_ops.h>
+#include <torch/csrc/autograd/functions/utils.h>
+#include <torch/csrc/autograd/VariableTypeUtils.h>
+
+#include <vector>
+
+namespace torch { namespace autograd {
+
+namespace {
+
+template <typename F>
+void _foreach_tensor(
+    F fn,
+    torch::jit::Stack* stack,
+    size_t stack_start,
+    size_t size) {
+  // Enumerate over tensors in a stack, including ones in TensorLists
+  int idx_tensor = 0;
+  for (const auto idx_arg : c10::irange(size)) {
+    auto& ivalue = (*stack)[stack_start + idx_arg];
+    if (ivalue.isTensor()) {  // true for optional tensor that has value
+      const auto& tensor = ivalue.toTensor();
+      fn(idx_tensor, idx_arg, tensor);
+      idx_tensor++;
+    } else if (ivalue.isTensorList()) {
+      for (const auto& iv : ivalue.toListRef()) {
+        const auto& tensor = iv.toTensor();
+        fn(idx_tensor, idx_arg, tensor);
+        idx_tensor++;
+      }
+    }
+  }
+}
+
+}
+
+void autogradNotImplementedFallbackImpl(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
+  // Mimics the logic of a VariableType NotImplemented kernel
+  const auto& schema = op.schema();
+  const auto& op_name = schema.operator_name().name;
+  const auto& arguments = schema.arguments();
+  const auto& returns = schema.returns();
+  const auto num_arguments = arguments.size();
+  const auto num_returns = returns.size();
+  const auto stack_start = stack->size() - num_arguments;
+  const bool grad_mode = GradMode::is_enabled();
+  std::vector<const at::Tensor*> tensors_requiring_grad_on_stack;
+
+  // Keep track of which outputs are output of in-place modification
+  // so we can rebase_history if necessary
+  std::vector<bool> is_inplace_output;
+  bool any_is_inplace_output = false;
+  std::vector<bool> is_aliased_output;
+  is_inplace_output.reserve(num_returns);
+  is_aliased_output.reserve(num_returns);
+
+  for (const auto i : c10::irange(num_returns)) {
+    const auto& alias_info = returns[i].alias_info();
+    is_inplace_output.push_back(alias_info.has_value() && alias_info->isWrite());
+    any_is_inplace_output |= alias_info.has_value() && alias_info->isWrite();
+    is_aliased_output.push_back(alias_info.has_value());
+
+  }
+  int aliased_input_idx = -1;
+  int aliased_output_idx = -1;
+  for (const auto i : c10::irange(num_returns)) {
+    const auto& alias_info = returns[i].alias_info();
+    if (alias_info.has_value() && !alias_info->isWrite()) {
+      AT_ASSERT(
+        aliased_output_idx == -1,
+        "Expected only a single output in the operator schema to have a non-write alias annotation (i.e., 'Tensor(a)'). "
+        "Non-composite functions where multiple outputs are aliased with inputs aren't supported."
+        "Please rewrite your function as a composite function.");
+      aliased_output_idx = i;
+    }
+  }
+  for (const auto i : c10::irange(num_arguments)) {
+    const auto& alias_info = arguments[i].alias_info();
+    if (alias_info.has_value() && !alias_info->isWrite()) {
+      AT_ASSERT(
+        aliased_input_idx == -1,
+        "Expected only a single input in the operator schema to have a non-write alias annotation (i.e., 'Tensor(a)'). "
+        "Non-composite functions where multiple inputs are aliased with outputs aren't supported. "
+        "Please rewrite your function as a composite function.");
+      aliased_input_idx = i;
+    }
+  }
+
+  size_t num_tensor_inputs = 0;  // Only used for DEBUG-only checks
+
+  _foreach_tensor([&](size_t _, size_t idx_arg, const at::Tensor& t) {
+    if (grad_mode && t.requires_grad()) {
+      tensors_requiring_grad_on_stack.push_back(&t);
+    }
+    num_tensor_inputs++;
+    TORCH_CHECK_NOT_IMPLEMENTED(!isFwGradDefined(t), "Trying to use forward AD with ", op_name, " that does not support it.");
+  }, stack, stack_start, num_arguments);
+
+  const bool any_requires_grad = tensors_requiring_grad_on_stack.size() > 0;
+
+  _foreach_tensor([&](size_t _, size_t i, const at::Tensor& t) {
+    const auto& alias_info = arguments[i].alias_info();
+    if (alias_info.has_value() && alias_info->isWrite()) {
+      check_inplace(t, any_requires_grad);
+    }
+  }, stack, stack_start, num_arguments);
+
+  std::shared_ptr<NotImplemented> grad_fn;
+  if (any_requires_grad) {
+    grad_fn = std::shared_ptr<NotImplemented>(new NotImplemented(op_name), deleteNode);
+    grad_fn->set_next_edges(collect_next_edges(tensors_requiring_grad_on_stack));
+  }
+
+  #ifndef NDEBUG
+  // See NOTE [ TensorImpl and Storage Pointer Sanity Checks ]
+  auto stack_args_copy = std::vector<c10::IValue>(stack->begin() + stack_start, stack->end());
+  std::vector<c10::intrusive_ptr<c10::TensorImpl>> impl_saved;
+  impl_saved.reserve(num_tensor_inputs);
+  std::vector<c10::optional<c10::Storage>> storage_saved;
+  storage_saved.reserve(num_tensor_inputs);
+  _foreach_tensor([&](size_t idx, size_t _, const at::Tensor& t) {
+    storage_saved.push_back(t.has_storage() ? c10::optional<c10::Storage>(t.storage()) : c10::nullopt);
+    impl_saved.push_back(t.getIntrusivePtr());
+  }, &stack_args_copy, 0, num_arguments);
+  #endif
+  if (aliased_input_idx != -1 || any_is_inplace_output) {
+    at::AutoDispatchBelowAutograd guard;
+    op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack);
+  } else {
+    // If neither in-place nor view
+    at::AutoDispatchBelowADInplaceOrView guard;
+    op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack);
+  }
+  #ifndef NDEBUG
+  _foreach_tensor([&](size_t idx_tensor, size_t _, const at::Tensor& t) {
+    if (storage_saved.at(idx_tensor).has_value())
+      TORCH_INTERNAL_ASSERT(storage_saved.at(idx_tensor).value().is_alias_of(t.storage()), op_name);
+    if (impl_saved.at(idx_tensor))
+      TORCH_INTERNAL_ASSERT(impl_saved.at(idx_tensor) == t.getIntrusivePtr(), op_name);
+  }, &stack_args_copy, 0, num_arguments);
+  _foreach_tensor([&](size_t idx_tensor, size_t idx_ret, const at::Tensor& t) {
+    if (!is_inplace_output[idx_ret])
+      TORCH_INTERNAL_ASSERT(t.use_count() <= 1, op_name);  // Okay to return undefined tensor
+    if (!is_aliased_output[idx_ret] && t.has_storage())
+      TORCH_INTERNAL_ASSERT(t.storage().use_count() == 1);
+  }, stack, stack->size() - num_returns, num_returns);
+  // There should be only a single base-view pair, make sure their storage is aliased
+  if (aliased_input_idx != -1 && aliased_output_idx != -1) {
+    const c10::IValue& aliased_input_iv = stack_args_copy[aliased_input_idx];
+    const c10::IValue& aliased_output_iv = (*stack)[stack->size() - num_returns + aliased_output_idx];
+    // We do not support views embedded inside tensorlist
+    TORCH_INTERNAL_ASSERT(aliased_input_iv.isTensor(), op_name);
+    TORCH_INTERNAL_ASSERT(aliased_output_iv.isTensor(), op_name);
+    const at::Tensor& aliased_input = aliased_input_iv.toTensor();
+    const at::Tensor& aliased_output = aliased_input_iv.toTensor();
+    if(is_aliased_output[aliased_input_idx] && aliased_input.has_storage())
+      TORCH_INTERNAL_ASSERT(aliased_input.storage().is_alias_of(aliased_output.storage()), op_name);
+  }
+  #endif
+
+  if (any_requires_grad) {
+    _foreach_tensor([&](size_t idx_tensor, size_t idx_ret, const at::Tensor& t) {
+      if (isDifferentiableType(t.scalar_type())) {
+        if (is_inplace_output[idx_ret]) {
+          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+          rebase_history(const_cast<at::Tensor&>(t), grad_fn);
+        } else {
+          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+          set_history(const_cast<at::Tensor&>(t), grad_fn);
+        }
+      }
+    }, stack, stack->size() - num_returns, num_returns);
+  }
+}
+
+torch::CppFunction autogradNotImplementedFallback() {
+  return torch::CppFunction::makeFromBoxedFunction<&autogradNotImplementedFallbackImpl>();
+}
+
+}} // namespace torch::autograd
diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.h b/torch/csrc/autograd/autograd_not_implemented_fallback.h
new file mode 100644
index 0000000000000..4b2cbd14b9d86
--- /dev/null
+++ b/torch/csrc/autograd/autograd_not_implemented_fallback.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include <torch/library.h>
+
+namespace torch {
+namespace autograd {
+
+TORCH_API torch::CppFunction autogradNotImplementedFallback();
+
+}} // namespace torch::autograd
diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h
index 25336dfa9d911..2a1de8e82a774 100644
--- a/torch/csrc/autograd/function.h
+++ b/torch/csrc/autograd/function.h
@@ -563,6 +563,14 @@ struct MakeNextFunctionList : IterArgs<MakeNextFunctionList> {
       next_edges.emplace_back();
     }
   }
+  void operator()(const Variable* variable) {
+    // NOLINTNEXTLINE(bugprone-branch-clone)
+    if (variable->defined()) {
+      next_edges.push_back(impl::gradient_edge(*variable));
+    } else {
+      next_edges.emplace_back();
+    }
+  }
   void operator()(const c10::optional<Variable>& variable) {
     // NOLINTNEXTLINE(bugprone-branch-clone)
     if (variable.has_value() && variable->defined()) {

From 6ccb74b837535f8f5e7a687ee1a17fea52972ab3 Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Fri, 27 Aug 2021 14:59:08 -0700
Subject: [PATCH 314/530] Update codegen to use boxed kernel (#63459)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63459

 - Replaces the usual registration basically when "requires_derivative" is True (as in we still need a grad_fn), but `fn.info` is `None` (TODO maybe make sure differentiable inputs > 0 also to match requires_derivative).
 - Adds some (temporary?) fixes to some sparse functions See: https://github.com/pytorch/pytorch/issues/63549
 - To remove the codegen that generates NotImplemented node (though that should only be one line),  because there are some ops listed under `RESET_GRAD_ACCUMULATOR` that have a extra function call. We would need to make this list of ops available to c++, but this would either mean we'd have to codegen a list of strings, or move the RESET_GRAD_ACCUMULATOR to cpp land. We could do this in a future PR if necessary.

Test Plan: Imported from OSS

Reviewed By: heitorschueroff

Differential Revision: D30518571

Pulled By: soulitzer

fbshipit-source-id: 99a35cbced46292d1b4e51594ae4d534c2caf8b6
---
 tools/autograd/gen_inplace_or_view_type.py |  4 ++
 tools/autograd/gen_variable_type.py        | 46 ++++++++++++++++++----
 tools/autograd/templates/VariableType.h    |  1 +
 3 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/tools/autograd/gen_inplace_or_view_type.py b/tools/autograd/gen_inplace_or_view_type.py
index 6c42bec1e5d12..524cca262f4f2 100644
--- a/tools/autograd/gen_inplace_or_view_type.py
+++ b/tools/autograd/gen_inplace_or_view_type.py
@@ -124,6 +124,10 @@
 );
 """)
 
+AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION = CodeTemplate("""\
+m.impl("${unqual_operator_name_with_overload}", torch::autograd::autogradNotImplementedFallback());
+""")
+
 INPLACE_REDISPATCH = CodeTemplate("""\
 {
   at::AutoDispatchBelowADInplaceOrView guard;
diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py
index a64f7341e281c..d0a9048df47f3 100644
--- a/tools/autograd/gen_variable_type.py
+++ b/tools/autograd/gen_variable_type.py
@@ -1,5 +1,8 @@
 # Generates VariableType.h/cpp
 #
+# **If any changes are being made to the VariableType codegen please also check
+# if updates are needed in torch/csrc/autograd/autograd_not_implemented_fallback.cpp
+#
 # VariableType is a subclass of at::Type that provides the binding code
 # necessary to provide a differentiable version of ATen operators. There are a
 # number of different things we could mean:
@@ -30,7 +33,8 @@
 from .gen_inplace_or_view_type import (
     get_view_info, is_tensor_type, is_tensor_list_type, unpack_args, get_base_name,
     use_derived, modifies_arguments, WRAPPER_REGISTRATION, TMP_VAR, METHOD_DEFINITION,
-    ASSIGN_RETURN_VALUE, gen_formals, ALL_VIEW_FUNCTIONS, unpacked_name
+    ASSIGN_RETURN_VALUE, gen_formals, ALL_VIEW_FUNCTIONS, unpacked_name,
+    AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION
 )
 
 from tools.codegen.api.types import (Binding, DispatcherSignature, BaseCType, intArrayRefT,
@@ -404,13 +408,39 @@ def gen_variable_type_func(
         name = cpp.name(f.func)
         formals = gen_formals(f)
 
-        type_definition = METHOD_DEFINITION.substitute(
-            return_type=cpp.returns_type(f.func.returns).cpp_type(),
-            type_wrapper_name=type_wrapper_name(f),
-            type_definition_body=emit_body(fn),
-            formals=formals,
-        )
-        wrapper_registration = gen_wrapper_registration(f)
+        if fn.info is None and not get_base_name(f) in RESET_GRAD_ACCUMULATOR \
+                and not get_base_name(f) in DONT_REQUIRE_DERIVATIVE \
+                and len(gen_differentiable_outputs(fn)) > 0 \
+                and not get_base_name(f) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE \
+                and not get_base_name(f) in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT \
+                and not get_base_name(f) in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT:
+            # NOTE: [ Registering AutogradNotImplemented boxed kernel ]
+            #
+            # When there is no derivatives.yaml entry, we register a generic boxed
+            # NotImplemented kernel to set grad_fn to be NotImplemented, so that forward
+            # proceeds as usual but an error is properly produced on backward.
+            # TODO: it would be nice to not have these special cases
+            #
+            # There are several cases where still let codegen handle it:
+            # 1) ops that need to reset grad accumulator (we let codegen handle this case
+            #     because) the list is (currently) only accessible in Python.
+            # 2) User explicitly specifies DONT_REQUIRE_DERIVATIVE. This basically makes
+            #    autograd a fallthrough with NDEBUG checks. This can be useful for when all
+            #    outputs are integral.
+            # 3) When there are no differentiable outputs. This is similar to (2).
+            # 4) There are certain ops where we skip certain NDEBUG checks. this is similar
+            #    to (1).
+            type_definition = ""
+            wrapper_registration = AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION.substitute(
+                unqual_operator_name_with_overload=f.func.name)
+        else:
+            type_definition = METHOD_DEFINITION.substitute(
+                return_type=cpp.returns_type(f.func.returns).cpp_type(),
+                type_wrapper_name=type_wrapper_name(f),
+                type_definition_body=emit_body(fn),
+                formals=formals,
+            )
+            wrapper_registration = gen_wrapper_registration(f)
 
     # See Note [Manual Backend kernels]
     assert (name in MANUAL_BACKEND) == f.manual_kernel_registration
diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h
index fc8ffa5799c11..333e8a0d7ada5 100644
--- a/tools/autograd/templates/VariableType.h
+++ b/tools/autograd/templates/VariableType.h
@@ -7,6 +7,7 @@
 #include <c10/util/intrusive_ptr.h>
 
 #include <torch/csrc/WindowsTorchApiMacro.h>
+#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
 
 #include <cstdint> // for size_t
 #include <functional> // for function

From 3abbcf079d38d468a45073b13cb13627c9c0f367 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Fri, 27 Aug 2021 16:02:49 -0700
Subject: [PATCH 315/530] .github: Add cpp_docs job to current gcc5 workflow
 (#64044)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64044

Adds the cpp_docs job to the current workflow, also modifies the scripts
surrounding building docs so that they can be powered through
environment variables with sane defaults rather than having to have
passed arguments.

Ideally should not break current jobs running in circleci but those
should eventually be turned off anyways.

Coincides with work from:
* https://github.com/seemethere/upload-artifact-s3/pull/1
* https://github.com/seemethere/upload-artifact-s3/pull/2

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D30610010

Pulled By: seemethere

fbshipit-source-id: f67adeb1bd422bb9e24e0f1ec0098cf9c648f283
---
 .circleci/scripts/cpp_doc_push_script.sh      | 25 +++++++----
 .circleci/scripts/python_doc_push_script.sh   | 27 ++++++++----
 .github/templates/linux_ci_workflow.yml.j2    | 42 +++++++++++--------
 .../generated-linux-xenial-py3.6-gcc5.4.yml   | 42 +++++++++++--------
 4 files changed, 85 insertions(+), 51 deletions(-)

diff --git a/.circleci/scripts/cpp_doc_push_script.sh b/.circleci/scripts/cpp_doc_push_script.sh
index c6b4f00a06f0f..aa26a740aca7d 100755
--- a/.circleci/scripts/cpp_doc_push_script.sh
+++ b/.circleci/scripts/cpp_doc_push_script.sh
@@ -10,21 +10,30 @@ pt_checkout="/var/lib/jenkins/workspace"
 # Since we're cat-ing this file, we need to escape all $'s
 echo "cpp_doc_push_script.sh: Invoked with $*"
 
+# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
+# the order of operations goes:
+#   1. Check if there's an argument $1
+#   2. If no argument check for environment var DOCS_INSTALL_PATH
+#   3. If no environment var fall back to default 'docs/'
+
+# NOTE: It might seem weird to gather the second argument before gathering the first argument
+#       but since DOCS_INSTALL_PATH can be derived from DOCS_VERSION it's probably better to
+#       try and gather it first, just so we don't potentially break people who rely on this script
+# Argument 2: What version of the Python API docs we are building.
+version="${2:-${DOCS_VERSION:-master}}"
+if [ -z "$version" ]; then
+echo "error: cpp_doc_push_script.sh: version (arg2) not specified"
+  exit 1
+fi
+
 # Argument 1: Where to copy the built documentation for Python API to
 # (pytorch.github.io/$install_path)
-install_path="$1"
+install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
 if [ -z "$install_path" ]; then
 echo "error: cpp_doc_push_script.sh: install_path (arg1) not specified"
   exit 1
 fi
 
-# Argument 2: What version of the Python API docs we are building.
-version="$2"
-if [ -z "$version" ]; then
-echo "error: cpp_doc_push_script.sh: version (arg2) not specified"
-  exit 1
-fi
-
 is_master_doc=false
 if [ "$version" == "master" ]; then
   is_master_doc=true
diff --git a/.circleci/scripts/python_doc_push_script.sh b/.circleci/scripts/python_doc_push_script.sh
index ed8f28d54f333..167eaca2d1df1 100755
--- a/.circleci/scripts/python_doc_push_script.sh
+++ b/.circleci/scripts/python_doc_push_script.sh
@@ -13,28 +13,37 @@ echo "python_doc_push_script.sh: Invoked with $*"
 
 set -ex
 
+# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
+# the order of operations goes:
+#   1. Check if there's an argument $1
+#   2. If no argument check for environment var DOCS_INSTALL_PATH
+#   3. If no environment var fall back to default 'docs/'
+
+# NOTE: It might seem weird to gather the second argument before gathering the first argument
+#       but since DOCS_INSTALL_PATH can be derived from DOCS_VERSION it's probably better to
+#       try and gather it first, just so we don't potentially break people who rely on this script
+# Argument 2: What version of the docs we are building.
+version="${2:-${DOCS_VERSION:-master}}"
+if [ -z "$version" ]; then
+echo "error: python_doc_push_script.sh: version (arg2) not specified"
+  exit 1
+fi
+
 # Argument 1: Where to copy the built documentation to
 # (pytorch.github.io/$install_path)
-install_path="$1"
+install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
 if [ -z "$install_path" ]; then
 echo "error: python_doc_push_script.sh: install_path (arg1) not specified"
   exit 1
 fi
 
-# Argument 2: What version of the docs we are building.
-version="$2"
-if [ -z "$version" ]; then
-echo "error: python_doc_push_script.sh: version (arg2) not specified"
-  exit 1
-fi
-
 is_master_doc=false
 if [ "$version" == "master" ]; then
   is_master_doc=true
 fi
 
 # Argument 3: The branch to push to. Usually is "site"
-branch="$3"
+branch="${3:-${DOCS_BRANCH:-site}}"
 if [ -z "$branch" ]; then
 echo "error: python_doc_push_script.sh: branch (arg3) not specified"
   exit 1
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 52c0a09a9e1c5..7d9020790710e 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -441,11 +441,15 @@ jobs:
 {% endblock %}
 {%- endif -%}
 {%- if enable_doc_jobs %}
-  pytorch_python_doc_build:
+  pytorch_doc_build:
     runs-on: linux.2xlarge
+    strategy:
+      matrix:
+        docs_type: [cpp, python]
     needs: [calculate-docker-image, build, !{{ ciflow_config.root_job_name }}]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      DOCS_TYPE: ${{ matrix.docs_type }}
     steps:
       - name: Log in to ECR
         run: |
@@ -483,7 +487,7 @@ jobs:
       - name: Unzip artifacts
         run: |
           unzip -o artifacts.zip
-      - name: Build Python Doc in Docker
+      - name: Build ${{ matrix.docs_type }} docs
         run: |
           set -ex
           time docker pull "${DOCKER_IMAGE}" > /dev/null
@@ -496,6 +500,8 @@ jobs:
             -e IN_CI \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e CIRCLE_SHA1="$GITHUB_SHA" \
+            -e DOCS_VERSION="${target}" \
+            -e DOCS_TYPE \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -505,34 +511,36 @@ jobs:
             -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
             -w /var/lib/jenkins/workspace \
             "${DOCKER_IMAGE}" \
-            bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site"
+            bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh"
       - name: Chown workspace
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be
-        if: ${{ github.event_name == 'pull_request' }}
-        name: Upload Docs Preview
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Upload Python Docs Preview
+        if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }}
         with:
-          name: deploy
           retention-days: 14
           if-no-files-found: error
-          path: pytorch.github.io/docs/merge
-      - name: Show Docs Preview URL (Click Me)
-        if: ${{ github.event_name == 'pull_request' }}
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-        run: |
-          echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/"
+          path: pytorch.github.io/docs/merge/
+          s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Upload C++ Docs Preview
+        if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }}
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path: cppdocs/
+          s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}/cppdocs
       - name: Archive artifacts into zip
         run: |
-          zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io"
+          zip -r "docs_${DOCS_TYPE}.zip" "${GITHUB_WORKSPACE}/pytorch.github.io" "${GITHUB_WORKSPACE}/cppdocs"
       - uses: actions/upload-artifact@v2
         name: Store PyTorch Build Artifacts
         with:
-          name: pytorch_github_io
+          name: docs_${{ matrix.docs_type }}
+          path: docs_${{ matrix.docs_type }}.zip
           if-no-files-found: error
-          path: pytorch_github_io.zip
       - name: Hold runner for 2 hours or until ssh sessions have drained
         # Always hold for active ssh sessions
         if: always()
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index b5f062c53cb05..6cc391ba15991 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -429,11 +429,15 @@ jobs:
           # Prune all of the docker images
           docker system prune -af
 
-  pytorch_python_doc_build:
+  pytorch_doc_build:
     runs-on: linux.2xlarge
+    strategy:
+      matrix:
+        docs_type: [cpp, python]
     needs: [calculate-docker-image, build, ciflow_should_run]
     env:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      DOCS_TYPE: ${{ matrix.docs_type }}
     steps:
       - name: Log in to ECR
         run: |
@@ -474,7 +478,7 @@ jobs:
       - name: Unzip artifacts
         run: |
           unzip -o artifacts.zip
-      - name: Build Python Doc in Docker
+      - name: Build ${{ matrix.docs_type }} docs
         run: |
           set -ex
           time docker pull "${DOCKER_IMAGE}" > /dev/null
@@ -487,6 +491,8 @@ jobs:
             -e IN_CI \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e CIRCLE_SHA1="$GITHUB_SHA" \
+            -e DOCS_VERSION="${target}" \
+            -e DOCS_TYPE \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -496,34 +502,36 @@ jobs:
             -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
             -w /var/lib/jenkins/workspace \
             "${DOCKER_IMAGE}" \
-            bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site"
+            bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh"
       - name: Chown workspace
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be
-        if: ${{ github.event_name == 'pull_request' }}
-        name: Upload Docs Preview
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Upload Python Docs Preview
+        if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }}
         with:
-          name: deploy
           retention-days: 14
           if-no-files-found: error
-          path: pytorch.github.io/docs/merge
-      - name: Show Docs Preview URL (Click Me)
-        if: ${{ github.event_name == 'pull_request' }}
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-        run: |
-          echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/"
+          path: pytorch.github.io/docs/merge/
+          s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Upload C++ Docs Preview
+        if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }}
+        with:
+          retention-days: 14
+          if-no-files-found: error
+          path: cppdocs/
+          s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}/cppdocs
       - name: Archive artifacts into zip
         run: |
-          zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io"
+          zip -r "docs_${DOCS_TYPE}.zip" "${GITHUB_WORKSPACE}/pytorch.github.io" "${GITHUB_WORKSPACE}/cppdocs"
       - uses: actions/upload-artifact@v2
         name: Store PyTorch Build Artifacts
         with:
-          name: pytorch_github_io
+          name: docs_${{ matrix.docs_type }}
+          path: docs_${{ matrix.docs_type }}.zip
           if-no-files-found: error
-          path: pytorch_github_io.zip
       - name: Hold runner for 2 hours or until ssh sessions have drained
         # Always hold for active ssh sessions
         if: always()

From 2d75ab0c8fe793ceddd3aee74f25c956d5d8d2ec Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Fri, 27 Aug 2021 16:15:55 -0700
Subject: [PATCH 316/530] [TensorExpr] Update tutorial. (#64109)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64109

Test Plan: Imported from OSS

Reviewed By: bertmaher

Differential Revision: D30614050

Pulled By: ZolotukhinM

fbshipit-source-id: e8f9bd9ef2483e6eafbc0bd5394d311cd694c7b2
---
 test/cpp/tensorexpr/tutorial.cpp | 488 +++++++++++++++++++------------
 1 file changed, 304 insertions(+), 184 deletions(-)

diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp
index 16605e5e6d501..0ec0968bebf8f 100644
--- a/test/cpp/tensorexpr/tutorial.cpp
+++ b/test/cpp/tensorexpr/tutorial.cpp
@@ -38,21 +38,30 @@
 #include <iostream>
 #include <string>
 
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/ir/irparser.h>
 #include <torch/csrc/jit/tensorexpr/eval.h>
 #include <torch/csrc/jit/tensorexpr/expr.h>
 #include <torch/csrc/jit/tensorexpr/ir.h>
 #include <torch/csrc/jit/tensorexpr/ir_printer.h>
+#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
+#include <torch/csrc/jit/tensorexpr/kernel.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <torch/csrc/jit/tensorexpr/stmt.h>
 #include <torch/csrc/jit/tensorexpr/tensor.h>
+#include <torch/torch.h>
 
 using namespace torch::jit::tensorexpr;
 
+// Helper function to print a snippet from a big multi-line string
+static void printLinesToFrom(const std::string& input_str, int from, int to);
+
 int main(int argc, char* argv[]) {
-  std::cout << "*** Structure of tensor expressions ***" << std::endl;
+  std::cout << "*** Structure of tensor expressions and statements ***"
+            << std::endl;
   {
     // A tensor expression is a tree of expressions. Each expression has a type,
-    // and that type defines what sub-expressions it the current expression has.
+    // and that type defines what sub-expressions the current expression has.
     // For instance, an expression of type 'Mul' would have a type 'kMul' and
     // two subexpressions: LHS and RHS. Each of these two sub-expressions could
     // also be a 'Mul' or some other expression.
@@ -72,15 +81,21 @@ int main(int argc, char* argv[]) {
     // like we did in the previous example). Expression handles overload common
     // operations and allow us to express the same semantics in a more natural
     // way:
-    ExprHandle l = 1;
+    ExprHandle l = 5;
     ExprHandle r = Var::make("x", kInt);
     ExprHandle m = l * r;
     std::cout << "Tensor expression: " << *m.node() << std::endl;
-    // Prints: Tensor expression: 1 * x
+    // Prints: Tensor expression: 5 * x
+
+    // Converting from handles to raw expressions and back is easy:
+    ExprHandle handle = Var::make("x", kInt);
+    ExprPtr raw_expr_from_handle = handle.node();
+    ExprPtr raw_expr = alloc<Var>("x", kInt);
+    ExprHandle handle_from_raw_expr = ExprHandle(raw_expr);
 
-    // In a similar fashion we could construct arbitrarily complex expressions
-    // using mathematical and logical operations, casts between various data
-    // types, and a bunch of intrinsics.
+    // We could construct arbitrarily complex expressions using mathematical
+    // and logical operations, casts between various data types, and a bunch of
+    // intrinsics.
     ExprHandle a = Var::make("a", kInt);
     ExprHandle b = Var::make("b", kFloat);
     ExprHandle c = Var::make("c", kFloat);
@@ -96,238 +111,232 @@ int main(int argc, char* argv[]) {
     // placeholder similar to Var, but with dimensions info.
     //
     // Let's construct a simple load:
-    BufHandle A("A", {ExprHandle(64), ExprHandle(32)}, kInt);
-    ExprHandle i = Var::make("i", kInt), j = Var::make("j", kInt);
+    BufHandle A("A", {64, 32}, kInt);
+    VarPtr i_var = alloc<Var>("i", kInt), j_var = alloc<Var>("j", kInt);
+    ExprHandle i(i_var), j(j_var);
     ExprHandle load = Load::make(A.dtype(), A, {i, j});
     std::cout << "Tensor expression: " << *load.node() << std::endl;
     // Prints: Tensor expression: A[i, j]
-  }
 
-  std::cout << "*** Tensors, Functions, and Placeholders ***" << std::endl;
-  {
-    // A tensor computation is represented by Tensor class objects and
-    // consists of the following pieces:
-    //   - domain, which is specified by a Buf expression
-    //   - a tensor statement, specified by a Stmt object, that computation to
-    //   be performed in this domain
-
-    // Let's start with defining a domain. We do this by creating a Buf object.
-
-    // First, let's specify the sizes:
-    std::vector<ExprPtr> dims = {
-        alloc<IntImm>(64),
-        alloc<IntImm>(32)}; // IntImm stands for Integer Immediate
-    // and represents an integer constant
-
-    // Now we can create a Buf object by providing a name, dimensions, and a
-    // data type of the elements:
-    BufPtr buf = alloc<Buf>("X", dims, kInt);
-
-    // Next we need to spefify the computation. We can do that by either
-    // constructing a complete tensor statement for it (statements are
-    // examined in details in subsequent section), or by using a convenience
-    // method where we could specify axis and an element expression for the
-    // computation. In the latter case a corresponding statement would be
-    // constructed automatically.
-
-    // Let's define two variables, i and j - they will be axis in our
-    // computation.
-    VarPtr i = alloc<Var>("i", kInt);
-    VarPtr j = alloc<Var>("j", kInt);
-    std::vector<VarPtr> args = {i, j};
-
-    // Now we can define the body of the tensor computation using these
-    // variables. What this means is that values in our tensor are:
-    //   X[i, j] = i * j
-    ExprPtr body = alloc<Mul>(i, j);
-
-    // Finally, we pass all these pieces together to Tensor constructor:
-    Tensor X = Tensor(buf, args, body);
-    std::cout << "Tensor computation: " << X << std::endl;
+    // Tensor Expressions constitute Tensor Statements, which are used to
+    // represent computation of a given operator or a group of operators from a
+    // fusion group.
+    //
+    // There are three main kinds of tensor statements:
+    //  - block
+    //  - store
+    //  - loop
+    //
+    // A Store represents a store to a single element of a tensor (or to a
+    // group of elements if it's a vectorized store). Store statements,
+    // similarly to Load expressions, have a base and indices, but on top of
+    // that they also include a value - an expression representing what needs
+    // to be stored at the given memory location. Let's create a Store stmt:
+    StmtPtr store_a = Store::make(A, {i, j}, i + j);
+    std::cout << "Store statement: " << *store_a << std::endl;
+    // Prints: Store statement: A[i, j] = i + j;
+
+    // An operator fills the entire tensor, not just a single element, and to
+    // represent this we need to use For stmt: let's wrap our store stmt with
+    // two nested loops to represent that variables i and j need to iterate
+    // over some ranges.
+    ForPtr loop_j_a = For::make(VarHandle(j_var), 0, 32, store_a);
+    ForPtr loop_i_a = For::make(VarHandle(i_var), 0, 64, loop_j_a);
+
+    std::cout << "Nested for loops: " << std::endl << *loop_i_a << std::endl;
     // Prints:
-    // Tensor computation: Tensor X[64, 32]:
+    // Nested for loops:
     // for (int i = 0; i < 64; i++) {
     //   for (int j = 0; j < 32; j++) {
-    //     X[i, j] = i * j;
+    //     A[i, j] = i + j;
     //   }
     // }
 
-    // TODO: Add an example of constructing a Tensor with a complete Stmt.
-
-    // Similarly to how we provide a more convenient way of using handles for
-    // constructing Exprs, Tensors also have a more convenient API for
-    // construction. It is based on Compute API, which takes a name,
-    // dimensions, and a lambda specifying the computation body:
-    Tensor Z = Compute(
-        "Z",
-        {{64, "i"}, {32, "j"}},
-        [](const VarHandle& i, const VarHandle& j) { return i / j; });
-    std::cout << "Tensor computation: " << Z << std::endl;
+    // A Block statement is used when we need a sequence of other statements.
+    // E.g. if a fusion group contains several operators, we initially define
+    // separate loopnest for each of them and put them all into a common block:
+    BufHandle B("B", {64, 32}, kInt);
+    StmtPtr store_b = Store::make(B, {i, j}, A.load(i, j));
+    ForPtr loop_j_b = For::make(VarHandle(j_var), 0, 32, store_b);
+    ForPtr loop_i_b = For::make(VarHandle(i_var), 0, 64, loop_j_b);
+
+    BlockPtr block = Block::make({loop_i_a, loop_i_b});
+    std::cout << "Compound Block statement: " << std::endl
+              << *block << std::endl;
     // Prints:
-    // Tensor computation: Tensor Z[64, 32]:
-    // for (int i = 0; i < 64; i++) {
-    //   for (int j = 0; j < 32; j++) {
-    //     Z[i, j] = i / j;
+    // Compound Block statement:
+    // {
+    //   for (int i = 0; i < 64; i++) {
+    //     for (int j = 0; j < 32; j++) {
+    //       A[i, j] = i + j;
+    //     }
+    //   }
+    //   for (int i = 0; i < 64; i++) {
+    //     for (int j = 0; j < 32; j++) {
+    //       B[i, j] = A[i, j];
+    //     }
     //   }
     // }
 
-    // Tensors might access other tensors and external placeholders in their
-    // expressions. It can be done like so:
-    Placeholder P("P", kInt, {64, 32});
-    Tensor R = Compute(
-        "R",
+    // Manually constructing nested loops and blocks to represent a computation
+    // might be laborious, and instead we can use a 'Compute' API. This API
+    // requires us to specify dimensions and a lambda to compute a single
+    // element of the resulting tensor and returns a `Tensor` structure. This
+    // structure is simply a pair of a buffer that was created to represent the
+    // result of the computation (BufPtr) and a statement representing the
+    // computation itself (StmtPtr).
+    Tensor C = Compute(
+        "C",
         {{64, "i"}, {32, "j"}},
-        [&](const VarHandle& i, const VarHandle& j) {
-          return Z.load(i, j) * P.load(i, j);
-        });
-    std::cout << "Tensor computation: " << R << std::endl;
+        [&](const VarHandle& i, const VarHandle& j) { return i * j; });
+    std::cout << "Stmt produced by 'Compute' API: " << std::endl
+              << *C.stmt() << std::endl;
     // Prints:
-    // Tensor computation: Tensor R[64, 32]:
+    // Stmt produced by 'Compute' API:
     // for (int i = 0; i < 64; i++) {
     //   for (int j = 0; j < 32; j++) {
-    //     R[i, j] = (Z(i, j)) * (P[i, j]);
+    //     C[i, j] = i * j;
     //   }
     // }
 
-    // Placeholders could be thought of as external tensors, i.e. tensors for
-    // which we don't have the element expression. In other words, for `Tensor`
-    // we know an expression specifying how its elements can be computed (a
-    // mathematical formula). For external tensors, or placeholders, we don't
-    // have such an expression. They need to be considered as coming to us as
-    // inputs from outside - we can only load data from them.
-    //
-    // TODO: Show how reductions are represented and constructed
+    // To construct statements to represent computations with reductions, we
+    // can use a 'Reduce' API - it is similar to 'Compute' but takes a couple
+    // of extra arguments defining how to perform the reduction. Let's define a
+    // simple 2D sum of C using that:
+    Tensor D = Reduce(
+        "D",
+        {},
+        Sum(),
+        [&](const VarHandle& i, const VarHandle& j) { return C.load(i, j); },
+        {{64, "i"}, {32, "j"}});
+    std::cout << "Stmt produced by 'Reduce' API: " << std::endl
+              << *D.stmt() << std::endl;
   }
 
-  std::cout << "*** Loopnests and Statements ***" << std::endl;
+  std::cout << "*** Loopnests transformations ***" << std::endl;
   {
-    // Creating a tensor expression is the first step to generate an executable
-    // code for it. A next step is to represent it as a loop nest and apply
-    // various loop transformations in order to get an optimal implementation.
-    // In Halide's or TVM's terms the first step was to define the algorithm of
-    // computation (what to compute?) and now we are getting to the schedule of
-    // the computation (how to compute?).
+    // When a statement for the computation is generated, we might want to
+    // apply some optimizations to it. These transformations allow us to end up
+    // with a statement producing the same results, but more efficiently.
     //
-    // Let's create a simple tensor expression and construct a loop nest for it.
-    Placeholder A("A", kFloat, {64, 32});
-    Placeholder B("B", kFloat, {64, 32});
-    Tensor X = Compute(
-        "X",
+    // Let's look at a couple of transformations that are used in NNC. We will
+    // begin with constructing a Block statement like we did before.
+
+    Tensor C = Compute(
+        "C",
         {{64, "i"}, {32, "j"}},
-        [&](const VarHandle& i, const VarHandle& j) {
-          return A.load(i, j) + B.load(i, j);
-        });
-    Tensor Y = Compute(
-        "Y",
+        [&](const VarHandle& i, const VarHandle& j) { return i * (j + 1); });
+    BufHandle c_buf(C.buf());
+    Tensor D = Compute(
+        "D",
         {{64, "i"}, {32, "j"}},
         [&](const VarHandle& i, const VarHandle& j) {
-          return sigmoid(X.load(i, j));
+          return c_buf.load(i, j) - i;
         });
-    std::cout << "Tensor computation X: " << X << "Tensor computation Y: " << Y
-              << std::endl;
+    StmtPtr block = Block::make({C.stmt(), D.stmt()});
+    std::cout << "Stmt produced by 'Compute' API: " << std::endl
+              << *block << std::endl;
     // Prints:
-    // Tensor computation X: Tensor X[64, 32]:
-    // for (int i = 0; i < 64; i++) {
-    //   for (int j = 0; j < 32; j++) {
-    //     X[i, j] = (A[i, j]) + (B[i, j]);
+    // Stmt produced by 'Compute' API:
+    // {
+    //   for (int i = 0; i < 64; i++) {
+    //     for (int j = 0; j < 32; j++) {
+    //       C[i, j] = i * (j + 1);
+    //     }
     //   }
-    // }
-
-    // Tensor computation Y: Tensor Y[64, 32]:
-    // for (int i = 0; i < 64; i++) {
-    //   for (int j = 0; j < 32; j++) {
-    //     Y[i, j] = sigmoid(X(i, j));
+    //   for (int i_1 = 0; i_1 < 64; i_1++) {
+    //     for (int j_1 = 0; j_1 < 32; j_1++) {
+    //       D[i_1, j_1] = (C[i_1, j_1]) - i_1;
+    //     }
     //   }
     // }
 
-    // Creating a loop nest is as quite simple, we just need to specify a list
-    // of all and a list of output tensors:
-    // NOLINTNEXTLINE(bugprone-argument-comment)
-    LoopNest loopnest(/*outputs=*/{Y}, /*all=*/{X, Y});
-
-    // An IR used in LoopNest is based on tensor statements, represented by
-    // `Stmt` class. Statements are used to specify the loop nest structure, and
-    // to take a sneak peek at them, let's print out what we got right after
-    // creating our LoopNest object:
-    std::cout << *loopnest.root_stmt() << std::endl;
+    // One transformation we can apply to this computation is inlining: i.e.
+    // taking the expression that defines values of C and substituting a load
+    // from C with it.
+    // To do that, we first need to create a special object called LoopNest -
+    // all transformations are methods of this class. To create a loopnest we
+    // need to provide a list of output buffers and the root statement:
+    LoopNest nest(block, {D.buf()});
+
+    // We can always retrieve the Stmt back from LoopNest:
+    std::cout << "LoopNest root stmt: " << std::endl
+              << *nest.root_stmt() << std::endl;
     // Prints:
+    // LoopNest root stmt:
     // {
     //   for (int i = 0; i < 64; i++) {
     //     for (int j = 0; j < 32; j++) {
-    //       X[i, j] = (A[i, j]) + (B[i, j]);
+    //       C[i, j] = i * (j + 1);
     //     }
     //   }
     //   for (int i_1 = 0; i_1 < 64; i_1++) {
     //     for (int j_1 = 0; j_1 < 32; j_1++) {
-    //       Y[i_1, j_1] = sigmoid(X(i_1, j_1));
+    //       D[i_1, j_1] = (C[i_1, j_1]) - i_1;
     //     }
     //   }
     // }
 
-    // To introduce statements let's first look at their three main types (in
-    // fact, there are more than 3 types, but the other types would be easy to
-    // understand once the overall structure is clear):
-    //  1) Block
-    //  2) For
-    //  3) Store
-    //
-    // A `Block` statement is simply a list of other statements.
-    // A `For` is a statement representing one axis of computation. It contains
-    // an index variable (Var), boundaries of the axis (start and end - both are
-    // `Expr`s), and a `Block` statement body.
-    // A `Store` represents an assignment to a tensor element. It contains a Buf
-    // representing the target tensor, a list of expressions for indices of the
-    // element, and the value to be stored, which is an arbitrary expression.
-
-    // Once we've constructed the loop nest, we can apply various tranformations
-    // to it. To begin with, let's inline computation of X into computation of Y
-    // and see what happens to our statements.
-    loopnest.computeInline(loopnest.getLoopBodyFor(X));
-    std::cout << *loopnest.root_stmt() << std::endl;
+    // Now we can apply the inlining transformation:
+    nest.computeInline(C.buf());
+    std::cout << "Stmt after inlining:" << std::endl
+              << *nest.root_stmt() << std::endl;
     // Prints:
+    // Stmt after inlining:
     // {
     //   for (int i = 0; i < 64; i++) {
     //     for (int j = 0; j < 32; j++) {
-    //       Y[i, j] = sigmoid((A[i, j]) + (B[i, j]));
+    //       D[i, j] = i * (j + 1) - i;
     //     }
     //   }
     // }
-    //
-    // As you can see, the first two loops have disappeared and the expression
-    // for X[i,j] has been inserted into the Y[i,j] computation.
-
-    // Loop transformations can be composed, so we can do something else with
-    // our loop nest now. Let's split the inner loop with a factor of 9, for
-    // instance.
-    std::vector<ForPtr> loops = loopnest.getLoopStmtsFor(Y);
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    ForPtr j_inner;
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    ForPtr j_tail;
-    int split_factor = 9;
-    loopnest.splitWithTail(
-        loops[1], // loops[0] is the outer loop, loops[1] is inner
-        split_factor,
-        &j_inner, // further transformations
-        &j_tail);
-    // loops[1] will become the outer loop, j_outer, after splitWithTail.
-    std::cout << *loopnest.root_stmt() << std::endl;
+
+    // We can also apply algebraic simplification to a statement:
+    StmtPtr simplified = IRSimplifier::simplify(nest.root_stmt());
+    std::cout << "Stmt after simplification:" << std::endl
+              << *simplified << std::endl;
     // Prints:
+    // Stmt after simplification:
     // {
     //   for (int i = 0; i < 64; i++) {
-    //     for (int j_outer = 0; j_outer < (32 - 0) / 9; j_outer++) {
-    //       for (int j_inner = 0; j_inner < 9; j_inner++) {
-    //         Y[i, j_outer * 9 + j_inner] = sigmoid((A[i, j_outer * 9 + ...
+    //     for (int j = 0; j < 32; j++) {
+    //       D[i, j] = i * j;
+    //     }
+    //   }
+    // }
+
+    // Many loopnest transformations are stateless and can be applied without
+    // creating a LoopNest object. In fact, we plan to make all transformations
+    // stateless.
+    // splitWithTail is one such transformation: it splits an iteration space
+    // of a given loop into two with a given factor.
+    ForPtr outer_loop = to<For>(to<Block>(simplified)->stmts().front());
+    LoopNest::splitWithTail(outer_loop, 13);
+    // Call simplifier once more to fold some arithmetic.
+    simplified = IRSimplifier::simplify(simplified);
+    std::cout << "Stmt after splitWithTail:" << std::endl
+              << *simplified << std::endl;
+    // Prints:
+    // Stmt after splitWithTail:
+    // {
+    //   for (int i_outer = 0; i_outer < 4; i_outer++) {
+    //     for (int i_inner = 0; i_inner < 13; i_inner++) {
+    //       for (int j = 0; j < 32; j++) {
+    //         D[i_inner + 13 * i_outer, j] = i_inner * j + 13 * (i_outer * j);
     //       }
     //     }
-    //     for (int j_tail = 0; j_tail < (32 - 0) % 9; j_tail++) {
-    //       Y[i, j_tail + ((32 - 0) / 9) * 9] = sigmoid((A[i, j_tail + ...
+    //   }
+    //   for (int i_tail = 0; i_tail < 12; i_tail++) {
+    //     for (int j = 0; j < 32; j++) {
+    //       D[i_tail + 52, j] = i_tail * j + 52 * j;
     //     }
     //   }
     // }
 
-    // TODO: List all available transformations
-    // TODO: Show how statements can be constructed manually
+    // NNC supports a wide range of loop nest transformations, which we are not
+    // listing here. Please refer to documentation in
+    // https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/tensorexpr/loopnest.h
+    // for more details.
   }
 
   std::cout << "*** Codegen ***" << std::endl;
@@ -335,13 +344,14 @@ int main(int argc, char* argv[]) {
     // An ultimate goal of tensor expressions is to be provide a mechanism to
     // execute a given computation in the fastest possible way. So far we've
     // looked at how we could describe what computation we're interested in, but
-    // we haven't looked at how to actually execute it. So far all we've been
-    // dealing with was just symbols with no actual data associated, in this
-    // section we would look at how we can bridge that gap.
+    // we haven't looked at how to actually execute it.
+    //
+    // All we've been dealing with was just symbols with no actual data
+    // associated, in this section we would look at how we can bridge that gap.
 
     // Let's start by constructing a simple computation for us to work with:
-    Placeholder A("A", kInt, {64, 32});
-    Placeholder B("B", kInt, {64, 32});
+    BufHandle A("A", {64, 32}, kInt);
+    BufHandle B("B", {64, 32}, kInt);
     Tensor X = Compute(
         "X",
         {{64, "i"}, {32, "j"}},
@@ -349,7 +359,8 @@ int main(int argc, char* argv[]) {
           return A.load(i, j) + B.load(i, j);
         });
 
-    // And let's lower it to a loop nest, as we did in the previous section:
+    // And let's lower it to a loop nest, as we did in the previous section. We
+    // can pass Tensor object directly:
     LoopNest loopnest({X});
     std::cout << *loopnest.root_stmt() << std::endl;
     // Prints:
@@ -416,6 +427,115 @@ int main(int argc, char* argv[]) {
     // X[10] = A[10] + B[10] = 8
   }
 
-  // TODO: Show how TorchScript IR is translated to TE
+  std::cout << "*** Lowering TorchScript IR to TensorExpr IR ***" << std::endl;
+  {
+    // This section requires a LLVM-enabled PyTorch build, so we have to use a
+    // guard:
+#ifdef TORCH_ENABLE_LLVM
+
+    // Often we would like to convert a TorchScript IR to TE rather than
+    // construct TE IR from scratch.  NNC provides an API to perform such
+    // lowering: it takes a TorchScript graph and returns an object that can be
+    // used to invoke the generated kernel.
+    // This API is currently used by the TorchScript JIT fuser and can also be
+    // used ahead of time to pre-compile parts of a model.
+    //
+    // To get familiar with this API let's first start with defining a simple
+    // TorchScript graph:
+    const auto graph_string = R"IR(
+        graph(%A : Float(5, 3, strides=[3, 1], device=cpu),
+              %B : Float(5, 3, strides=[3, 1], device=cpu)):
+          %AB : Float(5, 3, strides=[3, 1]) = aten::mul(%A, %B)
+          %one : int = prim::Constant[value=1]()
+          %AAB : Float(5, 3, strides=[3, 1]) = aten::mul(%A, %AB)
+          %AAB_plus_B: Float(5, 3, strides=[3, 1]) = aten::add(%AAB, %B, %one)
+          return (%AAB_plus_B))IR";
+    auto graph = std::make_shared<torch::jit::Graph>();
+    parseIR(graph_string, &*graph);
+
+    // This graph defines a simple computation of A*A*B + B where A and B are
+    // input 5x3 tensors.
+
+    // To lower this TorchScript graph to TE, we just need to create a
+    // TensorExprKernel object. In its constructor it constructs the
+    // corresponding TE IR and compiles it for the given backend (in this
+    // example for CPU using LLVM compiler).
+    TensorExprKernel kernel(graph);
+
+    // We can retrieve the generated TE stmt from the kernel object:
+    StmtPtr kernel_stmt = kernel.getCodeGenStmt();
+    std::cout << "TE Stmt constructed from TorchScript: " << std::endl
+              << *kernel_stmt << std::endl;
+    // Prints:
+    // TE Stmt constructed from TorchScript:
+    // {
+    //   for (int v = 0; v < 5; v++) {
+    //     for (int _tail_tail = 0; _tail_tail < 3; _tail_tail++) {
+    //       aten_add[_tail_tail + 3 * v] = (tA[_tail_tail + 3 * v]) *
+    //       ((tA[_tail_tail + 3 * v]) * (tB[_tail_tail + 3 * v])) +
+    //       (tB[_tail_tail + 3 * v]);
+    //     }
+    //   }
+    // }
+
+    // We can also examine generated LLVM IR and assembly code:
+    std::cout << "Generated LLVM IR: " << std::endl;
+    auto ir_str = kernel.getCodeText("ir");
+    printLinesToFrom(ir_str, 15, 20);
+    // Prints:
+    // Generated LLVM IR:
+    //   %9 = bitcast float* %2 to <8 x float>*
+    //   %10 = load <8 x float>, <8 x float>* %9 ...
+    //   %11 = bitcast float* %5 to <8 x float>*
+    //   %12 = load <8 x float>, <8 x float>* %11 ...
+    //   %13 = fmul <8 x float> %10, %12
+    //   %14 = fmul <8 x float> %10, %13
+
+    std::cout << "Generated assembly: " << std::endl;
+    auto asm_str = kernel.getCodeText("asm");
+    printLinesToFrom(asm_str, 10, 15);
+    // Prints:
+    // Generated assembly:
+    //         vmulps  %ymm1, %ymm0, %ymm2
+    //         vfmadd213ps     %ymm1, %ymm0, %ymm2
+    //         vmovups %ymm2, (%rax)
+    //         vmovss  32(%rcx), %xmm0
+    //         vmovss  32(%rdx), %xmm1
+    //         vmulss  %xmm1, %xmm0, %xmm2
+
+    // We can also execute the generated kernel:
+    auto A =
+        at::ones({5, 3}, torch::TensorOptions(torch::kCPU).dtype(at::kFloat)) *
+        2.0;
+    auto B =
+        at::ones({5, 3}, torch::TensorOptions(torch::kCPU).dtype(at::kFloat)) *
+        3.0;
+    std::vector<at::Tensor> inputs = {A, B};
+    std::vector<torch::IValue> stack = torch::fmap<torch::IValue>(inputs);
+    kernel.run(stack);
+    auto R = stack[0].toTensor();
+
+    // Let's print one of the elements from the result tensor to verify that the
+    // computation did happen and was correct:
+    std::cout << "R[2][2] = " << R[2][2] << std::endl;
+    // Prints:
+    // R[2][2] = 15
+    // [ CPUFloatType{} ]
+#endif
+  }
   return 0;
 }
+
+void printLinesToFrom(const std::string& input_str, int from, int to) {
+  std::istringstream f(input_str);
+  std::string s;
+  int idx = 0;
+  while (getline(f, s)) {
+    if (idx > from) {
+      std::cout << s << "\n";
+    }
+    if (idx++ > to) {
+      break;
+    }
+  }
+}

From 07c5cb8c48d655ba73adc2da2b88399f3ab48638 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Fri, 27 Aug 2021 17:37:05 -0700
Subject: [PATCH 317/530] [Static Runtime] Optimize memory planner
 initialization (#64101)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64101

Checking `getOutOfPlaceOperation(n)` is a very expensive operation, especially in multithreaded environments, due to a lock acquisition when the NNC cache is queried. This slows down the memory planner initialization time, and by extension, the latency for the first static runtime inference.

There are two optimizations in this diff:
* Cache the result of `p_node->has_out_variant()` to avoid the call to `getOutOfPlaceOperation`. This speeds up calls to `canReuseInputOutputs`, which in turn speeds up `isOptimizableContainerType`
* Precompute all `isOptimizableContainerType` during static runtime initialization to avoid a pass over all of each node's inputs.

Test Plan: All unit tests pass: `buck test caffe2/benchmarks/static_runtime/...`

Reviewed By: movefast1990

Differential Revision: D30595579

fbshipit-source-id: 70aaa7af9589c739c672788bf662f711731864f2
---
 torch/csrc/jit/runtime/static/impl.cpp | 31 ++++++++++++++++++--------
 torch/csrc/jit/runtime/static/impl.h   | 11 +++++++++
 torch/csrc/jit/runtime/static/ops.cpp  | 29 +++++++++++++++---------
 torch/csrc/jit/runtime/static/ops.h    |  8 +++++--
 4 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index 643842a74691c..ee8e9038b1c48 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -319,7 +319,9 @@ LivenessMap GetLivenessMap(
 //   first: Values that are candidates for memory planning
 //   second: A deterministc order of all values
 std::pair<std::vector<const Value*>, std::vector<const Value*>>
-GetMemoryPlanningCandidates(const std::shared_ptr<torch::jit::Graph>& graph) {
+GetMemoryPlanningCandidates(
+    const std::shared_ptr<torch::jit::Graph>& graph,
+    const FastMap<Node*, bool>& node_has_out_variant) {
   // for determinism
   FastSet<const Value*> seen_values;
   std::vector<const Value*> all_values;
@@ -328,7 +330,8 @@ GetMemoryPlanningCandidates(const std::shared_ptr<torch::jit::Graph>& graph) {
   // these need to be removed from "can_reuse" after analyzing all nodes
   FastSet<const Value*> cannot_reuse;
   for (auto* n : graph->nodes()) {
-    bool can_reuse_inputs_outputs = canReuseInputsOutputs(n);
+    bool can_reuse_inputs_outputs =
+        canReuseInputsOutputs(n, node_has_out_variant);
     for (const auto* v : n->inputs()) {
       if (!seen_values.count(v)) {
         all_values.emplace_back(v);
@@ -628,6 +631,7 @@ StaticModule::StaticModule(
 
   // construct SSA definition for non-constant nodes
   int node_idx = 0;
+  FastMap<Node*, bool> node_has_out_variant;
   for (Node* node : graph_->nodes()) {
     if (node->kind() == prim::Constant) {
       continue;
@@ -639,14 +643,22 @@ StaticModule::StaticModule(
       input_ssa_defs.emplace_back(value_to_ssa_def.at(input));
     }
     node_inputs_ssa_def_map_[node_idx] = input_ssa_defs;
-    nodes_.emplace_back(
-        ProcessedNode(node, std::move(ivalue_inputs), opts.enable_out_variant));
+    auto pnode =
+        ProcessedNode(node, std::move(ivalue_inputs), opts.enable_out_variant);
+    node_has_out_variant.emplace(node, pnode.has_out_variant());
+    nodes_.emplace_back(std::move(pnode));
     for (const auto i : c10::irange(node->outputs().size())) {
       value_to_ivalue[node->outputs()[i]] = nullptr;
       value_to_ssa_def[node->outputs()[i]] = std::make_pair(node_idx, i);
     }
     node_idx++;
   }
+  for (auto& pnode : nodes_) {
+    if (pnode.outputs().size() == 1 &&
+        isOptimizableContainerType(pnode.node(), node_has_out_variant)) {
+      node_is_optimizable_container_type_.emplace(pnode.node());
+    }
+  }
   for (auto output : graph_->outputs()) {
     output_ssa_defs_.emplace_back(value_to_ssa_def[output]);
   }
@@ -657,7 +669,7 @@ StaticModule::StaticModule(
 
   if (opts_.optimize_memory) {
     auto lm = GetLivenessMap(graph_, external_values_, alias_db);
-    auto values = GetMemoryPlanningCandidates(graph_);
+    auto values = GetMemoryPlanningCandidates(graph_, node_has_out_variant);
     value_to_same_storage_values_ =
         GenerateSameStorageValues(lm, external_values_, values, alias_db);
   }
@@ -1177,7 +1189,8 @@ void StaticRuntime::check_for_memory_leak(bool output_returned) {
         // check for intermediates
         if (!ival->isNone()) {
           TORCH_CHECK(
-              ival->isTensor() || isOptimizableContainerType(pnode.node()),
+              ival->isTensor() ||
+                  static_module_.is_optimizable_container_type(pnode.node()),
               error_msg);
           if (ival->isTensor()) {
             const auto& t = ival->toTensor();
@@ -1262,9 +1275,9 @@ MemoryPlanner::MemoryPlanner(
           const auto& type = out_v->type();
           if (type->castRaw<TensorType>()) {
             managed_tensor_values.insert(out_v);
-          } else if (isOptimizableContainerType(pnode.node())) {
-            // We "leak" certain container types because their allocations take
-            // a long time
+          } else if (runtime->is_optimizable_container_type(pnode.node())) {
+            // We "leak" certain container types because their allocations
+            // take a long time
             leaked_values.insert(out_v);
           }
         }
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
index 6cff047b4d2ce..d8a99f78cad2d 100644
--- a/torch/csrc/jit/runtime/static/impl.h
+++ b/torch/csrc/jit/runtime/static/impl.h
@@ -160,6 +160,11 @@ class TORCH_API StaticModule {
     return nodes_;
   }
 
+  bool is_optimizable_container_type(Node* n) const {
+    auto it = node_is_optimizable_container_type_.find(n);
+    return it != node_is_optimizable_container_type_.end();
+  }
+
   const c10::optional<c10::FunctionSchema>& schema() const {
     return schema_;
   }
@@ -204,6 +209,8 @@ class TORCH_API StaticModule {
   // map a value to the set of values that may share the same storage with it
   FastMap<const Value*, std::vector<const Value*>>
       value_to_same_storage_values_;
+
+  FastSet<Node*> node_is_optimizable_container_type_;
 };
 
 class TORCH_API StaticRuntime {
@@ -287,6 +294,10 @@ class TORCH_API StaticRuntime {
 
   void check_for_memory_leak(bool output_returned = true);
 
+  bool is_optimizable_container_type(Node* n) const {
+    return static_module_.is_optimizable_container_type(n);
+  }
+
  private:
   // helper method for copying input args/kwargs into inputs_
   void set_inputs(
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index f171d2889f551..3b586689a6c5c 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -25,6 +25,7 @@
 #include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
 #include <torch/csrc/jit/tensorexpr/loopnest.h>
 #include <mutex>
+#include <unordered_map>
 
 C10_DEFINE_bool(
     static_runtime_enable_fast_math,
@@ -312,27 +313,33 @@ bool hasVarArgs(Node* n) {
   return false;
 }
 
-// Expensive check, use sparingly.
-// This is needed to make sure that we only switch to out variants for the
-// supported overloads, which is checked in the `Generate` step in
-// `SROperatorRegistry()->Create(op_name)->Generate(n)`
-bool canReuseInputsOutputs(Node* n) {
+bool canReuseInputsOutputs(
+    Node* n,
+    const FastMap<Node*, bool>& node_has_out_variant) {
+  auto it = node_has_out_variant.find(n);
+  if (it != node_has_out_variant.end()) {
+    return it->second;
+  }
   return getOutOfPlaceOperation(n) != nullptr;
 }
 
 // returns true if the producers of the inputs
 // to this operations are out of place.
 // This means the IValues will not change run to run
-bool inputsCanRunOutOfPlace(Node* n) {
+bool inputsCanRunOutOfPlace(
+    Node* n,
+    const FastMap<Node*, bool>& node_has_out_variant) {
   for (auto* input : n->inputs()) {
-    if (!canReuseInputsOutputs(input->node())) {
+    if (!canReuseInputsOutputs(input->node(), node_has_out_variant)) {
       return false;
     }
   }
   return true;
 }
 
-bool isOptimizableContainerType(Node* n) {
+bool isOptimizableContainerType(
+    Node* n,
+    const FastMap<Node*, bool>& node_has_out_variant) {
   const auto& type = n->output()->type();
   bool is_supported_type = false;
   if (type->kind() == TypeKind::ListType) {
@@ -348,7 +355,7 @@ bool isOptimizableContainerType(Node* n) {
         });
     is_supported_type = iter != types.end();
   }
-  return is_supported_type && inputsCanRunOutOfPlace(n);
+  return is_supported_type && inputsCanRunOutOfPlace(n, node_has_out_variant);
 }
 
 REGISTER_OPERATOR_FUNCTOR(
@@ -356,7 +363,7 @@ REGISTER_OPERATOR_FUNCTOR(
     prim_ListConstruct,
     [](Node* n) -> SROperator {
       const auto& type = n->output()->type()->expectRef<ListType>();
-      bool can_optimize = isOptimizableContainerType(n);
+      bool can_optimize = isOptimizableContainerType(n, FastMap<Node*, bool>());
       return [can_optimize, &type](ProcessedNode* p_node) {
         const auto& out_l = p_node->Output(0);
         if (!out_l.isNone() && can_optimize) {
@@ -376,7 +383,7 @@ REGISTER_OPERATOR_FUNCTOR(
     prim::TupleConstruct,
     prim_TupleConstruct,
     [](Node* n) -> SROperator {
-      bool can_optimize = isOptimizableContainerType(n);
+      bool can_optimize = isOptimizableContainerType(n, FastMap<Node*, bool>());
       return [can_optimize](ProcessedNode* p_node) {
         const auto& out_l = p_node->Output(0);
         if (!out_l.isNone() && can_optimize) {
diff --git a/torch/csrc/jit/runtime/static/ops.h b/torch/csrc/jit/runtime/static/ops.h
index ff5d69e1cb895..311143ca7392f 100644
--- a/torch/csrc/jit/runtime/static/ops.h
+++ b/torch/csrc/jit/runtime/static/ops.h
@@ -133,8 +133,12 @@ bool opIsRegistered(const c10::Symbol& op_name);
 // as native ops in Static Runtime
 bool nativeOpIsRegistered(const c10::Symbol& op_name);
 
-bool canReuseInputsOutputs(Node* n);
-bool isOptimizableContainerType(Node* n);
+bool canReuseInputsOutputs(
+    Node* n,
+    const FastMap<Node*, bool>& node_has_out_variant);
+bool isOptimizableContainerType(
+    Node* n,
+    const FastMap<Node*, bool>& node_has_out_variant);
 
 std::function<void(ProcessedNode*)> getOutOfPlaceOperation(Node* n);
 std::function<void(ProcessedNode*)> getNativeOperation(Node* n);

From 9ccb9299e072bb611fc67169e6f0d1fb9e49bedd Mon Sep 17 00:00:00 2001
From: Ilqar Ramazanli <iramazanli@fb.com>
Date: Fri, 27 Aug 2021 18:51:09 -0700
Subject: [PATCH 318/530] To add Nesterov Adam algorithm description to
 documentation (#63793)

Summary:
It has been discussed before that adding description of Optimization algorithms to PyTorch Core documentation may result in a nice Optimization research tutorial. In the following tracking issue we mentioned about all the necessary algorithms and links to the originally published paper  https://github.com/pytorch/pytorch/issues/63236.

In this PR we are adding description of Nesterov Adam Algorithm to the documentation.  For more details, we refer to the paper  https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ

<img width="439" alt="NAdam" src="https://user-images.githubusercontent.com/73658284/131185124-e81b2edf-33d9-4a9d-a7bf-f7e5eea47d7c.png">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63793

Reviewed By: NivekT

Differential Revision: D30617057

Pulled By: iramazanli

fbshipit-source-id: cd2054b0d9b6883878be74576e86e307f32f1435
---
 torch/optim/nadam.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/torch/optim/nadam.py b/torch/optim/nadam.py
index 55a790610c5a5..deaaf20b1d710 100644
--- a/torch/optim/nadam.py
+++ b/torch/optim/nadam.py
@@ -6,7 +6,34 @@
 class NAdam(Optimizer):
     r"""Implements NAdam algorithm.
 
-    It has been proposed in `Incorporating Nesterov Momentum into Adam`_.
+    .. math::
+       \begin{aligned}
+            &\rule{110mm}{0.4pt}                                                                 \\
+            &\textbf{input}      : \gamma_t \text{ (lr)}, \: \beta_1,\beta_2 \text{ (betas)},
+                \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}                   \\
+            &\hspace{13mm} \: \lambda \text{ (weight decay)}, \:\psi \text{ (momentum decay)}    \\
+            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
+                v_0 \leftarrow 0 \text{ ( second moment)}                                 \\[-1.ex]
+            &\rule{110mm}{0.4pt}                                                                 \\
+            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
+            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
+            &\hspace{5mm}if \: \lambda \neq 0                                                    \\
+            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
+            &\hspace{5mm} \mu_t \leftarrow \beta_1 \big(1 - \frac{1}{2}  0.96^{t \psi} \big)     \\
+            &\hspace{5mm} \mu_{t+1} \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{(t+1)\psi}\big)\\
+            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
+            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
+            &\hspace{5mm}\widehat{m_t} \leftarrow \mu_{t+1} m_t/(1-\prod_{i=1}^{t+1}\mu_i)\\[-1.ex]
+            & \hspace{11mm} + (1-\mu_t) g_t /(1-\prod_{i=1}^{t} \mu_{i})                         \\
+            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
+            &\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
+                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
+            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
+            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
+            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
+       \end{aligned}
+
+    For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_.
 
     Args:
         params (iterable): iterable of parameters to optimize or dicts defining

From a3a7a67048c11ee74fbdd54037a6dbaf90367964 Mon Sep 17 00:00:00 2001
From: Yuchen Huang <hyc@fb.com>
Date: Fri, 27 Aug 2021 18:57:22 -0700
Subject: [PATCH 319/530] [iOS][GPU] Consolidate array and non-array kernel for
 hardswish (#63369)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63369

ghstack-source-id: 136918152

(Note: this ignores all push blocking failures!)

Test Plan:
- `buck test pp-macos`
- Op tests in PyTorchPlayground app
- Run mobilenetv3 test

https://pxl.cl/1Ncls

Reviewed By: xta0

Differential Revision: D30354454

fbshipit-source-id: 88bf4f8b5871e63170161b3f3e44f99b8a3086c6
---
 aten/src/ATen/native/metal/MetalShaders.h     | 41 ++++++++++---------
 .../native/metal/mpscnn/tests/MPSCNNTests.h   |  1 +
 .../native/metal/mpscnn/tests/MPSCNNTests.mm  | 12 ++++++
 .../metal/mpscnn/tests/MetalOpTestRunner.mm   |  1 +
 .../ATen/native/metal/ops/MetalHardswish.mm   |  4 +-
 5 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/aten/src/ATen/native/metal/MetalShaders.h b/aten/src/ATen/native/metal/MetalShaders.h
index 5c256723a59e5..0ee703f2ee261 100644
--- a/aten/src/ATen/native/metal/MetalShaders.h
+++ b/aten/src/ATen/native/metal/MetalShaders.h
@@ -393,31 +393,32 @@ kernel void clamp(texture2d_array<half, access::read> in_arr[[texture(0), functi
     }
 }
 
-kernel void hardswish(texture2d_array<half, access::read> in[[texture(0)]],
-                      texture2d_array<half, access::write> out[[texture(1)]],
+constant bool hardswish_is_arr = (ushort_arg_0 > 1 || ushort_arg_1 > 4);
+constant bool hardswish_is_tex = !hardswish_is_arr;
+kernel void hardswish(texture2d_array<half, access::read> in_arr[[texture(0), function_constant(hardswish_is_arr)]],
+                      texture2d<half, access::read> in_tex[[texture(0), function_constant(hardswish_is_tex)]],
+                      texture2d_array<half, access::write> out_arr[[texture(1), function_constant(hardswish_is_arr)]],
+                      texture2d<half, access::write> out_tex[[texture(1), function_constant(hardswish_is_tex)]],
                       ushort3 gid[[thread_position_in_grid]]) {
-    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+    const ushort oH = ushort_arg_2;
+    const ushort oW = ushort_arg_3;
+    if (gid.x >= oW || gid.y >= oH) {
         return;
     }
     ushort2 gid_ = gid.xy;
-    half4 value = in.read(gid_, gid.z);
-    half4 mask1 = half4(value < 3.0);
-    half4 mask2 = half4(value > -3.0);
-    half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
-    out.write(outval, gid_, gid.z);
-}
-
-kernel void hardswish_nonarray(texture2d<half, access::read> in[[texture(0)]],
-                               texture2d<half, access::write> out[[texture(1)]],
-                               ushort2 gid[[thread_position_in_grid]]) {
-    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
-        return;
+    if (hardswish_is_arr) {
+      half4 value = in_arr.read(gid_, gid.z);
+      half4 mask1 = half4(value < 3.0);
+      half4 mask2 = half4(value > -3.0);
+      half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
+      out_arr.write(outval, gid_, gid.z);
+    } else {
+      half4 value = in_tex.read(gid_);
+      half4 mask1 = half4(value < 3);
+      half4 mask2 = half4(value > -3.0);
+      half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
+      out_tex.write(outval, gid_);
     }
-    half4 value = in.read(gid);
-    half4 mask1 = half4(value < 3);
-    half4 mask2 = half4(value > -3.0);
-    half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
-    out.write(outval, gid);
 }
 
 constant bool out_is_arr = (ushort_arg_3 > 1 || ushort_arg_2 > 4);
diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.h b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.h
index ee992d9db5abd..599f2ceb64f4c 100644
--- a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.h
+++ b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.h
@@ -41,6 +41,7 @@ bool test_softmax();
 bool test_sigmoid();
 bool test_hardsigmoid();
 bool test_hardswish();
+bool test_hardswish2();
 bool test_upsampling_nearest2d_vec();
 bool test_upsampling_nearest2d_vec2();
 bool test_adaptive_avg_pool2d();
diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm
index 69497a976a130..5a8f6de86996b 100644
--- a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm
+++ b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm
@@ -262,6 +262,18 @@ bool test_hardswish() {
   });
 }
 
+bool test_hardswish2() {
+  __block std::vector<int64_t> size{1, 3, 44, 44};
+  return TEST(size, __PRETTY_FUNCTION__, ^bool {
+    auto X =
+        at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat)) * 12 - 6;
+    auto X2 = X.metal();
+    auto Y1 = at::hardswish_(X);
+    auto Y2 = at::hardswish_(X2).cpu();
+    return almostEqual(Y1, Y2);
+  });
+}
+
 bool test_addmm() {
   bool result = true;
   for (int i = 0; i < ITER_COUNT; ++i) {
diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm b/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm
index d8b69adcc9d1e..f337e1dfc824e 100644
--- a/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm
+++ b/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm
@@ -69,6 +69,7 @@ - (void)registerTests {
   REG_TEST("test_sigmoid", test_sigmoid);
   REG_TEST("test_hardsigmoid", test_hardsigmoid);
   REG_TEST("test_hardswish", test_hardswish);
+  REG_TEST("test_hardswish2", test_hardswish2);
   REG_TEST("test_upsampling_nearest2d_vec", test_upsampling_nearest2d_vec);
   REG_TEST("test_upsampling_nearest2d_vec2", test_upsampling_nearest2d_vec2);
   REG_TEST("test_adaptive_avg_pool2d", test_adaptive_avg_pool2d);
diff --git a/aten/src/ATen/native/metal/ops/MetalHardswish.mm b/aten/src/ATen/native/metal/ops/MetalHardswish.mm
index 8d3526a4c6b2a..d571e483233dd 100644
--- a/aten/src/ATen/native/metal/ops/MetalHardswish.mm
+++ b/aten/src/ATen/native/metal/ops/MetalHardswish.mm
@@ -24,9 +24,9 @@
   id<MTLComputeCommandEncoder> encoder =
       [commandBuffer.buffer computeCommandEncoder];
   id<MTLComputePipelineState> state = [[MetalContext sharedInstance]
-      specializedPipelineState:mpscnn::kernelFor(
-                                   X, "hardswish", "hardswish_nonarray")
+      specializedPipelineState:"hardswish"
                      Constants:@[
+                       @(X.numberOfImages),
                        @(X.featureChannels),
                        @(X.height),
                        @(X.width)

From 0d0605eaa9243c938faddd3fb60f922c4a48c953 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Fri, 27 Aug 2021 20:58:20 -0700
Subject: [PATCH 320/530] [quant][graphmode][fx] Add reference quantized linear
 module (#63627)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63627

Added reference quantized linear module for the custom backend flow, the reference quantized module will
have the following code:
```
        w(float) -- quant - dequant \
        x(float) ------------- F.linear ---
```
In the full model, we will see
```
        w(float) -- quant - *dequant \
        x -- quant --- *dequant --  *F.linear --- *quant - dequant
```
and the backend should be able to fuse the ops with `*` into a quantized linear

Test Plan:
python test/test_quantization.py TestQuantizeFx.test_conv_linear_reference

Imported from OSS

Reviewed By: vkuzo

Differential Revision: D30504750

fbshipit-source-id: 5729921745c2b6a0fb344efc3689f3b170e89500
---
 .../core/test_quantized_module.py             |  51 +++----
 test/quantization/fx/test_quantize_fx.py      |  72 +++++++++-
 .../quantized/_reference/modules/__init__.py  |   2 -
 .../_reference/modules/linear_relu.py         |  28 ----
 .../nn/quantized/_reference/modules/linear.py | 124 +++++++++++++-----
 .../nn/quantized/_reference/modules/utils.py  |  45 +++++++
 .../quantization/fx/quantization_patterns.py  |  19 ++-
 torch/quantization/quantization_mappings.py   |   5 +-
 8 files changed, 240 insertions(+), 106 deletions(-)
 delete mode 100644 torch/nn/intrinsic/quantized/_reference/modules/linear_relu.py
 create mode 100644 torch/nn/quantized/_reference/modules/utils.py

diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py
index 10d5831e87758..bc8a6b397eef8 100644
--- a/test/quantization/core/test_quantized_module.py
+++ b/test/quantization/core/test_quantized_module.py
@@ -6,7 +6,6 @@
 import torch.nn.quantized as nnq
 import torch.nn.quantized._reference as nnqr
 import torch.nn.quantized.dynamic as nnqd
-import torch.nn.functional as F
 import torch.quantization
 
 from torch.quantization import (
@@ -70,24 +69,21 @@ def test_linear_api(self):
             [4, 8],
             [True, False],
             [True, False],
-            [True, False],
             [True, False])
         for (batch_size, in_features, out_features, use_bias,
-             use_fused, per_channel, is_reference) in options:
+             use_fused, per_channel) in options:
             self._test_linear_api_impl(
                 batch_size, in_features, out_features, use_bias, use_fused,
-                per_channel, is_reference)
+                per_channel)
 
-    def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, use_fused, per_channel, is_reference):
+    def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, use_fused, per_channel):
         if torch.backends.quantized.engine == 'qnnpack':
             per_channel = False
 
-        # (use_fused, is_reference) -> quantized class
+        # use_fused -> quantized class
         class_map = {
-            (True, True) : nniqr.LinearReLU,
-            (True, False) : nniq.LinearReLU,
-            (False, True) : nnqr.Linear,
-            (False, False) : nnq.Linear,
+            True: nniq.LinearReLU,
+            False: nnq.Linear,
         }
 
         W = torch.rand(out_features, in_features).float()
@@ -107,7 +103,7 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias,
         B = torch.rand(out_features).float() if use_bias else None
         scale = 0.5
         zero_point = 3
-        qlinear = class_map[(use_fused, is_reference)](in_features, out_features)
+        qlinear = class_map[use_fused](in_features, out_features)
 
         qlinear_copy = qlinear  # deepcopy does not work right now
         # qlinear_copy = copy.deepcopy(qlinear)
@@ -127,21 +123,11 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias,
 
         # Check if the module implementation matches calling the
         # ops directly
-        if is_reference:
-            weight = qlinear._qweight
-            bias = qlinear._bias
-            weight_dequant = weight.dequantize()
-            X_q_dq = X_q.dequantize()
-            Z_ref = F.linear(X_q_dq, weight_dequant, bias)
-            if use_fused:
-                Z_ref = F.relu(Z_ref, inplace=True)
-            Z_ref = torch.quantize_per_tensor(Z_ref, scale, zero_point, torch.quint8)
+        W_pack = qlinear._packed_params._packed_params
+        if use_fused:
+            Z_ref = torch.ops.quantized.linear_relu(X_q, W_pack, scale, zero_point)
         else:
-            W_pack = qlinear._packed_params._packed_params
-            if use_fused:
-                Z_ref = torch.ops.quantized.linear_relu(X_q, W_pack, scale, zero_point)
-            else:
-                Z_ref = torch.ops.quantized.linear(X_q, W_pack, scale, zero_point)
+            Z_ref = torch.ops.quantized.linear(X_q, W_pack, scale, zero_point)
 
         self.assertEqual(Z_ref, Z_q)
         self.assertTrue(
@@ -163,16 +149,12 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias,
             else:
                 self.assertEqual(model_dict[key], loaded_dict[key])
 
-        loaded_qlinear = class_map[(use_fused, is_reference)](
+        loaded_qlinear = class_map[use_fused](
             in_features, out_features)
         loaded_qlinear.load_state_dict(loaded_dict)
-        if is_reference:
-            self.assertEqual(qlinear._qweight, loaded_qlinear._qweight)
-            self.assertEqual(qlinear._bias, loaded_qlinear._bias)
-        else:
-            linear_unpack = torch.ops.quantized.linear_unpack
-            self.assertEqual(linear_unpack(qlinear._packed_params._packed_params),
-                             linear_unpack(loaded_qlinear._packed_params._packed_params))
+        linear_unpack = torch.ops.quantized.linear_unpack
+        self.assertEqual(linear_unpack(qlinear._packed_params._packed_params),
+                         linear_unpack(loaded_qlinear._packed_params._packed_params))
         self.assertEqual(qlinear.scale, loaded_qlinear.scale)
         self.assertEqual(qlinear.zero_point, loaded_qlinear.zero_point)
         # make sure loaded_qlinear has the same dir as qlinear since
@@ -180,8 +162,7 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias,
         self.checkScriptable(loaded_qlinear, [[X_q]], check_save_load=True)
         self.assertTrue(dir(qlinear) == dir(loaded_qlinear))
         self.assertEqual(qlinear._weight_bias(), loaded_qlinear._weight_bias())
-        if not is_reference:
-            self.assertEqual(qlinear._weight_bias(), torch.ops.quantized.linear_unpack(qlinear._packed_params._packed_params))
+        self.assertEqual(qlinear._weight_bias(), torch.ops.quantized.linear_unpack(qlinear._packed_params._packed_params))
         Z_q2 = loaded_qlinear(X_q)
         self.assertEqual(Z_q, Z_q2)
 
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 762919eeb04ea..7ae29e03f6a46 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -3,6 +3,7 @@
 import torch.nn.functional as F
 import torch.nn as nn
 import torch.nn.quantized as nnq
+import torch.nn.quantized._reference as nnqr
 import torch.nn.quantized.dynamic as nnqd
 import torch.nn.intrinsic as nni
 import torch.nn.intrinsic.quantized as nniq
@@ -571,7 +572,7 @@ def forward(self, x):
                 LinearModule,
                 (),
                 (linear_module_input,),
-                ns.call_module(nn.Linear) if is_reference else ns.call_module(nnqd.Linear),
+                ns.call_module(nnqr.Linear) if is_reference else ns.call_module(nnqd.Linear),
                 None,
             ),
             (
@@ -579,7 +580,7 @@ def forward(self, x):
                 LinearModule,
                 (),
                 (linear_module_input,),
-                ns.call_module(nn.Linear if is_reference else nnq.Linear),
+                ns.call_module(nnqr.Linear if is_reference else nnq.Linear),
                 None,
             ),
         ]
@@ -608,6 +609,13 @@ def test_conv_linear_reference(self):
         """ Test quantizing functional conv and linear with reference option
         """
         tests = self._get_conv_linear_test_cases(is_reference=True)
+
+        def _get_keys(prefix, is_dynamic):
+            all_keys = [prefix + "." + k for k in ["weight_qscheme", "weight_dtype"]]
+            if not is_dynamic:
+                all_keys.extend([prefix + "." + k for k in ["weight_scale", "weight_zero_point"]])
+            return all_keys
+
         for (is_dynamic, ModuleClass, module_constructor_inputs,
              inputs, quantized_node, weight_prepack_node) in tests:
             quant_type = QuantType.DYNAMIC if is_dynamic else QuantType.STATIC
@@ -623,13 +631,19 @@ def test_conv_linear_reference(self):
             qr = result_dict["quantized_reference"]
 
             def checkWeightQParams(model):
-                for module_name in ("linear", "conv"):
+                for module_name in ("conv",):
                     if hasattr(model, module_name):
                         self.assertTrue(hasattr(qr.get_submodule(module_name), "_weight_qparams"))
                         self.assertTrue("Reference" in qr.get_submodule(module_name)._get_name())
+                for module_name in ("linear",):
+                    if hasattr(model, module_name):
+                        self.assertTrue(hasattr(qr.get_submodule(module_name), "weight_qscheme"))
+                        self.assertTrue(hasattr(qr.get_submodule(module_name), "weight_scale"))
+                        self.assertTrue(hasattr(qr.get_submodule(module_name), "weight_zero_point"))
+                        self.assertTrue("Reference" in qr.get_submodule(module_name)._get_name())
 
-            def checkSerDeser(model):
-                for module_name in ("linear", "conv"):
+            def checkSerDeser(model, is_dynamic):
+                for module_name in ("conv",):
                     if hasattr(model, module_name):
                         # make sure seralization works
                         state_dict = copy.deepcopy(model.state_dict())
@@ -641,6 +655,20 @@ def checkSerDeser(model):
                         module._weight_qparams["scale"] = None
                         model.load_state_dict(state_dict)
                         self.assertTrue(torch.equal(prev_scale, module._weight_qparams["scale"]))
+                for module_name in ("linear",):
+                    if hasattr(model, module_name):
+                        # make sure seralization works
+                        state_dict = copy.deepcopy(model.state_dict())
+                        all_keys = _get_keys(module_name, is_dynamic)
+                        for key in all_keys:
+                            self.assertTrue(key in state_dict)
+                        # check load_state_dict restores states
+                        module = getattr(model, module_name)
+                        prev_scale = module.weight_scale
+                        module.weight_scale = None
+                        model.load_state_dict(state_dict)
+                        module = getattr(model, module_name)
+                        self.assertTrue(torch.equal(prev_scale, module.weight_scale))
 
 
             checkWeightQParams(qr)
@@ -648,7 +676,7 @@ def checkSerDeser(model):
             # make sure the qparams are preserved after copy
             checkWeightQParams(qr)
 
-            checkSerDeser(qr)
+            checkSerDeser(qr, is_dynamic)
 
     @skipIfNoFBGEMM
     def test_dynamic_quant_weight_observer(self):
@@ -2941,6 +2969,38 @@ def forward(self, x):
             ]
             self.checkGraphModuleNodes(m, expected_node_list=node_list)
 
+    def test_ref_linear_module(self):
+        """ Make sure the numerics for models with ref linear module
+        matches models with fbgemm/qnnpack module
+        """
+        class M1(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(10, 5)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        class M2(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(10, 5)
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(self.linear(x))
+
+        for M in [M1, M2]:
+            m = M().eval()
+            m = prepare_fx(m, {"": default_qconfig})
+            m_copy = copy.deepcopy(m)
+            m = convert_fx(m, is_reference=False)
+            m_ref = convert_fx(m_copy, is_reference=True)
+            data = torch.randn(5, 10)
+            result = m(data)
+            result_ref = m_ref(data)
+            self.assertTrue(torch.equal(result, result_ref))
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     """Unit tests for individual ops
diff --git a/torch/nn/intrinsic/quantized/_reference/modules/__init__.py b/torch/nn/intrinsic/quantized/_reference/modules/__init__.py
index bf8ff3a3db5e1..33b18d8cf7d3f 100644
--- a/torch/nn/intrinsic/quantized/_reference/modules/__init__.py
+++ b/torch/nn/intrinsic/quantized/_reference/modules/__init__.py
@@ -1,9 +1,7 @@
 import torch
-from .linear_relu import LinearReLU
 from .conv_relu import ConvReLU1d, ConvReLU2d, ConvReLU3d
 
 __all__ = [
-    'LinearReLU',
     'ConvReLU1d',
     'ConvReLU2d',
     'ConvReLU3d',
diff --git a/torch/nn/intrinsic/quantized/_reference/modules/linear_relu.py b/torch/nn/intrinsic/quantized/_reference/modules/linear_relu.py
deleted file mode 100644
index 39c595376fded..0000000000000
--- a/torch/nn/intrinsic/quantized/_reference/modules/linear_relu.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import torch
-import torch.nn.intrinsic as nni
-import torch.nn.quantized._reference as nnqr
-import torch.nn.functional as F
-
-class LinearReLU(nnqr.Linear):
-    _FLOAT_MODULE = nni.LinearReLU
-
-    def __init__(
-            self,
-            in_features,
-            out_features,
-            bias=True,
-            dtype=torch.qint8):
-        super().__init__(in_features, out_features, bias, dtype)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_dequant = x.dequantize()
-        weight_dequant = self._qweight.dequantize()
-        float_result = F.linear(x_dequant, weight_dequant, self._bias)
-        float_result = F.relu(float_result, inplace=True)
-        # NEEDFIX: we don't have dtype in the Linear module APIs right now!
-        result = torch.quantize_per_tensor(
-            float_result, self.scale, self.zero_point, torch.quint8)
-        return result
-
-    def _get_name(self):
-        return "QuantizedLinearReLU(Reference)"
diff --git a/torch/nn/quantized/_reference/modules/linear.py b/torch/nn/quantized/_reference/modules/linear.py
index 276dc0161ded8..1df5499433d1c 100644
--- a/torch/nn/quantized/_reference/modules/linear.py
+++ b/torch/nn/quantized/_reference/modules/linear.py
@@ -1,51 +1,115 @@
 import torch
-import torch.nn.quantized as nnq
+import torch.nn as nn
 import torch.nn.functional as F
-from typing import Optional
+from typing import Optional, Dict, Any
+from .utils import _quantize_and_dequantize_weight
+from .utils import _save_weight_qparams
+from .utils import _get_weight_qparam_keys
 
-class Linear(nnq.Linear):
-    """ A backend independent version of nn.quantized.Linear
-        we will not pack the parameters in this module, since weight packing is an
-        optimization for quantized backends supported in PyTorch (fbgemm/qnnpack),
-        this is useful when user want to use this module in other backends like Glow.
+class Linear(nn.Linear):
+    """ A reference quantized linear module that fits into the FX
+    Graph Mode Quantization workflow
+    activation will be floating point Tensor, we will store floating
+    point weight as well in the module, but in forward we'll quantize
+    and dequantize the weight before running the floating point functional
+    linear operator.
     """
-    def __init__(self, in_features, out_features, bias_=True,
-                 dtype=torch.qint8):
-        super().__init__(in_features, out_features, bias_, dtype)
-        self._qweight, self._bias = self._packed_params._weight_bias()
-        del self._packed_params
+    def __init__(
+            self,
+            in_features: int,
+            out_features: int,
+            bias_: bool = True,
+            device: Optional[torch.device] = None,
+            dtype: Optional[torch.dtype] = None,
+            weight_qparams: Optional[Dict[str, Any]] = None):
+        super().__init__(in_features, out_features, bias_, device, dtype)
+        if weight_qparams is None:
+            weight_qparams = {
+                "qscheme": torch.per_tensor_affine,
+                "dtype": torch.quint8,
+                "scale": 1.0,
+                "zero_point": 0
+            }
+        self.weight_qscheme = weight_qparams["qscheme"]
+        self.weight_dtype = weight_qparams["dtype"]
+        assert self.weight_qscheme in [None, torch.per_tensor_affine, torch.per_channel_affine], \
+            Exception(f"qscheme: {self.weight_qscheme} is not support in reference quantized linear module")
+        if self.weight_qscheme is not None:
+            self.register_buffer(
+                "weight_scale",
+                torch.tensor(weight_qparams["scale"], dtype=torch.float, device=device))
+            self.register_buffer(
+                "weight_zero_point",
+                torch.tensor(
+                    weight_qparams["zero_point"],
+                    dtype=torch.int, device=device))
+            if self.weight_qscheme == torch.per_channel_affine:
+                self.register_buffer(
+                    "weight_axis",
+                    torch.tensor(weight_qparams["axis"], dtype=torch.int, device=device))
+            else:
+                # added for TorchScriptability, not used
+                self.register_buffer(
+                    "weight_axis",
+                    torch.tensor(0, dtype=torch.int, device=device))
 
     def _get_name(self):
         return "QuantizedLinear(Reference)"
 
+    def get_weight(self):
+        """
+        Fake quantize (quantize and dequantize) the weight with
+        the quantization parameters for weight, this is used to
+        simulate the numerics for the quantized weight in a quantized
+        model
+        """
+        # supress mypy warning
+        assert isinstance(self.weight, torch.Tensor)
+        assert isinstance(self.weight_scale, torch.Tensor)
+        assert isinstance(self.weight_zero_point, torch.Tensor)
+        assert isinstance(self.weight_axis, torch.Tensor)
+        return _quantize_and_dequantize_weight(
+            self.weight, self.weight_qscheme, self.weight_dtype, self.weight_scale,
+            self.weight_zero_point, self.weight_axis)
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_dequant = x.dequantize()
-        weight_dequant = self._qweight.dequantize()
-        float_result = F.linear(x_dequant, weight_dequant, self._bias)
-        # NEEDFIX: we don't have dtype in the Linear module APIs right now!
-        result = torch.quantize_per_tensor(
-            float_result, self.scale, self.zero_point, torch.quint8)
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.linear ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.linear --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized linear
+        """
+        weight_dequant = self.get_weight()
+        result = F.linear(x, weight_dequant, self.bias)
         return result
 
     def _save_to_state_dict(self, destination, prefix, keep_vars):
         super()._save_to_state_dict(destination, prefix, keep_vars)
-        destination[prefix + '_qweight'] = self._qweight
-        destination[prefix + '_bias'] = self._bias
+        _save_weight_qparams(
+            destination, prefix, self.weight_qscheme, self.weight_dtype,
+            self.weight_scale, self.weight_zero_point, self.weight_axis)
 
     def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                               missing_keys, unexpected_keys, error_msgs):
-        self._qweight = state_dict[prefix + '_qweight']
-        self._bias = state_dict[prefix + '_bias']
-        state_dict.pop(prefix + '_qweight')
-        state_dict.pop(prefix + '_bias')
+        for key in _get_weight_qparam_keys(state_dict, prefix):
+            setattr(self, key, state_dict[prefix + key])
+            state_dict.pop(prefix + key)
 
         super()._load_from_state_dict(
             state_dict, prefix, local_metadata, False,
             missing_keys, unexpected_keys, error_msgs)
 
-    def _weight_bias(self):
-        return self._qweight, self._bias
-
-    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
-        self._qweight = w
-        self._bias = b
+    @classmethod
+    def from_float(cls, float_linear, weight_qparams):
+        qref_linear = Linear(
+            float_linear.in_features, float_linear.out_features,
+            float_linear.bias is not None, device=float_linear.weight.device,
+            dtype=float_linear.weight.dtype, weight_qparams=weight_qparams)
+        qref_linear.weight = torch.nn.Parameter(float_linear.weight.detach())
+        if float_linear.bias is not None:
+            qref_linear.bias = torch.nn.Parameter(float_linear.bias.detach())
+        return qref_linear
diff --git a/torch/nn/quantized/_reference/modules/utils.py b/torch/nn/quantized/_reference/modules/utils.py
new file mode 100644
index 0000000000000..7c366503dd872
--- /dev/null
+++ b/torch/nn/quantized/_reference/modules/utils.py
@@ -0,0 +1,45 @@
+import torch
+from typing import Dict, Any
+
+def _quantize_and_dequantize_weight(
+        weight: torch.Tensor,
+        weight_qscheme: torch.qscheme,
+        weight_dtype: torch.dtype,
+        weight_scale: torch.Tensor,
+        weight_zero_point: torch.Tensor,
+        weight_axis: torch.Tensor):
+    """ Quantize and then dequantize the weight based on
+    the quantization parameters
+    """
+    if weight_qscheme == torch.per_tensor_affine:
+        weight = torch.quantize_per_tensor(weight, weight_scale, weight_zero_point, weight_dtype)
+        weight_dequant = weight.dequantize()
+    elif weight_qscheme == torch.per_channel_affine:
+        weight = torch.quantize_per_channel(
+            weight, weight_scale,
+            weight_zero_point, weight_axis.item(), weight_dtype)  # type: ignore[arg-type]
+        weight_dequant = weight.dequantize()
+    else:
+        weight_dequant = weight
+    return weight_dequant
+
+def _save_weight_qparams(destination, prefix, weight_qscheme, weight_dtype, weight_scale, weight_zero_point, weight_axis):
+    destination[prefix + "weight_qscheme"] = weight_qscheme
+    destination[prefix + "weight_dtype"] = weight_dtype
+    if weight_qscheme is not None:
+        destination[prefix + "weight_scale"] = weight_scale
+        destination[prefix + "weight_zero_point"] = weight_zero_point
+        if weight_qscheme == torch.per_channel_affine:
+            destination[prefix + "weight_axis"] = weight_axis
+
+def _get_weight_qparam_keys(
+        state_dict: Dict[str, Any],
+        prefix: str):
+    keys = ["weight_qscheme", "weight_dtype"]
+    weight_qscheme = state_dict[prefix + "weight_qscheme"]
+    if weight_qscheme is not None:
+        keys.append("weight_scale")
+        keys.append("weight_zero_point")
+        if weight_qscheme == torch.quantize_per_channel:
+            keys.append("weight_axis")
+    return keys
diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index 6362961ad8daa..e8b873658b504 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -869,6 +869,7 @@ def convert(self,
                 # Get the float linear and attach qscheme and qparams
                 # the the module
                 float_linear = self.linear
+                fused_linear = None
                 if isinstance(float_linear, (torch.nn.qat.Linear, torch.nn.intrinsic.qat.LinearReLU)):
                     float_linear = float_linear.to_float()
                     # change qat linear to linear
@@ -876,10 +877,12 @@ def convert(self,
                     setattr(modules[parent_name], name, float_linear)
                     # Attach weight fake quant to the linear module
                     if isinstance(float_linear, torch.nn.intrinsic.LinearReLU):
+                        fused_linear = float_linear
                         float_linear = float_linear[0]
                     weight_post_process = self.linear.weight_fake_quant
                 else:
                     if isinstance(float_linear, torch.nn.intrinsic.LinearReLU):
+                        fused_linear = float_linear
                         float_linear = self.linear[0]  # type: ignore[index]
                     # Attach the weight observer to the module
                     weight_post_process = qconfig.weight()  # type: ignore[union-attr]
@@ -887,7 +890,21 @@ def convert(self,
                     weight_post_process(float_linear.weight)  # type: ignore[operator]
 
                 weight_qparams = get_qparam_dict(weight_post_process)
-                _to_reference(float_linear, weight_qparams)
+                # TODO: include the configuration in backend_config_dict
+                # we can have a map from module to reference module
+                # and allow user to register new ones
+                qlinear_cls = get_static_quant_module_class(
+                    type(float_linear), is_reference=is_reference)
+                ref_linear = qlinear_cls.from_float(float_linear, weight_qparams)
+
+                # if the parent is a fused linear (Sequential), we can replace the first
+                # item to ref linear, otherwise we can update
+                # the linear instance in the module tree
+                if fused_linear is not None:
+                    fused_linear[0] = ref_linear
+                else:
+                    parent_name, name = _parent_name(self.linear_node.target)
+                    setattr(modules[parent_name], name, ref_linear)
                 op_out = quantized_graph.create_node(
                     'call_module',
                     self.linear_node.target,
diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py
index 775d40bb23efa..03b177805bac3 100644
--- a/torch/quantization/quantization_mappings.py
+++ b/torch/quantization/quantization_mappings.py
@@ -25,16 +25,14 @@
 
 # Default map for swapping float module to reference quantized modules
 DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = {
+    nn.Linear: nnqr.Linear,
     nn.Conv1d: nnqr.Conv1d,
     nn.Conv2d: nnqr.Conv2d,
     nn.Conv3d: nnqr.Conv3d,
-    nn.Linear: nnqr.Linear,
     nni.ConvReLU1d: nniqr.ConvReLU1d,
     nni.ConvReLU2d: nniqr.ConvReLU2d,
     nni.ConvReLU3d: nniqr.ConvReLU3d,
-    nni.LinearReLU: nniqr.LinearReLU,
     # QAT Modules
-    nnqat.Linear: nnqr.Linear,
     nnqat.Conv2d: nnqr.Conv2d,
     nnqat.Conv3d: nnqr.Conv3d,
     nniqat.ConvBn1d: nnqr.Conv1d,
@@ -45,7 +43,6 @@
     nniqat.ConvBnReLU3d: nniqr.ConvReLU3d,
     nniqat.ConvReLU2d: nniqr.ConvReLU2d,
     nniqat.ConvReLU3d: nniqr.ConvReLU3d,
-    nniqat.LinearReLU: nniqr.LinearReLU,
 }
 
 # Default map for swapping float module to quantized ones

From f4496528e38684b4482636998bf4bb63d5dd3140 Mon Sep 17 00:00:00 2001
From: Priya Ramani <priyaramani@fb.com>
Date: Fri, 27 Aug 2021 22:50:20 -0700
Subject: [PATCH 321/530] [Light] Fix error message (#64010)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64010

Fixing typos in a error message

Test Plan:
Error message before fix:
Lite Interpreter verson number does not match. The model version must be between 3 and 5But the model version is 6

Error message after fix:
Lite Interpreter version number does not match. The model version must be between 3 and 5 but the model version is 6

Reviewed By: larryliu0820

Differential Revision: D30568367

fbshipit-source-id: 205f3278ee8dcf38579dbb828580a9e986ccacc1
---
 torch/csrc/jit/mobile/import.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp
index db9f0b8c20cf5..d2865d071ea72 100644
--- a/torch/csrc/jit/mobile/import.cpp
+++ b/torch/csrc/jit/mobile/import.cpp
@@ -317,12 +317,12 @@ void BytecodeDeserializer::parseMethods(
       caffe2::serialize::kMinSupportedBytecodeVersion <= model_version &&
           // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
           model_version <= caffe2::serialize::kMaxSupportedBytecodeVersion,
-      "Lite Interpreter verson number does not match. ",
+      "Lite Interpreter version number does not match. ",
       "The model version must be between ",
       caffe2::serialize::kMinSupportedBytecodeVersion,
       " and ",
       caffe2::serialize::kMaxSupportedBytecodeVersion,
-      "But the model version is ",
+      " but the model version is ",
       model_version);
 
   bool has_debug_handles = debug_handles.has_value();

From d0c63e857d12f3ddc04a80defb2530694b4f263d Mon Sep 17 00:00:00 2001
From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" <tmanlaibaatar@fb.com>
Date: Sat, 28 Aug 2021 11:44:58 -0700
Subject: [PATCH 322/530] Enhancement for smart serialization for out schemas
 (#63096)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63096

Test Plan: Imported from OSS

Reviewed By: gmagogsfm

Differential Revision: D30415255

Pulled By: tugsbayasgalan

fbshipit-source-id: eb40440a3b46258394d035479f5fc4a4baa12bcc
---
 test/cpp/jit/test_interpreter.cpp             |  9 ++++
 test/cpp/jit/test_utils.cpp                   | 15 ++++++
 test/cpp/jit/test_utils.h                     |  1 +
 test/jit/test_ignorable_args.py               |  7 +++
 .../jit/runtime/calculate_necessary_args.h    | 43 ++++++++++++----
 .../csrc/jit/runtime/interpreter/code_impl.h  | 25 +++++++++-
 torch/csrc/jit/serialization/python_print.cpp | 50 ++++++++++++++-----
 7 files changed, 126 insertions(+), 24 deletions(-)

diff --git a/test/cpp/jit/test_interpreter.cpp b/test/cpp/jit/test_interpreter.cpp
index a2418918336c5..bfdc1f3a0cb7e 100644
--- a/test/cpp/jit/test_interpreter.cpp
+++ b/test/cpp/jit/test_interpreter.cpp
@@ -175,6 +175,15 @@ TEST(InterpreterTest, IgnorableArgsInSchema) {
   ASSERT_TRUE(op_to_specified_args_non_const["aten::conv2d"] == 6);
 }
 
+TEST(InterpreterTest, IgnorableArgsInSchemaWithOut) {
+  auto graph = build_mobile_export_with_out();
+  MobileCode function(graph, "");
+  auto op_to_specified_args = function.op_to_num_specified_args();
+  ASSERT_TRUE(op_to_specified_args.size() == 1);
+  // this should be 3 when the add_out flag is set to True
+  ASSERT_TRUE(op_to_specified_args["aten::add.out"] == 4);
+}
+
 TEST(InterpreterTest, runAsyncBasicTest) {
   /*
   TODO: there are some problem with C++ parsing script program involving
diff --git a/test/cpp/jit/test_utils.cpp b/test/cpp/jit/test_utils.cpp
index 27667f068588b..f2fb9e1fb0606 100644
--- a/test/cpp/jit/test_utils.cpp
+++ b/test/cpp/jit/test_utils.cpp
@@ -123,6 +123,21 @@ std::shared_ptr<Graph> build_mobile_export_analysis_graph() {
   return g;
 }
 
+std::shared_ptr<Graph> build_mobile_export_with_out() {
+  const auto graph_string = R"IR(
+    graph(%x.1 : Tensor,
+          %y.1 : Tensor):
+      %8 : NoneType = prim::Constant()
+      %6 : int = prim::Constant[value=1]()
+      %7 : Tensor = aten::add(%x.1, %y.1, %6, %y.1)
+      return (%8))IR";
+
+  auto g = std::make_shared<Graph>();
+  torch::jit::parseIR(graph_string, g.get());
+  g->lint();
+  return g;
+}
+
 std::shared_ptr<Graph> build_mobile_export_analysis_graph_nested() {
   // this is pretty much same test as build_mobile_export_analysis_graph(),
   // but some aten::slice operators are hidden under block statement to check
diff --git a/test/cpp/jit/test_utils.h b/test/cpp/jit/test_utils.h
index 5e640aed0114a..1a1e1b82b10e8 100644
--- a/test/cpp/jit/test_utils.h
+++ b/test/cpp/jit/test_utils.h
@@ -74,6 +74,7 @@ std::pair<tensor_list, tensor_list> runGradient(
 
 std::shared_ptr<Graph> build_lstm();
 std::shared_ptr<Graph> build_mobile_export_analysis_graph();
+std::shared_ptr<Graph> build_mobile_export_with_out();
 std::shared_ptr<Graph> build_mobile_export_analysis_graph_with_vararg();
 std::shared_ptr<Graph> build_mobile_export_analysis_graph_nested();
 std::shared_ptr<Graph> build_mobile_export_analysis_graph_non_const();
diff --git a/test/jit/test_ignorable_args.py b/test/jit/test_ignorable_args.py
index b195e3cc4faaa..fb63c1973bf0e 100644
--- a/test/jit/test_ignorable_args.py
+++ b/test/jit/test_ignorable_args.py
@@ -1,5 +1,6 @@
 import os
 import sys
+import torch
 from torch._C import parse_ir
 from torch.testing import FileCheck
 
@@ -43,3 +44,9 @@ def test_slice_ignorable_args_for_slice(self):
         # because in %16, %15 and %0 are default values for the schema.
         FileCheck().check("torch.slice(torch.slice(torch.tensor(_0), 0, 2), 1, None, 1)").run(src)
         self.assertEqual(function(), function_copy())
+
+    def test_add_out_ignorable_args(self):
+        @torch.jit.script
+        def fn(x: torch.Tensor, y: torch.Tensor):
+            torch.add(x, y, out=y)
+        FileCheck().check("torch.add(x, y, out=y)").run(fn.code)
diff --git a/torch/csrc/jit/runtime/calculate_necessary_args.h b/torch/csrc/jit/runtime/calculate_necessary_args.h
index 5f37660ee14a8..07df670b01040 100644
--- a/torch/csrc/jit/runtime/calculate_necessary_args.h
+++ b/torch/csrc/jit/runtime/calculate_necessary_args.h
@@ -7,18 +7,42 @@
 namespace torch {
 namespace jit {
 
-inline size_t CalculateNecessaryArgs(
+inline std::pair<size_t, size_t> CalculateNecessaryArgs(
     const std::vector<Argument>& schema_args,
-    at::ArrayRef<Value*> actual_inputs) {
+    at::ArrayRef<Value*> actual_inputs,
+    bool allow_trailing_out_args) {
+  if (schema_args.size() == 0) {
+    return std::make_pair(0, 0);
+  }
+
+  // count number of out arguments
+  auto schema_idx = schema_args.size() - 1;
+  if (allow_trailing_out_args) {
+    // skip over out arguments in the end.
+    while (schema_idx >= 0) {
+      auto current_arg = schema_args.at(schema_idx);
+      if (!current_arg.is_out()) {
+        break;
+      }
+      schema_idx--;
+    }
+  }
+
+  auto num_out = schema_args.size() - schema_idx - 1;
+
   if (schema_args.size() < actual_inputs.size()) {
-    return actual_inputs.size();
+    return std::make_pair(actual_inputs.size(), num_out);
+  }
+
+  // if it is the default args, we reset the index to the last element
+  if (!allow_trailing_out_args) {
+    schema_idx = schema_args.size() - 1;
   }
   // keeps track of trailing unnecessary args
-  int schema_size = schema_args.size();
-  for (int schema_idx = schema_size - 1; schema_idx > -1; schema_idx--) {
+  while (schema_idx >= 0) {
     // this means it is not default argument, so it is necessary
     if (!schema_args.at(schema_idx).default_value().has_value()) {
-      return schema_idx + 1;
+      return std::make_pair(schema_idx + 1, num_out);
     } else {
       auto schema_value =
           schema_args.at(schema_idx).default_value().value().toIValue();
@@ -27,16 +51,17 @@ inline size_t CalculateNecessaryArgs(
       // well.
       auto actual_value = toIValue(actual_inputs[schema_idx]);
       if (!actual_value.has_value()) {
-        return schema_idx + 1;
+        return std::make_pair(schema_idx + 1, num_out);
       }
       // if the IR has same value as default value of the schema,
       // it is not neccessary argument.
       if (schema_value != actual_value.value()) {
-        return schema_idx + 1;
+        return std::make_pair(schema_idx + 1, num_out);
       }
     }
+    schema_idx--;
   }
-  return 0;
+  return std::make_pair(0, num_out);
 }
 
 } // namespace jit
diff --git a/torch/csrc/jit/runtime/interpreter/code_impl.h b/torch/csrc/jit/runtime/interpreter/code_impl.h
index 00648de905767..682c695138674 100644
--- a/torch/csrc/jit/runtime/interpreter/code_impl.h
+++ b/torch/csrc/jit/runtime/interpreter/code_impl.h
@@ -105,6 +105,8 @@ struct CodeImpl {
   // This is because for all usages, at most 3 args are used.
   std::unordered_map<std::string, size_t> op_to_num_specified_args_;
 
+  std::unordered_map<std::string, size_t> op_to_num_out_args_;
+
   // running count of uses as we emit. When we reach use_count_[v] =
   // v.uses().size() we know it is the final use and we can move rather than
   // load.
@@ -292,6 +294,12 @@ struct CodeImpl {
     }
   }
 
+  void emitLoadInputs(at::ArrayRef<Value*> inputs, size_t start, size_t end) {
+    for (size_t i = start; i < end; i++) {
+      emitUse(inputs[i], false);
+    }
+  }
+
   virtual void emitOperator(Node* node) {
     emitLoadInputs(node->inputs());
     const Operator& op = node->getOperator();
@@ -737,13 +745,19 @@ struct MobileCodeImpl : CodeImpl {
         auto op_schema = node->getOperator().schema();
         // skip if schema has vararg
         if (!op_schema.is_vararg()) {
-          auto numInclude =
-              CalculateNecessaryArgs(op_schema.arguments(), node->inputs());
+          auto specifiedArgs = CalculateNecessaryArgs(
+              op_schema.arguments(), node->inputs(), false);
+          // preserving the old behavior
+          auto numInclude = specifiedArgs.first;
+          // TODO uncomment this
+          // auto numInclude = specifiedArgs.first + specifiedArgs.second;
           auto unique_name = op_schema.overload_name() != ""
               ? op_schema.name() + "." + op_schema.overload_name()
               : op_schema.name();
           auto it = op_to_num_specified_args_.insert(
               std::pair<std::string, size_t>(unique_name, 0));
+          op_to_num_out_args_.insert(std::pair<std::string, size_t>(
+              unique_name, specifiedArgs.second));
           auto prev_value = it.first->second;
           it.first->second = std::max(numInclude, prev_value);
         }
@@ -769,6 +783,13 @@ struct MobileCodeImpl : CodeImpl {
           num_include = it->second;
         }
         emitLoadInputs(node->inputs(), num_include);
+        // TODO: uncomment this
+        // auto num_out = op_to_num_out_args_.find(unique_op_name)->second;
+        // auto num_specified_before_out = num_include - num_out;
+        // emitLoadInputs(node->inputs(), 0, num_specified_before_out);
+        // emitLoadInputs(node->inputs(), node->inputs().size() - num_out,
+        // node->inputs().size());
+
         insertInstruction(OP, operator_table_.size());
       }
       operator_table_.emplace_back(op.getOperation(node));
diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp
index 1ab968967392f..80123c625ea65 100644
--- a/torch/csrc/jit/serialization/python_print.cpp
+++ b/torch/csrc/jit/serialization/python_print.cpp
@@ -1162,23 +1162,47 @@ struct PythonPrintImpl {
         // calculate how many args are specified.
         // see (https://github.com/pytorch/pytorch/pull/56079) for more
         // details.
-        size_t necessary_args =
-            CalculateNecessaryArgs(schema.arguments(), node->inputs());
-        for (const auto i : c10::irange(necessary_args)) {
-          if (i > 0)
+        size_t num_schema_args = schema.arguments().size();
+
+        // we only want to do this extra logic only when necessary.
+        if (num_schema_args > 0) {
+          // calculate how many args are specified.
+          // see (https://github.com/pytorch/pytorch/pull/56079) for more
+          // details.
+          auto specified_args =
+              CalculateNecessaryArgs(schema.arguments(), node->inputs(), true);
+
+          auto num_necessary = specified_args.first;
+          auto num_out = specified_args.second;
+
+          for (size_t i = 0; i < num_necessary; ++i) {
+            if (i > 0)
+              stmt << ", ";
+            auto v = useOf(node->inputs().at(i));
+            // print the kwarg name if it is a kwarg only argument.
+            if (i < num_schema_args) {
+              auto arg = schema.arguments().at(i);
+              if (arg.kwarg_only()) {
+                stmt << arg.name() << "=";
+              }
+            } else {
+              // vararg functions like format can have extra arguments
+              AT_ASSERT(schema.is_vararg());
+            }
+            stmt << *v;
+          }
+
+          // print out args
+          for (size_t i = num_schema_args - num_out; i < num_schema_args; i++) {
             stmt << ", ";
-          auto v = useOf(node->inputs().at(i));
-          // print the kwarg name if it is a kwarg only argument.
-          if (i < schema.arguments().size()) {
             auto arg = schema.arguments().at(i);
-            if (arg.kwarg_only()) {
-              stmt << arg.name() << "=";
+            TORCH_INTERNAL_ASSERT(arg.is_out());
+            // figure out the corresponding input at this index
+            auto input_idx = node->inputs().size() - (num_schema_args - i);
+            if (input_idx < node->inputs().size()) {
+              stmt << arg.name() << "=" << *useOf(node->inputs().at(input_idx));
             }
-          } else {
-            // vararg functions like format can have extra arguments
-            AT_ASSERT(schema.is_vararg());
           }
-          stmt << *v;
         }
         stmt << ")";
       } break;

From 223f886032978487099da4f54e86e9e0549cde0c Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Sat, 28 Aug 2021 11:46:40 -0700
Subject: [PATCH 323/530] Move Parallel[Native|TBB] to GHA (#64123)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64123

Reviewed By: driazati

Differential Revision: D30620966

Pulled By: malfet

fbshipit-source-id: 9a23e4b3e16870f77bf18df4370cd468603d592d
---
 .circleci/cimodel/data/pytorch_build_data.py  |   2 -
 .circleci/config.yml                          |  90 ----
 .github/generated-ciflow-ruleset.json         |   6 +
 .github/scripts/generate_ci_workflows.py      |  38 +-
 ...rallelnative-linux-xenial-py3.6-gcc5.4.yml | 430 ++++++++++++++++++
 ...-paralleltbb-linux-xenial-py3.6-gcc5.4.yml | 430 ++++++++++++++++++
 6 files changed, 892 insertions(+), 104 deletions(-)
 create mode 100644 .github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
 create mode 100644 .github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml

diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py
index 5a85674d74fe9..156494589831b 100644
--- a/.circleci/cimodel/data/pytorch_build_data.py
+++ b/.circleci/cimodel/data/pytorch_build_data.py
@@ -7,8 +7,6 @@
             ("5.4", [  # All this subtree rebases to master and then build
                 ("3.6", [
                     ("important", [X(True)]),
-                    ("parallel_tbb", [X(True)]),
-                    ("parallel_native", [X(True)]),
                     ("pure_torch", [X(True)]),
                 ]),
             ]),
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 1bb32b5cc0a3d..8df67e6fe2bc8 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7166,70 +7166,6 @@ workflows:
           build_environment: "pytorch-linux-pytorch_linux_xenial_py3_6_gcc5_4_distributed-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
           resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
-          requires:
-            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
-          filters:
-            branches:
-              only:
-                - master
-                - /ci-all\/.*/
-                - /release\/.*/
-          build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-      - pytorch_linux_test:
-          name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_test
-          requires:
-            - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
-          filters:
-            branches:
-              only:
-                - master
-                - /ci-all\/.*/
-                - /release\/.*/
-          build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-          resource_class: large
-      - pytorch_linux_test:
-          name: pytorch_linux_pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed_test
-          requires:
-            - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
-          build_environment: "pytorch-linux-pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-          resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
-          requires:
-            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
-          filters:
-            branches:
-              only:
-                - master
-                - /ci-all\/.*/
-                - /release\/.*/
-          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-      - pytorch_linux_test:
-          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test
-          requires:
-            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
-          filters:
-            branches:
-              only:
-                - master
-                - /ci-all\/.*/
-                - /release\/.*/
-          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-          resource_class: large
-      - pytorch_linux_test:
-          name: pytorch_linux_pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed_test
-          requires:
-            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
-          build_environment: "pytorch-linux-pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-          resource_class: large
       - pytorch_linux_build:
           name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build
           requires:
@@ -9386,32 +9322,6 @@ workflows:
       - docker_build_job:
           name: "docker-pytorch-linux-xenial-py3.6-gcc7"
           image_name: "pytorch-linux-xenial-py3.6-gcc7"
-      - pytorch_linux_build:
-          name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
-          requires:
-            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
-          build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-      - pytorch_linux_test:
-          name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_test
-          requires:
-            - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
-          build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-          resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
-          requires:
-            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
-          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-      - pytorch_linux_test:
-          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test
-          requires:
-            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
-          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-          resource_class: large
       - pytorch_linux_build:
           name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build
           requires:
diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json
index d13561190d01f..0fb27af006c85 100644
--- a/.github/generated-ciflow-ruleset.json
+++ b/.github/generated-ciflow-ruleset.json
@@ -10,6 +10,8 @@
       "linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
+      "parallelnative-linux-xenial-py3.6-gcc5.4",
+      "paralleltbb-linux-xenial-py3.6-gcc5.4",
       "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
       "periodic-linux-xenial-cuda11.1-py3.6-gcc7",
       "periodic-win-vs2019-cuda11.1-py3",
@@ -27,6 +29,8 @@
       "linux-bionic-py3.8-gcc9-coverage",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
+      "parallelnative-linux-xenial-py3.6-gcc5.4",
+      "paralleltbb-linux-xenial-py3.6-gcc5.4",
       "win-vs2019-cpu-py3"
     ],
     "ciflow/cuda": [
@@ -63,6 +67,8 @@
       "linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
+      "parallelnative-linux-xenial-py3.6-gcc5.4",
+      "paralleltbb-linux-xenial-py3.6-gcc5.4",
       "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
       "periodic-linux-xenial-cuda11.1-py3.6-gcc7"
     ],
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index f1819dbac589d..dd115405e03ea 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -272,18 +272,32 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
             labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}
         ),
     ),
-    # CIWorkflow(
-    #     arch="linux",
-    #     build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4",
-    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
-    #     test_runner_type=LINUX_CPU_TEST_RUNNER,
-    # ),
-    # CIWorkflow(
-    #     arch="linux",
-    #     build_environment="parallelnative-linux-xenial-py3.6-gcc5.4",
-    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
-    #     test_runner_type=LINUX_CPU_TEST_RUNNER,
-    # ),
+    CIWorkflow(
+        arch="linux",
+        build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
+        test_runner_type=LINUX_CPU_TEST_RUNNER,
+        # This is a master only job despit on_pull_request is set to True
+        on_pull_request=True,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            trigger_action_only=True,
+            labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU},
+        ),
+    ),
+    CIWorkflow(
+        arch="linux",
+        build_environment="parallelnative-linux-xenial-py3.6-gcc5.4",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
+        test_runner_type=LINUX_CPU_TEST_RUNNER,
+        # This is a master only job despit on_pull_request is set to True
+        on_pull_request=True,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            trigger_action_only=True,
+            labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU},
+        ),
+    ),
     # CIWorkflow(
     #     arch="linux",
     #     build_environment="pure_torch-linux-xenial-py3.6-gcc5.4",
diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
new file mode 100644
index 0000000000000..402ce38129052
--- /dev/null
+++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
@@ -0,0 +1,430 @@
+# @generated DO NOT EDIT MANUALLY
+
+# Template is at:    .github/templates/linux_ci_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: parallelnative-linux-xenial-py3.6-gcc5.4
+
+on:
+  pull_request:
+    types: [unassigned]
+  push:
+    branches:
+      - master
+      - release/*
+  workflow_dispatch:
+
+env:
+  BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.6-gcc5.4
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
+  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  TORCH_CUDA_ARCH_LIST: 5.2
+  IN_CI: 1
+  # This is used for the phase of adding wheel tests only, will be removed once completed
+  IN_WHEEL_TEST: 1
+  # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
+  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+
+concurrency:
+  group: parallelnative-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
+  calculate-docker-image:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: linux.2xlarge
+    needs: [ciflow_should_run]
+    env:
+      DOCKER_BUILDKIT: 1
+    timeout-minutes: 90
+    outputs:
+      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
+    steps:
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
+      - name: Calculate docker image tag
+        id: calculate-tag
+        run: |
+          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+          echo "::set-output name=docker_tag::${DOCKER_TAG}"
+          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
+      - name: Check if image should be built
+        id: check
+        env:
+          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
+          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
+        run: |
+          set -x
+          # Check if image already exists, if it does then skip building it
+          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
+            exit 0
+          fi
+          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
+            # if we're on the base branch then use the parent commit
+            MERGE_BASE=$(git rev-parse HEAD~)
+          else
+            # otherwise we're on a PR, so use the most recent base commit
+            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
+          fi
+          # Covers the case where a previous tag doesn't exist for the tree
+          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
+          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
+            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
+            exit 1
+          fi
+          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
+          # If no image exists but the hash is the same as the previous hash then we should error out here
+          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
+            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
+            echo "       contact the PyTorch team to restore the original images"
+            exit 1
+          fi
+          echo ::set-output name=rebuild::yes
+      - name: Build and push docker image
+        if: ${{ steps.check.outputs.rebuild }}
+        env:
+          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
+          DOCKER_SKIP_S3_UPLOAD: 1
+        run: |
+          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
+          cd .circleci/docker && ./build_docker.sh
+
+  build:
+    runs-on: linux.2xlarge
+    needs: [calculate-docker-image, ciflow_should_run]
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-build
+    steps:
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Build PyTorch
+        run: |
+          docker run \
+            -e BUILD_ENVIRONMENT \
+            -e JOB_BASE_NAME \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e SKIP_SCCACHE_INITIALIZATION=1 \
+            -e TORCH_CUDA_ARCH_LIST \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload binary build size statistics (Click Me)
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
+          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
+          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
+          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        run: |
+          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
+          export COMMIT_TIME
+          pip3 install requests
+          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Archive artifacts into zip
+        run: |
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
+      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+        name: Store PyTorch Build Artifacts on S3
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            artifacts.zip
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Prune all of the docker images
+          docker system prune -af
+
+  generate-test-matrix:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: ubuntu-18.04
+    needs: [ciflow_should_run]
+    env:
+      TEST_RUNNER_TYPE: linux.2xlarge
+      ENABLE_DISTRIBUTED_TEST: 1
+      ENABLE_JIT_LEGACY_TEST: ''
+      ENABLE_MULTIGPU_TEST: ''
+      ENABLE_NOGPU_NO_AVX_TEST: ''
+      ENABLE_NOGPU_NO_AVX2_TEST: ''
+      ENABLE_SLOW_TEST: ''
+      NUM_TEST_SHARDS: 1
+      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
+      NOGPU_RUNNER_TYPE: linux.2xlarge
+      PR_BODY: ${{ github.event.pull_request.body }}
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
+      ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
+    container:
+      image: python:3.9
+    steps:
+      - name: Install dependencies
+        run: pip install typing-extensions
+      - name: Clone pytorch/pytorch
+        uses: actions/checkout@v2
+      - name: Generating test matrix
+        id: set-matrix
+        run: .github/scripts/generate_pytorch_test_matrix.py
+
+  test:
+    needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run]
+    strategy:
+      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test
+      TEST_CONFIG: ${{ matrix.config }}
+      SHARD_NUMBER: ${{ matrix.shard }}
+      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+      PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
+    steps:
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
+        run: |
+          bash .github/scripts/install_nvidia_utils_linux.sh
+          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          sudo df -H
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Test PyTorch
+        env:
+          BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.6-gcc5.4-${{ matrix.config }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        run: |
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          if [[ $NUM_TEST_SHARDS -ne 2 ]]; then
+            export SHARD_NUMBER=0
+          fi
+          # TODO: Stop building test binaries as part of the build phase
+          # Used for GPU_FLAG since that doesn't play nice
+          # shellcheck disable=SC2086
+          docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e CONTINUE_THROUGH_ERROR \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml'
+      - uses: actions/upload-artifact@v2
+        name: Store PyTorch Test Reports
+        if: always()
+        with:
+          name: test-reports-${{ matrix.config }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+        name: Store PyTorch Test Reports on S3
+        if: always()
+        with:
+          name: test-reports-${{ matrix.config }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload test statistics (Click Me)
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test
+          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
+          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
+          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml
new file mode 100644
index 0000000000000..59eceb58ea230
--- /dev/null
+++ b/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml
@@ -0,0 +1,430 @@
+# @generated DO NOT EDIT MANUALLY
+
+# Template is at:    .github/templates/linux_ci_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: paralleltbb-linux-xenial-py3.6-gcc5.4
+
+on:
+  pull_request:
+    types: [unassigned]
+  push:
+    branches:
+      - master
+      - release/*
+  workflow_dispatch:
+
+env:
+  BUILD_ENVIRONMENT: paralleltbb-linux-xenial-py3.6-gcc5.4
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
+  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  TORCH_CUDA_ARCH_LIST: 5.2
+  IN_CI: 1
+  # This is used for the phase of adding wheel tests only, will be removed once completed
+  IN_WHEEL_TEST: 1
+  # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
+  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+
+concurrency:
+  group: paralleltbb-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
+  calculate-docker-image:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: linux.2xlarge
+    needs: [ciflow_should_run]
+    env:
+      DOCKER_BUILDKIT: 1
+    timeout-minutes: 90
+    outputs:
+      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
+    steps:
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
+      - name: Calculate docker image tag
+        id: calculate-tag
+        run: |
+          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+          echo "::set-output name=docker_tag::${DOCKER_TAG}"
+          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
+      - name: Check if image should be built
+        id: check
+        env:
+          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
+          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
+        run: |
+          set -x
+          # Check if image already exists, if it does then skip building it
+          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
+            exit 0
+          fi
+          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
+            # if we're on the base branch then use the parent commit
+            MERGE_BASE=$(git rev-parse HEAD~)
+          else
+            # otherwise we're on a PR, so use the most recent base commit
+            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
+          fi
+          # Covers the case where a previous tag doesn't exist for the tree
+          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
+          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
+            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
+            exit 1
+          fi
+          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
+          # If no image exists but the hash is the same as the previous hash then we should error out here
+          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
+            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
+            echo "       contact the PyTorch team to restore the original images"
+            exit 1
+          fi
+          echo ::set-output name=rebuild::yes
+      - name: Build and push docker image
+        if: ${{ steps.check.outputs.rebuild }}
+        env:
+          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
+          DOCKER_SKIP_S3_UPLOAD: 1
+        run: |
+          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
+          cd .circleci/docker && ./build_docker.sh
+
+  build:
+    runs-on: linux.2xlarge
+    needs: [calculate-docker-image, ciflow_should_run]
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-build
+    steps:
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Build PyTorch
+        run: |
+          docker run \
+            -e BUILD_ENVIRONMENT \
+            -e JOB_BASE_NAME \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e SKIP_SCCACHE_INITIALIZATION=1 \
+            -e TORCH_CUDA_ARCH_LIST \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload binary build size statistics (Click Me)
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
+          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
+          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
+          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        run: |
+          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
+          export COMMIT_TIME
+          pip3 install requests
+          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Archive artifacts into zip
+        run: |
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
+      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+        name: Store PyTorch Build Artifacts on S3
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            artifacts.zip
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Prune all of the docker images
+          docker system prune -af
+
+  generate-test-matrix:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: ubuntu-18.04
+    needs: [ciflow_should_run]
+    env:
+      TEST_RUNNER_TYPE: linux.2xlarge
+      ENABLE_DISTRIBUTED_TEST: 1
+      ENABLE_JIT_LEGACY_TEST: ''
+      ENABLE_MULTIGPU_TEST: ''
+      ENABLE_NOGPU_NO_AVX_TEST: ''
+      ENABLE_NOGPU_NO_AVX2_TEST: ''
+      ENABLE_SLOW_TEST: ''
+      NUM_TEST_SHARDS: 1
+      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
+      NOGPU_RUNNER_TYPE: linux.2xlarge
+      PR_BODY: ${{ github.event.pull_request.body }}
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
+      ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
+    container:
+      image: python:3.9
+    steps:
+      - name: Install dependencies
+        run: pip install typing-extensions
+      - name: Clone pytorch/pytorch
+        uses: actions/checkout@v2
+      - name: Generating test matrix
+        id: set-matrix
+        run: .github/scripts/generate_pytorch_test_matrix.py
+
+  test:
+    needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run]
+    strategy:
+      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-test
+      TEST_CONFIG: ${{ matrix.config }}
+      SHARD_NUMBER: ${{ matrix.shard }}
+      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+      PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
+    steps:
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          submodules: recursive
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          .github/scripts/display_ec2_information.sh
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
+        run: |
+          bash .github/scripts/install_nvidia_utils_linux.sh
+          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          sudo df -H
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Test PyTorch
+        env:
+          BUILD_ENVIRONMENT: paralleltbb-linux-xenial-py3.6-gcc5.4-${{ matrix.config }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        run: |
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          if [[ $NUM_TEST_SHARDS -ne 2 ]]; then
+            export SHARD_NUMBER=0
+          fi
+          # TODO: Stop building test binaries as part of the build phase
+          # Used for GPU_FLAG since that doesn't play nice
+          # shellcheck disable=SC2086
+          docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e CONTINUE_THROUGH_ERROR \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml'
+      - uses: actions/upload-artifact@v2
+        name: Store PyTorch Test Reports
+        if: always()
+        with:
+          name: test-reports-${{ matrix.config }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+        name: Store PyTorch Test Reports on S3
+        if: always()
+        with:
+          name: test-reports-${{ matrix.config }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload test statistics (Click Me)
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-test
+          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
+          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
+          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # Prune all of the docker images
+          docker system prune -af

From 8b6266fe4f2986f4707bcd884e16d50728191214 Mon Sep 17 00:00:00 2001
From: Facebook Community Bot <facebook-github-bot@users.noreply.github.com>
Date: Sat, 28 Aug 2021 11:50:49 -0700
Subject: [PATCH 324/530] Automated submodule update: FBGEMM (#64129)

Summary:
This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).

New submodule commit: https://github.com/pytorch/FBGEMM/commit/f14e79481460a7c0dedf452a258072231cb343e6

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64129

Test Plan: Ensure that CI jobs succeed on GitHub before landing.

Reviewed By: jspark1105

Differential Revision: D30621549

fbshipit-source-id: 34c109e75c96a261bf370f7a06dbb8b9004860ab
---
 third_party/fbgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/fbgemm b/third_party/fbgemm
index d4902e94367b9..e922280540acf 160000
--- a/third_party/fbgemm
+++ b/third_party/fbgemm
@@ -1 +1 @@
-Subproject commit d4902e94367b9f074cadd29d7dc5ef6b0c69c6c1
+Subproject commit e922280540acf7920ea21f99b8db064f89bb8c11

From aefa2f3e643ab50fe4d8238ccdeb980143b6b454 Mon Sep 17 00:00:00 2001
From: Ilqar Ramazanli <iramazanli@fb.com>
Date: Sat, 28 Aug 2021 15:54:53 -0700
Subject: [PATCH 325/530] To add RMSProp algorithm documentation (#63721)

Summary:
It has been discussed before that adding description of Optimization algorithms to PyTorch Core documentation may result in a nice Optimization research tutorial. In the following tracking issue we mentioned about all the necessary algorithms and links to the originally published paper  https://github.com/pytorch/pytorch/issues/63236.

In this PR we are adding description of RMSProp to the documentation.  For more details, we refer to the paper   https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

<img width="464" alt="RMSProp" src="https://user-images.githubusercontent.com/73658284/131179226-3fb6fe5a-5301-4948-afbe-f38bf57f24ff.png">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63721

Reviewed By: albanD

Differential Revision: D30612426

Pulled By: iramazanli

fbshipit-source-id: c3ac630a9658d1282866b53c86023ac10cf95398
---
 torch/optim/rmsprop.py | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py
index 4aab0b3116fdb..dc72181b351f8 100644
--- a/torch/optim/rmsprop.py
+++ b/torch/optim/rmsprop.py
@@ -6,15 +6,44 @@
 class RMSprop(Optimizer):
     r"""Implements RMSprop algorithm.
 
-    Proposed by G. Hinton in his
-    `course <https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.
-
-    The centered version first appears in `Generating Sequences
+    .. math::
+       \begin{aligned}
+            &\rule{110mm}{0.4pt}                                                                 \\
+            &\textbf{input}      : \alpha \text{ (alpha)},\: \gamma \text{ (lr)},
+                \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}                   \\
+            &\hspace{13mm}   \lambda \text{ (weight decay)},\: \mu \text{ (momentum)},\: centered\\
+            &\textbf{initialize} : v_0 \leftarrow 0 \text{ (square average)}, \:
+                \textbf{b}_0 \leftarrow 0 \text{ (buffer)}, \: g^{ave}_0 \leftarrow 0     \\[-1.ex]
+            &\rule{110mm}{0.4pt}                                                                 \\
+            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
+            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
+            &\hspace{5mm}if \: \lambda \neq 0                                                    \\
+            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
+            &\hspace{5mm}v_t           \leftarrow   \alpha v_{t-1} + (1 - \alpha) g^2_t
+                \hspace{8mm}                                                                     \\
+            &\hspace{5mm} \tilde{v_t} \leftarrow v_t                                             \\
+            &\hspace{5mm}if \: centered                                                          \\
+            &\hspace{10mm} g^{ave}_t \leftarrow g^{ave}_{t-1} \alpha + (1-\alpha) g_t            \\
+            &\hspace{10mm} \tilde{v_t} \leftarrow \tilde{v_t} -  \big(g^{ave}_{t} \big)^2        \\
+            &\hspace{5mm}if \: \mu > 0                                                           \\
+            &\hspace{10mm} \textbf{b}_t\leftarrow \mu \textbf{b}_{t-1} +
+                g_t/ \big(\sqrt{\tilde{v_t}} +  \epsilon \big)                                   \\
+            &\hspace{10mm} \theta_t \leftarrow \theta_{t-1} - \gamma \textbf{b}_t                \\
+            &\hspace{5mm} else                                                                   \\
+            &\hspace{10mm}\theta_t      \leftarrow   \theta_{t-1} -
+                \gamma  g_t/ \big(\sqrt{\tilde{v_t}} + \epsilon \big)  \hspace{3mm}              \\
+            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
+            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
+            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
+       \end{aligned}
+
+    For further details regarding the algorithm we refer to
+    `lecture notes <https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_ by G. Hinton.
+    and centered version `Generating Sequences
     With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_.
-
     The implementation here takes the square root of the gradient average before
     adding epsilon (note that TensorFlow interchanges these two operations). The effective
-    learning rate is thus :math:`\alpha/(\sqrt{v} + \epsilon)` where :math:`\alpha`
+    learning rate is thus :math:`\gamma/(\sqrt{v} + \epsilon)` where :math:`\gamma`
     is the scheduled learning rate and :math:`v` is the weighted moving average
     of the squared gradient.
 

From 4f969db325a7a70878bd3eae5bbb3fecd598d4ca Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Sat, 28 Aug 2021 19:18:10 -0700
Subject: [PATCH 326/530] [nnc] Fix batchnorm implementation (#64112)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64112

Fixes #64062

Test Plan: Imported from OSS

Reviewed By: zhxchen17

Differential Revision: D30622897

Pulled By: bertmaher

fbshipit-source-id: 7d7c6131aa786e61fa1d0a517288396a0bdb1d22
---
 test/test_jit_fuser_te.py                    | 25 ++++++++++++++++++++
 torch/csrc/jit/tensorexpr/operators/norm.cpp | 19 +++++----------
 2 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index 014f142cf1443..6d2432aa151f8 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -1912,6 +1912,31 @@ def eager(x):
             x = torch.ones((8, 1))
             torch.testing.assert_close(eager(x), script(x))
 
+    def test_batch_norm(self):
+        def test(fn, args):
+            trace = torch.jit.trace(fn, args)
+            self.assertAllFused(trace.graph_for(*args))
+            torch.testing.assert_allclose(fn(*args), trace(*args))
+
+        def bn(i, x):
+            return torch.batch_norm(i, x, x, x, x, False, 0.1, 1e-4, False).relu()
+
+        def bn_no_weight(i, x):
+            return torch.batch_norm(i, None, x, x, x, False, 0.1, 1e-4, False).relu()
+
+        def bn_no_bias(i, x):
+            return torch.batch_norm(i, x, None, x, x, False, 0.1, 1e-4, False).relu()
+
+        def bn_neither(i, x):
+            return torch.batch_norm(i, None, None, x, x, False, 0.1, 1e-4, False).relu()
+
+        for device in self.devices:
+            i = torch.randn(4, 16, 32, 40, device=device)
+            x = torch.randn(16, device=device)
+            for fn in [bn, bn_no_weight, bn_no_bias, bn_neither]:
+                test(fn, (i, x))
+
+
 works_list = [
     '__radd__',
     '__rdiv__',
diff --git a/torch/csrc/jit/tensorexpr/operators/norm.cpp b/torch/csrc/jit/tensorexpr/operators/norm.cpp
index 610f928d4e0b8..2e19d735d1809 100644
--- a/torch/csrc/jit/tensorexpr/operators/norm.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/norm.cpp
@@ -38,11 +38,15 @@ Tensor computeBatchNorm(
             constant(inputs[7]) // eps
         };
 
+        ExprHandle weight = FloatImm::make(1);
+        ExprHandle bias = FloatImm::make(0);
         if (hasWeight) {
-          exprInputs.push_back(tensorOrConstant(inputs[1], {c}));
+          weight = tensorOrConstant(inputs[1], {c});
+          exprInputs.push_back(weight);
         }
         if (hasBias) {
-          exprInputs.push_back(tensorOrConstant(inputs[2], {c}));
+          bias = tensorOrConstant(inputs[2], {c});
+          exprInputs.push_back(bias);
         }
         promoteInputs(exprInputs);
 
@@ -50,18 +54,7 @@ Tensor computeBatchNorm(
         ExprHandle mean = exprInputs[1];
         ExprHandle var = exprInputs[2];
         ExprHandle eps = exprInputs[3];
-        ExprHandle weight = FloatImm::make(1);
-        ExprHandle bias = FloatImm::make(0);
-
-        if (hasWeight) {
-          weight = exprInputs[4];
-        }
-        // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
-        if (hasBias) {
-          bias = exprInputs[5];
-        }
 
-        // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
         auto inv_var = rsqrt(var + eps);
         auto alpha = inv_var * weight;
         auto beta = bias - mean * alpha;

From 405c15516c3a052fbad33fa8fcb6cde84ebc97d8 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Sat, 28 Aug 2021 19:57:10 -0700
Subject: [PATCH 327/530] Parse int64 sizes/strides (#64076)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64076

We were parsing sizes into int32s, so if you had a tensor with more
than 2^32 elements, you couldn't represent it.
ghstack-source-id: 136933273

Test Plan: parseIR with size of 4e9

Reviewed By: ZolotukhinM

Differential Revision: D30521116

fbshipit-source-id: 1e28e462cba52d648e0e2acb4e234d86aae25a3e
---
 torch/csrc/jit/frontend/schema_type_parser.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/frontend/schema_type_parser.cpp b/torch/csrc/jit/frontend/schema_type_parser.cpp
index db1a1e83bc7ce..b4e6ca880ebce 100644
--- a/torch/csrc/jit/frontend/schema_type_parser.cpp
+++ b/torch/csrc/jit/frontend/schema_type_parser.cpp
@@ -235,7 +235,7 @@ TypePtr SchemaTypeParser::parseRefinedTensor() {
           const std::string& num = L.expect(TK_NUMBER).text();
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           std::string::size_type num_len;
-          size_t stride = c10::stoi(num, &num_len);
+          auto stride = c10::stoll(num, &num_len);
           strides.push_back(stride);
         });
         return;
@@ -260,7 +260,7 @@ TypePtr SchemaTypeParser::parseRefinedTensor() {
     const std::string& num = L.expect(TK_NUMBER).text();
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     std::string::size_type num_len;
-    size_t dim = c10::stoi(num, &num_len);
+    auto dim = c10::stoll(num, &num_len);
     dims.emplace_back(dim);
   });
   if (seen_strides) {

From 2e6221a232d39917e2736b248c53fa85dfb8986e Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Sat, 28 Aug 2021 19:57:10 -0700
Subject: [PATCH 328/530] [nnc] Make 64-bit dimensions work (#64077)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64077

We were assuming kernel dimensions fit in 32 bits (the old fuser made
this assumption too), but we should be able to support 64.
ghstack-source-id: 136933272

Test Plan: unit tests; new IR level test with huge sizes

Reviewed By: ZolotukhinM

Differential Revision: D30596689

fbshipit-source-id: 23b7e393a2ebaecb0c391a6b1f0c4b05a98bcc94
---
 test/cpp/tensorexpr/test_kernel.cpp           |  40 ++++--
 test/cpp/tensorexpr/test_llvm.cpp             |  78 ++++++-----
 test/cpp/tensorexpr/test_loopnest.cpp         |   7 +-
 test/cpp/tensorexpr/test_reductions.cpp       |   1 -
 torch/csrc/jit/tensorexpr/block_codegen.cpp   |  11 +-
 .../csrc/jit/tensorexpr/bounds_inference.cpp  |   2 +-
 torch/csrc/jit/tensorexpr/bounds_overlap.cpp  |  13 +-
 torch/csrc/jit/tensorexpr/cuda_codegen.cpp    |  19 +--
 torch/csrc/jit/tensorexpr/eval.cpp            |  56 ++++++--
 torch/csrc/jit/tensorexpr/eval.h              |  14 ++
 torch/csrc/jit/tensorexpr/expr.h              |   2 +-
 torch/csrc/jit/tensorexpr/ir.cpp              |   6 +-
 torch/csrc/jit/tensorexpr/ir.h                |  24 ++++
 torch/csrc/jit/tensorexpr/ir_printer.cpp      |   8 ++
 torch/csrc/jit/tensorexpr/ir_simplifier.cpp   | 129 ++++++++----------
 torch/csrc/jit/tensorexpr/kernel.cpp          |  81 +++++------
 torch/csrc/jit/tensorexpr/kernel.h            |   4 +-
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp    |  39 +++---
 torch/csrc/jit/tensorexpr/llvm_jit.h          |   4 +-
 torch/csrc/jit/tensorexpr/loopnest.cpp        | 114 +++++++++-------
 .../jit/tensorexpr/mem_dependency_checker.cpp |  27 ++--
 torch/csrc/jit/tensorexpr/registerizer.cpp    |   5 +-
 torch/csrc/jit/tensorexpr/tensor.cpp          |   7 +-
 23 files changed, 397 insertions(+), 294 deletions(-)

diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp
index 625fadb811710..f4d3b16b964f2 100644
--- a/test/cpp/tensorexpr/test_kernel.cpp
+++ b/test/cpp/tensorexpr/test_kernel.cpp
@@ -198,6 +198,22 @@ TEST_F(Kernel, _3) {
   }
 }
 
+TEST_F(Kernel, Huge) {
+  const auto graph_string = R"IR(
+      graph(%x.1 : Float(4000000000, strides=[1], requires_grad=0, device=cpu)):
+        %1 : int = prim::Constant[value=0]()
+        %2 : Float(1, 4000000000, strides=[4000000000, 1], requires_grad=0, device=cpu) = aten::unsqueeze(%x.1, %1)
+        %3 : Float(1, 4000000000, strides=[4000000000, 1], requires_grad=0, device=cpu) = aten::relu(%2)
+        return (%3))IR";
+  auto graph = std::make_shared<Graph>();
+  parseIR(graph_string, &*graph);
+  TensorExprKernel k(graph);
+  std::ostringstream oss;
+  oss << *k.getCodeGenStmt();
+  const std::string& verification_pattern = "# CHECK: 4000000000";
+  torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
+}
+
 TEST_F(Kernel, ParallelStrided) {
   const auto graph_string = R"IR(
       graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu),
@@ -786,9 +802,9 @@ TEST_F(Kernel, SumOneAxis) {
         // Check the IR we produced
         const std::string& verification_pattern =
             R"IR(
-# CHECK: for (int v = 0; v <
+# CHECK: for (int64_t v = 0ll; v <
 # CHECK-NEXT: sum
-# CHECK-NEXT: for (int v_1 = 0; v_1 <
+# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 <
 # CHECK-NEXT:   sum)IR";
         torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
 
@@ -847,10 +863,10 @@ TEST_F(Kernel, SumMultipleAxes) {
         // Check the IR we produced
         const std::string& verification_pattern =
             R"IR(
-# CHECK: int v = 0
-# CHECK: int v_1 = 0
-# CHECK: int v_2 = 0
-# CHECK: int v_3 = 0
+# CHECK: int64_t v = 0
+# CHECK: int64_t v_1 = 0
+# CHECK: int64_t v_2 = 0
+# CHECK: int64_t v_3 = 0
 # CHECK: sum)IR";
         torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
 
@@ -1115,8 +1131,8 @@ TEST_F(Kernel, InlineProducerIntoReduction) {
   // We should have only one loop in the end.
   const std::string& verification_pattern =
       R"IR(
-        # CHECK: for (int v = 0; v < 5;
-        # CHECK-NEXT: for (int v_1 = 0; v_1 < 3;
+        # CHECK: for (int64_t v = 0ll; v < 5
+        # CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < 3
         # CHECK-NEXT:   sum
         # CHECK-NOT: for)IR";
   torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
@@ -1154,11 +1170,11 @@ TEST_F(Kernel, InlineReductionIntoConsumer) {
   // We should have two loops in the end.
   const std::string& verification_pattern =
       R"IR(
-        # CHECK: for (int v = 0; v < 5;
-        # CHECK-NEXT: for (int v_1 = 0; v_1 < 3;
+        # CHECK: for (int64_t v = 0ll; v < 5
+        # CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < 3
         # CHECK-NEXT:   sum
-        # CHECK: for (int v_2 = 0; v_2 < 5;
-        # CHECK-NEXT: for (int v_3 = 0; v_3 < 3;
+        # CHECK: for (int64_t v_2 = 0ll; v_2 < 5
+        # CHECK-NEXT: for (int64_t v_3 = 0ll; v_3 < 3
         # CHECK-NEXT:   aten_mul
         # CHECK-NOT: for)IR";
   torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp
index 139763b071317..0e5cf5eb03a3d 100644
--- a/test/cpp/tensorexpr/test_llvm.cpp
+++ b/test/cpp/tensorexpr/test_llvm.cpp
@@ -1501,42 +1501,54 @@ TEST(LLVM, RFactorVectorizedReduction) {
   ExpectAllNear(b_v, b_ref, 1e-5);
 }
 
-TEST(LLVM, SimpleParallel) {
-  for (int test_cfg = 0; test_cfg < 4; test_cfg++) {
-    // Compute a simple operation, and try all loop-axis combination to be
-    // parallel or sequential.
-    const int M = 4;
-    const int N = 6;
-    Tensor f = Compute(
-        "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) {
-          return cast<float>(m + n);
-        });
-    LoopNest loop_nest({f});
-    auto const& loops = loop_nest.getLoopStmtsFor(f);
-    ForPtr m = loops[0];
-    ForPtr n = loops[1];
-    if (test_cfg & 0x1) {
-      m->set_parallel();
-    }
-    if (test_cfg & 0x2) {
-      n->set_parallel();
-    }
-    loop_nest.prepareForCodegen();
-    StmtPtr stmt = loop_nest.root_stmt();
-    LLVMCodeGen cg(stmt, {f});
+template <bool outer, bool inner>
+static void testSimpleParallel() {
+  // Compute a simple operation, and try all loop-axis combination to be
+  // parallel or sequential.
+  const int M = 4;
+  const int N = 6;
+  Tensor f = Compute(
+      "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) {
+        return cast<float>(m + n);
+      });
+  LoopNest loop_nest({f});
+  auto const& loops = loop_nest.getLoopStmtsFor(f);
+  ForPtr m = loops[0];
+  ForPtr n = loops[1];
+  if (outer) {
+    m->set_parallel();
+  }
+  if (inner) {
+    n->set_parallel();
+  }
+  loop_nest.prepareForCodegen();
+  StmtPtr stmt = loop_nest.root_stmt();
+  LLVMCodeGen cg(stmt, {f});
 
-    PaddedBuffer<float> f_v(M, N, "f_v");
-    std::vector<void*> args({f_v.data()});
-    int value = cg.value<int>(args);
-    ASSERT_EQ(value, 0);
-    PaddedBuffer<float> f_ref(M, N, "f_ref");
-    for (int m = 0; m < M; m++) {
-      for (int n = 0; n < N; n++) {
-        f_ref(m, n) = m + n;
-      }
+  PaddedBuffer<float> f_v(M, N, "f_v");
+  std::vector<void*> args({f_v.data()});
+  int value = cg.value<int>(args);
+  ASSERT_EQ(value, 0);
+  PaddedBuffer<float> f_ref(M, N, "f_ref");
+  for (int m = 0; m < M; m++) {
+    for (int n = 0; n < N; n++) {
+      f_ref(m, n) = m + n;
     }
-    ExpectAllNear(f_v, f_ref, 1e-5);
   }
+  ExpectAllNear(f_v, f_ref, 1e-5);
+}
+
+TEST(LLVM, SimpleParallelSS) {
+  testSimpleParallel<false, false>();
+}
+TEST(LLVM, SimpleParallelSP) {
+  testSimpleParallel<false, true>();
+}
+TEST(LLVM, SimpleParallelPS) {
+  testSimpleParallel<true, false>();
+}
+TEST(LLVM, SimpleParallelPP) {
+  testSimpleParallel<true, true>();
 }
 
 TEST(LLVM, CompositeParallel) {
diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index 28934f622d057..c2b33e2a184d2 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -4734,8 +4734,8 @@ TEST(LoopNest, VectorizeUse) {
 }
 
 const char* int64Loop = R"IR(
-# CHECK: for (int64_t n = 0; n < 12; n++) {
-# CHECK:   b[n] = (a[n]) + 1;
+# CHECK: for (int64_t n = 0ll; n < 12ll; n++) {
+# CHECK:   b[n] = (a[n]) + 1ll;
 # CHECK: }
 )IR";
 
@@ -4744,7 +4744,8 @@ TEST(LoopNest, Int64Direct) {
   Placeholder a("a", kLong, {N});
   Placeholder b("b", kLong, {N});
   VarHandle n("n", kLong);
-  StmtPtr s = For::make(n, 0, N, b.store({n}, a.load({n}) + LongImm::make(1l)));
+  StmtPtr s = For::make(
+      n, LongImm::make(0l), N, b.store({n}, a.load({n}) + LongImm::make(1l)));
   s = IRSimplifier::simplify(s);
   std::ostringstream oss;
   oss << *s;
diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp
index 411b58db57f9e..3d2c0ecc27bfe 100644
--- a/test/cpp/tensorexpr/test_reductions.cpp
+++ b/test/cpp/tensorexpr/test_reductions.cpp
@@ -1712,7 +1712,6 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) {
 #CHECK-NOT: tmp
       )IR";
   torch::jit::testing::FileCheck().run(expected_ir, oss.str());
-
   SimpleIREvaluator cg(s, {b, c, m, n, k});
 
   cg.call({in, out, M, N, K});
diff --git a/torch/csrc/jit/tensorexpr/block_codegen.cpp b/torch/csrc/jit/tensorexpr/block_codegen.cpp
index 1ae3330799c64..51b7b77f6d39d 100644
--- a/torch/csrc/jit/tensorexpr/block_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/block_codegen.cpp
@@ -76,7 +76,7 @@ void BlockAnalysis::visit(ForPtr v) {
     v->body()->accept(this);
   } else if (loop_options.is_gpu_thread_index()) {
     auto block_size = v->stop();
-    block_size_ = to<IntImm>(block_size)->value();
+    block_size_ = *intValue(block_size);
     v->body()->accept(this);
   } else {
     IRVisitor::visit(v);
@@ -185,15 +185,14 @@ void BlockPrinter::PrintArguments(const std::unordered_set<BufPtr>& bufs) {
 
     // The dims for the multi-dim tensors
     for (unsigned long d = 0; d < num_dims; d++) {
-      auto dim_val = to<IntImm>(multidimbuf->dim(d));
-      this->dim_values_map.emplace(this->dim_names[d], dim_val->value());
+      auto dim_val = *intValue(multidimbuf->dim(d));
+      this->dim_values_map.emplace(this->dim_names[d], dim_val);
     }
 
     // The dimensions for the flattened tensors
-    auto val = to<IntImm>(buf->dim(0));
+    auto val = *intValue(buf->dim(0));
     if (block_analysis_->is_buf_store_target(buf)) {
-      this->dim_values_map.emplace(
-          this->flat_dim_names[num_dims - 1], val->value());
+      this->dim_values_map.emplace(this->flat_dim_names[num_dims - 1], val);
     }
   }
 
diff --git a/torch/csrc/jit/tensorexpr/bounds_inference.cpp b/torch/csrc/jit/tensorexpr/bounds_inference.cpp
index 55dbacf087899..649fd0e69da8e 100644
--- a/torch/csrc/jit/tensorexpr/bounds_inference.cpp
+++ b/torch/csrc/jit/tensorexpr/bounds_inference.cpp
@@ -185,7 +185,7 @@ std::vector<ExprPtr> getBoundExtents(
   std::vector<ExprPtr> extents;
   for (size_t i = 0; i < starts.size(); ++i) {
     ExprPtr dim = IRSimplifier::simplify(
-        alloc<Add>(alloc<Sub>(stops[i], starts[i]), alloc<IntImm>(1)));
+        alloc<Add>(alloc<Sub>(stops[i], starts[i]), immLike(stops[i], 1)));
 
     extents.push_back(dim);
   }
diff --git a/torch/csrc/jit/tensorexpr/bounds_overlap.cpp b/torch/csrc/jit/tensorexpr/bounds_overlap.cpp
index 4ac5c6b96fb9a..fdfff12ad7666 100644
--- a/torch/csrc/jit/tensorexpr/bounds_overlap.cpp
+++ b/torch/csrc/jit/tensorexpr/bounds_overlap.cpp
@@ -130,8 +130,8 @@ std::vector<Bound> subtractBound(Bound a, Bound b, OverlapKind overlap) {
     auto vars = VarFinder::find(lowDiff);
     if (vars.size() == 1) {
       lowDiff = IRSimplifier::simplify(alloc<Sub>(
-          SubstituteInClone(b.start, {{*vars.begin(), alloc<IntImm>(1)}}),
-          SubstituteInClone(a.start, {{*vars.begin(), alloc<IntImm>(1)}})));
+          SubstituteInClone(b.start, {{*vars.begin(), immLike(b.start, 1)}}),
+          SubstituteInClone(a.start, {{*vars.begin(), immLike(a.start, 1)}})));
     }
   }
 
@@ -139,8 +139,8 @@ std::vector<Bound> subtractBound(Bound a, Bound b, OverlapKind overlap) {
     auto vars = VarFinder::find(highDiff);
     if (vars.size() == 1) {
       highDiff = IRSimplifier::simplify(alloc<Sub>(
-          SubstituteInClone(b.end, {{*vars.begin(), alloc<IntImm>(1)}}),
-          SubstituteInClone(a.end, {{*vars.begin(), alloc<IntImm>(1)}})));
+          SubstituteInClone(b.end, {{*vars.begin(), immLike(b.end, 1)}}),
+          SubstituteInClone(a.end, {{*vars.begin(), immLike(a.end, 1)}})));
     }
   }
 
@@ -157,12 +157,13 @@ std::vector<Bound> subtractBound(Bound a, Bound b, OverlapKind overlap) {
 
   if (hasHead) {
     res.emplace_back(
-        a.start, IRSimplifier::simplify(alloc<Sub>(b.start, alloc<IntImm>(1))));
+        a.start,
+        IRSimplifier::simplify(alloc<Sub>(b.start, immLike(b.start, 1))));
   }
 
   if (hasTail) {
     ExprPtr tailStart =
-        IRSimplifier::simplify(alloc<Add>(b.end, alloc<IntImm>(1)));
+        IRSimplifier::simplify(alloc<Add>(b.end, immLike(b.end, 1)));
     res.emplace_back(tailStart, a.end);
   }
 
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index b342f1464b0c2..30d42075189fb 100644
--- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -45,18 +45,9 @@ class ScopedVarName {
   VarPtr var_ = nullptr;
 };
 
-static int as_int(ExprPtr expr) {
-  auto v = to<IntImm>(expr);
-  if (!v) {
-    throw malformed_input(
-        "cuda_codegen: non Int expr interpreted as int", expr);
-  }
-
-  return v->value();
-}
-
 static bool is_zero(ExprPtr expr) {
-  return as_int(expr) == 0;
+  auto v = intValue(expr);
+  return v && *v == 0;
 }
 
 static const at::cuda::NVRTC& nvrtc() {
@@ -222,11 +213,11 @@ void CudaPrinter::print_flat_alloc(AllocatePtr alloc) {
   // TODO: this should be merged with the storage flattener.
   int64_t flat_size = 1;
   for (auto dim : dims) {
-    IntImmPtr dim_i = to<IntImm>(dim);
+    auto dim_i = intValue(dim);
     if (dim_i) {
-      flat_size *= dim_i->value();
+      flat_size *= *dim_i;
     } else {
-      throw std::runtime_error("Only IntImm dimensions are supported for now");
+      throw std::runtime_error("Only integer dimensions are supported for now");
     }
   }
   os() << dtypeToCppString(alloc->dtype()) << " " << (*alloc->buffer_var())
diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp
index 05c3ff8245221..e42ce77820e11 100644
--- a/torch/csrc/jit/tensorexpr/eval.cpp
+++ b/torch/csrc/jit/tensorexpr/eval.cpp
@@ -10,6 +10,17 @@ namespace tensorexpr {
 
 RegisterCodeGen<SimpleIREvaluator> ir_eval_codegen_reg("simple_ir_eval");
 
+int64_t Value::intValue() const {
+#define TYPE_CASE(Type, Name)        \
+  if (dtype_ == k##Name) {           \
+    return int64_t{Name##values[0]}; \
+  }
+  AT_FORALL_INT_TYPES(TYPE_CASE);
+#undef TYPE_CASE
+  throw unsupported_dtype();
+  return 0;
+}
+
 template <typename T>
 inline typename std::enable_if<std::is_integral<T>::value, T>::type mod_value(
     T lhs,
@@ -537,15 +548,16 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   TORCH_API void visit(ForPtr v) override {
     ExprPtr var_node = v->var();
     v->start()->accept(this);
-    int start = value_.as<int>();
+    auto dtype = value_.dtype();
+    auto start = value_.intValue();
     v->stop()->accept(this);
-    int stop = value_.as<int>();
+    auto stop = value_.intValue();
     if (eval_context_.count(var_node)) {
       throw malformed_input("could not find var_node in For context", v);
     }
 
-    for (int i = start; i < stop; i++) {
-      eval_context_[var_node] = Value(i);
+    for (auto i = start; i < stop; i++) {
+      eval_context_[var_node] = Value(dtype, i);
       if (v->body()) {
         v->body()->accept(this);
       }
@@ -555,9 +567,9 @@ class SimpleIREvaluatorImpl : public IRVisitor {
 
   TORCH_API void visit(RampPtr v) override {
     v->base()->accept(this);
-    int base = value().as<int>();
+    auto base = value().intValue();
     v->stride()->accept(this);
-    int stride = value().as<int>();
+    auto stride = value().intValue();
     int lanes = v->lanes();
 
     std::vector<int> values(lanes);
@@ -609,6 +621,24 @@ class SimpleIREvaluatorImpl : public IRVisitor {
     }
   }
 
+  template <typename T>
+  std::vector<int64_t> toLongVec(T&& t) {
+    return std::vector<int64_t>{std::begin(t), std::end(t)};
+  }
+
+  std::vector<int64_t> indexVec(const Value& v) {
+    switch (v.dtype().scalar_type()) {
+#define TYPE_CASE(Type, Name) \
+  case ScalarType::Name:      \
+    return toLongVec(v.as_vec<Type>());
+      AT_FORALL_INT_TYPES(TYPE_CASE);
+#undef TYPE_CASE
+      default:
+        throw unsupported_dtype();
+    }
+    return {};
+  }
+
   TORCH_API void visit(LoadPtr v) override {
     auto iter = buffer_mapping_.find(v->buf());
     if (iter == buffer_mapping_.end()) {
@@ -618,7 +648,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
 
     ExprPtr flat_idx = flatten_index(v->buf()->dims(), v->indices());
     flat_idx->accept(this);
-    std::vector<int> index = value().as_vec<int>();
+    auto index = indexVec(value());
     ScalarType v_sdtype = v->dtype().scalar_type();
     switch (v_sdtype) {
 #define TYPE_CASE(Type, Name)                        \
@@ -647,7 +677,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
 
     ExprPtr flat_idx = flatten_index(v->buf()->dims(), v->indices());
     flat_idx->accept(this);
-    std::vector<int> index = value().as_vec<int>();
+    auto index = indexVec(value());
     ScalarType v_sdtype = v->value()->dtype().scalar_type();
 
     switch (v_sdtype) {
@@ -696,7 +726,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
       buf_dtypes.push_back((int8_t)b->dtype().scalar_type());
       for (ExprPtr dim_expr : b->dims()) {
         dim_expr->accept(this);
-        buf_dims.push_back(value().as<int>());
+        buf_dims.push_back(value().intValue());
       }
     }
     for (ExprPtr a : v->args()) {
@@ -706,7 +736,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
       if (value().dtype() == kLong) {
         val = value().as<int64_t>();
       } else if (value().dtype() == kInt) {
-        val = value().as<int>();
+        val = value().intValue();
       } else {
         throw malformed_input(
             "extra_args in ExternalCalls must have int64 dtype", v);
@@ -789,10 +819,10 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   void visit(AllocatePtr v) override {
     BufPtr b = v->buf();
     std::vector<ExprPtr> dims = b->dims();
-    int total_byte_size = b->dtype().byte_size();
+    int64_t total_byte_size = b->dtype().byte_size();
     for (auto& dim : dims) {
       dim->accept(this);
-      total_byte_size *= value_.as<int>();
+      total_byte_size *= value_.intValue();
     }
     auto int_count = (total_byte_size + sizeof(int) - 1) / sizeof(int);
     std::unique_ptr<std::vector<int>> buffer(new std::vector<int>(int_count));
@@ -824,7 +854,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
 
   void visit(CondPtr v) override {
     v->condition()->accept(this);
-    if (value().as<int>()) {
+    if (value().intValue()) {
       if (v->true_stmt()) {
         v->true_stmt()->accept(this);
       }
diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h
index 38ec99bd431cf..494ba283ea902 100644
--- a/torch/csrc/jit/tensorexpr/eval.h
+++ b/torch/csrc/jit/tensorexpr/eval.h
@@ -29,6 +29,18 @@ class Value {
     Intvalues.push_back(0);
   }
 
+  template <typename T>
+  Value(Dtype dtype, T v) : dtype_(dtype) {
+#define TYPE_CASE(Type, Name)  \
+  if (dtype == k##Name) {      \
+    Name##values.push_back(v); \
+    return;                    \
+  }
+    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+#undef TYPE_CASE
+    throw unsupported_dtype();
+  }
+
 #define VALUE_CTOR(Type, Name)      \
   Value(Type v) : dtype_(k##Name) { \
     Name##values.push_back(v);      \
@@ -50,6 +62,8 @@ class Value {
   template <typename T>
   const std::vector<T>& as_vec() const;
 
+  int64_t intValue() const;
+
   Dtype dtype() const {
     return dtype_;
   }
diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h
index a4f317f48e666..fbbea12387920 100644
--- a/torch/csrc/jit/tensorexpr/expr.h
+++ b/torch/csrc/jit/tensorexpr/expr.h
@@ -319,7 +319,7 @@ class TORCH_API BufHandle : public ExprHandle {
 // object. For example: VarHandle x('x'); ExprHandle x2 = x;
 class TORCH_API VarHandle : public ExprHandle {
  public:
-  VarHandle() : ExprHandle(nullptr) {}
+  VarHandle() : ExprHandle() {}
   explicit VarHandle(Dtype dtype) : ExprHandle(Var::make(dtype)) {}
   VarHandle(const std::string& name_hint, Dtype dtype)
       : ExprHandle(Var::make(name_hint, dtype)) {}
diff --git a/torch/csrc/jit/tensorexpr/ir.cpp b/torch/csrc/jit/tensorexpr/ir.cpp
index f66c0c5ba0701..2680f5366b46e 100644
--- a/torch/csrc/jit/tensorexpr/ir.cpp
+++ b/torch/csrc/jit/tensorexpr/ir.cpp
@@ -88,17 +88,17 @@ ExprPtr flatten_index(
     throw malformed_input("dimensions mismatch in flatten_index");
   }
   if (ndim == 0) {
-    return alloc<IntImm>(0);
+    return alloc<LongImm>(0);
   }
   std::vector<ExprPtr> strides(ndim);
   // stride[i] = stride[i+1]*dims[i+1], i < ndim-1
   // stride[i] = 1,                     i = ndim-1
-  strides[ndim - 1] = alloc<IntImm>(1);
+  strides[ndim - 1] = immLike(dims[ndim - 1], 1);
   for (size_t i = 1; i < ndim; i++) {
     strides[ndim - 1 - i] = alloc<Mul>(strides[ndim - i], dims[ndim - i]);
   }
 
-  ExprPtr total_index = alloc<IntImm>(0);
+  ExprPtr total_index = immLike(indices[0], 0);
   for (const auto i : c10::irange(ndim)) {
     total_index = alloc<Add>(total_index, alloc<Mul>(indices[i], strides[i]));
   }
diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h
index 7fe1fd1a07abb..1218082e6af98 100644
--- a/torch/csrc/jit/tensorexpr/ir.h
+++ b/torch/csrc/jit/tensorexpr/ir.h
@@ -344,6 +344,30 @@ ExprPtr getImmediateByType(Dtype dtype, T initialVal) {
   return getImmediateByType<T>(dtype.scalar_type(), initialVal);
 }
 
+template <typename T>
+ExprPtr immLike(ExprPtr e, T v) {
+  return getImmediateByType<T>(e->dtype(), v);
+}
+
+template <typename T>
+ExprPtr immLike(ExprHandle e, T v) {
+  return immLike(e.node(), v);
+}
+
+inline c10::optional<int64_t> intValue(ExprPtr e) {
+#define TYPE_CASE(Type, Name)      \
+  if (auto v = to<Name##Imm>(e)) { \
+    return v->value();             \
+  }
+  AT_FORALL_INT_TYPES(TYPE_CASE);
+#undef TYPE_CASE
+  return c10::nullopt;
+}
+
+inline c10::optional<int64_t> intValue(ExprHandle e) {
+  return intValue(e.node());
+}
+
 template <typename T>
 T immediateAs(ExprPtr e) {
 #define TYPE_CASE(Type, Name)                \
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp
index 2e1fc6e6952a7..ca90d9995e0d2 100644
--- a/torch/csrc/jit/tensorexpr/ir_printer.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp
@@ -206,11 +206,19 @@ static void formatImm(std::ostream& os, T v) {
   }
 }
 
+static void formatIntSuffix(std::ostream& os, int64_t v) {
+  os << "ll";
+}
+
+template <typename T>
+static void formatIntSuffix(std::ostream& os, T v) {}
+
 template <
     typename T,
     std::enable_if_t<!std::is_floating_point<T>::value>* = nullptr>
 static void formatImm(std::ostream& os, T v) {
   os << +v;
+  formatIntSuffix(os, v);
 }
 
 // NOLINTNEXTLINE
diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
index 23216dd4002f7..6820bbb5748a2 100644
--- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
@@ -430,8 +430,7 @@ ExprPtr PolynomialTransformer::mutate(AddPtr v) {
 
     // Otherwise this is a new polynomial with no scalar and two variable
     // terms.
-    return alloc<Polynomial>(
-        hasher_, getImmediateByType(v->dtype(), 0), lhsTerm, rhsTerm);
+    return alloc<Polynomial>(hasher_, immLike(v, 0), lhsTerm, rhsTerm);
   }
 
   // Adds are commutative.
@@ -452,19 +451,17 @@ ExprPtr PolynomialTransformer::mutate(AddPtr v) {
   // Simple Term with a scalar and variable type.
   if (scalar) {
     return alloc<Polynomial>(
-        hasher_,
-        scalar,
-        alloc<Term>(hasher_, getImmediateByType(v->dtype(), 1), variable));
+        hasher_, scalar, alloc<Term>(hasher_, immLike(v, 1), variable));
   }
 
   // If LHS is neither Term not Polynomial, wrap it in a Term.
   if (!lhsTerm && !lhsPoly) {
-    lhsTerm = alloc<Term>(hasher_, getImmediateByType(v->dtype(), 1), lhs_new);
+    lhsTerm = alloc<Term>(hasher_, immLike(v, 1), lhs_new);
   }
 
   // Same for RHS.
   if (!rhsTerm && !rhsPoly) {
-    rhsTerm = alloc<Term>(hasher_, getImmediateByType(v->dtype(), 1), rhs_new);
+    rhsTerm = alloc<Term>(hasher_, immLike(v, 1), rhs_new);
   }
 
   // If we now have a poly and a term, we can insert.
@@ -480,8 +477,7 @@ ExprPtr PolynomialTransformer::mutate(AddPtr v) {
   }
 
   // If all else fails we have a new Polynomial with two new variable Terms.
-  return alloc<Polynomial>(
-      hasher_, getImmediateByType(v->dtype(), 0), lhsTerm, rhsTerm);
+  return alloc<Polynomial>(hasher_, immLike(v, 0), lhsTerm, rhsTerm);
 }
 
 ExprPtr PolynomialTransformer::subTerms(
@@ -490,7 +486,7 @@ ExprPtr PolynomialTransformer::subTerms(
     bool negated) {
   // If RHS not already negated, negate it.
   if (!negated) {
-    ExprPtr minusOne = getImmediateByType(rhs->dtype(), -1);
+    ExprPtr minusOne = immLike(rhs, -1);
     ExprPtr negateScalar = evaluateOp(alloc<Mul>(minusOne, rhs->scalar()));
     rhs = alloc<Term>(hasher_, negateScalar, rhs->variables());
   }
@@ -529,8 +525,7 @@ ExprPtr PolynomialTransformer::subPolynomials(
 
   for (auto rt : rhs->variables()) {
     // Polynomials add their terms, so negate the RHS's Terms.
-    ExprPtr negated = evaluateOp(
-        alloc<Mul>(getImmediateByType(rt->dtype(), -1), rt->scalar()));
+    ExprPtr negated = evaluateOp(alloc<Mul>(immLike(rt, -1), rt->scalar()));
     TermPtr newRHS = alloc<Term>(hasher_, negated, rt->variables());
     addOrUpdateTerm(varmap, newRHS);
   }
@@ -594,7 +589,7 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) {
     auto ret = subPolynomials(lhsPoly, rhsPoly);
     if (!ret) {
       // Cancelled out completely.
-      return getImmediateByType(v->dtype(), 0);
+      return immLike(v, 0);
     }
     return ret;
   }
@@ -605,8 +600,8 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) {
   // Polynomial - Term.
   if (lhsPoly && rhsTerm) {
     // Negate the term.
-    ExprPtr negate = evaluateOp(alloc<Mul>(
-        getImmediateByType(rhsTerm->dtype(), -1), rhsTerm->scalar()));
+    ExprPtr negate =
+        evaluateOp(alloc<Mul>(immLike(rhsTerm, -1), rhsTerm->scalar()));
     TermPtr newTerm = alloc<Term>(hasher_, negate, rhsTerm->variables());
     return insertTerm(lhsPoly, newTerm);
   }
@@ -614,7 +609,7 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) {
   // Term - Polynomial.
   if (rhsPoly && lhsTerm) {
     // Negate every part of the Polynomial.
-    ExprPtr minusOne = getImmediateByType(lhsTerm->dtype(), -1);
+    ExprPtr minusOne = immLike(lhsTerm, -1);
     ExprPtr negateScalar = evaluateOp(alloc<Mul>(minusOne, rhsPoly->scalar()));
 
     std::vector<TermPtr> variables;
@@ -645,7 +640,7 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) {
     ExprPtr newScalar = evaluateOp(alloc<Sub>(lhs_new, rhsPoly->scalar()));
 
     // Negate each term in the Polynomial RHS.
-    ExprPtr minusOne = getImmediateByType(rhsPoly->dtype(), -1);
+    ExprPtr minusOne = immLike(rhsPoly, -1);
     std::vector<TermPtr> variables;
     for (auto t : rhsPoly->variables()) {
       ExprPtr negate = evaluateOp(alloc<Mul>(minusOne, t->scalar()));
@@ -657,15 +652,14 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) {
 
   if (lhsTerm && rhsScalar) {
     // Negate the constant.
-    ExprPtr negate = evaluateOp(
-        alloc<Mul>(getImmediateByType(rhs_new->dtype(), -1), rhs_new));
+    ExprPtr negate = evaluateOp(alloc<Mul>(immLike(rhs_new, -1), rhs_new));
     return alloc<Polynomial>(hasher_, negate, lhsTerm);
   }
 
   if (lhsScalar && rhsTerm) {
     // Negate the RHS Term.
-    ExprPtr negate = evaluateOp(alloc<Mul>(
-        getImmediateByType(rhsTerm->scalar()->dtype(), -1), rhsTerm->scalar()));
+    ExprPtr negate = evaluateOp(
+        alloc<Mul>(immLike(rhsTerm->scalar(), -1), rhsTerm->scalar()));
 
     return alloc<Polynomial>(
         hasher_, lhs_new, alloc<Term>(hasher_, negate, rhsTerm->variables()));
@@ -675,29 +669,24 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) {
   if (lhsScalar) {
     // Create a negated term.
     return alloc<Polynomial>(
-        hasher_,
-        lhs_new,
-        alloc<Term>(hasher_, getImmediateByType(v->dtype(), -1), rhs_new));
+        hasher_, lhs_new, alloc<Term>(hasher_, immLike(v, -1), rhs_new));
   }
 
   if (rhsScalar) {
     // Negate the scalar.
-    ExprPtr negate = evaluateOp(
-        alloc<Mul>(getImmediateByType(rhs_new->dtype(), -1), rhs_new));
+    ExprPtr negate = evaluateOp(alloc<Mul>(immLike(rhs_new, -1), rhs_new));
     return alloc<Polynomial>(
-        hasher_,
-        negate,
-        alloc<Term>(hasher_, getImmediateByType(v->dtype(), 1), lhs_new));
+        hasher_, negate, alloc<Term>(hasher_, immLike(v, 1), lhs_new));
   }
 
   // no scalar...
   if (!lhsTerm && !lhsPoly) {
-    lhsTerm = alloc<Term>(hasher_, getImmediateByType(v->dtype(), 1), lhs_new);
+    lhsTerm = alloc<Term>(hasher_, immLike(v, 1), lhs_new);
   }
 
   bool createdRHSnegated = false;
   if (!rhsTerm && !rhsPoly) {
-    rhsTerm = alloc<Term>(hasher_, getImmediateByType(v->dtype(), -1), rhs_new);
+    rhsTerm = alloc<Term>(hasher_, immLike(v, -1), rhs_new);
     createdRHSnegated = true;
   }
 
@@ -714,7 +703,7 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) {
   // Insert wrapper Term into negated RHS Poly.
   if (rhsPoly) {
     CHECK(lhsTerm);
-    ExprPtr minusOne = getImmediateByType(rhsPoly->dtype(), -1);
+    ExprPtr minusOne = immLike(rhsPoly, -1);
     ExprPtr newScalar = evaluateOp(alloc<Mul>(minusOne, rhsPoly->scalar()));
 
     // Negate each term in the Polynomial RHS.
@@ -728,8 +717,7 @@ ExprPtr PolynomialTransformer::mutate(SubPtr v) {
     return insertTerm(poly, lhsTerm);
   }
 
-  return alloc<Polynomial>(
-      hasher_, getImmediateByType(v->dtype(), 0), lhsTerm, rhsTerm);
+  return alloc<Polynomial>(hasher_, immLike(v, 0), lhsTerm, rhsTerm);
 }
 
 // Multiply two terms together, usually creating a new term with the variable
@@ -930,7 +918,7 @@ ExprPtr PolynomialTransformer::mutate(MulPtr v) {
 
   // Handle special case mul by 0.
   if (scalar && immediateEquals(scalar, 0)) {
-    return getImmediateByType(v->dtype(), 0);
+    return immLike(v, 0);
   }
 
   // Catch cases of rounding (Div(A/B) * B).
@@ -994,13 +982,11 @@ ExprPtr PolynomialTransformer::mutate(MulPtr v) {
   // Multiplying Polynomial by variable can be wrapped in a term and handled
   // by polyByTerm also.
   if (lhsPoly) {
-    auto term =
-        alloc<Term>(hasher_, getImmediateByType(rhs_new->dtype(), 1), rhs_new);
+    auto term = alloc<Term>(hasher_, immLike(rhs_new, 1), rhs_new);
     return polyByTerm(lhsPoly, term);
   }
   if (rhsPoly) {
-    auto term =
-        alloc<Term>(hasher_, getImmediateByType(lhs_new->dtype(), 1), lhs_new);
+    auto term = alloc<Term>(hasher_, immLike(lhs_new, 1), lhs_new);
     return polyByTerm(rhsPoly, term);
   }
 
@@ -1014,8 +1000,7 @@ ExprPtr PolynomialTransformer::mutate(MulPtr v) {
   }
 
   // Two variables, create a new Term.
-  return alloc<Term>(
-      hasher_, getImmediateByType(v->dtype(), 1), lhs_new, rhs_new);
+  return alloc<Term>(hasher_, immLike(v, 1), lhs_new, rhs_new);
 }
 
 ExprPtr factorizeDivision(ExprPtr lhs_new, ExprPtr rhs_new) {
@@ -1048,10 +1033,8 @@ ExprPtr factorizeDivision(ExprPtr lhs_new, ExprPtr rhs_new) {
     return nullptr;
   }
 
-  leftScalar = evaluateOp(
-      alloc<Div>(leftScalar, getImmediateByType(leftScalar->dtype(), GCD)));
-  rightScalar = evaluateOp(
-      alloc<Div>(rightScalar, getImmediateByType(rightScalar->dtype(), GCD)));
+  leftScalar = evaluateOp(alloc<Div>(leftScalar, immLike(leftScalar, GCD)));
+  rightScalar = evaluateOp(alloc<Div>(rightScalar, immLike(rightScalar, GCD)));
 
   if (lhsTerm) {
     lhs_new = alloc<Term>(lhsTerm->hasher(), leftScalar, lhsTerm->variables());
@@ -1127,12 +1110,12 @@ ExprPtr PolynomialTransformer::mutate(ModPtr v) {
   // x % 1 == 0.
   if (rhs_new->isConstant() && immediateEquals(rhs_new, 1)) {
     // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
-    return getImmediateByType(v->dtype(), 0);
+    return immLike(v, 0);
   }
 
   // x % x => 0.
   if (hasher_.hash(lhs_new) == hasher_.hash(rhs_new)) {
-    return getImmediateByType(v->dtype(), 0);
+    return immLike(v, 0);
   }
 
   TermPtr lhsTerm = to<Term>(lhs_new);
@@ -1149,13 +1132,13 @@ ExprPtr PolynomialTransformer::mutate(ModPtr v) {
     if (rhs_new->isConstant() &&
         immediateEquals(
             evaluateOp(alloc<Mod>(lhsTerm->scalar(), rhs_new)), 0)) {
-      return getImmediateByType(v->dtype(), 0);
+      return immLike(v, 0);
     }
 
     // (x * y * z) % x => 0.
     for (auto component : lhsTerm->variables()) {
       if (hasher_.hash(component) == hasher_.hash(rhs_new)) {
-        return getImmediateByType(v->dtype(), 0);
+        return immLike(v, 0);
       }
     }
 
@@ -1189,7 +1172,7 @@ ExprPtr PolynomialTransformer::mutate(ModPtr v) {
           immediateEquals(
               evaluateOp(alloc<Mod>(lhsTerm->scalar(), rhsTerm->scalar())),
               0)) {
-        return getImmediateByType(v->dtype(), 0);
+        return immLike(v, 0);
       }
     }
   }
@@ -1862,7 +1845,7 @@ ExprPtr polyGCD(PolynomialPtr poly) {
     return nullptr;
   }
 
-  return getImmediateByType(poly->dtype(), GCD);
+  return immLike(poly, GCD);
 }
 
 // A ModRound is a div-mod-mul in which the divisor in div and multiplier in mul
@@ -1981,7 +1964,7 @@ c10::optional<class ModRound*> isModRound(TermPtr e) {
   }
 
   if (!scalar) {
-    scalar = getImmediateByType(multiplier->dtype(), 1);
+    scalar = immLike(multiplier, 1);
   }
 
   // TODO: this leaks memory!
@@ -2261,23 +2244,23 @@ ExprPtr TermExpander::mutate(PolynomialPtr v) {
     }
 
     // Negate the term back to positive since we'll be subtracting it.
-    ExprPtr negated = evaluateOp(alloc<Mul>(
-        getImmediateByType(node->scalar()->dtype(), -1), node->scalar()));
+    ExprPtr negated =
+        evaluateOp(alloc<Mul>(immLike(node->scalar(), -1), node->scalar()));
     TermPtr newRHS = alloc<Term>(node->hasher(), negated, node->variables());
     lastNode = alloc<Sub>(lastNode, newRHS->accept_mutator(this));
   }
 
   if (scalarWritten || immediateEquals(v->scalar(), 0)) {
     if (!lastNode) {
-      return getImmediateByType(v->dtype(), 0);
+      return immLike(v, 0);
     }
     return lastNode;
   }
 
   if (immediateIsNegative(v->scalar())) {
     // Negate the scalar and subtract.
-    ExprPtr negated = evaluateOp(
-        alloc<Mul>(getImmediateByType(lastNode->dtype(), -1), v->scalar()));
+    ExprPtr negated =
+        evaluateOp(alloc<Mul>(immLike(lastNode, -1), v->scalar()));
     lastNode = alloc<Sub>(lastNode, evaluateOp(negated));
   } else {
     // we want to avoid a cast to the scalar if it would happen.
@@ -2344,7 +2327,7 @@ ExprPtr TermExpander::mutate(MinTermPtr v) {
 ExprPtr TermExpander::mutate(RoundOffPtr v) {
   TermPtr term = alloc<Term>(
       simplifier_->hasher(),
-      getImmediateByType(v->dtype(), 1),
+      immLike(v, 1),
       alloc<Div>(v->lhs(), v->rhs()),
       v->rhs());
   return term->accept_mutator(this);
@@ -2352,8 +2335,10 @@ ExprPtr TermExpander::mutate(RoundOffPtr v) {
 
 ExprPtr buf_flat_size(BufPtr v) {
   std::vector<ExprPtr> dims = v->dims();
-
-  ExprPtr flattened = getImmediateByType(kInt, 1);
+  if (dims.size() == 0) {
+    return alloc<LongImm>(1);
+  }
+  ExprPtr flattened = immLike(dims[0], 1);
   for (auto& dim : dims) {
     flattened = alloc<Mul>(flattened, dim);
   }
@@ -2684,7 +2669,7 @@ ExprPtr distributeDiv(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) {
     return nullptr;
   }
   ExprPtr check_n_value = IRSimplifier::simplify(
-      alloc<CompareSelect>(rhsScalar, alloc<IntImm>(0), kGT));
+      alloc<CompareSelect>(rhsScalar, immLike(rhsScalar, 0), kGT));
   if (!immediateEquals(check_n_value, 1)) {
     return nullptr;
   }
@@ -2719,7 +2704,7 @@ ExprPtr distributeDiv(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) {
   // range
   auto end = got->second.second;
   ExprPtr check_start = IRSimplifier::simplify(
-      alloc<CompareSelect>(start, alloc<IntImm>(0), kGE));
+      alloc<CompareSelect>(start, immLike(start, 0), kGE));
   ExprPtr check_end =
       IRSimplifier::simplify(alloc<CompareSelect>(end, rhsScalar, kLE));
   if (!check_start->isConstant() || !check_end->isConstant() ||
@@ -2731,7 +2716,7 @@ ExprPtr distributeDiv(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) {
 
   // simplify type 1) exprs: '(i+x)/n' => 'x/n'
   ExprPtr sign_check =
-      IRSimplifier::simplify(alloc<CompareSelect>(main, alloc<IntImm>(0), kGE));
+      IRSimplifier::simplify(alloc<CompareSelect>(main, immLike(main, 0), kGE));
   ExprPtr main_mod = IRSimplifier::simplify(alloc<Mod>(main, rhsScalar));
   ExprPtr mod_check = IRSimplifier::simplify(
       alloc<CompareSelect>(alloc<Add>(main_mod, end), rhsScalar, kLE));
@@ -2742,6 +2727,7 @@ ExprPtr distributeDiv(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) {
 
   // simplify type 2 exprs: '(i+j*n)/n' => 'j'
   auto ret_var = to<Var>(ret);
+  // FIXME: Allow any integral type.
   if (ret_var && ret_var->dtype() == kInt) {
     // retrieve j's range info
     auto got = var_bound_info.find(ret_var);
@@ -2750,8 +2736,8 @@ ExprPtr distributeDiv(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) {
     }
 
     // check if j is not negative
-    sign_check = IRSimplifier::simplify(
-        alloc<CompareSelect>(got->second.first, alloc<IntImm>(0), kGE));
+    sign_check = IRSimplifier::simplify(alloc<CompareSelect>(
+        got->second.first, immLike(got->second.first, 0), kGE));
     if (sign_check->isConstant() && immediateEquals(sign_check, 1)) {
       return ret_var;
     }
@@ -2801,7 +2787,7 @@ ExprPtr distributeMod(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) {
     return nullptr;
   }
   ExprPtr check_n_value = IRSimplifier::simplify(
-      alloc<CompareSelect>(rhsScalar, alloc<IntImm>(0), kGT));
+      alloc<CompareSelect>(rhsScalar, immLike(rhsScalar, 0), kGT));
   if (!immediateEquals(check_n_value, 1)) {
     return nullptr;
   }
@@ -2838,7 +2824,7 @@ ExprPtr distributeMod(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) {
   // range
   auto end = got->second.second;
   ExprPtr check_start = IRSimplifier::simplify(
-      alloc<CompareSelect>(start, alloc<IntImm>(0), kGE));
+      alloc<CompareSelect>(start, immLike(start, 0), kGE));
   ExprPtr check_end =
       IRSimplifier::simplify(alloc<CompareSelect>(end, rhsScalar, kLE));
   if (!check_start->isConstant() || !check_end->isConstant() ||
@@ -2848,7 +2834,7 @@ ExprPtr distributeMod(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) {
 
   // simplify type 1) exprs: '(i+x)%n' => 'i+x%n'
   ExprPtr sign_check =
-      IRSimplifier::simplify(alloc<CompareSelect>(main, alloc<IntImm>(0), kGE));
+      IRSimplifier::simplify(alloc<CompareSelect>(main, immLike(main, 0), kGE));
   ExprPtr main_mod = IRSimplifier::simplify(alloc<Mod>(main, rhsScalar));
   ExprPtr mod_check = IRSimplifier::simplify(
       alloc<CompareSelect>(alloc<Add>(main_mod, end), rhsScalar, kLE));
@@ -2860,6 +2846,7 @@ ExprPtr distributeMod(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) {
   // simplify type 2) exprs: '(i+j*n)%n' => 'i'
   ExprPtr main_div = IRSimplifier::simplify(alloc<Div>(main, rhsScalar));
   auto j_var = to<Var>(main_div);
+  // FIXME: Allow any integral type.
   if (j_var && j_var->dtype() == kInt) {
     // retrieve j's range info
     auto got = var_bound_info.find(j_var);
@@ -2868,8 +2855,8 @@ ExprPtr distributeMod(ExprPtr lhs, ExprPtr rhs, VarBoundInfo var_bound_info) {
     }
 
     // check if j is not negative
-    sign_check = IRSimplifier::simplify(
-        alloc<CompareSelect>(got->second.first, alloc<IntImm>(0), kGE));
+    sign_check = IRSimplifier::simplify(alloc<CompareSelect>(
+        got->second.first, immLike(got->second.first, 0), kGE));
     if (sign_check->isConstant() && immediateEquals(sign_check, 1)) {
       return var_key;
     }
@@ -2920,7 +2907,7 @@ ExprPtr SimplifierUnderContext::mutate(ModPtr v) {
       auto start = got->second.first;
       auto end = got->second.second;
       ExprPtr check_start = IRSimplifier::simplify(
-          alloc<CompareSelect>(start, alloc<IntImm>(0), kGE));
+          alloc<CompareSelect>(start, immLike(start, 0), kGE));
       ExprPtr check_end =
           IRSimplifier::simplify(alloc<CompareSelect>(end, rhsScalar, kLE));
       if (check_start->isConstant() && check_end->isConstant() &&
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index f72fbf7c18c37..0d0d19e004981 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -202,11 +202,11 @@ c10::optional<TensorInfo> getTensorInfoJit(torch::jit::Value* v) {
 c10::optional<TensorInfo> getTensorInfo(BufHandle b) {
   std::vector<int64_t> dims;
   for (auto dim : b.dims()) {
-    auto val = to<IntImm>(dim.node());
+    auto val = intValue(dim.node());
     if (!val) {
       return c10::nullopt;
     }
-    dims.push_back(val->value());
+    dims.push_back(*val);
   }
   return TensorInfo{dims, static_cast<at::ScalarType>(b.dtype().scalar_type())};
 }
@@ -396,7 +396,7 @@ ExprHandle tensorOrConstant(
   return constant(v);
 }
 
-size_t normalizeAndCheckIndex(int64_t idx, int64_t list_size) {
+int64_t normalizeAndCheckIndex(int64_t idx, int64_t list_size) {
   if (idx < 0) {
     // Handle negative indexing
     idx = list_size + idx;
@@ -405,7 +405,7 @@ size_t normalizeAndCheckIndex(int64_t idx, int64_t list_size) {
   if (idx < 0 || idx >= list_size) {
     AT_ERROR("Invalid index ", idx, " for list_size", list_size);
   }
-  return static_cast<size_t>(idx);
+  return idx;
 }
 
 ExprHandle broadcast(BufHandle b, const std::vector<ExprHandle>& axes) {
@@ -441,8 +441,8 @@ std::vector<ExprHandle> computeIndicesToBroadcast(
   auto axisIt = outputAxes.rbegin();
   auto sizeIt = inputSizes.rbegin();
   while (sizeIt != inputSizes.rend()) {
-    auto const& size = sizeIt->AsNode<IntImm>();
-    if (size && size->value() == 1) {
+    auto const& size = intValue(*sizeIt);
+    if (size && *size == 1) {
       bcast.emplace_back(0);
     } else {
       bcast.emplace_back(*axisIt);
@@ -525,7 +525,9 @@ static at::ScalarType tensorType(BufPtr b) {
 std::vector<int64_t> bufferSizes(BufPtr b) {
   std::vector<int64_t> sizes;
   for (size_t i = 0; i < b->ndim(); i++) {
-    sizes.push_back(to<IntImm>(b->dim(i))->value());
+    auto dim = intValue(b->dim(i));
+    TORCH_INTERNAL_ASSERT(dim);
+    sizes.push_back(*dim);
   }
   return sizes;
 }
@@ -543,7 +545,8 @@ ExprHandle TensorExprKernel::chunk(
   std::vector<ExprHandle> indices;
   for (size_t i = 0; i < axes.size(); ++i) {
     if (i == norm_dim) {
-      indices.push_back(axes[i] + IntImm::make((int)chunkIdx * (int)step));
+      indices.push_back(
+          axes[i] + ExprHandle(immLike(axes[i], chunkIdx * step)));
     } else {
       indices.push_back(axes[i]);
     }
@@ -642,7 +645,7 @@ std::vector<ExprHandle> TensorExprKernel::sizesFromVaryingShape(
     const c10::VaryingShape<int64_t>& shape) {
   std::vector<ExprHandle> dims;
   for (const auto i : c10::irange(*shape.size())) {
-    dims.push_back(IntImm::make(*shape[i]));
+    dims.push_back(*shape[i]);
   }
   return dims;
 }
@@ -664,7 +667,7 @@ std::vector<ExprHandle> TensorExprKernel::sizesForValue(
 
   if (v->type()->isSubtypeOf(FloatType::get()) ||
       v->type()->isSubtypeOf(IntType::get())) {
-    return {1};
+    return {int64_t{1}};
   }
   if (v->type()->isSubtypeOf(NoneType::get())) {
     return {};
@@ -820,7 +823,7 @@ std::vector<ExprHandle> TensorExprKernel::inferSizesForValue(
       TORCH_INTERNAL_ASSERT(n->input(1)->node()->kind() == prim::Constant);
       int64_t dim = n->input(1)->node()->i(attr::value);
       auto shape = sizesForValue(inputs[0]);
-      size_t norm_dim = normalizeAndCheckIndex(dim, shape.size());
+      auto norm_dim = normalizeAndCheckIndex(dim, shape.size());
       ExprHandle concat_dim_size = 0;
       for (auto input : inputs) {
         concat_dim_size = concat_dim_size + sizesForValue(input)[norm_dim];
@@ -889,11 +892,11 @@ ExprHandle clamp(
 }
 
 static bool isOne(ExprHandle e) {
-  auto const& n = e.AsNode<IntImm>();
+  auto const& n = intValue(e);
   if (!n) {
     return false;
   }
-  return n->value() == 1;
+  return *n == 1;
 }
 
 std::pair<std::vector<ExprHandle>, bool> broadcastShapesImpl(
@@ -1150,6 +1153,7 @@ std::pair<ScalarType, std::vector<BufHandle>> processCatList(
   }
   return {highType, nonEmptyInputs};
 }
+
 Tensor computeCatWoConditionals(
     const std::vector<ArgValue>& inputs,
     const std::vector<ExprHandle>& outputShape) {
@@ -1184,8 +1188,7 @@ Tensor computeCatWoConditionals(
   }
 
   int64_t concat_dim = c10::get<int64_t>(arg_dim);
-  size_t norm_concat_dim =
-      normalizeAndCheckIndex(concat_dim, outputShape.size());
+  auto norm_concat_dim = normalizeAndCheckIndex(concat_dim, outputShape.size());
 
   auto gen_code_for_input = [&](const BufHandle& inp,
                                 size_t inp_pos,
@@ -1196,7 +1199,8 @@ Tensor computeCatWoConditionals(
     std::vector<ExprPtr> store_indices(dims.size());
     for (size_t i = 0; i < dims.size(); ++i) {
       for_vars[i] = alloc<Var>(
-          "i" + c10::to_string(inp_pos) + "_" + c10::to_string(i), kInt);
+          "i" + c10::to_string(inp_pos) + "_" + c10::to_string(i),
+          dims[i].dtype());
       load_indices[i] = for_vars[i];
       if (i == norm_concat_dim) {
         store_indices[i] = alloc<Add>(for_vars[i], concat_dim_size);
@@ -1209,8 +1213,8 @@ Tensor computeCatWoConditionals(
     auto load_promoted = promoteToDtype(ExprHandle(load_expr), high_type);
     StmtPtr st = alloc<Store>(output_buf, store_indices, load_promoted.node());
     for (size_t i = dims.size(); i > 0; --i) {
-      st =
-          alloc<For>(for_vars[i - 1], alloc<IntImm>(0), dims[i - 1].node(), st);
+      st = alloc<For>(
+          for_vars[i - 1], immLike(dims[i - 1], 0), dims[i - 1].node(), st);
     }
     return st;
   };
@@ -1221,7 +1225,7 @@ Tensor computeCatWoConditionals(
     auto input_dims =
         ExprVectorToExprHandleVector(non_empty_inputs[i].node()->dims());
     if (concat_dim_size == nullptr) {
-      concat_dim_size = alloc<IntImm>(0);
+      concat_dim_size = immLike(input_dims[norm_concat_dim], 0);
     }
     block->append_stmt(gen_code_for_input(
         non_empty_inputs[i], i, concat_dim_size, input_dims));
@@ -1253,7 +1257,7 @@ Tensor computeCat(
         }
 
         int64_t dim_ = c10::get<int64_t>(argDim);
-        size_t dim = normalizeAndCheckIndex(dim_, axes.size());
+        auto dim = normalizeAndCheckIndex(dim_, axes.size());
         // Promote input types.
         // Note that we need to consider all inputs, including empty - they
         // also affect the resultant dtype.
@@ -1273,18 +1277,18 @@ Tensor computeCat(
         std::vector<ExprHandle> newAxes(axes.begin(), axes.end());
         ExprHandle load = promoteToDtype(
             tensorOrConstant(nonEmptyInputs[0], newAxes), highType);
-        size_t offset = to<IntImm>(nonEmptyInputs[0].node()->dim(dim))->value();
-        newAxes[dim] = newAxes[dim] - IntImm::make(offset);
+        auto offset = *intValue(nonEmptyInputs[0].node()->dim(dim));
+        newAxes[dim] = newAxes[dim] - ExprHandle(immLike(newAxes[dim], offset));
 
         for (size_t ii = 1; ii < nonEmptyInputs.size(); ++ii) {
           auto input = nonEmptyInputs[ii];
           load = ifThenElse(
-              CompareSelect::make(axes[dim], IntImm::make(offset), kLT),
+              CompareSelect::make(axes[dim], offset, kLT),
               load,
               promoteToDtype(tensorOrConstant(input, newAxes), highType));
 
-          offset += to<IntImm>(input.node()->dim(dim))->value();
-          newAxes[dim] = axes[dim] - IntImm::make(offset);
+          offset += *intValue(input.node()->dim(dim));
+          newAxes[dim] = axes[dim] - ExprHandle(immLike(axes[dim], offset));
         }
 
         return load;
@@ -2334,12 +2338,12 @@ Tensor tensorexpr::computeOperandValue(
             ExprHandle cur_stride = 1;
             std::vector<ExprPtr> dims, indices;
             for (size_t idx = 0; idx < view_dims.size(); idx++) {
-              dims.push_back(alloc<IntImm>(view_dims[idx]));
+              dims.push_back(alloc<LongImm>(view_dims[idx]));
               indices.push_back(axes[idx].node());
             }
             ExprHandle flat_idx = ExprHandle(flatten_index(dims, indices));
             std::vector<ExprHandle> orig_buf_indexes(A.ndim(), ExprHandle(0));
-            ExprHandle stride = IntImm::make(1);
+            ExprHandle stride = ExprHandle(immLike(flat_idx, 1));
             for (size_t idx = 0; idx < A.ndim(); idx++) {
               size_t dim_idx = A.ndim() - idx - 1;
               // We don't need to generate mod-div for the first dimension -
@@ -2799,7 +2803,7 @@ static std::vector<ExprHandle> toExprHandles(const std::vector<T>& sizes) {
   std::vector<ExprHandle> dims;
   dims.reserve(sizes.size());
   for (auto const& size : sizes) {
-    dims.emplace_back(IntImm::make(size));
+    dims.emplace_back(size);
   }
   return dims;
 }
@@ -2831,8 +2835,7 @@ Tensor TensorExprKernel::bindInput(const torch::jit::Value* input) {
       std::vector<DimArg> inputTensorDims;
       for (size_t i = 0; i < *tt->sizes().size(); i++) {
         auto const size = *tt->sizes()[i];
-        inputTensorDims.emplace_back(
-            DimArg(IntImm::make(size), "i" + c10::to_string(i)));
+        inputTensorDims.emplace_back(DimArg(size, "i" + c10::to_string(i)));
       }
       auto const strides = tt->strides();
       result = Compute(
@@ -2841,12 +2844,11 @@ Tensor TensorExprKernel::bindInput(const torch::jit::Value* input) {
           [&](const std::vector<VarHandle>& axes) {
             ExprHandle idx = 0;
             for (size_t i = 0; i < axes.size(); i++) {
-              idx = idx + axes[i] * IntImm::make(*strides[i]);
+              idx = idx + axes[i] * *strides[i];
             }
             return inBuffer.load(idx);
           });
       bufs_.emplace(input, result.buf());
-
       bufferArgs_.emplace_back(inBuffer);
       break;
     }
@@ -2956,10 +2958,10 @@ Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) {
   return Compute(
       "output_1", dims, [&](const std::vector<VarHandle>& axes_input) {
         std::vector<ExprHandle> axes(axes_input.begin(), axes_input.end());
-        auto absolute_position = IntImm::make(0);
+        auto absolute_position = ExprHandle(immLike(axes[0], 0));
         for (size_t i = 0; i < axes.size(); ++i) {
-          absolute_position =
-              absolute_position + (IntImm::make(default_strides[i]) * axes[i]);
+          absolute_position = absolute_position +
+              (ExprHandle(immLike(axes[i], default_strides[i])) * axes[i]);
         }
         std::vector<size_t> sorted_stride_indices =
             reverse_sort_indices(strides);
@@ -2967,10 +2969,11 @@ Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) {
         for (size_t stride_index : sorted_stride_indices) {
           auto stride = strides[stride_index];
           auto size = sizes[stride_index];
-          auto index = Div::make(absolute_position, IntImm::make(stride));
+          auto index = absolute_position /
+              ExprHandle(immLike(absolute_position, stride));
           if (size != 1) {
-            absolute_position =
-                Mod::make(absolute_position, IntImm::make(stride));
+            absolute_position = absolute_position %
+                ExprHandle(immLike(absolute_position, stride));
           }
           new_axes[stride_index] = index;
         }
@@ -2992,7 +2995,7 @@ void TensorExprKernel::bindConstant(const torch::jit::Value* v) {
   std::vector<ExprHandle> te_sizes;
   te_sizes.reserve(sizes.size());
   for (auto s : sizes) {
-    te_sizes.push_back(IntImm::make(s));
+    te_sizes.push_back(s);
   }
 
   BufPtr buf = alloc<Buf>(
diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h
index 99a3b123a6816..4b92b020fce31 100644
--- a/torch/csrc/jit/tensorexpr/kernel.h
+++ b/torch/csrc/jit/tensorexpr/kernel.h
@@ -19,7 +19,7 @@ template <typename T>
 inline std::vector<int64_t> bufferSizes(const T& t) {
   std::vector<int64_t> sizes;
   for (size_t i = 0; i < t->ndim(); i++) {
-    sizes.push_back(to<IntImm>(t->dim(i))->value());
+    sizes.push_back(*intValue(t->dim(i)));
   }
   return sizes;
 }
@@ -62,7 +62,7 @@ ExprHandle tensorOrConstant(
     const ArgValue& v,
     const std::vector<ExprHandle>& axes);
 
-size_t normalizeAndCheckIndex(int64_t idx, int64_t list_size);
+int64_t normalizeAndCheckIndex(int64_t idx, int64_t list_size);
 
 ExprHandle broadcast(BufHandle b, const std::vector<ExprHandle>& axes);
 
diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index a93fd64df0a68..026d52bfc938c 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -275,17 +275,17 @@ class LLVMCodeGenImpl : public IRVisitor {
 };
 
 extern "C" {
-typedef void (*ParallelCallee)(int index, int8_t* packed_data);
+typedef void (*ParallelCallee)(int64_t index, int8_t* packed_data);
 void DispatchParallel(
     int8_t* func,
-    int start,
-    int stop,
+    int64_t start,
+    int64_t stop,
     int8_t* packed_data) noexcept {
   // TODO: preserve the func type.
   try {
     ParallelCallee callee = reinterpret_cast<ParallelCallee>(func);
     at::parallel_for(start, stop, 1, [&](int64_t f_begin, int64_t f_end) {
-      for (int index = f_begin; index < f_end; index++) {
+      for (int64_t index = f_begin; index < f_end; index++) {
         callee(index, packed_data);
       }
     });
@@ -537,10 +537,6 @@ void LLVMCodeGenImpl::emitKernel(
 
   irb_.CreateRet(value_);
 
-  if (llvm::verifyFunction(*fn_, &llvm::outs())) {
-    throw std::runtime_error("Function verification failed");
-  }
-
   // print graph debug info before optimization
   llvm::SmallVector<char, 0> asmBuffer;
   llvm::raw_svector_ostream asmStream(asmBuffer);
@@ -550,6 +546,10 @@ void LLVMCodeGenImpl::emitKernel(
   GRAPH_DEBUG(
       "\nLLVM module before optimizations\n\n", asmStream.str().str(), "\n");
 
+  if (llvm::verifyFunction(*fn_, &llvm::outs())) {
+    throw std::runtime_error("Function verification failed");
+  }
+
   optimize(*module_);
 
   asmBuffer.set_size(0);
@@ -1144,8 +1144,8 @@ void LLVMCodeGenImpl::visit(LoadPtr v) {
   // Handle the case where the load is contiguous and unmasked efficiently
   auto idx_ramp = to<Ramp>(v->flat_index());
   if (idx_ramp) {
-    auto stride_imm = to<IntImm>(idx_ramp->stride());
-    if (stride_imm && stride_imm->value() == 1) {
+    auto stride_imm = intValue(idx_ramp->stride());
+    if (stride_imm && *stride_imm == 1) {
       v->base_handle()->accept(this);
       auto base = this->value_;
       idx_ramp->base()->accept(this);
@@ -1256,7 +1256,7 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) {
 
   // Create the new body closure code.
   auto func_type =
-      llvm::FunctionType::get(VoidTy_, {IntTy_, Int8PtrTy_}, false);
+      llvm::FunctionType::get(VoidTy_, {LongTy_, Int8PtrTy_}, false);
   llvm::Function* func = llvm::Function::Create(
       func_type, llvm::Function::PrivateLinkage, "func", module_.get());
   auto func_body = llvm::BasicBlock::Create(getContext(), "func_body", func);
@@ -1268,6 +1268,10 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) {
       packed_func_args_raw, packed_caller_args->getType());
 
   // Unpack the arguments from the opaque buffer.
+  if (v->var()->dtype().scalar_type() != c10::kLong) {
+    index = irb_.CreateIntCast(
+        index, dtypeToLLVM(v->var()->dtype()), v->var()->dtype().is_signed());
+  }
   body_closure_args = unpackFuncArgs(packed_func_args, body_arg_vars.size());
   // Set the codegen to the new func.
   // TODO: this should be replaced by RAII wrappers.
@@ -1290,12 +1294,14 @@ void LLVMCodeGenImpl::processParallelFor(ForPtr v) {
       irb_.CreatePointerCast(packed_caller_args, Int8PtrTy_);
   llvm::Value* func_value = irb_.CreatePointerCast(func, Int8PtrTy_);
   llvm::FunctionType* dispatcher_fntype = llvm::FunctionType::get(
-      VoidTy_, {Int8PtrTy_, IntTy_, IntTy_, Int8PtrTy_}, false);
+      VoidTy_, {Int8PtrTy_, LongTy_, LongTy_, Int8PtrTy_}, false);
   FunctionCallee dispatcher_callee =
       module_->getOrInsertFunction("DispatchParallel", dispatcher_fntype);
   llvm::Function* dispatcher =
       llvm::cast<llvm::Function>(dispatcher_callee.getCallee());
   dispatcher->addFnAttr(llvm::Attribute::NoUnwind);
+  start = irb_.CreateIntCast(start, LongTy_, true);
+  stop = irb_.CreateIntCast(stop, LongTy_, true);
   irb_.CreateCall(
       dispatcher, {func_value, start, stop, packed_caller_args_ptr});
   value_ = llvm::ConstantInt::get(IntTy_, 0);
@@ -1320,7 +1326,7 @@ void LLVMCodeGenImpl::visit(ForPtr v) {
   irb_.SetInsertPoint(condBlock);
 
   // Set up phi node for index variable.
-  auto idx = irb_.CreatePHI(IntTy_, 2);
+  auto idx = irb_.CreatePHI(start->getType(), 2);
   idx->addIncoming(start, preheader);
   if (!varToVal_.count(v->var())) {
     varToVal_.emplace(v->var(), idx);
@@ -1345,7 +1351,8 @@ void LLVMCodeGenImpl::visit(ForPtr v) {
   body = irb_.GetInsertBlock();
 
   // Increment the index variable and branch back to loop test.
-  auto inc = irb_.CreateAdd(idx, llvm::ConstantInt::getSigned(IntTy_, 1));
+  auto inc =
+      irb_.CreateAdd(idx, llvm::ConstantInt::getSigned(start->getType(), 1));
   irb_.CreateBr(condBlock);
   idx->addIncoming(inc, body);
 
@@ -1430,8 +1437,8 @@ void LLVMCodeGenImpl::visit(StorePtr v) {
   // Handle the case where the store is contiguous and unmasked efficiently
   auto idx_ramp = to<Ramp>(v->flat_index());
   if (idx_ramp) {
-    auto stride_imm = to<IntImm>(idx_ramp->stride());
-    if (stride_imm && stride_imm->value() == 1) {
+    auto stride_imm = intValue(idx_ramp->stride());
+    if (stride_imm && *stride_imm == 1) {
       idx_ramp->base()->accept(this);
       auto first_idx = value_;
 
diff --git a/torch/csrc/jit/tensorexpr/llvm_jit.h b/torch/csrc/jit/tensorexpr/llvm_jit.h
index 8585900abc8d6..a837899cdce1d 100644
--- a/torch/csrc/jit/tensorexpr/llvm_jit.h
+++ b/torch/csrc/jit/tensorexpr/llvm_jit.h
@@ -20,8 +20,8 @@ namespace tensorexpr {
 extern "C" {
 void DispatchParallel(
     int8_t* func,
-    int start,
-    int stop,
+    int64_t start,
+    int64_t stop,
     int8_t* packed_data) noexcept;
 }
 
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index d3a4b919bef33..11020cc2eda08 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -127,8 +127,8 @@ class Vectorizer : public IRMutator {
     ExprPtr start = v->start();
     ExprPtr stop = v->stop();
 
-    IntImmPtr start_imm = to<IntImm>(start);
-    IntImmPtr stop_imm = to<IntImm>(stop);
+    auto start_imm = intValue(start);
+    auto stop_imm = intValue(stop);
     if (!start_imm) {
       throw std::runtime_error(
           "Can't vectorize due to non-constant loop start!");
@@ -140,8 +140,8 @@ class Vectorizer : public IRMutator {
     }
 
     var_ = var;
-    start_ = start_imm;
-    lanes_ = stop_imm->value();
+    start_ = immLike(start, *start_imm);
+    lanes_ = *stop_imm;
 
     StmtPtr new_body = body->accept_mutator(this);
     if (new_body == body) {
@@ -531,11 +531,11 @@ class FunctionInliner : public IRMutator {
       if (auto index_var = to<Var>(i)) {
         index_vars_.insert(index_var);
         producer_index_vars_.push_back(index_var);
-      } else if (to<IntImm>(i) != nullptr) {
+      } else if (intValue(i)) {
         // If the index can be a constant, then that dimension must have size 1
         // (since we don't support in-place writes). Resolves issue 52581.
         TORCH_INTERNAL_ASSERT(
-            to<IntImm>(i)->value() == 0,
+            *intValue(i) == 0,
             "Constant index impression should always be zero");
         producer_index_vars_.push_back(nullptr);
       } else {
@@ -553,8 +553,7 @@ class FunctionInliner : public IRMutator {
       ExprPtr func_caller_param = dims.at(i);
       if (func_callee_arg == nullptr) {
         TORCH_INTERNAL_ASSERT(
-            to<IntImm>(func_caller_param) != nullptr &&
-                to<IntImm>(func_caller_param)->value() == 0,
+            intValue(func_caller_param) && *intValue(func_caller_param) == 0,
             "We are implicitly assuming that if you have an index of 0, that must also be inlined into an index of 0");
         continue;
       }
@@ -1140,7 +1139,7 @@ bool LoopNest::optimizeConditionals() {
     // only include the RHS of the conditions in the if-then-else expressions
     // we need to start with `0` which is the initial bound, given that we
     // only handle normalized loops (check for this is done below).
-    std::vector<ExprPtr> comp_values = {alloc<IntImm>(0)};
+    std::vector<ExprPtr> comp_values;
     std::vector<ExprPtr> sub_exprs;
     auto ifthenelse_exprs = NodeFinder<IfThenElse>::find(store);
     if (ifthenelse_exprs.empty()) {
@@ -1155,6 +1154,8 @@ bool LoopNest::optimizeConditionals() {
             ifthenelse_exprs.front(), &cond_var, &comp_values, &sub_exprs)) {
       continue;
     }
+    TORCH_INTERNAL_ASSERT(comp_values.size() >= 1);
+    comp_values.insert(comp_values.begin(), immLike(comp_values[0], 0));
 
     auto fors = getLoopStmtsFor(store);
     if (cond_var != fors.back()->var()) {
@@ -1290,10 +1291,10 @@ void LoopNest::vectorizeInnerLoops() {
 }
 
 void LoopNest::sliceHead(ForPtr f, int factor, ForPtr* head, ForPtr* tail) {
-  if (to<IntImm>(f->start()) && to<IntImm>(f->stop())) {
-    int start_val = to<IntImm>(f->start())->value();
-    int stop_val = to<IntImm>(f->stop())->value();
-    int size_val = stop_val - start_val;
+  if (intValue(f->start()) && intValue(f->stop())) {
+    auto start_val = *intValue(f->start());
+    auto stop_val = *intValue(f->stop());
+    auto size_val = stop_val - start_val;
     if (factor >= size_val) {
       *head = f;
       *tail = nullptr;
@@ -1311,7 +1312,7 @@ void LoopNest::sliceHead(ForPtr f, int factor, ForPtr* head, ForPtr* tail) {
   }
 
   ExprPtr head_end = alloc<Min>(
-      alloc<Add>(f->start(), alloc<IntImm>(factor)), f->stop(), true);
+      alloc<Add>(f->start(), immLike(f->stop(), factor)), f->stop(), true);
   *head = alloc<For>(f->var(), f->start(), head_end, Stmt::clone(f->body()));
   p->insert_stmt_before(*head, f);
 
@@ -1330,10 +1331,10 @@ void LoopNest::sliceHead(ForPtr f, int factor) {
 }
 
 void LoopNest::sliceTail(ForPtr f, int factor, ForPtr* head, ForPtr* tail) {
-  if (to<IntImm>(f->start()) && to<IntImm>(f->stop())) {
-    int start_val = to<IntImm>(f->start())->value();
-    int stop_val = to<IntImm>(f->stop())->value();
-    int size_val = stop_val - start_val;
+  if (intValue(f->start()) && intValue(f->stop())) {
+    auto start_val = *intValue(f->start());
+    auto stop_val = *intValue(f->stop());
+    auto size_val = stop_val - start_val;
     if (factor >= size_val) {
       *head = nullptr;
       *tail = f;
@@ -1351,7 +1352,7 @@ void LoopNest::sliceTail(ForPtr f, int factor, ForPtr* head, ForPtr* tail) {
   }
 
   ExprPtr tail_start = alloc<Max>(
-      f->start(), alloc<Sub>(f->stop(), alloc<IntImm>(factor)), true);
+      f->start(), alloc<Sub>(f->stop(), immLike(f->stop(), factor)), true);
   *tail = alloc<For>(f->var(), tail_start, f->stop(), Stmt::clone(f->body()));
   p->insert_stmt_after(*tail, f);
 
@@ -1390,17 +1391,17 @@ void LoopNest::splitWithTail(
   }
 
   bool tail_is_needed = true;
-  if (to<IntImm>(f->start()) && to<IntImm>(f->stop())) {
-    int start_val = to<IntImm>(f->start())->value();
-    int stop_val = to<IntImm>(f->stop())->value();
-    int size_val = stop_val - start_val;
-    int tail_size = size_val % factor;
+  if (intValue(f->start()) && intValue(f->stop())) {
+    auto const start_val = *intValue(f->start());
+    auto const stop_val = *intValue(f->stop());
+    auto const size_val = stop_val - start_val;
+    auto const tail_size = size_val % factor;
     if (tail_size == 0) {
       tail_is_needed = false;
     }
   }
 
-  IntImmPtr factor_expr = alloc<IntImm>(factor);
+  ExprPtr factor_expr = immLike(f->stop(), factor);
   ExprPtr size = alloc<Sub>(f->stop(), f->start());
   ExprPtr split_count = alloc<Div>(size, factor_expr);
   ExprPtr tail_size = alloc<Mod>(size, factor_expr);
@@ -1423,7 +1424,7 @@ void LoopNest::splitWithTail(
 
     StmtPtr body_tail =
         SubstituteInClone(f->body(), {{f->var(), combined_index2}});
-    *tail = alloc<For>(i_tail, alloc<IntImm>(0), tail_size, body_tail);
+    *tail = alloc<For>(i_tail, immLike(tail_size, 0), tail_size, body_tail);
 
     p->insert_stmt_after(*tail, f);
   } else {
@@ -1433,10 +1434,11 @@ void LoopNest::splitWithTail(
   StmtPtr body_inner =
       Substitute(f->removeBody(), {{f->var(), combined_index1}});
 
-  *inner = alloc<For>(i_inner, alloc<IntImm>(0), factor_expr, body_inner);
+  *inner =
+      alloc<For>(i_inner, immLike(factor_expr, 0), factor_expr, body_inner);
   // The input loop `f` will be the outer loop after split.
   f->set_var(i_outer);
-  f->set_start(alloc<IntImm>(0));
+  f->set_start(immLike(split_count, 0));
   f->set_stop(split_count);
   f->set_body(*inner);
 }
@@ -1458,20 +1460,20 @@ void LoopNest::splitWithMask(ForPtr f, int factor, ForPtr* inner) {
   ExprPtr start = IRSimplifier::simplify(f->start());
   ExprPtr stop = IRSimplifier::simplify(f->stop());
   if (start->isConstant() && stop->isConstant()) {
-    int start_val = immediateAs<int>(start);
-    int stop_val = immediateAs<int>(stop);
-    int size_val = stop_val - start_val;
-    int tail_size = size_val % factor;
+    auto start_val = *intValue(start);
+    auto stop_val = *intValue(stop);
+    auto size_val = stop_val - start_val;
+    auto tail_size = size_val % factor;
     if (tail_size == 0) {
       tail_is_needed = false;
     }
   }
 
-  IntImmPtr factor_expr = alloc<IntImm>(factor);
+  auto factor_expr = immLike(f->stop(), factor);
   ExprPtr size = alloc<Sub>(f->stop(), f->start());
   // split_count = (size + factor - 1) / factor
   ExprPtr split_count = alloc<Div>(
-      alloc<Sub>(alloc<Add>(size, factor_expr), alloc<IntImm>(1)), factor_expr);
+      alloc<Sub>(alloc<Add>(size, factor_expr), immLike(size, 1)), factor_expr);
 
   const std::string& loop_var_name = f->var()->name_hint();
   Dtype loop_var_dtype = f->var()->dtype();
@@ -1487,8 +1489,8 @@ void LoopNest::splitWithMask(ForPtr f, int factor, ForPtr* inner) {
   // TODO: is it ok that we're doing it eagerly? In the other implementation we
   // are only materializing predicates at the last, lowering, step.
   if (tail_is_needed) {
-    IntImmPtr start = to<IntImm>(f->start());
-    if (!start || start->value() != 0) {
+    auto start = intValue(f->start());
+    if (!start || *start != 0) {
       throw unimplemented_lowering();
     }
 
@@ -1499,10 +1501,11 @@ void LoopNest::splitWithMask(ForPtr f, int factor, ForPtr* inner) {
   }
   body_inner = Substitute(body_inner, {{f->var(), combined_index}});
 
-  *inner = alloc<For>(i_inner, alloc<IntImm>(0), factor_expr, body_inner);
+  *inner =
+      alloc<For>(i_inner, immLike(factor_expr, 0), factor_expr, body_inner);
   // The input loop `f` will be the outer loop after split.
   f->set_var(i_outer);
-  f->set_start(alloc<IntImm>(0));
+  f->set_start(immLike(split_count, 0));
   f->set_stop(split_count);
   f->set_body(*inner);
 }
@@ -2177,7 +2180,7 @@ bool LoopNest::normalize(ForPtr f) {
       {{f->var(), (VarHandle(f->var()) + ExprHandle(f->start())).node()}});
   f->set_body(IRSimplifier::simplify(for_body_normalized));
   f->set_stop(IRSimplifier::simplify(alloc<Sub>(f->stop(), f->start())));
-  f->set_start(alloc<IntImm>(0));
+  f->set_start(immLike(f->stop(), 0));
   return true;
 }
 
@@ -2242,7 +2245,7 @@ bool LoopNest::flatten(const std::vector<ForPtr>& loops, ForPtr* flattened) {
       normalized_loops[0]->var()->name_hint() + "_flat",
       normalized_loops[0]->var()->dtype());
   VarMapping var_mapping;
-  ExprPtr stop = alloc<IntImm>(1);
+  ExprPtr stop = immLike(flat_var, 1);
   for (size_t i = 0; i < normalized_loops.size(); ++i) {
     size_t idx = normalized_loops.size() - i - 1;
     auto curr_loop = normalized_loops[idx];
@@ -2255,7 +2258,7 @@ bool LoopNest::flatten(const std::vector<ForPtr>& loops, ForPtr* flattened) {
       Substitute(normalized_loops.back()->removeBody(), var_mapping);
 
   normalized_loops.front()->set_var(flat_var);
-  normalized_loops.front()->set_start(alloc<IntImm>(0));
+  normalized_loops.front()->set_start(immLike(stop, 0));
   normalized_loops.front()->set_stop(stop);
   normalized_loops.front()->set_body(flattened_body);
   *flattened = normalized_loops.front();
@@ -2357,7 +2360,7 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) {
   std::vector<ExprPtr> new_dims(buf->dims());
   for (size_t i = 0; i < dims.size(); ++i) {
     if (dims[i]) {
-      new_dims[i] = alloc<IntImm>(1);
+      new_dims[i] = immLike(buf->dims()[i], 1);
     }
   }
   buf->set_dims(new_dims);
@@ -2368,7 +2371,7 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) {
     std::vector<ExprPtr> new_indices(indices);
     for (size_t i = 0; i < dims.size(); ++i) {
       if (dims[i]) {
-        new_indices[i] = alloc<IntImm>(0);
+        new_indices[i] = immLike(indices[i], 0);
       }
     }
     return new_indices;
@@ -2652,12 +2655,13 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
 
   // Determine the size of the cache, and create a loop var for each dimension.
   for (size_t i = 0; i < info.start.size(); ++i) {
-    ExprPtr dim = IRSimplifier::simplify(
-        alloc<Add>(alloc<Sub>(info.stop[i], info.start[i]), alloc<IntImm>(1)));
+    ExprPtr dim = IRSimplifier::simplify(alloc<Add>(
+        alloc<Sub>(info.stop[i], info.start[i]), immLike(info.stop[i], 1)));
 
     tmp_dims.push_back(dim);
 
-    new_loop_vars.push_back(alloc<Var>(var_names[i % var_names.size()], kInt));
+    new_loop_vars.push_back(
+        alloc<Var>(var_names[i % var_names.size()], info.stop[i]->dtype()));
     new_loop_vars_expr.push_back(new_loop_vars[i]);
   }
 
@@ -2708,8 +2712,8 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
         tmp_buf, new_loop_vars_expr, getImmediateByType(tmp_buf->dtype(), 0));
 
     for (int64_t i = new_loop_vars.size() - 1; i >= 0; --i) {
-      tmp_init =
-          alloc<For>(new_loop_vars[i], alloc<IntImm>(0), tmp_dims[i], tmp_init);
+      tmp_init = alloc<For>(
+          new_loop_vars[i], immLike(tmp_dims[i], 0), tmp_dims[i], tmp_init);
     }
 
     if (is_block) {
@@ -2730,7 +2734,7 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
 
     for (int64_t i = new_loop_vars.size() - 1; i >= 0; --i) {
       tmp_store = alloc<For>(
-          new_loop_vars[i], alloc<IntImm>(0), tmp_dims[i], tmp_store);
+          new_loop_vars[i], immLike(tmp_dims[i], 0), tmp_dims[i], tmp_store);
     }
 
     if (is_block) {
@@ -2749,7 +2753,7 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
 
     for (int64_t i = new_loop_vars.size() - 1; i >= 0; --i) {
       tmp_store = alloc<For>(
-          new_loop_vars[i], alloc<IntImm>(0), tmp_dims[i], tmp_store);
+          new_loop_vars[i], immLike(tmp_dims[i], 0), tmp_dims[i], tmp_store);
     }
 
     if (is_block) {
@@ -2766,7 +2770,7 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
 
     for (int64_t i = new_loop_vars.size() - 1; i >= 0; --i) {
       tmp_store = alloc<For>(
-          new_loop_vars[i], alloc<IntImm>(0), tmp_dims[i], tmp_store);
+          new_loop_vars[i], immLike(tmp_dims[i], 0), tmp_dims[i], tmp_store);
     }
 
     if (is_block) {
@@ -2914,7 +2918,8 @@ void LoopNest::computeAt(StmtPtr s, ForPtr f) {
   std::vector<ExprPtr> temp_indices(dims.size());
   for (const auto i : c10::irange(dims.size())) {
     // TODO: Use name-hint of the producer indices instead of 'idx'
-    temp_indices[i] = alloc<Var>(std::string("idx") + c10::to_string(i), kInt);
+    temp_indices[i] =
+        alloc<Var>(std::string("idx") + c10::to_string(i), dims[i]->dtype());
   }
 
   // Prepare substitute rules for constructing the temp statement from the prod
@@ -2955,7 +2960,10 @@ void LoopNest::computeAt(StmtPtr s, ForPtr f) {
     // dimensions in reversed order.
     size_t dim_idx = dims.size() - 1 - i;
     bd = alloc<For>(
-        to<Var>(temp_indices[dim_idx]), alloc<IntImm>(0), dims[dim_idx], bd);
+        to<Var>(temp_indices[dim_idx]),
+        immLike(dims[dim_idx], 0),
+        dims[dim_idx],
+        bd);
   }
 
   // Add constructed stmts to the consumer loop
diff --git a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp
index 8f6f2b106b1b2..e1688e37cbe7f 100644
--- a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp
+++ b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp
@@ -185,13 +185,13 @@ void AccessInfo::dumpDOT(std::ostream& os) const {
     if (bounds_.size() > 0) {
       for (size_t i = 0; i < bounds_.size() - 1; ++i) {
         os << *IRSimplifier::simplify(
-                  alloc<Add>(bounds_[i].end, alloc<IntImm>(1)))
+                  alloc<Add>(bounds_[i].end, immLike(bounds_[i].end, 1)))
            << ", ";
       }
 
       size_t i = bounds_.size() - 1;
       os << *IRSimplifier::simplify(
-          alloc<Add>(bounds_[i].end, alloc<IntImm>(1)));
+          alloc<Add>(bounds_[i].end, immLike(bounds_[i].end, 1)));
       os << "]\"\n ";
     }
     if (isWrite()) {
@@ -632,7 +632,7 @@ bool executionSafetyCheck(
     // Invert the startDiff so mod works.
     if (diffNegative != strideNegative) {
       startDiff =
-          IRSimplifier::simplify(alloc<Sub>(alloc<IntImm>(0), startDiff));
+          IRSimplifier::simplify(alloc<Sub>(immLike(startDiff, 0), startDiff));
     }
 
     // If both accesses have the same stride, and the difference in start
@@ -650,7 +650,7 @@ bool executionSafetyCheck(
     CompareSelectOperation op = strideNegative ? kLT : kGT;
 
     ExprPtr check = IRSimplifier::simplify(
-        alloc<CompareSelect>(startDiff, alloc<IntImm>(0), op));
+        alloc<CompareSelect>(startDiff, immLike(startDiff, 0), op));
 
     // If the start difference modulo the minimum stride is offset from that
     // stride, then the ranges have distinct strides.
@@ -731,7 +731,7 @@ void MemDependencyChecker::visit(ForPtr v) {
     for (const auto i : c10::irange(indices.size())) {
       VarFinder vf;
       if (vf.find(indices[i]).count(var) == 0) {
-        loopIndicesStride[i] = alloc<IntImm>(0);
+        loopIndicesStride[i] = immLike(indices[i], 0);
       } else {
         // If we've previously swapped the start and end of this bound, we
         // should apply the substitution to the reverse of the bounds.
@@ -740,19 +740,19 @@ void MemDependencyChecker::visit(ForPtr v) {
               SubstituteInClone(info->bounds()[i].end, {{var, v->start()}}));
           info->bounds()[i].start = IRSimplifier::simplify(SubstituteInClone(
               info->bounds()[i].start,
-              {{var, alloc<Sub>(v->stop(), alloc<IntImm>(1))}}));
+              {{var, alloc<Sub>(v->stop(), immLike(v->stop(), 1))}}));
 
         } else {
           info->bounds()[i].start = IRSimplifier::simplify(
               SubstituteInClone(info->bounds()[i].start, {{var, v->start()}}));
           info->bounds()[i].end = IRSimplifier::simplify(SubstituteInClone(
               info->bounds()[i].end,
-              {{var, alloc<Sub>(v->stop(), alloc<IntImm>(1))}}));
+              {{var, alloc<Sub>(v->stop(), immLike(v->stop(), 1))}}));
         }
 
         ExprPtr zeroStep = indices[i];
         ExprPtr oneStep = SubstituteInClone(
-            indices[i], {{var, alloc<Add>(var, alloc<IntImm>(1))}});
+            indices[i], {{var, alloc<Add>(var, immLike(var, 1))}});
         loopIndicesStride[i] =
             IRSimplifier::simplify(alloc<Sub>(oneStep, zeroStep));
 
@@ -785,7 +785,7 @@ void MemDependencyChecker::visit(ForPtr v) {
         bound.start = IRSimplifier::simplify(
             SubstituteInClone(bound.start, {{var, v->start()}}));
         bound.end = IRSimplifier::simplify(SubstituteInClone(
-            bound.end, {{var, alloc<Sub>(v->stop(), alloc<IntImm>(1))}}));
+            bound.end, {{var, alloc<Sub>(v->stop(), immLike(v->stop(), 1))}}));
 
         // If the start < end then swap the order of the bound.
         ExprPtr diff =
@@ -1037,8 +1037,8 @@ void MemDependencyChecker::insertBuffers(
     IndexBounds bounds;
     for (auto d : b->dims()) {
       bounds.push_back(
-          {alloc<IntImm>(0),
-           IRSimplifier::simplify(alloc<Sub>(d, alloc<IntImm>(1)))});
+          {immLike(d, 0),
+           IRSimplifier::simplify(alloc<Sub>(d, immLike(d, 1)))});
     }
     auto info =
         std::make_shared<AccessInfo>(nextAccess_++, type, nullptr, var, bounds);
@@ -1126,8 +1126,9 @@ void MemDependencyChecker::visit(AllocatePtr v) {
   // avoid failing the bound check. But this is not the correct approach and
   // should be fixed.
   ExprPtr flat_size = buf_flat_size(v->buf());
-  flat_size = IRSimplifier::simplify(alloc<Sub>(flat_size, alloc<IntImm>(1)));
-  bounds.push_back({alloc<IntImm>(0), flat_size});
+  flat_size =
+      IRSimplifier::simplify(alloc<Sub>(flat_size, immLike(flat_size, 1)));
+  bounds.push_back({immLike(flat_size, 0), flat_size});
 
   auto info = std::make_shared<AccessInfo>(
       nextAccess_++, AccessType::Alloc, nullptr, var, bounds);
diff --git a/torch/csrc/jit/tensorexpr/registerizer.cpp b/torch/csrc/jit/tensorexpr/registerizer.cpp
index bc26581970383..8684f2aabc810 100644
--- a/torch/csrc/jit/tensorexpr/registerizer.cpp
+++ b/torch/csrc/jit/tensorexpr/registerizer.cpp
@@ -18,7 +18,7 @@ void AccessInfo::addStore(StorePtr store, const std::shared_ptr<Scope>& scope) {
   last_usage_ = store;
 
   store_cost_ =
-      IRSimplifier::simplify(alloc<Add>(store_cost_, alloc<IntImm>(1)));
+      IRSimplifier::simplify(alloc<Add>(store_cost_, immLike(store_cost_, 1)));
   stores_.push_back(store);
 
   conditionId_ = scope->conditionId();
@@ -34,7 +34,8 @@ void AccessInfo::addLoad(
   first_usage_ = first_usage_ ? block_->getEnclosedRoot(first_usage_) : usage;
   last_usage_ = usage;
 
-  load_cost_ = IRSimplifier::simplify(alloc<Add>(load_cost_, alloc<IntImm>(1)));
+  load_cost_ =
+      IRSimplifier::simplify(alloc<Add>(load_cost_, immLike(load_cost_, 1)));
   loads_.push_back(load);
 
   conditionId_ = scope->conditionId();
diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp
index ea3902dcf3c0d..7a219fe728757 100644
--- a/torch/csrc/jit/tensorexpr/tensor.cpp
+++ b/torch/csrc/jit/tensorexpr/tensor.cpp
@@ -31,8 +31,8 @@ StmtPtr Tensor::constructStmt(
     for (const auto i : c10::irange(reduce_ndim)) {
       // Going in reverse order: from innermost loop to the outermost
       size_t dim_index = reduce_ndim - i - 1;
-      s = alloc<For>(
-          reduce_args[dim_index], alloc<IntImm>(0), reduce_dims[dim_index], s);
+      auto const& dim = reduce_dims[dim_index];
+      s = alloc<For>(reduce_args[dim_index], immLike(dim, 0), dim, s);
     }
     if (init_expr) {
       StorePtr init_stmt = alloc<Store>(buf(), indices, init_expr);
@@ -43,7 +43,8 @@ StmtPtr Tensor::constructStmt(
   for (const auto i : c10::irange(ndim)) {
     // Going in reverse order: from innermost loop to the outermost
     size_t dim_index = ndim - i - 1;
-    s = alloc<For>(args[dim_index], alloc<IntImm>(0), buf()->dim(dim_index), s);
+    auto const& dim = buf()->dim(dim_index);
+    s = alloc<For>(args[dim_index], immLike(dim, 0), dim, s);
   }
   return s;
 }

From 371c6612b39c1fa18d6c2e22613e292ae87b686f Mon Sep 17 00:00:00 2001
From: Facebook Community Bot <facebook-github-bot@users.noreply.github.com>
Date: Sun, 29 Aug 2021 09:56:34 -0700
Subject: [PATCH 329/530] Automated submodule update: FBGEMM (#64141)

Summary:
This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).

New submodule commit: https://github.com/pytorch/FBGEMM/commit/9939bac9defab4d18fb7fdded7e1a76c0c2b49b4

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64141

Test Plan: Ensure that CI jobs succeed on GitHub before landing.

Reviewed By: jspark1105

Differential Revision: D30629417

fbshipit-source-id: 1b1ad3d4caff925f798b86b358ab193554c9b8e0
---
 third_party/fbgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/fbgemm b/third_party/fbgemm
index e922280540acf..9939bac9defab 160000
--- a/third_party/fbgemm
+++ b/third_party/fbgemm
@@ -1 +1 @@
-Subproject commit e922280540acf7920ea21f99b8db064f89bb8c11
+Subproject commit 9939bac9defab4d18fb7fdded7e1a76c0c2b49b4

From 52d7dd73987dde6bcfe9c672cfb1b36e284becef Mon Sep 17 00:00:00 2001
From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" <tmanlaibaatar@fb.com>
Date: Sun, 29 Aug 2021 10:19:56 -0700
Subject: [PATCH 330/530] [DOC] improve docstring for Optimizer.state_dict
 (#63153)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63153

Fixes: https://github.com/pytorch/pytorch/issues/60121

Test Plan: Imported from OSS

Reviewed By: pbelevich

Differential Revision: D30629462

Pulled By: tugsbayasgalan

fbshipit-source-id: a9160e02ac53bb1a6219879747d73aae9ebe4d2f
---
 torch/optim/optimizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py
index 02f1cc265937b..79f72f041822b 100644
--- a/torch/optim/optimizer.py
+++ b/torch/optim/optimizer.py
@@ -100,7 +100,8 @@ def state_dict(self):
 
         * state - a dict holding current optimization state. Its content
             differs between optimizer classes.
-        * param_groups - a dict containing all parameter groups
+        * param_groups - a list containing all parameter groups where each
+            parameter group is a dict
         """
         # Save order indices instead of Tensors
         param_mappings = {}

From 7ebdbf82dccea370edda161936cc533c012e690a Mon Sep 17 00:00:00 2001
From: Garrett Cramer <gcramer@fb.com>
Date: Sun, 29 Aug 2021 11:33:48 -0700
Subject: [PATCH 331/530] add support for sending cpu sparse tensors over rpc
 (#62794)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62794

This pr updates jit serialization to support pickling Sparse COO tensors.
This pr updates message.cpp to support Sparse COO tensors.
A bug was filed a few years ago https://github.com/pytorch/pytorch/issues/30807.

I tested the fix by adding sparse tensor tests to rpc_test.py and dist_autograd_test.py.

cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23 gmagogsfm

Test Plan: Imported from OSS

Reviewed By: soulitzer

Differential Revision: D30608848

Pulled By: gcramer23

fbshipit-source-id: 629ba8e4a3d8365875a709c9b87447c7a71204fb
---
 torch/csrc/distributed/rpc/message.cpp        |  11 +-
 torch/csrc/jit/serialization/pickler.cpp      |  44 ++
 torch/csrc/jit/serialization/pickler.h        |   1 +
 torch/csrc/jit/serialization/unpickler.cpp    |  35 +
 torch/csrc/jit/serialization/unpickler.h      |   1 +
 .../distributed/rpc/dist_autograd_test.py     | 653 +++++++++++++----
 .../_internal/distributed/rpc/rpc_test.py     | 663 +++++++++++++++---
 7 files changed, 1172 insertions(+), 236 deletions(-)

diff --git a/torch/csrc/distributed/rpc/message.cpp b/torch/csrc/distributed/rpc/message.cpp
index 02771140f69bb..7265ed400b2e9 100644
--- a/torch/csrc/distributed/rpc/message.cpp
+++ b/torch/csrc/distributed/rpc/message.cpp
@@ -68,10 +68,17 @@ void Message::setId(int64_t id) {
 
 std::vector<c10::weak_intrusive_ptr<c10::StorageImpl>> Message::getStorages()
     const {
+  // Sparse tensors do not have storage. Instead, a sparse tensor
+  // contains two tensors indices and values, and both contain storage.
   std::vector<c10::weak_intrusive_ptr<c10::StorageImpl>> storages;
-  storages.reserve(tensors_.size());
+  storages.reserve(2 * tensors_.size());
   for (const auto& tensor : tensors_) {
-    storages.emplace_back(tensor.storage().getWeakStorageImpl());
+    if (tensor.is_sparse()) {
+      storages.emplace_back(tensor._indices().storage().getWeakStorageImpl());
+      storages.emplace_back(tensor._values().storage().getWeakStorageImpl());
+    } else {
+      storages.emplace_back(tensor.storage().getWeakStorageImpl());
+    }
   }
   return storages;
 }
diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp
index 4a4e8663b3838..f465eaf4dff00 100644
--- a/torch/csrc/jit/serialization/pickler.cpp
+++ b/torch/csrc/jit/serialization/pickler.cpp
@@ -353,6 +353,44 @@ void Pickler::pushTensor(const IValue& ivalue) {
   }
 }
 
+void Pickler::pushLiteralSparseTensor(const at::Tensor& tensor) {
+  pushGlobal("torch._utils", "_rebuild_sparse_tensor");
+  push<PickleOpCode>(PickleOpCode::MARK);
+  // layout
+  auto layout = static_cast<int>(tensor.layout());
+  pushInt(layout);
+  switch (layout) {
+    case static_cast<int>(c10::Layout::Sparse):
+      // size
+      push<PickleOpCode>(PickleOpCode::MARK);
+      for (auto size : tensor.sizes()) {
+        pushInt(size);
+      }
+      push<PickleOpCode>(PickleOpCode::TUPLE);
+      // requires grad
+      pushIValue(tensor.requires_grad());
+      // indices
+      pushTensor(tensor._indices());
+      // values
+      pushTensor(tensor._values());
+      break;
+    default:
+      TORCH_CHECK(
+          false,
+          "Unsupported sparse tensor layout type in serialization ",
+          static_cast<c10::Layout>(layout));
+      break;
+  }
+  // backward_hooks
+  pushGlobal("collections", "OrderedDict");
+  push<PickleOpCode>(PickleOpCode::EMPTY_TUPLE);
+  // Construct the collections.OrderedDict for the backward_hooks
+  push<PickleOpCode>(PickleOpCode::REDUCE);
+  push<PickleOpCode>(PickleOpCode::TUPLE);
+  // Call torch._utils._rebuild_sparse_coo_tensor
+  push<PickleOpCode>(PickleOpCode::REDUCE);
+}
+
 void Pickler::pushLiteralTensor(const IValue& ivalue) {
   // In contrast to tensor references, literal tensors are included in the
   // pickle program binary blob. They are written to the file after the STOP
@@ -362,6 +400,12 @@ void Pickler::pushLiteralTensor(const IValue& ivalue) {
   // The format here is the same one used by `torch.save()`. The code for the
   // format can be found in `torch/serialization.py`.
   auto& tensor = ivalue.toTensor();
+
+  if (tensor.is_sparse() || tensor.is_sparse_csr()) {
+    pushLiteralSparseTensor(tensor);
+    return;
+  }
+
   bool quantized = tensor.is_quantized();
   // The arguments to this function are:
   //    storage, storage_offset, size, stride, requires_grad, backward_hooks
diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h
index ac54ac45a2886..3dc6bef9d9131 100644
--- a/torch/csrc/jit/serialization/pickler.h
+++ b/torch/csrc/jit/serialization/pickler.h
@@ -172,6 +172,7 @@ class TORCH_API Pickler {
   void pushTensor(const IValue& ivalue);
   void pushTensorReference(const IValue& ivalue);
   void pushLiteralTensor(const IValue& ivalue);
+  void pushLiteralSparseTensor(const at::Tensor& tensor);
   void pushTuple(const IValue& ivalue);
   void pushString(const std::string& string);
   void pushDevice(const IValue& ivalue);
diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp
index 581b94978c459..f944387465446 100644
--- a/torch/csrc/jit/serialization/unpickler.cpp
+++ b/torch/csrc/jit/serialization/unpickler.cpp
@@ -550,6 +550,9 @@ void Unpickler::readGlobal(
     // Unpickle a tensor
     bool quantized = class_name == "_rebuild_qtensor";
     rebuildTensor(quantized);
+  } else if (
+      module_name == "torch._utils" && class_name == "_rebuild_sparse_tensor") {
+    rebuildSparseTensor();
   } else if (module_name == "builtins" && class_name == "complex") {
     globals_.emplace_back([this] {
       auto elems = pop(stack_).toTuple()->elements();
@@ -647,6 +650,38 @@ void Unpickler::readGlobal(
   stack_.emplace_back(int64_t(globals_.size() - 1));
 }
 
+void Unpickler::rebuildSparseTensor() {
+  globals_.emplace_back([this] {
+    auto tup = pop(stack_).toTuple();
+    const auto& elements = tup->elements();
+    size_t idx = 0;
+    auto layout = elements.at(idx++).toInt();
+    at::Tensor result;
+    switch (layout) {
+      case static_cast<int>(c10::Layout::Sparse): {
+        std::vector<int64_t> size = tupleToIntList(elements.at(idx++));
+        bool requires_grad = elements.at(idx++).toBool();
+        auto& indices_tensor = elements.at(idx++).toTensor();
+        auto& values_tensor = elements.at(idx++).toTensor();
+        auto options = values_tensor.options()
+                           .layout(c10::Layout::Sparse)
+                           .requires_grad(requires_grad);
+        result = at::_sparse_coo_tensor_unsafe(
+            indices_tensor, values_tensor, size, options);
+        result = autograd::make_variable(result, options.requires_grad());
+        break;
+      }
+      default:
+        TORCH_CHECK(
+            false,
+            "Unsupported sparse tensor layout type in serialization ",
+            static_cast<c10::Layout>(layout));
+        break;
+    }
+    stack_.emplace_back(std::move(result));
+  });
+}
+
 void Unpickler::rebuildTensor(bool quantized) {
   globals_.emplace_back([this, quantized] {
     auto tup = pop(stack_).toTuple();
diff --git a/torch/csrc/jit/serialization/unpickler.h b/torch/csrc/jit/serialization/unpickler.h
index f404deee848be..586ff9cc4ae59 100644
--- a/torch/csrc/jit/serialization/unpickler.h
+++ b/torch/csrc/jit/serialization/unpickler.h
@@ -108,6 +108,7 @@ class TORCH_API Unpickler {
       const std::string& module_name,
       const std::string& class_name);
   void rebuildTensor(bool quantized);
+  void rebuildSparseTensor();
 #ifdef USE_DISTRIBUTED
   void rebuildRRef();
 #endif
diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
index 017a61b7debf5..fba50303068e7 100644
--- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
+++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
@@ -64,13 +64,29 @@ def _torch_ones(sizes, requires_grad=False):
 # rref tensor equals to the given grad.
 def _compare_owner_value(context_id, rref, grad):
     grads = dist_autograd.get_gradients(context_id)
-    return torch.equal(grads[rref.local_value()], grad)
+    x = grads[rref.local_value()]
+    if x.is_sparse:
+        assert grad.is_sparse
+        x = x.to_dense()
+        grad = grad.to_dense()
+    else:
+        assert not grad.is_sparse
+    return torch.equal(x, grad)
 
 
 def create_tensor():
     return torch.ones((3, 3), requires_grad=True)
 
 
+def build_sparse_tensor(coalesce=False, requires_grad=True, dtype=torch.float32):
+    i = [[0, 1, 1], [2, 0, 2]]
+    v = [3.2, 4.1, 5.3]
+    tensor = torch.sparse_coo_tensor(i, v, (3, 3), requires_grad=requires_grad, dtype=dtype)
+    if coalesce:
+        tensor = tensor.coalesce()
+    return tensor
+
+
 @torch.jit.script
 def create_torchscript_tensor() -> torch.Tensor:
     return torch.ones((3, 3)).requires_grad_()
@@ -143,20 +159,28 @@ def _all_contexts_cleaned_up(timeout_seconds=10):
 
 # This function creates a dis atugorad context, run rpc_sync on the given ps,
 # and then blocks until the ps has verified the grads are correctly accumulated.
-def _run_trainer(rref_t1, t2, ps, rank_diff):
+def _run_trainer(rref_t1, t2, ps, rank_diff, sparse):
     with dist_autograd.context() as context_id:
         ret = rpc.rpc_sync(ps, my_rref_add, args=(rref_t1, t2))
-        dist_autograd.backward(context_id, [ret.sum()])
+        if sparse:
+            loss = torch.sparse.sum(ret)
+        else:
+            loss = ret.sum()
+        dist_autograd.backward(context_id, [loss])
         # prevent deleting dist autograd context
         rpc.rpc_sync(ps, _set_rpc_done, args=(context_id, rank_diff))
         rpc.rpc_sync(ps, _check_rpc_done, args=(0,))
 
 # This function is the same as _run_trainer, except rpc calls torchscript
 # function "my_script_ref_add" instead of python funciton "my_rref_add"
-def _run_trainer_torchscript(rref_t1, t2, ps, rank_diff):
+def _run_trainer_torchscript(rref_t1, t2, ps, rank_diff, sparse):
     with dist_autograd.context() as context_id:
         ret = rpc.rpc_sync(ps, my_script_ref_add, args=(rref_t1, t2))
-        dist_autograd.backward(context_id, [ret.sum()])
+        if sparse:
+            loss = torch.sparse.sum(ret)
+        else:
+            loss = ret.sum()
+        dist_autograd.backward(context_id, [loss])
         # prevent deleting dist autograd context
         rpc.rpc_sync(ps, _set_rpc_done, args=(context_id, rank_diff))
         rpc.rpc_sync(ps, _check_rpc_done, args=(0,))
@@ -379,14 +403,18 @@ def _verify_graph_for_nested_rpc_call(self, ctx):
             "torch::distributed::autograd::RecvRpcBackward", next_funcs[0][0].name()
         )
 
-    def _test_graph(self, fn, exec_mode):
+    def _test_graph(self, fn, exec_mode, sparse):
         dst_rank = (self.rank + 1) % self.world_size
 
         initialize_pg(self.file_init_method, self.rank, self.world_size)
 
         with dist_autograd.context() as context_id:
-            t1 = torch.ones(3, 3, requires_grad=True)
-            t2 = torch.zeros(3, 3, requires_grad=True)
+            if sparse:
+                t1 = build_sparse_tensor()
+                t2 = build_sparse_tensor()
+            else:
+                t1 = torch.ones(3, 3, requires_grad=True)
+                t2 = torch.zeros(3, 3, requires_grad=True)
             if ExecMode.RPC_SYNC == exec_mode:
                 ret = rpc.rpc_sync(worker_name(dst_rank), fn, args=(t1, t2))
             elif ExecMode.REMOTE == exec_mode:
@@ -436,29 +464,49 @@ def _test_graph(self, fn, exec_mode):
 
     @dist_init
     def test_graph_for_builtin_call(self):
-        self._test_graph(torch.add, ExecMode.RPC_SYNC)
+        self._test_graph(torch.add, ExecMode.RPC_SYNC, False)
+
+    @dist_init
+    def test_graph_for_builtin_call_sparse(self):
+        self._test_graph(torch.add, ExecMode.RPC_SYNC, True)
 
     @dist_init
     def test_graph_for_python_call(self):
-        self._test_graph(my_py_add, ExecMode.RPC_SYNC)
+        self._test_graph(my_py_add, ExecMode.RPC_SYNC, False)
+
+    @dist_init
+    def test_graph_for_python_call_sparse(self):
+        self._test_graph(my_py_add, ExecMode.RPC_SYNC, True)
 
     @dist_init
     def test_graph_for_builtin_remote_call(self):
-        self._test_graph(torch.add, ExecMode.REMOTE)
+        self._test_graph(torch.add, ExecMode.REMOTE, False)
+
+    @dist_init
+    def test_graph_for_builtin_remote_call_sparse(self):
+        self._test_graph(torch.add, ExecMode.REMOTE, True)
 
     @dist_init
     def test_graph_for_python_remote_call(self):
-        self._test_graph(my_py_add, ExecMode.REMOTE)
+        self._test_graph(my_py_add, ExecMode.REMOTE, False)
+
+    @dist_init
+    def test_graph_for_python_remote_call_sparse(self):
+        self._test_graph(my_py_add, ExecMode.REMOTE, True)
 
     # 3-layer nested calls
-    def _test_graph_for_py_nested_call(self, exec_mode):
+    def _test_graph_for_py_nested_call(self, exec_mode, sparse):
         dst_rank = (self.rank + 1) % self.world_size
 
         initialize_pg(self.file_init_method, self.rank, self.world_size)
 
         with dist_autograd.context() as context_id:
-            t1 = torch.ones(3, 3, requires_grad=True)
-            t2 = torch.zeros(3, 3, requires_grad=True)
+            if sparse:
+                t1 = build_sparse_tensor(requires_grad=True)
+                t2 = build_sparse_tensor(requires_grad=True)
+            else:
+                t1 = torch.ones(3, 3, requires_grad=True)
+                t2 = torch.zeros(3, 3, requires_grad=True)
             nest_dst_rank = (dst_rank + 1) % self.world_size
             if ExecMode.RPC_SYNC == exec_mode:
                 ret = rpc.rpc_sync(
@@ -531,21 +579,33 @@ def _test_graph_for_py_nested_call(self, exec_mode):
 
     @dist_init
     def test_graph_for_py_nested_call(self):
-        self._test_graph_for_py_nested_call(ExecMode.RPC_SYNC)
+        self._test_graph_for_py_nested_call(ExecMode.RPC_SYNC, False)
+
+    @dist_init
+    def test_graph_for_py_nested_call_sparse(self):
+        self._test_graph_for_py_nested_call(ExecMode.RPC_SYNC, True)
 
     @dist_init
     def test_graph_for_py_nested_remote_call(self):
-        self._test_graph_for_py_nested_call(ExecMode.REMOTE)
+        self._test_graph_for_py_nested_call(ExecMode.REMOTE, False)
+
+    @dist_init
+    def test_graph_for_py_nested_remote_call_sparse(self):
+        self._test_graph_for_py_nested_call(ExecMode.REMOTE, True)
 
     # Rank0->Rank1->Rank0
-    def _test_graph_for_py_nested_call_itself(self, exec_mode):
+    def _test_graph_for_py_nested_call_itself(self, exec_mode, sparse):
         dst_rank = (self.rank + 1) % self.world_size
 
         initialize_pg(self.file_init_method, self.rank, self.world_size)
 
         with dist_autograd.context() as context_id:
-            t1 = torch.ones(3, 3, requires_grad=True)
-            t2 = torch.zeros(3, 3, requires_grad=True)
+            if sparse:
+                t1 = build_sparse_tensor(requires_grad=True)
+                t2 = build_sparse_tensor(requires_grad=True)
+            else:
+                t1 = torch.ones(3, 3, requires_grad=True)
+                t2 = torch.zeros(3, 3, requires_grad=True)
             if ExecMode.RPC_SYNC == exec_mode:
                 ret = rpc.rpc_sync(
                     worker_name(dst_rank),
@@ -610,18 +670,30 @@ def _test_graph_for_py_nested_call_itself(self, exec_mode):
 
     @dist_init
     def test_graph_for_py_nested_call_itself(self):
-        self._test_graph_for_py_nested_call_itself(ExecMode.RPC_SYNC)
+        self._test_graph_for_py_nested_call_itself(ExecMode.RPC_SYNC, False)
+
+    @dist_init
+    def test_graph_for_py_nested_call_itself_sparse(self):
+        self._test_graph_for_py_nested_call_itself(ExecMode.RPC_SYNC, True)
 
     @dist_init
     def test_graph_for_py_nested_remote_call_itself(self):
-        self._test_graph_for_py_nested_call_itself(ExecMode.REMOTE)
+        self._test_graph_for_py_nested_call_itself(ExecMode.REMOTE, False)
+
+    @dist_init
+    def test_graph_for_py_nested_remote_call_itself_sparse(self):
+        self._test_graph_for_py_nested_call_itself(ExecMode.REMOTE, True)
 
-    def _test_no_graph_with_tensors_not_require_grad(self, exec_mode):
+    def _test_no_graph_with_tensors_not_require_grad(self, exec_mode, sparse):
         initialize_pg(self.file_init_method, self.rank, self.world_size)
         dst_rank = (self.rank + 1) % self.world_size
         with dist_autograd.context() as context_id:
-            t1 = torch.ones(3, 3, requires_grad=False)
-            t2 = torch.zeros(3, 3, requires_grad=False)
+            if sparse:
+                t1 = build_sparse_tensor(requires_grad=False)
+                t2 = build_sparse_tensor(requires_grad=False)
+            else:
+                t1 = torch.ones(3, 3, requires_grad=False)
+                t2 = torch.zeros(3, 3, requires_grad=False)
             if ExecMode.RPC_SYNC == exec_mode:
                 ret = rpc.rpc_sync(
                     worker_name(dst_rank), torch.add, args=(t1, t2)
@@ -656,11 +728,19 @@ def _test_no_graph_with_tensors_not_require_grad(self, exec_mode):
 
     @dist_init
     def test_no_graph_with_tensors_not_require_grad(self):
-        self._test_no_graph_with_tensors_not_require_grad(ExecMode.RPC_SYNC)
+        self._test_no_graph_with_tensors_not_require_grad(ExecMode.RPC_SYNC, False)
+
+    @dist_init
+    def test_no_graph_with_tensors_not_require_grad_sparse(self):
+        self._test_no_graph_with_tensors_not_require_grad(ExecMode.RPC_SYNC, True)
 
     @dist_init
     def test_no_graph_with_tensors_not_require_grad_remote(self):
-        self._test_no_graph_with_tensors_not_require_grad(ExecMode.REMOTE)
+        self._test_no_graph_with_tensors_not_require_grad(ExecMode.REMOTE, False)
+
+    @dist_init
+    def test_no_graph_with_tensors_not_require_grad_remote_sparse(self):
+        self._test_no_graph_with_tensors_not_require_grad(ExecMode.REMOTE, True)
 
     def _test_grad_only_on_return_value(self, exec_mode):
         initialize_pg(self.file_init_method, self.rank, self.world_size)
@@ -699,13 +779,16 @@ def test_grad_only_on_return_value(self):
     def test_grad_only_on_return_value_remote(self):
         self._test_grad_only_on_return_value(ExecMode.REMOTE)
 
-    def _test_rpc_complex_args(self, exec_mode):
+    def _test_rpc_complex_args(self, exec_mode, sparse):
         with dist_autograd.context() as context_id:
             num_tensors = 10
             tensors = []
             for i in range(num_tensors):
-                tensors.append(torch.ones(3, 3, requires_grad=(i % 2 == 0)))
-
+                if sparse:
+                    tensor = build_sparse_tensor(requires_grad=(i % 2 == 0))
+                else:
+                    tensor = torch.ones(3, 3, requires_grad=(i % 2 == 0))
+                tensors.append(tensor)
             dst_rank = self._next_rank()
             if ExecMode.RPC_SYNC == exec_mode:
                 ret = rpc.rpc_sync(
@@ -739,11 +822,19 @@ def _test_rpc_complex_args(self, exec_mode):
 
     @dist_init
     def test_rpc_complex_args(self):
-        self._test_rpc_complex_args(ExecMode.RPC_SYNC)
+        self._test_rpc_complex_args(ExecMode.RPC_SYNC, False)
+
+    @dist_init
+    def test_rpc_complex_args_sparse(self):
+        self._test_rpc_complex_args(ExecMode.RPC_SYNC, True)
 
     @dist_init
     def test_remote_complex_args(self):
-        self._test_rpc_complex_args(ExecMode.REMOTE)
+        self._test_rpc_complex_args(ExecMode.REMOTE, False)
+
+    @dist_init
+    def test_remote_complex_args_sparse(self):
+        self._test_rpc_complex_args(ExecMode.REMOTE, True)
 
     def context_cleanup_test_helper(self, rpc_args, func, nested=False):
         initialize_pg(self.file_init_method, self.rank, self.world_size)
@@ -788,11 +879,22 @@ def test_context_cleanup_tensor_with_grad(self):
         t2 = torch.zeros(3, 3, requires_grad=True)
         self.context_cleanup_test_helper(rpc_args=(t1, t2), func=torch.add)
 
+    @dist_init
+    def test_context_cleanup_tensor_with_grad_sparse(self):
+        t1 = build_sparse_tensor(requires_grad=True)
+        t2 = build_sparse_tensor(requires_grad=True)
+        self.context_cleanup_test_helper(rpc_args=(t1, t2), func=torch.add)
+
     @dist_init
     def test_context_cleanup_tensor_no_grad(self):
         t1 = torch.ones(3, 3, requires_grad=False)
         self.context_cleanup_test_helper(rpc_args=(t1, t1), func=torch.add)
 
+    @dist_init
+    def test_context_cleanup_tensor_no_grad_sparse(self):
+        t1 = build_sparse_tensor(requires_grad=False)
+        self.context_cleanup_test_helper(rpc_args=(t1, t1), func=torch.add)
+
     @dist_init
     def test_context_cleanup_no_tensors(self):
         self.context_cleanup_test_helper(rpc_args=(1, 1), func=my_scalar_add)
@@ -807,6 +909,16 @@ def test_context_cleanup_nested_rpc(self):
             rpc_args=args, func=my_py_nested_call, nested=True
         )
 
+    @dist_init
+    def test_context_cleanup_nested_rpc_sparse(self):
+        t1 = build_sparse_tensor(requires_grad=True)
+        t2 = build_sparse_tensor(requires_grad=True)
+        dst_rank = (self.rank + 1) % self.world_size
+        args = (t1, t2, dst_rank, self.world_size, 0)
+        self.context_cleanup_test_helper(
+            rpc_args=args, func=my_py_nested_call, nested=True
+        )
+
     @dist_init
     def test_worker_ids_recorded(self):
         dst_ranks = {rank for rank in range(self.world_size) if rank != self.rank}
@@ -876,23 +988,27 @@ def test_error_in_context(self):
                     worker_name(self._next_rank()), torch.matmul, args=(t1, t2)
                 )
 
-    @dist_init
-    def test_backward_no_grad_on_tensor(self):
-        t1 = torch.rand((3, 3), requires_grad=True)
-        t2 = torch.rand((3, 3), requires_grad=True)
+    def _backward_no_grad_on_tensor(self, t1, t2, sparse):
         with dist_autograd.context() as context_id:
             loss = rpc.rpc_sync(
                 worker_name(self._next_rank()),
                 torch.add,
-                args=(t1, t2)).sum()
-
+                args=(t1, t2))
+            if sparse:
+                loss = torch.sparse.sum(loss)
+            else:
+                loss = loss.sum()
             dist_autograd.backward(context_id, [loss], retain_graph=True)
             self.assertIsNone(t1.grad)
             self.assertIsNone(t2.grad)
 
             # Now populate .grad with local autograd engine and
             # verify dist autograd doesn't mess with it.
-            loss_local = torch.add(t1, t2).sum()
+            loss_local = torch.add(t1, t2)
+            if sparse:
+                loss_local = torch.sparse.sum(loss_local)
+            else:
+                loss_local = loss_local.sum()
             loss_local.backward()
             self.assertIsNotNone(t1.grad)
             self.assertIsNotNone(t2.grad)
@@ -903,18 +1019,34 @@ def test_backward_no_grad_on_tensor(self):
             self.assertEqual(t1_grad_before, t1.grad)
             self.assertEqual(t2_grad_before, t2.grad)
 
-    def _test_backward_simple(self, dst):
-        # Run the same code locally and with dist autograd and verify gradients
-        # are same.
-        local_grads = None
-        t1 = torch.rand((3, 3), requires_grad=True)
-        t2 = torch.rand((3, 3), requires_grad=True)
+    @dist_init
+    def test_backward_no_grad_on_tensor(self):
+        self._backward_no_grad_on_tensor(
+            torch.rand((3, 3), requires_grad=True),
+            torch.rand((3, 3), requires_grad=True),
+            False
+        )
+
+    @dist_init
+    def test_backward_no_grad_on_tensor_sparse(self):
+        self._backward_no_grad_on_tensor(
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=True),
+            True
+        )
+
+    # Run the same code locally and with dist autograd and verify gradients
+    # are same.
+    def _backward_simple(self, dst, t1, t2, local_grads, sparse):
         for exec_mode in [ExecMode.LOCAL, ExecMode.RPC_SYNC, ExecMode.REMOTE]:
             with dist_autograd.context() as context_id:
                 ret = self._exec_func_with_dst(
                     dst, exec_mode, torch.add, t1, t2
                 )
-                loss = ret.sum()
+                if sparse:
+                    loss = torch.sparse.sum(ret)
+                else:
+                    loss = ret.sum()
                 ret = self._verify_backwards(
                     exec_mode, [loss], context_id, local_grads, t1, t2
                 )
@@ -922,29 +1054,65 @@ def _test_backward_simple(self, dst):
 
     @dist_init
     def test_backward_simple(self):
-        self._test_backward_simple(self._next_rank())
+        self._backward_simple(
+            self._next_rank(),
+            torch.rand((3, 3), requires_grad=True),
+            torch.rand((3, 3), requires_grad=True),
+            None,
+            False
+        )
+
+    @dist_init
+    def test_backward_simple_sparse(self):
+        self._backward_simple(
+            self._next_rank(),
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=True),
+            None,
+            True
+        )
 
     @dist_init
     def test_backward_simple_self(self):
-        self._test_backward_simple(self.rank)
+        self._backward_simple(
+            self.rank,
+            torch.rand((3, 3), requires_grad=True),
+            torch.rand((3, 3), requires_grad=True),
+            None,
+            False
+        )
+
+    @dist_init
+    def test_backward_simple_self_sparse(self):
+        self._backward_simple(
+            self.rank,
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=True),
+            None,
+            True
+        )
 
     # The current rank first creates a tensor on the rref_owner, and then passes
     # the rref with another tensor to the callee to run either my_rref_add or
     # my_nested_rref_add, depending on whether the callee is the rref owner.
     # The grad of tensor lives on the current rank, and the grad of the rref
     # tensor lives on the rref owner.
-    def _test_backward_rref(self, callee, rref_owner):
-        local_grads = None
-        t1 = torch.ones((3, 3), requires_grad=True)
-        t2 = torch.zeros((3, 3), requires_grad=True)
-
+    def _backward_rref(self, callee, rref_owner, t1, t2, local_grads, sparse):
         local_ret = torch.add(t1, t2)
-        local_ret.sum().backward()
+        if sparse:
+            local_ret = torch.sparse.sum(local_ret)
+        else:
+            local_ret = local_ret.sum()
+        local_ret.backward()
         with dist_autograd.context() as context_id:
-            rref_t1 = rpc.remote(
-                rref_owner, _torch_ones, args=((3, 3),), kwargs={"requires_grad": True}
-            )
-
+            if sparse:
+                rref_t1 = rpc.remote(
+                    rref_owner, build_sparse_tensor, args=(False, True,)
+                )
+            else:
+                rref_t1 = rpc.remote(
+                    rref_owner, _torch_ones, args=((3, 3),), kwargs={"requires_grad": True}
+                )
             if callee == rref_owner:
                 rref = rpc.remote(callee, my_rref_add, args=(rref_t1, t2))
             else:
@@ -952,7 +1120,11 @@ def _test_backward_rref(self, callee, rref_owner):
                     callee, my_nested_rref_add, args=(rref_owner, rref_t1, t2)
                 )
             ret = rref.to_here()
-            dist_autograd.backward(context_id, [ret.sum()])
+            if sparse:
+                ret = torch.sparse.sum(ret)
+            else:
+                ret = ret.sum()
+            dist_autograd.backward(context_id, [ret])
 
             # verify grads on caller
             grads = dist_autograd.get_gradients(context_id)
@@ -972,20 +1144,81 @@ def _test_backward_rref(self, callee, rref_owner):
     def test_backward_rref(self):
         callee = worker_name(self._next_rank())
         rref_owner = callee
-        self._test_backward_rref(callee, rref_owner)
+        self._backward_rref(
+            callee,
+            rref_owner,
+            torch.rand((3, 3), requires_grad=True),
+            torch.rand((3, 3), requires_grad=True),
+            None,
+            False
+        )
+
+    @dist_init
+    def test_backward_rref_sparse(self):
+        callee = worker_name(self._next_rank())
+        rref_owner = callee
+        self._backward_rref(
+            callee,
+            rref_owner,
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=True),
+            None,
+            True
+        )
 
     @dist_init
     def test_backward_rref_multi(self):
         if self.rank > 0:
             callee = "worker0"
             rref_owner = callee
-            self._test_backward_rref(callee, rref_owner)
+            self._backward_rref(
+                callee,
+                rref_owner,
+                torch.rand((3, 3), requires_grad=True),
+                torch.rand((3, 3), requires_grad=True),
+                None,
+                False
+            )
+
+    @dist_init
+    def test_backward_rref_multi_sparse(self):
+        if self.rank > 0:
+            callee = "worker0"
+            rref_owner = callee
+            self._backward_rref(
+                callee,
+                rref_owner,
+                build_sparse_tensor(requires_grad=True),
+                build_sparse_tensor(requires_grad=True),
+                None,
+                True
+            )
 
     @dist_init
     def test_backward_rref_nested(self):
         callee = worker_name((self.rank + 1) % self.world_size)
         rref_owner = worker_name((self.rank + 2) % self.world_size)
-        self._test_backward_rref(callee, rref_owner)
+        self._backward_rref(
+            callee,
+            rref_owner,
+            torch.rand((3, 3), requires_grad=True),
+            torch.rand((3, 3), requires_grad=True),
+            None,
+            False
+        )
+
+    @dist_init
+    def test_backward_rref_nested_sparse(self):
+        callee = worker_name((self.rank + 1) % self.world_size)
+        rref_owner = worker_name((self.rank + 2) % self.world_size)
+        self._backward_rref(
+            callee,
+            rref_owner,
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=True),
+            None,
+            True
+        )
 
     # In this test, every rank will serve as a parameter server (ps) and a
     # driver, and then kicks off trainers on the other three ranks. So, we have:
@@ -996,13 +1229,19 @@ def test_backward_rref_nested(self):
     #
     # These four test ps-trainer groups run on completely separate autograd
     # graphs, but they share the same set of underlying RpcAgents.
-    def _test_trainer_ps(self, create_ref_fn, trainer_fn):
-        local_grads = None
-        t1 = torch.ones((3, 3), requires_grad=True)
-        t2 = torch.zeros((3, 3), requires_grad=True)
+    def _test_trainer_ps(self, create_ref_fn, trainer_fn, sparse):
+        if sparse:
+            t1 = build_sparse_tensor(requires_grad=True)
+            t2 = build_sparse_tensor(requires_grad=True)
+        else:
+            t1 = torch.ones((3, 3), requires_grad=True)
+            t2 = torch.zeros((3, 3), requires_grad=True)
 
         local_ret = torch.add(t1, t2)
-        local_ret.sum().backward()
+        if sparse:
+            torch.sparse.sum(local_ret).backward()
+        else:
+            local_ret.sum().backward()
 
         # create rref on self
         rref_t1 = rpc.remote(
@@ -1018,7 +1257,7 @@ def _test_trainer_ps(self, create_ref_fn, trainer_fn):
                 rpc.rpc_async(
                     worker_name((self.rank + rank_diff) % self.world_size),
                     trainer_fn,
-                    args=(rref_t1, t2, worker_name(self.rank), rank_diff),
+                    args=(rref_t1, t2, worker_name(self.rank), rank_diff, sparse),
                 )
             )
 
@@ -1045,7 +1284,19 @@ def _test_trainer_ps(self, create_ref_fn, trainer_fn):
 
     @dist_init
     def test_trainer_ps(self):
-        self._test_trainer_ps(create_tensor, _run_trainer)
+        self._test_trainer_ps(
+            create_tensor,
+            _run_trainer,
+            False
+        )
+
+    @dist_init
+    def test_trainer_ps_sparse(self):
+        self._test_trainer_ps(
+            build_sparse_tensor,
+            _run_trainer,
+            True
+        )
 
     @dist_init
     def test_trainer_ps_torchscript_functions(self):
@@ -1056,17 +1307,9 @@ def test_trainer_ps_torchscript_functions(self):
         import torch.distributed.rpc.api as api
         api._ignore_rref_leak = True
 
-        self._test_trainer_ps(create_torchscript_tensor, _run_trainer_torchscript)
-
-    @dist_init
-    def test_backward_multiple_round_trips(self):
-        local_grads = None
-        t1 = torch.rand((3, 3), requires_grad=True)
-        t2 = torch.rand((3, 3))
-        t3 = torch.rand((3, 3), requires_grad=True)
-        t4 = torch.rand((3, 3))
-        t5 = torch.rand((3, 3), requires_grad=True)
+        self._test_trainer_ps(create_torchscript_tensor, _run_trainer_torchscript, False)
 
+    def _backward_multiple_round_trips(self, t1, t2, t3, t4, t5, local_grads, sparse):
         for exec_mode in [ExecMode.LOCAL, ExecMode.RPC_SYNC, ExecMode.REMOTE]:
             with dist_autograd.context() as context_id:
                 # Multiple RPCs between different nodes.
@@ -1074,15 +1317,44 @@ def test_backward_multiple_round_trips(self):
                 val = self._exec_func(exec_mode, torch.mul, t3, val)
                 s1 = self._exec_func(exec_mode, torch.stack, (t4, val))
                 s2 = self._exec_func(exec_mode, torch.stack, (t5, val))
-                val = self._exec_func(exec_mode, torch.bmm, s1, s2)
-                val = self._exec_func(exec_mode, torch.matmul, val, val)
-                loss = val.sum()
+                if sparse:
+                    val = self._exec_func(exec_mode, torch.mul, s1, s2)
+                    val = self._exec_func(exec_mode, torch.mul, val, val)
+                    loss = torch.sparse.sum(val)
+                else:
+                    val = self._exec_func(exec_mode, torch.bmm, s1, s2)
+                    val = self._exec_func(exec_mode, torch.matmul, val, val)
+                    loss = val.sum()
 
                 ret = self._verify_backwards(
                     exec_mode, [loss], context_id, local_grads, t1, t2, t3, t4, t5
                 )
                 local_grads = ret if ret else local_grads
 
+    @dist_init
+    def test_backward_multiple_round_trips(self):
+        self._backward_multiple_round_trips(
+            torch.rand((3, 3), requires_grad=True),
+            torch.rand((3, 3)),
+            torch.rand((3, 3), requires_grad=True),
+            torch.rand((3, 3)),
+            torch.rand((3, 3), requires_grad=True),
+            None,
+            False
+        )
+
+    @dist_init
+    def test_backward_multiple_round_trips_sparse(self):
+        self._backward_multiple_round_trips(
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=False),
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=False),
+            build_sparse_tensor(requires_grad=True),
+            None,
+            True
+        )
+
     @dist_init
     def test_backward_different_tensor_dims(self):
         local_grads = None
@@ -1317,41 +1589,70 @@ def test_backward_multiple_roots(self):
                     exec_mode, [r1, r2, r3, r4], context_id, local_grads, t1, t2
                 )
 
-    @dist_init
-    def test_backward_different_dtypes(self):
+    def _backward_different_dtypes(self, t1, t2, sparse):
         local_grads = None
-        t1 = torch.rand((3, 3), requires_grad=True, dtype=torch.float32)
-        t2 = torch.rand((3, 3), requires_grad=True, dtype=torch.float64)
         for exec_mode in [ExecMode.LOCAL, ExecMode.REMOTE]:
             with dist_autograd.context() as context_id:
-                loss = self._exec_func(exec_mode, torch.add, t1, t2).sum()
-
+                loss = self._exec_func(exec_mode, torch.add, t1, t2)
+                if sparse:
+                    loss = torch.sparse.sum(loss)
+                else:
+                    loss = loss.sum()
                 local_grads = self._verify_backwards(
                     exec_mode, [loss], context_id, local_grads, t1, t2
                 )
 
     @dist_init
-    def test_backward_simple_python_udf(self):
-        # Run the same code locally and with dist autograd and verify gradients
-        # are same.
+    def test_backward_different_dtypes(self):
+        self._backward_different_dtypes(
+            torch.rand((3, 3), requires_grad=True, dtype=torch.float32),
+            torch.rand((3, 3), requires_grad=True, dtype=torch.float64),
+            False
+        )
+
+    @dist_init
+    def test_backward_different_dtypes_sparse(self):
+        self._backward_different_dtypes(
+            build_sparse_tensor(requires_grad=True, dtype=torch.float32),
+            build_sparse_tensor(requires_grad=True, dtype=torch.float64),
+            True
+        )
+
+    # Run the same code locally and with dist autograd and verify gradients
+    # are same.
+    def _backward_simple_python_udf(self, t1, t2, sparse):
         local_grads = None
-        t1 = torch.rand((3, 3), requires_grad=True)
-        t2 = torch.rand((3, 3), requires_grad=True)
         for exec_mode in [ExecMode.LOCAL, ExecMode.REMOTE]:
             with dist_autograd.context() as context_id:
                 ret = self._exec_func(exec_mode, my_py_add, t1, t2)
-                loss = ret.sum()
+                if sparse:
+                    loss = torch.sparse.sum(ret)
+                else:
+                    loss = ret.sum()
                 local_grads = self._verify_backwards(
                     exec_mode, [loss], context_id, local_grads, t1, t2
                 )
 
     @dist_init
-    def test_backward_simple_script_call(self):
-        # Run the same code locally and with dist autograd and verify gradients
-        # are same.
+    def test_backward_simple_python_udf(self):
+        self._backward_simple_python_udf(
+            torch.rand(3, 3, requires_grad=True),
+            torch.rand(3, 3, requires_grad=True),
+            False
+        )
+
+    @dist_init
+    def test_backward_simple_python_udf_sparse(self):
+        self._backward_simple_python_udf(
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=True),
+            True
+        )
+
+    # Run the same code locally and with dist autograd and verify gradients
+    # are same.
+    def _backward_simple_script_call(self, t1, t2, sparse):
         local_grads = None
-        t1 = torch.rand((3, 3), requires_grad=True)
-        t2 = torch.rand((3, 3), requires_grad=True)
         for exec_mode in [
             ExecMode.LOCAL,
             ExecMode.RPC_SYNC,
@@ -1360,12 +1661,31 @@ def test_backward_simple_script_call(self):
         ]:
             with dist_autograd.context() as context_id:
                 forward_ret = self._exec_func(exec_mode, my_script_add, t1, t2)
-                loss = forward_ret.sum()
+                if sparse:
+                    loss = torch.sparse.sum(forward_ret)
+                else:
+                    loss = forward_ret.sum()
                 ret = self._verify_backwards(
                     exec_mode, [loss], context_id, local_grads, t1, t2
                 )
                 local_grads = ret if ret else local_grads
 
+    @dist_init
+    def test_backward_simple_script_call(self):
+        self._backward_simple_script_call(
+            torch.rand(3, 3, requires_grad=True),
+            torch.rand(3, 3, requires_grad=True),
+            False
+        )
+
+    @dist_init
+    def test_backward_simple_script_call_sparse(self):
+        self._backward_simple_script_call(
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=True),
+            True
+        )
+
     @staticmethod
     def _complex_python_udf(t1, t2):
         t3 = torch.nn.functional.linear(t1, t2)
@@ -1474,17 +1794,17 @@ def _nested_python_udf(t1, t2, dst):
         t3 = t1 * t2
         t4 = t1 + t2
         res = rpc.rpc_sync(worker_name(dst), my_py_add, args=(t3, t4))
-        return torch.linalg.multi_dot([t1, t2, t3, t4, res])
+        return t1 * t2 * t3 * t4 * res
 
-    @dist_init
-    def test_backwards_nested_python_udf(self):
-        # Run equivalent of _nested_python_udf locally.
-        t1 = torch.rand((3, 3), requires_grad=True)
-        t2 = torch.rand((3, 3), requires_grad=True)
+    def _backwards_nested_python_udf(self, t1, t2, sparse):
         t3 = t1 * t2
         t4 = t1 + t2
         res = t3 + t4
-        loss = torch.linalg.multi_dot([t1, t2, t3, t4, res]).sum()
+        loss = t1 * t2 * t3 * t4 * res
+        if sparse:
+            loss = torch.sparse.sum(loss)
+        else:
+            loss = loss.sum()
         torch.autograd.backward([loss])
 
         # Now run distributed autograd.
@@ -1494,12 +1814,33 @@ def test_backwards_nested_python_udf(self):
                 DistAutogradTest._nested_python_udf,
                 args=(t1, t2, self._next_rank()),
             )
-            dist_autograd.backward(context_id, [loss.sum()])
-
+            if sparse:
+                loss = torch.sparse.sum(loss)
+            else:
+                loss = loss.sum()
+            dist_autograd.backward(context_id, [loss])
             grads = dist_autograd.get_gradients(context_id)
             self.assertEqual(t1.grad, grads[t1])
             self.assertEqual(t2.grad, grads[t2])
 
+    @dist_init
+    def test_backwards_nested_python_udf(self):
+        # Run equivalent of _nested_python_udf locally.
+        self._backwards_nested_python_udf(
+            torch.rand(3, 3, requires_grad=True),
+            torch.rand(3, 3, requires_grad=True),
+            False
+        )
+
+    @dist_init
+    def test_backwards_nested_python_udf_sparse(self):
+        # Run equivalent of _nested_python_udf locally.
+        self._backwards_nested_python_udf(
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=True),
+            True
+        )
+
     _test_clean_context_backward_context_id = None
 
     class MyBackwardFunc(Function):
@@ -1594,8 +1935,7 @@ def _call_remote_embedding(cls, embedding_rref, input, offsets, per_sample_weigh
     def _get_grad(cls, embedding_rref, context_id):
         embedding = embedding_rref.local_value()
         grad_map = dist_autograd.get_gradients(context_id)
-        # Can't send sparse tensors over RPC: https://github.com/pytorch/pytorch/issues/30807
-        return grad_map[embedding.weight].to_dense()
+        return grad_map[embedding.weight]
 
     @dist_init
     def test_embedding_bag_with_no_grad_tensors(self):
@@ -1637,26 +1977,27 @@ def test_embedding_bag_with_no_grad_tensors(self):
                 args=(remote_embedding, context_id),
             )
 
-            self.assertEqual(local_grad.to_dense(), remote_grad)
+            self.assertEqual(local_grad, remote_grad)
 
     @classmethod
-    def _mixed_requires_grad(cls, t1, t2):
+    def _mixed_requires_grad_operaton(cls, t1, t2):
         if t2.requires_grad:
             return t1 - t2
         else:
             return t1 * t2
 
-    @dist_init
-    def test_mixed_requires_grad(self):
+    def _mixed_requires_grad(self, t1, t2, sparse):
         for exec_mode in [ExecMode.RPC_SYNC, ExecMode.REMOTE]:
-            t1 = torch.rand((3, 3), requires_grad=True)
-            t2 = torch.rand((3, 3), requires_grad=False)
             with dist_autograd.context() as context_id:
                 ret = self._exec_func(
-                    exec_mode, DistAutogradTest._mixed_requires_grad, t1, t2
+                    exec_mode, DistAutogradTest._mixed_requires_grad_operaton, t1, t2
                 )
                 self.assertEqual(t1 * t2, ret)
-                dist_autograd.backward(context_id, [ret.sum()])
+                if sparse:
+                    loss = torch.sparse.sum(ret)
+                else:
+                    loss = ret.sum()
+                dist_autograd.backward(context_id, [loss])
                 self.assertTrue(t1.requires_grad)
                 self.assertFalse(t2.requires_grad)
                 grads = dist_autograd.get_gradients(context_id)
@@ -1664,6 +2005,22 @@ def test_mixed_requires_grad(self):
                 self.assertNotIn(t2, grads)
                 self.assertEqual(t2, grads[t1])
 
+    @dist_init
+    def test_mixed_requires_grad(self):
+        self._mixed_requires_grad(
+            torch.rand(3, 3, requires_grad=True),
+            torch.rand(3, 3, requires_grad=False),
+            False
+        )
+
+    @dist_init
+    def test_mixed_requires_grad_sparse(self):
+        self._mixed_requires_grad(
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=False),
+            True
+        )
+
     class TestDebugInfoFunc(Function):
         @staticmethod
         def forward(ctx, input):
@@ -1801,37 +2158,69 @@ def test_backward_accumulate_grads(self):
 
     @staticmethod
     def _test_nested_backward_accumulate_grads(t1, t2, dst_rank):
-        return rpc.rpc_sync(worker_name(dst_rank), torch.matmul, args=(t1, t2))
+        return rpc.rpc_sync(worker_name(dst_rank), torch.add, args=(t1, t2))
 
-    @dist_init
-    def test_nested_backward_accumulate_grads(self):
-        t1 = torch.rand((3, 3), requires_grad=True)
-        t2 = torch.rand((3, 3), requires_grad=True)
+    def _nested_backward_accumulate_grads(self, t1, t2, sparse):
         with dist_autograd.context() as context_id:
-            loss = rpc.rpc_sync(
+            ret = rpc.rpc_sync(
                 worker_name(self._next_rank()),
                 DistAutogradTest._test_nested_backward_accumulate_grads,
                 args=(t1, t2, self._next_rank()),
-            ).sum()
-
+            )
+            if sparse:
+                loss = torch.sparse.sum(ret)
+            else:
+                loss = ret.sum()
             # Run backward twice.
             dist_autograd.backward(context_id, [loss], retain_graph=True)
             dist_autograd.backward(context_id, [loss])
 
     @dist_init
-    def test_multiple_backward(self):
-        t1 = torch.rand((3, 3), requires_grad=True)
-        t2 = torch.rand((3, 3), requires_grad=True)
+    def test_nested_backward_accumulate_grads(self):
+        self._nested_backward_accumulate_grads(
+            torch.rand(3, 3, requires_grad=True),
+            torch.rand(3, 3, requires_grad=True),
+            False
+        )
+
+    @dist_init
+    def test_nested_backward_accumulate_grads_sparse(self):
+        self._nested_backward_accumulate_grads(
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=True),
+            True
+        )
+
+    def _multiple_backward(self, t1, t2, sparse):
         with dist_autograd.context() as context_id:
             loss = rpc.rpc_sync(
                 worker_name(self._next_rank()),
                 torch.add,
-                args=(t1, t2)).sum()
-
+                args=(t1, t2))
+            if sparse:
+                loss = torch.sparse.sum(loss)
+            else:
+                loss = loss.sum()
             # Run backward in a loop multiple times.
             for i in range(1000):
                 dist_autograd.backward(context_id, [loss], retain_graph=True)
 
+    @dist_init
+    def test_multiple_backward(self):
+        self._multiple_backward(
+            torch.rand(3, 3, requires_grad=True),
+            torch.rand(3, 3, requires_grad=True),
+            False
+        )
+
+    @dist_init
+    def test_multiple_backward_sparse(self):
+        self._multiple_backward(
+            build_sparse_tensor(requires_grad=True),
+            build_sparse_tensor(requires_grad=True),
+            True
+        )
+
     @dist_init(clean_shutdown=False)
     def test_multiple_backward_with_errors(self):
         initialize_pg(self.file_init_method, self.rank, self.world_size)
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index 1a44ef6e63b65..e0ef915ee8937 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -209,10 +209,13 @@ def add_rref_to_value(rref, value):
 def run_nested_pickle(pickle_cls_instance, tensor):
     return pickle_cls_instance.t + tensor
 
-def build_sparse_tensor():
+def build_sparse_tensor(coalesce=False):
     i = [[0, 1, 1], [2, 0, 2]]
     v = [3, 4, 5]
-    return torch.sparse_coo_tensor(i, v, (2, 3))
+    tensor = torch.sparse_coo_tensor(i, v, (2, 3))
+    if coalesce:
+        tensor = tensor.coalesce()
+    return tensor
 
 def build_complex_tensors():
     a = torch.ones(3, 3)
@@ -238,6 +241,12 @@ def my_function(a, b, c):
 def my_tensor_function(a, b):
     return a + b
 
+def my_container_sum(a):
+    result = a[0]
+    for tensor in a[1:]:
+        result += tensor
+    return result
+
 
 def my_sleep_func(seconds=1):
     time.sleep(seconds)
@@ -275,6 +284,14 @@ def nested_rpc(dst):
     return rpc.rpc_sync(dst, torch.add, args=(torch.ones(2, 2), 1))
 
 
+def nested_rpc_sparse(dst):
+    return rpc.rpc_sync(
+        dst,
+        torch.add,
+        args=(build_sparse_tensor(), build_sparse_tensor())
+    )
+
+
 def multi_layer_nested_async_rpc(dst, world_size, ttl):
     # this method returns immediately without blocking the callee, but will
     # generate additional requests.
@@ -296,10 +313,29 @@ def nested_rref(dst):
     )
 
 
+def nested_rref_sparse(dst):
+    return (
+        rpc.remote(
+            dst,
+            torch.add,
+            args=(build_sparse_tensor(), build_sparse_tensor())
+        ),
+        rpc.remote(
+            dst,
+            torch.add,
+            args=(build_sparse_tensor(), build_sparse_tensor())
+        ),
+    )
+
+
 def nested_remote(dst):
     rref = rpc.remote(dst, torch.add, args=(torch.ones(2, 2), 3))
     return rref.to_here()
 
+def nested_remote_sparse(dst):
+    rref = rpc.remote(dst, torch.add, args=(build_sparse_tensor(), build_sparse_tensor()))
+    return rref.to_here()
+
 
 def rref_forward_chain(dst, world_size, rref, ttl):
     if ttl > 0:
@@ -328,6 +364,12 @@ def heavy_rpc(tensor):
     return 0
 
 
+def heavy_rpc_sparse(tensor):
+    for i in range(1, 100):
+        tensor *= i
+        tensor = tensor / (i + 1)
+    return 0
+
 @torch.jit.script
 def heavy_rpc_torchscript(tensor):
     for i in range(1, 100):
@@ -600,6 +642,57 @@ def __init__(self, init_method):
 load_tests = load_tests
 
 
+class MyEmbeddingBagModel(torch.nn.Module):
+    def __init__(self, sparse):
+        super().__init__()
+        self.eb = torch.nn.EmbeddingBag(
+            10,
+            10,
+            sparse=sparse
+        )
+
+    def forward(self, x):
+        return self.eb(x)
+
+
+class MyParameterServer:
+    def __init__(self, trainers):
+        self.lock = Lock()
+        self.trainers = trainers
+        self.iteration = 0
+        self.updates = 0
+        self.futures = []
+        self.total = None
+        self.gradient = None
+
+    @staticmethod
+    def get_gradient(rref):
+        return rref.local_value().gradient
+
+    @staticmethod
+    @rpc.functions.async_execution
+    def average(rref, riteration, tensor):
+        self = rref.local_value()
+        fut = torch.futures.Future()
+        with self.lock:
+            if riteration > self.iteration:
+                self.iteration = riteration
+                self.updates = 0
+                self.futures.clear()
+            self.futures.append(fut)
+            if self.total is None:
+                self.total = tensor
+            else:
+                self.total += tensor
+            self.updates += 1
+            if self.trainers == self.updates:
+                self.gradient = self.total / float(self.trainers)
+                for fut in self.futures:
+                    result = self.total / float(self.trainers)
+                    fut.set_result(result)
+        return fut
+
+
 class RpcTest(RpcAgentTestFixture):
     @dist_init
     def test_worker_id(self):
@@ -641,10 +734,26 @@ def test_self_add(self):
     def test_send_to_rank(self):
         dst_rank = (self.rank + 1) % self.world_size
 
+        # Test dense tensor
         for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
             ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
             self.assertEqual(ret, torch.ones(2, 2) + 1)
 
+        # Test sparse tensor
+        for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
+            x = build_sparse_tensor()
+            y = build_sparse_tensor()
+            expected_tensor = (x + y)
+            ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y))
+            self.assertEqual(expected_tensor, ret)
+
+        for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
+            x = build_sparse_tensor(coalesce=True)
+            y = build_sparse_tensor(coalesce=True)
+            expected_tensor = (x + y)
+            ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y))
+            self.assertEqual(expected_tensor, ret)
+
         # Test invalid ranks
         for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
             with self.assertRaises(RuntimeError):
@@ -662,41 +771,120 @@ def test_send_to_rank(self):
             with self.assertRaises(ValueError):
                 self._run_func_in_mode(dst_rank - 0.5, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
 
+    def _self_py_udf_remote(self, worker_info, x, y, z):
+        rref = rpc.remote(worker_info, my_function, args=(x, y, z))
+        self.assertEqual(rref.to_here(), x + y + z)
+
     @dist_init
     def test_self_py_udf_remote(self):
-        self_worker_info = rpc.get_worker_info()
-        rref = rpc.remote(self_worker_info, my_function, args=(torch.ones(2, 2), 1, 3))
-        self.assertEqual(rref.to_here(), torch.ones(2, 2) + 1 + 3)
+        self._self_py_udf_remote(
+            rpc.get_worker_info(),
+            torch.ones(2, 2),
+            1,
+            3
+        )
+
+    @dist_init
+    def test_self_py_udf_remote_sparse(self):
+        self._self_py_udf_remote(
+            rpc.get_worker_info(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor()
+        )
+
 
-    def _test_self_remote_rref_as_rpc_arg(self, dst):
+    def _self_remote_rref_as_rpc_arg(self, dst, x, y, z):
         self_worker_info = rpc.get_worker_info()
-        rref = rpc.remote(self_worker_info, my_function, args=(torch.ones(2, 2), 1, 3))
-        fut = rpc.rpc_async(dst, add_rref_to_value, args=(rref, torch.ones(2, 2)))
-        ret = rpc.rpc_sync(dst, add_rref_to_value, args=(rref, torch.ones(2, 2) + 1))
-        self.assertEqual(ret, torch.ones(2, 2) + 1 + 3 + torch.ones(2, 2) + 1)
-        self.assertEqual(fut.wait(), torch.ones(2, 2) + 1 + 3 + torch.ones(2, 2))
+        rref = rpc.remote(self_worker_info, my_function, args=(x, y, z))
+        fut = rpc.rpc_async(dst, add_rref_to_value, args=(rref, x))
+        ret = rpc.rpc_sync(dst, add_rref_to_value, args=(rref, x + y))
+        self.assertEqual(ret, x + y + z + x + y)
+        self.assertEqual(fut.wait(), x + y + z + x)
 
     @dist_init
     def test_self_remote_rref_as_rpc_arg(self):
         dst = worker_name((self.rank + 1) % self.world_size)
-        self._test_self_remote_rref_as_rpc_arg(dst)
+        self._self_remote_rref_as_rpc_arg(
+            dst,
+            torch.ones(2, 2),
+            1,
+            3
+        )
+
+    @dist_init
+    def test_self_remote_rref_as_rpc_arg_sparse(self):
+        dst = worker_name((self.rank + 1) % self.world_size)
+        self._self_remote_rref_as_rpc_arg(
+            dst,
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor()
+        )
 
     @dist_init
     def test_self_remote_rref_as_self_rpc_arg(self):
-        self._test_self_remote_rref_as_rpc_arg(rpc.get_worker_info())
+        self._self_remote_rref_as_rpc_arg(
+            rpc.get_worker_info(),
+            torch.ones(2, 2),
+            1,
+            3
+        )
 
-    def _test_self_remote_rref_as_remote_arg(self, dst):
+    @dist_init
+    def test_self_remote_rref_as_self_rpc_arg_sparse(self):
+        self._self_remote_rref_as_rpc_arg(
+            rpc.get_worker_info(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor()
+        )
+
+    def _self_remote_rref_as_remote_arg(self, dst, x, y, z):
         self_worker_info = rpc.get_worker_info()
-        rref = rpc.remote(self_worker_info, my_function, args=(torch.ones(2, 2), 1, 3))
-        ret_rref = rpc.remote(dst, add_rref_to_value, args=(rref, torch.ones(2, 2)))
+        rref = rpc.remote(self_worker_info, my_function, args=(x, y, z))
+        ret_rref = rpc.remote(dst, add_rref_to_value, args=(rref, x))
         self.assertEqual(
-            ret_rref.to_here(), torch.ones(2, 2) + 1 + 3 + torch.ones(2, 2)
+            ret_rref.to_here(), x + y + z + x
         )
 
     @dist_init
     def test_self_remote_rref_as_remote_arg(self):
         dst = worker_name((self.rank + 1) % self.world_size)
-        self._test_self_remote_rref_as_remote_arg(dst)
+        self._self_remote_rref_as_remote_arg(
+            dst,
+            torch.ones(2, 2),
+            1,
+            3
+        )
+
+    @dist_init
+    def test_self_remote_rref_as_remote_arg_sparse(self):
+        dst = worker_name((self.rank + 1) % self.world_size)
+        self._self_remote_rref_as_remote_arg(
+            dst,
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor()
+        )
+
+    @dist_init
+    def test_self_remote_rref_as_self_remote_arg(self):
+        self._self_remote_rref_as_remote_arg(
+            rpc.get_worker_info(),
+            torch.ones(2, 2),
+            1,
+            3
+        )
+
+    @dist_init
+    def test_self_remote_rref_as_self_remote_arg_sparse(self):
+        self._self_remote_rref_as_remote_arg(
+            rpc.get_worker_info(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor()
+        )
 
     @dist_init
     def test_rref_proxy_non_exist(self):
@@ -816,10 +1004,6 @@ def test_rref_proxy_class(self):
     def test_rref_proxy_class_self(self):
         self._test_rref_proxy_class(rpc.get_worker_info())
 
-    @dist_init
-    def test_self_remote_rref_as_self_remote_arg(self):
-        self._test_self_remote_rref_as_remote_arg(rpc.get_worker_info())
-
     @mock.patch.object(torch.distributed.autograd, "_init")
     @mock.patch.object(torch.distributed.rpc.api, "_set_and_start_rpc_agent")
     @dist_init(setup_rpc=False)
@@ -911,7 +1095,7 @@ def test_reinit(self):
             )
         rpc.shutdown()
 
-    def test_world_size_one(self):
+    def _world_size_one(self, a, b):
         if self.rank == 0:
             rpc.init_rpc(
                 name="me",
@@ -921,32 +1105,51 @@ def test_world_size_one(self):
                 rpc_backend_options=self.rpc_backend_options,
             )
 
-            expect = torch.ones(2, 2) * 2
-            result = rpc.rpc_sync(
-                "me",
-                my_tensor_function,
-                args=(torch.ones(2, 2), torch.ones(2, 2))
-            )
-            self.assertEqual(expect, result)
-
-            expect = torch.ones(3, 3) * 2
-            result = rpc.rpc_async(
-                "me",
-                my_tensor_function,
-                args=(torch.ones(3, 3), torch.ones(3, 3))
-            ).wait()
-            self.assertEqual(expect, result)
+            def _rpc_sync(x, y):
+                expect = x * 2
+                result = rpc.rpc_sync(
+                    "me",
+                    my_tensor_function,
+                    args=(x, y)
+                )
+                self.assertEqual(expect, result)
+
+            def _rpc_async(x, y):
+                expect = x * 2
+                result = rpc.rpc_async(
+                    "me",
+                    my_tensor_function,
+                    args=(x, y)
+                ).wait()
+                self.assertEqual(expect, result)
+
+            def _remote(x, y):
+                expect = x * 2
+                result = rpc.remote(
+                    "me",
+                    my_tensor_function,
+                    args=(x, y)
+                ).to_here()
+                self.assertEqual(expect, result)
 
-            expect = torch.ones(4, 4) * 2
-            result = rpc.remote(
-                "me",
-                my_tensor_function,
-                args=(torch.ones(4, 4), torch.ones(4, 4))
-            ).to_here()
-            self.assertEqual(expect, result)
+            _rpc_sync(a, b)
+            _rpc_async(a, b)
+            _remote(a, b)
 
             rpc.shutdown()
 
+    def test_world_size_one(self):
+        self._world_size_one(
+            torch.ones(2, 2),
+            torch.ones(2, 2)
+        )
+
+    def test_world_size_one_sparse(self):
+        self._world_size_one(
+            build_sparse_tensor(),
+            build_sparse_tensor()
+        )
+
     @dist_init(setup_rpc=False)
     def test_invalid_names(self):
         from torch.distributed.rpc import WorkerInfo
@@ -1027,17 +1230,30 @@ def test_nonzero(self):
         ret = rpc.rpc_sync(worker_name(dst_rank), torch.nonzero, args=(x,))
         self.assertEqual(ret, x.nonzero())
 
-    @dist_init
-    def test_multi_rpc(self):
+    def _multi_rpc(self, sparse):
         dst_rank = (self.rank + 1) % self.world_size
         for i in range(20):
             n = i + self.rank + 1
+            if sparse:
+                x = build_sparse_tensor() * n
+                y = build_sparse_tensor() * n
+            else:
+                x = torch.ones(2, 2)
+                y = torch.ones(2, 2)
             ret = rpc.rpc_sync(
                 worker_name(dst_rank),
                 torch.add,
-                args=(torch.ones(n, n), torch.ones(n, n)),
+                args=(x, y),
             )
-            self.assertEqual(ret, torch.ones(n, n) * 2)
+            self.assertEqual(ret, x * 2)
+
+    @dist_init
+    def test_multi_rpc(self):
+        self._multi_rpc(False)
+
+    @dist_init
+    def test_multi_rpc_sparse(self):
+        self._multi_rpc(True)
 
     @dist_init
     def test_future_wait_twice(self):
@@ -1053,7 +1269,7 @@ def test_future_wait_twice(self):
             with self.assertRaisesRegex(ValueError, "Expected error"):
                 fut.wait()
 
-    def _run_uneven_workload(self, num_repeat=30):
+    def _run_uneven_workload(self, f, x, num_repeat=30):
         # worker0 drives and waits for worker1 and worker2
         # throughout the test.
         if self.rank == 0:
@@ -1063,7 +1279,7 @@ def _run_uneven_workload(self, num_repeat=30):
             dst = "worker1"
             futs = []
             for _ in range(num_repeat):
-                fut = rpc.rpc_async(dst, heavy_rpc, args=(torch.ones(100, 100),))
+                fut = rpc.rpc_async(dst, f, args=(x,))
                 futs.append(fut)
 
             for fut in torch.futures.collect_all(futs).wait():
@@ -1075,13 +1291,13 @@ def _run_uneven_workload(self, num_repeat=30):
             dst = "worker2"
             futs = []
             for _ in range(num_repeat):
-                fut = rpc.rpc_async(dst, heavy_rpc, args=(torch.ones(100, 100),))
+                fut = rpc.rpc_async(dst, f, args=(x,))
                 futs.append(fut)
 
             for val in torch.futures.wait_all(futs):
                 self.assertEqual(val, 0)
 
-    def test_wait_all_workers(self):
+    def _wait_all_workers(self, f, x):
         initialize_pg(self.file_init_method, self.rank, self.world_size)
         rpc.init_rpc(
             name="worker%d" % self.rank,
@@ -1091,7 +1307,7 @@ def test_wait_all_workers(self):
             rpc_backend_options=self.rpc_backend_options,
         )
 
-        self._run_uneven_workload()
+        self._run_uneven_workload(f, x)
 
         # worker0 calls this at the end after waiting for RPC responses.
         # worker1/2 calls this immediately and has some works after it.
@@ -1103,7 +1319,13 @@ def test_wait_all_workers(self):
         dist.barrier()
         rpc.shutdown(graceful=False)
 
-    def test_wait_all_workers_twice(self):
+    def test_wait_all_workers_dense(self):
+        self._wait_all_workers(heavy_rpc, torch.ones(100, 100))
+
+    def test_wait_all_workers_sparse(self):
+        self._wait_all_workers(heavy_rpc_sparse, build_sparse_tensor())
+
+    def _wait_all_workers_twice(self, f, x):
         initialize_pg(self.file_init_method, self.rank, self.world_size)
         rpc.init_rpc(
             name="worker%d" % self.rank,
@@ -1113,7 +1335,7 @@ def test_wait_all_workers_twice(self):
             rpc_backend_options=self.rpc_backend_options,
         )
 
-        self._run_uneven_workload()
+        self._run_uneven_workload(f, x)
 
         # worker0 calls this at the end after waiting for RPC responses.
         # worker1/2 calls this immediately and has some works after it.
@@ -1126,6 +1348,12 @@ def test_wait_all_workers_twice(self):
         dist.barrier()
         rpc.shutdown(graceful=False)
 
+    def test_wait_all_workers_twice_dense(self):
+        self._wait_all_workers_twice(heavy_rpc, torch.ones(100, 100))
+
+    def test_wait_all_workers_twice_sparse(self):
+        self._wait_all_workers_twice(heavy_rpc_sparse, build_sparse_tensor())
+
     @dist_init
     def test_all_gather(self):
         info = rpc.get_worker_info()
@@ -1211,7 +1439,7 @@ def test_rpc_barrier_multithreaded(self):
     @dist_init
     def test_graceful_shutdown_with_uneven_workload(self):
         """Test graceful termination."""
-        self._run_uneven_workload()
+        self._run_uneven_workload(heavy_rpc, torch.ones(100, 100))
 
     @dist_init(setup_rpc=False)
     def test_shutdown_followed_by_rpc(self):
@@ -2081,6 +2309,16 @@ def test_py_tensors_in_container(self):
         )
         self.assertEqual(ret, my_complex_tensor_function(a, b, c))
 
+    @dist_init
+    def test_py_sparse_tensors_in_container(self):
+        n = self.rank + 1
+        dst_rank = n % self.world_size
+        a = [build_sparse_tensor(), build_sparse_tensor()]
+        ret = rpc.rpc_sync(
+            worker_name(dst_rank), my_container_sum, args=(a,)
+        )
+        self.assertEqual(ret, my_container_sum(a))
+
     @dist_init
     def test_py_nested_pickle(self):
         n = self.rank + 1
@@ -2137,16 +2375,23 @@ def test_py_raise_in_user_func_escaped_str(self):
         else:
             self.assertTrue(False, "expected raise_func_escape to raise ValueError.")
 
-    @dist_init
-    def test_nested_rpc(self):
+    def _nested_rpc(self, f, expected):
         n = self.rank + 1
         dst_rank = n % self.world_size
         ret = rpc.rpc_sync(
             worker_name(dst_rank),
-            nested_rpc,
+            f,
             args=(worker_name(self.rank),),
         )
-        self.assertEqual(ret, torch.ones(2, 2) + 1)
+        self.assertEqual(ret, expected)
+
+    @dist_init
+    def test_nested_rpc(self):
+        self._nested_rpc(nested_rpc, torch.ones(2, 2) + 1)
+
+    @dist_init
+    def test_nested_rpc_sparse(self):
+        self._nested_rpc(nested_rpc_sparse, build_sparse_tensor() * 2)
 
     def _stress_test_rpc(self, f, repeat=1000, args=()):
         n = self.rank + 1
@@ -2174,31 +2419,65 @@ def test_stress_light_rpc(self):
     def test_stress_heavy_rpc(self):
         self._stress_test_rpc(heavy_rpc, repeat=20, args=(torch.ones(100, 100),))
 
+    @dist_init
+    def test_stress_heavy_rpc_sparse(self):
+        self._stress_test_rpc(heavy_rpc_sparse, repeat=20, args=(build_sparse_tensor(),))
+
     @dist_init
     def test_stress_heavy_rpc_torchscript(self):
         self._stress_test_rpc(heavy_rpc_torchscript, repeat=20, args=(torch.ones(100, 100),))
 
-    @dist_init
-    def test_builtin_remote_ret(self):
+    def _builtin_remote_ret(self, x, y, expected):
         n = self.rank + 1
         dst_rank = n % self.world_size
         rref = rpc.remote(
             worker_name(dst_rank),
             torch.add,
-            args=(torch.ones(n, n), torch.ones(n, n)),
+            args=(x, y),
         )
-        self.assertEqual(rref.to_here(), torch.ones(n, n) * 2)
+        self.assertEqual(rref.to_here(), expected)
 
     @dist_init
-    def test_builtin_remote_self(self):
+    def test_builtin_remote_ret(self):
+        self._builtin_remote_ret(
+            torch.ones(2, 2),
+            torch.ones(2, 2),
+            torch.ones(2, 2) * 2
+        )
+
+    @dist_init
+    def test_builtin_remote_ret_sparse(self):
+        self._builtin_remote_ret(
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor() * 2
+        )
+
+    def _builtin_remote_self(self, x, y, expected):
         rref = rpc.remote(
             worker_name(self.rank),
             torch.add,
-            args=(torch.ones(2, 2), torch.ones(2, 2)),
+            args=(x, y),
+        )
+        self.assertEqual(rref.local_value(), expected)
+
+    @dist_init
+    def test_builtin_remote_self(self):
+        self._builtin_remote_self(
+            torch.ones(2, 2),
+            torch.ones(2, 2),
+            torch.ones(2, 2) * 2
+        )
+
+    @dist_init
+    def test_builtin_remote_self_sparse(self):
+        self._builtin_remote_self(
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor() * 2
         )
-        self.assertEqual(rref.local_value(), torch.ones(2, 2) * 2)
 
-    def _test_multi_remote_call(self, fn, args_fn=lambda x: (), kwargs_fn=lambda x: {}):
+    def _test_multi_remote_call(self, fn, sparse, args_fn=lambda x, y: (), kwargs_fn=lambda x, y: {}):
         m = 10
         n = self.rank + 1
         dst_rank = n % self.world_size
@@ -2210,21 +2489,35 @@ def _test_multi_remote_call(self, fn, args_fn=lambda x: (), kwargs_fn=lambda x:
                 rpc.remote(
                     worker_name(dst_rank),
                     fn,
-                    args=args_fn(n),
-                    kwargs=kwargs_fn(n),
+                    args=args_fn(n, sparse),
+                    kwargs=kwargs_fn(n, sparse),
                 )
             )
-            expected.append(fn(*args_fn(n), **kwargs_fn(n)))
+            expected.append(fn(*args_fn(n, sparse), **kwargs_fn(n, sparse)))
 
         for i in range(m):
             self.assertEqual(rrefs[i].to_here(), expected[i])
 
+    @staticmethod
+    def _multi_args_fn(n, sparse=False):
+        if sparse:
+            return (build_sparse_tensor(), build_sparse_tensor())
+        else:
+            return (torch.ones(n, n), torch.ones(n, n))
+
     @dist_init
     def test_multi_builtin_remote_ret(self):
-        def args_fn(n):
-            return (torch.ones(n, n), torch.ones(n, n))
+        self._test_multi_remote_call(
+            torch.add, False,
+            args_fn=RpcTest._multi_args_fn
+        )
 
-        self._test_multi_remote_call(torch.add, args_fn=args_fn)
+    @dist_init
+    def test_multi_builtin_remote_ret_sparse(self):
+        self._test_multi_remote_call(
+            torch.add, True,
+            args_fn=RpcTest._multi_args_fn
+        )
 
     @dist_init
     def test_py_udf_remote(self):
@@ -2237,82 +2530,177 @@ def test_py_udf_remote(self):
         )
         self.assertEqual(rref.to_here(), my_function(n, n + 1, n + 2))
 
-    @dist_init
-    def test_multi_py_udf_remote(self):
-        def kwargs_fn(n):
+    @staticmethod
+    def _multi_kwargs_fn(n, sparse=False):
+        if sparse:
+            return {
+                "a": build_sparse_tensor(),
+                "b": build_sparse_tensor(),
+                "c": build_sparse_tensor()
+            }
+        else:
             return {"a": torch.ones(n, n), "b": torch.ones(n, n), "c": torch.ones(n, n)}
 
-        self._test_multi_remote_call(my_function, kwargs_fn=kwargs_fn)
+    @dist_init
+    def test_multi_py_udf_remote(self):
+        self._test_multi_remote_call(
+            my_function,
+            False,
+            kwargs_fn=RpcTest._multi_kwargs_fn
+        )
 
     @dist_init
-    def test_py_rref_args(self):
+    def test_multi_py_udf_remote_sparse(self):
+        self._test_multi_remote_call(
+            my_function,
+            True,
+            kwargs_fn=RpcTest._multi_kwargs_fn
+        )
+
+    def _py_rref_args(self, a, b, x, y, expected):
         n = self.rank + 1
         dst_rank = n % self.world_size
         rref_a = rpc.remote(
-            worker_name(dst_rank), torch.add, args=(torch.ones(n, n), 2)
+            worker_name(dst_rank), torch.add, args=(a, b)
         )
         rref_b = rpc.remote(
-            worker_name(dst_rank), torch.add, args=(torch.ones(n, n), 1)
+            worker_name(dst_rank), torch.add, args=(x, y)
         )
         rref_c = rpc.remote(
             worker_name(dst_rank), my_rref_function, args=(rref_a, rref_b)
         )
-        self.assertEqual(rref_c.to_here(), torch.ones(n, n) + 4)
+        self.assertEqual(rref_c.to_here(), expected)
 
     @dist_init
-    def test_py_rref_args_user_share(self):
+    def test_py_rref_args(self):
+        self._py_rref_args(
+            torch.ones(2, 2),
+            1,
+            torch.ones(2, 2),
+            2,
+            torch.ones(2, 2) * 2 + 3)
+
+    @dist_init
+    def test_py_rref_args_sparse(self):
+        self._py_rref_args(
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor() * 4
+        )
+
+    def _py_rref_args_user_share(self, a, b, c, x, y, z, expected):
         n = self.rank + 1
         owner_rank = n % self.world_size
         user_rank = (n + 1) % self.world_size
         rref_a = rpc.remote(
-            worker_name(owner_rank), my_function, args=(torch.ones(n, n), 2, 0)
+            worker_name(owner_rank), my_function, args=(a, b, c)
         )
         rref_b = rpc.remote(
-            worker_name(owner_rank), my_function, args=(torch.ones(n, n), 1, 0)
+            worker_name(owner_rank), my_function, args=(x, y, z)
         )
         rref_c = rpc.remote(
             worker_name(user_rank), my_rref_function, args=(rref_a, rref_b)
         )
-        self.assertEqual(rref_c.to_here(), torch.ones(n, n) + 4)
+        self.assertEqual(rref_c.to_here(), expected)
 
     @dist_init
-    def test_py_rpc_rref_args(self):
+    def test_py_rref_args_user_share(self):
+        self._py_rref_args_user_share(
+            torch.ones(2, 2),
+            1,
+            2,
+            torch.ones(2, 2),
+            3,
+            4,
+            torch.ones(2, 2) * 2 + 10
+        )
+
+    @dist_init
+    def test_py_rref_args_user_share_sparse(self):
+        self._py_rref_args_user_share(
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor() * 6
+        )
+
+    def _py_rpc_rref_args(self, a, b, c, x, y, z, expected):
         n = self.rank + 1
         dst_rank = n % self.world_size
         rref_a = rpc.remote(
-            worker_name(dst_rank), my_function, args=(torch.ones(n, n), 2, 0)
+            worker_name(dst_rank), my_function, args=(a, b, c)
         )
         rref_b = rpc.remote(
-            worker_name(dst_rank), my_function, args=(torch.ones(n, n), 1, 0)
+            worker_name(dst_rank), my_function, args=(x, y, z)
         )
 
         c = rpc.rpc_sync(
             worker_name(dst_rank), my_rref_function, args=(rref_a, rref_b)
         )
+        self.assertEqual(c, expected)
 
-        self.assertEqual(c, torch.ones(n, n) + 4)
+    @dist_init
+    def test_py_rpc_rref_args(self):
+        self._py_rpc_rref_args(
+            torch.ones(2, 2),
+            1,
+            2,
+            torch.ones(2, 2),
+            3,
+            4,
+            torch.ones(2, 2) * 2 + 10
+        )
 
     @dist_init
-    def test_nested_remote(self):
+    def test_py_rpc_rref_args_sparse(self):
+        self._py_rpc_rref_args(
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor() * 6
+        )
+
+    def _nested_remote(self, f, expected):
         n = self.rank + 1
         dst_rank1 = n % self.world_size
         dst_rank2 = (n + 1) % self.world_size
 
         rref = rpc.remote(
             worker_name(dst_rank1),
-            nested_remote,
+            f,
             args=(worker_name(dst_rank2),),
         )
-        self.assertEqual(rref.to_here(), torch.ones(2, 2) + 3)
+        self.assertEqual(rref.to_here(), expected)
 
     @dist_init
-    def test_nested_rref(self):
+    def test_nested_remote(self):
+        self._nested_remote(
+            nested_remote,
+            torch.ones(2, 2) + 3
+        )
+
+    @dist_init
+    def test_nested_remote_sparse(self):
+        self._nested_remote(
+            nested_remote_sparse,
+            build_sparse_tensor() + build_sparse_tensor()
+        )
+
+    def _nested_rref(self, f, expected1, expected2):
         n = self.rank + 1
         dst_rank1 = n % self.world_size
         dst_rank2 = (n + 1) % self.world_size
         rref_of_rrefs = rpc.remote(
             worker_name(dst_rank1),
-            nested_rref,
+            f,
             args=(worker_name(dst_rank2),),
         )
 
@@ -2322,11 +2710,26 @@ def test_nested_rref(self):
         rrefs = rref_of_rrefs.to_here()
 
         self.assertEqual(len(rrefs), 2)
-        self.assertEqual(rrefs[0].to_here(), torch.ones(2, 2) + 1)
-        self.assertEqual(rrefs[1].to_here(), torch.ones(2, 2) + 2)
+        self.assertEqual(rrefs[0].to_here(), expected1)
+        self.assertEqual(rrefs[1].to_here(), expected2)
 
     @dist_init
-    def test_nested_rref_stress(self):
+    def test_nested_rref(self):
+        self._nested_rref(
+            nested_rref,
+            torch.ones(2, 2) + 1,
+            torch.ones(2, 2) + 2
+        )
+
+    @dist_init
+    def test_nested_rref_sparse(self):
+        self._nested_rref(
+            nested_rref_sparse,
+            build_sparse_tensor() * 2,
+            build_sparse_tensor() * 2
+        )
+
+    def _nested_rref_stress(self, f, expected1, expected2):
         n = self.rank + 1
         dst_rank1 = n % self.world_size
         dst_rank2 = (n + 1) % self.world_size
@@ -2335,7 +2738,7 @@ def test_nested_rref_stress(self):
             all_rrefs.append(
                 rpc.remote(
                     worker_name(dst_rank1),
-                    nested_rref,
+                    f,
                     args=(worker_name(dst_rank2),),
                 )
             )
@@ -2344,8 +2747,24 @@ def test_nested_rref_stress(self):
             rref_of_rrefs = all_rrefs[i]
             rrefs = rref_of_rrefs.to_here()
             self.assertEqual(len(rrefs), 2)
-            self.assertEqual(rrefs[0].to_here(), torch.ones(2, 2) + 1)
-            self.assertEqual(rrefs[1].to_here(), torch.ones(2, 2) + 2)
+            self.assertEqual(rrefs[0].to_here(), expected1)
+            self.assertEqual(rrefs[1].to_here(), expected2)
+
+    @dist_init
+    def test_nested_rref_stress(self):
+        self._nested_rref_stress(
+            nested_rref,
+            torch.ones(2, 2) + 1,
+            torch.ones(2, 2) + 2
+        )
+
+    @dist_init
+    def test_nested_rref_stress_sparse(self):
+        self._nested_rref_stress(
+            nested_rref_sparse,
+            build_sparse_tensor() * 2,
+            build_sparse_tensor() * 2
+        )
 
     @dist_init
     def test_multi_layer_nested_async_rpc(self):
@@ -4110,6 +4529,46 @@ def rref_error():
 
         dist.barrier()
 
+    def _trainer_func(self, rref, sparse):
+        m = MyEmbeddingBagModel(sparse=sparse)
+        loss_fn = nn.MSELoss()
+        for i in range(10):
+            outputs = m(torch.rand(10, 10).long())
+            loss_fn(outputs, torch.rand(10, 10)).backward()
+            gradient = list(m.parameters())[0].grad
+            fut = rref.rpc_async().average(rref, i, gradient)
+            gradient = fut.wait()
+            if gradient.is_sparse:
+                gradient = gradient.to_dense().double()
+            ps_gradient = rref.rpc_sync().get_gradient(rref)
+            if ps_gradient.is_sparse:
+                ps_gradient = ps_gradient.to_dense().double()
+            self.assertTrue(torch.equal(gradient, ps_gradient))
+
+    def _my_parameter_server(self, sparse):
+        ps_rref = RRef(MyParameterServer(self.world_size - 1))
+        futures = []
+        for index in range(1, self.world_size):
+            futures.append(
+                rpc.rpc_async(
+                    worker_name((self.rank + index) % self.world_size),
+                    self._trainer_func,
+                    args=(
+                        ps_rref,
+                        sparse
+                    ),
+                )
+            )
+        torch.futures.wait_all(futures)
+
+    @dist_init
+    def test_my_parameter_server(self):
+        self._my_parameter_server(False)
+
+    @dist_init
+    def test_my_parameter_server_sparse(self):
+        self._my_parameter_server(True)
+
 
 class CudaRpcTest(RpcAgentTestFixture):
 

From 710a2e933f33145e33fdf669ef9fd5fb3cb50d18 Mon Sep 17 00:00:00 2001
From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" <tmanlaibaatar@fb.com>
Date: Sun, 29 Aug 2021 14:17:54 -0700
Subject: [PATCH 332/530] [DOC] Add doc for maybe_wrap_dim (#63161)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63161

Test Plan: Imported from OSS

Reviewed By: pbelevich

Differential Revision: D30629451

Pulled By: tugsbayasgalan

fbshipit-source-id: b03f030f197e10393a8ff223b240d23c30858028
---
 aten/src/ATen/WrapDimUtils.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h
index 2768efe6e683b..13e605c920ec1 100644
--- a/aten/src/ATen/WrapDimUtils.h
+++ b/aten/src/ATen/WrapDimUtils.h
@@ -7,6 +7,9 @@
 namespace at {
 
 static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar=true) {
+  // if dim_post_expr is 0 and wrap_scalar is true, then dim must be in the range [-1, 0].
+  // This is a special case for scalar tensors and manifests in e.g. torch.sum(scalar_tensor, 0)
+  // Otherwise, dim should be in the range [-dim_post_expr, dim_post_expr-1].
   return c10::maybe_wrap_dim(dim, dim_post_expr, wrap_scalar);
 }
 

From 9db56531f708188cf59d9d4db60871405df9df69 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Sun, 29 Aug 2021 15:49:59 -0700
Subject: [PATCH 333/530] Revert D30620966: [pytorch][PR] Move
 Parallel[Native|TBB] to GHA

Test Plan: revert-hammer

Differential Revision:
D30620966 (https://github.com/pytorch/pytorch/commit/223f886032978487099da4f54e86e9e0549cde0c)

Original commit changeset: 9a23e4b3e168

fbshipit-source-id: b9248d377b9a7b850dfb3f10f3350fbc9855acfe
---
 .circleci/cimodel/data/pytorch_build_data.py  |   2 +
 .circleci/config.yml                          |  90 ++++
 .github/generated-ciflow-ruleset.json         |   6 -
 .github/scripts/generate_ci_workflows.py      |  38 +-
 ...rallelnative-linux-xenial-py3.6-gcc5.4.yml | 430 ------------------
 ...-paralleltbb-linux-xenial-py3.6-gcc5.4.yml | 430 ------------------
 6 files changed, 104 insertions(+), 892 deletions(-)
 delete mode 100644 .github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
 delete mode 100644 .github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml

diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py
index 156494589831b..5a85674d74fe9 100644
--- a/.circleci/cimodel/data/pytorch_build_data.py
+++ b/.circleci/cimodel/data/pytorch_build_data.py
@@ -7,6 +7,8 @@
             ("5.4", [  # All this subtree rebases to master and then build
                 ("3.6", [
                     ("important", [X(True)]),
+                    ("parallel_tbb", [X(True)]),
+                    ("parallel_native", [X(True)]),
                     ("pure_torch", [X(True)]),
                 ]),
             ]),
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 8df67e6fe2bc8..1bb32b5cc0a3d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7166,6 +7166,70 @@ workflows:
           build_environment: "pytorch-linux-pytorch_linux_xenial_py3_6_gcc5_4_distributed-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
           resource_class: large
+      - pytorch_linux_build:
+          name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
+          requires:
+            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
+          filters:
+            branches:
+              only:
+                - master
+                - /ci-all\/.*/
+                - /release\/.*/
+          build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-build"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+      - pytorch_linux_test:
+          name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_test
+          requires:
+            - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
+          filters:
+            branches:
+              only:
+                - master
+                - /ci-all\/.*/
+                - /release\/.*/
+          build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed_test
+          requires:
+            - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
+      - pytorch_linux_build:
+          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
+          requires:
+            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
+          filters:
+            branches:
+              only:
+                - master
+                - /ci-all\/.*/
+                - /release\/.*/
+          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+      - pytorch_linux_test:
+          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test
+          requires:
+            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
+          filters:
+            branches:
+              only:
+                - master
+                - /ci-all\/.*/
+                - /release\/.*/
+          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
+      - pytorch_linux_test:
+          name: pytorch_linux_pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed_test
+          requires:
+            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-linux-pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build
           requires:
@@ -9322,6 +9386,32 @@ workflows:
       - docker_build_job:
           name: "docker-pytorch-linux-xenial-py3.6-gcc7"
           image_name: "pytorch-linux-xenial-py3.6-gcc7"
+      - pytorch_linux_build:
+          name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
+          requires:
+            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
+          build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-build"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+      - pytorch_linux_test:
+          name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_test
+          requires:
+            - pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
+      - pytorch_linux_build:
+          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
+          requires:
+            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
+          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+      - pytorch_linux_test:
+          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test
+          requires:
+            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
+          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+          resource_class: large
       - pytorch_linux_build:
           name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build
           requires:
diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json
index 0fb27af006c85..d13561190d01f 100644
--- a/.github/generated-ciflow-ruleset.json
+++ b/.github/generated-ciflow-ruleset.json
@@ -10,8 +10,6 @@
       "linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
-      "parallelnative-linux-xenial-py3.6-gcc5.4",
-      "paralleltbb-linux-xenial-py3.6-gcc5.4",
       "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
       "periodic-linux-xenial-cuda11.1-py3.6-gcc7",
       "periodic-win-vs2019-cuda11.1-py3",
@@ -29,8 +27,6 @@
       "linux-bionic-py3.8-gcc9-coverage",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
-      "parallelnative-linux-xenial-py3.6-gcc5.4",
-      "paralleltbb-linux-xenial-py3.6-gcc5.4",
       "win-vs2019-cpu-py3"
     ],
     "ciflow/cuda": [
@@ -67,8 +63,6 @@
       "linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
-      "parallelnative-linux-xenial-py3.6-gcc5.4",
-      "paralleltbb-linux-xenial-py3.6-gcc5.4",
       "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
       "periodic-linux-xenial-cuda11.1-py3.6-gcc7"
     ],
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index dd115405e03ea..f1819dbac589d 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -272,32 +272,18 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
             labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}
         ),
     ),
-    CIWorkflow(
-        arch="linux",
-        build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4",
-        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
-        test_runner_type=LINUX_CPU_TEST_RUNNER,
-        # This is a master only job despit on_pull_request is set to True
-        on_pull_request=True,
-        ciflow_config=CIFlowConfig(
-            enabled=True,
-            trigger_action_only=True,
-            labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU},
-        ),
-    ),
-    CIWorkflow(
-        arch="linux",
-        build_environment="parallelnative-linux-xenial-py3.6-gcc5.4",
-        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
-        test_runner_type=LINUX_CPU_TEST_RUNNER,
-        # This is a master only job despit on_pull_request is set to True
-        on_pull_request=True,
-        ciflow_config=CIFlowConfig(
-            enabled=True,
-            trigger_action_only=True,
-            labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU},
-        ),
-    ),
+    # CIWorkflow(
+    #     arch="linux",
+    #     build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
+    #     test_runner_type=LINUX_CPU_TEST_RUNNER,
+    # ),
+    # CIWorkflow(
+    #     arch="linux",
+    #     build_environment="parallelnative-linux-xenial-py3.6-gcc5.4",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
+    #     test_runner_type=LINUX_CPU_TEST_RUNNER,
+    # ),
     # CIWorkflow(
     #     arch="linux",
     #     build_environment="pure_torch-linux-xenial-py3.6-gcc5.4",
diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
deleted file mode 100644
index 402ce38129052..0000000000000
--- a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
+++ /dev/null
@@ -1,430 +0,0 @@
-# @generated DO NOT EDIT MANUALLY
-
-# Template is at:    .github/templates/linux_ci_workflow.yml.j2
-# Generation script: .github/scripts/generate_ci_workflows.py
-name: parallelnative-linux-xenial-py3.6-gcc5.4
-
-on:
-  pull_request:
-    types: [unassigned]
-  push:
-    branches:
-      - master
-      - release/*
-  workflow_dispatch:
-
-env:
-  BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.6-gcc5.4
-  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
-  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
-  TORCH_CUDA_ARCH_LIST: 5.2
-  IN_CI: 1
-  # This is used for the phase of adding wheel tests only, will be removed once completed
-  IN_WHEEL_TEST: 1
-  # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
-  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
-  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
-
-concurrency:
-  group: parallelnative-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true
-
-jobs:
-  ciflow_should_run:
-    runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
-    steps:
-      - name: noop
-        run: echo running ciflow_should_run
-  calculate-docker-image:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: linux.2xlarge
-    needs: [ciflow_should_run]
-    env:
-      DOCKER_BUILDKIT: 1
-    timeout-minutes: 90
-    outputs:
-      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-          rm -f ~/.ssh/authorized_keys
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow use of git merge-base
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - name: Calculate docker image tag
-        id: calculate-tag
-        run: |
-          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
-          echo "::set-output name=docker_tag::${DOCKER_TAG}"
-          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
-      - name: Check if image should be built
-        id: check
-        env:
-          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
-          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
-        run: |
-          set -x
-          # Check if image already exists, if it does then skip building it
-          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
-            exit 0
-          fi
-          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
-            # if we're on the base branch then use the parent commit
-            MERGE_BASE=$(git rev-parse HEAD~)
-          else
-            # otherwise we're on a PR, so use the most recent base commit
-            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
-          fi
-          # Covers the case where a previous tag doesn't exist for the tree
-          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
-          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
-            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
-            exit 1
-          fi
-          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
-          # If no image exists but the hash is the same as the previous hash then we should error out here
-          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
-            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
-            echo "       contact the PyTorch team to restore the original images"
-            exit 1
-          fi
-          echo ::set-output name=rebuild::yes
-      - name: Build and push docker image
-        if: ${{ steps.check.outputs.rebuild }}
-        env:
-          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
-          DOCKER_SKIP_S3_UPLOAD: 1
-        run: |
-          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
-          cd .circleci/docker && ./build_docker.sh
-
-  build:
-    runs-on: linux.2xlarge
-    needs: [calculate-docker-image, ciflow_should_run]
-    env:
-      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
-      JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-build
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-          rm -f ~/.ssh/authorized_keys
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
-          submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - name: Pull docker image
-        run: |
-          docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
-      - name: Build PyTorch
-        run: |
-          docker run \
-            -e BUILD_ENVIRONMENT \
-            -e JOB_BASE_NAME \
-            -e MAX_JOBS="$(nproc --ignore=2)" \
-            -e SCCACHE_BUCKET \
-            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-            -e SKIP_SCCACHE_INITIALIZATION=1 \
-            -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
-            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
-            --security-opt seccomp=unconfined \
-            --cap-add=SYS_PTRACE \
-            --tty \
-            --user jenkins \
-            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-            -w /var/lib/jenkins/workspace \
-            "${DOCKER_IMAGE}" \
-            sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
-      - name: Parse ref
-        id: parse-ref
-        run: .github/scripts/parse_ref.py
-      - name: Display and upload binary build size statistics (Click Me)
-        # temporary hack: set CIRCLE_* vars, until we update
-        # tools/stats/print_test_stats.py to natively support GitHub Actions
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
-          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
-          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
-          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
-        run: |
-          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
-          export COMMIT_TIME
-          pip3 install requests
-          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Archive artifacts into zip
-        run: |
-          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
-        name: Store PyTorch Build Artifacts on S3
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            artifacts.zip
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Prune all of the docker images
-          docker system prune -af
-
-  generate-test-matrix:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: ubuntu-18.04
-    needs: [ciflow_should_run]
-    env:
-      TEST_RUNNER_TYPE: linux.2xlarge
-      ENABLE_DISTRIBUTED_TEST: 1
-      ENABLE_JIT_LEGACY_TEST: ''
-      ENABLE_MULTIGPU_TEST: ''
-      ENABLE_NOGPU_NO_AVX_TEST: ''
-      ENABLE_NOGPU_NO_AVX2_TEST: ''
-      ENABLE_SLOW_TEST: ''
-      NUM_TEST_SHARDS: 1
-      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
-      NOGPU_RUNNER_TYPE: linux.2xlarge
-      PR_BODY: ${{ github.event.pull_request.body }}
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
-      ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
-    container:
-      image: python:3.9
-    steps:
-      - name: Install dependencies
-        run: pip install typing-extensions
-      - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
-      - name: Generating test matrix
-        id: set-matrix
-        run: .github/scripts/generate_pytorch_test_matrix.py
-
-  test:
-    needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run]
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
-    env:
-      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
-      JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test
-      TEST_CONFIG: ${{ matrix.config }}
-      SHARD_NUMBER: ${{ matrix.shard }}
-      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
-      PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
-      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-          rm -f ~/.ssh/authorized_keys
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
-          submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - name: Pull docker image
-        run: |
-          docker pull "${DOCKER_IMAGE}"
-      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
-        run: |
-          bash .github/scripts/install_nvidia_utils_linux.sh
-          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
-      - name: Determine shm-size
-        run: |
-          shm_size="1g"
-          case "${BUILD_ENVIRONMENT}" in
-            *cuda*)
-              shm_size="2g"
-              ;;
-            *rocm*)
-              shm_size="8g"
-              ;;
-          esac
-          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
-      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
-        name: Download PyTorch Build Artifacts
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-      - name: Unzip artifacts
-        run: |
-          unzip -o artifacts.zip
-      - name: Output disk space left
-        run: |
-          sudo df -H
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
-      - name: Test PyTorch
-        env:
-          BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.6-gcc5.4-${{ matrix.config }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-        run: |
-          if [[ $TEST_CONFIG == 'multigpu' ]]; then
-            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
-          else
-            TEST_COMMAND=.jenkins/pytorch/test.sh
-          fi
-          if [[ $NUM_TEST_SHARDS -ne 2 ]]; then
-            export SHARD_NUMBER=0
-          fi
-          # TODO: Stop building test binaries as part of the build phase
-          # Used for GPU_FLAG since that doesn't play nice
-          # shellcheck disable=SC2086
-          docker run \
-            ${GPU_FLAG:-} \
-            -e BUILD_ENVIRONMENT \
-            -e PR_NUMBER \
-            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-            -e GITHUB_ACTIONS \
-            -e IN_CI \
-            -e IN_WHEEL_TEST \
-            -e SHARD_NUMBER \
-            -e JOB_BASE_NAME \
-            -e TEST_CONFIG \
-            -e NUM_TEST_SHARDS \
-            -e PYTORCH_IGNORE_DISABLED_ISSUES \
-            -e CONTINUE_THROUGH_ERROR \
-            -e MAX_JOBS="$(nproc --ignore=2)" \
-            -e SCCACHE_BUCKET \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
-            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
-            --security-opt seccomp=unconfined \
-            --cap-add=SYS_PTRACE \
-            --shm-size="${SHM_SIZE}" \
-            --tty \
-            --user jenkins \
-            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-            -w /var/lib/jenkins/workspace \
-            "${DOCKER_IMAGE}" \
-            sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND
-      - name: Chown workspace
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Zip test reports for upload
-        if: always()
-        env:
-          COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
-        run: |
-          # Remove any previous test reports if they exist
-          rm -f test-reports-*.zip
-          zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml'
-      - uses: actions/upload-artifact@v2
-        name: Store PyTorch Test Reports
-        if: always()
-        with:
-          name: test-reports-${{ matrix.config }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            test-reports-*.zip
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
-        name: Store PyTorch Test Reports on S3
-        if: always()
-        with:
-          name: test-reports-${{ matrix.config }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            test-reports-*.zip
-      - name: Parse ref
-        id: parse-ref
-        run: .github/scripts/parse_ref.py
-      - name: Display and upload test statistics (Click Me)
-        # temporary hack: set CIRCLE_* vars, until we update
-        # tools/stats/print_test_stats.py to natively support GitHub Actions
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test
-          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
-          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
-          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
-        shell: bash
-        run: |
-          python3 -m pip install -r requirements.txt
-          python3 -m pip install boto3==1.16.34
-          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-          # Prune all of the docker images
-          docker system prune -af
diff --git a/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml
deleted file mode 100644
index 59eceb58ea230..0000000000000
--- a/.github/workflows/generated-paralleltbb-linux-xenial-py3.6-gcc5.4.yml
+++ /dev/null
@@ -1,430 +0,0 @@
-# @generated DO NOT EDIT MANUALLY
-
-# Template is at:    .github/templates/linux_ci_workflow.yml.j2
-# Generation script: .github/scripts/generate_ci_workflows.py
-name: paralleltbb-linux-xenial-py3.6-gcc5.4
-
-on:
-  pull_request:
-    types: [unassigned]
-  push:
-    branches:
-      - master
-      - release/*
-  workflow_dispatch:
-
-env:
-  BUILD_ENVIRONMENT: paralleltbb-linux-xenial-py3.6-gcc5.4
-  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
-  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
-  TORCH_CUDA_ARCH_LIST: 5.2
-  IN_CI: 1
-  # This is used for the phase of adding wheel tests only, will be removed once completed
-  IN_WHEEL_TEST: 1
-  # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
-  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
-  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
-
-concurrency:
-  group: paralleltbb-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true
-
-jobs:
-  ciflow_should_run:
-    runs-on: ubuntu-18.04
-    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
-    steps:
-      - name: noop
-        run: echo running ciflow_should_run
-  calculate-docker-image:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: linux.2xlarge
-    needs: [ciflow_should_run]
-    env:
-      DOCKER_BUILDKIT: 1
-    timeout-minutes: 90
-    outputs:
-      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-          rm -f ~/.ssh/authorized_keys
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          # deep clone, to allow use of git merge-base
-          fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - name: Calculate docker image tag
-        id: calculate-tag
-        run: |
-          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
-          echo "::set-output name=docker_tag::${DOCKER_TAG}"
-          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
-      - name: Check if image should be built
-        id: check
-        env:
-          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
-          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
-        run: |
-          set -x
-          # Check if image already exists, if it does then skip building it
-          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
-            exit 0
-          fi
-          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
-            # if we're on the base branch then use the parent commit
-            MERGE_BASE=$(git rev-parse HEAD~)
-          else
-            # otherwise we're on a PR, so use the most recent base commit
-            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
-          fi
-          # Covers the case where a previous tag doesn't exist for the tree
-          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
-          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
-            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
-            exit 1
-          fi
-          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
-          # If no image exists but the hash is the same as the previous hash then we should error out here
-          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
-            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
-            echo "       contact the PyTorch team to restore the original images"
-            exit 1
-          fi
-          echo ::set-output name=rebuild::yes
-      - name: Build and push docker image
-        if: ${{ steps.check.outputs.rebuild }}
-        env:
-          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
-          DOCKER_SKIP_S3_UPLOAD: 1
-        run: |
-          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
-          cd .circleci/docker && ./build_docker.sh
-
-  build:
-    runs-on: linux.2xlarge
-    needs: [calculate-docker-image, ciflow_should_run]
-    env:
-      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
-      JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-build
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-          rm -f ~/.ssh/authorized_keys
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
-          submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - name: Pull docker image
-        run: |
-          docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
-      - name: Build PyTorch
-        run: |
-          docker run \
-            -e BUILD_ENVIRONMENT \
-            -e JOB_BASE_NAME \
-            -e MAX_JOBS="$(nproc --ignore=2)" \
-            -e SCCACHE_BUCKET \
-            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-            -e SKIP_SCCACHE_INITIALIZATION=1 \
-            -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
-            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
-            --security-opt seccomp=unconfined \
-            --cap-add=SYS_PTRACE \
-            --tty \
-            --user jenkins \
-            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-            -w /var/lib/jenkins/workspace \
-            "${DOCKER_IMAGE}" \
-            sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
-      - name: Parse ref
-        id: parse-ref
-        run: .github/scripts/parse_ref.py
-      - name: Display and upload binary build size statistics (Click Me)
-        # temporary hack: set CIRCLE_* vars, until we update
-        # tools/stats/print_test_stats.py to natively support GitHub Actions
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
-          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
-          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
-          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
-        run: |
-          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
-          export COMMIT_TIME
-          pip3 install requests
-          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Archive artifacts into zip
-        run: |
-          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
-        name: Store PyTorch Build Artifacts on S3
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            artifacts.zip
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Prune all of the docker images
-          docker system prune -af
-
-  generate-test-matrix:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: ubuntu-18.04
-    needs: [ciflow_should_run]
-    env:
-      TEST_RUNNER_TYPE: linux.2xlarge
-      ENABLE_DISTRIBUTED_TEST: 1
-      ENABLE_JIT_LEGACY_TEST: ''
-      ENABLE_MULTIGPU_TEST: ''
-      ENABLE_NOGPU_NO_AVX_TEST: ''
-      ENABLE_NOGPU_NO_AVX2_TEST: ''
-      ENABLE_SLOW_TEST: ''
-      NUM_TEST_SHARDS: 1
-      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
-      NOGPU_RUNNER_TYPE: linux.2xlarge
-      PR_BODY: ${{ github.event.pull_request.body }}
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
-      ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
-    container:
-      image: python:3.9
-    steps:
-      - name: Install dependencies
-        run: pip install typing-extensions
-      - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
-      - name: Generating test matrix
-        id: set-matrix
-        run: .github/scripts/generate_pytorch_test_matrix.py
-
-  test:
-    needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run]
-    strategy:
-      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
-    env:
-      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
-      JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-test
-      TEST_CONFIG: ${{ matrix.config }}
-      SHARD_NUMBER: ${{ matrix.shard }}
-      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
-      PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
-      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
-    steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-          rm -f ~/.ssh/authorized_keys
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Checkout PyTorch
-        uses: actions/checkout@v2
-        with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
-          submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
-      - name: Pull docker image
-        run: |
-          docker pull "${DOCKER_IMAGE}"
-      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
-        run: |
-          bash .github/scripts/install_nvidia_utils_linux.sh
-          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
-      - name: Determine shm-size
-        run: |
-          shm_size="1g"
-          case "${BUILD_ENVIRONMENT}" in
-            *cuda*)
-              shm_size="2g"
-              ;;
-            *rocm*)
-              shm_size="8g"
-              ;;
-          esac
-          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
-      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
-        name: Download PyTorch Build Artifacts
-        with:
-          name: ${{ env.BUILD_ENVIRONMENT }}
-      - name: Unzip artifacts
-        run: |
-          unzip -o artifacts.zip
-      - name: Output disk space left
-        run: |
-          sudo df -H
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
-      - name: Test PyTorch
-        env:
-          BUILD_ENVIRONMENT: paralleltbb-linux-xenial-py3.6-gcc5.4-${{ matrix.config }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-        run: |
-          if [[ $TEST_CONFIG == 'multigpu' ]]; then
-            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
-          else
-            TEST_COMMAND=.jenkins/pytorch/test.sh
-          fi
-          if [[ $NUM_TEST_SHARDS -ne 2 ]]; then
-            export SHARD_NUMBER=0
-          fi
-          # TODO: Stop building test binaries as part of the build phase
-          # Used for GPU_FLAG since that doesn't play nice
-          # shellcheck disable=SC2086
-          docker run \
-            ${GPU_FLAG:-} \
-            -e BUILD_ENVIRONMENT \
-            -e PR_NUMBER \
-            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-            -e GITHUB_ACTIONS \
-            -e IN_CI \
-            -e IN_WHEEL_TEST \
-            -e SHARD_NUMBER \
-            -e JOB_BASE_NAME \
-            -e TEST_CONFIG \
-            -e NUM_TEST_SHARDS \
-            -e PYTORCH_IGNORE_DISABLED_ISSUES \
-            -e CONTINUE_THROUGH_ERROR \
-            -e MAX_JOBS="$(nproc --ignore=2)" \
-            -e SCCACHE_BUCKET \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
-            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
-            --security-opt seccomp=unconfined \
-            --cap-add=SYS_PTRACE \
-            --shm-size="${SHM_SIZE}" \
-            --tty \
-            --user jenkins \
-            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-            -w /var/lib/jenkins/workspace \
-            "${DOCKER_IMAGE}" \
-            sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND
-      - name: Chown workspace
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Zip test reports for upload
-        if: always()
-        env:
-          COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
-        run: |
-          # Remove any previous test reports if they exist
-          rm -f test-reports-*.zip
-          zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml'
-      - uses: actions/upload-artifact@v2
-        name: Store PyTorch Test Reports
-        if: always()
-        with:
-          name: test-reports-${{ matrix.config }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            test-reports-*.zip
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
-        name: Store PyTorch Test Reports on S3
-        if: always()
-        with:
-          name: test-reports-${{ matrix.config }}
-          retention-days: 14
-          if-no-files-found: error
-          path:
-            test-reports-*.zip
-      - name: Parse ref
-        id: parse-ref
-        run: .github/scripts/parse_ref.py
-      - name: Display and upload test statistics (Click Me)
-        # temporary hack: set CIRCLE_* vars, until we update
-        # tools/stats/print_test_stats.py to natively support GitHub Actions
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          JOB_BASE_NAME: paralleltbb-linux-xenial-py3.6-gcc5.4-test
-          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
-          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
-          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
-        shell: bash
-        run: |
-          python3 -m pip install -r requirements.txt
-          python3 -m pip install boto3==1.16.34
-          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-          # Prune all of the docker images
-          docker system prune -af

From c5ed31e4a7550bfe5a4893b3803ee7fdf1b31f53 Mon Sep 17 00:00:00 2001
From: mingfeima <mingfei.ma@intel.com>
Date: Sun, 29 Aug 2021 18:35:37 -0700
Subject: [PATCH 334/530] add channel last support for MaxUnpool2d (#49984)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49984

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D26007051

Pulled By: VitalyFedyunin

fbshipit-source-id: 6c54751ade4092e03c1651aaa60380f7d6e92f6b
---
 aten/src/ATen/native/MaxUnpooling.cpp        | 434 +++----------------
 aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp | 385 ++++++++++++++++
 aten/src/ATen/native/cpu/MaxUnpoolKernel.h   |  16 +
 test/test_nn.py                              |  31 ++
 tools/build_variables.bzl                    |   1 +
 5 files changed, 486 insertions(+), 381 deletions(-)
 create mode 100644 aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp
 create mode 100644 aten/src/ATen/native/cpu/MaxUnpoolKernel.h

diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp
index b3c01941c73de..99874084470f4 100644
--- a/aten/src/ATen/native/MaxUnpooling.cpp
+++ b/aten/src/ATen/native/MaxUnpooling.cpp
@@ -1,90 +1,17 @@
 #include <ATen/ATen.h>
 #include <ATen/NativeFunctions.h>
-#include <ATen/Parallel.h>
-#include <tuple>
+#include <ATen/native/cpu/MaxUnpoolKernel.h>
 
 namespace at {
 namespace native {
 
-template <typename scalar_t>
-Tensor max_unpooling2d_forward_out_cpu_frame(
-    Tensor& output,
-    const Tensor& input,
-    const Tensor& indices,
-    int64_t oheight,
-    int64_t owidth) {
-  int64_t numBatch = 1;
-  int64_t dimc = 0;
-  int64_t dimh = 1;
-  int64_t dimw = 2;
-  if (input.ndimension() == 4) {
-    numBatch = input.size(0);
-    dimc++;
-    dimh++;
-    dimw++;
-  }
-  int64_t numChannels = input.size(dimc);
-  int64_t inputHeight = input.size(dimh);
-  int64_t inputWidth = input.size(dimw);
-
-  auto* rawInput = input.data_ptr<scalar_t>();
-  auto* rawIndices = indices.data_ptr<int64_t>();
-  auto* rawOutput = output.data_ptr<scalar_t>();
-
-  at::internal::lazy_init_num_threads();
-
-  for (int64_t n = 0; n < numBatch; n++) {
-    int64_t nOutputOffset = n * numChannels * owidth * oheight;
-    int64_t nInputOffset = n * numChannels * inputWidth * inputHeight;
-    int64_t k = 0;
-    bool has_error = false;
-    int64_t error_index = 0;
-#pragma omp parallel for private(k)
-    for (k = 0; k < numChannels; k++) {
-      int64_t finalOutputOffset = nOutputOffset + k * owidth * oheight;
-      int64_t finalInputOffset = nInputOffset + k * inputWidth * inputHeight;
-      scalar_t* output_p_k = rawOutput + finalOutputOffset;
-      scalar_t* input_p_k = rawInput + finalInputOffset;
-      int64_t* ind_p_k = rawIndices + finalInputOffset;
-
-      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-      int64_t maxp;
-      for (int64_t i = 0; i < inputHeight; i++) {
-        for (int64_t j = 0; j < inputWidth; j++) {
-          maxp = ind_p_k[i * inputWidth + j];
-          if (maxp < 0 || maxp >= owidth * oheight) {
-#pragma omp critical
-            {
-              has_error = true;
-              error_index = maxp;
-            }
-          } else {
-            output_p_k[maxp] = input_p_k[i * inputWidth + j];
-          }
-        }
-      }
-    }
-    if (has_error) {
-      AT_ERROR(
-          "Found an invalid max index: ",
-          error_index,
-          " (output volumes are of size ",
-          oheight,
-          "x",
-          owidth);
-      (void)error_index;
-    }
-  }
-  return output;
-}
-
-Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
+Tensor& max_unpooling2d_forward_out_cpu(
+    const Tensor& self_,
     const Tensor& indices_,
     IntArrayRef output_size,
     Tensor& output) {
   auto oheight = output_size[0];
   auto owidth = output_size[1];
-  TORCH_CHECK(output.is_contiguous(), "output must be contiguous");
   TORCH_CHECK(
       indices_.scalar_type() == at::ScalarType::Long,
       "elements in indices should be type int64");
@@ -100,8 +27,9 @@ Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
 
   TORCH_CHECK(self_.numel() > 0, "Input must be non-empty");
 
-  auto self = self_.contiguous();
-  auto indices = indices_.contiguous();
+  auto memory_format = self_.suggest_memory_format();
+  auto self = self_.contiguous(memory_format);
+  auto indices = indices_.contiguous(memory_format);
 
   if (self.ndimension() == 3) {
     int64_t numChannels = self.size(0);
@@ -109,15 +37,11 @@ Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
   } else {
     int64_t numBatch = self.size(0);
     int64_t numChannels = self.size(1);
-    output.resize_({numBatch, numChannels, oheight, owidth});
+    output.resize_({numBatch, numChannels, oheight, owidth}, memory_format);
   }
   output.zero_();
 
-  AT_DISPATCH_FLOATING_TYPES(
-      self.scalar_type(), "max_unpooling2d_forward_out_cpu_frame", ([&] {
-        max_unpooling2d_forward_out_cpu_frame<scalar_t>(
-            output, self, indices, oheight, owidth);
-      }));
+  max_unpool2d_kernel(kCPU, output, self, indices);
   return output;
 };
 
@@ -130,87 +54,6 @@ Tensor max_unpooling2d_forward_cpu(
   return output;
 }
 
-template <typename scalar_t>
-Tensor max_unpooling3d_forward_out_cpu_frame(
-    Tensor& output,
-    const Tensor& input,
-    const Tensor& indices,
-    int64_t oT,
-    int64_t oH,
-    int64_t oW) {
-  int64_t nBatch = 1;
-  int64_t dimw = 3;
-  int64_t dimh = 2;
-  int64_t dimt = 1;
-
-  if (input.ndimension() == 5) {
-    nBatch = input.size(0);
-    dimw++;
-    dimh++;
-    dimt++;
-  }
-
-  int64_t nSlices = input.size(dimt - 1);
-  int64_t iT = input.size(dimt);
-  int64_t iH = input.size(dimh);
-  int64_t iW = input.size(dimw);
-
-  scalar_t* input_data = input.data_ptr<scalar_t>();
-  scalar_t* output_data = output.data_ptr<scalar_t>();
-  int64_t* indices_data = indices.data_ptr<int64_t>();
-
-  at::internal::lazy_init_num_threads();
-
-  for (int64_t p = 0; p < nBatch; p++) {
-    int64_t inputOffset = p * nSlices * iT * iW * iH;
-    int64_t outputOffset = p * nSlices * oT * oW * oH;
-    int64_t k = 0;
-    bool has_error = false;
-    int error_index = 0;
-#pragma omp parallel for private(k)
-    for (k = 0; k < nSlices; k++) {
-      int64_t finalInputOffset = inputOffset + k * iT * iW * iH;
-      int64_t finalOutputOffset = outputOffset + k * oT * oW * oH;
-
-      scalar_t* output_p_k = output_data + finalOutputOffset;
-      scalar_t* input_p_k = input_data + finalInputOffset;
-      int64_t* ind_p_k = indices_data + finalInputOffset;
-      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-      int maxp;
-      for (int64_t t = 0; t < iT; t++) {
-        for (int64_t i = 0; i < iH; i++) {
-          for (int64_t j = 0; j < iW; j++) {
-            int64_t index = t * iH * iW + i * iW + j;
-            maxp = ind_p_k[index];
-            if (maxp < 0 || maxp >= oT * oW * oH) {
-#pragma omp critical
-              {
-                has_error = true;
-                error_index = maxp;
-              }
-            } else {
-              output_p_k[maxp] = input_p_k[index];
-            }
-          }
-        }
-      }
-      if (has_error) {
-        AT_ERROR(
-            "found an invalid max index ",
-            error_index,
-            " (output volumes are of size ",
-            oT,
-            "x",
-            oH,
-            "x",
-            oW);
-        (void)error_index;
-      }
-    }
-  }
-  return output;
-}
-
 static void max_unpooling3d_shape_check(
     const Tensor& input,
     const Tensor& gradOutput,
@@ -310,16 +153,7 @@ Tensor& max_unpooling3d_forward_out_cpu(const Tensor& self_,
   }
   output.zero_();
 
-  AT_DISPATCH_FLOATING_TYPES(
-      self.scalar_type(), "max_unpooling3d_forward_out_cpu_frame", ([&] {
-        max_unpooling3d_forward_out_cpu_frame<scalar_t>(
-            output,
-            self,
-            indices,
-            oT,
-            oH,
-            oW);
-      }));
+  max_unpool3d_kernel(kCPU, output, self, indices);
   return output;
 }
 
@@ -335,59 +169,6 @@ Tensor max_unpooling3d_forward_cpu(
   return output;
 }
 
-template <typename scalar_t>
-static void max_unpooling2d_backward_out_cpu_frame(
-    scalar_t* gradInput_p,
-    scalar_t* gradOutput_p,
-    int64_t* ind_p,
-    int64_t nslices,
-    int64_t iheight,
-    int64_t iwidth,
-    int64_t oheight,
-    int64_t owidth) {
-  bool has_error = false;
-  int64_t error_index = 0;
-  int64_t k = 0;
-
-  at::internal::lazy_init_num_threads();
-#pragma omp parallel for private(k)
-  for (k = 0; k < nslices; k++) {
-    scalar_t* gradInput_p_k = gradInput_p + k * iwidth * iheight;
-    scalar_t* gradOutput_p_k = gradOutput_p + k * owidth * oheight;
-    int64_t* ind_p_k = ind_p + k * iwidth * iheight;
-
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    int64_t i, j;
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    int64_t maxp;
-
-    for (i = 0; i < iheight; i++) {
-      for (j = 0; j < iwidth; j++) {
-        maxp = ind_p_k[i * iwidth + j]; /* retrieve position of max */
-        if (maxp < 0 || maxp >= owidth * oheight) {
-#pragma omp critical
-          {
-            has_error = true;
-            error_index = maxp;
-          }
-        }
-        gradInput_p_k[i * iwidth + j] =
-            gradOutput_p_k[maxp]; /* update gradient */
-      }
-    }
-  }
-  if (has_error) {
-    AT_ERROR(
-        "invalid max index ",
-        error_index,
-        ", owidth= ",
-        owidth,
-        ", oheight= ",
-        oheight);
-    (void)error_index;
-  }
-}
-
 Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
     const Tensor& self,
     const Tensor& indices_,
@@ -396,42 +177,24 @@ Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
   TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
   int64_t oheight = output_size[0];
   int64_t owidth = output_size[1];
-  int dimw = 2;
-  int dimh = 1;
-  int nbatch = 1;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int nslices;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int iheight;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int iwidth;
+  int64_t ndim = self.ndimension();
+  int64_t dimh = ndim == 3 ? 1 : 2;
+  int64_t dimw = ndim == 3 ? 2 : 3;
+
   TORCH_CHECK(
       indices_.scalar_type() == at::ScalarType::Long,
       "elements in indices should be type int64");
   TORCH_CHECK(
       self.sizes() == indices_.sizes(), "Input shape must match indices shape");
-
   TORCH_CHECK(output_size.size() == 2, "Output size must be 2");
 
-  /* get contiguous gradOutput and indices */
-  auto grad_output = grad_output_.contiguous();
-  auto indices = indices_.contiguous();
+  auto memory_format = self.suggest_memory_format();
+  auto grad_output = grad_output_.contiguous(memory_format);
+  auto indices = indices_.contiguous(memory_format);
 
-  /* resize */
-  grad_input.resize_as_(self);
+  grad_input.resize_(self.sizes(), memory_format);
   grad_input.zero_();
 
-  if (self.ndimension() == 4) {
-    nbatch = self.size(0);
-    dimw++;
-    dimh++;
-  }
-
-  /* sizes */
-  nslices = self.size(dimh - 1);
-  iheight = self.size(dimh);
-  iwidth = self.size(dimw);
-
   if (owidth != grad_output.size(dimw) || oheight != grad_output.size(dimh)) {
     AT_ERROR(
         "Inconsistent gradOutput size. output height = ",
@@ -443,23 +206,8 @@ Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
         "x",
         grad_output.size(dimw));
   }
-  AT_DISPATCH_FLOATING_TYPES(
-      self.scalar_type(), "max_unpooling2d_backward_out_cpu_frame", ([&] {
-        int p;
-        for (p = 0; p < nbatch; p++) {
-          auto inputOffset = p * nslices * iheight * iwidth;
-          auto outputOffset = p * nslices * oheight * owidth;
-          max_unpooling2d_backward_out_cpu_frame<scalar_t>(
-              grad_input.data_ptr<scalar_t>() + inputOffset,
-              grad_output.data_ptr<scalar_t>() + outputOffset,
-              indices.data_ptr<int64_t>() + inputOffset,
-              nslices,
-              iheight,
-              iwidth,
-              oheight,
-              owidth);
-        }
-      }));
+
+  max_unpool2d_backward_kernel(kCPU, grad_input, grad_output, indices);
   return grad_input;
 }
 
@@ -468,72 +216,14 @@ Tensor max_unpooling2d_backward_cpu(
     const Tensor& self,
     const Tensor& indices,
     IntArrayRef output_size) {
-  auto grad_input = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  at::native::max_unpooling2d_backward_out_cpu(
+  auto grad_input = at::empty({0}, self.options());
+  max_unpooling2d_backward_out_cpu(
       grad_output, self, indices, output_size, grad_input);
   return grad_input;
 }
 
-template <typename scalar_t>
-static void max_unpooling3d_backward_out_cpu_frame(
-    scalar_t* gradInput_p,
-    scalar_t* gradOutput_p,
-    int64_t* ind_p,
-    int64_t nslices,
-    int64_t iT,
-    int64_t iH,
-    int64_t iW,
-    int64_t oT,
-    int64_t oH,
-    int64_t oW) {
-  int64_t k = 0;
-  bool has_error = false;
-  int error_index = 0;
-
-  at::internal::lazy_init_num_threads();
-
-#pragma omp parallel for private(k)
-  for (k = 0; k < nslices; k++) {
-    scalar_t* gradInput_p_k = gradInput_p + k * iT * iH * iW;
-    scalar_t* gradOutput_p_k = gradOutput_p + k * oT * oH * oW;
-    int64_t* ind_p_k = ind_p + k * iT * iH * iW;
-
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    int64_t t, i, j, index;
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    int64_t maxp;
-    for (t = 0; t < iT; t++) {
-      for (i = 0; i < iH; i++) {
-        for (j = 0; j < iW; j++) {
-          index = t * iH * iW + i * iW + j;
-          maxp = ind_p_k[index]; /* retrieve position of max */
-          if (maxp < 0 || maxp >= oT * oH * oW) {
-#pragma omp critical
-            {
-              has_error = true;
-              error_index = maxp;
-            }
-          }
-          gradInput_p_k[index] = gradOutput_p_k[maxp]; /* update gradient */
-        }
-      }
-    }
-  }
-  if (has_error) {
-    AT_ERROR(
-        "invalid max index ",
-        error_index,
-        ", oT= ",
-        oT,
-        ", oW= ",
-        oW,
-        ",oH= ",
-        oH);
-    (void)error_index;
-  }
-}
-
-Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
+Tensor& max_unpooling3d_backward_out_cpu(
+    const Tensor& grad_output_,
     const Tensor& self,
     const Tensor& indices_,
     IntArrayRef output_size,
@@ -541,26 +231,17 @@ Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
     IntArrayRef padding,
     Tensor& grad_input) {
   TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
-  auto oT = output_size[0];
-  auto oH = output_size[1];
-  auto oW = output_size[2];
-  int dimw = 3;
-  int dimh = 2;
-  int dimt = 1;
-  int nbatch = 1;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int nslices;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int iT;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int iH;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int iW;
+  int64_t oT = output_size[0];
+  int64_t oH = output_size[1];
+  int64_t oW = output_size[2];
+  int64_t ndim = self.ndimension();
+  int64_t dimt = ndim == 4 ? 1 : 2;
+  int64_t dimh = ndim == 4 ? 2 : 3;
+  int64_t dimw = ndim == 4 ? 3 : 4;
 
   max_unpooling3d_shape_check(
       self, grad_output_, indices_, output_size, stride, padding);
 
-  // TODO (from THNN): check gradOutput shape
   /* get contiguous gradOutput */
   auto grad_output = grad_output_.contiguous();
   auto indices = indices_.contiguous();
@@ -568,39 +249,24 @@ Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
   /* resize */
   grad_input.resize_as_(self);
   grad_input.zero_();
-  if (self.ndimension() == 5) {
-    nbatch = self.size(0);
-    dimt++;
-    dimw++;
-    dimh++;
+
+  if (oW != grad_output.size(dimw) || oH != grad_output.size(dimh) || oT != grad_output.size(dimt)) {
+    AT_ERROR(
+        "Inconsistent gradOutput size. output depth = ",
+        oT,
+        ", output height = ",
+        oH,
+        ", output width = ",
+        oW,
+        ", gradOutput: ",
+        grad_output.size(dimt),
+        "x",
+        grad_output.size(dimh),
+        "x",
+        grad_output.size(dimw));
   }
 
-  /* sizes */
-  nslices = self.size(dimt - 1);
-  iT = self.size(dimt);
-  iH = self.size(dimh);
-  iW = self.size(dimw);
-
-  /* backprop */
-  AT_DISPATCH_FLOATING_TYPES(
-      self.scalar_type(), "max_unpooling3d_backward_out_cpu_frame", ([&] {
-        int p;
-        for (p = 0; p < nbatch; p++) {
-          int inputOffset = p * nslices * iT * iH * iW;
-          int outputOffset = p * nslices * oT * oH * oW;
-          max_unpooling3d_backward_out_cpu_frame<scalar_t>(
-              grad_input.data_ptr<scalar_t>() + inputOffset,
-              grad_output.data_ptr<scalar_t>() + outputOffset,
-              indices.data_ptr<int64_t>() + inputOffset,
-              nslices,
-              iT,
-              iH,
-              iW,
-              oT,
-              oH,
-              oW);
-        }
-      }));
+  max_unpool3d_backward_kernel(kCPU, grad_input, grad_output, indices);
   return grad_input;
 }
 
@@ -611,10 +277,16 @@ Tensor max_unpooling3d_backward_cpu(
     IntArrayRef output_size,
     IntArrayRef stride,
     IntArrayRef padding) {
-  auto grad_input = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  auto grad_input = at::empty({0}, self.options());
   at::native::max_unpooling3d_backward_out_cpu(
       grad_output, self, indices, output_size, stride, padding, grad_input);
   return grad_input;
 }
+
+DEFINE_DISPATCH(max_unpool2d_kernel);
+DEFINE_DISPATCH(max_unpool2d_backward_kernel);
+DEFINE_DISPATCH(max_unpool3d_kernel);
+DEFINE_DISPATCH(max_unpool3d_backward_kernel);
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp
new file mode 100644
index 0000000000000..5a7b03128766b
--- /dev/null
+++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp
@@ -0,0 +1,385 @@
+#include <ATen/ATen.h>
+
+#include <ATen/Dispatch.h>
+#include <ATen/Parallel.h>
+#include <ATen/native/Pool.h>
+#include <ATen/native/cpu/utils.h>
+
+namespace at { namespace native {
+
+namespace {
+
+template <typename scalar_t, bool is_3d = false>
+void cpu_max_unpool(
+    Tensor& output_,
+    const Tensor& input,
+    const Tensor& indices) {
+  auto output = output_.contiguous();
+
+  auto input_data = input.data_ptr<scalar_t>();
+  auto indices_data = indices.data_ptr<int64_t>();
+  auto output_data = output.data_ptr<scalar_t>();
+
+  // NB: input tensor dimensions:
+  // MaxUnpool2d:
+  //    dim = 3: CHW
+  //    dim = 4: NCHW
+  // MaxUnpool3d:
+  //    dim = 4: CDHW
+  //    dim = 5: NCDHW
+
+  int64_t numel = input.numel();
+  int64_t ndim = input.ndimension();
+
+  // treat batch size and channels as one dimension
+  // and the feature map as another dimension
+  int64_t channels, output_depth, output_height, output_width;
+  if (is_3d) {
+    TORCH_CHECK(ndim == 4 || ndim == 5, "MaxUnpool3d: expect input to be 4d or 5d tensor.");
+    channels = ndim == 4 ? input.size(0) : input.size(0) * input.size(1);
+    output_depth = output.size(-3);
+    output_height = output.size(-2);
+    output_width = output.size(-1);
+  } else {
+    TORCH_CHECK(ndim == 3 || ndim == 4, "MaxUnpool2d: expect input to be 3d or 4d tensor.");
+    channels = ndim == 3 ? input.size(0) : input.size(0) * input.size(1);
+    output_depth = 1;
+    output_height = output.size(-2);
+    output_width = output.size(-1);
+  }
+  int64_t input_image_size = numel / channels;
+  int64_t output_image_size = output.numel() / channels;
+
+  bool has_error = false;
+  int64_t error_index = 0;
+
+  // parallel on dim N, C, D, H, W: [channels, input_image_size]
+  at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
+    int64_t c = 0;
+    int64_t ip = 0;
+    data_index_init(begin, c, channels, ip, input_image_size);
+
+    for (int64_t i = begin; i < end; i++) {
+      scalar_t* output_ptr = output_data + c * output_image_size;
+
+      int64_t maxp = indices_data[i];
+      if (maxp < 0 || maxp >= output_image_size) {
+        #pragma omp critical
+        {
+          has_error = true;
+          error_index = maxp;
+        }
+      } else {
+        output_ptr[maxp] = input_data[i];
+      }
+
+      // move on to next input index
+      data_index_step(c, channels, ip, input_image_size);
+    }
+  });
+
+  if (has_error) {
+    if (is_3d) {
+      AT_ERROR("Found an invalid max index: ", error_index,
+          " (output volumes are of size ", output_depth,
+          "x", output_height, "x", output_width);
+      (void)error_index;
+    } else {
+      AT_ERROR("Found an invalid max index: ", error_index,
+          " (output volumes are of size ", output_height,
+          "x", output_width);
+      (void)error_index;
+    }
+  }
+
+  if (!output_.is_contiguous()) {
+    output_.copy_(output);
+  }
+}
+
+template <typename scalar_t>
+void cpu_max_unpool_channels_last(
+    Tensor& output_,
+    const Tensor& input,
+    const Tensor& indices) {
+  TORCH_CHECK(input.ndimension() == 4,
+              "max_unpool2d with channels last format supports tensors with 4 dims");
+  auto memory_format = at::MemoryFormat::ChannelsLast;
+  auto output = output_.contiguous(memory_format);
+
+  auto input_data = input.data_ptr<scalar_t>();
+  auto indices_data = indices.data_ptr<int64_t>();
+  auto output_data = output.data_ptr<scalar_t>();
+
+  int64_t nbatch = input.size(0);
+  int64_t channels = input.size(1);
+  int64_t input_height = input.size(2);
+  int64_t input_width = input.size(3);
+  int64_t output_height = output.size(2);
+  int64_t output_width = output.size(3);
+  int64_t input_image_size = input_height * input_width;
+  int64_t output_image_size = output_height * output_width;
+
+  bool has_error = false;
+  int64_t error_index = 0;
+
+  // parallel on dim N, H, W
+  at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) {
+    int64_t n = 0;
+    int64_t ip = 0;
+    data_index_init(begin, n, nbatch, ip, input_image_size);
+
+    for (int64_t i = begin; i < end; i++) {
+      scalar_t* input_ptr = input_data + i * channels;
+      int64_t* indices_ptr = indices_data + i * channels;
+      scalar_t* output_ptr = output_data + n * output_image_size * channels;
+
+      // can't do scatter on avx2 (only available on avx512)
+      for (int64_t c = 0; c < channels; c++) {
+        int64_t maxp = indices_ptr[c];
+        if (maxp < 0 || maxp >= output_image_size) {
+          #pragma omp critical
+          {
+            has_error = true;
+            error_index = maxp;
+          }
+        } else {
+          output_ptr[maxp * channels + c] = input_ptr[c];
+        }
+      }
+
+      // move on to next input index
+      data_index_step(n, nbatch, ip, input_image_size);
+    }
+  });
+
+  if (has_error) {
+    AT_ERROR("Found an invalid max index: ", error_index,
+        " (output volumes are of size ", output_height,
+        "x", output_width);
+    (void)error_index;
+  }
+
+  if (!output_.is_contiguous(memory_format)) {
+    output_.copy_(output);
+  }
+}
+
+template <typename scalar_t, bool is_3d = false>
+void cpu_max_unpool_backward(
+    Tensor& grad_input_,
+    const Tensor& grad_output,
+    const Tensor& indices) {
+  auto grad_input = grad_input_.contiguous();
+
+  auto grad_output_data = grad_output.data_ptr<scalar_t>();
+  auto indices_data = indices.data_ptr<int64_t>();
+  auto grad_input_data = grad_input.data_ptr<scalar_t>();
+
+  int64_t numel = grad_input.numel();
+  int64_t ndim = grad_output.ndimension();
+
+  // treat batch size and channels as one dimension
+  // and the feature map as another dimension
+  int64_t channels, output_depth, output_height, output_width;
+  if (is_3d) {
+    TORCH_CHECK(ndim == 4 || ndim == 5, "MaxUnpool3d_backward: expect grad_output to be 4d or 5d tensor.");
+    channels = ndim == 4 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
+    output_depth = grad_output.size(-3);
+    output_height = grad_output.size(-2);
+    output_width = grad_output.size(-1);
+  } else {
+    TORCH_CHECK(ndim == 3 || ndim == 4, "MaxUnpool2d_backward: expect grad_output to be 3d or 4d tensor.");
+    channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
+    output_depth = 1;
+    output_height = grad_output.size(-2);
+    output_width = grad_output.size(-1);
+  }
+  int64_t input_image_size = numel / channels;
+  int64_t output_image_size = grad_output.numel() / channels;
+
+  bool has_error = false;
+  int64_t error_index = 0;
+
+  // parallel on dim N, C, D, H, W
+  at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
+    int64_t c = 0;
+    int64_t ip = 0;
+    data_index_init(begin, c, channels, ip, input_image_size);
+
+    for (int64_t i = begin; i < end; i++) {
+      scalar_t* grad_output_ptr = grad_output_data + c * output_image_size;
+
+      int64_t maxp = indices_data[i];
+      if (maxp < 0 || maxp >= output_image_size) {
+        #pragma omp critical
+        {
+          has_error = true;
+          error_index = maxp;
+        }
+      } else {
+        grad_input_data[i] = grad_output_ptr[maxp];
+      }
+
+      // move on to next input index
+      data_index_step(c, channels, ip, input_image_size);
+    }
+  });
+
+  if (has_error) {
+    if (is_3d) {
+      AT_ERROR("invalid max index ", error_index,
+          ", odepth= ", output_depth,
+          ", owidth= ", output_width,
+          ", oheight= ", output_height);
+      (void)error_index;
+    } else {
+      AT_ERROR("invalid max index ", error_index,
+          ", owidth= ", output_width,
+          ", oheight= ", output_height);
+      (void)error_index;
+    }
+  }
+
+  if (!grad_input_.is_contiguous()) {
+    grad_input_.copy_(grad_input);
+  }
+}
+
+template <typename scalar_t>
+void cpu_max_unpool_backward_channels_last(
+    Tensor& grad_input_,
+    const Tensor& grad_output,
+    const Tensor& indices) {
+  TORCH_CHECK(grad_output.ndimension() == 4,
+      "max_unpool2d  backward with channels last format supports tensors with 4 dims.");
+  auto memory_format = at::MemoryFormat::ChannelsLast;
+  auto grad_input = grad_input_.contiguous(memory_format);
+
+  auto grad_input_data = grad_input.data_ptr<scalar_t>();
+  auto grad_output_data = grad_output.data_ptr<scalar_t>();
+  auto indices_data = indices.data_ptr<int64_t>();
+
+  int64_t nbatch = grad_input.size(0);
+  int64_t channels = grad_input.size(1);
+  int64_t input_height = grad_input.size(2);
+  int64_t input_width = grad_input.size(3);
+  int64_t output_height = grad_output.size(2);
+  int64_t output_width = grad_output.size(3);
+  int64_t input_image_size = input_height * input_width;
+  int64_t output_image_size = output_height * output_width;
+
+  bool has_error = false;
+  int64_t error_index = 0;
+
+  // parallel on dim N, H, W
+  at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) {
+    int64_t n = 0;
+    int64_t ip = 0;
+    data_index_init(begin, n, nbatch, ip, input_image_size);
+
+    for (int64_t i = begin; i < end; i++) {
+      scalar_t* grad_output_ptr = grad_output_data + n * output_image_size * channels;
+      scalar_t* grad_input_ptr = grad_input_data + i * channels;
+      int64_t* indices_ptr = indices_data + i * channels;
+
+      for (int64_t c = 0; c < channels; c++) {
+        int64_t maxp = indices_ptr[c];
+        if (maxp < 0 || maxp >= output_image_size) {
+          #pragma omp critical
+          {
+            has_error = true;
+            error_index = maxp;
+          }
+        } else {
+          grad_input_ptr[c] = grad_output_ptr[maxp * channels + c];
+        }
+      }
+
+      // move on to next input index
+      data_index_step(n, nbatch, ip, input_image_size);
+    }
+  });
+
+  if (has_error) {
+    AT_ERROR("invalid max index ", error_index,
+        ", owidth= ", output_width,
+        ", oheight= ", output_height);
+    (void)error_index;
+  }
+
+  if (!grad_input_.is_contiguous(memory_format)) {
+    grad_input_.copy_(grad_input);
+  }
+}
+
+void max_unpool2d_kernel_impl(
+    Tensor& output,
+    const Tensor& input,
+    const Tensor& indices) {
+  switch(input.suggest_memory_format()) {
+    case at::MemoryFormat::Contiguous: {
+      AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool2d", [&] {
+        cpu_max_unpool<scalar_t, /*is_3d*/false>(output, input, indices);
+      });
+      break;
+    }
+    case at::MemoryFormat::ChannelsLast: {
+      AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool2d_channels_last", [&] {
+        cpu_max_unpool_channels_last<scalar_t>(output, input, indices);
+      });
+      break;
+    }
+    default:
+      TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
+  }
+}
+
+void max_unpool3d_kernel_impl(
+    Tensor& output,
+    const Tensor& input,
+    const Tensor& indices) {
+  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool3d", [&] {
+    cpu_max_unpool<scalar_t, /*is_3d*/true>(output, input, indices);
+  });
+}
+
+void max_unpool2d_backward_kernel_impl(
+    Tensor& grad_input,
+    const Tensor& grad_output,
+    const Tensor& indices) {
+  switch(grad_output.suggest_memory_format()) {
+    case at::MemoryFormat::Contiguous: {
+      AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward", [&] {
+        cpu_max_unpool_backward<scalar_t, /*is_3d*/false>(grad_input, grad_output, indices);
+      });
+      break;
+    }
+    case at::MemoryFormat::ChannelsLast: {
+      AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward_channels_last", [&] {
+        cpu_max_unpool_backward_channels_last<scalar_t>(grad_input, grad_output, indices);
+      });
+      break;
+    }
+    default:
+      TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
+  }
+}
+
+void max_unpool3d_backward_kernel_impl(
+    Tensor& grad_input,
+    const Tensor& grad_output,
+    const Tensor& indices) {
+  AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool3d_backward", [&] {
+    cpu_max_unpool_backward<scalar_t, /*is_3d*/true>(grad_input, grad_output, indices);
+  });
+}
+
+} // anonymous namespace
+
+REGISTER_DISPATCH(max_unpool2d_kernel, &max_unpool2d_kernel_impl);
+REGISTER_DISPATCH(max_unpool2d_backward_kernel, &max_unpool2d_backward_kernel_impl);
+REGISTER_DISPATCH(max_unpool3d_kernel, &max_unpool3d_kernel_impl);
+REGISTER_DISPATCH(max_unpool3d_backward_kernel, &max_unpool3d_backward_kernel_impl);
+
+}} // at::native
diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.h b/aten/src/ATen/native/cpu/MaxUnpoolKernel.h
new file mode 100644
index 0000000000000..00fbeb64213d6
--- /dev/null
+++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.h
@@ -0,0 +1,16 @@
+#include <ATen/ATen.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/native/DispatchStub.h>
+
+#pragma once
+
+namespace at { namespace native {
+
+using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&);
+
+DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel);
+DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_backward_kernel);
+DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel);
+DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_backward_kernel);
+
+}} // at::native
diff --git a/test/test_nn.py b/test/test_nn.py
index 4e01c94d4c971..7d26246786c6c 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -6186,6 +6186,37 @@ def test_MaxUnpool2d_output_size(self):
                 else:
                     self.assertRaises(ValueError, lambda: mu(output_small, indices_small, (h, w)))
 
+    def test_max_unpool2d_nhwc_cpu(self):
+        input = torch.randn(2, 10, 9, 9).float().cpu()
+        input = input.contiguous(memory_format=torch.channels_last)
+        ref_input = input.clone().contiguous()
+
+        pool = nn.MaxPool2d(3, stride=2, return_indices=True).cpu()
+        ref_pool = nn.MaxPool2d(3, stride=2, return_indices=True).cpu()
+
+        out, ind = pool(input)
+        ref_out, ref_ind = ref_pool(ref_input)
+        out.requires_grad_()
+        ref_out.requires_grad_()
+
+        unpool = nn.MaxUnpool2d(3, stride=2).cpu()
+        ref_unpool = nn.MaxUnpool2d(3, stride=2).cpu()
+
+        upout = unpool(out, ind)
+        ref_upout = ref_unpool(ref_out, ref_ind)
+
+        grad = torch.randn(upout.size()).float().cpu()
+        grad = grad.contiguous(memory_format=torch.channels_last)
+        ref_grad = grad.clone().contiguous()
+
+        upout.backward(grad)
+        ref_upout.backward(ref_grad)
+
+        self.assertTrue(upout.is_contiguous(memory_format=torch.channels_last))
+        self.assertTrue(ref_upout.is_contiguous())
+        self.assertTrue(torch.allclose(upout, ref_upout))
+        self.assertTrue(torch.allclose(out.grad, ref_out.grad))
+
     def test_container_copy(self):
         class Model(nn.Module):
             def __init__(self):
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index b2a1016118d28..34846b5d6c7b3 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -907,6 +907,7 @@ aten_native_source_codegen_list = [
     "aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp",
     "aten/src/ATen/native/cpu/MaxPooling.cpp",
     "aten/src/ATen/native/cpu/MaxPoolKernel.cpp",
+    "aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp",
     "aten/src/ATen/native/cpu/MultinomialKernel.cpp",
     "aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp",
     "aten/src/ATen/native/cpu/PowKernel.cpp",

From 29ad84f2523346b4b03ac99fa04203fe81d2c4e3 Mon Sep 17 00:00:00 2001
From: Mike Ruberry <mruberry@devfair044.h1.fair>
Date: Sun, 29 Aug 2021 19:37:06 -0700
Subject: [PATCH 335/530] Removes beta warning from the special module
 documentation (#64148)

Summary:
Updates documentation per feature review. torch.special is now stable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64148

Reviewed By: ngimel

Differential Revision: D30632049

Pulled By: mruberry

fbshipit-source-id: 8f6148ec7737e7b3a90644eeca23eb217eda513d
---
 docs/source/special.rst | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docs/source/special.rst b/docs/source/special.rst
index 06961dbeaaab6..b74d833c96324 100644
--- a/docs/source/special.rst
+++ b/docs/source/special.rst
@@ -6,10 +6,6 @@ torch.special
 
 The torch.special module, modeled after SciPy's `special <https://docs.scipy.org/doc/scipy/reference/special.html>`_ module.
 
-This module is in BETA. New functions are still being added, and some
-functions may change in future PyTorch releases. See the documentation of each
-function for details.
-
 .. automodule:: torch.special
     :noindex:
 

From 44e3ed88c9a1bd9ee6b0168ba5271a2c6b006cc8 Mon Sep 17 00:00:00 2001
From: Zafar Takhirov <zaf@fb.com>
Date: Sun, 29 Aug 2021 20:28:32 -0700
Subject: [PATCH 336/530] [quant] AO migration of the `quantize.py` (#64086)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64086

AO Team is migrating the existing torch.quantization into torch.ao.quantization. We are doing it one file at a time to make sure that the internal callsites are updated properly.

This migrates the `quantize.py` from torch.quantization to `torch.ao.quantization`.

At this point both locations will be supported. Eventually the torch.quantization will be deprecated.

Test Plan: `buck test mode/opt //caffe2/test:quantization`

Reviewed By: jerryzh168, raghuramank100

Differential Revision: D30055886

fbshipit-source-id: 8ef7470f9fa640c0042bef5bb843e7a05ecd0b9f
---
 test/quantization/ao_migration/__init__.py    |   0
 .../ao_migration/test_quantize_py.py          |  63 ++
 test/test_quantization.py                     |   2 +
 torch/ao/quantization/__init__.py             |   0
 torch/ao/quantization/quantize.py             | 580 +++++++++++++++++
 torch/quantization/fx/convert.py              |   2 +-
 torch/quantization/fx/prepare.py              |   2 +-
 .../quantization/fx/quantization_patterns.py  |   2 +-
 torch/quantization/fx/utils.py                |   2 +-
 torch/quantization/quantize.py                | 604 +-----------------
 10 files changed, 676 insertions(+), 581 deletions(-)
 create mode 100644 test/quantization/ao_migration/__init__.py
 create mode 100644 test/quantization/ao_migration/test_quantize_py.py
 create mode 100644 torch/ao/quantization/__init__.py
 create mode 100644 torch/ao/quantization/quantize.py

diff --git a/test/quantization/ao_migration/__init__.py b/test/quantization/ao_migration/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/test/quantization/ao_migration/test_quantize_py.py b/test/quantization/ao_migration/test_quantize_py.py
new file mode 100644
index 0000000000000..086364bef9d56
--- /dev/null
+++ b/test/quantization/ao_migration/test_quantize_py.py
@@ -0,0 +1,63 @@
+from torch.testing._internal.common_utils import TestCase
+
+import importlib
+from typing import List
+
+
+class AOMigrationTestCase(TestCase):
+    def _test_package_import(self, package_name: str):
+        r"""Tests the module import by making sure that all the internals match
+        (except the dunder methods)."""
+        old_module = importlib.import_module(f'torch.quantization.{package_name}')
+        new_module = importlib.import_module(f'torch.ao.quantization.{package_name}')
+        old_module_dir = set(dir(old_module))
+        new_module_dir = set(dir(new_module))
+        # Remove magic modules from checking in subsets
+        for el in list(old_module_dir):
+            if el[:2] == '__' and el[-2:] == '__':
+                old_module_dir.remove(el)
+        assert (old_module_dir <= new_module_dir), \
+            f"Importing {old_module} vs. {new_module} does not match: " \
+            f"{old_module_dir - new_module_dir}"
+
+    def _test_function_import(self, package_name: str, function_list: List[str]):
+        r"""Tests individual function list import by comparing the functions
+        and their hashes."""
+        old_location = importlib.import_module(f'torch.quantization.{package_name}')
+        new_location = importlib.import_module(f'torch.ao.quantization.{package_name}')
+        for fn_name in function_list:
+            old_function = getattr(old_location, fn_name)
+            new_function = getattr(new_location, fn_name)
+            assert old_function == new_function, f"Functions don't match: {fn_name}"
+            assert hash(old_function) == hash(new_function), \
+                f"Hashes don't match: {old_function}({hash(old_function)}) vs. " \
+                f"{new_function}({hash(new_function)})"
+
+
+class TestAOMigrationQuantizePy(AOMigrationTestCase):
+    def test_package_import(self):
+        self._test_package_import('quantize')
+
+    def test_function_import(self):
+        function_list = [
+            '_convert',
+            '_observer_forward_hook',
+            '_propagate_qconfig_helper',
+            '_remove_activation_post_process',
+            '_remove_qconfig',
+            'add_observer_',
+            'add_quant_dequant',
+            'convert',
+            'get_observer_dict',
+            'get_unique_devices_',
+            'is_activation_post_process',
+            'prepare',
+            'prepare_qat',
+            'propagate_qconfig_',
+            'quantize',
+            'quantize_dynamic',
+            'quantize_qat',
+            'register_activation_post_process_hook',
+            'swap_module',
+        ]
+        self._test_function_import('quantize', function_list)
diff --git a/test/test_quantization.py b/test/test_quantization.py
index 867151373a5b6..ffc242ed77e33 100644
--- a/test/test_quantization.py
+++ b/test/test_quantization.py
@@ -100,6 +100,8 @@
 from quantization.jit.test_fusion_passes import TestFusionPasses  # noqa: F401
 from quantization.jit.test_deprecated_jit_quant import TestDeprecatedJitQuantized  # noqa: F401
 
+# AO Migration tests
+from quantization.ao_migration.test_quantize_py import TestAOMigrationQuantizePy  # noqa: F401
 
 if __name__ == '__main__':
     run_tests()
diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py
new file mode 100644
index 0000000000000..92a794ed7b631
--- /dev/null
+++ b/torch/ao/quantization/quantize.py
@@ -0,0 +1,580 @@
+import copy
+import itertools
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.quantized as nnq
+from torch.nn.intrinsic import _FusedModule
+
+# Import the duplicated API
+from torch.quantization.quantization_mappings import (
+    get_default_dynamic_quant_module_mappings,
+    get_default_static_quant_module_mappings,
+    get_default_qat_module_mappings,
+    get_default_qconfig_propagation_list,
+    no_observer_set,
+    _has_special_act_post_process,
+    _get_special_act_post_process,
+)
+from torch.quantization.stubs import DeQuantStub, QuantWrapper
+from torch.quantization.qconfig import (
+    add_module_to_qconfig_obs_ctr,
+    default_dynamic_qconfig,
+    float16_dynamic_qconfig,
+    float_qparams_weight_only_qconfig)
+
+def is_activation_post_process(module):
+    return (isinstance(module, torch.quantization.ObserverBase) or
+            isinstance(module, torch.quantization.FakeQuantizeBase))
+
+def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None,
+                              qconfig_parent=None, prefix=''):
+    r"""This is a helper function for `propagate_qconfig_`
+
+    Args:
+        module: input module
+        qconfig_dict: dictionary that maps from name of submodule to quantization
+                     configuration
+        allow_list: list of quantizable modules
+        qconfig_parent: quantization config of parent module, we will fallback to
+                       this config when there is no specified config for current
+                       module
+        prefix: corresponding prefix of the current module, used as key in
+                qconfig_dict
+
+    Return:
+        None, module is modified inplace with qconfig attached
+    """
+    # TODO: Add test
+    if allow_list is None:
+        allow_list = get_default_qconfig_propagation_list()
+
+    module_qconfig = qconfig_dict.get(type(module), qconfig_parent)
+    module_qconfig = qconfig_dict.get(prefix, module_qconfig)
+    module_qconfig = getattr(module, 'qconfig', module_qconfig)
+
+    torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module)
+
+    qconfig_with_device_check = add_module_to_qconfig_obs_ctr(module_qconfig, module)
+    module.qconfig = qconfig_with_device_check
+
+    for name, child in module.named_children():
+        module_prefix = prefix + '.' + name if prefix else name
+        _propagate_qconfig_helper(child, qconfig_dict, allow_list,
+                                  qconfig_with_device_check, module_prefix)
+
+# TODO(jerryzh): expose allow_list
+def propagate_qconfig_(module, qconfig_dict=None, allow_list=None):
+    r"""Propagate qconfig through the module hierarchy and assign `qconfig`
+    attribute on each leaf module
+
+    Args:
+        module: input module
+        qconfig_dict: dictionary that maps from name or type of submodule to
+            quantization configuration, qconfig applies to all submodules of a
+            given module unless qconfig for the submodules are specified (when
+            the submodule already has qconfig attribute)
+
+    Return:
+        None, module is modified inplace with qconfig attached
+    """
+    if qconfig_dict is None:
+        qconfig_dict = {}
+    _propagate_qconfig_helper(module, qconfig_dict, allow_list)
+
+def _observer_forward_hook(self, input, output):
+    r"""Forward hook that calls observer on the output
+    """
+    return self.activation_post_process(output)
+
+def register_activation_post_process_hook(module):
+    assert hasattr(module, 'activation_post_process'), \
+        'Expect activation_post_process attribut already attached to the module'
+    return module.register_forward_hook(_observer_forward_hook)
+
+def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=None, device=None, custom_module_class_mapping=None):
+    r"""Add observer for the leaf child of the module.
+
+    This function insert observer module to all leaf child module that
+    has a valid qconfig attribute.
+
+    Args:
+        module: input module with qconfig attributes for all the leaf modules that we want to quantize
+        device: parent device, if any
+        non_leaf_module_list: list of non-leaf modules we want to add observer
+
+    Return:
+        None, module is modified inplace with added observer modules and forward_hooks
+    """
+    if qconfig_propagation_list is None:
+        qconfig_propagation_list = get_default_qconfig_propagation_list()
+
+    if custom_module_class_mapping is None:
+        custom_module_class_mapping = {}
+
+    # respect device affinity when adding observers
+    if device is None:
+        devices = get_unique_devices_(module)
+        assert len(devices) <= 1, (
+            "add_observer_ only works with cpu or single-device CUDA modules, "
+            "but got devices {}".format(devices)
+        )
+        device = next(iter(devices)) if len(devices) > 0 else None
+
+    def get_activation_post_process(qconfig, device, special_act_post_process=None):
+        activation = qconfig.activation() if special_act_post_process is None else special_act_post_process()
+        if device is not None:
+            activation.to(device)
+        return activation
+
+    def needs_observation(m):
+        return hasattr(m, 'qconfig') and m.qconfig is not None
+
+    def insert_activation_post_process(m, special_act_post_process=None):
+        """ Adds an activation post process module and register
+        a post hook that calls the module
+        """
+        # We don't insert observer/fake_quantize for DeQuantStub
+        if needs_observation(m) and not isinstance(m, DeQuantStub):
+            # observer and hook will be gone after we swap the module
+            m.add_module('activation_post_process', get_activation_post_process(
+                m.qconfig, device, special_act_post_process))
+            # Register observer as the first entry in the hook list
+            # All post forward hooks are preserved and will be executed after the observer before convert
+            handle = register_activation_post_process_hook(m)
+            m._forward_hooks.move_to_end(handle.id, last=False)
+
+    for name, child in module.named_children():
+        if type(child) in [nnq.FloatFunctional, nnq.QFunctional]:
+            if needs_observation(child):
+                child.activation_post_process = get_activation_post_process(child.qconfig, device)
+        elif isinstance(child, _FusedModule):
+            # activation_post_process are now added directly to nn.Sequentail/_FusedModule
+            if needs_observation(child):
+                insert_activation_post_process(child)
+        elif _has_special_act_post_process(child):
+            special_act_post_process = _get_special_act_post_process(child)
+            insert_activation_post_process(child, special_act_post_process)
+        elif non_leaf_module_list is not None and type(child) in non_leaf_module_list:
+            if needs_observation(child):
+                insert_activation_post_process(child)
+        elif needs_observation(child) and type(child) in custom_module_class_mapping:
+            observed_child = custom_module_class_mapping[type(child)].from_float(child)
+            setattr(module, name, observed_child)
+            # TODO: These are the modules that cannot be observed
+            #       Once there are more, we should move them to a separate list
+            if custom_module_class_mapping[type(child)] not in no_observer_set():
+                insert_activation_post_process(observed_child)
+        else:
+            add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping)
+
+    # Insert observers only for leaf nodes, note that this observer is for
+    # the output of the module, for input QuantStub will observe them
+    if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \
+       and type(module) in qconfig_propagation_list:
+        insert_activation_post_process(module)
+
+def get_unique_devices_(module):
+    return {p.device for p in module.parameters()} | \
+        {p.device for p in module.buffers()}
+
+def add_quant_dequant(module):
+    r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig
+    Note that this function will modify the children of module inplace and it
+    can return a new module which wraps the input module as well.
+
+    Args:
+        module: input module with qconfig attributes for all the leaf modules
+        that we want to quantize
+
+    Return:
+        Either the inplace modified module with submodules wrapped in
+        `QuantWrapper` based on qconfig or a new `QuantWrapper` module which
+        wraps the input module, the latter case only happens when the input
+        module is a leaf module and we want to quantize it.
+    """
+    if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig:
+        return QuantWrapper(module)
+
+    for name, child in module.named_children():
+        module._modules[name] = add_quant_dequant(child)
+    return module
+
+def prepare(model, inplace=False, allow_list=None,
+            observer_non_leaf_module_list=None,
+            prepare_custom_config_dict=None):
+    r"""Prepares a copy of the model for quantization calibration or quantization-aware training.
+
+    Quantization configuration should be assigned preemptively
+    to individual submodules in `.qconfig` attribute.
+
+    The model will be attached with observer or fake quant modules, and qconfig
+    will be propagated.
+
+    Args:
+        `model`: input model to be modified in-place
+        `inplace`: carry out model transformations in-place, the original module is mutated
+        `allow_list`: list of quantizable modules
+        `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer
+        `prepare_custom_config_dict`: customization configuration dictionary for prepare function
+
+    .. code-block:: python
+
+       # Example of prepare_custom_config_dict:
+       prepare_custom_config_dict = {
+           # user will manually define the corresponding observed
+           # module class which has a from_float class method that converts
+           # float custom module to observed custom module
+           "float_to_observed_custom_module_class": {
+               CustomModule: ObservedCustomModule
+           }
+        }
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.prepare")
+    if prepare_custom_config_dict is None:
+        prepare_custom_config_dict = {}
+    custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", {})
+
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    # TODO: remove allow_list
+    qconfig_propagation_list = allow_list
+    if qconfig_propagation_list is None:
+        qconfig_propagation_list = get_default_qconfig_propagation_list()
+    propagate_qconfig_(model, qconfig_dict=None)
+
+    # sanity check common API misusage
+    if not any(hasattr(m, 'qconfig') and m.qconfig for m in model.modules()):
+        warnings.warn("None of the submodule got qconfig applied. Make sure you "
+                      "passed correct configuration through `qconfig_dict` or "
+                      "by assigning the `.qconfig` attribute directly on submodules")
+
+    add_observer_(
+        model, qconfig_propagation_list, observer_non_leaf_module_list,
+        custom_module_class_mapping=custom_module_class_mapping)
+    return model
+
+def _remove_activation_post_process(module):
+    # TODO: maybe we should change activation_post_process to _activation_post_process
+    # to prevent it from being used by user
+    if hasattr(module, 'activation_post_process') and \
+       is_activation_post_process(module.activation_post_process):
+        delattr(module, 'activation_post_process')
+
+    # remove activation_post_proceess hook
+    handle_ids_to_remove = set()
+    for handle_id, hook_fn in module._forward_hooks.items():
+        if hook_fn is _observer_forward_hook:
+            handle_ids_to_remove.add(handle_id)
+    for handle_id in handle_ids_to_remove:
+        module._forward_hooks.pop(handle_id)
+
+# TODO: rename to something more general
+def _remove_qconfig(module):
+    r"""Clean up the qconfig left in the module so that new qconfig can be
+    propagated.
+
+    Args:
+        module: module to be cleaned up
+    """
+    for child in module.children():
+        _remove_qconfig(child)
+
+    if hasattr(module, "qconfig"):
+        del module.qconfig
+
+    _remove_activation_post_process(module)
+
+def quantize(model, run_fn, run_args, mapping=None, inplace=False):
+    r"""Quantize the input float model with post training static quantization.
+
+    First it will prepare the model for calibration, then it calls
+    `run_fn` which will run the calibration step, after that we will
+    convert the model to a quantized model.
+
+    Args:
+        model: input float model
+        run_fn: a calibration function for calibrating the prepared model
+        run_args: positional arguments for `run_fn`
+        inplace: carry out model transformations in-place, the original module is mutated
+        mapping: correspondence between original module types and quantized counterparts
+
+    Return:
+        Quantized model.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize")
+    if mapping is None:
+        mapping = get_default_static_quant_module_mappings()
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.eval()
+    prepare(model, inplace=True)
+    run_fn(model, *run_args)
+    convert(model, mapping, inplace=True)
+    return model
+
+def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8,
+                     mapping=None, inplace=False):
+    r"""Converts a float model to dynamic (i.e. weights-only) quantized model.
+
+    Replaces specified modules with dynamic weight-only quantized versions and output the quantized model.
+
+    For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization
+    by default is performed for layers with large weights size - i.e. Linear and RNN variants.
+
+    Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`.
+    If `qconfig` is provided, the `dtype` argument is ignored.
+
+    Args:
+        model: input model
+        qconfig_spec: Either:
+
+            - A dictionary that maps from name or type of submodule to quantization
+              configuration, qconfig applies to all submodules of a given
+              module unless qconfig for the submodules are specified (when the
+              submodule already has qconfig attribute). Entries in the dictionary
+              need to be QConfigDynamic instances.
+
+            - A set of types and/or submodule names to apply dynamic quantization to,
+              in which case the `dtype` argument is used to specify the bit-width
+
+        inplace: carry out model transformations in-place, the original module is mutated
+        mapping: maps type of a submodule to a type of corresponding dynamically quantized version
+            with which the submodule needs to be replaced
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic")
+    if qconfig_spec is None:
+        if dtype == torch.qint8:
+            qconfig_spec = {
+                nn.Linear : default_dynamic_qconfig,
+                nn.LSTM : default_dynamic_qconfig,
+                nn.GRU : default_dynamic_qconfig,
+                nn.LSTMCell : default_dynamic_qconfig,
+                nn.RNNCell : default_dynamic_qconfig,
+                nn.GRUCell : default_dynamic_qconfig,
+            }
+        elif dtype == torch.float16:
+            qconfig_spec = {
+                nn.Linear : float16_dynamic_qconfig,
+                nn.LSTM : float16_dynamic_qconfig,
+                nn.GRU : float16_dynamic_qconfig,
+                nn.LSTMCell : float16_dynamic_qconfig,
+                nn.RNNCell : float16_dynamic_qconfig,
+                nn.GRUCell : float16_dynamic_qconfig,
+            }
+        elif dtype == torch.quint8:
+            qconfig_spec = {
+                nn.EmbeddingBag : float_qparams_weight_only_qconfig,
+            }
+        else:
+            raise ValueError(
+                "Don't know how to quantize with default settings for {}. Provide full qconfig please".format(dtype))
+    elif isinstance(qconfig_spec, set):
+        if dtype is torch.qint8:
+            default_qconfig = default_dynamic_qconfig
+        elif dtype is torch.float16:
+            default_qconfig = float16_dynamic_qconfig
+        elif dtype is torch.quint8:
+            default_qconfig = float_qparams_weight_only_qconfig
+        else:
+            raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype))
+        qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig)))
+
+    if mapping is None:
+        mapping = get_default_dynamic_quant_module_mappings()
+
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.eval()
+    propagate_qconfig_(model, qconfig_spec)
+    convert(model, mapping, inplace=True)
+    return model
+
+def prepare_qat(model, mapping=None, inplace=False):
+    r"""
+    Prepares a copy of the model for quantization calibration or
+    quantization-aware training and converts it to quantized version.
+
+    Quantization configuration should be assigned preemptively
+    to individual submodules in `.qconfig` attribute.
+
+    Args:
+        model: input model to be modified in-place
+        mapping: dictionary that maps float modules to quantized modules to be
+                 replaced.
+        inplace: carry out model transformations in-place, the original module
+                 is mutated
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat")
+    if mapping is None:
+        mapping = get_default_qat_module_mappings()
+
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    propagate_qconfig_(model, qconfig_dict=None)
+    convert(model, mapping=mapping, inplace=True, remove_qconfig=False)
+    prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True)
+    return model
+
+def quantize_qat(model, run_fn, run_args, inplace=False):
+    r"""Do quantization aware training and output a quantized model
+
+    Args:
+        model: input model
+        run_fn: a function for evaluating the prepared model, can be a
+                function that simply runs the prepared model or a training
+                loop
+        run_args: positional arguments for `run_fn`
+
+    Return:
+        Quantized model.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat")
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.train()
+    prepare_qat(model, inplace=True)
+    run_fn(model, *run_args)
+    convert(model, inplace=True)
+    return model
+
+def convert(
+        module, mapping=None, inplace=False, remove_qconfig=True,
+        convert_custom_config_dict=None):
+    r"""Converts submodules in input module to a different module according to `mapping`
+    by calling `from_float` method on the target module class. And remove qconfig at the
+    end if remove_qconfig is set to True.
+
+    Args:
+        `module`: prepared and calibrated module
+        `mapping`: a dictionary that maps from source module type to target
+                   module type, can be overwritten to allow swapping user defined
+                   Modules
+        `inplace`: carry out model transformations in-place, the original module
+                   is mutated
+        `convert_custom_config_dict`: custom configuration dictionary for convert function
+
+    .. code-block:: python
+
+       # Example of convert_custom_config_dict:
+       convert_custom_config_dict = {
+           # user will manually define the corresponding quantized
+           # module class which has a from_observed class method that converts
+           # observed custom module to quantized custom module
+           "observed_to_quantized_custom_module_class": {
+               ObservedCustomModule: QuantizedCustomModule
+           }
+       }
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.convert")
+    if not inplace:
+        module = copy.deepcopy(module)
+    _convert(
+        module, mapping, inplace=True,
+        convert_custom_config_dict=convert_custom_config_dict)
+    if remove_qconfig:
+        _remove_qconfig(module)
+    return module
+
+def _convert(
+        module, mapping=None, inplace=False,
+        convert_custom_config_dict=None):
+    r"""Converts submodules in input module to a different module according to `mapping`
+    by calling `from_float` method on the target module class
+
+    Args:
+        module: input module
+        mapping: a dictionary that maps from source module type to target
+                 module type, can be overwritten to allow swapping user defined
+                 Modules
+        inplace: carry out model transformations in-place, the original module
+                 is mutated
+
+    """
+    if mapping is None:
+        mapping = get_default_static_quant_module_mappings()
+    if convert_custom_config_dict is None:
+        convert_custom_config_dict = {}
+    custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {})
+
+    if not inplace:
+        module = copy.deepcopy(module)
+    reassign = {}
+    for name, mod in module.named_children():
+        # both fused modules and observed custom modules are
+        # swapped as one unit
+        if not isinstance(mod, _FusedModule) and \
+           type(mod) not in custom_module_class_mapping:
+            _convert(mod, mapping, True,  # inplace
+                     convert_custom_config_dict)
+        reassign[name] = swap_module(mod, mapping, custom_module_class_mapping)
+
+    for key, value in reassign.items():
+        module._modules[key] = value
+
+    return module
+
+def swap_module(mod, mapping, custom_module_class_mapping):
+    r"""Swaps the module if it has a quantized counterpart and it has an
+    `observer` attached.
+
+    Args:
+        mod: input module
+        mapping: a dictionary that maps from nn module to nnq module
+
+    Return:
+        The corresponding quantized module of `mod`
+    """
+    new_mod = mod
+    if hasattr(mod, 'qconfig') and mod.qconfig is not None:
+        swapped = False
+        if type(mod) in custom_module_class_mapping:
+            new_mod = custom_module_class_mapping[type(mod)].from_observed(mod)
+            swapped = True
+        elif type(mod) in mapping:
+            new_mod = mapping[type(mod)].from_float(mod)
+            swapped = True
+
+        if swapped:
+            # Preserve module's pre forward hooks. They'll be called on quantized input
+            for pre_hook_fn in mod._forward_pre_hooks.values():
+                new_mod.register_forward_pre_hook(pre_hook_fn)
+            # Preserve module's post forward hooks except _observer_forward_hook
+            # After convert they'll work with quantized output
+            for hook_fn in mod._forward_hooks.values():
+                if hook_fn is not _observer_forward_hook:
+                    new_mod.register_forward_hook(hook_fn)
+
+            # respect device affinity when swapping modules
+            devices = get_unique_devices_(mod)
+            assert len(devices) <= 1, (
+                "swap_module only works with cpu or single-device CUDA modules, "
+                "but got devices {}".format(devices)
+            )
+            device = next(iter(devices)) if len(devices) > 0 else None
+            if device:
+                new_mod.to(device)
+    return new_mod
+
+def get_observer_dict(mod, target_dict, prefix=""):
+    r"""Traverse the modules and save all observers into dict.
+    This is mainly used for quantization accuracy debug
+    Args:
+        mod: the top module we want to save all observers
+        prefix: the prefix for the current module
+        target_dict: the dictionary used to save all the observers
+    """
+    def get_prefix(prefix):
+        return prefix if prefix == "" else prefix + '.'
+
+    if hasattr(mod, 'activation_post_process'):
+        target_dict[get_prefix(prefix) + 'activation_post_process'] = mod.activation_post_process
+    for name, child in mod.named_children():
+        module_prefix = get_prefix(prefix) + name if prefix else name
+        get_observer_dict(child, target_dict, module_prefix)
diff --git a/torch/quantization/fx/convert.py b/torch/quantization/fx/convert.py
index 867b0b24cf7ad..e00e4aaad1b68 100644
--- a/torch/quantization/fx/convert.py
+++ b/torch/quantization/fx/convert.py
@@ -36,7 +36,7 @@
     WEIGHT_INDEX_DICT,
 )
 
-from ..quantize import (
+from torch.ao.quantization.quantize import (
     _remove_qconfig,
     is_activation_post_process,
 )
diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py
index 86abac2d20991..a6fd660e5e84c 100644
--- a/torch/quantization/fx/prepare.py
+++ b/torch/quantization/fx/prepare.py
@@ -68,7 +68,7 @@
     get_default_qat_module_mappings,
 )
 
-from ..quantize import (
+from torch.ao.quantization.quantize import (
     is_activation_post_process,
     convert
 )
diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index e8b873658b504..779dfcf07aece 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -24,7 +24,7 @@
     get_qparam_dict,
 )
 
-from ..quantize import (
+from torch.ao.quantization.quantize import (
     is_activation_post_process,
 )
 
diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py
index 10f8b06b6dfed..3c9adc2bc311b 100644
--- a/torch/quantization/fx/utils.py
+++ b/torch/quantization/fx/utils.py
@@ -2,7 +2,7 @@
 import torch
 import torch.nn as nn
 from ..utils import is_per_tensor, is_per_channel
-from ..quantize import is_activation_post_process
+from torch.ao.quantization.quantize import is_activation_post_process
 
 from torch.fx import GraphModule, map_arg
 
diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py
index 674ed59ac86ed..5b0f4ed8779ab 100644
--- a/torch/quantization/quantize.py
+++ b/torch/quantization/quantize.py
@@ -1,580 +1,30 @@
-import copy
-import itertools
-import warnings
+# flake8: noqa: F401
+r"""
+This file is in the process of migration to `torch/ao/quantization`, and
+is kept here for compatibility while the migration process is ongoing.
+If you are adding a new entry/functionality, please, add it to the
+`torch/ao/quantization/quantize.py`, while adding an import statement
+here.
+"""
 
 import torch
-import torch.nn as nn
-import torch.nn.quantized as nnq
-from torch.nn.intrinsic import _FusedModule
 
-from .quantization_mappings import (
-    get_default_dynamic_quant_module_mappings,
-    get_default_static_quant_module_mappings,
-    get_default_qat_module_mappings,
-    get_default_qconfig_propagation_list,
-    no_observer_set,
-    _has_special_act_post_process,
-    _get_special_act_post_process,
-)
-
-from .stubs import DeQuantStub, QuantWrapper
-from .qconfig import (
-    add_module_to_qconfig_obs_ctr,
-    default_dynamic_qconfig,
-    float16_dynamic_qconfig,
-    float_qparams_weight_only_qconfig)
-
-def is_activation_post_process(module):
-    return (isinstance(module, torch.quantization.ObserverBase) or
-            isinstance(module, torch.quantization.FakeQuantizeBase))
-
-def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None,
-                              qconfig_parent=None, prefix=''):
-    r"""This is a helper function for `propagate_qconfig_`
-
-    Args:
-        module: input module
-        qconfig_dict: dictionary that maps from name of submodule to quantization
-                     configuration
-        allow_list: list of quantizable modules
-        qconfig_parent: quantization config of parent module, we will fallback to
-                       this config when there is no specified config for current
-                       module
-        prefix: corresponding prefix of the current module, used as key in
-                qconfig_dict
-
-    Return:
-        None, module is modified inplace with qconfig attached
-    """
-    # TODO: Add test
-    if allow_list is None:
-        allow_list = get_default_qconfig_propagation_list()
-
-    module_qconfig = qconfig_dict.get(type(module), qconfig_parent)
-    module_qconfig = qconfig_dict.get(prefix, module_qconfig)
-    module_qconfig = getattr(module, 'qconfig', module_qconfig)
-
-    torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module)
-
-    qconfig_with_device_check = add_module_to_qconfig_obs_ctr(module_qconfig, module)
-    module.qconfig = qconfig_with_device_check
-
-    for name, child in module.named_children():
-        module_prefix = prefix + '.' + name if prefix else name
-        _propagate_qconfig_helper(child, qconfig_dict, allow_list,
-                                  qconfig_with_device_check, module_prefix)
-
-# TODO(jerryzh): expose allow_list
-def propagate_qconfig_(module, qconfig_dict=None, allow_list=None):
-    r"""Propagate qconfig through the module hierarchy and assign `qconfig`
-    attribute on each leaf module
-
-    Args:
-        module: input module
-        qconfig_dict: dictionary that maps from name or type of submodule to
-            quantization configuration, qconfig applies to all submodules of a
-            given module unless qconfig for the submodules are specified (when
-            the submodule already has qconfig attribute)
-
-    Return:
-        None, module is modified inplace with qconfig attached
-    """
-    if qconfig_dict is None:
-        qconfig_dict = {}
-    _propagate_qconfig_helper(module, qconfig_dict, allow_list)
-
-def _observer_forward_hook(self, input, output):
-    r"""Forward hook that calls observer on the output
-    """
-    return self.activation_post_process(output)
-
-def register_activation_post_process_hook(module):
-    assert hasattr(module, 'activation_post_process'), \
-        'Expect activation_post_process attribut already attached to the module'
-    return module.register_forward_hook(_observer_forward_hook)
-
-def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=None, device=None, custom_module_class_mapping=None):
-    r"""Add observer for the leaf child of the module.
-
-    This function insert observer module to all leaf child module that
-    has a valid qconfig attribute.
-
-    Args:
-        module: input module with qconfig attributes for all the leaf modules that we want to quantize
-        device: parent device, if any
-        non_leaf_module_list: list of non-leaf modules we want to add observer
-
-    Return:
-        None, module is modified inplace with added observer modules and forward_hooks
-    """
-    if qconfig_propagation_list is None:
-        qconfig_propagation_list = get_default_qconfig_propagation_list()
-
-    if custom_module_class_mapping is None:
-        custom_module_class_mapping = {}
-
-    # respect device affinity when adding observers
-    if device is None:
-        devices = get_unique_devices_(module)
-        assert len(devices) <= 1, (
-            "add_observer_ only works with cpu or single-device CUDA modules, "
-            "but got devices {}".format(devices)
-        )
-        device = next(iter(devices)) if len(devices) > 0 else None
-
-    def get_activation_post_process(qconfig, device, special_act_post_process=None):
-        activation = qconfig.activation() if special_act_post_process is None else special_act_post_process()
-        if device is not None:
-            activation.to(device)
-        return activation
-
-    def needs_observation(m):
-        return hasattr(m, 'qconfig') and m.qconfig is not None
-
-    def insert_activation_post_process(m, special_act_post_process=None):
-        """ Adds an activation post process module and register
-        a post hook that calls the module
-        """
-        # We don't insert observer/fake_quantize for DeQuantStub
-        if needs_observation(m) and not isinstance(m, DeQuantStub):
-            # observer and hook will be gone after we swap the module
-            m.add_module('activation_post_process', get_activation_post_process(
-                m.qconfig, device, special_act_post_process))
-            # Register observer as the first entry in the hook list
-            # All post forward hooks are preserved and will be executed after the observer before convert
-            handle = register_activation_post_process_hook(m)
-            m._forward_hooks.move_to_end(handle.id, last=False)
-
-    for name, child in module.named_children():
-        if type(child) in [nnq.FloatFunctional, nnq.QFunctional]:
-            if needs_observation(child):
-                child.activation_post_process = get_activation_post_process(child.qconfig, device)
-        elif isinstance(child, _FusedModule):
-            # activation_post_process are now added directly to nn.Sequentail/_FusedModule
-            if needs_observation(child):
-                insert_activation_post_process(child)
-        elif _has_special_act_post_process(child):
-            special_act_post_process = _get_special_act_post_process(child)
-            insert_activation_post_process(child, special_act_post_process)
-        elif non_leaf_module_list is not None and type(child) in non_leaf_module_list:
-            if needs_observation(child):
-                insert_activation_post_process(child)
-        elif needs_observation(child) and type(child) in custom_module_class_mapping:
-            observed_child = custom_module_class_mapping[type(child)].from_float(child)
-            setattr(module, name, observed_child)
-            # TODO: These are the modules that cannot be observed
-            #       Once there are more, we should move them to a separate list
-            if custom_module_class_mapping[type(child)] not in no_observer_set():
-                insert_activation_post_process(observed_child)
-        else:
-            add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping)
-
-    # Insert observers only for leaf nodes, note that this observer is for
-    # the output of the module, for input QuantStub will observe them
-    if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \
-       and type(module) in qconfig_propagation_list:
-        insert_activation_post_process(module)
-
-def get_unique_devices_(module):
-    return {p.device for p in module.parameters()} | \
-        {p.device for p in module.buffers()}
-
-def add_quant_dequant(module):
-    r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig
-    Note that this function will modify the children of module inplace and it
-    can return a new module which wraps the input module as well.
-
-    Args:
-        module: input module with qconfig attributes for all the leaf modules
-        that we want to quantize
-
-    Return:
-        Either the inplace modified module with submodules wrapped in
-        `QuantWrapper` based on qconfig or a new `QuantWrapper` module which
-        wraps the input module, the latter case only happens when the input
-        module is a leaf module and we want to quantize it.
-    """
-    if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig:
-        return QuantWrapper(module)
-
-    for name, child in module.named_children():
-        module._modules[name] = add_quant_dequant(child)
-    return module
-
-def prepare(model, inplace=False, allow_list=None,
-            observer_non_leaf_module_list=None,
-            prepare_custom_config_dict=None):
-    r"""Prepares a copy of the model for quantization calibration or quantization-aware training.
-
-    Quantization configuration should be assigned preemptively
-    to individual submodules in `.qconfig` attribute.
-
-    The model will be attached with observer or fake quant modules, and qconfig
-    will be propagated.
-
-    Args:
-        `model`: input model to be modified in-place
-        `inplace`: carry out model transformations in-place, the original module is mutated
-        `allow_list`: list of quantizable modules
-        `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer
-        `prepare_custom_config_dict`: customization configuration dictionary for prepare function
-
-    .. code-block:: python
-
-       # Example of prepare_custom_config_dict:
-       prepare_custom_config_dict = {
-           # user will manually define the corresponding observed
-           # module class which has a from_float class method that converts
-           # float custom module to observed custom module
-           "float_to_observed_custom_module_class": {
-               CustomModule: ObservedCustomModule
-           }
-        }
-
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.prepare")
-    if prepare_custom_config_dict is None:
-        prepare_custom_config_dict = {}
-    custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", {})
-
-    if not inplace:
-        model = copy.deepcopy(model)
-
-    # TODO: remove allow_list
-    qconfig_propagation_list = allow_list
-    if qconfig_propagation_list is None:
-        qconfig_propagation_list = get_default_qconfig_propagation_list()
-    propagate_qconfig_(model, qconfig_dict=None)
-
-    # sanity check common API misusage
-    if not any(hasattr(m, 'qconfig') and m.qconfig for m in model.modules()):
-        warnings.warn("None of the submodule got qconfig applied. Make sure you "
-                      "passed correct configuration through `qconfig_dict` or "
-                      "by assigning the `.qconfig` attribute directly on submodules")
-
-    add_observer_(
-        model, qconfig_propagation_list, observer_non_leaf_module_list,
-        custom_module_class_mapping=custom_module_class_mapping)
-    return model
-
-def _remove_activation_post_process(module):
-    # TODO: maybe we should change activation_post_process to _activation_post_process
-    # to prevent it from being used by user
-    if hasattr(module, 'activation_post_process') and \
-       is_activation_post_process(module.activation_post_process):
-        delattr(module, 'activation_post_process')
-
-    # remove activation_post_proceess hook
-    handle_ids_to_remove = set()
-    for handle_id, hook_fn in module._forward_hooks.items():
-        if hook_fn is _observer_forward_hook:
-            handle_ids_to_remove.add(handle_id)
-    for handle_id in handle_ids_to_remove:
-        module._forward_hooks.pop(handle_id)
-
-# TODO: rename to something more general
-def _remove_qconfig(module):
-    r"""Clean up the qconfig left in the module so that new qconfig can be
-    propagated.
-
-    Args:
-        module: module to be cleaned up
-    """
-    for child in module.children():
-        _remove_qconfig(child)
-
-    if hasattr(module, "qconfig"):
-        del module.qconfig
-
-    _remove_activation_post_process(module)
-
-def quantize(model, run_fn, run_args, mapping=None, inplace=False):
-    r"""Quantize the input float model with post training static quantization.
-
-    First it will prepare the model for calibration, then it calls
-    `run_fn` which will run the calibration step, after that we will
-    convert the model to a quantized model.
-
-    Args:
-        model: input float model
-        run_fn: a calibration function for calibrating the prepared model
-        run_args: positional arguments for `run_fn`
-        inplace: carry out model transformations in-place, the original module is mutated
-        mapping: correspondence between original module types and quantized counterparts
-
-    Return:
-        Quantized model.
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.quantize")
-    if mapping is None:
-        mapping = get_default_static_quant_module_mappings()
-    if not inplace:
-        model = copy.deepcopy(model)
-    model.eval()
-    prepare(model, inplace=True)
-    run_fn(model, *run_args)
-    convert(model, mapping, inplace=True)
-    return model
-
-def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8,
-                     mapping=None, inplace=False):
-    r"""Converts a float model to dynamic (i.e. weights-only) quantized model.
-
-    Replaces specified modules with dynamic weight-only quantized versions and output the quantized model.
-
-    For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization
-    by default is performed for layers with large weights size - i.e. Linear and RNN variants.
-
-    Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`.
-    If `qconfig` is provided, the `dtype` argument is ignored.
-
-    Args:
-        model: input model
-        qconfig_spec: Either:
-
-            - A dictionary that maps from name or type of submodule to quantization
-              configuration, qconfig applies to all submodules of a given
-              module unless qconfig for the submodules are specified (when the
-              submodule already has qconfig attribute). Entries in the dictionary
-              need to be QConfigDynamic instances.
-
-            - A set of types and/or submodule names to apply dynamic quantization to,
-              in which case the `dtype` argument is used to specify the bit-width
-
-        inplace: carry out model transformations in-place, the original module is mutated
-        mapping: maps type of a submodule to a type of corresponding dynamically quantized version
-            with which the submodule needs to be replaced
-
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic")
-    if qconfig_spec is None:
-        if dtype == torch.qint8:
-            qconfig_spec = {
-                nn.Linear : default_dynamic_qconfig,
-                nn.LSTM : default_dynamic_qconfig,
-                nn.GRU : default_dynamic_qconfig,
-                nn.LSTMCell : default_dynamic_qconfig,
-                nn.RNNCell : default_dynamic_qconfig,
-                nn.GRUCell : default_dynamic_qconfig,
-            }
-        elif dtype == torch.float16:
-            qconfig_spec = {
-                nn.Linear : float16_dynamic_qconfig,
-                nn.LSTM : float16_dynamic_qconfig,
-                nn.GRU : float16_dynamic_qconfig,
-                nn.LSTMCell : float16_dynamic_qconfig,
-                nn.RNNCell : float16_dynamic_qconfig,
-                nn.GRUCell : float16_dynamic_qconfig,
-            }
-        elif dtype == torch.quint8:
-            qconfig_spec = {
-                nn.EmbeddingBag : float_qparams_weight_only_qconfig,
-            }
-        else:
-            raise ValueError(
-                "Don't know how to quantize with default settings for {}. Provide full qconfig please".format(dtype))
-    elif isinstance(qconfig_spec, set):
-        if dtype is torch.qint8:
-            default_qconfig = default_dynamic_qconfig
-        elif dtype is torch.float16:
-            default_qconfig = float16_dynamic_qconfig
-        elif dtype is torch.quint8:
-            default_qconfig = float_qparams_weight_only_qconfig
-        else:
-            raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype))
-        qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig)))
-
-    if mapping is None:
-        mapping = get_default_dynamic_quant_module_mappings()
-
-    if not inplace:
-        model = copy.deepcopy(model)
-    model.eval()
-    propagate_qconfig_(model, qconfig_spec)
-    convert(model, mapping, inplace=True)
-    return model
-
-def prepare_qat(model, mapping=None, inplace=False):
-    r"""
-    Prepares a copy of the model for quantization calibration or
-    quantization-aware training and converts it to quantized version.
-
-    Quantization configuration should be assigned preemptively
-    to individual submodules in `.qconfig` attribute.
-
-    Args:
-        model: input model to be modified in-place
-        mapping: dictionary that maps float modules to quantized modules to be
-                 replaced.
-        inplace: carry out model transformations in-place, the original module
-                 is mutated
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat")
-    if mapping is None:
-        mapping = get_default_qat_module_mappings()
-
-    if not inplace:
-        model = copy.deepcopy(model)
-
-    propagate_qconfig_(model, qconfig_dict=None)
-    convert(model, mapping=mapping, inplace=True, remove_qconfig=False)
-    prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True)
-    return model
-
-def quantize_qat(model, run_fn, run_args, inplace=False):
-    r"""Do quantization aware training and output a quantized model
-
-    Args:
-        model: input model
-        run_fn: a function for evaluating the prepared model, can be a
-                function that simply runs the prepared model or a training
-                loop
-        run_args: positional arguments for `run_fn`
-
-    Return:
-        Quantized model.
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat")
-    if not inplace:
-        model = copy.deepcopy(model)
-    model.train()
-    prepare_qat(model, inplace=True)
-    run_fn(model, *run_args)
-    convert(model, inplace=True)
-    return model
-
-def convert(
-        module, mapping=None, inplace=False, remove_qconfig=True,
-        convert_custom_config_dict=None):
-    r"""Converts submodules in input module to a different module according to `mapping`
-    by calling `from_float` method on the target module class. And remove qconfig at the
-    end if remove_qconfig is set to True.
-
-    Args:
-        `module`: prepared and calibrated module
-        `mapping`: a dictionary that maps from source module type to target
-                   module type, can be overwritten to allow swapping user defined
-                   Modules
-        `inplace`: carry out model transformations in-place, the original module
-                   is mutated
-        `convert_custom_config_dict`: custom configuration dictionary for convert function
-
-    .. code-block:: python
-
-       # Example of convert_custom_config_dict:
-       convert_custom_config_dict = {
-           # user will manually define the corresponding quantized
-           # module class which has a from_observed class method that converts
-           # observed custom module to quantized custom module
-           "observed_to_quantized_custom_module_class": {
-               ObservedCustomModule: QuantizedCustomModule
-           }
-       }
-
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.convert")
-    if not inplace:
-        module = copy.deepcopy(module)
-    _convert(
-        module, mapping, inplace=True,
-        convert_custom_config_dict=convert_custom_config_dict)
-    if remove_qconfig:
-        _remove_qconfig(module)
-    return module
-
-def _convert(
-        module, mapping=None, inplace=False,
-        convert_custom_config_dict=None):
-    r"""Converts submodules in input module to a different module according to `mapping`
-    by calling `from_float` method on the target module class
-
-    Args:
-        module: input module
-        mapping: a dictionary that maps from source module type to target
-                 module type, can be overwritten to allow swapping user defined
-                 Modules
-        inplace: carry out model transformations in-place, the original module
-                 is mutated
-
-    """
-    if mapping is None:
-        mapping = get_default_static_quant_module_mappings()
-    if convert_custom_config_dict is None:
-        convert_custom_config_dict = {}
-    custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {})
-
-    if not inplace:
-        module = copy.deepcopy(module)
-    reassign = {}
-    for name, mod in module.named_children():
-        # both fused modules and observed custom modules are
-        # swapped as one unit
-        if not isinstance(mod, _FusedModule) and \
-           type(mod) not in custom_module_class_mapping:
-            _convert(mod, mapping, True,  # inplace
-                     convert_custom_config_dict)
-        reassign[name] = swap_module(mod, mapping, custom_module_class_mapping)
-
-    for key, value in reassign.items():
-        module._modules[key] = value
-
-    return module
-
-def swap_module(mod, mapping, custom_module_class_mapping):
-    r"""Swaps the module if it has a quantized counterpart and it has an
-    `observer` attached.
-
-    Args:
-        mod: input module
-        mapping: a dictionary that maps from nn module to nnq module
-
-    Return:
-        The corresponding quantized module of `mod`
-    """
-    new_mod = mod
-    if hasattr(mod, 'qconfig') and mod.qconfig is not None:
-        swapped = False
-        if type(mod) in custom_module_class_mapping:
-            new_mod = custom_module_class_mapping[type(mod)].from_observed(mod)
-            swapped = True
-        elif type(mod) in mapping:
-            new_mod = mapping[type(mod)].from_float(mod)
-            swapped = True
-
-        if swapped:
-            # Preserve module's pre forward hooks. They'll be called on quantized input
-            for pre_hook_fn in mod._forward_pre_hooks.values():
-                new_mod.register_forward_pre_hook(pre_hook_fn)
-            # Preserve module's post forward hooks except _observer_forward_hook
-            # After convert they'll work with quantized output
-            for hook_fn in mod._forward_hooks.values():
-                if hook_fn is not _observer_forward_hook:
-                    new_mod.register_forward_hook(hook_fn)
-
-            # respect device affinity when swapping modules
-            devices = get_unique_devices_(mod)
-            assert len(devices) <= 1, (
-                "swap_module only works with cpu or single-device CUDA modules, "
-                "but got devices {}".format(devices)
-            )
-            device = next(iter(devices)) if len(devices) > 0 else None
-            if device:
-                new_mod.to(device)
-    return new_mod
-
-def get_observer_dict(mod, target_dict, prefix=""):
-    r"""Traverse the modules and save all observers into dict.
-    This is mainly used for quantization accuracy debug
-    Args:
-        mod: the top module we want to save all observers
-        prefix: the prefix for the current module
-        target_dict: the dictionary used to save all the observers
-    """
-    def get_prefix(prefix):
-        return prefix if prefix == "" else prefix + '.'
-
-    if hasattr(mod, 'activation_post_process'):
-        target_dict[get_prefix(prefix) + 'activation_post_process'] = mod.activation_post_process
-    for name, child in mod.named_children():
-        module_prefix = get_prefix(prefix) + name if prefix else name
-        get_observer_dict(child, target_dict, module_prefix)
+from torch.ao.quantization.quantize import _convert
+from torch.ao.quantization.quantize import _observer_forward_hook
+from torch.ao.quantization.quantize import _propagate_qconfig_helper
+from torch.ao.quantization.quantize import _remove_activation_post_process
+from torch.ao.quantization.quantize import _remove_qconfig
+from torch.ao.quantization.quantize import add_observer_
+from torch.ao.quantization.quantize import add_quant_dequant
+from torch.ao.quantization.quantize import convert
+from torch.ao.quantization.quantize import get_observer_dict
+from torch.ao.quantization.quantize import get_unique_devices_
+from torch.ao.quantization.quantize import is_activation_post_process
+from torch.ao.quantization.quantize import prepare
+from torch.ao.quantization.quantize import prepare_qat
+from torch.ao.quantization.quantize import propagate_qconfig_
+from torch.ao.quantization.quantize import quantize
+from torch.ao.quantization.quantize import quantize_dynamic
+from torch.ao.quantization.quantize import quantize_qat
+from torch.ao.quantization.quantize import register_activation_post_process_hook
+from torch.ao.quantization.quantize import swap_module

From 8af1407eab140a3abf12ea99883fea529791883e Mon Sep 17 00:00:00 2001
From: Harut Movsisyan <harutm@fb.com>
Date: Sun, 29 Aug 2021 20:58:45 -0700
Subject: [PATCH 337/530] [Static Runtime] Out version for torch.linalg.norm
 (#64070)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64070

Test Plan:
Confirm out variant is called for both versions:

```
> buck run //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1
```

Reviewed By: d1jang

Differential Revision: D30595816

fbshipit-source-id: e88d88d4fc698774e83a98efce66b8fa4e281563
---
 benchmarks/static_runtime/test_scripts.h      | 10 ++++
 .../static_runtime/test_static_runtime.cc     | 26 ++++++++++
 torch/csrc/jit/runtime/static/ops.cpp         | 47 +++++++++++++++++++
 3 files changed, 83 insertions(+)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index bcc975b79cf25..004319ca550f9 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -780,3 +780,13 @@ const std::string embedding_bag_byte_prepack_script = R"IR(
       %res: Tensor = aten::clone(%output, %none)
       return (%res)
 )IR";
+
+const auto linalg_norm_ord_scalar = R"JIT(
+  def forward(self, a: Tensor, ord: int, dim: List[int], keepdim: bool, dtype: int):
+      return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone()
+)JIT";
+
+const auto linalg_norm_ord_str = R"JIT(
+  def forward(self, a: Tensor, ord: str, dim: List[int], keepdim: bool, dtype: int):
+      return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone()
+)JIT";
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 1e987a9fab58e..f6e3680e0be38 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1265,3 +1265,29 @@ TEST(StaticRuntime, QEmbeddingBagByteUnpack) {
   testStaticRuntime(embedding_bag_byte_prepack_script, {a});
   testStaticRuntime(embedding_bag_byte_prepack_script, {a},{b});
 }
+
+TEST(StaticRuntime, IndividualOps_LinalgNorm_ScalarOrd) {
+  auto a = at::randn({2, 3});
+  auto dim = std::vector<int64_t>({1});
+  auto dtype = at::ScalarType::Float;
+
+  std::vector<IValue> args0{a, 4, dim, true, dtype};
+  testStaticRuntime(linalg_norm_ord_scalar, args0);
+
+  auto b = at::randn({4, 5});
+  std::vector<IValue> args1{b, 4, dim, true, dtype};
+  testStaticRuntime(linalg_norm_ord_scalar, args0, args1);
+}
+
+TEST(StaticRuntime, IndividualOps_LinalgNorm_StringOrd) {
+  auto a = at::randn({2, 3});
+  auto dim = std::vector<int64_t>({0, 1});
+  auto dtype = at::ScalarType::Float;
+
+  std::vector<IValue> args0{a, "fro", dim, true, dtype};
+  testStaticRuntime(linalg_norm_ord_str, args0);
+
+  auto b = at::randn({4, 5});
+  std::vector<IValue> args1{b, "fro", dim, true, dtype};
+  testStaticRuntime(linalg_norm_ord_str, args0, args1);
+}
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 3b586689a6c5c..12339301e0433 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1666,6 +1666,53 @@ REGISTER_OPERATOR_FUNCTOR(aten::fmod, aten_fmod, [](Node* n) -> SROperator {
   };
 });
 
+REGISTER_OPERATOR_FUNCTOR(aten::linalg_norm, aten_linalg_norm, [](Node* n) -> SROperator {
+  if (!n->matches(torch::schema(
+          "aten::linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor")) &&
+      !n->matches(torch::schema(
+          "aten::linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor"))) {
+    LogAndDumpSchema(n);
+    return nullptr;
+  }
+  return [](ProcessedNode* p_node) {
+    const auto& input = p_node->Input(0).toTensor();
+    const auto dim = p_node->Input(2).toIntVector();
+    const auto keepdim = p_node->Input(3).toBool();
+    const auto dtype = p_node->Input(4).toOptional<c10::ScalarType>();
+
+    if (p_node->Output(0).isNone()) {
+      if (p_node->Input(1).isScalar()) {
+        p_node->Output(0) = at::native::linalg_norm(
+            input,
+            p_node->Input(1).toOptional<at::Scalar>(),
+            dim,
+            keepdim,
+            dtype);
+      } else {
+        p_node->Output(0) = at::native::linalg_norm(
+            input, p_node->Input(1).toStringView(), dim, keepdim, dtype);
+      }
+      return;
+    }
+
+    auto& output = p_node->Output(0).toTensor();
+    fastResizeToZero(output);
+
+    if (p_node->Input(1).isScalar()) {
+      at::native::linalg_norm_out(
+          input,
+          p_node->Input(1).toOptional<at::Scalar>(),
+          dim,
+          keepdim,
+          dtype,
+          output);
+    } else {
+      at::native::linalg_norm_out(
+          input, p_node->Input(1).toStringRef(), dim, keepdim, dtype, output);
+    }
+  };
+});
+
 namespace {
 
 void check_cat_no_zero_dim(const std::vector<at::Tensor>& tensors) {

From d3bcba5f85f97ef273109924c695f33bf739e115 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Sun, 29 Aug 2021 23:31:42 -0700
Subject: [PATCH 338/530] ENH Adds label_smoothing to cross entropy loss
 (#63122)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/7455

Partially resolves pytorch/vision#4281

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63122

Reviewed By: iramazanli

Differential Revision: D30586076

Pulled By: jbschlosser

fbshipit-source-id: 06afc3aa1f8b9edb07fe9ed68c58968ad1926924
---
 aten/src/ATen/native/LossNLL.cpp              |  79 ++++++-
 aten/src/ATen/native/native_functions.yaml    |   2 +-
 test/cpp/api/functional.cpp                   |  14 ++
 test/cpp/api/modules.cpp                      |  25 +++
 test/test_nn.py                               |  72 +++++++
 .../api/include/torch/nn/functional/loss.h    |   9 +-
 .../csrc/api/include/torch/nn/options/loss.h  |   2 +
 torch/csrc/api/src/nn/modules/loss.cpp        |   3 +-
 torch/nn/functional.py                        |   8 +-
 torch/nn/functional.pyi.in                    |   3 +-
 torch/nn/modules/loss.py                      |  13 +-
 torch/onnx/symbolic_opset12.py                |   6 +-
 torch/overrides.py                            |   2 +-
 torch/testing/_internal/common_nn.py          | 199 +++++++++++++++++-
 14 files changed, 412 insertions(+), 25 deletions(-)

diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp
index c7c65f7b8cc22..83f169972942f 100644
--- a/aten/src/ATen/native/LossNLL.cpp
+++ b/aten/src/ATen/native/LossNLL.cpp
@@ -459,9 +459,10 @@ TORCH_IMPL_FUNC(nll_loss_backward_out_cpu)
 
 Tensor cross_entropy_loss_prob_target(
     const Tensor& self,
-    const Tensor& target,
+    const Tensor& target_,
     const Tensor& weight,
-    int64_t reduction) {
+    int64_t reduction,
+    double label_smoothing) {
   const auto n_classes = self.size(1);
   TORCH_CHECK(
       !weight.defined() || (weight.dim() == 1 && weight.numel() == n_classes),
@@ -472,6 +473,15 @@ Tensor cross_entropy_loss_prob_target(
       weight.sizes());
 
   auto input = at::log_softmax(self, 1, self.scalar_type());
+  Tensor target;
+
+  if (label_smoothing > 0.0) {
+    TORCH_CHECK(label_smoothing <= 1.0, "label_smoothing must be between 0.0 and 1.0. Got: ", label_smoothing);
+    target = target_ * (1 - label_smoothing) + label_smoothing / n_classes;
+  } else {
+    target = target_;
+  }
+
   if (weight.defined()) {
     // Expand weight to the correct number of dims for broadcasting with input / target
     auto weight_broadcast_shape = SmallBuffer<int64_t, 5>(input.dim());
@@ -503,12 +513,66 @@ Tensor cross_entropy_loss_prob_target(
   }
 }
 
+Tensor cross_entropy_loss_label_smoothing(
+    const Tensor& self,
+    const Tensor& target,
+    const Tensor& weight,
+    int64_t reduction,
+    int64_t ignore_index,
+    double label_smoothing) {
+
+    auto input = at::log_softmax(self, 1, self.scalar_type());
+    auto nllloss = at::nll_loss_nd(input, target, weight, reduction, ignore_index);
+
+    auto n_classes = input.size(1);
+
+    Tensor smooth_loss;
+    if (weight.defined()) {
+      // Expand weight to the correct number of dims for broadcasting with input / target
+      auto weight_broadcast_shape = SmallBuffer<int64_t, 5>(input.dim());
+      std::fill(weight_broadcast_shape.begin(), weight_broadcast_shape.end(), 1);
+      weight_broadcast_shape[1] = weight.size(0);
+      Tensor weight_ = weight.view(weight_broadcast_shape);
+
+      smooth_loss = -(input * weight_).sum(1);
+    } else {
+      smooth_loss = -input.sum(1);
+    }
+
+    if (ignore_index >= 0) {
+      smooth_loss.index_put_({target == ignore_index}, 0.0);
+    }
+
+    Tensor ret;
+    switch (reduction) {
+      case Reduction::Mean:
+        if (weight.defined()) {
+          // TODO: This code can path can be removed if #61309 is resolved
+          // loss is normalized by the weights to be consistent with nll_loss_nd
+          ret = smooth_loss.sum() / weight.gather(0, target.flatten()).sum();
+        } else {
+          ret = smooth_loss.mean();
+        }
+        break;
+      case Reduction::Sum:
+        ret = smooth_loss.sum();
+        break;
+      case Reduction::None:
+        ret = smooth_loss;
+        break;
+      default:
+        TORCH_CHECK(false, "Invalid reduction type encountered in cross_entropy: ", reduction);
+    }
+    return (1 - label_smoothing) * nllloss + ret * (label_smoothing / n_classes);
+}
+
 Tensor cross_entropy_loss(
     const Tensor& self,
     const Tensor& target,
     const c10::optional<Tensor>& weight,
     int64_t reduction,
-    int64_t ignore_index) {
+    int64_t ignore_index,
+    double label_smoothing) {
   Tensor ret;
   if (self.sizes() == target.sizes()) {
     // Assume soft targets when input and target shapes are the same
@@ -519,7 +583,14 @@ Tensor cross_entropy_loss(
     // See [Note: hacky wrapper removal for optional tensor]
     c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight);
     const Tensor& weight_ = *weight_maybe_owned;
-    ret = cross_entropy_loss_prob_target(self, target, weight_, reduction);
+    ret = cross_entropy_loss_prob_target(self, target, weight_, reduction, label_smoothing);
+  } else if (label_smoothing > 0.0) {
+    TORCH_CHECK(label_smoothing <= 1.0, "label_smoothing must be between 0.0 and 1.0. Got: ", label_smoothing);
+
+    // See [Note: hacky wrapper removal for optional tensor]
+    c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight);
+    const Tensor& weight_ = *weight_maybe_owned;
+    ret = cross_entropy_loss_label_smoothing(self, target, weight_, reduction, ignore_index, label_smoothing);
   } else {
     ret = at::nll_loss_nd(
         at::log_softmax(self, 1, self.scalar_type()),
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 224d850c8004c..688763ea39c13 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -6652,7 +6652,7 @@
   device_check: NoCheck   # TensorIterator
   variants: method
 
-- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor
+- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, float label_smoothing=0.0) -> Tensor
   python_module: nn
 
 - func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR)
diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp
index 2ecb84189c55a..8b7889f1841ef 100644
--- a/test/cpp/api/functional.cpp
+++ b/test/cpp/api/functional.cpp
@@ -792,6 +792,20 @@ TEST_F(FunctionalTest, CrossEntropy) {
 
   ASSERT_TRUE(output.allclose(expected, 1e-04));
   ASSERT_TRUE(F::cross_entropy(input, target).allclose(expected, 1e-04));
+
+  // label smoothing with class indices
+  input = torch::tensor({{3., 1.}, {1., 2.}}, torch::kFloat);
+  output = F::cross_entropy(
+      input, target, F::CrossEntropyFuncOptions().label_smoothing(0.15).reduction(torch::kMean));
+  expected = torch::tensor(0.3326, torch::kFloat);
+  ASSERT_TRUE(output.allclose(expected, 1e-04));
+
+  // label smoothing with target probabilities
+  target = torch::tensor({{0.8, 0.2}, {0.1, 0.9}}, torch::kFloat);
+  output = F::cross_entropy(
+      input, target, F::CrossEntropyFuncOptions().label_smoothing(0.2).reduction(torch::kMean));
+  expected = torch::tensor(0.5701, torch::kFloat);
+  ASSERT_TRUE(output.allclose(expected, 1e-04));
 }
 
 TEST_F(FunctionalTest, MaxUnpool1d) {
diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp
index 23d75efeee21f..927d884709200 100644
--- a/test/cpp/api/modules.cpp
+++ b/test/cpp/api/modules.cpp
@@ -2315,6 +2315,31 @@ TEST_F(ModulesTest, CrossEntropyLoss) {
   ASSERT_TRUE(
     CrossEntropyLoss(CrossEntropyLossOptions().ignore_index(-100).reduction(torch::kMean))
       ->forward(input, target).allclose(expected, 1e-04));
+
+  // label smoothing with class indices
+  loss = CrossEntropyLoss(CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kMean));
+  input = torch::tensor({{3., 1.}, {1., 2.}}, torch::dtype(torch::kFloat).requires_grad(true));
+  target = torch::tensor({0, 1}, torch::kLong);
+  output = loss->forward(input, target);
+  expected = torch::tensor(0.3326, torch::kFloat);
+  s = output.sum();
+  s.backward();
+
+  ASSERT_TRUE(output.allclose(expected, 1e-04));
+  ASSERT_EQ(input.sizes(), input.grad().sizes());
+
+  // label smoothing with with target probabilities
+  loss = CrossEntropyLoss(CrossEntropyLossOptions().label_smoothing(0.2).reduction(torch::kMean));
+  input = torch::tensor({{3., 1.}, {1., 2.}}, torch::dtype(torch::kFloat).requires_grad(true));
+  target = torch::tensor({{0.8, 0.2}, {0.1, 0.9}}, torch::kFloat);
+  output = loss->forward(input, target);
+  expected = torch::tensor(0.5701, torch::kFloat);
+  s = output.sum();
+  s.backward();
+
+  ASSERT_TRUE(output.allclose(expected, 1e-04));
+  ASSERT_EQ(input.sizes(), input.grad().sizes());
+
 }
 
 TEST_F(ModulesTest, CosineSimilarity) {
diff --git a/test/test_nn.py b/test/test_nn.py
index 7d26246786c6c..bb4dd59be5271 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -17183,6 +17183,78 @@ def test_cross_entropy_loss_one_hot_target(self, device):
                 output_one_hot = m(input, target_one_hot)
                 self.assertEqual(output, output_one_hot)
 
+    def test_cross_entropy_label_smoothing_errors(self, device):
+        N, C = 3, 4
+        input_args = [
+            (torch.randn((N, C), device=device), torch.arange(0, C, device=device)),
+            (torch.randn((N, C), device=device), torch.randn(N, C, device=device))
+        ]
+        for input_arg in input_args:
+            loss = nn.CrossEntropyLoss(label_smoothing=1.2)
+            with self.assertRaisesRegex(RuntimeError,
+                                        r"label_smoothing must be between 0\.0"):
+                loss(*input_arg)
+
+    def test_cross_entropy_label_smoothing_consistent_index_target_and_probs(self, device):
+        N, C = 10, 4
+        ks = range(5)
+        reductions = ['none', 'mean', 'sum']
+        label_smoothings = [0.05, 0.15]
+
+        for k, reduction, label_smoothing in product(ks, reductions, label_smoothings):
+            other_dims = [torch.randint(2, 5, size=(1,)).item() for _ in range(k)]
+            input = torch.randn(N, C, *other_dims, device=device, requires_grad=True)
+            target = torch.empty(N, *other_dims, dtype=torch.long, device=device).random_(0, C)
+
+            # construct target probablity that should have the same result as label_smoothing
+            target_proba = F.one_hot(target, num_classes=C)
+            # Need to put the C dim at index 1.
+            target_proba = target_proba.permute(0, -1, *range(1, target_proba.dim() - 1))
+            target_mask = (target_proba == 1)
+            target_proba = target_proba.to(dtype=input.dtype)
+
+            # y_k^ls = y_k * (1 - label_smoothing) + label_smoothing / n_classes
+            # Get one-hot representation of the target.
+            target_proba.masked_fill_(target_mask, 1 - label_smoothing + label_smoothing / C)
+            target_proba.masked_fill_(~target_mask, label_smoothing / C)
+
+            loss = nn.CrossEntropyLoss(reduction=reduction)
+            output_with_prob = loss(input, target_proba)
+
+            loss = nn.CrossEntropyLoss(
+                reduction=reduction, label_smoothing=label_smoothing)
+            output_with_index = loss(input, target)
+
+            self.assertEqual(output_with_prob, output_with_index,
+                             rtol=1e-07, atol=1e-05)
+
+    def test_cross_entropy_label_smoothing_with_probs(self, device):
+        N, C = 10, 4
+        ks = range(5)
+        reductions = ['none', 'mean', 'sum']
+        label_smoothings = [0.05, 0.15]
+
+        # Test with k-dimensional loss.
+        for k, label_smoothing in product(ks, label_smoothings):
+            other_dims = [torch.randint(2, 5, size=(1,)).item() for _ in range(k)]
+            input = torch.randn(N, C, *other_dims, device=device, requires_grad=True)
+            target = F.log_softmax(torch.randn(N, C, *other_dims, device=device), dim=1)
+
+            for reduction in reductions:
+                # use with label_smoothing
+                loss = nn.CrossEntropyLoss(reduction=reduction, label_smoothing=label_smoothing)
+                output_with_smoothing = loss(input, target)
+
+                # manually smoothing target
+                # class_proba^ls = class_proba * (1 - label_smoothing) +
+                #                  label_smoothing / n_classes
+                target_with_smoothing = target * (1 - label_smoothing) + label_smoothing / C
+                loss = nn.CrossEntropyLoss(reduction=reduction)
+                output_with_manual_smoothing = loss(input, target_with_smoothing)
+
+                self.assertEqual(output_with_smoothing, output_with_manual_smoothing)
+
+
     def test_softshrink_negative(self, device):
         input = torch.randn(5, device=device, requires_grad=True)
         m = torch.nn.Softshrink(-1)
diff --git a/torch/csrc/api/include/torch/nn/functional/loss.h b/torch/csrc/api/include/torch/nn/functional/loss.h
index ea2f6066ddf15..1fa91ad6deb1f 100644
--- a/torch/csrc/api/include/torch/nn/functional/loss.h
+++ b/torch/csrc/api/include/torch/nn/functional/loss.h
@@ -824,13 +824,15 @@ inline Tensor cross_entropy(
     const Tensor& target,
     const Tensor& weight,
     int64_t ignore_index,
-    CrossEntropyFuncOptions::reduction_t reduction) {
+    CrossEntropyFuncOptions::reduction_t reduction,
+    double label_smoothing) {
   return torch::cross_entropy_loss(
       input,
       target,
       weight,
       enumtype::reduction_get_enum(reduction),
-      ignore_index);
+      ignore_index,
+      label_smoothing);
 }
 } // namespace detail
 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
@@ -855,7 +857,8 @@ inline Tensor cross_entropy(
       target,
       options.weight(),
       options.ignore_index(),
-      options.reduction());
+      options.reduction(),
+      options.label_smoothing());
 }
 
 // ============================================================================
diff --git a/torch/csrc/api/include/torch/nn/options/loss.h b/torch/csrc/api/include/torch/nn/options/loss.h
index d8ffd15c8660a..1479de571d13e 100644
--- a/torch/csrc/api/include/torch/nn/options/loss.h
+++ b/torch/csrc/api/include/torch/nn/options/loss.h
@@ -662,6 +662,8 @@ struct TORCH_API CrossEntropyLossOptions {
   TORCH_ARG(int64_t, ignore_index) = -100;
   /// Specifies the reduction to apply to the output. Default: Mean
   TORCH_ARG(reduction_t, reduction) = torch::kMean;
+  /// Specifies the amount of smoothing when computing the loss. Default: 0.0
+  TORCH_ARG(double, label_smoothing) = 0.0;
 };
 
 namespace functional {
diff --git a/torch/csrc/api/src/nn/modules/loss.cpp b/torch/csrc/api/src/nn/modules/loss.cpp
index d5d8c687168e8..dda67fe9c728e 100644
--- a/torch/csrc/api/src/nn/modules/loss.cpp
+++ b/torch/csrc/api/src/nn/modules/loss.cpp
@@ -378,7 +378,8 @@ Tensor CrossEntropyLossImpl::forward(
     target,
     weight,
     options.ignore_index(),
-    options.reduction());
+    options.reduction(),
+    options.label_smoothing());
 }
 
 // ============================================================================
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index 52125864000f1..c11e261d9b85f 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -2772,6 +2772,7 @@ def cross_entropy(
     ignore_index: int = -100,
     reduce: Optional[bool] = None,
     reduction: str = "mean",
+    label_smoothing: float = 0.0,
 ) -> Tensor:
     r"""This criterion computes the cross entropy loss between input and target.
 
@@ -2808,6 +2809,10 @@ def cross_entropy(
             elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
             and :attr:`reduce` are in the process of being deprecated, and in the meantime,
             specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+        label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
+            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
+            become a mixture of the original ground truth and a uniform distribution as described in
+            `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
 
     Examples::
 
@@ -2834,10 +2839,11 @@ def cross_entropy(
             ignore_index=ignore_index,
             reduce=reduce,
             reduction=reduction,
+            label_smoothing=label_smoothing,
         )
     if size_average is not None or reduce is not None:
         reduction = _Reduction.legacy_get_string(size_average, reduce)
-    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
+    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
 
 
 def binary_cross_entropy(
diff --git a/torch/nn/functional.pyi.in b/torch/nn/functional.pyi.in
index 828f8df2185b5..cbd05d7e3dedb 100644
--- a/torch/nn/functional.pyi.in
+++ b/torch/nn/functional.pyi.in
@@ -239,7 +239,8 @@ def kl_div(input: Tensor, target: Tensor, size_average: Optional[bool] = ..., re
 
 
 def cross_entropy(input: Tensor, target: Tensor, weight: Optional[Tensor] = ..., size_average: Optional[bool] = ...,
-                  ignore_index: int = ..., reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ...
+                  ignore_index: int = ..., reduce: Optional[bool] = ..., reduction: str = ...,
+                  label_smoothing: float = ...) -> Tensor: ...
 
 
 def binary_cross_entropy(input: Tensor, target: Tensor, weight: Optional[Tensor] = ...,
diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index af1da83eeef5b..d72c614c88048 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1104,6 +1104,10 @@ class probabilities only when a single class label per minibatch item is too res
             and :attr:`reduce` are in the process of being deprecated, and in
             the meantime, specifying either of those two args will override
             :attr:`reduction`. Default: ``'mean'``
+        label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
+            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
+            become a mixture of the original ground truth and a uniform distribution as described in
+            `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
 
     Shape:
         - Input: :math:`(N, C)` where `C = number of classes`, or
@@ -1132,17 +1136,20 @@ class probabilities only when a single class label per minibatch item is too res
         >>> output = loss(input, target)
         >>> output.backward()
     """
-    __constants__ = ['ignore_index', 'reduction']
+    __constants__ = ['ignore_index', 'reduction', 'label_smoothing']
     ignore_index: int
+    label_smoothing: float
 
     def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100,
-                 reduce=None, reduction: str = 'mean') -> None:
+                 reduce=None, reduction: str = 'mean', label_smoothing: float = 0.0) -> None:
         super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
         self.ignore_index = ignore_index
+        self.label_smoothing = label_smoothing
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
         return F.cross_entropy(input, target, weight=self.weight,
-                               ignore_index=self.ignore_index, reduction=self.reduction)
+                               ignore_index=self.ignore_index, reduction=self.reduction,
+                               label_smoothing=self.label_smoothing)
 
 
 class MultiLabelSoftMarginLoss(_WeightedLoss):
diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py
index d8f954148a1ee..ab39325709ea9 100644
--- a/torch/onnx/symbolic_opset12.py
+++ b/torch/onnx/symbolic_opset12.py
@@ -65,7 +65,7 @@ def nll_loss_nd(g, self, target, weight, reduction, ignore_index):
     return nll_loss(g, self, target, weight, reduction, ignore_index)
 
 
-def cross_entropy_loss(g, self, target, weight, reduction, ignore_index):
+def cross_entropy_loss(g, self, target, weight, reduction, ignore_index, label_smoothing):
     # none reduction : onnx::Constant[value={0}]
     # mean reduction : onnx::Constant[value={1}]
     # sum reduction : onnx::Constant[value={2}]
@@ -73,6 +73,10 @@ def cross_entropy_loss(g, self, target, weight, reduction, ignore_index):
     reduction_vals = ["none", "mean", "sum"]
     reduction = reduction_vals[reduction]
 
+    label_smoothing = sym_help._maybe_get_const(label_smoothing, "f")
+    if label_smoothing > 0.0:
+        raise RuntimeError("Unsupported: ONNX does not support label_smoothing")
+
     # in onnx SoftmaxCrossEntropyLoss specification, ignore_index is optional without default value.
     # therefore we need to set ignore_index attribute even if it is not specified (e.g. ignore_index=-100).
     ignore_index = sym_help._maybe_get_const(ignore_index, "i")
diff --git a/torch/overrides.py b/torch/overrides.py
index 09748b982b428..64b18b89eb401 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -677,7 +677,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.nn.functional.cosine_embedding_loss: (lambda input1, input2, target, margin=0, size_average=None,
                                                     reduce=None, reduction='mean': -1),
         torch.nn.functional.cross_entropy: (lambda input, target, weight=None, size_average=None, ignore_index=-100,
-                                            reduce=None, reduction="mean": -1),
+                                            reduce=None, reduction="mean", label_smoothing=0.0: -1),
         torch.nn.functional.ctc_loss: (lambda log_probs, targets, input_lengths, target_lengths, blank=0,
                                        reduction='mean', zero_infinity=False: -1),
         torch.nn.functional.dropout: lambda input, p=0.5, training=True, inplace=False: -1,
diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py
index e0d09b7ba03fc..73233df8cc5bb 100644
--- a/torch/testing/_internal/common_nn.py
+++ b/torch/testing/_internal/common_nn.py
@@ -4103,7 +4103,8 @@ def nlllossNd_reference(input, target, weight=None, ignore_index=-100,
     return output
 
 
-def cross_entropy_loss_prob_target_reference(input, target, weight=None, reduction='mean'):
+def cross_entropy_loss_prob_target_reference(input, target, weight=None, reduction='mean',
+                                             label_smoothing=0.0):
     assert input.dim() >= 2
 
     input = torch.log_softmax(input, 1)
@@ -4112,6 +4113,10 @@ def cross_entropy_loss_prob_target_reference(input, target, weight=None, reducti
         weight = torch.ones(C).type_as(input)
     weight = weight.view(1, C, *(1 for _ in input.shape[2:]))
 
+    if label_smoothing > 0.0:
+        assert label_smoothing <= 1.0
+        target = (target * (1 - label_smoothing) + label_smoothing / C)
+
     output = -(input * target * weight).sum(dim=1)
     if reduction == 'mean':
         return output.mean()
@@ -4120,20 +4125,61 @@ def cross_entropy_loss_prob_target_reference(input, target, weight=None, reducti
     return output
 
 
-def cross_entropy_loss_reference(input, target, weight=None, ignore_index=-100, reduction='mean'):
+def cross_entropy_loss_indices_target_reference(input, target, weight=None, ignore_index=-100,
+                                                reduction='mean', label_smoothing=0.0):
+    log_softmax_input = torch.log_softmax(input, 1)
+    nllloss = F.nll_loss(
+        log_softmax_input,
+        target,
+        weight,
+        ignore_index=ignore_index,
+        reduction=reduction)
+
+    if label_smoothing == 0.0:
+        return nllloss
+
+    assert 0.0 < label_smoothing <= 1.0
+
+    input = torch.log_softmax(input, 1)
+    C = input.size(1)
+    if weight is not None:
+        input = input * weight.view(1, C, *(1 for _ in input.shape[2:]))
+
+    smooth_loss = -torch.sum(input, 1)
+
+    if ignore_index >= 0:
+        ignore_mask = target == ignore_index
+        smooth_loss.masked_fill_(ignore_mask, 0.0)
+
+    if reduction == 'mean':
+        if weight is not None:
+            # TODO: This code can path can be removed if #61309 is resolved
+            # loss is normalized by the weights to be consistent with nll_loss_nd
+            ret = torch.sum(smooth_loss) / weight.gather(0, target.flatten()).sum()
+        else:
+            ret = torch.mean(smooth_loss)
+    elif reduction == 'sum':
+        ret = torch.sum(smooth_loss)
+    else:
+        ret = smooth_loss
+
+    return (1 - label_smoothing) * nllloss + ret * (label_smoothing / C)
+
+
+def cross_entropy_loss_reference(input, target, weight=None, ignore_index=-100, reduction='mean',
+                                 label_smoothing=0.0):
     if input.shape == target.shape:
         return cross_entropy_loss_prob_target_reference(
             input,
             target,
             weight=weight,
-            reduction=reduction)
+            reduction=reduction,
+            label_smoothing=label_smoothing)
     else:
-        return nlllossNd_reference(
-            torch.log_softmax(input, 1),
-            target,
-            weight,
-            ignore_index=ignore_index,
-            reduction=reduction)
+        return cross_entropy_loss_indices_target_reference(
+            input, target, weight=weight, reduction=reduction,
+            ignore_index=ignore_index, label_smoothing=label_smoothing
+        )
 
 
 def nllloss_reference(input, target, weight=None, ignore_index=-100,
@@ -4893,6 +4939,141 @@ def padding3d_circular(input, pad):
         desc='4d_prob_target',
         check_bfloat16=False,
     ),
+    dict(
+        fullname='CrossEntropyLoss_2d_prob_target_smoothing_sum_reduction',
+        constructor=lambda *args, **kwargs: nn.CrossEntropyLoss(reduction='sum',
+                                                                label_smoothing=0.15),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kSum)',
+        input_size=(5, 3),
+        target_fn=lambda: torch.rand(5, 3).softmax(dim=1),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_2d_prob_target_smoothing',
+        constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15)',
+        input_size=(5, 3),
+        target_fn=lambda: torch.rand(5, 3).softmax(dim=1),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_2d_prob_target_smoothing_weight',
+        constructor_args_fn=lambda: (torch.rand(3).abs(),),
+        constructor=lambda weight: nn.CrossEntropyLoss(weight, label_smoothing=0.15),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).weight(torch::rand(3).abs())',
+        input_size=(5, 3),
+        target_fn=lambda: torch.rand(5, 3).softmax(dim=1),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), weight=get_weight(m), label_smoothing=0.15),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_3d_prob_target_smoothing_sum_reduction',
+        constructor=lambda *args: nn.CrossEntropyLoss(reduction='sum',
+                                                                label_smoothing=0.15),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kSum)',
+        input_size=(5, 3, 4),
+        target_fn=lambda: torch.rand(5, 3, 4).softmax(dim=1),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_3d_prob_target_smoothing',
+        constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15)',
+        input_size=(5, 3, 4),
+        target_fn=lambda: torch.rand(5, 3, 4).softmax(dim=1),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_3d_indices_target_smoothing',
+        constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15)',
+        input_size=(2, 3, 5),
+        target_fn=lambda: torch.rand(2, 5).mul(3).floor().long(),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_3d_indices_target_smoothing_ignore_index',
+        constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15, ignore_index=1),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).ignore_index(1)',
+        input_size=(2, 3, 5),
+        target_fn=lambda: torch.rand(2, 5).mul(3).floor().long(),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=1),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_3d_indices_target_smoothing_sum_reduction',
+        constructor=lambda *args: nn.CrossEntropyLoss(reduction='sum', label_smoothing=0.15),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kSum)',
+        input_size=(2, 3, 5),
+        target_fn=lambda: torch.rand(2, 5).mul(3).floor().long(),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_3d_indices_target_smoothing_sum_reduction_ignore_index',
+        constructor=lambda *args: nn.CrossEntropyLoss(reduction='sum', label_smoothing=0.15,
+                                                      ignore_index=1),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kSum).ignore_index(1)',
+        input_size=(2, 3, 5),
+        target_fn=lambda: torch.rand(2, 5).mul(3).floor().long(),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=1),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_2d_indices_target_smoothing',
+        constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15)',
+        input_size=(15, 10),
+        target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_2d_indices_target_smoothing_sum_reduction',
+        constructor=lambda *args: nn.CrossEntropyLoss(reduction='sum', label_smoothing=0.15),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kSum)',
+        input_size=(15, 10),
+        target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_2d_indices_target_smoothing_ignore_index',
+        constructor=lambda *args: nn.CrossEntropyLoss(label_smoothing=0.15, ignore_index=3),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).ignore_index(3)',
+        input_size=(15, 10),
+        target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), label_smoothing=0.15, ignore_index=3),
+        check_bfloat16=False,
+    ),
+    dict(
+        fullname='CrossEntropyLoss_2d_indices_target_smoothing_weight',
+        constructor_args_fn=lambda: (torch.rand(10).abs(),),
+        constructor=lambda weight: nn.CrossEntropyLoss(weight, label_smoothing=0.15),
+        cpp_constructor_args='torch::nn::CrossEntropyLossOptions().label_smoothing(0.15).weight(torch::rand(10).abs())',
+        input_size=(15, 10),
+        target_fn=lambda: torch.empty(15).uniform_().mul(10).floor().long(),
+        reference_fn=lambda i, t, m:
+            loss_reference_fns['CrossEntropyLoss'](i, t, reduction=get_reduction(m), weight=get_weight(m), label_smoothing=0.15),
+        check_bfloat16=False,
+    ),
     dict(
         module_name='CrossEntropyLoss',
         constructor_args_fn=lambda: (torch.rand(3),),

From a836d83957a526a69b3d3de7094e005aa988eb51 Mon Sep 17 00:00:00 2001
From: Raghavan Raman <raghavanr@fb.com>
Date: Mon, 30 Aug 2021 04:38:00 -0700
Subject: [PATCH 339/530] [nnc] Fixed warning due to implicit parameter
 conversion (#64117)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64117

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30616945

Pulled By: navahgar

fbshipit-source-id: eaf69232ac4a684ab5f97a54a514971655f86ef3
---
 torch/csrc/jit/tensorexpr/expr.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h
index fbbea12387920..4947bfdc36be9 100644
--- a/torch/csrc/jit/tensorexpr/expr.h
+++ b/torch/csrc/jit/tensorexpr/expr.h
@@ -319,11 +319,16 @@ class TORCH_API BufHandle : public ExprHandle {
 // object. For example: VarHandle x('x'); ExprHandle x2 = x;
 class TORCH_API VarHandle : public ExprHandle {
  public:
+  // Creates an empty VarHandle whose base Var is set to nullptr.
   VarHandle() : ExprHandle() {}
+
   explicit VarHandle(Dtype dtype) : ExprHandle(Var::make(dtype)) {}
+
   VarHandle(const std::string& name_hint, Dtype dtype)
       : ExprHandle(Var::make(name_hint, dtype)) {}
+
   explicit VarHandle(VarPtr node) : ExprHandle(node) {}
+
   VarPtr node() const {
     return static_to<Var>(ExprHandle::node());
   }

From 093a12aaa984bd4a7768bb306157067f7c95b0ec Mon Sep 17 00:00:00 2001
From: Raghavan Raman <raghavanr@fb.com>
Date: Mon, 30 Aug 2021 04:38:00 -0700
Subject: [PATCH 340/530] [nnc] Updated internal asserts to include more
 detailed error messages (#64118)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64118

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30616944

Pulled By: navahgar

fbshipit-source-id: 35289696cc0e7faa01599304243b86f0febc6daf
---
 torch/csrc/jit/tensorexpr/kernel.cpp | 36 +++++++++++++++++++++++-----
 torch/csrc/jit/tensorexpr/kernel.h   |  2 ++
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index 0d0d19e004981..e4136d85c0a50 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -34,7 +34,10 @@ static bool checkTypes(const ScalarType highType, const int typeConstraints) {
   }
 
   // assume JIT not supporting complex and qint yet
-  TORCH_INTERNAL_ASSERT((typeConstraints & (kQintTypes | kComplexTypes)) == 0);
+  TORCH_INTERNAL_ASSERT(
+      (typeConstraints & (kQintTypes | kComplexTypes)) == 0,
+      buildErrorMessage(
+          "Qint and Complex types are not supported in the fuser."));
   return false;
 }
 
@@ -63,6 +66,13 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
+std::string buildErrorMessage(const std::string& s) {
+  // TODO: Update this generic error message to include details regarding
+  // turning off the fuser.
+  static const std::string generic_error_message = "";
+  return s + " " + generic_error_message;
+}
+
 static int te_cuda_pointwise_loop_levels = -1;
 static int te_cuda_pointwise_block_count = -1;
 static int te_cuda_pointwise_block_size = -1;
@@ -164,13 +174,18 @@ c10::optional<at::Device> pickDeviceType(const std::shared_ptr<Graph>& graph) {
     for (auto const& input : node->inputs()) {
       if (auto tt = input->type()->cast<TensorType>()) {
         if (auto inputDevice = tt->device()) {
-          TORCH_INTERNAL_ASSERT(!device || *device == *inputDevice);
+          TORCH_INTERNAL_ASSERT(
+              !device || *device == *inputDevice,
+              buildErrorMessage(
+                  "Different devices specified for inputs to the fuser."));
           device = inputDevice;
         }
       }
     }
   }
-  TORCH_INTERNAL_ASSERT(device);
+  TORCH_INTERNAL_ASSERT(
+      device,
+      buildErrorMessage("Could not find device in fuser graph inputs."));
   return device;
 }
 
@@ -356,7 +371,9 @@ bool matmulIsSupported(const torch::jit::Node* node) {
 void annotateInputShapes(
     const std::shared_ptr<Graph>& graph,
     const std::vector<c10::optional<at::Tensor>>& example_inputs) {
-  TORCH_INTERNAL_ASSERT(graph->inputs().size() == example_inputs.size());
+  TORCH_INTERNAL_ASSERT(
+      graph->inputs().size() == example_inputs.size(),
+      buildErrorMessage("Given inputs do not match the fuser graph inputs."));
   for (size_t idx = 0; idx < example_inputs.size(); idx++) {
     if (auto t = example_inputs[idx]) {
       auto concrete_tensor_type = tensorTypeInCurrentExecutionContext(*t);
@@ -820,7 +837,10 @@ std::vector<ExprHandle> TensorExprKernel::inferSizesForValue(
         throw std::runtime_error("Empty input list is passed to aten::cat");
       }
 
-      TORCH_INTERNAL_ASSERT(n->input(1)->node()->kind() == prim::Constant);
+      TORCH_INTERNAL_ASSERT(
+          n->input(1)->node()->kind() == prim::Constant,
+          buildErrorMessage(
+              "aten::cat op's dim input is not constant in fuser."));
       int64_t dim = n->input(1)->node()->i(attr::value);
       auto shape = sizesForValue(inputs[0]);
       auto norm_dim = normalizeAndCheckIndex(dim, shape.size());
@@ -2689,7 +2709,11 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
         blockSize = default_uint8_blocksize;
       }
       std::vector<ForPtr> loops = l.getLoopStmtsFor(buf);
-      TORCH_INTERNAL_ASSERT(!loops.empty(), "loops should not be empty");
+      TORCH_INTERNAL_ASSERT(
+          !loops.empty(),
+          buildErrorMessage(
+              "No loops found for the buffer " + buf->name_hint() +
+              " in the fuser."));
       ForPtr flattened = nullptr;
       LoopNest::flatten(loops, &flattened);
       assert(flattened);
diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h
index 4b92b020fce31..bdb9802ccdc3a 100644
--- a/torch/csrc/jit/tensorexpr/kernel.h
+++ b/torch/csrc/jit/tensorexpr/kernel.h
@@ -300,6 +300,8 @@ TORCH_API void annotateInputShapes(
 TORCH_API std::shared_ptr<Graph> removeUnusedSelfArgument(
     const std::shared_ptr<Graph>& graph);
 
+TORCH_API std::string buildErrorMessage(const std::string& s);
+
 } // namespace tensorexpr
 } // namespace jit
 } // namespace torch

From 7701ea48be276f5058cc6247ec53fb6a5789445a Mon Sep 17 00:00:00 2001
From: "Rong Rong (AI Infra)" <rongr@fb.com>
Date: Mon, 30 Aug 2021 07:49:27 -0700
Subject: [PATCH 341/530] remove one more distributed test (#64108)

Summary:
Follow up on https://github.com/pytorch/pytorch/issues/62896. one more place we should remove distributed test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64108

Reviewed By: janeyx99, soulitzer

Differential Revision: D30614062

Pulled By: walterddr

fbshipit-source-id: 6576415dc2d481d65419da19c5aa0afc37a86cff
---
 .jenkins/pytorch/test.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 4eb1b35253c91..5014f603e4bb9 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -517,7 +517,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || "$
   test_without_numpy
   install_torchvision
   test_python_shard1
-  test_distributed
   test_aten
 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || "${SHARD_NUMBER}" == 2 ]]; then
   install_torchvision

From 82174330d0bae4e2356295e16e261052f1d0ff8c Mon Sep 17 00:00:00 2001
From: Vitaly Fedyunin <vitaly.fedyunin@gmail.com>
Date: Mon, 30 Aug 2021 07:54:11 -0700
Subject: [PATCH 342/530] [DataLoader2] Adding Messages, Protocols, Loop
 wrappers (#63882)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63882

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30627452

Pulled By: VitalyFedyunin

fbshipit-source-id: 561ea2df07f3572e04401171946154024126387b
---
 test/test_dataloader.py                     |  42 ++++-
 torch/utils/data/__init__.py                |   3 +-
 torch/utils/data/communication/__init__.py  |   5 +
 torch/utils/data/communication/eventloop.py |  41 +++++
 torch/utils/data/communication/iter.py      | 173 ++++++++++++++++++++
 torch/utils/data/communication/messages.py  |  75 +++++++++
 torch/utils/data/communication/protocol.py  | 159 ++++++++++++++++++
 torch/utils/data/communication/queue.py     |  50 ++++++
 torch/utils/data/dataloader_experimental.py | 124 ++++++++++----
 9 files changed, 638 insertions(+), 34 deletions(-)
 create mode 100644 torch/utils/data/communication/__init__.py
 create mode 100644 torch/utils/data/communication/eventloop.py
 create mode 100644 torch/utils/data/communication/iter.py
 create mode 100644 torch/utils/data/communication/messages.py
 create mode 100644 torch/utils/data/communication/protocol.py
 create mode 100644 torch/utils/data/communication/queue.py

diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 65554632fd30f..c768246ff477c 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -22,6 +22,7 @@
     IterableDataset,
     Subset,
     TensorDataset,
+    communication,
     _utils
 )
 from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL
@@ -32,6 +33,7 @@
                                                   IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest,
                                                   load_tests, TEST_WITH_TSAN, IS_SANDCASTLE)
 
+
 try:
     import psutil
     HAS_PSUTIL = True
@@ -730,7 +732,7 @@ def __getitem__(self, idx):
 
 # Should be used as worker_init_fn with TestWorkerInfoDataset.
 # See _test_get_worker_info below for usage.
-def test_worker_info_init_fn(worker_id):
+def _test_worker_info_init_fn(worker_id):
     worker_info = torch.utils.data.get_worker_info()
     assert worker_id == worker_info.id, "worker_init_fn and worker_info should have consistent id"
     assert worker_id < worker_info.num_workers, "worker_init_fn and worker_info should have valid id"
@@ -760,7 +762,7 @@ def _test_get_worker_info():
     dataset = TestWorkerInfoDataset(6, batch_size, num_workers)
     dataloader = DataLoader(dataset, batch_size=batch_size,
                             num_workers=num_workers,
-                            worker_init_fn=test_worker_info_init_fn)
+                            worker_init_fn=_test_worker_info_init_fn)
     it = iter(dataloader)
     data = []
     for d in it:
@@ -769,7 +771,7 @@ def _test_get_worker_info():
     data = torch.cat(data, 0)
     for d in data:
         # each `d` is a [worker_id, worker_pid] pair, which is set in
-        # test_worker_info_init_fn
+        # _test_worker_info_init_fn
         assert d[1] == worker_pids[d[0]]
     # get_worker_info returns None in main proc after data loading
     assert torch.utils.data.get_worker_info() is None
@@ -1963,11 +1965,41 @@ def test_excessive_thread_creation_warning(self):
 class TestDataLoader2(TestCase):
     @skipIfNoDill
     def test_basics(self):
-        dp = IterableWrapper(list(range(10)))
+        # TODO(VitalyFedyunin): This test will start breaking if we remove guaranteed order
+        # of traversing workers
+        dp = IterableWrapper(list(range(1000)))
         dl = DataLoader(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2)
         dl2 = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2)
-        self.assertEquals(list(dl), list(dl2))
+        dl2_threading = DataLoader2(dp, batch_size=3, collate_fn=lambda x: x, num_workers=2, parallelism_mode='thread')
+        self.assertEqual(list(dl), list(dl2))
+        self.assertEqual(list(dl), list(dl2_threading))
+
+
+
+@unittest.skipIf(
+    TEST_WITH_TSAN,
+    "Fails with TSAN with the following error: starting new threads after multi-threaded "
+    "fork is not supported. Dying (set die_after_fork=0 to override)")
+class TestDataLoader2_EventLoop(TestCase):
+    @skipIfNoDill
+    def test_basic_threading(self):
+        def clean_me(process, req_queue, res_queue):
+            req_queue.put(communication.messages.TerminateRequest())
+            _ = res_queue.get()
+            process.join()
+
+        it = list(range(100))
+        numbers_dp = IterableWrapper(it)
+        (process, req_queue, res_queue, _thread_local_datapipe) = communication.eventloop.SpawnThreadForDataPipeline(numbers_dp)
+
+        process.start()
+        local_datapipe = communication.iter.QueueWrapper(
+            communication.protocol.IterDataPipeQueueProtocolClient(req_queue, res_queue))
+
+        actual = list(local_datapipe)
+        clean_me(process, req_queue, res_queue)
 
+        self.assertEqual(list(range(100)), actual)
 
 class StringDataset(Dataset):
     def __init__(self):
diff --git a/torch/utils/data/__init__.py b/torch/utils/data/__init__.py
index 0af9e6193af3d..ac0c763fe3854 100644
--- a/torch/utils/data/__init__.py
+++ b/torch/utils/data/__init__.py
@@ -35,7 +35,7 @@
     runtime_validation_disabled,
 )
 from torch.utils.data.dataloader_experimental import DataLoader2
-
+from torch.utils.data import communication
 
 __all__ = ['BatchSampler',
            'ChainDataset',
@@ -56,6 +56,7 @@
            'WeightedRandomSampler',
            '_DatasetKind',
            'argument_validation',
+           'communication',
            'functional_datapipe',
            'get_worker_info',
            'guaranteed_datapipes_determinism',
diff --git a/torch/utils/data/communication/__init__.py b/torch/utils/data/communication/__init__.py
new file mode 100644
index 0000000000000..88a395e2bddcf
--- /dev/null
+++ b/torch/utils/data/communication/__init__.py
@@ -0,0 +1,5 @@
+from . import eventloop
+from . import iter
+from . import messages
+from . import protocol
+from . import queue
diff --git a/torch/utils/data/communication/eventloop.py b/torch/utils/data/communication/eventloop.py
new file mode 100644
index 0000000000000..75c44c5192313
--- /dev/null
+++ b/torch/utils/data/communication/eventloop.py
@@ -0,0 +1,41 @@
+import torch
+import threading
+import pickle
+
+from torch.utils.data import IterDataPipe, communication
+
+
+def DataPipeToQueuesLoop(source_datapipe, req_queue, res_queue):
+    if isinstance(source_datapipe, IterDataPipe):
+        pipe_type = communication.iter
+        protocol_type = communication.protocol.IterDataPipeQueueProtocolServer
+    else:
+        raise Exception('Only supports IterDataPipe, got', source_datapipe)
+        # pipe_type = communication.map
+        # protocol_type = communication.protocol.MapDataPipeQueueProtocolServer
+
+    torch.set_num_threads(1)
+    for _ in pipe_type.DataPipeBehindQueues(source_datapipe, protocol_type(req_queue, res_queue), blocking_request_get=True):
+        pass
+
+
+def SpawnProcessForDataPipeline(multiprocessing_ctx, datapipe):
+    req_queue = multiprocessing_ctx.Queue()
+    res_queue = multiprocessing_ctx.Queue()
+    process = multiprocessing_ctx.Process(
+        target=DataPipeToQueuesLoop, args=(datapipe, req_queue, res_queue))
+    return process, req_queue, res_queue
+
+
+def SpawnThreadForDataPipeline(datapipe):
+    req_queue = communication.queue.ThreadingQueue()
+    res_queue = communication.queue.ThreadingQueue()
+
+    try:
+        new_datapipe = pickle.loads(pickle.dumps(datapipe))
+    except Exception as e:
+        raise Exception('Unable to pickle DataPipe to make thread local copy', e)
+
+    process = threading.Thread(target=DataPipeToQueuesLoop, args=(
+        new_datapipe, req_queue, res_queue), daemon=True)
+    return process, req_queue, res_queue, new_datapipe
diff --git a/torch/utils/data/communication/iter.py b/torch/utils/data/communication/iter.py
new file mode 100644
index 0000000000000..594a466295a5f
--- /dev/null
+++ b/torch/utils/data/communication/iter.py
@@ -0,0 +1,173 @@
+import time
+import types
+
+from torch.utils.data import IterDataPipe, communication
+
+DEFAULT_NON_BLOCKING_SLEEP = 0.001
+
+
+def default_not_available_hook():
+    time.sleep(DEFAULT_NON_BLOCKING_SLEEP)
+
+
+class NotAvailable(Exception):
+    pass
+
+
+class InvalidStateResetRequired(Exception):
+    """
+        Returned by DataPipe when it is expecting to get reset request,
+        for example RouterDataPipe expecting all workers to request reset'
+    """
+    pass
+
+
+class NonBlocking(IterDataPipe):
+    not_available_hook = default_not_available_hook
+
+    def __iter__(self):
+        self.reset_iterator()
+        return self
+
+    def __next__(self):
+        while True:
+            try:
+                return self.nonblocking_next()
+            except StopIteration:
+                raise StopIteration
+            except NotAvailable:
+                if NonBlocking.not_available_hook is not None:
+                    NonBlocking.not_available_hook()
+
+    def nonblocking_next(self):
+        raise NotImplementedError(
+            "nonblocking_next is not implemented for %s" % self.__class__)
+
+    def reset_iterator(self):
+        raise NotImplementedError(
+            "reset_iterator is not implemented for %s" % self.__class__)
+
+    @staticmethod
+    def register_not_available_hook(hook_function):
+        NonBlocking.not_available_hook = hook_function
+
+
+def EnsureNonBlockingDataPipe(validated_datapipe):
+    if not isinstance(validated_datapipe, IterDataPipe):
+        raise Exception('Not Iterable DataPipe ' +
+                        str(validated_datapipe.__class__))
+    if isinstance(validated_datapipe, NonBlocking):
+        return validated_datapipe
+    if not hasattr(validated_datapipe, '_as_iterator'):
+        validated_datapipe._as_iterator = None  # type: ignore[attr-defined]
+    if not hasattr(validated_datapipe, 'nonblocking_next'):
+        def nonblocking_next(self):
+            if self._as_iterator is None:
+                self._as_iterator = iter(self)
+            return next(self._as_iterator)
+        validated_datapipe.nonblocking_next = types.MethodType(  # type: ignore[attr-defined]
+            nonblocking_next, validated_datapipe)
+    if not hasattr(validated_datapipe, 'reset_iterator'):
+        def reset_iterator(self):
+            self._as_iterator = None
+        validated_datapipe.reset_iterator = types.MethodType(  # type: ignore[attr-defined]
+            reset_iterator, validated_datapipe)
+    return validated_datapipe
+
+
+def DataPipeBehindQueues(source_datapipe, protocol, full_stop=False, blocking_request_get=False):
+    """
+        Indefinitely iterates over req_queue and passing values from source_datapipe to res_queue
+        If raise_stop is true, raises exception when StopIteration received from the source_datapipe
+    """
+    if not isinstance(protocol, communication.protocol.IterDataPipeQueueProtocolServer):
+        raise Exception('Expecting IterDataPipeQueueProtocolServer, got', protocol)
+    source_datapipe = EnsureNonBlockingDataPipe(source_datapipe)
+    forever = True
+    while forever:
+
+        try:
+            # Non-blocking call is Extremely slow here for python.mp, need to figureout good workaround
+            request = protocol.get_new_request(block=blocking_request_get)
+        except communication.protocol.EmptyQueue:
+            yield True
+            continue
+
+        if isinstance(request, communication.messages.ResetIteratorRequest):
+            source_datapipe.reset_iterator()
+            protocol.response_reset()
+
+        elif isinstance(request, communication.messages.TerminateRequest):
+            forever = False
+            protocol.response_terminate()
+
+        elif isinstance(request, communication.messages.GetNextRequest):
+            while forever:
+                try:
+                    value = source_datapipe.nonblocking_next()
+                except NotAvailable:
+                    yield True
+                    continue
+                except StopIteration:
+                    protocol.response_stop()
+                    if full_stop:
+                        forever = False
+                    else:
+                        yield True
+                    break
+                except InvalidStateResetRequired:
+                    protocol.response_invalid()
+                    if full_stop:
+                        forever = False
+                    else:
+                        yield True
+                    break
+                protocol.response_next(value)
+                yield True  # Returns control
+                break
+        else:
+            raise Exception('Unrecognized type of request received', request)
+
+
+class QueueWrapper(NonBlocking):
+    """
+        Creates iter.DataPipe which reads data from the DataLoader.Queue
+    """
+
+    def __init__(self, protocol, response_wait_time=0.00001):
+        if not isinstance(protocol, communication.protocol.IterDataPipeQueueProtocolClient):
+            raise Exception('Got', protocol)
+
+        self.protocol = protocol
+        self.counter = 0
+        self._stop_iteration = False
+        self._response_wait_time = response_wait_time
+
+    def reset_iterator(self):
+        self._stop_iteration = False
+        self.counter = 0
+        self.protocol.request_reset()
+        while True:
+            try:
+                self.protocol.get_response_reset()
+                break
+            except communication.protocol.EmptyQueue:
+                if NonBlocking.not_available_hook is not None:
+                    NonBlocking.not_available_hook()
+
+    def nonblocking_next(self):
+        if self._stop_iteration:
+            raise Exception(
+                '`next` or `nonblocking_next` called after receiving StopIteration')
+        if self.protocol.can_take_request():
+            self.protocol.request_next()
+        try:
+            response = self.protocol.get_response_next(block=True, timeout=self._response_wait_time)
+        except communication.protocol.EmptyQueue:
+            raise NotAvailable
+        if isinstance(response, communication.messages.StopIterationResponse):
+            self._stop_iteration = True
+            raise StopIteration
+        if isinstance(response, communication.messages.InvalidStateResponse):
+            raise NotAvailable
+        return response.value
diff --git a/torch/utils/data/communication/messages.py b/torch/utils/data/communication/messages.py
new file mode 100644
index 0000000000000..449cf23cfc01c
--- /dev/null
+++ b/torch/utils/data/communication/messages.py
@@ -0,0 +1,75 @@
+class DataLoaderQueueMessage(object):
+    pass
+
+
+class Request(DataLoaderQueueMessage):
+    pass
+
+
+class Response(DataLoaderQueueMessage):
+    pass
+
+
+class ResetIteratorRequest(Request):
+    pass
+
+
+class ResetIteratorResponse(Response):
+    pass
+
+
+class TerminateRequest(Request):
+    pass
+
+
+class TerminateResponse(Response):
+    pass
+
+
+class LenRequest(Request):
+    pass
+
+
+class LenResponse(Response):
+    __slots__ = ('len')
+
+    def __init__(self, len):
+        self.len = len
+
+
+class GetItemRequest(Request):
+    __slots__ = ('key')
+
+    def __init__(self, key):
+        self.key = key
+
+
+class GetItemResponse(Response):
+    __slots__ = ('key', 'value')
+
+    def __init__(self, key, value):
+        self.key = key
+        self.value = value
+
+
+class GetNextRequest(Request):
+    pass
+
+
+class GetNextResponse(Response):
+    __slots__ = ('value')
+
+    def __init__(self, value):
+        self.value = value
+
+
+class StopIterationResponse(Response):
+    pass
+
+
+class InvalidStateResponse(Response):
+    """
+        Returned by DataPipe when it is expecting to get reset request,
+        for example RouterDataPipe expecting all workers to request reset'
+    """
+    pass
diff --git a/torch/utils/data/communication/protocol.py b/torch/utils/data/communication/protocol.py
new file mode 100644
index 0000000000000..68ff335714d3f
--- /dev/null
+++ b/torch/utils/data/communication/protocol.py
@@ -0,0 +1,159 @@
+from torch.utils.data import communication
+
+
+class Protocol(object):
+    __slots__ = ('request_queue', 'response_queue')
+
+    def __init__(self, request_queue, response_queue):
+        self.request_queue = request_queue
+        self.response_queue = response_queue
+
+
+class ProtocolClient(Protocol):
+    """
+        ProtocolClient takes charge of putting requests into req_queue and returning results from res_queue.
+    """
+    _req_sent = None
+
+    def __init__(self, request_queue, response_queue):
+        self.request_queue = request_queue
+        self.response_queue = response_queue
+        self._req_sent = None
+
+    def can_take_request(self):
+        return self._req_sent is None
+
+    def waiting_for_response(self):
+        return self._req_sent is not None
+
+    def request_sent(self, request=True):
+        if not self.can_take_request():
+            raise Exception('Protocol only supports one request in the Queue')
+        self._req_sent = request
+
+    def request_served(self, result=None):
+        if not self.waiting_for_response():
+            raise Exception(
+                'Expected no peding requests, but something got served', result)
+        self._req_sent = None
+
+
+class ProtocolServer(Protocol):
+    """
+        ProtocolServer takes charge of getting requests from req_queue and fetching data from source datapipe.
+    """
+    _req_received = None
+
+    def __init__(self, request_queue, response_queue):
+        self.request_queue = request_queue
+        self.response_queue = response_queue
+        self._req_received = None
+
+    def have_pending_request(self):
+        return self._req_received is not None
+
+    def get_new_request(self, block=False):
+        if self.have_pending_request():
+            raise Exception(
+                'Trying to get next request, while having one unserved')
+        try:
+            response = self.request_queue.get(block=block)
+        except Exception as e:  # TODO: Catch only timeout exceptions
+            raise EmptyQueue('queue is empty')
+        self._req_received = response
+        return response
+
+        # TODO: Validate supported requests
+
+    def response_reset(self):
+        if not self.have_pending_request():
+            raise Exception("Attempting to reply with pending request")
+        if not isinstance(self._req_received, communication.messages.ResetIteratorRequest):
+            raise Exception(
+                "Replaying with reset status to other type of message")
+        self.response_queue.put(communication.messages.ResetIteratorResponse())
+        self._req_received = None
+
+    def response_next(self, value):
+        if not self.have_pending_request():
+            raise Exception("Attempting to reply with pending request")
+        self.response_queue.put(communication.messages.GetNextResponse(value))
+        self._req_received = None
+
+    def response_stop(self):
+        if not self.have_pending_request():
+            raise Exception("Attempting to reply with pending request")
+        self.response_queue.put(communication.messages.StopIterationResponse())
+        self._req_received = None
+
+    def response_invalid(self):
+        if not self.have_pending_request():
+            raise Exception("Attempting to reply with pending request")
+        self.response_queue.put(communication.messages.InvalidStateResponse())
+        self._req_received = None
+
+    def response_terminate(self):
+        if not self.have_pending_request():
+            raise Exception("Attempting to reply with pending request")
+        if not isinstance(self._req_received, communication.messages.TerminateRequest):
+            raise Exception(
+                "Replaying with terminate status to other type of message")
+        self.response_queue.put(communication.messages.TerminateResponse())
+        self._req_received = None
+
+
+class MapDataPipeQueueProtocolClient(ProtocolClient):
+    pass
+
+
+class MapDataPipeQueueProtocolServer(ProtocolServer):
+    pass
+
+
+class EmptyQueue(Exception):
+    pass
+
+
+class IterDataPipeQueueProtocolServer(ProtocolServer):
+    pass
+
+
+class IterDataPipeQueueProtocolClient(ProtocolClient):
+    def request_reset(self):
+        if not self.can_take_request():
+            raise Exception(
+                'Can not reset while we are still waiting response for previous request')
+        request = communication.messages.ResetIteratorRequest()
+        self.request_queue.put(request)
+        self.request_sent(request)
+
+    def request_next(self):
+        if not self.can_take_request():
+            raise Exception(
+                'Can not request next item while we are still waiting response for previous request')
+        request = communication.messages.GetNextRequest()
+        self.request_queue.put(request)
+        self.request_sent(request)
+
+    def get_response_reset(self, block=False):
+        try:
+            response = self.response_queue.get(block=block)
+        except Exception as e:  # TODO: Catch only timeout exceptions
+            raise EmptyQueue('queue is empty')
+        self.request_served(response)
+
+        if not isinstance(response, communication.messages.ResetIteratorResponse):
+            raise Exception('Invalid response received')
+
+    def get_response_next(self, block=False, timeout=None):
+        if not self.waiting_for_response():
+            raise Exception(
+                'Can not expect any response without submitted request')
+        try:
+            response = self.response_queue.get(block=block, timeout=timeout)
+        except Exception as e:  # TODO: Catch only timeout exceptions
+            raise EmptyQueue('queue is empty')
+        self.request_served(response)
+
+        # TODO(VitalyFedyunin): Add possible response types validation here
+        return response
diff --git a/torch/utils/data/communication/queue.py b/torch/utils/data/communication/queue.py
new file mode 100644
index 0000000000000..7717697b0f75d
--- /dev/null
+++ b/torch/utils/data/communication/queue.py
@@ -0,0 +1,50 @@
+import threading
+import time
+
+class LocalQueue():
+    ops = 0
+    stored = 0
+    uid = 0
+    empty = 0
+
+    def __init__(self, name='unnamed'):
+        self.items = []
+        self.name = name
+        self.uid = LocalQueue.uid
+        LocalQueue.uid += 1
+
+    def put(self, item, block=True):
+        LocalQueue.ops += 1
+        LocalQueue.stored += 1
+        self.items.append(item)
+
+    def get(self, block=True, timeout=0):
+        # TODO(VitalyFedyunin): Add support of block and timeout arguments
+        LocalQueue.ops += 1
+        if not len(self.items):
+            LocalQueue.empty += 1
+            raise Exception('LocalQueue is empty')
+        LocalQueue.stored -= 1
+        return self.items.pop()
+
+
+class ThreadingQueue():
+    def __init__(self, name='unnamed'):
+        self.lock = threading.Lock()
+        self.items = []
+        self.name = name
+
+    def put(self, item, block=True):
+        with self.lock:
+            self.items.append(item)
+
+    def get(self, block=True, timeout=0):
+        # TODO(VitalyFedyunin): Add support of block and timeout arguments
+        while True:
+            with self.lock:
+                if len(self.items) > 0:
+                    return self.items.pop()
+            if not block:
+                raise Exception("Not available")
+            # TODO(VitalyFedyunin): Figure out what to do if nothing in the queue
+            time.sleep(0.000001)
diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py
index ea085298bf00f..a74c75cd75122 100644
--- a/torch/utils/data/dataloader_experimental.py
+++ b/torch/utils/data/dataloader_experimental.py
@@ -1,10 +1,60 @@
 
 import functools
+import time
+
+from typing import Any, List
 
 import torch.utils.data.backward_compatibility
-from torch.utils.data import DataLoader, IterDataPipe
+
+import torch.utils.data.sharding
+from torch.utils.data import DataLoader, IterDataPipe, communication
 from torch.utils.data.datapipes.iter import IterableWrapper
 
+class _ThreadingDataLoader2:
+
+    def __init__(self, datapipe, num_workers=0, collate_fn=None):
+        self.threads = []
+        self.datapipes = []
+        self.collate_fn = collate_fn
+        for worker_id in range(num_workers):
+            (thread, req_queue, res_queue, thread_localdatapipe) = communication.eventloop.SpawnThreadForDataPipeline(datapipe)
+            torch.utils.data.sharding.apply_sharding(thread_localdatapipe, num_workers, worker_id)
+            thread.start()
+            self.threads.append((thread, req_queue, res_queue))
+            local_datapipe = communication.iter.QueueWrapper(
+                communication.protocol.IterDataPipeQueueProtocolClient(req_queue, res_queue))
+            self.datapipes.append(local_datapipe)
+
+    def __iter__(self):
+        not_available = False
+        forever = True
+        exclude_datapipes: List[Any] = []
+        while len(exclude_datapipes) < len(self.datapipes):
+            for dp in self.datapipes:
+                if dp not in exclude_datapipes:
+                    try:
+                        value = dp.nonblocking_next()
+                        yield value
+                    except StopIteration:
+                        exclude_datapipes.append(dp)
+                    except communication.iter.NotAvailable:
+                        not_available = True
+            if not_available:
+                time.sleep(0.001)
+
+    def __del__(self):
+        self._cleanup_all_threads()
+
+    def _cleanup_all_threads(self):
+        def clean_me(thread, req_queue, res_queue):
+            req_queue.put(communication.messages.TerminateRequest())
+            _ = res_queue.get()
+            thread.join()
+
+        for thread, req_queue, res_queue in self.threads:
+            clean_me(thread, req_queue, res_queue)
+
+
 class DataLoader2:
     def __new__(cls,
                 dataset,
@@ -21,15 +71,17 @@ def __new__(cls,
                 *,
                 prefetch_factor=2,
                 persistent_workers=False,
-                batch_outside_worker=False):
+                batch_outside_worker=False,
+                parallelism_mode='mp'):
         if isinstance(dataset, IterDataPipe):
-            datapipe = dataset
+            data_loader: Any = None
             if batch_sampler is not None:
                 raise Exception(
-                    'batch_sampler is not yet supported for DataPipes')
+                    'batch_sampler is not yet supported by DataPipes')
             if sampler is not None:
                 raise Exception(
-                    'sampler is not yet supported for DataPipes')
+                    'sampler is not yet supported by DataPipes')
+            datapipe = dataset
             if shuffle:
                 datapipe = datapipe.shuffle()
             if batch_outside_worker and pin_memory:
@@ -40,30 +92,43 @@ def __new__(cls,
                     datapipe = datapipe.batch(batch_size, drop_last=drop_last)
                     if collate_fn is None:
                         collate_fn = torch.utils.data._utils.collate.default_collate
+            if parallelism_mode == 'mp' or num_workers == 0:
+                def sharding_worker_init_fn(worker_init_fn, worker_id):
+                    if worker_init_fn is not None:
+                        worker_init_fn(worker_id)
+                    torch.utils.data.backward_compatibility.worker_init_fn(
+                        worker_id)
 
-            def sharding_worker_init_fn(worker_init_fn, worker_id):
-                if worker_init_fn is not None:
-                    worker_init_fn(worker_id)
-                torch.utils.data.backward_compatibility.worker_init_fn(
-                    worker_id)
-
-            my_worker_init_fn = functools.partial(
-                sharding_worker_init_fn, worker_init_fn)
-
-            data_loader = DataLoader(datapipe,
-                                     batch_size=None,  # Replaced by .batch DataPipe
-                                     shuffle=False,  # Replaced by .shuffle DataPipe
-                                     sampler=None,
-                                     batch_sampler=None,
-                                     num_workers=num_workers,
-                                     collate_fn=collate_fn,
-                                     pin_memory=pin_memory,
-                                     drop_last=False,  # Replaced by .batch DataPipe
-                                     timeout=timeout,
-                                     worker_init_fn=my_worker_init_fn,
-                                     prefetch_factor=prefetch_factor,
-                                     persistent_workers=persistent_workers)
+                my_worker_init_fn = functools.partial(
+                    sharding_worker_init_fn, worker_init_fn)
 
+                data_loader = DataLoader(datapipe,
+                                         batch_size=None,  # Replaced by .batch DataPipe
+                                         shuffle=False,  # Replaced by .shuffle DataPipe
+                                         sampler=None,
+                                         batch_sampler=None,
+                                         num_workers=num_workers,
+                                         collate_fn=collate_fn,
+                                         pin_memory=pin_memory,
+                                         drop_last=False,  # Replaced by .batch DataPipe
+                                         timeout=timeout,
+                                         worker_init_fn=my_worker_init_fn,
+                                         prefetch_factor=prefetch_factor,
+                                         persistent_workers=persistent_workers)
+            elif parallelism_mode == 'thread':
+                if collate_fn is not None and not batch_outside_worker:
+                    datapipe = datapipe.map(collate_fn)
+                if pin_memory:
+                    raise Exception(
+                        'pin_memory is not yet supported by DataPipes with Threading')
+                if worker_init_fn is not None:
+                    raise Exception(
+                        'worker_init_fn is not yet supported by DataPipes with Threading')
+                data_loader = _ThreadingDataLoader2(datapipe,
+                                                    num_workers=num_workers,
+                                                    collate_fn=collate_fn)
+            else:
+                raise Exception('Unsupported parallelism mode', parallelism_mode)
             if not batch_outside_worker:
                 return data_loader
             else:
@@ -72,8 +137,11 @@ def sharding_worker_init_fn(worker_init_fn, worker_id):
                 datapipe = IterableWrapper(data_loader).batch(
                     batch_size, drop_last=drop_last).map(collate_fn)
                 return datapipe
-
         else:
+            if parallelism_mode != 'thread':
+                raise Exception(
+                    'thread parallelism mode is not supported for old DataSets')
+
             return DataLoader(dataset,
                               batch_size=batch_size,
                               shuffle=shuffle,

From f79df24859125fca8e8af799fb2be44dc3293752 Mon Sep 17 00:00:00 2001
From: Facebook Community Bot <facebook-github-bot@users.noreply.github.com>
Date: Mon, 30 Aug 2021 08:27:36 -0700
Subject: [PATCH 343/530] Automated submodule update: FBGEMM (#64149)

Summary:
This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).

New submodule commit: https://github.com/pytorch/FBGEMM/commit/f6dfed87a10ed5729bce83e98788e437a94cbda0

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64149

Test Plan: Ensure that CI jobs succeed on GitHub before landing.

Reviewed By: jspark1105

Differential Revision: D30632209

fbshipit-source-id: aa1cebaf50169c3a93dbcb994fa47e29d6b6a0d7
---
 third_party/fbgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/fbgemm b/third_party/fbgemm
index 9939bac9defab..9f4078a7bb92b 160000
--- a/third_party/fbgemm
+++ b/third_party/fbgemm
@@ -1 +1 @@
-Subproject commit 9939bac9defab4d18fb7fdded7e1a76c0c2b49b4
+Subproject commit 9f4078a7bb92b88cdcfc913398ffade158160c91

From dc4fd3bddab51970e53060f8cb1a3c316a28b042 Mon Sep 17 00:00:00 2001
From: Raghavan Raman <raghavanr@fb.com>
Date: Mon, 30 Aug 2021 09:26:20 -0700
Subject: [PATCH 344/530] [MicroBench] Added a micro benchmark for a signed
 log1p kernel. (#64032)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64032

Test Plan: Imported from OSS

Reviewed By: ezyang

Differential Revision: D30579198

Pulled By: navahgar

fbshipit-source-id: a53d68225fba768b26491d14b535f8f2dcf50c0e
---
 benchmarks/cpp/tensorexpr/CMakeLists.txt      |   1 +
 .../cpp/tensorexpr/bench_signed_log1p.cpp     | 120 ++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp

diff --git a/benchmarks/cpp/tensorexpr/CMakeLists.txt b/benchmarks/cpp/tensorexpr/CMakeLists.txt
index 789c81fcf6526..a06502eb29053 100644
--- a/benchmarks/cpp/tensorexpr/CMakeLists.txt
+++ b/benchmarks/cpp/tensorexpr/CMakeLists.txt
@@ -6,6 +6,7 @@ add_executable(
   bench_batchnorm.cpp
   bench_concat.cpp
   bench_compile.cpp
+  bench_signed_log1p.cpp
   bench_fuser_overhead.cpp
   bench_gemm.cpp
   bench_parallel.cpp
diff --git a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp
new file mode 100644
index 0000000000000..44781f58c9027
--- /dev/null
+++ b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp
@@ -0,0 +1,120 @@
+#include <benchmark/benchmark.h>
+
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/tensorexpr/ir.h>
+#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
+#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
+#include <torch/csrc/jit/tensorexpr/loopnest.h>
+#include <torch/csrc/jit/tensorexpr/tensor.h>
+#include <torch/torch.h>
+
+using namespace torch::jit::tensorexpr;
+
+namespace {
+
+class SignedLog1pBench : public benchmark::Fixture {
+ public:
+  void SetUp(const benchmark::State& state) override {
+    input_size_ = {state.range(0), state.range(1)};
+    input_size_int_ = {state.range(0), state.range(1)};
+    input_ = torch::rand(input_size_);
+    ref_ = signedLog1p(input_);
+  }
+
+  void TearDown(benchmark::State& state) override {
+    TORCH_CHECK(at::allclose(ref_, output_));
+    state.counters["GB/s"] = benchmark::Counter(
+        uint64_t(state.iterations()) * 2 * output_.nbytes(),
+        benchmark::Counter::kIsRate);
+  }
+
+  at::Tensor signedLog1p(const at::Tensor& inp) {
+    auto sign = at::sign(inp);
+    auto log1p = at::log1p(at::abs(inp));
+    return sign * log1p;
+  }
+
+  void runATen(benchmark::State& state) {
+    for (auto _ : state) {
+      output_ = signedLog1p(input_);
+    }
+  }
+
+  void runNNC(benchmark::State& state) {
+    Placeholder input_ph(
+        "input", kFloat, {input_size_int_[0], input_size_int_[1]});
+    Tensor abs_result = Compute(
+        "aten_abs",
+        {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+        [&](const VarHandle& m, const VarHandle& n) {
+          return abs(input_ph.load(m, n));
+        });
+    Tensor log1p_result = Compute(
+        "aten_log1p",
+        {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+        [&](const VarHandle& m, const VarHandle& n) {
+          return log1p(abs_result.load(m, n));
+        });
+    Tensor sign = Compute(
+        "aten_sign",
+        {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+        [&](const VarHandle& m, const VarHandle& n) {
+          return CompareSelect::make(
+              input_ph.load(m, n),
+              ExprHandle(0.0f),
+              ExprHandle(-1),
+              ExprHandle(1),
+              kLT);
+        });
+    Tensor output = Compute(
+        "aten_mul",
+        {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
+        [&](const VarHandle& m, const VarHandle& n) {
+          return sign.load(m, n) * log1p_result.load(m, n);
+        });
+    LoopNest nest({output}, {abs_result, log1p_result, sign, output});
+    GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
+    nest.inlineIntermediateBufs(true);
+    nest.prepareForCodegen();
+    nest.simplify();
+    nest.vectorizeInnerLoops();
+    nest.simplify();
+    GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
+
+    // StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
+    std::vector<CodeGen::BufferArg> buf_args;
+    buf_args.push_back(input_ph);
+    buf_args.push_back(output);
+    LLVMCodeGen cg(nest.root_stmt(), buf_args);
+
+    std::vector<CodeGen::CallArg> call_args;
+    for (auto _ : state) {
+      output_ = at::empty_like(ref_);
+      call_args.clear();
+      call_args.push_back(input_.data_ptr<float>());
+      call_args.push_back(output_.data_ptr<float>());
+      cg.call(call_args);
+    }
+  }
+
+ private:
+  std::vector<long> input_size_;
+  std::vector<int> input_size_int_;
+  at::Tensor input_;
+  at::Tensor output_;
+  at::Tensor ref_;
+};
+
+} // namespace
+
+BENCHMARK_DEFINE_F(SignedLog1pBench, ATen)(benchmark::State& state) {
+  runATen(state);
+}
+
+BENCHMARK_DEFINE_F(SignedLog1pBench, NNC)(benchmark::State& state) {
+  runNNC(state);
+}
+
+BENCHMARK_REGISTER_F(SignedLog1pBench, ATen)->Args({10, 1467});
+
+BENCHMARK_REGISTER_F(SignedLog1pBench, NNC)->Args({10, 1467});

From 9777887f0ebe6403e19205eb1cafb81fe24606fb Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Mon, 30 Aug 2021 09:34:24 -0700
Subject: [PATCH 345/530] [PyTorch] Reduce copies/refcount bumps in
 BytecodeDeserializer::parseMethods (#63961)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63961

Saw a report that this function was slow and was doing unexplained vector copies. First pass to remove a bunch of copying.
ghstack-source-id: 136760976

Test Plan:
Pixel 3
before: https://our.intern.facebook.com/intern/aibench/details/461850118893980
after: https://www.internalfb.com/intern/aibench/details/48965886029524

MilanBoard failed to return data from simpleperf

Reviewed By: dhruvbird

Differential Revision: D30544551

fbshipit-source-id: 0e2b5471a10c0803d52c923e6fb5625f5542b99d
---
 torch/csrc/jit/mobile/import.cpp | 44 +++++++++++++-------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp
index d2865d071ea72..6a548103f6965 100644
--- a/torch/csrc/jit/mobile/import.cpp
+++ b/torch/csrc/jit/mobile/import.cpp
@@ -85,8 +85,8 @@ using caffe2::serialize::ReadAdapterInterface;
 
 OpCode parseOpCode(const char* str);
 
-IValue expect_field(
-    IValue tup,
+const IValue& expect_field(
+    const IValue& tup,
     const std::string& expected_name,
     size_t entry) {
   auto row = tup.toTuple()->elements().at(entry).toTuple();
@@ -325,8 +325,7 @@ void BytecodeDeserializer::parseMethods(
       " but the model version is ",
       model_version);
 
-  bool has_debug_handles = debug_handles.has_value();
-  if (has_debug_handles) {
+  if (debug_handles) {
     TORCH_CHECK(
         debug_handles->size() == vals.size(),
         "The numbers of bytecode values and debug info values do not match.");
@@ -340,12 +339,11 @@ void BytecodeDeserializer::parseMethods(
     const auto& element = vals[i];
     const auto& m_tuple = element.toTuple()->elements();
     const std::string& function_name = m_tuple[0].toStringRef();
-    IValue codeTable = m_tuple[1];
-    auto schemaTable = // older files do not store function schema
+    const IValue& codeTable = m_tuple[1];
+    const IValue* schemaTable = // older files do not store function schema
         (model_version > 0x4L || (model_version == 0x4L && m_tuple.size() >= 3))
-        ? at::optional<IValue>{m_tuple[2]}
-        : at::nullopt;
-
+        ? &m_tuple[2]
+        : nullptr;
     auto function =
         std::make_unique<mobile::Function>(c10::QualifiedName(function_name));
 
@@ -369,8 +367,8 @@ void BytecodeDeserializer::parseMethods(
         expect_field(codeTable, "register_size", BYTECODE_INDEX_REGISTER_SIZE)
             .toInt();
 
-    std::vector<IValue> debug_handles_list;
-    if (has_debug_handles) {
+    c10::List<int64_t> debug_handles_list;
+    if (debug_handles) {
       const auto& debug_handles_element = (*debug_handles)[i];
       const auto& debug_handles_m_tuple =
           debug_handles_element.toTuple()->elements();
@@ -379,22 +377,21 @@ void BytecodeDeserializer::parseMethods(
       TORCH_CHECK(
           debug_info_function_name == function_name,
           "The function names in the bytecode table and the debug info table do not match.");
-      IValue debug_handles_table = debug_handles_m_tuple[1];
+      const IValue& debug_handles_table = debug_handles_m_tuple[1];
       debug_handles_list = (expect_field(
                                 debug_handles_table,
                                 "function_debug_handles",
                                 BYTECODE_INDEX_MODULE_DEBUG_HANDLES)
                                 .toTuple()
                                 ->elements())[0]
-                               .toList()
-                               .vec();
+                               .toIntList();
       TORCH_CHECK(
           debug_handles_list.size() == ins_list.size(),
           "The numbers of instructions and debug handles strings do not match.");
     }
 
     for (const auto j : c10::irange(ins_list.size())) {
-      auto ins_item = ins_list[j].toTuple()->elements();
+      const auto& ins_item = ins_list[j].toTuple()->elements();
       TORCH_CHECK(
           ins_item.size() == 3,
           "There should be three parts in an instruction. The function name is ",
@@ -402,8 +399,8 @@ void BytecodeDeserializer::parseMethods(
       OpCode op_code = parseOpCode(ins_item[0].toString()->string().c_str());
       int X = ins_item[1].toInt();
       int N = ins_item[2].toInt();
-      if (has_debug_handles) {
-        int64_t debug_handle = debug_handles_list[j].toInt();
+      if (debug_handles) {
+        int64_t debug_handle = debug_handles_list[j];
         function->append_instruction(op_code, X, N, debug_handle);
       } else {
         function->append_instruction(op_code, X, N);
@@ -451,14 +448,9 @@ void BytecodeDeserializer::parseMethods(
           const auto& type = resolveTypeName(
               (expect_field(argTable, "type", BYTECODE_INDEX_ARGUMENT_TYPE))
                   .toStringRef());
-          auto default_value = expect_field(
-                                   argTable,
-                                   "default_value",
-                                   BYTECODE_INDEX_ARGUMENT_DEFAULT_VALUE)
-                                   .toIValue();
-          auto arg =
-              c10::Argument(name, type, c10::nullopt /*N*/, default_value);
-          args.emplace_back(std::move(arg));
+          const IValue& default_value = expect_field(
+              argTable, "default_value", BYTECODE_INDEX_ARGUMENT_DEFAULT_VALUE);
+          args.emplace_back(name, type, c10::nullopt /*N*/, default_value);
         }
         return args;
       };
@@ -522,7 +514,7 @@ mobile::Module BytecodeDeserializer::deserialize(
   // being a Tuple (int, table), and the integer stands for the bytecode version
   // number. The rest of the elements are the same as before.
   //
-  auto bvals = readArchive("bytecode", mcu).toTuple()->elements();
+  auto bvals = std::move(*readArchive("bytecode", mcu).toTuple()).elements();
 
   c10::optional<std::vector<IValue>> debug_handles;
   if (reader_->hasRecord("mobile_debug_handles.pkl")) {

From 16ecdbbaa2a24debae1c80b441bbea945d61d02d Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Mon, 30 Aug 2021 09:34:24 -0700
Subject: [PATCH 346/530] [PyTorch] Fix missing move in unpickler (#63974)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63974

Saw some time spent in this for model loading, no reason not to move here.
ghstack-source-id: 136760979

Test Plan: Re-profile model loading on devserver; IValue copy ctor time has gone down

Reviewed By: dhruvbird

Differential Revision: D30548923

fbshipit-source-id: 42000f2e18582762b43353cca10ae094833de3b3
---
 torch/csrc/jit/serialization/unpickler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp
index f944387465446..b521dc88a12ba 100644
--- a/torch/csrc/jit/serialization/unpickler.cpp
+++ b/torch/csrc/jit/serialization/unpickler.cpp
@@ -318,7 +318,7 @@ PickleOpCode Unpickler::readInstruction() {
       tuple->elements().reserve(stack_.size() - start);
       auto start_it = stack_.begin() + start;
       for (auto it = start_it; it != stack_.end(); ++it) {
-        tuple->elements().emplace_back(*it);
+        tuple->elements().emplace_back(std::move(*it));
       }
       stack_.erase(start_it, stack_.end());
       stack_.emplace_back(std::move(tuple));

From e24c3644d87acfb0359cb14bde4afcd62a9255ba Mon Sep 17 00:00:00 2001
From: Harut Movsisyan <harutm@fb.com>
Date: Mon, 30 Aug 2021 09:36:46 -0700
Subject: [PATCH 347/530] [Static Runtime] aten::cat out version when it is not
 being replaced by prim::VarConcat (#64157)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64157

UseVariadicCat optimization is not applied to aten::cat if list input to the op can not be moved to the position before op (https://fburl.com/diffusion/l6kweimu). For these cases we will need out version for SR.

Test Plan:
Confirm out variant is called:
```
> buck run //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1
```

Reviewed By: d1jang

Differential Revision: D30598574

fbshipit-source-id: 74cfa8291dc8b5df4aef58adfb1ab2a16f10d90a
---
 benchmarks/static_runtime/test_scripts.h      | 11 ++++++++++
 .../static_runtime/test_static_runtime.cc     | 20 +++++++++++++++++++
 torch/csrc/jit/runtime/static/ops.cpp         | 20 +++++++++++++++++++
 3 files changed, 51 insertions(+)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 004319ca550f9..7fdb113c4ed45 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -790,3 +790,14 @@ const auto linalg_norm_ord_str = R"JIT(
   def forward(self, a: Tensor, ord: str, dim: List[int], keepdim: bool, dtype: int):
       return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone()
 )JIT";
+
+const std::string cat_script = R"IR(
+  graph(%a: Tensor, %b: Tensor, %dim: int):
+      %ten_list: Tensor[] = prim::ListConstruct(%a, %b)
+      %1 : int = prim::Constant[value=0]()
+      %2 : int = prim::Constant[value=1]()
+      %3 : int = prim::Constant[value=1]()
+      %ten_list2 : Tensor[] = aten::slice(%ten_list, %1, %2, %3)
+      %ret: Tensor = aten::cat(%ten_list2, %dim)
+      return (%ret)
+)IR";
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index f6e3680e0be38..b7201baa1e182 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
+#include <torch/csrc/jit/ir/irparser.h>
 #include <torch/csrc/jit/runtime/static/fusion.h>
 #include <torch/csrc/jit/runtime/static/impl.h>
 #include <torch/csrc/jit/runtime/static/passes.h>
@@ -1291,3 +1292,22 @@ TEST(StaticRuntime, IndividualOps_LinalgNorm_StringOrd) {
   std::vector<IValue> args1{b, "fro", dim, true, dtype};
   testStaticRuntime(linalg_norm_ord_str, args0, args1);
 }
+
+TEST(StaticRuntime, IndividualOps_Cat) {
+  auto graph = std::make_shared<Graph>();
+  std::unordered_map<std::string, Value*> vmap;
+  parseIR(cat_script, graph.get(), vmap);
+  torch::jit::StaticModule smodule(graph);
+  ASSERT_TRUE(getNodeWithKind(smodule, "aten::cat"));
+
+  auto a = at::randn({2, 4});
+  auto b = at::randn({3, 4});
+  std::vector<IValue> args0{a, b, 0};
+
+  testStaticRuntime(cat_script, args0);
+
+  auto c = at::randn({3, 4});
+  auto d = at::randn({3, 5});
+  std::vector<IValue> args1{c, d, 1};
+  testStaticRuntime(cat_script, args0, args1);
+}
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 12339301e0433..cf91f33a28c26 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1713,6 +1713,26 @@ REGISTER_OPERATOR_FUNCTOR(aten::linalg_norm, aten_linalg_norm, [](Node* n) -> SR
   };
 });
 
+REGISTER_OPERATOR_FUNCTOR(aten::cat, aten_cat, [](Node* n) -> SROperator {
+  if (!n->matches(
+          torch::schema("aten::cat(Tensor[] tensors, int dim=0) -> Tensor"))) {
+    LogAndDumpSchema(n);
+    return nullptr;
+  }
+  return [](ProcessedNode* p_node) {
+    const auto inputs = p_node->Input(0).toTensorVector();
+    const auto dim = p_node->Input(1).toInt();
+    if (p_node->Output(0).isNone()) {
+      p_node->Output(0) = at::native::_cat_cpu(inputs, dim);
+      return;
+    }
+
+    auto& output = p_node->Output(0).toTensor();
+    fastResizeToZero(output);
+    at::native::_cat_out_cpu(inputs, dim, output);
+  };
+});
+
 namespace {
 
 void check_cat_no_zero_dim(const std::vector<at::Tensor>& tensors) {

From 93d2e5090f9823102debab3845117c8e8208995b Mon Sep 17 00:00:00 2001
From: = <=>
Date: Mon, 30 Aug 2021 09:43:25 -0700
Subject: [PATCH 348/530] Improve performance of index_select by avoiding item
 (#63008)

Summary:
Partially fixes https://github.com/pytorch/pytorch/issues/61788

From a CUDA perspective: item already pulls all Tensor content onto the host (albeit one-by-one), which incurs very expensive memory transfers. This way we'll do it all at once.
From a CPU perspective: item has a lot of overhead as a native function in comparison to simply using a pointer.

Overall there's still lots of performance gains to be had, but this is a small change that should take us into a more usable landscape. This doesn't land a separate benchmark, but I postulate that's not necessary to decide on the benefit of this (we'll also see if it shows up indirectly), however is still a good follow-up item.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63008

Reviewed By: zou3519

Differential Revision: D30211160

Pulled By: cpuhrsch

fbshipit-source-id: 70b752be5df51afc66b5aa1c77135d1205520cdd
---
 aten/src/ATen/native/TensorShape.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 2545ec4c1e035..1dc2a270c44c2 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -1209,12 +1209,15 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)
 
   if (dim < sparse_dim) {
 
-    auto dim_indices = indices[dim];
+    auto cpu_dim_indices = indices[dim].to(c10::kCPU).contiguous();
+    int64_t* cpu_dim_indices_ptr = cpu_dim_indices.data_ptr<int64_t>();
+    auto cpu_index = index.to(c10::kCPU).contiguous();
+    int64_t* cpu_index_ptr = cpu_index.data_ptr<int64_t>();
     std::vector<int64_t> zindices;
     std::vector<int64_t> iindices;
     int64_t new_nnz = 0;
-    for (const auto i : c10::irange(new_sizes[dim])) {
-      auto idx = index[i].item<int64_t>();
+    for (int64_t i = 0; i < new_sizes[dim]; i++) {
+      int64_t idx = cpu_index_ptr[i];
       if (idx < -size || idx >= size) {
         TORCH_CHECK_INDEX(false, "index_select(): index contains ", idx, " that is out of range for tensor of size ",
                    self.sizes(), " at dimension ", dim);
@@ -1222,8 +1225,8 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)
       if (idx < 0) {
         idx += size;
       }
-      for (const auto j : c10::irange(nnz)) {
-        auto jdx = dim_indices[j].item<int64_t>();
+      for (int64_t j = 0; j < nnz; j++) {
+        int64_t jdx = cpu_dim_indices_ptr[j];
         if (idx == jdx) {
           new_nnz++;
           iindices.push_back(i);

From ac99d63f83ceaee4a95e7baa8a52fba09903d00b Mon Sep 17 00:00:00 2001
From: Zhengxu Chen <zhxchen17@fb.com>
Date: Mon, 30 Aug 2021 11:46:14 -0700
Subject: [PATCH 349/530] [jit] Make operation call accept Stack& instead
 Stack* (#63414)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63414

Misuse of raw pointer in here where stack is never nullable.
ghstack-source-id: 136938318

Test Plan:
compiles.

Imported from OSS

Reviewed By: ejguan

Differential Revision: D30375410

fbshipit-source-id: 9d65b620bb76d90d886c800f54308520095d58ee
---
 aten/src/ATen/core/dispatch/Dispatcher.h      |   4 +
 aten/src/ATen/core/stack.h                    |  40 +-
 test/cpp/jit/test_alias_analysis.cpp          |  14 +-
 test/cpp/jit/test_custom_operators.cpp        |  14 +-
 test/cpp/jit/test_misc.cpp                    |   4 +-
 test/cpp/jit/test_schema_matching.cpp         |   4 +-
 test/cpp/jit/test_utils.cpp                   |   2 +-
 test/custom_operator/test_custom_ops.cpp      |   2 +-
 torch/csrc/autograd/record_function_ops.cpp   |   2 +-
 .../rpc/request_callback_no_python.cpp        |   2 +-
 torch/csrc/jit/codegen/cuda/interface.cpp     |   6 +-
 torch/csrc/jit/codegen/fuser/fallback.cpp     |   2 +-
 torch/csrc/jit/mobile/function.cpp            |   2 +-
 torch/csrc/jit/passes/batch_mm.cpp            |  16 +-
 .../csrc/jit/passes/constant_propagation.cpp  |   2 +-
 torch/csrc/jit/passes/decompose_ops.cpp       |   4 +-
 .../csrc/jit/passes/frozen_ops_to_mkldnn.cpp  |  18 +-
 torch/csrc/jit/passes/shape_analysis.cpp      |   2 +-
 torch/csrc/jit/passes/tensorexpr_fuser.cpp    |   4 +-
 .../passes/utils/check_alias_annotation.cpp   |   2 +-
 torch/csrc/jit/python/pybind_utils.h          |   2 +-
 torch/csrc/jit/python/python_interpreter.cpp  |   4 +-
 torch/csrc/jit/runtime/graph_executor.cpp     |  12 +-
 torch/csrc/jit/runtime/interpreter.cpp        |   4 +-
 torch/csrc/jit/runtime/register_c10_ops.cpp   |   2 +-
 torch/csrc/jit/runtime/register_cuda_ops.cpp  |  24 +-
 .../jit/runtime/register_distributed_ops.cpp  |  30 +-
 torch/csrc/jit/runtime/register_ops_utils.cpp |  52 +--
 torch/csrc/jit/runtime/register_ops_utils.h   | 138 +++----
 torch/csrc/jit/runtime/register_prim_ops.cpp  | 342 +++++++++---------
 .../jit/runtime/register_prim_ops_fulljit.cpp |  62 ++--
 .../csrc/jit/runtime/register_special_ops.cpp |  34 +-
 torch/csrc/jit/runtime/static/fusion.cpp      |   6 +-
 torch/csrc/jit/runtime/static/impl.cpp        |   2 +-
 34 files changed, 451 insertions(+), 409 deletions(-)

diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h
index fd32a72c75102..cfa6b740f8877 100644
--- a/aten/src/ATen/core/dispatch/Dispatcher.h
+++ b/aten/src/ATen/core/dispatch/Dispatcher.h
@@ -344,6 +344,10 @@ class TORCH_API OperatorHandle {
     c10::Dispatcher::singleton().callBoxed(*this, stack);
   }
 
+  void callBoxed(Stack& stack) const {
+    callBoxed(&stack);
+  }
+
   void redispatchBoxed(DispatchKeySet ks, Stack* stack) const {
     c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack);
   }
diff --git a/aten/src/ATen/core/stack.h b/aten/src/ATen/core/stack.h
index ffc0e8fd9037d..021e8a02104f2 100644
--- a/aten/src/ATen/core/stack.h
+++ b/aten/src/ATen/core/stack.h
@@ -1,6 +1,9 @@
 #pragma once
 
+#include <type_traits>
+
 #include <ATen/core/ivalue.h>
+#include <c10/util/Deprecated.h>
 
 // TODO move this to c10 namespace
 
@@ -9,7 +12,42 @@ namespace jit {
 
 using c10::IValue;
 using Stack = std::vector<IValue>;
-using Operation = std::function<void(Stack*)>;
+
+class Operation {
+  template <typename F, typename Arg>
+  using accepts = std::is_constructible<std::function<void(Arg)>, F&&>;
+
+ public:
+  template <typename F,
+            std::enable_if_t<accepts<F, Stack*>::value, int> = 0>
+  C10_DEPRECATED_MESSAGE("Please use void(Stack&) to register operator instead.")
+  Operation(F&& raw): op_([raw = std::forward<F>(raw)](Stack& stack) {
+    raw(&stack);
+  }) {}
+
+  template <typename F,
+            std::enable_if_t<accepts<F, Stack&>::value &&
+                !std::is_same<std::decay_t<F>, Operation>::value, int> = 0>
+  Operation(F&& op): op_(std::forward<F>(op)) {}
+
+  Operation(std::nullptr_t) noexcept {}
+
+  explicit operator bool() const noexcept {
+    return op_ ? true : false;
+  }
+
+  void operator()(Stack& stack) {
+    op_(stack);
+  }
+
+  template <typename T>
+  T* target() noexcept {
+    return op_.target<T>();
+  }
+
+ private:
+  std::function<void(Stack&)> op_;
+};
 
 // An operation with N inputs and M outputs pops the last N inputs off
 // the stack and pushes its M inputs onto the stack
diff --git a/test/cpp/jit/test_alias_analysis.cpp b/test/cpp/jit/test_alias_analysis.cpp
index 1bd556a8980b7..eef529d8d5d33 100644
--- a/test/cpp/jit/test_alias_analysis.cpp
+++ b/test/cpp/jit/test_alias_analysis.cpp
@@ -1,11 +1,11 @@
 #include <gtest/gtest.h>
 
 #include <torch/csrc/autograd/generated/variable_factories.h>
+#include <torch/csrc/jit/frontend/ir_emitter.h>
+#include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/ir/irparser.h>
-#include "torch/csrc/jit/frontend/ir_emitter.h"
-#include "torch/csrc/jit/ir/alias_analysis.h"
-#include "torch/csrc/jit/runtime/custom_operator.h"
-#include "torch/csrc/utils/memory.h"
+#include <torch/csrc/jit/runtime/custom_operator.h>
+#include <torch/csrc/utils/memory.h>
 
 namespace torch {
 namespace jit {
@@ -484,7 +484,7 @@ TEST(AliasAnalysisTest, SafeToChangeAliasingRelationship) {
 TEST(WriteTrackingTest, Basic) {
   RegisterOperators reg({Operator(
       "prim::creates_alias(Tensor(a) x) -> Tensor(a)",
-      [](Stack* s) {},
+      [](Stack&) {},
       aliasAnalysisFromSchema())});
   const auto creates_alias = Symbol::fromQualString("prim::creates_alias");
   auto graph = std::make_shared<Graph>();
@@ -949,11 +949,11 @@ TEST(WildcardsTest, Basic) {
   RegisterOperators reg(
       {Operator(
            "prim::returns_wildcard(Tensor a) -> Tensor(*)",
-           [](Stack* stack) {},
+           [](Stack&) {},
            aliasAnalysisFromSchema()),
        Operator(
            "prim::writes(Tensor(z!) a) -> Tensor(a)",
-           [](Stack* stack) {},
+           [](Stack&) {},
            aliasAnalysisFromSchema())});
   const auto returns_wildcard =
       Symbol::fromQualString("prim::returns_wildcard");
diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp
index a34ca33672c7b..39be82ea23430 100644
--- a/test/cpp/jit/test_custom_operators.cpp
+++ b/test/cpp/jit/test_custom_operators.cpp
@@ -31,7 +31,7 @@ TEST(CustomOperatorTest, InferredSchema) {
 
   Stack stack;
   push(stack, 2.0f, at::ones(5));
-  op->getOperation()(&stack);
+  op->getOperation()(stack);
   at::Tensor output;
   pop(stack, output);
 
@@ -61,7 +61,7 @@ TEST(CustomOperatorTest, ExplicitSchema) {
 
   Stack stack;
   push(stack, 2.0f, at::ones(5));
-  op->getOperation()(&stack);
+  op->getOperation()(stack);
   at::Tensor output;
   pop(stack, output);
 
@@ -109,7 +109,7 @@ TEST(CustomOperatorTest, ListParameters) {
       c10::List<c10::complex<double>>(
           {c10::complex<double>(2.4, -5.5), c10::complex<double>(-1.3, 2)}));
   push(stack, c10::List<at::Tensor>({at::ones(5)}));
-  op->getOperation()(&stack);
+  op->getOperation()(stack);
   c10::List<double> output;
   pop(stack, output);
 
@@ -140,7 +140,7 @@ TEST(CustomOperatorTest, ListParameters2) {
 
   Stack stack;
   push(stack, c10::List<at::Tensor>({at::ones(5)}));
-  op->getOperation()(&stack);
+  op->getOperation()(stack);
   c10::List<at::Tensor> output;
   pop(stack, output);
 
@@ -204,7 +204,7 @@ TEST(TestCustomOperator, OperatorGeneratorUndeclared) {
   torch::jit::RegisterOperators reg({OperatorGenerator(
       TORCH_SELECTIVE_NAME_IN_SCHEMA(
           op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"),
-      [](Stack* stack) {
+      [](Stack& stack) {
         // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
         double a;
         at::Tensor b;
@@ -223,7 +223,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
   torch::jit::RegisterOperators reg({OperatorGenerator(
       TORCH_SELECTIVE_NAME_IN_SCHEMA(
           op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"),
-      [](Stack* stack) {
+      [](Stack& stack) {
         // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
         double a;
         at::Tensor b;
@@ -249,7 +249,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
 
   Stack stack;
   push(stack, 2.0f, at::ones(5));
-  op->getOperation()(&stack);
+  op->getOperation()(stack);
   at::Tensor output;
   pop(stack, output);
 
diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp
index 9f8a732f550f4..305d36a476213 100644
--- a/test/cpp/jit/test_misc.cpp
+++ b/test/cpp/jit/test_misc.cpp
@@ -1493,11 +1493,11 @@ TEST(NoneSchemaMatchTest, Basic) {
   RegisterOperators reg({
       Operator(
           "prim::test_none() -> int?",
-          [](Stack* stack) { push(stack, IValue()); },
+          [](Stack& stack) { push(stack, IValue()); },
           aliasAnalysisFromSchema()),
       Operator(
           "prim::is_none(int? a) -> bool",
-          [](Stack* stack) {
+          [](Stack& stack) {
             IValue a = pop(stack);
             if (a.isNone()) {
               push(stack, true);
diff --git a/test/cpp/jit/test_schema_matching.cpp b/test/cpp/jit/test_schema_matching.cpp
index 31d332b718f53..c56d0bc28fe99 100644
--- a/test/cpp/jit/test_schema_matching.cpp
+++ b/test/cpp/jit/test_schema_matching.cpp
@@ -15,7 +15,7 @@ TEST(SchemaMatchingTest, VarType) {
   RegisterOperators reg({
       Operator(
           "aten::test_vartype(t[] a, t b) -> (t)",
-          [](Stack* stack) {
+          [](Stack& stack) {
             c10::List<double> list;
             // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
             double a;
@@ -54,7 +54,7 @@ TEST(SchemaMatchingTest, VarType2) {
   RegisterOperators reg({
       Operator(
           "aten::test_vartype2(t a, t[] b) -> (t[])",
-          [](Stack* stack) {
+          [](Stack& stack) {
             // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
             double a;
             c10::List<double> list;
diff --git a/test/cpp/jit/test_utils.cpp b/test/cpp/jit/test_utils.cpp
index f2fb9e1fb0606..8da101e99bbdf 100644
--- a/test/cpp/jit/test_utils.cpp
+++ b/test/cpp/jit/test_utils.cpp
@@ -273,7 +273,7 @@ RegisterOperators reg({
     // because it always produces empty Tensors.
     Operator(
         "prim::MakeTestTensor() -> Tensor",
-        [](Stack* stack) { push(stack, at::Tensor()); },
+        [](Stack& stack) { push(stack, at::Tensor()); },
         aliasAnalysisFromSchema()),
 });
 } // namespace
diff --git a/test/custom_operator/test_custom_ops.cpp b/test/custom_operator/test_custom_ops.cpp
index 7c6a187df1465..ec22568c5a3ea 100644
--- a/test/custom_operator/test_custom_ops.cpp
+++ b/test/custom_operator/test_custom_ops.cpp
@@ -30,7 +30,7 @@ Result get_operator_from_registry_and_execute(const char* op_name, Args&&... arg
 
   torch::jit::Stack stack;
   torch::jit::push(stack, std::forward<Args>(args)...);
-  op->getOperation()(&stack);
+  op->getOperation()(stack);
 
   TORCH_INTERNAL_ASSERT(1 == stack.size());
   return torch::jit::pop(stack).to<Result>();
diff --git a/torch/csrc/autograd/record_function_ops.cpp b/torch/csrc/autograd/record_function_ops.cpp
index 7e621f9e8b62e..9650c354c5868 100644
--- a/torch/csrc/autograd/record_function_ops.cpp
+++ b/torch/csrc/autograd/record_function_ops.cpp
@@ -79,7 +79,7 @@ c10::AliasAnalysisKind aliasAnalysisFromSchema() {
 jit::RegisterOperators reg_fut_ops({
     jit::Operator(
         "profiler::_call_end_callbacks_on_jit_fut(Tensor x, Future(t) y) -> Future(t)",
-        [](jit::Stack* stack) {
+        [](jit::Stack& stack) {
           // Pop inputs, which should be a future and a tensor
           auto fut = jit::pop(stack).toFuture();
           auto tensor = jit::pop(stack).toTensor();
diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp
index 07d5c61e0c53c..5eada8d573f2f 100644
--- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp
+++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp
@@ -582,7 +582,7 @@ c10::intrusive_ptr<JitFuture> RequestCallbackNoPython::runJitOperator(
     std::vector<c10::Stream> streams) const {
   c10::MultiStreamGuard guard(streams);
   try {
-    op.getOperation()(&stack);
+    op.getOperation()(stack);
   } catch (const std::exception&) {
     return asFuture(std::current_exception());
   }
diff --git a/torch/csrc/jit/codegen/cuda/interface.cpp b/torch/csrc/jit/codegen/cuda/interface.cpp
index 009ae21dad6d0..cf8f3787229ce 100644
--- a/torch/csrc/jit/codegen/cuda/interface.cpp
+++ b/torch/csrc/jit/codegen/cuda/interface.cpp
@@ -182,8 +182,8 @@ RegisterOperators reg_fusion({
     Operator(
         prim::CudaFusionGroup,
         [](const Node* node) -> Operation {
-          return [node](Stack* stack) {
-            fuser::cuda::runFusionGroup(node, *stack);
+          return [node](Stack& stack) {
+            fuser::cuda::runFusionGroup(node, stack);
           };
         },
         aliasAnalysisSpecialCase()),
@@ -196,7 +196,7 @@ RegisterOperators reg_guard({
         // if we would ever return refined tensor, which would change aliasing
         // analysis, we should update aliasdb pass.
         [](const Node* node) -> Operation {
-          return [node](Stack* stack) {
+          return [node](Stack& stack) {
             // TODO: check latency here!!!!
             std::vector<TypePtr> types = node->tys(attr::types);
             const auto num_inputs = types.size();
diff --git a/torch/csrc/jit/codegen/fuser/fallback.cpp b/torch/csrc/jit/codegen/fuser/fallback.cpp
index 59fe7e6f4fd25..60a5d72f3c439 100644
--- a/torch/csrc/jit/codegen/fuser/fallback.cpp
+++ b/torch/csrc/jit/codegen/fuser/fallback.cpp
@@ -26,7 +26,7 @@ RegisterOperators reg_fused_operators({Operator(
     [](const Node* node) -> Operation {
       int64_t dim = node->i(attr::dim);
       int64_t num_inputs = node->inputs().size();
-      return [dim, num_inputs](Stack* stack) {
+      return [dim, num_inputs](Stack& stack) {
         auto result = at::cat(
             fmap(
                 last(stack, num_inputs),
diff --git a/torch/csrc/jit/mobile/function.cpp b/torch/csrc/jit/mobile/function.cpp
index 0775a550d2a79..127bd5f9418d4 100644
--- a/torch/csrc/jit/mobile/function.cpp
+++ b/torch/csrc/jit/mobile/function.cpp
@@ -67,7 +67,7 @@ bool Function::append_operator(
   auto jit_op = findOperatorFor(opname);
   std::vector<c10::Argument> args;
   if (jit_op) {
-    fn = [jit_op](Stack& stack) { jit_op->getOperation()(&stack); };
+    fn = [jit_op](Stack& stack) { jit_op->getOperation()(stack); };
     args = jit_op->schema().arguments();
   } else {
     auto op = c10::Dispatcher::singleton().findSchema(opname_c10);
diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp
index 815a1bc0ea649..944e27805cf18 100644
--- a/torch/csrc/jit/passes/batch_mm.cpp
+++ b/torch/csrc/jit/passes/batch_mm.cpp
@@ -109,11 +109,11 @@ bool shape_is_fast_for_reduce(const at::Tensor& lhs, const at::Tensor& rhs) {
 
 RegisterOperators mm_tree_reduction_reg({Operator(
     "prim::MMTreeReduce(...) -> Tensor",
-    [](Stack* stack) {
+    [](Stack& stack) {
       auto num_inputs = pop(stack).toInt();
       std::vector<at::Tensor> inputs;
       inputs.reserve(num_inputs);
-      for (auto it = stack->end() - num_inputs; it != stack->end(); ++it) {
+      for (auto it = stack.end() - num_inputs; it != stack.end(); ++it) {
         inputs.push_back(std::move(*it).toTensor());
       }
       drop(stack, num_inputs);
@@ -320,11 +320,11 @@ RegisterOperators mm_batch_side_reg({Operator(
     [](const Node* node) -> Operation {
       size_t num_other_side_inputs = node->inputs().size() - 1;
       Side single_side = static_cast<Side>(node->i(Symbol::attr("side")));
-      return [num_other_side_inputs, single_side](Stack* stack) {
+      return [num_other_side_inputs, single_side](Stack& stack) {
         at::Tensor side_input;
         std::vector<at::Tensor> other_side_inputs;
         other_side_inputs.reserve(num_other_side_inputs);
-        for (auto it = stack->end() - num_other_side_inputs; it != stack->end();
+        for (auto it = stack.end() - num_other_side_inputs; it != stack.end();
              ++it) {
           other_side_inputs.push_back(std::move(*it).toTensor());
         }
@@ -343,18 +343,18 @@ RegisterOperators mm_batch_side_reg({Operator(
               mm_out,
               num_other_side_inputs,
               /*dim=*/single_side == Side::LHS ? 1 : 0);
-          stack->insert(
-              stack->end(),
+          stack.insert(
+              stack.end(),
               std::make_move_iterator(outputs.begin()),
               std::make_move_iterator(outputs.end()));
         } else {
           if (single_side == Side::LHS) {
             for (at::Tensor& other : other_side_inputs) {
-              stack->emplace_back(side_input.mm(other));
+              stack.emplace_back(side_input.mm(other));
             }
           } else {
             for (at::Tensor& other : other_side_inputs) {
-              stack->emplace_back(other.mm(side_input));
+              stack.emplace_back(other.mm(side_input));
             }
           }
         }
diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp
index a7f831abd88f6..3a28eaeba46e6 100644
--- a/torch/csrc/jit/passes/constant_propagation.cpp
+++ b/torch/csrc/jit/passes/constant_propagation.cpp
@@ -78,7 +78,7 @@ c10::optional<std::vector<IValue>> runNodeIfInputsAreConstant(
 
       try {
         auto op = n->getOperation();
-        op(&stack);
+        op(stack);
       } catch (...) {
         return c10::nullopt;
       }
diff --git a/torch/csrc/jit/passes/decompose_ops.cpp b/torch/csrc/jit/passes/decompose_ops.cpp
index 7f935a1c1cbd5..0706c9c14ae98 100644
--- a/torch/csrc/jit/passes/decompose_ops.cpp
+++ b/torch/csrc/jit/passes/decompose_ops.cpp
@@ -59,7 +59,7 @@ bool isDecomposableNorm(Node* normalize_op) {
 RegisterOperators reg_ops(
     {Operator(
          "aten::_ncf_unsqueeze(Tensor(a) self, int ndim) -> Tensor(a)",
-         [](Stack* stack) {
+         [](Stack& stack) {
            const int64_t ndim = pop(stack).toInt();
            auto self = pop(stack).toTensor();
            c10::SmallVector<int64_t, 8> sizes(ndim, 1);
@@ -70,7 +70,7 @@ RegisterOperators reg_ops(
          aliasAnalysisFromSchema()),
      Operator(
          "aten::_ncf_view(Tensor(a) self, int[] input_shape, int normalized_ndim) -> Tensor(a)",
-         [](Stack* stack) {
+         [](Stack& stack) {
            const int64_t normalized_ndim = pop(stack).toInt();
            auto input_shape = pop(stack).toIntList();
            auto self = pop(stack).toTensor();
diff --git a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp
index 6d218af06e34c..542e136280520 100644
--- a/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp
+++ b/torch/csrc/jit/passes/frozen_ops_to_mkldnn.cpp
@@ -235,7 +235,7 @@ void InplaceMKLDNNSubgraph(std::shared_ptr<Graph> graph) {
 Operation createUnaryOp(
     std::function<void(at::Tensor output, at::Tensor input)> aten_op,
     bool inplace = false) {
-  return [aten_op, inplace](Stack* stack) {
+  return [aten_op, inplace](Stack& stack) {
     auto a = pop(stack).toTensor();
     c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
     // we cast `a` to an `ideep::tensor`, so we can get at its descriptor
@@ -275,7 +275,7 @@ Operation createUnaryOp(
   };
 }
 
-void MKLDNNLayerNormOp(Stack* stack, bool inplace) {
+void MKLDNNLayerNormOp(Stack& stack, bool inplace) {
   c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
 
   // enable_cudnn not used
@@ -303,7 +303,7 @@ void MKLDNNLayerNormOp(Stack* stack, bool inplace) {
 };
 
 Operation BroadOp(const Node* node) {
-  return [](Stack* stack) {
+  return [](Stack& stack) {
     auto b = pop(stack).toTensor();
     auto a = pop(stack).toTensor();
     auto b_size = b.sizes();
@@ -471,17 +471,17 @@ const RegisterOperators BroadOpReg({
 const RegisterOperators MKLDNNLayerNormOpReg({
     torch::jit::Operator(
         "prim::MKLDNNLayerNorm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor",
-        [](Stack* stack) { MKLDNNLayerNormOp(stack, false); },
+        [](Stack& stack) { MKLDNNLayerNormOp(stack, false); },
         AliasAnalysisKind::FROM_SCHEMA),
     torch::jit::Operator(
         "prim::MKLDNNLayerNorm_(Tensor(a!) input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor(a!)",
-        [](Stack* stack) { MKLDNNLayerNormOp(stack, true); },
+        [](Stack& stack) { MKLDNNLayerNormOp(stack, true); },
         AliasAnalysisKind::FROM_SCHEMA),
 });
 
 Operation ConstantMKLDNNTensorOp(const Node* node) {
   const auto& t = node->t(attr::value);
-  return [t](Stack* stack) {
+  return [t](Stack& stack) {
     push(stack, t);
     return 0;
   };
@@ -509,7 +509,7 @@ jit::RegisterOperators reg_fut_ops({
         // XXX: this follows the schema convention of conv2d/conv3d, not
         // aten::mkldnn_convolution, which is different for some reason!
         "prim::mkldnn_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor",
-        [](jit::Stack* stack) {
+        [](jit::Stack& stack) {
           int64_t groups = pop(stack).toInt();
           auto dilation = pop(stack).toIntVector();
           auto padding = pop(stack).toIntVector();
@@ -558,7 +558,7 @@ jit::RegisterOperators reg_fut_ops({
     // in default bindings
     jit::Operator(
         "prim::MKLDNNScalarMul(Tensor self, Scalar other) -> Tensor",
-        [](jit::Stack* stack) {
+        [](jit::Stack& stack) {
           c10::impl::ExcludeDispatchKeyGuard edkg(
               c10::autograd_dispatch_keyset);
           float other = pop(stack).toScalar().toFloat();
@@ -576,7 +576,7 @@ jit::RegisterOperators reg_fut_ops({
         aliasAnalysisFromSchema()),
     jit::Operator(
         "prim::MKLDNNScalarMul_(Tensor(a!) self, Scalar other) -> Tensor(a!)",
-        [](jit::Stack* stack) {
+        [](jit::Stack& stack) {
           c10::impl::ExcludeDispatchKeyGuard edkg(
               c10::autograd_dispatch_keyset);
           float other = pop(stack).toScalar().toFloat();
diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp
index 47cd30b3d43ac..5e13829a8ce6d 100644
--- a/torch/csrc/jit/passes/shape_analysis.cpp
+++ b/torch/csrc/jit/passes/shape_analysis.cpp
@@ -410,7 +410,7 @@ class ShapePropagator {
     // is to uncover any mistakes we could make when editing this code,
     // and eventually it shouldn't matter, because this phase should be
     // preceded by schema checking.
-    op(&stack);
+    op(stack);
 
     AT_ASSERT(stack.size() == node->outputs().size());
     for (const auto i : c10::irange(stack.size())) {
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index 833c338578616..1d5128c7e71e2 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -1299,9 +1299,9 @@ void FuseTensorExprs(
 Operation createTensorExprOp(const Node* node) {
   auto kernel =
       std::make_shared<tensorexpr::TensorExprKernel>(node->g(attr::Subgraph));
-  return [kernel](Stack* stack) {
+  return [kernel](Stack& stack) {
     RECORD_FUNCTION("TensorExpr", std::vector<c10::IValue>());
-    kernel->run(*stack);
+    kernel->run(stack);
     return 0;
   };
 }
diff --git a/torch/csrc/jit/passes/utils/check_alias_annotation.cpp b/torch/csrc/jit/passes/utils/check_alias_annotation.cpp
index cd894b46ff69b..ae3a962509994 100644
--- a/torch/csrc/jit/passes/utils/check_alias_annotation.cpp
+++ b/torch/csrc/jit/passes/utils/check_alias_annotation.cpp
@@ -253,7 +253,7 @@ void checkAliasAnnotation(
   const auto inputsDeepCopy = deepCopy(stack);
 
   // Run the op
-  node->getOperation()(&stack);
+  node->getOperation()(stack);
 
   const auto outputs = std::move(stack);
 
diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h
index 0138231d3bc3f..eff1ddc243999 100644
--- a/torch/csrc/jit/python/pybind_utils.h
+++ b/torch/csrc/jit/python/pybind_utils.h
@@ -1151,7 +1151,7 @@ inline py::object invokeOperatorFromPython(
   Stack stack = std::get<1>(opWithStack);
   {
     pybind11::gil_scoped_release no_gil_guard;
-    found_op->getOperation()(&stack);
+    found_op->getOperation()(stack);
   }
 
   return createPyObjectForStack(std::move(stack));
diff --git a/torch/csrc/jit/python/python_interpreter.cpp b/torch/csrc/jit/python/python_interpreter.cpp
index 82a0d22c54fa2..29b7929fcd690 100644
--- a/torch/csrc/jit/python/python_interpreter.cpp
+++ b/torch/csrc/jit/python/python_interpreter.cpp
@@ -43,7 +43,7 @@ Operation createPythonOperation(const Node* op_) {
 
   AT_ASSERT(op->outputs().size() == 1);
 
-  return [=](Stack* stack) {
+  return [=](Stack& stack) {
     pybind11::gil_scoped_acquire gil;
     py::tuple py_inputs(op->cconv.size());
     size_t i = 0;
@@ -66,7 +66,7 @@ Operation createPythonOperation(const Node* op_) {
     drop(stack, num_inputs);
     try {
       py::object py_output(func(*py_inputs));
-      stack->push_back(returnToIValue(op->output()->type(), py_output));
+      stack.push_back(returnToIValue(op->output()->type(), py_output));
     } catch (py::error_already_set& e) {
       throw std::runtime_error(e.what());
     }
diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp
index 476882650a1dd..0187988680e80 100644
--- a/torch/csrc/jit/runtime/graph_executor.cpp
+++ b/torch/csrc/jit/runtime/graph_executor.cpp
@@ -377,7 +377,7 @@ struct DifferentiableGraphOp {
         num_outputs(this->grad.f->outputs().size()) {}
 
   // XXX: keep in mind that stack can be larger than the inputs we need!
-  void operator()(Stack* stack) const {
+  void operator()(Stack& stack) const {
     auto grad_fn = std::make_shared<DifferentiableGraphBackward>(
         grad_executor,
         grad.df_input_vjps.size(),
@@ -394,13 +394,13 @@ struct DifferentiableGraphOp {
       captureInputs(*grad_fn, inputs);
     }
 
-    detachVariables(*stack);
+    detachVariables(stack);
     if (IsNewExecutorEnabled()) {
       ExecutionPlan plan =
-          f_ptr->getPlanFor(*stack, GraphExecutor::getDefaultNumBailOuts());
-      InterpreterState(plan.code).run(*stack);
+          f_ptr->getPlanFor(stack, GraphExecutor::getDefaultNumBailOuts());
+      InterpreterState(plan.code).run(stack);
     } else {
-      InterpreterState(legacy_f).run(*stack);
+      InterpreterState(legacy_f).run(stack);
     }
 
     {
@@ -419,7 +419,7 @@ struct DifferentiableGraphOp {
       // drop the temporary outputs so that we return the same number of
       // outputs as if we were not also calculating gradient
       const size_t num_temporary_outputs = num_outputs - grad.f_real_outputs;
-      stack->erase(stack->end() - num_temporary_outputs, stack->end());
+      stack.erase(stack.end() - num_temporary_outputs, stack.end());
     }
   }
 
diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp
index be2019e532f98..70c9c6c653326 100644
--- a/torch/csrc/jit/runtime/interpreter.cpp
+++ b/torch/csrc/jit/runtime/interpreter.cpp
@@ -297,13 +297,13 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target {
           }
           case INST(OP): {
             INST_GUARD;
-            frame.function->operator_table_[inst.X](&stack);
+            frame.function->operator_table_[inst.X](stack);
           }
             INST_NEXT;
           case INST(OPN): {
             INST_GUARD;
             stack.push_back(inst.N);
-            frame.function->operator_table_[inst.X](&stack);
+            frame.function->operator_table_[inst.X](stack);
           }
             INST_NEXT;
           case INST(LOAD): {
diff --git a/torch/csrc/jit/runtime/register_c10_ops.cpp b/torch/csrc/jit/runtime/register_c10_ops.cpp
index 993d41194e84b..4d541ec46bbbf 100644
--- a/torch/csrc/jit/runtime/register_c10_ops.cpp
+++ b/torch/csrc/jit/runtime/register_c10_ops.cpp
@@ -12,7 +12,7 @@ namespace jit {
 namespace {
 
 Operator createOperatorFromC10(const c10::OperatorHandle& op) {
-  return Operator(op, [op](Stack* stack) { op.callBoxed(stack); });
+  return Operator(op, [op](Stack& stack) { op.callBoxed(stack); });
 }
 
 class RegistrationListener final : public c10::OpRegistrationListener {
diff --git a/torch/csrc/jit/runtime/register_cuda_ops.cpp b/torch/csrc/jit/runtime/register_cuda_ops.cpp
index f7a989d7acef9..599fd5398c370 100644
--- a/torch/csrc/jit/runtime/register_cuda_ops.cpp
+++ b/torch/csrc/jit/runtime/register_cuda_ops.cpp
@@ -38,7 +38,7 @@ void _device_synchronize(int64_t device_index) {
 RegisterOperators const reg({
     Operator(
         "cuda::current_stream.device(Device? device) -> __torch__.torch.classes.cuda.Stream",
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto device = pop(stack).toOptional<c10::Device>();
           c10::DeviceIndex device_index = device.has_value()
               ? device->index()
@@ -50,7 +50,7 @@ RegisterOperators const reg({
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::current_stream.int(int? val) -> __torch__.torch.classes.cuda.Stream",
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto idx = pop(stack).toOptional<int64_t>();
           c10::DeviceIndex device_index = idx.has_value()
               ? static_cast<c10::DeviceIndex>(idx.value())
@@ -62,7 +62,7 @@ RegisterOperators const reg({
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::default_stream.device(Device? device) -> __torch__.torch.classes.cuda.Stream",
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto device = pop(stack).toOptional<c10::Device>();
           c10::DeviceIndex device_index = device.has_value()
               ? device->index()
@@ -74,7 +74,7 @@ RegisterOperators const reg({
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::default_stream.int(int? val) -> __torch__.torch.classes.cuda.Stream",
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto idx = pop(stack).toOptional<int64_t>();
           c10::DeviceIndex device_index = idx.has_value()
               ? static_cast<c10::DeviceIndex>(idx.value())
@@ -86,14 +86,14 @@ RegisterOperators const reg({
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::_current_device() -> int",
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto v = c10::cuda::current_device();
           push(stack, static_cast<int>(v));
         },
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::_set_device(int64_t val) -> ()",
-        [](Stack* stack) {
+        [](Stack& stack) {
           int64_t idx = -1;
           pop(stack, idx);
           c10::cuda::set_device(static_cast<c10::DeviceIndex>(idx));
@@ -101,7 +101,7 @@ RegisterOperators const reg({
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::device_index(Device device) -> int",
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto device = pop(stack);
           auto idx = device.toDevice().index();
           push(stack, idx);
@@ -109,11 +109,11 @@ RegisterOperators const reg({
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::device_count() -> int",
-        [](Stack* stack) { push(stack, at::cuda::device_count()); },
+        [](Stack& stack) { push(stack, at::cuda::device_count()); },
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::set_stream(__torch__.torch.classes.cuda.Stream stream) -> ()",
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto v = pop(stack);
           auto s = v.toCustomClass<torch::jit::CUDAStream>();
           auto stream_device_idx = static_cast<int64_t>(s->device_index());
@@ -141,11 +141,11 @@ RegisterOperators const reg({
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::synchronize() -> ()",
-        [](Stack* stack) { c10::cuda::device_synchronize(); },
+        [](Stack& stack) { c10::cuda::device_synchronize(); },
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::synchronize.device(Device? device) -> ()",
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto device = pop(stack).toOptional<c10::Device>();
           c10::DeviceIndex device_index = device.has_value()
               ? device->index()
@@ -155,7 +155,7 @@ RegisterOperators const reg({
         aliasAnalysisFromSchema()),
     Operator(
         "cuda::synchronize.int(int? val) -> ()",
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto idx = pop(stack).toOptional<int64_t>();
           c10::DeviceIndex device_index = idx.has_value()
               ? static_cast<c10::DeviceIndex>(idx.value())
diff --git a/torch/csrc/jit/runtime/register_distributed_ops.cpp b/torch/csrc/jit/runtime/register_distributed_ops.cpp
index 2c8277d106f3c..edf7a0ccff23a 100644
--- a/torch/csrc/jit/runtime/register_distributed_ops.cpp
+++ b/torch/csrc/jit/runtime/register_distributed_ops.cpp
@@ -29,11 +29,11 @@ static auto workerInfo =
 
 // prepare the rpc input arguments and call the C++ impls
 void prepare_and_call_rpc_op(
-    Stack* stack,
+    Stack& stack,
     int num_inputs,
     const std::string& rpc_op) {
   // Get inputs from the stack.
-  auto stackIter = stack->end() - num_inputs;
+  auto stackIter = stack.end() - num_inputs;
   auto& dstWorkerIValue = *stackIter++;
   auto& qualifiedNameIValue = *stackIter++;
   IValue emptyTuple(c10::ivalue::Tuple::create({}));
@@ -137,7 +137,7 @@ void prepare_and_call_rpc_op(
         rpcTimeout);
     // Push output to the stack.
     drop(stack, num_inputs);
-    stack->emplace_back(std::move(futureIValuePtr));
+    stack.emplace_back(std::move(futureIValuePtr));
   } else if (rpc_op == "rpc_sync") {
     // Send RPC request.
     auto futureIValuePtr = dist_rpc::rpcTorchscript(
@@ -154,7 +154,7 @@ void prepare_and_call_rpc_op(
       auto res = futureIValuePtr->value();
       // Push output to the stack.
       drop(stack, num_inputs);
-      stack->emplace_back(std::move(res));
+      stack.emplace_back(std::move(res));
     }
   } else if (rpc_op == "rpc_remote") {
     auto rrefPtr = dist_rpc::remoteTorchscript(
@@ -165,7 +165,7 @@ void prepare_and_call_rpc_op(
         rpcTimeout);
     // Push output to the stack.
     drop(stack, num_inputs);
-    stack->emplace_back(
+    stack.emplace_back(
         c10::static_intrusive_pointer_cast<c10::RRefInterface>(rrefPtr));
   } else {
     throw std::runtime_error(
@@ -178,7 +178,7 @@ RegisterOperators reg_rpc_ops(
          fmt::format(
              "aten::to_here(RRef(t) self, float timeout = {}) -> t(*)",
              torch::distributed::rpc::kDefaultRpcTimeoutSeconds),
-         [](Stack* stack) {
+         [](Stack& stack) {
            auto timeout = pop(stack).toDouble();
            auto rref = pop(stack).toRRef();
            IValue res;
@@ -195,7 +195,7 @@ RegisterOperators reg_rpc_ops(
          aliasAnalysisFromSchema()),
      Operator(
          "aten::local_value(RRef(t) self) -> t(*)",
-         [](Stack* stack) {
+         [](Stack& stack) {
            auto rref = pop(stack).toRRef();
            TORCH_CHECK(
                rref->isOwner(),
@@ -208,14 +208,14 @@ RegisterOperators reg_rpc_ops(
          aliasAnalysisFromSchema()),
      Operator(
          "aten::is_owner(RRef(t) self) -> bool",
-         [](Stack* stack) {
+         [](Stack& stack) {
            auto rref = pop(stack).toRRef();
            push(stack, rref->isOwner());
          },
          aliasAnalysisFromSchema()),
      Operator(
          "aten::owner(RRef(t) self) -> __torch__.torch.classes.dist_rpc.WorkerInfo",
-         [](Stack* stack) {
+         [](Stack& stack) {
            auto rref = pop(stack).toRRef();
            push(
                stack,
@@ -225,21 +225,21 @@ RegisterOperators reg_rpc_ops(
          aliasAnalysisFromSchema()),
      Operator(
          "aten::owner_name(RRef(t) self) -> str",
-         [](Stack* stack) {
+         [](Stack& stack) {
            auto rref = pop(stack).toRRef();
            push(stack, rref->ownerName());
          },
          aliasAnalysisFromSchema()),
      Operator(
          "aten::confirmed_by_owner(RRef(t) self) -> bool",
-         [](Stack* stack) {
+         [](Stack& stack) {
            auto rref = pop(stack).toRRef();
            push(stack, rref->confirmedByOwner());
          },
          aliasAnalysisFromSchema()),
      Operator(
          "aten::dist_backward(int context_id, Tensor[] roots, bool retain_graph=False) -> ()",
-         [](Stack* stack) {
+         [](Stack& stack) {
            bool retain_graph = pop(stack).toBool();
            auto roots_list = pop(stack).toTensorList();
            int64_t context_id = pop(stack).toInt();
@@ -252,7 +252,7 @@ RegisterOperators reg_rpc_ops(
          prim::rpc_sync,
          [](const Node* node) -> Operation {
            int num_inputs = node->inputs().size();
-           return [num_inputs](Stack* stack) {
+           return [num_inputs](Stack& stack) {
              prepare_and_call_rpc_op(stack, num_inputs, "rpc_sync");
            };
          },
@@ -261,7 +261,7 @@ RegisterOperators reg_rpc_ops(
          prim::rpc_remote,
          [](const Node* node) -> Operation {
            int num_inputs = node->inputs().size();
-           return [num_inputs](Stack* stack) {
+           return [num_inputs](Stack& stack) {
              prepare_and_call_rpc_op(stack, num_inputs, "rpc_remote");
            };
          },
@@ -270,7 +270,7 @@ RegisterOperators reg_rpc_ops(
          prim::rpc_async,
          [](const Node* node) -> Operation {
            int num_inputs = node->inputs().size();
-           return [num_inputs](Stack* stack) {
+           return [num_inputs](Stack& stack) {
              prepare_and_call_rpc_op(stack, num_inputs, "rpc_async");
            };
          },
diff --git a/torch/csrc/jit/runtime/register_ops_utils.cpp b/torch/csrc/jit/runtime/register_ops_utils.cpp
index 91ff2c738a1bf..64bb3abc57584 100644
--- a/torch/csrc/jit/runtime/register_ops_utils.cpp
+++ b/torch/csrc/jit/runtime/register_ops_utils.cpp
@@ -13,7 +13,7 @@ c10::impl::GenericList make_result_list<IValue>(const TypePtr& elemType) {
 }
 
 template <>
-void listIndex<at::Tensor>(Stack* stack) {
+void listIndex<at::Tensor>(Stack& stack) {
   at::Tensor elem = pop(stack).to<at::Tensor>();
   c10::List<at::Tensor> list = pop(stack).to<c10::List<at::Tensor>>();
 
@@ -31,7 +31,7 @@ void listIndex<at::Tensor>(Stack* stack) {
 }
 
 template <>
-void listCount<at::Tensor>(Stack* stack) {
+void listCount<at::Tensor>(Stack& stack) {
   at::Tensor elem = pop(stack).to<at::Tensor>();
   c10::List<at::Tensor> list = pop(stack).to<c10::List<at::Tensor>>();
 
@@ -44,21 +44,21 @@ void listCount<at::Tensor>(Stack* stack) {
 }
 
 template <>
-void listEq<at::Tensor>(Stack* stack) {
+void listEq<at::Tensor>(Stack& stack) {
   c10::List<at::Tensor> b = pop(stack).to<c10::List<at::Tensor>>();
   c10::List<at::Tensor> a = pop(stack).to<c10::List<at::Tensor>>();
   push(stack, tensor_list_equal(a, b));
 }
 
 template <>
-void listNe<at::Tensor>(Stack* stack) {
+void listNe<at::Tensor>(Stack& stack) {
   c10::List<at::Tensor> b = pop(stack).to<c10::List<at::Tensor>>();
   c10::List<at::Tensor> a = pop(stack).to<c10::List<at::Tensor>>();
   push(stack, !tensor_list_equal(a, b));
 }
 
 template <>
-void listSort<at::Tensor>(Stack* stack) {
+void listSort<at::Tensor>(Stack& stack) {
   bool reverse = pop(stack).toBool();
   c10::List<at::Tensor> list = pop(stack).toTensorList();
   std::sort(
@@ -74,7 +74,7 @@ void listSort<at::Tensor>(Stack* stack) {
 }
 
 template <>
-void listCopyAndSort<at::Tensor>(Stack* stack) {
+void listCopyAndSort<at::Tensor>(Stack& stack) {
   c10::List<at::Tensor> list = pop(stack).toTensorList();
   auto list_copied = list.copy();
   std::sort(
@@ -87,7 +87,7 @@ void listCopyAndSort<at::Tensor>(Stack* stack) {
 }
 
 template <>
-void listRemove<at::Tensor>(Stack* stack) {
+void listRemove<at::Tensor>(Stack& stack) {
   at::Tensor elem = pop(stack).to<at::Tensor>();
   c10::List<at::Tensor> list = pop(stack).to<c10::List<at::Tensor>>();
 
@@ -268,7 +268,7 @@ int64_t normalizeIndex(int64_t idx, int64_t list_size) {
   return idx;
 }
 
-void listAppend(Stack* stack) {
+void listAppend(Stack& stack) {
   IValue el = pop(stack).to<IValue>();
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
 
@@ -276,13 +276,13 @@ void listAppend(Stack* stack) {
   push(stack, std::move(list));
 }
 
-void listReverse(Stack* stack) {
+void listReverse(Stack& stack) {
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
 
   std::reverse(list.begin(), list.end());
 }
 
-void listPopImpl(Stack* stack, const char* empty_message) {
+void listPopImpl(Stack& stack, const char* empty_message) {
   int64_t idx = pop(stack).to<int64_t>();
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
 
@@ -297,22 +297,22 @@ void listPopImpl(Stack* stack, const char* empty_message) {
   list.erase(list.begin() + normalized_idx);
 }
 
-void listPop(Stack* stack) {
+void listPop(Stack& stack) {
   return listPopImpl(stack, "pop from empty list");
 }
 
-void listClear(Stack* stack) {
+void listClear(Stack& stack) {
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
 
   list.clear();
 }
 
-void listDelete(Stack* stack) {
+void listDelete(Stack& stack) {
   listPopImpl(stack, "pop index out of range");
   pop(stack);
 }
 
-void listInsert(Stack* stack) {
+void listInsert(Stack& stack) {
   IValue elem = pop(stack).to<IValue>();
   int64_t idx = pop(stack).to<int64_t>();
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
@@ -331,7 +331,7 @@ void listInsert(Stack* stack) {
   }
 }
 
-void listExtend(Stack* stack) {
+void listExtend(Stack& stack) {
   c10::List<IValue> b = pop(stack).to<c10::List<IValue>>();
   c10::List<IValue> a = pop(stack).to<c10::List<IValue>>();
 
@@ -341,12 +341,12 @@ void listExtend(Stack* stack) {
   }
 }
 
-void listCopy(Stack* stack) {
+void listCopy(Stack& stack) {
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
   push(stack, list.copy());
 }
 
-void listSelect(Stack* stack) {
+void listSelect(Stack& stack) {
   int64_t idx = pop(stack).to<int64_t>();
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
 
@@ -354,19 +354,19 @@ void listSelect(Stack* stack) {
   push(stack, std::move(element));
 }
 
-void listLen(Stack* stack) {
+void listLen(Stack& stack) {
   c10::List<IValue> a = pop(stack).to<c10::List<IValue>>();
 
   const int64_t size = a.size();
   push(stack, size);
 }
 
-void listList(Stack* stack) {
+void listList(Stack& stack) {
   c10::List<IValue> a = pop(stack).to<c10::List<IValue>>();
   push(stack, a.copy());
 }
 
-void listAdd(Stack* stack) {
+void listAdd(Stack& stack) {
   c10::List<IValue> b = pop(stack).to<c10::List<IValue>>();
   c10::List<IValue> a = pop(stack).to<c10::List<IValue>>();
 
@@ -383,14 +383,14 @@ void listAdd(Stack* stack) {
   push(stack, std::move(ret));
 }
 
-void listInplaceAdd(Stack* stack) {
+void listInplaceAdd(Stack& stack) {
   c10::List<IValue> b = pop(stack).to<List<IValue>>();
   c10::List<IValue> a = pop(stack).to<List<IValue>>();
   a.append(std::move(b));
   push(stack, std::move(a));
 }
 
-void listMulIntLeftInPlace(Stack* stack) {
+void listMulIntLeftInPlace(Stack& stack) {
   int64_t n = pop(stack).to<int64_t>();
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
   if (n <= 0) {
@@ -408,7 +408,7 @@ void listMulIntLeftInPlace(Stack* stack) {
   push(stack, std::move(list));
 }
 
-void listMulIntLeft(Stack* stack) {
+void listMulIntLeft(Stack& stack) {
   int64_t n = pop(stack).to<int64_t>();
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
 
@@ -426,7 +426,7 @@ void listMulIntLeft(Stack* stack) {
   push(stack, std::move(ret));
 }
 
-void listMulIntRight(Stack* stack) {
+void listMulIntRight(Stack& stack) {
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
   int64_t n = pop(stack).to<int64_t>();
 
@@ -444,7 +444,7 @@ void listMulIntRight(Stack* stack) {
   push(stack, std::move(ret));
 }
 
-void listSlice(Stack* stack) {
+void listSlice(Stack& stack) {
   auto step_val = pop(stack);
   auto end_val = pop(stack);
   auto start_val = pop(stack);
@@ -477,7 +477,7 @@ void listSlice(Stack* stack) {
   push(stack, std::move(sliced_list));
 }
 
-void listSetItem(Stack* stack) {
+void listSetItem(Stack& stack) {
   IValue value = pop(stack).to<IValue>();
   int64_t idx = pop(stack).to<int64_t>();
   c10::List<IValue> list = pop(stack).to<c10::List<IValue>>();
diff --git a/torch/csrc/jit/runtime/register_ops_utils.h b/torch/csrc/jit/runtime/register_ops_utils.h
index 5d00872d9ca7e..a4efb67943569 100644
--- a/torch/csrc/jit/runtime/register_ops_utils.h
+++ b/torch/csrc/jit/runtime/register_ops_utils.h
@@ -55,7 +55,7 @@ c10::List<T> make_result_list(const TypePtr& elemType) {
 template <>
 c10::impl::GenericList make_result_list<IValue>(const TypePtr& elemType);
 
-inline void noop(Stack* n) {}
+inline void noop(Stack& n) {}
 
 // As described in https://docs.python.org/3/library/functions.html#round
 // When a number is exactly halfway between two integers, python builtin round
@@ -181,12 +181,12 @@ void setItem(const c10::List<T>& list, int64_t idx, T&& value) {
   list.set(normalized_idx, std::forward<T>(value));
 }
 
-void listAppend(Stack* stack);
+void listAppend(Stack& stack);
 
-void listReverse(Stack* stack);
+void listReverse(Stack& stack);
 
 template <typename T>
-void minList(Stack* stack) {
+void minList(Stack& stack) {
   c10::List<T> a = pop(stack).to<c10::List<T>>();
   c10::List<T> b = pop(stack).to<c10::List<T>>();
 
@@ -204,7 +204,7 @@ void minList(Stack* stack) {
 }
 
 template <typename T>
-void maxList(Stack* stack) {
+void maxList(Stack& stack) {
   c10::List<T> a = pop(stack).to<c10::List<T>>();
   c10::List<T> b = pop(stack).to<c10::List<T>>();
 
@@ -221,18 +221,18 @@ void maxList(Stack* stack) {
   push(stack, b.size() > a.size() ? b : a);
 }
 
-void listPopImpl(Stack* stack, const char* empty_message);
+void listPopImpl(Stack& stack, const char* empty_message);
 
-void listPop(Stack* stack);
+void listPop(Stack& stack);
 
-void listClear(Stack* stack);
+void listClear(Stack& stack);
 
-void listDelete(Stack* stack);
+void listDelete(Stack& stack);
 
-void listInsert(Stack* stack);
+void listInsert(Stack& stack);
 
 template <typename T>
-void listRemove(Stack* stack) {
+void listRemove(Stack& stack) {
   T elem = pop(stack).to<T>();
   c10::List<T> list = pop(stack).to<c10::List<T>>();
 
@@ -246,7 +246,7 @@ void listRemove(Stack* stack) {
 }
 
 template <typename T>
-void listMin(Stack* stack) {
+void listMin(Stack& stack) {
   c10::List<T> list = pop(stack).to<c10::List<T>>();
   size_t list_size = list.size();
   if (list_size == 0) {
@@ -259,11 +259,11 @@ void listMin(Stack* stack) {
     min_elem = elem < min_elem ? elem : min_elem;
   }
 
-  stack->push_back(min_elem);
+  stack.push_back(min_elem);
 }
 
 template <typename T>
-void listMax(Stack* stack) {
+void listMax(Stack& stack) {
   c10::List<T> list = pop(stack).to<c10::List<T>>();
   size_t list_size = list.size();
   if (list_size == 0) {
@@ -276,14 +276,14 @@ void listMax(Stack* stack) {
     max_elem = elem > max_elem ? elem : max_elem;
   }
 
-  stack->push_back(max_elem);
+  stack.push_back(max_elem);
 }
 
 template <>
-void listRemove<at::Tensor>(Stack* stack);
+void listRemove<at::Tensor>(Stack& stack);
 
 template <typename T>
-void listIndex(Stack* stack) {
+void listIndex(Stack& stack) {
   T elem = pop(stack).to<T>();
   c10::List<T> list = pop(stack).to<c10::List<T>>();
 
@@ -297,10 +297,10 @@ void listIndex(Stack* stack) {
 }
 
 template <>
-void listIndex<at::Tensor>(Stack* stack);
+void listIndex<at::Tensor>(Stack& stack);
 
 template <typename T>
-void listCount(Stack* stack) {
+void listCount(Stack& stack) {
   T elem = pop(stack).to<T>();
   c10::List<T> list = pop(stack).to<c10::List<T>>();
 
@@ -309,25 +309,25 @@ void listCount(Stack* stack) {
 }
 
 template <>
-void listCount<at::Tensor>(Stack* stack);
+void listCount<at::Tensor>(Stack& stack);
 
-void listExtend(Stack* stack);
+void listExtend(Stack& stack);
 
-void listCopy(Stack* stack);
+void listCopy(Stack& stack);
 
-void listSelect(Stack* stack);
+void listSelect(Stack& stack);
 
-void listLen(Stack* stack);
+void listLen(Stack& stack);
 
 template <typename T>
-void listEq(Stack* stack) {
+void listEq(Stack& stack) {
   c10::List<T> b = pop(stack).to<c10::List<T>>();
   c10::List<T> a = pop(stack).to<c10::List<T>>();
   push(stack, a == b);
 }
 
 template <typename T>
-void listNe(Stack* stack) {
+void listNe(Stack& stack) {
   c10::List<T> b = pop(stack).to<c10::List<T>>();
   c10::List<T> a = pop(stack).to<c10::List<T>>();
   push(stack, a != b);
@@ -357,16 +357,16 @@ inline bool tensor_list_equal(
 
 // Specialization for at::Tensor, since it doesn't define operator==
 template <>
-void listEq<at::Tensor>(Stack* stack);
+void listEq<at::Tensor>(Stack& stack);
 
 // Specialization for at::Tensor, since it doesn't define operator==
 template <>
-void listNe<at::Tensor>(Stack* stack);
+void listNe<at::Tensor>(Stack& stack);
 
-void listList(Stack* stack);
+void listList(Stack& stack);
 
 template <typename T>
-void listContains(Stack* stack) {
+void listContains(Stack& stack) {
   auto key = pop(stack).to<T>();
   auto list = pop(stack).to<c10::List<T>>();
   // NOLINTNEXTLINE(performance-implicit-conversion-in-loop)
@@ -379,20 +379,20 @@ void listContains(Stack* stack) {
   push(stack, false);
 }
 
-void listAdd(Stack* stack);
+void listAdd(Stack& stack);
 
-void listInplaceAdd(Stack* stack);
+void listInplaceAdd(Stack& stack);
 
-void listMulIntLeftInPlace(Stack* stack);
+void listMulIntLeftInPlace(Stack& stack);
 
-void listMulIntLeft(Stack* stack);
+void listMulIntLeft(Stack& stack);
 
-void listMulIntRight(Stack* stack);
+void listMulIntRight(Stack& stack);
 
-void listSlice(Stack* stack);
+void listSlice(Stack& stack);
 
 template <typename T>
-void listSort(Stack* stack) {
+void listSort(Stack& stack) {
   bool reverse = pop(stack).toBool();
   c10::List<T> list = pop(stack).to<c10::List<T>>();
   std::sort(list.begin(), list.end(), [reverse](const T& a, const T& b) {
@@ -408,10 +408,10 @@ void listSort(Stack* stack) {
 
 // Specialization for at::Tensor
 template <>
-void listSort<at::Tensor>(Stack* stack);
+void listSort<at::Tensor>(Stack& stack);
 
 template <typename T>
-void listCopyAndSort(Stack* stack) {
+void listCopyAndSort(Stack& stack) {
   c10::List<T> list = pop(stack).to<c10::List<T>>();
   auto list_copied = list.copy();
   std::sort(list_copied.begin(), list_copied.end(), [](const T& a, const T& b) {
@@ -426,22 +426,22 @@ void listCopyAndSort(Stack* stack) {
 
 // Specialization for at::Tensor
 template <>
-void listCopyAndSort<at::Tensor>(Stack* stack);
+void listCopyAndSort<at::Tensor>(Stack& stack);
 
-void listSetItem(Stack* stack);
+void listSetItem(Stack& stack);
 
 struct OperatorGeneratorArgs {
   const char* schema_str;
   bool isOperationCreator;
   union {
-    void (*operation)(Stack*);
+    void (*operation)(Stack&);
     OperationCreator operationCreator;
   };
   AliasAnalysisKind aliasAnalysis;
 
   explicit constexpr OperatorGeneratorArgs(
       torch::detail::SelectiveStr<true> schema_str,
-      void (*op)(Stack*),
+      void (*op)(Stack&),
       AliasAnalysisKind aa)
       : schema_str(schema_str),
         isOperationCreator(false),
@@ -472,7 +472,7 @@ struct OperatorGeneratorArgs {
   OperatorGeneratorArgs(                                                      \
       TORCH_SELECTIVE_SCHEMA(#aten_op                                         \
                              ".int_int(int a, int b) -> " #int_float_result), \
-      [](Stack* stack) {                                                      \
+      [](Stack& stack) {                                                      \
         int64_t a, b;                                                         \
         pop(stack, a, b);                                                     \
         push(stack, op);                                                      \
@@ -482,7 +482,7 @@ struct OperatorGeneratorArgs {
           TORCH_SELECTIVE_SCHEMA(                                             \
               #aten_op                                                        \
               ".float_float(float a, float b) -> " #int_float_result),        \
-          [](Stack* stack) {                                                  \
+          [](Stack& stack) {                                                  \
             double a, b;                                                      \
             pop(stack, a, b);                                                 \
             push(stack, op);                                                  \
@@ -492,7 +492,7 @@ struct OperatorGeneratorArgs {
           TORCH_SELECTIVE_SCHEMA(                                             \
               #aten_op                                                        \
               ".complex_complex(complex a, complex b) -> " #complex_result),  \
-          [](Stack* stack) {                                                  \
+          [](Stack& stack) {                                                  \
             c10::complex<double> a, b;                                        \
             pop(stack, a, b);                                                 \
             push(stack, op);                                                  \
@@ -503,7 +503,7 @@ struct OperatorGeneratorArgs {
 #define DEFINE_GENERIC_OP(aten_op, int_op, float_op, int_result, float_result) \
   OperatorGeneratorArgs(                                                       \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> " #int_result),   \
-      [](Stack* stack) {                                                       \
+      [](Stack& stack) {                                                       \
         int64_t a, b;                                                          \
         pop(stack, a, b);                                                      \
         push(stack, int_op);                                                   \
@@ -512,7 +512,7 @@ struct OperatorGeneratorArgs {
       OperatorGeneratorArgs(                                                   \
           TORCH_SELECTIVE_SCHEMA(                                              \
               #aten_op ".float(float a, float b) -> " #float_result),          \
-          [](Stack* stack) {                                                   \
+          [](Stack& stack) {                                                   \
             double a, b;                                                       \
             pop(stack, a, b);                                                  \
             push(stack, float_op);                                             \
@@ -523,7 +523,7 @@ struct OperatorGeneratorArgs {
   OperatorGeneratorArgs(                                                    \
       TORCH_SELECTIVE_SCHEMA(#aten_op                                       \
                              ".int_float(int a, float b) -> " #result),     \
-      [](Stack* stack) {                                                    \
+      [](Stack& stack) {                                                    \
         int64_t a;                                                          \
         double b;                                                           \
         pop(stack, a, b);                                                   \
@@ -533,7 +533,7 @@ struct OperatorGeneratorArgs {
       OperatorGeneratorArgs(                                                \
           TORCH_SELECTIVE_SCHEMA(#aten_op                                   \
                                  ".float_int(float a, int b) -> " #result), \
-          [](Stack* stack) {                                                \
+          [](Stack& stack) {                                                \
             double a;                                                       \
             int64_t b;                                                      \
             pop(stack, a, b);                                               \
@@ -544,7 +544,7 @@ struct OperatorGeneratorArgs {
 #define DEFINE_INT_OP(aten_op, op)                                  \
   OperatorGeneratorArgs(                                            \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> int"), \
-      [](Stack* stack) {                                            \
+      [](Stack& stack) {                                            \
         int64_t a, b;                                               \
         pop(stack, a, b);                                           \
         push(stack, op); /* NOLINT(hicpp-signed-bitwise) */         \
@@ -554,7 +554,7 @@ struct OperatorGeneratorArgs {
 #define DEFINE_STR_CMP_OP(aten_op, op)                               \
   OperatorGeneratorArgs(                                             \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".str(str a, str b) -> bool"), \
-      [](Stack* stack) {                                             \
+      [](Stack& stack) {                                             \
         auto b = pop(stack).toStringRef();                           \
         auto a = pop(stack).toStringRef();                           \
         push(stack, op);                                             \
@@ -570,7 +570,7 @@ struct OperatorGeneratorArgs {
   OperatorGeneratorArgs(                                          \
       TORCH_SELECTIVE_SCHEMA(#aten_op string_val                  \
                              "(Scalar a, Scalar b) -> " #result), \
-      [](Stack* stack) {                                          \
+      [](Stack& stack) {                                          \
         IValue x, y;                                              \
         pop(stack, x, y);                                         \
         if (x.isDouble()) {                                       \
@@ -625,7 +625,7 @@ struct OperatorGeneratorArgs {
 #define DEFINE_UNARY_INT_OP(aten_op, op, result)                  \
   OperatorGeneratorArgs(                                          \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a) -> " #result), \
-      [](Stack* stack) {                                          \
+      [](Stack& stack) {                                          \
         int64_t a;                                                \
         pop(stack, a);                                            \
         push(stack, op);                                          \
@@ -635,7 +635,7 @@ struct OperatorGeneratorArgs {
 #define DEFINE_UNARY_FLOAT_OP(aten_op, op, result)                    \
   OperatorGeneratorArgs(                                              \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".float(float a) -> " #result), \
-      [](Stack* stack) {                                              \
+      [](Stack& stack) {                                              \
         double a;                                                     \
         pop(stack, a);                                                \
         push(stack, op);                                              \
@@ -647,7 +647,7 @@ struct OperatorGeneratorArgs {
       DEFINE_UNARY_FLOAT_OP(aten_op, op, float_result),                   \
       OperatorGeneratorArgs(                                              \
           TORCH_SELECTIVE_SCHEMA(#aten_op ".Scalar(Scalar a) -> Scalar"), \
-          [](Stack* stack) {                                              \
+          [](Stack& stack) {                                              \
             IValue x;                                                     \
             pop(stack, x);                                                \
             if (x.isDouble()) {                                           \
@@ -662,7 +662,7 @@ struct OperatorGeneratorArgs {
 #define DEFINE_BOOL_OP(aten_op, op)                                     \
   OperatorGeneratorArgs(                                                \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".bool(bool a, bool b) -> bool"), \
-      [](Stack* stack) {                                                \
+      [](Stack& stack) {                                                \
         bool a, b;                                                      \
         pop(stack, a, b);                                               \
         push(stack, op);                                                \
@@ -671,7 +671,7 @@ struct OperatorGeneratorArgs {
 #define DEFINE_STRING_OP(op_name, string_op, result)                    \
   OperatorGeneratorArgs(                                                \
       TORCH_SELECTIVE_SCHEMA(#op_name ".str(str a, str b) ->" #result), \
-      [](Stack* stack) {                                                \
+      [](Stack& stack) {                                                \
         auto b = pop(stack).toStringRef();                              \
         auto a = pop(stack).toStringRef();                              \
         push(stack, string_op);                                         \
@@ -685,7 +685,7 @@ struct OperatorGeneratorArgs {
 #define DEFINE_UNARY_COMPLEX_OP(aten_op, op, result)                      \
   OperatorGeneratorArgs(                                                  \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".complex(complex a) -> " #result), \
-      [](Stack* stack) {                                                  \
+      [](Stack& stack) {                                                  \
         c10::complex<double> a;                                           \
         pop(stack, a);                                                    \
         push(stack, op);                                                  \
@@ -709,7 +709,7 @@ struct OperatorGeneratorArgs {
       DEFINE_UNARY_COMPLEX_OP(aten_op, op, complex_result),               \
       OperatorGeneratorArgs(                                              \
           TORCH_SELECTIVE_SCHEMA(#aten_op ".Scalar(Scalar a) -> Scalar"), \
-          [](Stack* stack) {                                              \
+          [](Stack& stack) {                                              \
             IValue x;                                                     \
             pop(stack, x);                                                \
             if (x.isDouble()) {                                           \
@@ -739,7 +739,7 @@ struct OperatorGeneratorArgs {
     complex_result)                                                           \
   OperatorGeneratorArgs(                                                      \
       TORCH_SELECTIVE_SCHEMA(#aten_op ".int(int a, int b) -> " #int_result),  \
-      [](Stack* stack) {                                                      \
+      [](Stack& stack) {                                                      \
         int64_t a, b;                                                         \
         pop(stack, a, b);                                                     \
         push(stack, int_op);                                                  \
@@ -748,7 +748,7 @@ struct OperatorGeneratorArgs {
       OperatorGeneratorArgs(                                                  \
           TORCH_SELECTIVE_SCHEMA(                                             \
               #aten_op ".complex(complex a, complex b) -> " #complex_result), \
-          [](Stack* stack) {                                                  \
+          [](Stack& stack) {                                                  \
             c10::complex<double> a, b;                                        \
             pop(stack, a, b);                                                 \
             push(stack, complex_op);                                          \
@@ -757,7 +757,7 @@ struct OperatorGeneratorArgs {
       OperatorGeneratorArgs(                                                  \
           TORCH_SELECTIVE_SCHEMA(                                             \
               #aten_op ".float(float a, float b) -> " #float_result),         \
-          [](Stack* stack) {                                                  \
+          [](Stack& stack) {                                                  \
             double a, b;                                                      \
             pop(stack, a, b);                                                 \
             push(stack, float_op);                                            \
@@ -768,7 +768,7 @@ struct OperatorGeneratorArgs {
   OperatorGeneratorArgs(                                                    \
       TORCH_SELECTIVE_SCHEMA(#aten_op                                       \
                              ".int_complex(int a, complex b) -> " #result), \
-      [](Stack* stack) {                                                    \
+      [](Stack& stack) {                                                    \
         int64_t a;                                                          \
         c10::complex<double> b;                                             \
         pop(stack, a, b);                                                   \
@@ -778,7 +778,7 @@ struct OperatorGeneratorArgs {
       OperatorGeneratorArgs(                                                \
           TORCH_SELECTIVE_SCHEMA(                                           \
               #aten_op ".complex_int(complex a, int b) -> " #result),       \
-          [](Stack* stack) {                                                \
+          [](Stack& stack) {                                                \
             c10::complex<double> a;                                         \
             int64_t b;                                                      \
             pop(stack, a, b);                                               \
@@ -790,7 +790,7 @@ struct OperatorGeneratorArgs {
   OperatorGeneratorArgs(                                                  \
       TORCH_SELECTIVE_SCHEMA(                                             \
           #aten_op ".float_complex(float a, complex b) -> " #result),     \
-      [](Stack* stack) {                                                  \
+      [](Stack& stack) {                                                  \
         double a;                                                         \
         c10::complex<double> b;                                           \
         pop(stack, a, b);                                                 \
@@ -800,7 +800,7 @@ struct OperatorGeneratorArgs {
       OperatorGeneratorArgs(                                              \
           TORCH_SELECTIVE_SCHEMA(                                         \
               #aten_op ".complex_float(complex a, float b) -> " #result), \
-          [](Stack* stack) {                                              \
+          [](Stack& stack) {                                              \
             c10::complex<double> a;                                       \
             double b;                                                     \
             pop(stack, a, b);                                             \
@@ -813,7 +813,7 @@ struct OperatorGeneratorArgs {
   OperatorGeneratorArgs(                                              \
       TORCH_SELECTIVE_SCHEMA(#aten_op string_val                      \
                              "(Scalar a, Scalar b) -> " #result),     \
-      [](Stack* stack) {                                              \
+      [](Stack& stack) {                                              \
         IValue x, y;                                                  \
         pop(stack, x, y);                                             \
         if (x.isComplexDouble()) {                                    \
@@ -860,7 +860,7 @@ struct OperatorGeneratorArgs {
     aten_op, int_op, float_op, complex_op, result)                         \
   OperatorGeneratorArgs(                                                   \
       TORCH_SELECTIVE_SCHEMA(#aten_op "(Scalar a, Scalar b) -> " #result), \
-      [](Stack* stack) {                                                   \
+      [](Stack& stack) {                                                   \
         IValue x, y;                                                       \
         pop(stack, x, y);                                                  \
         if (x.isComplexDouble()) {                                         \
diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp
index 2953b686ee379..9164471dfddf7 100644
--- a/torch/csrc/jit/runtime/register_prim_ops.cpp
+++ b/torch/csrc/jit/runtime/register_prim_ops.cpp
@@ -89,7 +89,7 @@ auto powWrapper(T a, U b) {
 static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::str(t elem) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           std::stringstream ss;
           ss << pop(stack);
           push(stack, ss.str());
@@ -97,7 +97,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::list(str t) -> str[]"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto str = pop(stack).toStringRef();
           c10::List<std::string> chars;
           chars.reserve(str.size());
@@ -109,7 +109,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::cpu(Tensor(a) self) -> Tensor(a|b)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.cpu());
@@ -117,7 +117,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::layout(Tensor a) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.layout());
@@ -128,7 +128,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::__range_length(int lo, int hi, int step) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           int64_t lo, hi, step;
           pop(stack, lo, hi, step);
@@ -148,7 +148,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::__derive_index(int index, int start, int step) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           int64_t index, start, step;
           pop(stack, index, start, step);
@@ -157,7 +157,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::TupleUnpack(Any tup) -> ..."),
-        [](Stack* stack) { tupleUnpack(*stack); },
+        [](Stack& stack) { tupleUnpack(stack); },
         aliasAnalysisSpecialCase()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::unchecked_cast(t x) -> t"),
@@ -165,7 +165,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisSpecialCase()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::IntImplicit(Tensor a) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           checkImplicitTensorToNum(a, /*to int*/ true);
@@ -174,7 +174,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::ComplexImplicit(Tensor a) -> complex"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           checkImplicitTensorToNum(a, /*to int*/ false);
@@ -183,7 +183,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::FloatImplicit(Tensor a) -> float"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           checkImplicitTensorToNum(a, /*to int*/ false);
@@ -192,7 +192,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::ScalarImplicit(Tensor a) -> Scalar"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           checkImplicitTensorToNum(a, /*to int*/ false);
@@ -201,7 +201,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Bool.Tensor(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_nonzero());
@@ -209,7 +209,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Bool.int(int a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           int64_t i;
           pop(stack, i);
@@ -218,7 +218,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Bool.float(float a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           double d;
           pop(stack, d);
@@ -227,7 +227,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Int.Tensor(Tensor a) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.item<int64_t>());
@@ -235,7 +235,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Int.bool(bool a) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           bool b;
           pop(stack, b);
@@ -244,7 +244,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Int.float(float a) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           double d;
           pop(stack, d);
@@ -253,7 +253,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Int.Scalar(Scalar a) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue scalar;
           pop(stack, scalar);
           if (scalar.isInt()) {
@@ -266,7 +266,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Int.str(str a) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto s = pop(stack).toString();
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           std::string::size_type sz;
@@ -283,7 +283,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Float.Tensor(Tensor a) -> float"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.item<double>());
@@ -291,7 +291,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Float.Scalar(Scalar a) -> float"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue scalar;
           pop(stack, scalar);
           if (scalar.isDouble()) {
@@ -305,7 +305,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Float.int(int a) -> float"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           int64_t i;
           pop(stack, i);
@@ -314,7 +314,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Float.bool(bool a) -> float"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           bool b;
           pop(stack, b);
@@ -323,7 +323,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Float.str(str a) -> float"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto s = pop(stack).toString();
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           std::string::size_type sz;
@@ -340,7 +340,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Complex.Scalar(Scalar a) -> complex"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue scalar;
           pop(stack, scalar);
           if (scalar.isComplexDouble()) {
@@ -355,7 +355,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::Complex.Tensor_Tensor(Tensor a, Tensor b) -> complex"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a, b;
           pop(stack, a, b);
           push(stack, c10::complex<double>(a.item<double>(), b.item<double>()));
@@ -363,21 +363,21 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::format(str self, ...) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           size_t num_inputs = pop(stack).toInt();
-          format(*stack, num_inputs);
+          format(stack, num_inputs);
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::einsum.sublist(Tensor a, ...) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           size_t num_inputs = pop(stack).toInt();
-          einsum(*stack, num_inputs);
+          einsum(stack, num_inputs);
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.Scalar(Scalar a) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Scalar s;
           pop(stack, s);
           push(stack, at::scalar_to_tensor(s));
@@ -385,29 +385,29 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::RaiseException(str msg) -> ()"),
-        [](Stack* stack) { throw JITException(pop(stack).toStringRef()); },
+        [](Stack& stack) { throw JITException(pop(stack).toStringRef()); },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::Size(int[] sizes) -> int[]"),
-        [](Stack* stack) {},
+        [](Stack& stack) {},
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::size(Tensor self) -> int[]"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto t = std::move(pop(stack)).toTensor();
           pack(stack, t.sizes().vec());
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::EnumName(AnyEnumType enum) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue e = pop(stack);
           push(stack, e.toEnumHolder()->name());
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::EnumValue.int(AnyEnumType enum) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue e = pop(stack);
           push(stack, e.toEnumHolder()->value());
         },
@@ -415,14 +415,14 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "prim::EnumValue.float(AnyEnumType enum) -> float"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue e = pop(stack);
           push(stack, e.toEnumHolder()->value());
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::EnumValue.str(AnyEnumType enum) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue e = pop(stack);
           push(stack, e.toEnumHolder()->value());
         },
@@ -431,7 +431,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         // note the compiler knows to type TupleIndex more accurately than it
         // is listed here.
         TORCH_SELECTIVE_SCHEMA("prim::TupleIndex(Any tup, int i) -> Any"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           int64_t index = pop(stack).toInt();
           auto tuple = pop(stack).toTuple();
           auto norm_index = normalizeIndex(index, tuple->elements().size());
@@ -439,7 +439,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
               norm_index > static_cast<int64_t>(tuple->elements().size())) {
             throw std::out_of_range("Tuple list index out of range");
           }
-          stack->emplace_back(tuple->elements()[norm_index]);
+          stack.emplace_back(tuple->elements()[norm_index]);
         },
         aliasAnalysisSpecialCase()),
     OperatorGeneratorArgs(
@@ -453,11 +453,11 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::device(Tensor a) -> Device"),
-        [](Stack* stack) { push(stack, pop(stack).toTensor().device()); },
+        [](Stack& stack) { push(stack, pop(stack).toTensor().device()); },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::dtype(Tensor a) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, static_cast<int64_t>(a.scalar_type()));
@@ -465,11 +465,11 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::__not__(bool self) -> bool"),
-        [](Stack* stack) { push(stack, !pop(stack).toBool()); },
+        [](Stack& stack) { push(stack, !pop(stack).toBool()); },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::__is__(t1 self, t2 obj) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue self, obj;
           pop(stack, self, obj);
           push(stack, self.is(obj));
@@ -477,7 +477,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::__isnot__(t1 self, t2 obj) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue self, obj;
           pop(stack, self, obj);
           push(stack, !self.is(obj));
@@ -485,28 +485,28 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::element_size(Tensor self) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor arg = pop(stack).toTensor();
           push(stack, arg.element_size());
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::numel(Tensor self) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor arg = pop(stack).toTensor();
           push(stack, arg.numel());
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::dim(Tensor self) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor arg = pop(stack).toTensor();
           push(stack, arg.dim());
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::get_device(Tensor self) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           RECORD_FUNCTION("get_device", std::vector<c10::IValue>());
           auto result =
               at::get_device((std::move(peek(stack, 0, 1))).toTensor());
@@ -516,7 +516,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::storage_offset(Tensor self) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           RECORD_FUNCTION("storage_offset", std::vector<c10::IValue>());
           auto result =
               ((std::move(peek(stack, 0, 1))).toTensor()).storage_offset();
@@ -526,7 +526,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::is_contiguous(Tensor self) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           RECORD_FUNCTION("is_contiguous", std::vector<c10::IValue>());
           auto result =
               ((std::move(peek(stack, 0, 1))).toTensor()).is_contiguous();
@@ -623,7 +623,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::eq.device(Device a, Device b) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto a = pop(stack).toDevice();
           auto b = pop(stack).toDevice();
           push(stack, a == b);
@@ -631,7 +631,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::ne.device(Device a, Device b) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto a = pop(stack).toDevice();
           auto b = pop(stack).toDevice();
           push(stack, a != b);
@@ -639,7 +639,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::eq.bool(bool a, bool b) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto a = pop(stack);
           auto b = pop(stack);
           push(stack, a == b);
@@ -647,7 +647,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::ne.bool(bool a, bool b) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto a = pop(stack);
           auto b = pop(stack);
           push(stack, a != b);
@@ -655,11 +655,11 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::Uninitialized() -> Any"),
-        [](Stack* stack) { push(stack, IValue::uninitialized()); },
+        [](Stack& stack) { push(stack, IValue::uninitialized()); },
         aliasAnalysisSpecialCase()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::Print(...) -> ()"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto num_inputs = pop(stack).toInt();
           std::stringstream ss;
           bool first = true;
@@ -682,7 +682,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     //    prim::VarConcat(Tensors..., dim) -> Tensor
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::VarConcat(...) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto num_inputs = pop(stack).toInt();
           auto dim = pop(stack).toInt();
           std::vector<at::Tensor> inputs(num_inputs - 1);
@@ -694,7 +694,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::VarStack(...) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto num_inputs = pop(stack).toInt();
           auto dim = pop(stack).toInt();
           std::vector<at::Tensor> inputs(num_inputs - 1);
@@ -707,7 +707,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::eq.enum(AnyEnumType a, AnyEnumType b) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue x = pop(stack);
           IValue y = pop(stack);
           push(stack, x == y);
@@ -716,7 +716,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::ne.enum(AnyEnumType a, AnyEnumType b) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue x = pop(stack);
           IValue y = pop(stack);
           push(stack, x != y);
@@ -731,7 +731,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::dequantize.tensor(Tensor qtensor) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor qtensor;
           pop(stack, qtensor);
           push(stack, at::dequantize(qtensor));
@@ -740,14 +740,14 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::dequantize.list(Tensor[] qtensors) -> Tensor[]"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto qtensors = pop(stack).toTensorVector();
           push(stack, at::dequantize(qtensors));
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::dequantize.any(Any tensors) -> Any"),
-        [](Stack* stack) { dequantize(*stack); },
+        [](Stack& stack) { dequantize(stack); },
         aliasAnalysisFromSchema()),
     DEFINE_UNARY_OP_WITH_COMPLEX(aten::log, std::log(a), float, float),
     DEFINE_STRING_OP(aten::add, a + b, str),
@@ -847,7 +847,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         float),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::pow.int_to_int(int a, int b) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           int64_t a, b;
           pop(stack, a, b);
@@ -860,7 +860,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     DEFINE_BINARY_OP(prim::max, a > b ? a : b),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::type(Device self) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto d = pop(stack);
           push(
               stack, DeviceTypeName(d.toDevice().type(), /* lower_case=*/true));
@@ -869,7 +869,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     // tensor length op (size of 1st dimension)
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::len.Tensor(Tensor t) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor t = pop(stack).toTensor();
           if (t.dim() == 0) {
             AT_ERROR("len() of a 0-d tensor");
@@ -879,7 +879,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::ord(str string) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto string = pop(stack).toStringRef();
           TORCH_CHECK(
               string.size() == 1,
@@ -891,7 +891,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::lower(str self) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto string = pop(stack).toStringRef();
           std::stringstream ss;
           for (char c : string) {
@@ -912,14 +912,14 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::len.str(str s) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto string = pop(stack).toStringRef();
           push(stack, static_cast<int64_t>(string.size()));
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::dict() -> Dict(str, Tensor)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto dict =
               c10::impl::GenericDict(StringType::get(), TensorType::get());
           push(stack, dict);
@@ -928,7 +928,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::__getitem__.str(str s, int index) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto index = pop(stack).toInt();
           auto string = pop(stack).toStringRef();
           auto norm_index = normalizeIndex(index, string.size());
@@ -941,7 +941,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
       TORCH_SELECTIVE_SCHEMA("aten::copy_." #other_type                  \
                              "(Tensor(a!) self, " #other_type            \
                              " other) -> Tensor(a!)"),                   \
-      [](Stack* stack) {                                                 \
+      [](Stack& stack) {                                                 \
         at::Tensor t;                                                    \
         c_type other;                                                    \
         pop(stack, t, other);                                            \
@@ -957,7 +957,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::backward(Tensor self, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           bool create_graph = pop(stack).toBool();
           auto retain_graph = pop(stack).toOptional<bool>();
           IValue gradient_ivalue = pop(stack);
@@ -977,7 +977,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
           auto self = pop(stack).toTensor();
           auto result = at::index(self, indices);
@@ -987,7 +987,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::_index_put_impl_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto unsafe = pop(stack).toBool();
           auto accumulate = pop(stack).toBool();
           auto values = pop(stack).toTensor();
@@ -1001,7 +1001,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::index_put_.hacked_twin(Tensor(a!) self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto accumulate = pop(stack).toBool();
           auto values = pop(stack).toTensor();
           auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
@@ -1013,7 +1013,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::index_put.hacked_twin(Tensor self, Tensor[] indices, Tensor values, bool accumulate=False) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto accumulate = pop(stack).toBool();
           auto values = pop(stack).toTensor();
           auto indices = pop(stack).to<List<c10::optional<at::Tensor>>>();
@@ -1026,7 +1026,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::to.prim_Device(Tensor(a) self, Device? device, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           bool non_blocking;
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -1044,7 +1044,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::to.prim_dtype(Tensor(a) self, int? dtype=None, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           bool non_blocking;
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -1060,7 +1060,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::is_cuda(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_cuda());
@@ -1068,7 +1068,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::is_xpu(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_xpu());
@@ -1076,7 +1076,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::data(Tensor(a) a) -> Tensor(a)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, autograd::Variable(a).variable_data());
@@ -1113,7 +1113,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
 #define DEFINE_STRING_IS_OP(op_name, char_op)                          \
   OperatorGeneratorArgs(                                               \
       TORCH_SELECTIVE_SCHEMA(#op_name "(str self) -> bool"),           \
-      [](Stack* stack) {                                               \
+      [](Stack& stack) {                                               \
         auto string = pop(stack).toStringRef();                        \
         push(                                                          \
             stack,                                                     \
@@ -1134,7 +1134,7 @@ static const OperatorGeneratorArgs opGenArgs[] = {
 #define DEFINE_STRING_CHAR_MAP_OP(op_name, char_op)         \
   OperatorGeneratorArgs(                                    \
       TORCH_SELECTIVE_SCHEMA(#op_name "(str self) -> str"), \
-      [](Stack* stack) {                                    \
+      [](Stack& stack) {                                    \
         auto string = pop(stack).toStringRef();             \
         std::stringstream ss;                               \
         for (char c : string) {                             \
@@ -1183,7 +1183,7 @@ RegisterOperators reg(([]() {
       // operator below is intended to be as close to the Python
       // implementation in torch/csrc/utils/tensor_list.cpp as possible.
       [](const Node* /*node*/) -> Operation {
-        return [](Stack* stack) {
+        return [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           int elem_ty_val;
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -1260,19 +1260,19 @@ RegisterOperators reg(([]() {
   return v;
 })());
 
-void dictSetItem(Stack* stack) {
+void dictSetItem(Stack& stack) {
   auto value = pop(stack);
   auto idx = pop(stack);
   auto dict = pop(stack).toGenericDict();
   dict.insert_or_assign(std::move(idx), std::move(value));
 }
 
-void dictLen(Stack* stack) {
+void dictLen(Stack& stack) {
   auto dict = pop(stack).toGenericDict();
   push(stack, int64_t(dict.size()));
 }
 
-void dictValues(Stack* stack) {
+void dictValues(Stack& stack) {
   auto dict = pop(stack).toGenericDict();
   auto values = c10::impl::GenericList(dict.valueType());
   for (const auto& entry : dict) {
@@ -1281,7 +1281,7 @@ void dictValues(Stack* stack) {
   push(stack, values);
 }
 
-void dictKeys(Stack* stack) {
+void dictKeys(Stack& stack) {
   auto dict = pop(stack).toGenericDict();
   auto keys = c10::impl::GenericList(dict.keyType());
   for (const auto& entry : dict) {
@@ -1290,7 +1290,7 @@ void dictKeys(Stack* stack) {
   push(stack, keys);
 }
 
-void dictIndex(Stack* stack) {
+void dictIndex(Stack& stack) {
   auto key = pop(stack);
   auto dict = pop(stack).toGenericDict();
   auto value = dict.find(key);
@@ -1301,7 +1301,7 @@ void dictIndex(Stack* stack) {
 }
 
 template <bool has_default>
-void dictGet(Stack* stack) {
+void dictGet(Stack& stack) {
   IValue default_value;
   if (has_default) {
     default_value = pop(stack);
@@ -1318,7 +1318,7 @@ void dictGet(Stack* stack) {
 
 // If the key is in the dict, return it. Else set it to the default value and
 // return that.
-void dictSetDefault(Stack* stack) {
+void dictSetDefault(Stack& stack) {
   auto default_value = pop(stack);
   auto key = pop(stack);
   auto dict = pop(stack).toGenericDict();
@@ -1332,7 +1332,7 @@ void dictSetDefault(Stack* stack) {
 }
 
 template <bool has_default>
-void dictPop(Stack* stack) {
+void dictPop(Stack& stack) {
   IValue default_value;
   if (has_default) {
     default_value = pop(stack);
@@ -1355,13 +1355,13 @@ void dictPop(Stack* stack) {
   }
 }
 
-void dictDelete(Stack* stack) {
+void dictDelete(Stack& stack) {
   dictPop<false>(stack);
   // pop pushes an item on the stack but delete does not, so get rid of it
   pop(stack);
 }
 
-void dictPopItem(Stack* stack) {
+void dictPopItem(Stack& stack) {
   auto dict = pop(stack).toGenericDict();
   if (dict.size() == 0) {
     AT_ERROR("popitem(): dictionary is empty");
@@ -1376,18 +1376,18 @@ void dictPopItem(Stack* stack) {
   push(stack, tuple);
 }
 
-void dictContains(Stack* stack) {
+void dictContains(Stack& stack) {
   auto key = pop(stack);
   auto dict = pop(stack).toGenericDict();
   push(stack, dict.contains(key));
 }
 
-void dictClear(Stack* stack) {
+void dictClear(Stack& stack) {
   auto dict = pop(stack).toGenericDict();
   dict.clear();
 }
 
-void dictUpdate(Stack* stack) {
+void dictUpdate(Stack& stack) {
   auto to_add = pop(stack).toGenericDict();
   auto dict = pop(stack).toGenericDict();
 
@@ -1396,7 +1396,7 @@ void dictUpdate(Stack* stack) {
   }
 }
 
-void dictItems(Stack* stack) {
+void dictItems(Stack& stack) {
   auto dict = pop(stack).toGenericDict();
   auto key_type = dict.keyType();
   auto value_type = dict.valueType();
@@ -1409,11 +1409,11 @@ void dictItems(Stack* stack) {
   push(stack, std::move(items));
 }
 
-void dictCopy(Stack* stack) {
+void dictCopy(Stack& stack) {
   push(stack, pop(stack).toGenericDict().copy());
 }
 
-void dictConstructFromList(Stack* stack) {
+void dictConstructFromList(Stack& stack) {
   auto input_list = pop(stack);
   auto list = input_list.toList();
   auto tup_type = list.elementType()->expect<TupleType>();
@@ -2120,7 +2120,7 @@ TORCH_LIBRARY_IMPL(aten, CatchAll, m) {
 static const OperatorGeneratorArgs opGenArgs1[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::rangelist(int n) -> int[]"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           int64_t n;
           pop(stack, n);
@@ -2136,7 +2136,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
     // because all _to_tensor conversion have to have the same operator namet
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::NumToTensor.bool(bool a) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           bool b;
           pop(stack, b);
@@ -2145,21 +2145,21 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::device(str a) -> Device"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           push(stack, c10::Device(pop(stack).toStringRef()));
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::percentFormat(str self, ...) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           size_t num_inputs = pop(stack).toInt();
-          percentFormat(*stack, num_inputs);
+          percentFormat(stack, num_inputs);
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::to.prim_other(Tensor(a) self, bool non_blocking=False, bool copy=False) -> Tensor(a|b)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor self;
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           bool non_blocking;
@@ -2174,7 +2174,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::requires_grad(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.requires_grad());
@@ -2182,7 +2182,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::grad(Tensor a) -> Tensor(*)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.grad());
@@ -2190,7 +2190,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::is_sparse(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_sparse());
@@ -2198,7 +2198,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::is_sparse_csr(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_sparse_csr());
@@ -2206,7 +2206,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::is_mkldnn(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_mkldnn());
@@ -2214,7 +2214,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::is_mlc(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_mlc());
@@ -2222,7 +2222,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::is_vulkan(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_vulkan());
@@ -2230,7 +2230,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::is_quantized(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_quantized());
@@ -2238,7 +2238,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::is_meta(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_meta());
@@ -2246,7 +2246,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::is_ort(Tensor a) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.is_ort());
@@ -2254,7 +2254,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::name(Tensor a) -> str?"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           if (a.name() == "") {
@@ -2266,7 +2266,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::index(Device self) -> int?"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto d = pop(stack).toDevice();
           if (d.has_index()) {
             push(stack, d.index());
@@ -2279,11 +2279,11 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         // TODO return generator object when torchscript supports RNG
         // first-class
         TORCH_SELECTIVE_SCHEMA("aten::manual_seed(int seed) -> ()"),
-        [](Stack* stack) { at::manual_seed(pop(stack).toInt()); },
+        [](Stack& stack) { at::manual_seed(pop(stack).toInt()); },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::cuda(Tensor(a) self) -> Tensor(a|b)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
           push(stack, a.cuda());
@@ -2291,12 +2291,12 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::AutogradZero() -> Tensor"),
-        [](Stack* stack) { stack->emplace_back(at::Tensor()); },
+        [](Stack& stack) { stack.emplace_back(at::Tensor()); },
         aliasAnalysisSpecialCase()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "prim::ReductionSizes(int[] size, int[] red_axes, bool keepdim = False) -> int[]"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           bool keepdim = pop(stack).toBool();
           c10::List<int64_t> axes = pop(stack).toIntList();
           c10::List<int64_t> size = pop(stack).toIntList();
@@ -2324,7 +2324,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::BroadcastSizes(...) -> int[]"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto num_inputs = pop(stack).toInt();
           std::vector<int64_t> size;
           size.reserve(8);
@@ -2339,7 +2339,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::warn(str message, int stacklevel=2) -> ()"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           TORCH_CHECK(false, "warn is implemented directly in the interpreter");
         },
         aliasAnalysisFromSchema()),
@@ -2347,7 +2347,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "onnx::Reshape(Tensor input, Tensor shape) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor input, shape;
           pop(stack, input, shape);
           shape = shape.contiguous();
@@ -2358,7 +2358,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
         aliasAnalysisSpecialCase()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("onnx::Shape(Tensor t) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto t = pop(stack).toTensor();
           at::IntArrayRef sizes = t.sizes();
           auto sizes_tensor = torch::empty(
@@ -2367,12 +2367,12 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
           for (const auto i : c10::irange(sizes.size())) {
             accessor[i] = sizes[i];
           }
-          stack->emplace_back(sizes_tensor);
+          stack.emplace_back(sizes_tensor);
         },
         aliasAnalysisSpecialCase()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::AutogradAnyNonZero(...) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto num_inputs = pop(stack).toInt();
           bool result = false;
           for (const IValue& v : last(stack, num_inputs)) {
@@ -2395,12 +2395,12 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
             }
           }
           drop(stack, num_inputs);
-          stack->emplace_back(result);
+          stack.emplace_back(result);
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::AutogradAllZero(...) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto num_inputs = pop(stack).toInt();
           bool result = true;
           for (const IValue& v : last(stack, num_inputs)) {
@@ -2411,12 +2411,12 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
             }
           }
           drop(stack, num_inputs);
-          stack->emplace_back(result);
+          stack.emplace_back(result);
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::AutogradAllNonZero(...) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto num_inputs = pop(stack).toInt();
           bool result = true;
           for (const IValue& v : last(stack, num_inputs)) {
@@ -2427,31 +2427,31 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
             }
           }
           drop(stack, num_inputs);
-          stack->emplace_back(result);
+          stack.emplace_back(result);
         },
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::AutogradAdd(Any a, Any b) -> Any"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor a, b;
           pop(stack, a, b);
           // NOLINTNEXTLINE(bugprone-branch-clone)
           if (!a.defined() && !b.defined()) {
             // undef + undef == undef
-            stack->emplace_back(a);
+            stack.emplace_back(a);
           } else if (!a.defined()) {
-            stack->emplace_back(b);
+            stack.emplace_back(b);
           } else if (!b.defined()) {
-            stack->emplace_back(a);
+            stack.emplace_back(a);
           } else {
-            stack->emplace_back(a + b);
+            stack.emplace_back(a + b);
           }
         },
         aliasAnalysisSpecialCase()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::_size_if_not_equal(int[] self_size, int[] other_size) -> int[]?"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue self_size, other_size;
           pop(stack, self_size, other_size);
           auto s = self_size.toIntVector();
@@ -2466,7 +2466,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::_unwrap_optional(t(a)? optional) -> t(a)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto val = pop(stack);
           TORCH_CHECK(!val.isNone(), "Unwrapping null optional");
           push(stack, std::move(val));
@@ -2476,7 +2476,7 @@ static const OperatorGeneratorArgs opGenArgs1[] = {
 RegisterOperators reg1(
     createOperators(opGenArgs1, sizeof(opGenArgs1) / sizeof(opGenArgs1[0])));
 
-void hashValue(Stack* stack) {
+void hashValue(Stack& stack) {
   auto value = pop(stack);
   push(stack, value.hash());
 }
@@ -2618,7 +2618,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
 #define DEFINE_CONVERT_BASE_OP(op_name, prefix, char_op) \
   OperatorGeneratorArgs(                                 \
       TORCH_SELECTIVE_SCHEMA(#op_name "(int i) -> str"), \
-      [](Stack* stack) {                                 \
+      [](Stack& stack) {                                 \
         auto i = pop(stack).toInt();                     \
         std::stringstream ss;                            \
         if (i < 0) {                                     \
@@ -2635,7 +2635,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
 
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::bin(int i) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto i = pop(stack).toInt();
           std::stringstream ss;
           if (i == 0) {
@@ -2656,7 +2656,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "prim::StringIndex(str string, int index) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto index = pop(stack).toInt();
           auto string = pop(stack).toStringRef();
           auto norm_index = normalizeIndex(index, string.size());
@@ -2666,7 +2666,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::chr(int i) -> str"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto i = pop(stack).toInt();
           std::stringstream ss;
           TORCH_CHECK(
@@ -2684,7 +2684,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
 
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::modf(float a) -> (float, float)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           double a;
           pop(stack, a);
@@ -2696,7 +2696,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::frexp(float a) -> (float, int)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           double a;
           pop(stack, a);
@@ -2710,7 +2710,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::ldexp(float x, int i) -> float"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           double a;
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -2810,7 +2810,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         float),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::abs(Tensor x) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor x;
           pop(stack, x);
           push(stack, x.abs());
@@ -2833,7 +2833,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         float),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::_tensor_to_list(Tensor self) -> int[]"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor t;
           pop(stack, t);
           c10::List<int64_t> elems;
@@ -2846,7 +2846,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::_list_to_tensor(int[] self) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<int64_t> l = pop(stack).toIntList();
           auto t = torch::empty(
               {static_cast<int64_t>(l.size())}, at::dtype(at::kInt));
@@ -2858,7 +2858,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::sum.int(int[] self) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<int64_t> l = pop(stack).toIntList();
           auto sum = 0;
           for (const auto& elem : l) {
@@ -2869,7 +2869,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::sum.float(float[] self) -> float"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<double> l = pop(stack).toDoubleList();
           auto sum = 0.0;
           for (const auto& elem : l) {
@@ -2880,7 +2880,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::sum.complex(complex[] self) -> complex"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<c10::complex<double>> l = pop(stack).toComplexDoubleList();
           c10::complex<double> sum = 0.0;
           for (const auto i : c10::irange(l.size())) {
@@ -2891,7 +2891,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::sum.bool(bool[] self) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<bool> l = pop(stack).toBoolList();
           auto sum = 0;
           for (const auto& elem : l) {
@@ -2904,7 +2904,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::any.str(str[] self) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto l = pop(stack).toList();
           for (const auto& elem : l) {
             if (elem != "") {
@@ -2917,7 +2917,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::any.int(int[] self) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<int64_t> l = pop(stack).toIntList();
           for (const auto& elem : l) {
             if (elem) {
@@ -2930,7 +2930,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::any.float(float[] self) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<double> l = pop(stack).toDoubleList();
           for (const auto& elem : l) {
             if (elem) {
@@ -2943,7 +2943,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::any.bool(bool[] self) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<bool> l = pop(stack).toBoolList();
           for (const auto& elem : l) {
             if (elem) {
@@ -2956,7 +2956,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::all.int(int[] self) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<int64_t> l = pop(stack).toIntList();
           for (const auto& elem : l) {
             if (!elem) {
@@ -2969,7 +2969,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::all.float(float[] self) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<double> l = pop(stack).toDoubleList();
           for (const auto& elem : l) {
             if (!elem) {
@@ -2982,7 +2982,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::all.bool(bool[] self) -> bool"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           c10::List<bool> l = pop(stack).toBoolList();
           for (const auto& elem : l) {
             if (!elem) {
@@ -2995,7 +2995,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("aten::divmod.int(int x, int y) -> (int, int)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           int64_t a, b;
           lldiv_t divresult = {};
@@ -3018,7 +3018,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA(
             "aten::divmod.float(float x, float y) -> (float, float)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           double a, b;
           pop(stack, a, b);
@@ -3035,7 +3035,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
         aliasAnalysisFromSchema()),
     OperatorGeneratorArgs(
         TORCH_SELECTIVE_SCHEMA("prim::id(AnyClassType? x) -> int"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           IValue a;
           pop(stack, a);
           if (a.isNone()) {
@@ -3050,7 +3050,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
   OperatorGeneratorArgs(                                                     \
       TORCH_SELECTIVE_SCHEMA("aten::divmod." #type_a "_" #type_b "(" #type_a \
                              " x," #type_b " y) -> (float, float)"),         \
-      [](Stack* stack) {                                                     \
+      [](Stack& stack) {                                                     \
         type_a a;                                                            \
         type_b b;                                                            \
         pop(stack, a, b);                                                    \
@@ -3076,7 +3076,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
   OperatorGeneratorArgs(                                                      \
       TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_a "_" #type_b "(" #type_a \
                              " x," #type_b " y) -> complex"),                 \
-      [](Stack* stack) {                                                      \
+      [](Stack& stack) {                                                      \
         actual_type_a a;                                                      \
         actual_type_b b;                                                      \
         pop(stack, a, b);                                                     \
@@ -3090,7 +3090,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
   OperatorGeneratorArgs(                                                      \
       TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_a "_" #type_b "(" #type_a \
                              " x," #type_b " y) -> complex"),                 \
-      [](Stack* stack) {                                                      \
+      [](Stack& stack) {                                                      \
         actual_type_a a;                                                      \
         actual_type_b b;                                                      \
         pop(stack, a, b);                                                     \
@@ -3101,7 +3101,7 @@ static const OperatorGeneratorArgs opGenArgs2[] = {
       OperatorGeneratorArgs(                                                  \
           TORCH_SELECTIVE_SCHEMA("aten::Complex." #type_b "_" #type_a         \
                                  "(" #type_b " x," #type_a " y) -> complex"), \
-          [](Stack* stack) {                                                  \
+          [](Stack& stack) {                                                  \
             actual_type_b a;                                                  \
             actual_type_a b;                                                  \
             pop(stack, a, b);                                                 \
diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp
index 43c278be474fd..e43c7c052a673 100644
--- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp
+++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp
@@ -31,7 +31,7 @@ RegisterOperators reg(
     {Operator(
          prim::profile,
          [](const Node* node) -> Operation {
-           return [](Stack* stack) {
+           return [](Stack& stack) {
              AT_ERROR(
                  "Must be lowered to Interpreter's PROFILE instruction"); // NOLINT
            };
@@ -40,7 +40,7 @@ RegisterOperators reg(
      Operator(
          prim::profile_ivalue,
          [](const Node* node) -> Operation {
-           return [](Stack* stack) {
+           return [](Stack& stack) {
              AT_ERROR(
                  "Must be lowered to Interpreter's PROFILE instruction"); // NOLINT
            };
@@ -50,9 +50,9 @@ RegisterOperators reg(
          prim::FusionGroup,
          [](const Node* node) -> Operation {
            const auto key = registerFusion(node);
-           return [key](Stack* stack) {
+           return [key](Stack& stack) {
              RECORD_FUNCTION("FusionGroup", std::vector<c10::IValue>());
-             runFusion(key, *stack);
+             runFusion(key, stack);
            };
          },
          aliasAnalysisSpecialCase()),
@@ -67,7 +67,7 @@ RegisterOperators reg(
                      t->castRaw<TensorType>()->requiresGrad().has_value());
                  return *t->castRaw<TensorType>()->requiresGrad();
                });
-           return [rg_props](Stack* stack) {
+           return [rg_props](Stack& stack) {
              auto num_inputs = rg_props.size();
              // Check every input's shape against profiled (expected) shape.
              for (const auto i : c10::irange(num_inputs)) {
@@ -91,14 +91,14 @@ RegisterOperators reg(
            auto outputs_used = fmap(node->outputs(), [](const Value* v) {
              return v->uses().size() > 0;
            });
-           return [=](Stack* stack) {
+           return [=](Stack& stack) {
              RECORD_FUNCTION("chunk", last(stack, 1));
 
              at::Tensor t;
              pop(stack, t);
              auto result = at::chunk(t, chunks, dim);
-             stack->insert(
-                 stack->end(),
+             stack.insert(
+                 stack.end(),
                  std::make_move_iterator(result.begin()),
                  std::make_move_iterator(result.end()));
              // NB: Chunk can sometimes return a smaller number of outputs.
@@ -121,7 +121,7 @@ RegisterOperators reg(
                      num_results);
                  // We know that the output is unused, so it's ok to push
                  // anything on the stack.
-                 stack->emplace_back();
+                 stack.emplace_back();
                }
              }
            };
@@ -132,7 +132,7 @@ RegisterOperators reg(
          [](const Node* node) -> Operation {
            int64_t raw_dim = node->i(attr::dim);
            int64_t chunks = node->i(attr::chunks);
-           return [raw_dim, chunks](Stack* stack) {
+           return [raw_dim, chunks](Stack& stack) {
              c10::List<int64_t> shape = pop(stack).toIntList();
              c10::List<int64_t> regular_shape = shape.copy();
              c10::List<int64_t> last_shape = shape.copy();
@@ -158,7 +158,7 @@ RegisterOperators reg(
          aliasAnalysisSpecialCase()),
      Operator(
          "aten::_grad_sum_to_size(Tensor(a) self, int[]? size) -> Tensor(a)",
-         [](Stack* stack) {
+         [](Stack& stack) {
            RECORD_FUNCTION("_grad_sum_to_size", std::vector<c10::IValue>());
            IValue self, size;
            pop(stack, self, size);
@@ -175,7 +175,7 @@ RegisterOperators reg(
      OperatorGenerator(
          TORCH_SELECTIVE_SCHEMA(
              "prim::ModuleContainerIndex.list(Any self, int ind) -> Any"),
-         [](Stack* stack) {
+         [](Stack& stack) {
            IValue ind = pop(stack);
            IValue module_dict = pop(stack);
            std::stringstream ss;
@@ -189,7 +189,7 @@ RegisterOperators reg(
      OperatorGenerator(
          TORCH_SELECTIVE_SCHEMA(
              "prim::ModuleContainerIndex.dict(Any self, str ind) -> Any"),
-         [](Stack* stack) {
+         [](Stack& stack) {
            IValue ind = pop(stack);
            IValue module_dict = pop(stack);
            push(stack, module_dict.toModule().attr(ind.toStringRef()));
@@ -198,7 +198,7 @@ RegisterOperators reg(
      Operator(
          prim::TypeCheck /* (...)  -> (..., bool) */,
          [](const Node* /* node */) -> Operation {
-           return [](Stack* /* stack */) {
+           return [](Stack& /* stack */) {
              AT_ERROR("prim::TypeCheck not yet implemented"); // NOLINT
            };
          },
@@ -206,7 +206,7 @@ RegisterOperators reg(
      Operator(
          prim::FallbackGraph,
          [](const Node* node) -> Operation {
-           return [](Stack* stack) {
+           return [](Stack& stack) {
              AT_ERROR(
                  "Must be converted to prim::FunctionCall by replaceFallbackGraphWithFallbackFunction"); // NOLINT
            };
@@ -214,17 +214,17 @@ RegisterOperators reg(
          aliasAnalysisSpecialCase()),
      Operator(
          "prim::Guard(Tensor(a) t) -> Tensor(a)",
-         [](Stack* stack) { AT_ERROR("Should be replaced by prim::BailOut"); },
+         [](Stack& stack) { AT_ERROR("Should be replaced by prim::BailOut"); },
          aliasAnalysisFromSchema()),
      Operator(
          "prim::BailOut(...) -> Tensor(a)",
-         [](Stack* /* stack */) {
+         [](Stack& /* stack */) {
            AT_ERROR("prim::BailOut not yet implemented"); // NOLINT
          },
          aliasAnalysisFromSchema()),
      Operator(
          "prim::BailoutTemplate() -> int",
-         [](Stack* stack) {
+         [](Stack& stack) {
            // TODO: today, we put a single bailout template at the front to
            // carry the un-optimized graph for bailout nodes to use. Ideally
            // this should never run, but we haven't written the code to remove
@@ -237,7 +237,7 @@ RegisterOperators reg(
          aliasAnalysisFromSchema()),
      Operator(
          "aten::grad(Tensor[] outputs, Tensor[] inputs, Tensor?[]? grad_outputs=None, bool? retain_graph=None, bool create_graph=False, bool allow_unused=False) -> Tensor?[]",
-         [](Stack* stack) {
+         [](Stack& stack) {
            bool allow_unused = pop(stack).toBool();
            bool create_graph = pop(stack).toBool();
            auto retain_graph = pop(stack).toOptional<bool>();
@@ -277,7 +277,7 @@ RegisterOperators reg(
      // create_graph=True so we use aliasAnalysisConservative for these two OPs
      Operator(
          "aten::backward.TensorList(Tensor[] tensors, Tensor?[]? grad_tensors=None, bool? retain_graph=None, bool create_graph=False) -> ()",
-         [](Stack* stack) {
+         [](Stack& stack) {
            bool create_graph = pop(stack).toBool();
            auto retain_graph = pop(stack).toOptional<bool>();
            auto grad_tensors = pop(stack);
@@ -298,7 +298,7 @@ RegisterOperators reg(
          aliasAnalysisConservative()),
      Operator(
          "aten::save(t item, str filename) -> ()",
-         [](Stack* stack) {
+         [](Stack& stack) {
            auto filename = pop(stack).toStringRef();
            auto ivalue = pop(stack);
 
@@ -312,7 +312,7 @@ RegisterOperators reg(
          aliasAnalysisFromSchema()),
      Operator(
          "prim::IgnoredPythonOp(...) -> None",
-         [](Stack* stack) {
+         [](Stack& stack) {
            throw JITException(
                "This Python function is annotated to be ignored"
                " and cannot be and has not been included in the exported"
@@ -323,7 +323,7 @@ RegisterOperators reg(
          aliasAnalysisFromSchema()),
      Operator(
          "aten::wait(Future(t) self) -> t",
-         [](Stack* stack) {
+         [](Stack& stack) {
            TORCH_CHECK(
                false, "wait is implemented directly in the interpreter");
          },
@@ -332,7 +332,7 @@ RegisterOperators reg(
 RegisterOperators logging_operators(
     {Operator(
          "prim::AddStatValue(str key, int val) -> ()",
-         [](Stack* stack) {
+         [](Stack& stack) {
            auto val = pop(stack).toInt();
            auto key = pop(stack).toString();
 
@@ -353,7 +353,7 @@ RegisterOperators logging_operators(
          aliasAnalysisFromSchema()),
      Operator(
          "prim::TimePoint() -> int",
-         [](Stack* stack) {
+         [](Stack& stack) {
            auto schema = parseSchema("prim::TimePoint() -> int");
            Node* node = nullptr;
            // TODO: remove this custom tracing code once the custom op bugfix
@@ -372,7 +372,7 @@ RegisterOperators logging_operators(
          },
          aliasAnalysisFromSchema())});
 
-C10_UNUSED void hashValue(Stack* stack) {
+C10_UNUSED void hashValue(Stack& stack) {
   auto value = pop(stack);
   push(stack, value.hash());
 }
@@ -453,7 +453,7 @@ bool isSortableListOfObjectsOrTuples(
 }
 
 template <bool has_reverse_arg, bool copy_return_list>
-void sort_op(Stack* stack) {
+void sort_op(Stack& stack) {
   bool reverse = has_reverse_arg ? pop(stack).toBool() : false;
   auto g_list = pop(stack).toList();
 
@@ -697,7 +697,7 @@ at::Tensor interpolate(
       ") ");
 }
 
-void interpolate_op(Stack* stack) {
+void interpolate_op(Stack& stack) {
   at::Tensor input;
   IValue size;
   IValue scale_factors;
@@ -743,7 +743,7 @@ IValue convert_scale_factor_to_double(const IValue& int_ivalue) {
   return scale_factor_double;
 }
 
-void upsample_nearest_op(Stack* stack) {
+void upsample_nearest_op(Stack& stack) {
   at::Tensor input;
   IValue size;
   IValue scale_factor_int;
@@ -754,7 +754,7 @@ void upsample_nearest_op(Stack* stack) {
   push(stack, std::move(res));
 }
 
-void upsample_op(Stack* stack) {
+void upsample_op(Stack& stack) {
   at::Tensor input;
   IValue size;
   IValue scale_factor_int;
@@ -772,7 +772,7 @@ void upsample_op(Stack* stack) {
   push(stack, std::move(res));
 }
 
-void upsample_bilinear_op(Stack* stack) {
+void upsample_bilinear_op(Stack& stack) {
   at::Tensor input;
   IValue size;
   IValue scale_factor_int;
diff --git a/torch/csrc/jit/runtime/register_special_ops.cpp b/torch/csrc/jit/runtime/register_special_ops.cpp
index ace87f20b9c35..015d607044ddb 100644
--- a/torch/csrc/jit/runtime/register_special_ops.cpp
+++ b/torch/csrc/jit/runtime/register_special_ops.cpp
@@ -184,7 +184,7 @@ void recursiveStore(
 }
 
 template <bool if_set_requires_grad>
-void createTensorFromList(Stack* stack) {
+void createTensorFromList(Stack& stack) {
   // torch.tensor has a fourth requires_grad arg but torch.as_tensor not, so
   // we use the template arg to distinguish between these two cases
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -246,7 +246,7 @@ RegisterOperators reg({
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA(
             "aten::split(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           RECORD_FUNCTION("split_with_sizes", last(stack, 3));
 
           auto result = at::split_with_sizes(
@@ -264,7 +264,7 @@ RegisterOperators reg({
           "aten::tensor." #operator_type "(" #operator_type                     \
           " t, *, ScalarType? dtype=None, Device? device=None"                  \
           ", bool requires_grad=False) -> Tensor"),                             \
-      [](Stack* stack) {                                                        \
+      [](Stack& stack) {                                                        \
         c_type scalar_val;                                                      \
         IValue dtype;                                                           \
         IValue device;                                                          \
@@ -280,7 +280,7 @@ RegisterOperators reg({
           TORCH_SELECTIVE_SCHEMA(                                               \
               "aten::as_tensor." #operator_type "(" #operator_type              \
               " t, *, ScalarType? dtype=None, Device? device=None) -> Tensor"), \
-          [](Stack* stack) {                                                    \
+          [](Stack& stack) {                                                    \
             c_type scalar_val;                                                  \
             IValue dtype;                                                       \
             IValue device;                                                      \
@@ -319,7 +319,7 @@ RegisterOperators reg({
     // tensor_new.cpp
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA("aten::_infer_size(int[] a, int[] b) -> int[]"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto a = pop(stack);
           auto b = pop(stack);
           push(stack, at::infer_size(a.toIntVector(), b.toIntVector()));
@@ -328,7 +328,7 @@ RegisterOperators reg({
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA(
             "aten::_no_grad_embedding_renorm_(Tensor weight, Tensor input, float max_norm, float norm_type) -> Tensor"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           at::Tensor weight;
           at::Tensor input;
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -353,7 +353,7 @@ RegisterOperators reg({
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA(
             "aten::as_tensor(Tensor(a) data, *, ScalarType? dtype=None, Device? device=None) -> Tensor(a|b)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           auto device = pop(stack).toOptional<c10::Device>();
           auto dtype = pop(stack).toOptional<at::ScalarType>();
           at::Tensor data = pop(stack).toTensor();
@@ -377,24 +377,24 @@ RegisterOperators reg({
         TORCH_SELECTIVE_SCHEMA(
             "aten::_pack_sequence(Tensor output, Tensor batch_sizes, Tensor? sorted_indices, "
             "Tensor? unsorted_indices) -> (Tensor, Tensor, Tensor?, Tensor?)"),
-        [](Stack* stack) {},
+        [](Stack& stack) {},
         aliasAnalysisFromSchema()),
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA("aten::_get_tracing_state() -> bool"),
-        [](Stack* stack) { push(stack, false); },
+        [](Stack& stack) { push(stack, false); },
         aliasAnalysisFromSchema()),
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA("aten::is_scripting() -> bool"),
-        [](Stack* stack) { push(stack, true); },
+        [](Stack& stack) { push(stack, true); },
         aliasAnalysisFromSchema()),
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA("aten::has_torch_function(...) -> bool"),
-        [](Stack* stack) { push(stack, false); },
+        [](Stack& stack) { push(stack, false); },
         aliasAnalysisFromSchema()),
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA(
             "aten::_no_grad_uniform_(Tensor(a!) tensor, float a, float b) -> Tensor(a!)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // TODO: remove when script supports setting grad mode
           torch::NoGradGuard no_grad;
 
@@ -410,7 +410,7 @@ RegisterOperators reg({
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA(
             "aten::_no_grad_normal_(Tensor(a!) tensor, float mean, float std) -> Tensor(a!)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // TODO: remove when script supports setting grad mode
           torch::NoGradGuard no_grad;
 
@@ -426,7 +426,7 @@ RegisterOperators reg({
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA(
             "aten::_no_grad_fill_(Tensor(a!) tensor, float val) -> Tensor(a!)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // TODO: remove when script supports setting grad mode
           torch::NoGradGuard no_grad;
 
@@ -440,7 +440,7 @@ RegisterOperators reg({
     OperatorGenerator(
         TORCH_SELECTIVE_SCHEMA(
             "aten::_no_grad_zero_(Tensor(a!) tensor) -> Tensor(a!)"),
-        [](Stack* stack) {
+        [](Stack& stack) {
           // TODO: remove when script supports setting grad mode
           torch::NoGradGuard no_grad;
 
@@ -451,11 +451,11 @@ RegisterOperators reg({
         aliasAnalysisFromSchema()),
     Operator(
         "aten::is_grad_enabled() -> bool",
-        [](Stack* stack) { push(stack, torch::GradMode::is_enabled()); },
+        [](Stack& stack) { push(stack, torch::GradMode::is_enabled()); },
         aliasAnalysisConservative()),
     Operator(
         "aten::set_grad_enabled(bool val) -> ()",
-        [](Stack* stack) { torch::GradMode::set_enabled(pop(stack).toBool()); },
+        [](Stack& stack) { torch::GradMode::set_enabled(pop(stack).toBool()); },
         aliasAnalysisConservative()),
 });
 } // namespace
diff --git a/torch/csrc/jit/runtime/static/fusion.cpp b/torch/csrc/jit/runtime/static/fusion.cpp
index b08b59fc6890a..0b41b8e48a345 100644
--- a/torch/csrc/jit/runtime/static/fusion.cpp
+++ b/torch/csrc/jit/runtime/static/fusion.cpp
@@ -39,7 +39,7 @@ Operation createStaticSubgraphRuntime(const Node* node) {
   auto g = node->g(attr::Subgraph);
   auto module = std::make_shared<torch::jit::StaticModule>(g);
   auto num_inputs = module->num_inputs();
-  return [module, num_inputs](Stack* stack) {
+  return [module, num_inputs](Stack& stack) {
     RECORD_FUNCTION("Static Runtime", std::vector<c10::IValue>());
     auto inps = torch::jit::last(stack, num_inputs);
     // TODO maybe avoid call to vec
@@ -48,10 +48,10 @@ Operation createStaticSubgraphRuntime(const Node* node) {
 
     if (module->num_outputs() > 1) {
       for (auto& o : outputs.toTuple()->elements()) {
-        push_one(*stack, std::move(o));
+        push_one(stack, std::move(o));
       }
     } else {
-      push_one(*stack, std::move(outputs));
+      push_one(stack, std::move(outputs));
     }
     return 0;
   };
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index ee8e9038b1c48..e22447819ea67 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -1440,7 +1440,7 @@ void ProcessedNode::run() {
     }
 
     DCHECK(op_);
-    op_->operator()(&stack);
+    op_->operator()(stack);
 
     DCHECK_EQ(stack.size(), node_->outputs().size());
     for (const auto i : c10::irange(node_->outputs().size())) {

From 5b0dfd0f8aff50e2fce8f2f1fe6f2ef0594a9e25 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 30 Aug 2021 12:14:09 -0700
Subject: [PATCH 350/530] Fix bad use of channels last kernel in sync batch
 norm backward (#64100)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/64039

There are two distinct problems here.
1. If `grad_output` is channels last but not input, then input would be read as-if it were channels last. So reading the wrong values.
2. `use_channels_last_kernels` doesn't guarunte that `suggest_memory_format` will actually return channels last, so use `empty_like` instead so the strides always match.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64100

Reviewed By: mruberry

Differential Revision: D30622127

Pulled By: ngimel

fbshipit-source-id: e28cc57215596817f1432fcdd6c49d69acfedcf2
---
 aten/src/ATen/native/cuda/Normalization.cu  |  4 +-
 aten/src/ATen/native/cuda/Normalization.cuh |  6 ++-
 test/test_nn.py                             | 42 +++++++++++++++++++++
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu
index 0238b1b682877..1d4d1cc4bda4e 100644
--- a/aten/src/ATen/native/cuda/Normalization.cu
+++ b/aten/src/ATen/native/cuda/Normalization.cu
@@ -648,7 +648,9 @@ Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, c
   c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
 
-  if (at::cuda::detail::canUse32BitIndexMath(self) && batch_norm_use_channels_last_kernels(self)){
+  if (at::cuda::detail::canUse32BitIndexMath(self) &&
+      batch_norm_use_channels_last_kernels(self) &&
+      batch_norm_use_channels_last_kernels(input))  {
     return batch_norm_backward_elemt_channels_last_cuda_template(self, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count);
   }
 
diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh
index af074f5d2c6fd..6daa2b0858044 100644
--- a/aten/src/ATen/native/cuda/Normalization.cuh
+++ b/aten/src/ATen/native/cuda/Normalization.cuh
@@ -1649,7 +1649,8 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
   const auto stride = input.sizes()[1];
   const auto reduction_size = input.numel() / stride;
 
-  at::Tensor grad_input = at::empty_like(input, input.suggest_memory_format());
+  // Input is guarunteed to be channels-last compatible
+  at::Tensor grad_input = at::empty_like(input);
 
   dim3 block;
   dim3 grid;
@@ -1716,7 +1717,8 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
   const auto reduction_size = input.numel() / stride;
   auto norm_fct = 1.0 / reduction_size;
 
-  at::Tensor grad_input = at::empty_like(input, input.suggest_memory_format());
+  // Input is guarunteed to be channels-last compatible
+  at::Tensor grad_input = at::empty_like(input);
 
   dim3 block;
   dim3 grid;
diff --git a/test/test_nn.py b/test/test_nn.py
index bb4dd59be5271..c9815dbf2ee0e 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -11192,6 +11192,48 @@ def test_convert_sync_batchnorm(self):
                 self.assertEqual(layer.state_dict()[key].device, converted_layer.state_dict()[key].device)
                 self.assertEqual(layer.state_dict()[key], converted_layer.state_dict()[key])
 
+    @unittest.skipIf(not TEST_CUDA, "CUDA not available")
+    def test_sync_batchnorm_backward_elemt(self):
+        device = 'cuda'
+        saved_input = torch.rand(2, 3, 2, 1, device=device)
+        grad_output = torch.rand(2, 3, 2, 1, device=device)
+        mean = torch.rand(3, device=device)
+        invstd = torch.rand(3, device=device)
+        weight = torch.rand(3, device=device)
+        sum_dy = torch.rand(3, device=device)
+        sum_dy_xmu = torch.rand(3, device=device)
+        count_tensor = torch.tensor([5, 5, 5], dtype=torch.int32, device=device)
+
+        gI_contiguous = torch.batch_norm_backward_elemt(
+            grad_output,
+            saved_input,
+            mean,
+            invstd,
+            weight,
+            sum_dy,
+            sum_dy_xmu,
+            count_tensor
+        )
+
+        # Test batch_norm_backward_elemt gives the same answer for all
+        # combinations of contiguous as channels_last input
+        for a, b in [
+                (torch.channels_last, torch.contiguous_format),
+                (torch.contiguous_format, torch.channels_last),
+                (torch.channels_last, torch.channels_last),
+        ]:
+            gI_actual = torch.batch_norm_backward_elemt(
+                grad_output.contiguous(memory_format=a),
+                saved_input.contiguous(memory_format=b),
+                mean,
+                invstd,
+                weight,
+                sum_dy,
+                sum_dy_xmu,
+                count_tensor
+            )
+            self.assertEqual(gI_actual, gI_contiguous)
+
     @unittest.skipIf(not TEST_CUDA, "CUDA not available")
     def test_sync_batchnorm_accuracy_cuda(self):
         # The target of this test is to test the functionality and accuracy of

From d37636901ed1c65c1f8b68e36e37e59eb503c554 Mon Sep 17 00:00:00 2001
From: Kushashwa Ravi Shrimali <kushashwaravishrimali@gmail.com>
Date: Mon, 30 Aug 2021 12:16:23 -0700
Subject: [PATCH 351/530] [Doc] `make_tensor` to `torch.testing` module
 (#63925)

Summary:
This PR aims to add `make_tensor` to the `torch.testing` module in PyTorch docs.

TODOs:

* [x] Add examples

cc: pmeier mruberry brianjo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63925

Reviewed By: ngimel

Differential Revision: D30633487

Pulled By: mruberry

fbshipit-source-id: 8e5a1f880c6ece5925b4039fee8122bd739538af
---
 docs/source/testing.rst                       |   1 +
 test/test_autograd.py                         |   3 +-
 test/test_binary_ufuncs.py                    |   4 +-
 test/test_buffer_protocol.py                  |   7 +-
 test/test_foreach.py                          |   4 +-
 test/test_indexing.py                         |   3 +-
 test/test_jit.py                              |   3 +-
 test/test_linalg.py                           |   4 +-
 test/test_ops.py                              |   4 +-
 test/test_reductions.py                       |   4 +-
 test/test_shape_ops.py                        |   3 +-
 test/test_sort_and_select.py                  |   4 +-
 test/test_sparse.py                           |   3 +-
 test/test_sparse_csr.py                       |   4 +-
 test/test_tensor_creation_ops.py              |   3 +-
 test/test_testing.py                          |   3 +-
 test/test_torch.py                            |   3 +-
 test/test_unary_ufuncs.py                     |   4 +-
 test/test_view_ops.py                         |   3 +-
 torch/testing/__init__.py                     |   1 +
 torch/testing/_creation.py                    | 155 ++++++++++++++++++
 .../_internal/common_methods_invocations.py   |   4 +-
 torch/testing/_internal/common_modules.py     |  26 +--
 torch/testing/_internal/common_utils.py       | 100 +----------
 24 files changed, 213 insertions(+), 140 deletions(-)
 create mode 100644 torch/testing/_creation.py

diff --git a/docs/source/testing.rst b/docs/source/testing.rst
index 981a636c53390..9f1e2c3c53f89 100644
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -9,3 +9,4 @@ torch.testing
 .. automodule:: torch.testing
 
 .. autofunction:: assert_close
+.. autofunction:: make_tensor
diff --git a/test/test_autograd.py b/test/test_autograd.py
index 4d416459c2af4..364d48807b737 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -24,13 +24,14 @@
 from torch.autograd.profiler_util import (_format_time, EventList, FunctionEvent, FunctionEventAvg)
 import torch.autograd.functional as autogradF
 from torch.utils.checkpoint import checkpoint
+from torch.testing import make_tensor
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_utils import (TestCase, run_tests, skipIfNoLapack,
                                                   suppress_warnings, slowTest,
                                                   load_tests,
                                                   IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck,
                                                   TEST_WITH_ROCM, disable_gc,
-                                                  gradcheck, gradgradcheck, make_tensor)
+                                                  gradcheck, gradgradcheck)
 from torch.autograd import Variable, Function, detect_anomaly, kineto_available
 from torch.autograd.function import InplaceFunction
 import torch.autograd.forward_ad as fwAD
diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py
index 4995e0dfc6cc7..1e9e804ab86d1 100644
--- a/test/test_binary_ufuncs.py
+++ b/test/test_binary_ufuncs.py
@@ -13,12 +13,12 @@
 from torch._six import inf, nan
 from torch.testing._internal.common_utils import (
     TestCase, iter_indices, TEST_WITH_ASAN, run_tests,
-    torch_to_numpy_dtype_dict, make_tensor, TEST_SCIPY, set_default_dtype)
+    torch_to_numpy_dtype_dict, TEST_SCIPY, set_default_dtype)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA,
     dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA,
     skipCUDAIfRocm, skipIf, ops)
-from torch.testing import all_types_and_complex_and, integral_types_and
+from torch.testing import all_types_and_complex_and, integral_types_and, make_tensor
 from torch.testing._internal.common_methods_invocations import binary_ufuncs
 
 if TEST_SCIPY:
diff --git a/test/test_buffer_protocol.py b/test/test_buffer_protocol.py
index c797b913f033c..619386e6d5665 100644
--- a/test/test_buffer_protocol.py
+++ b/test/test_buffer_protocol.py
@@ -1,4 +1,5 @@
 import torch.testing._internal.common_utils as common
+from torch.testing import make_tensor
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
     dtypes
@@ -23,7 +24,7 @@ def _run_test(self, shape, dtype, count=-1, first=0, offset=None, **kwargs):
         if offset is None:
             offset = first * get_dtype_size(dtype)
 
-        numpy_original = common.make_tensor(shape, torch.device("cpu"), dtype).numpy()
+        numpy_original = make_tensor(shape, torch.device("cpu"), dtype).numpy()
         original = memoryview(numpy_original)
         # First call PyTorch's version in case of errors.
         # If this call exits successfully, the NumPy version must also do so.
@@ -125,7 +126,7 @@ def test_invalid_positional_args(self, device, dtype):
 
     @dtypes(*common.torch_to_numpy_dtype_dict.keys())
     def test_shared_buffer(self, device, dtype):
-        x = common.make_tensor((1,), device, dtype)
+        x = make_tensor((1,), device, dtype)
         # Modify the whole tensor
         arr, tensor = self._run_test(SHAPE, dtype)
         tensor[:] = x
@@ -158,7 +159,7 @@ def test_not_a_buffer(self, device, dtype):
 
     @dtypes(*common.torch_to_numpy_dtype_dict.keys())
     def test_non_writable_buffer(self, device, dtype):
-        numpy_arr = common.make_tensor((1,), device, dtype).numpy()
+        numpy_arr = make_tensor((1,), device, dtype).numpy()
         byte_arr = numpy_arr.tobytes()
         with self.assertWarnsOnceRegex(UserWarning,
                                        r"The given buffer is not writable."):
diff --git a/test/test_foreach.py b/test/test_foreach.py
index ce9b0d7ee55e3..123ef35bb7093 100644
--- a/test/test_foreach.py
+++ b/test/test_foreach.py
@@ -4,11 +4,13 @@
 import re
 import torch
 import unittest
+
+from torch.testing import make_tensor
 from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ROCM, TEST_WITH_SLOW
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, dtypes, onlyCUDA, skipCUDAIfRocm, skipMeta, ops)
 from torch.testing._internal.common_methods_invocations import \
-    (foreach_unary_op_db, foreach_binary_op_db, foreach_pointwise_op_db, foreach_minmax_op_db, make_tensor)
+    (foreach_unary_op_db, foreach_binary_op_db, foreach_pointwise_op_db, foreach_minmax_op_db)
 
 # Includes some values such that N * N won't be a multiple of 4,
 # which should ensure we test the vectorized and non-vectorized
diff --git a/test/test_indexing.py b/test/test_indexing.py
index 61580910f2cfb..8b8a2ead9ed72 100644
--- a/test/test_indexing.py
+++ b/test/test_indexing.py
@@ -8,7 +8,8 @@
 
 import numpy as np
 
-from torch.testing._internal.common_utils import TestCase, run_tests, make_tensor
+from torch.testing import make_tensor
+from torch.testing._internal.common_utils import TestCase, run_tests
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, dtypes, dtypesIfCPU, dtypesIfCUDA,
     onlyOnCPUAndCUDA)
diff --git a/test/test_jit.py b/test/test_jit.py
index 2595411c01848..d1a170da6f750 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -69,8 +69,7 @@
 from torch.autograd import Variable
 from torch.jit.annotations import BroadcastingList2, BroadcastingList3, Any  # noqa: F401
 from torch.nn.utils.rnn import PackedSequence
-from torch.testing import FileCheck
-from torch.testing._internal.common_utils import make_tensor
+from torch.testing import FileCheck, make_tensor
 import torch.autograd.profiler
 import torch.cuda
 import torch.jit
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 8ba3373d38ce4..f7ce39272bf86 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -14,14 +14,14 @@
 
 from torch.testing._internal.common_utils import \
     (TestCase, run_tests, TEST_SCIPY, IS_MACOS, IS_WINDOWS, slowTest,
-     TEST_WITH_ASAN, make_tensor, TEST_WITH_ROCM, IS_FBCODE, IS_REMOTE_GPU,
+     TEST_WITH_ASAN, TEST_WITH_ROCM, IS_FBCODE, IS_REMOTE_GPU,
      iter_indices, gradcheck, gradgradcheck)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, dtypes,
      onlyCPU, skipCUDAIf, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride,
      skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfRocm, onlyOnCPUAndCUDA, dtypesIfCUDA,
      onlyCUDA, skipCUDAVersionIn, skipMeta, skipCUDAIfNoCusolver)
-from torch.testing import floating_and_complex_types, floating_types, all_types
+from torch.testing import floating_and_complex_types, floating_types, all_types, make_tensor
 from torch.testing._internal.common_cuda import SM53OrLater, tf32_on_and_off, CUDA11OrLater, CUDA9
 from torch.distributions.binomial import Binomial
 
diff --git a/test/test_ops.py b/test/test_ops.py
index a6baf8dbe699a..27aee72f00846 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -5,9 +5,9 @@
 import torch
 
 from torch.testing import \
-    (FileCheck, floating_and_complex_types_and, get_all_dtypes)
+    (FileCheck, floating_and_complex_types_and, get_all_dtypes, make_tensor)
 from torch.testing._internal.common_utils import \
-    (TestCase, is_iterable_of_tensors, run_tests, IS_SANDCASTLE, clone_input_helper, make_tensor,
+    (TestCase, is_iterable_of_tensors, run_tests, IS_SANDCASTLE, clone_input_helper,
      gradcheck, gradgradcheck, IS_IN_CI, suppress_warnings)
 from torch.testing._internal.common_methods_invocations import \
     (op_db, _NOTHING, UnaryUfuncInfo, ReductionOpInfo, SpectralFuncInfo)
diff --git a/test/test_reductions.py b/test/test_reductions.py
index eed7f732051cd..ca3042b66cf91 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -10,10 +10,10 @@
 
 from torch._six import inf, nan
 from torch.testing import (
-    integral_types_and, floating_and_complex_types_and, get_all_dtypes)
+    integral_types_and, floating_and_complex_types_and, get_all_dtypes, make_tensor)
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict,
-    IS_WINDOWS, make_tensor)
+    IS_WINDOWS)
 from torch.testing._internal.common_device_type import (
     OpDTypes, instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU,
     onlyOnCPUAndCUDA, onlyCUDA, largeTensorTest, ops, precisionOverride)
diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py
index 916adee666307..cb4ec3c18f82a 100644
--- a/test/test_shape_ops.py
+++ b/test/test_shape_ops.py
@@ -7,8 +7,9 @@
 import warnings
 
 from torch._six import nan
+from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
-    TestCase, run_tests, make_tensor, torch_to_numpy_dtype_dict)
+    TestCase, run_tests, torch_to_numpy_dtype_dict)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCPU, onlyCUDA, dtypes, onlyOnCPUAndCUDA,
     dtypesIfCPU, dtypesIfCUDA, largeTensorTest)
diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py
index 564258aa77b51..e562e389a3fc8 100644
--- a/test/test_sort_and_select.py
+++ b/test/test_sort_and_select.py
@@ -5,9 +5,9 @@
 from torch._six import nan
 from itertools import permutations, product
 
-from torch.testing import all_types, all_types_and
+from torch.testing import all_types, all_types_and, make_tensor
 from torch.testing._internal.common_utils import \
-    (TEST_WITH_ROCM, TestCase, run_tests, make_tensor, slowTest)
+    (TEST_WITH_ROCM, TestCase, run_tests, slowTest)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, dtypes, onlyOnCPUAndCUDA,
      skipCUDAIfRocm, onlyCUDA, dtypesIfCUDA, dtypesIfCPU, onlyCPU, largeTensorTest)
diff --git a/test/test_sparse.py b/test/test_sparse.py
index abe5e93889498..333f29f13138e 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -5,8 +5,9 @@
 import random
 from collections import defaultdict
 import unittest
+from torch.testing import make_tensor
 from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \
-    do_test_empty_full, load_tests, TEST_NUMPY, TEST_SCIPY, IS_WINDOWS, gradcheck, coalescedonoff, make_tensor, \
+    do_test_empty_full, load_tests, TEST_NUMPY, TEST_SCIPY, IS_WINDOWS, gradcheck, coalescedonoff, \
     DeterministicGuard
 from torch.testing._internal.common_cuda import TEST_CUDA, _get_torch_cuda_version
 from numbers import Number
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index b9f48855e46db..fbb2b30e46304 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -3,8 +3,10 @@
 import unittest
 import random
 import itertools
+
+from torch.testing import make_tensor
 from torch.testing._internal.common_utils import \
-    (IS_MACOS, IS_WINDOWS, TestCase, run_tests, load_tests, coalescedonoff, make_tensor)
+    (IS_MACOS, IS_WINDOWS, TestCase, run_tests, load_tests, coalescedonoff)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, dtypes, onlyCPU, onlyCUDA)
 
diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
index 192e03f61cac0..9ef374248984e 100644
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@@ -8,9 +8,10 @@
 from itertools import product, combinations, combinations_with_replacement, permutations
 import random
 
+from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, do_test_empty_full, TEST_WITH_ROCM, suppress_warnings,
-    torch_to_numpy_dtype_dict, slowTest, make_tensor, TEST_SCIPY, IS_MACOS, IS_PPC,
+    torch_to_numpy_dtype_dict, slowTest, TEST_SCIPY, IS_MACOS, IS_PPC,
     IS_WINDOWS)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA,
diff --git a/test/test_testing.py b/test/test_testing.py
index 7e67569bb4799..f38183d4a3769 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -10,8 +10,9 @@
 
 import torch
 
+from torch.testing import make_tensor
 from torch.testing._internal.common_utils import \
-    (IS_FBCODE, IS_SANDCASTLE, IS_WINDOWS, TestCase, make_tensor, run_tests, skipIfRocm, slowTest)
+    (IS_FBCODE, IS_SANDCASTLE, IS_WINDOWS, TestCase, run_tests, skipIfRocm, slowTest)
 from torch.testing._internal.common_device_type import \
     (PYTORCH_TESTING_DEVICE_EXCEPT_FOR_KEY, PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY, dtypes,
      get_device_type_test_bases, instantiate_device_type_tests, onlyCUDA, onlyOnCPUAndCUDA,
diff --git a/test/test_torch.py b/test/test_torch.py
index 15e36c83654db..c50b7ca99e883 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -27,13 +27,14 @@
 from itertools import product, combinations, permutations
 from functools import partial
 from torch import multiprocessing as mp
+from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
     TestCase, TEST_WITH_ROCM, run_tests,
     IS_WINDOWS, IS_FILESYSTEM_UTF8_ENCODING, NO_MULTIPROCESSING_SPAWN,
     do_test_dtypes, IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, slowTest,
     skipCUDAMemoryLeakCheckIf, BytesIOContext, noarchTest,
     skipIfRocm, skipIfNoSciPy, TemporaryFileName, TemporaryDirectoryName,
-    wrapDeterministicFlagAPITest, DeterministicGuard, CudaSyncGuard, make_tensor)
+    wrapDeterministicFlagAPITest, DeterministicGuard, CudaSyncGuard)
 from multiprocessing.reduction import ForkingPickler
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py
index e5b8c4a66093b..22f61519a2853 100644
--- a/test/test_unary_ufuncs.py
+++ b/test/test_unary_ufuncs.py
@@ -11,7 +11,7 @@
 from torch._six import inf, nan
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict,
-    suppress_warnings, make_tensor, TEST_SCIPY, slowTest, skipIfNoSciPy, IS_WINDOWS)
+    suppress_warnings, TEST_SCIPY, slowTest, skipIfNoSciPy, IS_WINDOWS)
 from torch.testing._internal.common_methods_invocations import (
     unary_ufuncs, _NOTHING)
 from torch.testing._internal.common_device_type import (
@@ -19,7 +19,7 @@
     onlyCUDA, dtypesIfCUDA, precisionOverride, skipCUDAIfRocm, dtypesIfCPU,
     OpDTypes)
 from torch.testing import (
-    floating_types_and, all_types_and_complex_and, floating_and_complex_types_and)
+    floating_types_and, all_types_and_complex_and, floating_and_complex_types_and, make_tensor)
 
 if TEST_SCIPY:
     import scipy
diff --git a/test/test_view_ops.py b/test/test_view_ops.py
index 306c6cb411f3f..7bb6906ef1cc7 100644
--- a/test/test_view_ops.py
+++ b/test/test_view_ops.py
@@ -6,8 +6,9 @@
 from functools import partial
 import random
 
+from torch.testing import make_tensor
 from torch.testing._internal.common_utils import \
-    (TestCase, run_tests, suppress_warnings, make_tensor)
+    (TestCase, run_tests, suppress_warnings)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, onlyCPU, dtypes, onlyOnCPUAndCUDA)
 
diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py
index 526d02c71e322..7ea18a4f9cea2 100644
--- a/torch/testing/__init__.py
+++ b/torch/testing/__init__.py
@@ -1,4 +1,5 @@
 from ._core import *  # noqa: F403
 from ._asserts import *  # noqa: F403
+from ._creation import *  # noqa: F403
 from ._check_kernel_launches import *  # noqa: F403
 from ._deprecated import *  # noqa: F403
diff --git a/torch/testing/_creation.py b/torch/testing/_creation.py
new file mode 100644
index 0000000000000..4eb10d1d5d26b
--- /dev/null
+++ b/torch/testing/_creation.py
@@ -0,0 +1,155 @@
+"""
+This module contains tensor creation utilities.
+"""
+
+import torch
+from typing import Optional, List, Tuple, Union, cast
+import math
+
+__all__ = [
+    "make_tensor",
+]
+
+def make_tensor(
+    shape: Union[torch.Size, List[int], Tuple[int, ...]],
+    device: Union[str, torch.device],
+    dtype: torch.dtype,
+    *,
+    low: Optional[float] = None,
+    high: Optional[float] = None,
+    requires_grad: bool = False,
+    noncontiguous: bool = False,
+    exclude_zero: bool = False
+) -> torch.Tensor:
+    r"""Creates a tensor with the given :attr:`shape`, :attr:`device`, and :attr:`dtype`, and filled with
+    values uniformly drawn from ``[low, high)``.
+
+    If :attr:`low` or :attr:`high` are specified and are outside the range of the :attr:`dtype`'s representable
+    finite values then they are clamped to the lowest or highest representable finite value, respectively.
+    If ``None``, then the following table describes the default values for :attr:`low` and :attr:`high`,
+    which depend on :attr:`dtype`.
+
+    +---------------------------+------------+----------+
+    | ``dtype``                 | ``low``    | ``high`` |
+    +===========================+============+==========+
+    | boolean type              | ``0``      | ``2``    |
+    +---------------------------+------------+----------+
+    | unsigned integral type    | ``0``      | ``10``   |
+    +---------------------------+------------+----------+
+    | signed integral types     | ``-9``     | ``10``   |
+    +---------------------------+------------+----------+
+    | floating types            | ``-9``     | ``9``    |
+    +---------------------------+------------+----------+
+    | complex types             | ``-9``     | ``9``    |
+    +---------------------------+------------+----------+
+
+    Args:
+        shape (Tuple[int, ...]): A sequence of integers defining the shape of the output tensor.
+        device (Union[str, torch.device]): The device of the returned tensor.
+        dtype (:class:`torch.dtype`): The data type of the returned tensor.
+        low (Optional[Number]): Sets the lower limit (inclusive) of the given range. If a number is provided it is
+            clamped to the least representable finite value of the given dtype. When ``None`` (default),
+            this value is determined based on the :attr:`dtype` (see the table above). Default: ``None``.
+        high (Optional[Number]): Sets the upper limit (exclusive) of the given range. If a number is provided it is
+            clamped to the greatest representable finite value of the given dtype. When ``None`` (default) this value
+            is determined based on the :attr:`dtype` (see the table above). Default: ``None``.
+        requires_grad (Optional[bool]): If autograd should record operations on the returned tensor. Default: ``False``.
+        noncontiguous (Optional[bool]): If `True`, the returned tensor will be noncontiguous. This argument is
+            ignored if the constructed tensor has fewer than two elements.
+        exclude_zero (Optional[bool]): If ``True`` then zeros are replaced with the dtype's small positive value
+            depending on the :attr:`dtype`. For bool and integer types zero is replaced with one. For floating
+            point types it is replaced with the dtype's smallest positive normal number (the "tiny" value of the
+            :attr:`dtype`'s :func:`~torch.finfo` object), and for complex types it is replaced with a complex number
+            whose real and imaginary parts are both the smallest positive normal number representable by the complex
+            type. Default ``False``.
+
+    Raises:
+        ValueError: If ``low > high``.
+        ValueError: If either :attr:`low` or :attr:`high` is ``nan``.
+        TypeError: If :attr:`dtype` isn't supported by this function.
+
+    Examples:
+        >>> from torch.testing import make_tensor
+        >>> # Creates a float tensor with values in [-1, 1)
+        >>> make_tensor((3,), device='cpu', dtype=torch.float32, low=-1, high=1)
+        tensor([ 0.1205, 0.2282, -0.6380])
+        >>> # Creates a bool tensor on CUDA
+        >>> make_tensor((2, 2), device='cuda', dtype=torch.bool)
+        tensor([[False, False],
+                [False, True]], device='cuda:0')
+    """
+    def _modify_low_high(low, high, lowest, highest, default_low, default_high, dtype):
+        """
+        Modifies (and raises ValueError when appropriate) low and high values given by the user (input_low, input_high) if required.
+        """
+        def clamp(a, l, h):
+            return min(max(a, l), h)
+
+        low = low if low is not None else default_low
+        high = high if high is not None else default_high
+
+        # Checks for error cases
+        if low != low or high != high:
+            raise ValueError("make_tensor: one of low or high was NaN!")
+        if low > high:
+            raise ValueError("make_tensor: low must be weakly less than high!")
+
+        low = clamp(low, lowest, highest)
+        high = clamp(high, lowest, highest)
+
+        if dtype in [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]:
+            return math.floor(low), math.ceil(high)
+
+        return low, high
+
+    _integral_types = [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]
+    _floating_types = [torch.float16, torch.bfloat16, torch.float32, torch.float64]
+    _complex_types = [torch.cfloat, torch.cdouble]
+
+    if dtype is torch.bool:
+        result = torch.randint(0, 2, shape, device=device, dtype=dtype)
+    elif dtype is torch.uint8:
+        ranges = (torch.iinfo(dtype).min, torch.iinfo(dtype).max)
+        low, high = cast(Tuple[int, int], _modify_low_high(low, high, ranges[0], ranges[1], 0, 10, dtype))
+        result = torch.randint(low, high, shape, device=device, dtype=dtype)
+    elif dtype in _integral_types:
+        ranges = (torch.iinfo(dtype).min, torch.iinfo(dtype).max)
+        low, high = _modify_low_high(low, high, ranges[0], ranges[1], -9, 10, dtype)
+        result = torch.randint(low, high, shape, device=device, dtype=dtype)  # type: ignore[call-overload]
+    elif dtype in _floating_types:
+        ranges_floats = (torch.finfo(dtype).min, torch.finfo(dtype).max)
+        low, high = _modify_low_high(low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype)
+        rand_val = torch.rand(shape, device=device, dtype=dtype)
+        result = high * rand_val + low * (1 - rand_val)
+    elif dtype in _complex_types:
+        float_dtype = torch.float if dtype is torch.cfloat else torch.double
+        ranges_floats = (torch.finfo(float_dtype).min, torch.finfo(float_dtype).max)
+        low, high = _modify_low_high(low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype)
+        real_rand_val = torch.rand(shape, device=device, dtype=float_dtype)
+        imag_rand_val = torch.rand(shape, device=device, dtype=float_dtype)
+        real = high * real_rand_val + low * (1 - real_rand_val)
+        imag = high * imag_rand_val + low * (1 - imag_rand_val)
+        result = torch.complex(real, imag)
+    else:
+        raise TypeError(f"The requested dtype '{dtype}' is not supported by torch.testing.make_tensor()."
+                        " To request support, file an issue at: https://github.com/pytorch/pytorch/issues")
+
+    if noncontiguous and result.numel() > 1:
+        result = torch.repeat_interleave(result, 2, dim=-1)
+        result = result[..., ::2]
+
+    if exclude_zero:
+        if dtype in _integral_types or dtype is torch.bool:
+            replace_with = torch.tensor(1, device=device, dtype=dtype)
+        elif dtype in _floating_types:
+            replace_with = torch.tensor(torch.finfo(dtype).tiny, device=device, dtype=dtype)
+        else:  # dtype in _complex_types:
+            float_dtype = torch.float if dtype is torch.cfloat else torch.double
+            float_eps = torch.tensor(torch.finfo(float_dtype).tiny, device=device, dtype=float_dtype)
+            replace_with = torch.complex(float_eps, float_eps)
+        result[result == 0] = replace_with
+
+    if dtype in _floating_types + _complex_types:
+        result.requires_grad = requires_grad
+
+    return result
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 2230808b5fd43..a3d61b477b4a4 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -19,7 +19,7 @@
     (make_non_contiguous, floating_types, floating_types_and, complex_types,
      floating_and_complex_types, floating_and_complex_types_and,
      all_types_and_complex_and, all_types_and, all_types_and_complex,
-     integral_types_and, all_types, double_types)
+     integral_types_and, all_types, double_types, make_tensor)
 from .._core import _dispatch_dtypes
 from torch.testing._internal.common_device_type import \
     (onlyOnCPUAndCUDA, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver,
@@ -32,7 +32,7 @@
      random_symmetric_pd_matrix, make_symmetric_matrices,
      make_symmetric_pd_matrices, random_square_matrix_of_rank,
      random_fullrank_matrix_distinct_singular_value,
-     TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, make_tensor, TEST_SCIPY,
+     TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, TEST_SCIPY,
      torch_to_numpy_dtype_dict, TEST_WITH_ASAN,
      GRADCHECK_NONDET_TOL,)
 import torch.testing._internal.opinfo_helper as opinfo_helper
diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py
index 99525a7b68756..6ef4de398a39e 100644
--- a/torch/testing/_internal/common_modules.py
+++ b/torch/testing/_internal/common_modules.py
@@ -2,11 +2,11 @@
 from copy import deepcopy
 from functools import wraps, partial
 from itertools import chain
-from torch.testing import floating_types
+from torch.testing import floating_types, make_tensor
 from torch.testing._internal.common_device_type import (
     _TestParametrizer, _dtype_test_suffix, _update_param_kwargs, skipIf)
 from torch.testing._internal.common_nn import nllloss_reference, get_reduction
-from torch.testing._internal.common_utils import make_tensor, freeze_rng_state
+from torch.testing._internal.common_utils import freeze_rng_state
 from types import ModuleType
 from typing import List, Tuple, Type, Set, Dict
 
@@ -225,7 +225,7 @@ def generate_regression_criterion_inputs(make_input):
     return [
         ModuleInput(
             constructor_input=FunctionInput(reduction=reduction),
-            forward_input=FunctionInput(make_input(size=(4, )), make_input(size=4,)),
+            forward_input=FunctionInput(make_input(shape=(4, )), make_input(shape=4,)),
             reference_fn=no_batch_dim_reference_criterion_fn,
             desc='no_batch_dim_{}'.format(reduction)
         ) for reduction in ['none', 'mean', 'sum']]
@@ -236,7 +236,7 @@ def module_inputs_torch_nn_AvgPool1d(module_info, device, dtype, requires_grad,
 
     return [
         ModuleInput(constructor_input=FunctionInput(kernel_size=2),
-                    forward_input=FunctionInput(make_input(size=(3, 6))),
+                    forward_input=FunctionInput(make_input(shape=(3, 6))),
                     desc='no_batch_dim',
                     reference_fn=no_batch_dim_reference_fn)]
 
@@ -246,13 +246,13 @@ def module_inputs_torch_nn_ELU(module_info, device, dtype, requires_grad, **kwar
 
     return [
         ModuleInput(constructor_input=FunctionInput(alpha=2.),
-                    forward_input=FunctionInput(make_input(size=(3, 2, 5))),
+                    forward_input=FunctionInput(make_input(shape=(3, 2, 5))),
                     reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2 * (i.exp() - 1))),
         ModuleInput(constructor_input=FunctionInput(alpha=2.),
-                    forward_input=FunctionInput(make_input(size=())),
+                    forward_input=FunctionInput(make_input(shape=())),
                     desc='scalar'),
         ModuleInput(constructor_input=FunctionInput(),
-                    forward_input=FunctionInput(make_input(size=(3,))),
+                    forward_input=FunctionInput(make_input(shape=(3,))),
                     desc='no_batch_dim',
                     reference_fn=no_batch_dim_reference_fn)]
 
@@ -262,14 +262,14 @@ def module_inputs_torch_nn_CELU(module_info, device, dtype, requires_grad, **kwa
 
     return [
         ModuleInput(constructor_input=FunctionInput(alpha=2.),
-                    forward_input=FunctionInput(make_input(size=(3, 2, 5))),
+                    forward_input=FunctionInput(make_input(shape=(3, 2, 5))),
                     reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2. * ((.5 * i).exp() - 1))),
         ModuleInput(constructor_input=FunctionInput(alpha=2.),
-                    forward_input=FunctionInput(make_input(size=())),
+                    forward_input=FunctionInput(make_input(shape=())),
                     reference_fn=lambda m, p, i: torch.where(i >= 0, i, 2 * (i.exp() - 1)),
                     desc='scalar'),
         ModuleInput(constructor_input=FunctionInput(alpha=2.),
-                    forward_input=FunctionInput(make_input(size=(3,))),
+                    forward_input=FunctionInput(make_input(shape=(3,))),
                     desc='no_batch_dim',
                     reference_fn=no_batch_dim_reference_fn)]
 
@@ -279,12 +279,12 @@ def module_inputs_torch_nn_L1Loss(module_info, device, dtype, requires_grad, **k
 
     return [
         ModuleInput(constructor_input=FunctionInput(),
-                    forward_input=FunctionInput(make_input(size=(2, 3, 4)),
-                                                make_input(size=(2, 3, 4))),
+                    forward_input=FunctionInput(make_input(shape=(2, 3, 4)),
+                                                make_input(shape=(2, 3, 4))),
                     reference_fn=lambda m, p, i, t: 1. / i.numel() * sum((a - b).abs().sum()
                                                                          for a, b in zip(i, t))),
         ModuleInput(constructor_input=FunctionInput(),
-                    forward_input=FunctionInput(make_input(size=()), make_input(size=())),
+                    forward_input=FunctionInput(make_input(shape=()), make_input(shape=())),
                     reference_fn=lambda m, p, i, t: 1. / i.numel() * (i - t).abs().sum(),
                     desc='scalar')] + generate_regression_criterion_inputs(make_input)
 
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index b8e5b097bd6c0..90f3551caae94 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -43,13 +43,13 @@
 
 import numpy as np
 
-from torch.testing import floating_types_and, integral_types, complex_types, get_all_dtypes
 import expecttest
 from .._core import \
     (_compare_tensors_internal, _compare_scalars_internal, _compare_return_type)
 
 import torch
 import torch.cuda
+from torch.testing import make_tensor
 from torch._utils_internal import get_writable_path
 from torch._six import string_classes
 from torch import Tensor
@@ -1939,103 +1939,7 @@ def f_retry(*args, **kwargs):
     return deco_retry
 
 
-# Methods for matrix and tensor generation
-
-def make_tensor(size, device: torch.device, dtype: torch.dtype, *, low=None, high=None,
-                requires_grad: bool = False, noncontiguous: bool = False,
-                exclude_zero: bool = False) -> torch.Tensor:
-    """ Creates a random tensor with the given size, device and dtype.
-
-        Default values for low and high:
-            * boolean type: low = 0, high = 2
-            * uint8 type: low = 0, high = 9
-            * floating and integral types: low = -9 and high = 9
-            * complex types, for each real and imaginary part: low = -9, high = 9
-        If low/high are specified and within dtype limits: low = low, high = high
-        If low/high are specified but exceed the limits: low = dtype_min, high = dtype_max
-        If low is -inf and/or high is inf: low = dtype_min, high = dtype_max
-        If low is inf or nan and/or high is -inf or nan: ValueError raised
-
-        If noncontiguous=True, a noncontiguous tensor with the given size will be returned unless the size
-        specifies a tensor with a 1 or 0 elements in which case the noncontiguous parameter is ignored because
-        it is not possible to create a noncontiguous Tensor with a single element.
-
-        If exclude_zero is passed with True (default is False), all the matching values (with zero) in
-        created tensor are replaced with a tiny (smallest positive representable number) value if floating type,
-        [`tiny` + `tiny`.j] if complex type and 1 if integer/boolean type.
-    """
-    def _modify_low_high(low, high, lowest, highest, default_low, default_high, dtype):
-        """
-        Modifies (and raises ValueError when appropriate) low and high values given by the user (input_low, input_high) if required.
-        """
-        def clamp(a, l, h):
-            return min(max(a, l), h)
-
-        low = low if low is not None else default_low
-        high = high if high is not None else default_high
-
-        # Checks for error cases
-        if low != low or high != high:
-            raise ValueError("make_tensor: one of low or high was NaN!")
-        if low > high:
-            raise ValueError("make_tensor: low must be weakly less than high!")
-
-        low = clamp(low, lowest, highest)
-        high = clamp(high, lowest, highest)
-
-        if dtype in integral_types():
-            return math.floor(low), math.ceil(high)
-
-        return low, high
-
-    if dtype is torch.bool:
-        result = torch.randint(0, 2, size, device=device, dtype=dtype)
-    elif dtype is torch.uint8:
-        ranges = (torch.iinfo(dtype).min, torch.iinfo(dtype).max)
-        low, high = _modify_low_high(low, high, ranges[0], ranges[1], 0, 9, dtype)
-        result = torch.randint(low, high, size, device=device, dtype=dtype)
-    elif dtype in integral_types():
-        ranges = (torch.iinfo(dtype).min, torch.iinfo(dtype).max)
-        low, high = _modify_low_high(low, high, ranges[0], ranges[1], -9, 9, dtype)
-        result = torch.randint(low, high, size, device=device, dtype=dtype)
-    elif dtype in floating_types_and(torch.half, torch.bfloat16):
-        ranges_floats = (torch.finfo(dtype).min, torch.finfo(dtype).max)
-        low, high = _modify_low_high(low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype)
-        rand_val = torch.rand(size, device=device, dtype=dtype)
-        result = high * rand_val + low * (1 - rand_val)
-    else:
-        assert dtype in complex_types()
-        float_dtype = torch.float if dtype is torch.cfloat else torch.double
-        ranges_floats = (torch.finfo(float_dtype).min, torch.finfo(float_dtype).max)
-        low, high = _modify_low_high(low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype)
-        real_rand_val = torch.rand(size, device=device, dtype=float_dtype)
-        imag_rand_val = torch.rand(size, device=device, dtype=float_dtype)
-        real = high * real_rand_val + low * (1 - real_rand_val)
-        imag = high * imag_rand_val + low * (1 - imag_rand_val)
-        result = torch.complex(real, imag)
-
-    if noncontiguous and result.numel() > 1:
-        result = torch.repeat_interleave(result, 2, dim=-1)
-        result = result[..., ::2]
-
-    if exclude_zero:
-        if dtype in integral_types() or dtype is torch.bool:
-            replace_with = torch.tensor(1, device=device, dtype=dtype)
-        elif dtype in floating_types_and(torch.half, torch.bfloat16):
-            replace_with = torch.tensor(torch.finfo(dtype).tiny, device=device, dtype=dtype)
-        elif dtype in complex_types():
-            float_dtype = torch.float if dtype is torch.cfloat else torch.double
-            float_eps = torch.tensor(torch.finfo(float_dtype).tiny, device=device, dtype=float_dtype)
-            replace_with = torch.complex(float_eps, float_eps)
-        else:
-            raise ValueError(f"Invalid dtype passed, supported dtypes are: {get_all_dtypes()}")
-        result[result == 0] = replace_with
-
-    if dtype in floating_types_and(torch.half, torch.bfloat16) or\
-       dtype in complex_types():
-        result.requires_grad = requires_grad
-
-    return result
+# Methods for matrix generation
 
 def random_square_matrix_of_rank(l, rank, dtype=torch.double, device='cpu'):
     assert rank <= l

From a8ffe81b2c3123926354b4ec2001693b38daa80d Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Mon, 30 Aug 2021 12:25:29 -0700
Subject: [PATCH 352/530] Bring back old algorithm for sorting on small number
 of segments (#64127)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63456
The code was copy-pasted from the previous commit without modification.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64127

Reviewed By: mruberry

Differential Revision: D30632090

Pulled By: ngimel

fbshipit-source-id: 58bbdd9b0423f01d4e65e2ec925ad9a3f88efc9b
---
 aten/src/ATen/native/cuda/Sort.cu | 95 +++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/cuda/Sort.cu b/aten/src/ATen/native/cuda/Sort.cu
index 83fce65d33b6c..9cb32bc5ac14c 100644
--- a/aten/src/ATen/native/cuda/Sort.cu
+++ b/aten/src/ATen/native/cuda/Sort.cu
@@ -207,6 +207,87 @@ struct offset_t {
 
 }
 
+namespace {
+
+// Segmented sort by full sort algorithm:.
+// Say we are sorting a (2, 3) tensor. We have in flattened form:
+// values       0.4 1.2 5.3 6.2 1.3 2.3
+// indices        0   1   2   0   1   2
+// segment_id     0   0   0   1   1   1
+
+// First we sort by values, globally:
+// values       6.2 5.3 2.3 1.2 1.3 0.4
+// indices        0   2   2   1   1   0
+// segment_id     1   0   1   0   1   0
+
+// Then we stable sort by segment id:
+// values       5.3 1.2 0.4 6.2 2.3 1.3
+// indices        2   1   0   0   2   1
+// segment_id     0   0   0   1   1   1
+
+// This method can only work if the slice we are sorting (`dim`) is
+// innermost, and both values and indices are contiguous. We do this
+// by re-arranging the input into this form as needed, which will
+// unfortunately allocate memory if the request is not in this form.
+// Vectorized sort is slower than iterated sort if the number of
+// slices is small (since we're sorting twice, instead of invoking a
+// smaller sort `numSlices` times), but the cub sort
+// implementation here is a catch-all, so we're not looking for
+// efficiency, but instead correctness.
+
+template<typename scalar_t>
+__global__ void sort_postprocess_kernel(const scalar_t *in, scalar_t *out, int64_t *index, const int2 *i_s_ptr, int nsegments, int nsort) {
+  CUDA_KERNEL_LOOP(i, nsegments * nsort) {
+    int segment = i / nsort;
+    int j = i % nsort;
+
+    int offset = segment * nsort;
+    const scalar_t *in_ = in + offset;
+    scalar_t *out_ = out + offset;
+    int64_t *index_ = index + offset;
+    const int2 *i_s_ptr_ = i_s_ptr + offset;
+
+    int idx = i_s_ptr_[j].y;
+    index_[j] = idx;
+    out_[j] = in_[idx];
+  }
+}
+
+template<typename scalar_t>
+inline void segmented_sort_pairs_by_full_sort(
+  int64_t nsegments, int64_t nsort, int64_t n, bool descending, const Tensor &indices,
+  const scalar_t *self_ptr, scalar_t *values_ptr, int64_t *indices_ptr
+) {
+  int64_t segment_bits = std::max<int64_t>(1L, static_cast<int64_t>(std::ceil(std::log2(nsegments))));
+
+  auto int_options = indices.options().dtype(kInt);
+  auto indices_and_segment = at::empty({nsegments, nsort, 2}, int_options);
+  indices_and_segment.select(-1, 0).copy_(  // segment id
+    at::arange(nsegments, int_options).view({nsegments, 1}).expand({nsegments, nsort}));
+  indices_and_segment.select(-1, 1).copy_(  // reverse indices
+    at::arange(nsort, int_options).view({1, nsort}).expand({nsegments, nsort}));
+
+  auto i_s_ptr = reinterpret_cast<int2 *>(indices_and_segment.data_ptr<int>());
+  auto indices_and_segment2 = at::empty_like(indices_and_segment);
+  auto i_s_ptr2 = reinterpret_cast<int2 *>(indices_and_segment2.data_ptr<int>());
+
+  at::cuda::cub::sort_pairs<scalar_t, int2>(
+    self_ptr, nullptr, i_s_ptr, i_s_ptr2,
+    n, descending);
+
+  TORCH_INTERNAL_ASSERT(segment_bits <= 32);
+
+  // sort on lower 32bits, i.e. segment index
+  at::cuda::cub::sort_keys<int64_t>(
+    reinterpret_cast<int64_t *>(i_s_ptr2), reinterpret_cast<int64_t *>(i_s_ptr),
+    n, false, 0, segment_bits);
+
+  sort_postprocess_kernel<<<(n + 511) / 512, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
+    self_ptr, values_ptr, indices_ptr, i_s_ptr, nsegments, nsort);
+}
+
+}  // namespace
+
 // We perform a segmented sort in cub with inputs that have
 // more than 1024/2048 elements along the selected dimension.
 // Otherwise, we do an inplace bitonic sort (see sortKeyValueInplace).
@@ -349,11 +430,15 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
         int64_t n = std::min(remaining, nbatch);
         int64_t nsegments = n / nsort;
 
-        auto reverse_indices = at::arange(nsort, indices.options()).view({1, nsort}).expand({nsegments, nsort}).contiguous();
-
-        at::cuda::cub::segmented_sort_pairs(self_ptr, values_ptr,
-          reverse_indices.data_ptr<int64_t>(), indices_ptr, n, nsegments,
-          offset_t{(int)nsort, 0}, offset_t{(int)nsort, 1}, descending);
+        if (nsegments < 128) {
+          segmented_sort_pairs_by_full_sort(nsegments, nsort, n, descending,
+            indices, self_ptr, values_ptr, indices_ptr);
+        } else {
+          auto reverse_indices = at::arange(nsort, indices.options()).view({1, nsort}).expand({nsegments, nsort}).contiguous();
+          at::cuda::cub::segmented_sort_pairs(self_ptr, values_ptr,
+            reverse_indices.data_ptr<int64_t>(), indices_ptr, n, nsegments,
+            offset_t{(int)nsort, 0}, offset_t{(int)nsort, 1}, descending);
+        }
 
         remaining -= n;
         self_ptr += n;

From 401bbb2aa0a183ddfb309740c020fb4962367ac9 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Mon, 30 Aug 2021 12:28:39 -0700
Subject: [PATCH 353/530] remove componentwise comparison of complex values in
 TestCase.assertEqual (#63572)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63572

Addresses #61906. Issue will be fixed later in the stack when `torch.testing.assert_close` got the same treatment.

cc ezyang gchanan

Test Plan: Imported from OSS

Reviewed By: ezyang

Differential Revision: D30633527

Pulled By: mruberry

fbshipit-source-id: c2002a4998a7a75cb2ab83f87190bde43a9d4f7c
---
 test/test_tensor_creation_ops.py |  2 +-
 test/test_testing.py             | 54 ++++-------------------
 test/test_torch.py               |  4 +-
 test/test_unary_ufuncs.py        |  5 +--
 torch/testing/_core.py           | 75 ++++----------------------------
 5 files changed, 21 insertions(+), 119 deletions(-)

diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
index 9ef374248984e..dcb49386c9ff8 100644
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@@ -3258,7 +3258,7 @@ def seed(generator):
             self.assertTrue((res1 >= 0).all().item())
 
     @dtypes(torch.half, torch.float, torch.bfloat16, torch.double,
-            torch.complex32, torch.complex64, torch.complex128)
+            torch.complex64, torch.complex128)
     def test_randn(self, device, dtype):
         SIZE = 100
         for size in [0, SIZE]:
diff --git a/test/test_testing.py b/test/test_testing.py
index f38183d4a3769..fdc3463edcb2b 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -88,25 +88,19 @@ def test__comparescalars_debug_msg(self, device):
                         "atol=1e-05 is only 1.9100000000000003e-05!")
         self.assertEqual(debug_msg, expected_msg)
 
-        # complex x complex, real difference
+        # complex x complex
         result, debug_msg = self._compareScalars(complex(1, 3), complex(3, 1))
-        expected_msg = ("Comparing the real part 1.0 and 3.0 gives a difference "
-                        "of 2.0, but the allowed difference with rtol=1.3e-06 "
-                        "and atol=1e-05 is only 1.39e-05!")
-        self.assertEqual(debug_msg, expected_msg)
-
-        # complex x complex, imaginary difference
-        result, debug_msg = self._compareScalars(complex(1, 3), complex(1, 5.5))
-        expected_msg = ("Comparing the imaginary part 3.0 and 5.5 gives a "
-                        "difference of 2.5, but the allowed difference with "
-                        "rtol=1.3e-06 and atol=1e-05 is only 1.715e-05!")
+        expected_msg = ("Comparing (1+3j) and (3+1j) gives a difference "
+                        "of 2.8284271247461903, but the allowed difference "
+                        "with rtol=1.3e-06 and atol=1e-05 is only "
+                        "1.4110960958218895e-05!")
         self.assertEqual(debug_msg, expected_msg)
 
         # complex x int
         result, debug_msg = self._compareScalars(complex(1, -2), 1)
-        expected_msg = ("Comparing the imaginary part -2.0 and 0.0 gives a "
-                        "difference of 2.0, but the allowed difference with "
-                        "rtol=1.3e-06 and atol=1e-05 is only 1e-05!")
+        expected_msg = ("Comparing (1-2j) and 1 gives a difference of 2.0, "
+                        "but the allowed difference with rtol=1.3e-06 and "
+                        "atol=1e-05 is only 1.13e-05!")
         self.assertEqual(debug_msg, expected_msg)
 
         # NaN x NaN, equal_nan=False
@@ -170,28 +164,6 @@ def test__comparetensors_debug_msg(self, device):
                         "occuring at index 0.")
         self.assertEqual(debug_msg, expected_msg)
 
-        # Checks complex tensor comparisons (real part)
-        a = torch.tensor((1 - 1j, 4 + 3j), device=device)
-        b = torch.tensor((1 - 1j, 1 + 3j), device=device)
-        result, debug_msg = self._compareTensors(a, b)
-        expected_msg = ("Real parts failed to compare as equal! "
-                        "With rtol=1.3e-06 and atol={0}, "
-                        "found 1 element(s) (out of 2) whose difference(s) exceeded the "
-                        "margin of error (including 0 nan comparisons). The greatest difference was "
-                        "3.0 (4.0 vs. 1.0), which occurred at index 1.").format(atol)
-        self.assertEqual(debug_msg, expected_msg)
-
-        # Checks complex tensor comparisons (imaginary part)
-        a = torch.tensor((1 - 1j, 4 + 3j), device=device)
-        b = torch.tensor((1 - 1j, 4 - 21j), device=device)
-        result, debug_msg = self._compareTensors(a, b)
-        expected_msg = ("Imaginary parts failed to compare as equal! "
-                        "With rtol=1.3e-06 and atol={0}, "
-                        "found 1 element(s) (out of 2) whose difference(s) exceeded the "
-                        "margin of error (including 0 nan comparisons). The greatest difference was "
-                        "24.0 (3.0 vs. -21.0), which occurred at index 1.").format(atol)
-        self.assertEqual(debug_msg, expected_msg)
-
         # Checks size mismatch
         a = torch.tensor((1, 2), device=device)
         b = torch.tensor((3), device=device)
@@ -407,7 +379,7 @@ def test_isclose_comparetensors_complex(self, device, dtype):
         tests = (
             (complex(1, -1), complex(-1, 1), False),
             (complex(1, -1), complex(2, -2), True),
-            (complex(1, 99), complex(4, 100), False),
+            (complex(1, 99), complex(4, 100), True),
         )
 
         self._comparetensors_helper(tests, device, dtype, False, atol=.5, rtol=.5)
@@ -421,14 +393,6 @@ def test_isclose_comparetensors_complex(self, device, dtype):
             (complex(float('nan'), float('nan')), complex(float('nan'), float('nan')), True),
         )
         self._isclose_helper(tests, device, dtype, True)
-
-        tests = (
-            (complex(1, 1), complex(1, float('nan')), False),
-            (complex(1, 1), complex(float('nan'), 1), False),
-            (complex(float('nan'), 1), complex(float('nan'), 1), True),
-            (complex(float('nan'), 1), complex(1, float('nan')), False),
-            (complex(float('nan'), float('nan')), complex(float('nan'), float('nan')), True),
-        )
         self._comparetensors_helper(tests, device, dtype, True)
 
     # Tests that isclose with rtol or atol values less than zero throws a
diff --git a/test/test_torch.py b/test/test_torch.py
index c50b7ca99e883..b267b9cd6b610 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -5121,7 +5121,7 @@ def filter_shape(shape, dim):
                 spacing = [space.cpu().detach().numpy() for space in spacing]
             expected = np.gradient(t_np, *self._wrap_to_list(spacing), axis=dims, edge_order=edge_order)
             actual, expected = self._inf_nan_preprocess(list(actual), self._wrap_to_list(expected))
-            self.assertEqual(actual, expected, equal_nan="relaxed", atol=1e-4, rtol=0, exact_dtype=False)
+            self.assertEqual(actual, expected, equal_nan=True, atol=1e-4, rtol=0, exact_dtype=False)
 
     @onlyOnCPUAndCUDA
     @dtypes(torch.long, torch.float32, torch.complex64)
@@ -5188,7 +5188,7 @@ def test_gradient_type_promotion(self, device):
                     self.assertEqual(expected[i].imag, torch.zeros(actual[i].shape), exact_dtype=False)
             else:
                 actual, expected = self._inf_nan_preprocess(list(actual), expected)
-                self.assertEqual(actual, expected, equal_nan="relaxed", exact_dtype=False)
+                self.assertEqual(actual, expected, equal_nan=True, exact_dtype=False)
 
     @onlyOnCPUAndCUDA
     @dtypes(torch.long, torch.float32, torch.complex64)
diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py
index 22f61519a2853..526b67a6b03da 100644
--- a/test/test_unary_ufuncs.py
+++ b/test/test_unary_ufuncs.py
@@ -359,10 +359,7 @@ def test_reference_numerics_extremal(self, device, dtype, op):
         tensors = generate_numeric_tensors_extremal(device, dtype,
                                                     domain=op.domain)
 
-        # https://github.com/pytorch/pytorch/issues/50749
-        equal_nan = "relaxed" if device.startswith('cuda') else True
-
-        self._test_reference_numerics(dtype, op, tensors, equal_nan)
+        self._test_reference_numerics(dtype, op, tensors)
 
     # Tests for testing (non)contiguity consistency
 
diff --git a/torch/testing/_core.py b/torch/testing/_core.py
index d9806150047c5..66060f8cbcee0 100644
--- a/torch/testing/_core.py
+++ b/torch/testing/_core.py
@@ -6,7 +6,7 @@
 import random
 import math
 import cmath
-from typing import cast, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 import operator
 
 FileCheck = torch._C.FileCheck
@@ -78,27 +78,12 @@ def _unravel_index(flat_index, shape):
 #   Two tensors are "equal" if they are "close", in the sense of torch.allclose.
 #   The only exceptions are complex tensors and bool tensors.
 #
-#   Complex tensors are "equal" if both the
-#   real and complex parts (separately) are close. This is divergent from
-#   torch.allclose's behavior, which compares the absolute values of the
-#   complex numbers instead.
-#
-#   Using torch.allclose would be a less strict
-#   comparison that would allow large complex values with
-#   significant real or imaginary differences to be considered "equal,"
-#   and would make setting rtol and atol for complex tensors distinct from
-#   other tensor types.
-#
 #   Bool tensors are equal only if they are identical, regardless of
 #   the rtol and atol values.
 #
 #   The `equal_nan` can be True or False, which maps to the True or False
-#   in `torch.allclose`. `equal_nan` can also be "relaxed", which means
-#   the complex will be compared in the relaxed mode:
-#       2 + nan j == 3 + nan j ---> False when equal_nan=True
-#                                   True when equal_nan="relaxed"
-def _compare_tensors_internal(a: torch.Tensor, b: torch.Tensor, *, rtol, atol, equal_nan: Union[str, bool]) -> _compare_return_type:
-    assert equal_nan in {True, False, "relaxed"}
+#   in `torch.allclose`.
+def _compare_tensors_internal(a: torch.Tensor, b: torch.Tensor, *, rtol, atol, equal_nan) -> _compare_return_type:
     debug_msg : Optional[str]
     # Integer (including bool) comparisons are identity comparisons
     # when rtol is zero and atol is less than one
@@ -129,48 +114,19 @@ def _compare_tensors_internal(a: torch.Tensor, b: torch.Tensor, *, rtol, atol, e
                                    _unravel_index(greatest_diff_index, a.shape)))
         return (False, debug_msg)
 
-    # Compares complex tensors' real and imaginary parts separately.
-    # (see NOTE Test Framework Tensor "Equality")
-    if a.is_complex():
-        if equal_nan == "relaxed":
-            a = a.clone()
-            b = b.clone()
-            a.real[a.imag.isnan()] = math.nan
-            a.imag[a.real.isnan()] = math.nan
-            b.real[b.imag.isnan()] = math.nan
-            b.imag[b.real.isnan()] = math.nan
-
-        real_result, debug_msg = _compare_tensors_internal(a.real, b.real,
-                                                           rtol=rtol, atol=atol,
-                                                           equal_nan=equal_nan)
-
-        if not real_result:
-            debug_msg = "Real parts failed to compare as equal! " + cast(str, debug_msg)
-            return (real_result, debug_msg)
-
-        imag_result, debug_msg = _compare_tensors_internal(a.imag, b.imag,
-                                                           rtol=rtol, atol=atol,
-                                                           equal_nan=equal_nan)
-
-        if not imag_result:
-            debug_msg = "Imaginary parts failed to compare as equal! " + cast(str, debug_msg)
-            return (imag_result, debug_msg)
-
-        return (True, None)
-
     # All other comparisons use torch.allclose directly
-    if torch.allclose(a, b, rtol=rtol, atol=atol, equal_nan=(equal_nan in {"relaxed", True})):
+    if torch.allclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan):
         return (True, None)
 
     # Gathers debug info for failed float tensor comparison
     # NOTE: converts to float64 to best represent differences
-    a_flat = a.to(torch.float64).flatten()
-    b_flat = b.to(torch.float64).flatten()
+    a_flat = a.to(torch.float64 if not a.dtype.is_complex else torch.complex128).flatten()
+    b_flat = b.to(torch.float64 if not a.dtype.is_complex else torch.complex128).flatten()
     diff = torch.abs(a_flat - b_flat)
 
     # Masks close values
     # NOTE: this avoids (inf - inf) oddities when computing the difference
-    close = torch.isclose(a_flat, b_flat, rtol, atol, (equal_nan in {"relaxed", True}))
+    close = torch.isclose(a_flat, b_flat, rtol, atol, equal_nan)
     diff[close] = 0
     nans = torch.isnan(diff)
     num_nans = nans.sum()
@@ -212,7 +168,7 @@ def _helper(a, b, s) -> _compare_return_type:
 
         # Special-case for infinity comparisons
         # NOTE: if b is inf then allowed_diff will be inf when rtol is not 0
-        if ((math.isinf(a) or math.isinf(b)) and a != b):
+        if ((cmath.isinf(a) or cmath.isinf(b)) and a != b):
             result = False
 
         msg = None
@@ -228,21 +184,6 @@ def _helper(a, b, s) -> _compare_return_type:
                 )
         return result, msg
 
-    if isinstance(a, complex) or isinstance(b, complex):
-        a = complex(a)
-        b = complex(b)
-
-        if equal_nan == "relaxed":
-            if cmath.isnan(a) and cmath.isnan(b):
-                return (True, None)
-
-        result, msg = _helper(a.real, b.real, " the real part ")
-
-        if not result:
-            return (False, msg)
-
-        return _helper(a.imag, b.imag, " the imaginary part ")
-
     return _helper(a, b, " ")
 
 
From eafe33c995d47d45fceaf42801717f3120d799b9 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Mon, 30 Aug 2021 12:28:39 -0700
Subject: [PATCH 354/530] remove componentwise comparison of complex values in
 torch.testing.assert_close (#63841)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63841

Closes #61906.

cc ezyang gchanan

Test Plan: Imported from OSS

Reviewed By: ezyang

Differential Revision: D30633526

Pulled By: mruberry

fbshipit-source-id: ddb5d61838cd1e12d19d0093799e827344382cdc
---
 test/test_testing.py      | 65 +++++++++++++++--------------
 torch/testing/_asserts.py | 86 ++++++++-------------------------------
 2 files changed, 50 insertions(+), 101 deletions(-)

diff --git a/test/test_testing.py b/test/test_testing.py
index fdc3463edcb2b..a5ea232122e08 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -863,20 +863,43 @@ def test_matching_atol(self):
         for fn in assert_close_with_inputs(actual, expected):
             fn(rtol=0.0, atol=eps * 2)
 
-    def test_matching_nan(self):
-        actual = torch.tensor(float("NaN"))
-        expected = actual.clone()
+    # TODO: the code that this test was designed for was removed in https://github.com/pytorch/pytorch/pull/56058
+    #  We need to check if this test is still needed or if this behavior is now enabled by default.
+    def test_matching_conjugate_bit(self):
+        actual = torch.tensor(complex(1, 1)).conj()
+        expected = torch.tensor(complex(1, -1))
 
         for fn in assert_close_with_inputs(actual, expected):
-            with self.assertRaises(AssertionError):
-                fn()
+            fn()
+
+    def test_matching_nan(self):
+        nan = float("NaN")
+
+        tests = (
+            (nan, nan),
+            (complex(nan, 0), complex(0, nan)),
+            (complex(nan, nan), complex(nan, 0)),
+            (complex(nan, nan), complex(nan, nan)),
+        )
+
+        for actual, expected in tests:
+            for fn in assert_close_with_inputs(actual, expected):
+                with self.assertRaises(AssertionError):
+                    fn()
 
     def test_matching_nan_with_equal_nan(self):
-        actual = torch.tensor(float("NaN"))
-        expected = actual.clone()
+        nan = float("NaN")
 
-        for fn in assert_close_with_inputs(actual, expected):
-            fn(equal_nan=True)
+        tests = (
+            (nan, nan),
+            (complex(nan, 0), complex(0, nan)),
+            (complex(nan, nan), complex(nan, 0)),
+            (complex(nan, nan), complex(nan, nan)),
+        )
+
+        for actual, expected in tests:
+            for fn in assert_close_with_inputs(actual, expected):
+                fn(equal_nan=True)
 
     def test_numpy(self):
         tensor = torch.rand(2, 2, dtype=torch.float32)
@@ -1181,30 +1204,6 @@ def test_mapping_mismatching_values_msg(self):
             torch.testing.assert_close(actual, expected)
 
 
-class TestAssertCloseComplex(TestCase):
-    def test_mismatching_nan_with_equal_nan(self):
-        actual = torch.tensor(complex(1, float("NaN")))
-        expected = torch.tensor(complex(float("NaN"), 1))
-
-        for fn in assert_close_with_inputs(actual, expected):
-            with self.assertRaises(AssertionError):
-                fn(equal_nan=True)
-
-    def test_mismatching_nan_with_equal_nan_relaxed(self):
-        actual = torch.tensor(complex(1, float("NaN")))
-        expected = torch.tensor(complex(float("NaN"), 1))
-
-        for fn in assert_close_with_inputs(actual, expected):
-            fn(equal_nan="relaxed")
-
-    def test_matching_conjugate_bit(self):
-        actual = torch.tensor(complex(1, 1)).conj()
-        expected = torch.tensor(complex(1, -1))
-
-        for fn in assert_close_with_inputs(actual, expected):
-            fn()
-
-
 class TestAssertCloseSparseCOO(TestCase):
     def test_matching_coalesced(self):
         indices = (
diff --git a/torch/testing/_asserts.py b/torch/testing/_asserts.py
index 2de2cc0735529..073e2e2230820 100644
--- a/torch/testing/_asserts.py
+++ b/torch/testing/_asserts.py
@@ -44,52 +44,6 @@ def _get_default_rtol_and_atol(actual: Tensor, expected: Tensor) -> Tuple[float,
     return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol)
 
 
-def _check_complex_components_individually(
-    check_tensors: Callable[..., Optional[_TestingErrorMeta]]
-) -> Callable[..., Optional[_TestingErrorMeta]]:
-    """Decorates real-valued tensor check functions to handle complex components individually.
-
-    If the inputs are not complex, this decorator is a no-op.
-
-    Args:
-        check_tensors (Callable[[Tensor, Tensor], Optional[_TestingErrorMeta]]): Tensor check function for real-valued
-        tensors.
-    """
-
-    @functools.wraps(check_tensors)
-    def wrapper(
-        actual: Tensor, expected: Tensor, *, equal_nan: Union[str, bool], **kwargs: Any
-    ) -> Optional[_TestingErrorMeta]:
-        if equal_nan == "relaxed":
-            relaxed_complex_nan = True
-            equal_nan = True
-        else:
-            relaxed_complex_nan = False
-
-        if actual.dtype not in (torch.complex32, torch.complex64, torch.complex128):
-            return check_tensors(actual, expected, equal_nan=equal_nan, **kwargs)
-
-        if relaxed_complex_nan:
-            actual, expected = [
-                t.clone().masked_fill(
-                    t.real.isnan() | t.imag.isnan(), complex(float("NaN"), float("NaN"))  # type: ignore[call-overload]
-                )
-                for t in (actual, expected)
-            ]
-
-        error_meta = check_tensors(actual.real, expected.real, equal_nan=equal_nan, **kwargs)
-        if error_meta:
-            return error_meta
-
-        error_meta = check_tensors(actual.imag, expected.imag, equal_nan=equal_nan, **kwargs)
-        if error_meta:
-            return error_meta
-
-        return None
-
-    return wrapper
-
-
 def _check_sparse_coo_members_individually(
     check_tensors: Callable[..., Optional[_TestingErrorMeta]]
 ) -> Callable[..., Optional[_TestingErrorMeta]]:
@@ -430,10 +384,24 @@ def append_difference(msg: str, *, type: str, difference: float, index: Tuple[in
     return msg.strip()
 
 
+def _get_comparison_dtype(dtype: torch.dtype) -> torch.dtype:
+    """Selects the comparison dtype based on the input dtype.
+
+    Returns:
+        Highest precision dtype of the same dtype category as the input. :class:`torch.bool` is treated as integral
+        dtype.
+    """
+    if dtype.is_complex:
+        return torch.complex128
+    elif dtype.is_floating_point:
+        return torch.float64
+    else:
+        return torch.int64
+
+
 @_check_quantized
 @_check_sparse_coo_members_individually
 @_check_sparse_csr_members_individually
-@_check_complex_components_individually
 def _check_values_close(
     actual: Tensor,
     expected: Tensor,
@@ -457,7 +425,7 @@ def _check_values_close(
     Returns:
         (Optional[AssertionError]): If check did not pass.
     """
-    dtype = torch.float64 if actual.dtype.is_floating_point else torch.int64
+    dtype = _get_comparison_dtype(actual.dtype)
     actual = actual.to(dtype)
     expected = expected.to(dtype)
     mismatches = ~torch.isclose(actual, expected, rtol=rtol, atol=atol, equal_nan=equal_nan)
@@ -740,7 +708,7 @@ def assert_close(
     allow_subclasses: bool = True,
     rtol: Optional[float] = None,
     atol: Optional[float] = None,
-    equal_nan: Union[bool, str] = False,
+    equal_nan: bool = False,
     check_device: bool = True,
     check_dtype: bool = True,
     check_stride: bool = False,
@@ -761,9 +729,6 @@ def assert_close(
     (``-inf`` and ``inf``) are only considered close if and only if they are equal. ``NaN``'s are only considered equal
     to each other if :attr:`equal_nan` is ``True``.
 
-    If :attr:`actual` and :attr:`expected` are complex-valued, they are considered close if both their real and
-    imaginary components are considered close according to the definition above.
-
     If :attr:`actual` and :attr:`expected` are sparse (either having COO or CSR layout), their strided members are
     checked individually. Indices, namely ``indices`` for COO or ``crow_indices``  and ``col_indices`` for CSR layout,
     are always checked for equality whereas the values are checked for closeness according to the definition above.
@@ -795,8 +760,7 @@ def assert_close(
             default values based on the :attr:`~torch.Tensor.dtype` are selected with the below table.
         atol (Optional[float]): Absolute tolerance. If specified :attr:`rtol` must also be specified. If omitted,
             default values based on the :attr:`~torch.Tensor.dtype` are selected with the below table.
-        equal_nan (Union[bool, str]): If ``True``, two ``NaN`` values will be considered equal. If ``"relaxed"``,
-            complex values are considered as ``NaN`` if either the real **or** imaginary component is ``NaN``.
+        equal_nan (Union[bool, str]): If ``True``, two ``NaN`` values will be considered equal.
         check_device (bool): If ``True`` (default), asserts that corresponding tensors are on the same
             :attr:`~torch.Tensor.device`. If this check is disabled, tensors on different
             :attr:`~torch.Tensor.device`'s are moved to the CPU before being compared.
@@ -956,20 +920,6 @@ def assert_close(
         Relative difference: nan (up to 1.3e-06 allowed)
         >>> torch.testing.assert_close(actual, expected, equal_nan=True)
 
-        >>> # If equal_nan=True, the real and imaginary NaN's of complex inputs have to match.
-        >>> expected = torch.tensor(complex(float("NaN"), 0))
-        >>> actual = torch.tensor(complex(0, float("NaN")))
-        >>> torch.testing.assert_close(actual, expected, equal_nan=True)
-        Traceback (most recent call last):
-        ...
-        AssertionError: Scalars are not close!
-        <BLANKLINE>
-        Absolute difference: nan (up to 1e-05 allowed)
-        Relative difference: nan (up to 1.3e-06 allowed)
-        >>> # If equal_nan="relaxed", however, then complex numbers are treated as NaN if any
-        >>> # of the real or imaginary components is NaN.
-        >>> torch.testing.assert_close(actual, expected, equal_nan="relaxed")
-
         >>> expected = torch.tensor([1.0, 2.0, 3.0])
         >>> actual = torch.tensor([1.0, 4.0, 5.0])
         >>> # The default mismatch message can be overwritten.

From e98173ff3423247c597e21c923c8f47470ef07ab Mon Sep 17 00:00:00 2001
From: Tanvir Zaman <motanv@fb.com>
Date: Mon, 30 Aug 2021 12:56:15 -0700
Subject: [PATCH 355/530] Fix bytes_written and bytes_read (#64040)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64040

In operator cost inference functions, in many places we are using sizeof(x.data_type()). Since data_type() returns a 32 bit integer from [this enum](https://www.internalfb.com/code/fbsource/[15e7ffe4073cf08c61077c7c24a4839504b964a2]/fbcode/caffe2/caffe2/proto/caffe2.proto?lines=20), we are basically always getting 4 for sizeof(x.data_type()) no matter what actual data type x has. Big thanks to Jack Langman for specifically pointing to this bug.

We would instead use the size in bytes based on actual data type.

Test Plan:
Added unit tests BatchMatMulMemCostTest:

buck test //caffe2/caffe2/fb/fbgemm:batch_matmul_op_test -- BatchMatMulMemCostTest

Extended existing unit test test_columnwise_concat for different data types:

buck test //caffe2/caffe2/python/operator_test:concat_op_cost_test -- test_columnwise_concat

Differential Revision: D30561459

fbshipit-source-id: 976fa5167097a35af548498480001aafd7851d93
---
 caffe2/core/operator_schema.h                 |  17 ++-
 caffe2/operators/batch_matmul_op.cc           | 113 ++++++++++--------
 caffe2/operators/concat_split_op.cc           |  15 ++-
 caffe2/operators/conv_pool_op_base.h          |  15 ++-
 caffe2/operators/distance_op.cc               |  28 +++--
 caffe2/operators/fc_inference.cc              |  22 ++--
 caffe2/operators/one_hot_ops.cc               |  30 +++--
 caffe2/operators/utility_ops.cc               |  13 +-
 .../operator_test/concat_op_cost_test.py      |  54 +++++----
 caffe2/python/workspace_test.py               |   2 +-
 caffe2/sgd/adagrad_op.cc                      |  55 ++++++---
 11 files changed, 224 insertions(+), 140 deletions(-)

diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h
index 64f5ef3ed883a..0d048eb8d26e9 100644
--- a/caffe2/core/operator_schema.h
+++ b/caffe2/core/operator_schema.h
@@ -6,12 +6,13 @@
 #include <initializer_list>
 #include <ostream>
 #include <set>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
 #include "c10/util/Registry.h"
 #include "caffe2/core/common.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/core/types.h"
 #include "caffe2/proto/caffe2_pb.h"
 #include "caffe2/utils/filler.h"
 #include "caffe2/utils/proto_utils.h"
@@ -273,8 +274,8 @@ class TORCH_API OpSchema {
   OpSchema&
   Arg(const char* name, const char* description, bool required = false);
 
-#define DECLARE_STANDARD_ARG(name, str)     \
-  static const char* Arg_##name; \
+#define DECLARE_STANDARD_ARG(name, str) \
+  static const char* Arg_##name;        \
   OpSchema& Arg##name(const char* description);
 
   DECLARE_STANDARD_ARG(IsTest, is_test)
@@ -339,7 +340,9 @@ class TORCH_API OpSchema {
     return inplace_enforced_(x, y);
   }
 
-  TORCH_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema);
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const OpSchema& schema);
 
   const std::vector<Argument>& args() const {
     return args_;
@@ -562,8 +565,10 @@ OpSchema::Cost PointwiseCostInference(
   }
 
   c.flops = nElemX * OpsPerPoint;
-  c.bytes_read = nElemRead * sizeof(X.data_type());
-  c.bytes_written = nElemX * sizeof(X.data_type());
+  auto const& X_element_size_byte =
+      DataTypeToTypeMeta(X.data_type()).itemsize();
+  c.bytes_read = nElemRead * X_element_size_byte;
+  c.bytes_written = nElemX * X_element_size_byte;
   return c;
 }
 
diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc
index 32799ced10671..205acf74f1572 100644
--- a/caffe2/operators/batch_matmul_op.cc
+++ b/caffe2/operators/batch_matmul_op.cc
@@ -1,6 +1,7 @@
 #include "caffe2/operators/batch_matmul_op.h"
 
 #include "caffe2/core/operator_schema.h"
+#include "caffe2/core/types.h"
 
 namespace caffe2 {
 
@@ -116,9 +117,13 @@ OpSchema::Cost CostInferenceForBatchMatMul(
     K = in[0].dims(ndims_A - 1);
   }
 
+  auto const& A_element_size_byte =
+      DataTypeToTypeMeta(A.data_type()).itemsize();
+  auto const& Y_element_size_byte =
+      DataTypeToTypeMeta(Y.data_type()).itemsize();
   c.flops = 2 * nElemY * K;
-  c.bytes_read = (nElemA + nElemB) * sizeof(A.data_type());
-  c.bytes_written = nElemY * sizeof(Y.data_type());
+  c.bytes_read = (nElemA + nElemB) * A_element_size_byte;
+  c.bytes_written = nElemY * Y_element_size_byte;
   c.params_bytes = 0;
   return c;
 }
@@ -180,72 +185,76 @@ class GetBatchMatMulGradient : public GradientMakerBase {
     auto no_trans_arg = vector<Argument>();
     auto trans_a_arg = vector<Argument>{MakeArgument<int>("trans_a", 1)};
     auto trans_b_arg = vector<Argument>{MakeArgument<int>("trans_b", 1)};
-    auto trans_both_arg = vector<Argument>{MakeArgument<int>("trans_a", 1),
-                                           MakeArgument<int>("trans_b", 1)};
+    auto trans_both_arg = vector<Argument>{
+        MakeArgument<int>("trans_a", 1), MakeArgument<int>("trans_b", 1)};
 
     if (trans_a) {
       if (trans_b) {
         // A'B':
         // dA = B'G', dB = G'A'
-        return vector<OperatorDef>{CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{I(1), GO(0)},
-                                       vector<string>{GI(0)},
-                                       trans_both_arg),
-                                   CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{GO(0), I(0)},
-                                       vector<string>{GI(1)},
-                                       trans_both_arg)};
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(1), GO(0)},
+                vector<string>{GI(0)},
+                trans_both_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(0)},
+                vector<string>{GI(1)},
+                trans_both_arg)};
       } else {
         // A'B:
         // dA = BG', dB = AG
-        return vector<OperatorDef>{CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{I(1), GO(0)},
-                                       vector<string>{GI(0)},
-                                       trans_b_arg),
-                                   CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{I(0), GO(0)},
-                                       vector<string>{GI(1)},
-                                       no_trans_arg)};
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(1), GO(0)},
+                vector<string>{GI(0)},
+                trans_b_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(0), GO(0)},
+                vector<string>{GI(1)},
+                no_trans_arg)};
       }
     } else {
       if (trans_b) {
         // AB':
         // dA = GB, dB = G'A
-        return vector<OperatorDef>{CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{GO(0), I(1)},
-                                       vector<string>{GI(0)},
-                                       no_trans_arg),
-                                   CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{GO(0), I(0)},
-                                       vector<string>{GI(1)},
-                                       trans_a_arg)};
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(1)},
+                vector<string>{GI(0)},
+                no_trans_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(0)},
+                vector<string>{GI(1)},
+                trans_a_arg)};
       } else {
         // AB:
         // dA = GB', dB = A'G
-        return vector<OperatorDef>{CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{GO(0), I(1)},
-                                       vector<string>{GI(0)},
-                                       trans_b_arg),
-                                   CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{I(0), GO(0)},
-                                       vector<string>{GI(1)},
-                                       trans_a_arg)};
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(1)},
+                vector<string>{GI(0)},
+                trans_b_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(0), GO(0)},
+                vector<string>{GI(1)},
+                trans_a_arg)};
       }
     }
   }
diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc
index 8eceb5ab4a577..8aa9e282adb84 100644
--- a/caffe2/operators/concat_split_op.cc
+++ b/caffe2/operators/concat_split_op.cc
@@ -101,9 +101,12 @@ OpSchema::Cost CostInferenceForSplit(
   CAFFE_ENFORCE_GT(in.size(), 0);
   struct OpSchema::Cost cost;
   cost.flops = 0;
-  auto input_bytes_count = nElemFromDim(in[0]) * sizeof(in[0].data_type());
-  auto split_bytes_count =
-      (in.size() == 1) ? 0 : nElemFromDim(in[1]) * sizeof(in[1].data_type());
+  auto const& input_0_element_size_byte =
+      DataTypeToTypeMeta(in[0].data_type()).itemsize();
+  auto const& input_1_element_size_byte =
+      (in.size() > 1) ? DataTypeToTypeMeta(in[1].data_type()).itemsize() : 0;
+  auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte;
+  auto split_bytes_count = nElemFromDim(in[1]) * input_1_element_size_byte;
   // There can be two input blobs:
   // (1) actual tensor to be split
   // (2) lengths of outputs along split axis
@@ -329,11 +332,13 @@ OpSchema::Cost CostInferenceForConcat(
   }
   auto split_info_bytes_count = in.size() * sizeof(int);
 
+  auto const& input_0_element_size_byte =
+      DataTypeToTypeMeta(in[0].data_type()).itemsize();
   struct OpSchema::Cost cost;
   cost.flops = 0;
-  cost.bytes_read = nElemRead * sizeof(in[0].data_type());
+  cost.bytes_read = nElemRead * input_0_element_size_byte;
   cost.bytes_written =
-      size * sizeof(in[0].data_type()) + split_info_bytes_count;
+      size * input_0_element_size_byte + split_info_bytes_count;
   cost.params_bytes = 0;
   return cost;
 }
diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h
index 25bd99a92e50f..b356ef952d79c 100644
--- a/caffe2/operators/conv_pool_op_base.h
+++ b/caffe2/operators/conv_pool_op_base.h
@@ -7,6 +7,7 @@
 #include "caffe2/core/context.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
+#include "caffe2/core/types.h"
 #include "caffe2/proto/caffe2_legacy.pb.h"
 #include "caffe2/utils/math.h"
 
@@ -519,14 +520,20 @@ class ConvPoolOpBase : public Operator<Context> {
     uint64_t nElemW = nElemFromDim(W);
     uint64_t nElemBias = inputs.size() > 2 ? nElemFromDim(inputs[2]) : 0;
 
+    auto const& X_elemenet_size_byte =
+        DataTypeToTypeMeta(X.data_type()).itemsize();
+    auto const& Y_element_size_byte =
+        DataTypeToTypeMeta(Y.data_type()).itemsize();
+    auto const& W_element_size_byte =
+        DataTypeToTypeMeta(W.data_type()).itemsize();
+
     // grouping is NOT properly handled yet
     c.flops = N * Y_t * Y_h * Y_w * kernel_t * kernel_w * kernel_h *
         in_channels * out_channels * 2;
-    c.bytes_read = (nElemX + nElemW + nElemBias) * sizeof(X.data_type());
-    c.bytes_written =
-        N * out_channels * Y_t * Y_h * Y_w * sizeof(Y.data_type());
+    c.bytes_read = (nElemX + nElemW + nElemBias) * X_elemenet_size_byte;
+    c.bytes_written = N * out_channels * Y_t * Y_h * Y_w * Y_element_size_byte;
     c.params_bytes = out_channels * in_channels * kernel_t * kernel_h *
-        kernel_w * sizeof(W.data_type());
+        kernel_w * W_element_size_byte;
     return c;
   }
 
diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc
index 1529534d8fb2e..9ea8eea5a2725 100644
--- a/caffe2/operators/distance_op.cc
+++ b/caffe2/operators/distance_op.cc
@@ -1,4 +1,5 @@
 #include "caffe2/operators/distance_op.h"
+#include "caffe2/core/types.h"
 #include "caffe2/utils/eigen_utils.h"
 #ifdef CAFFE2_USE_MKLDNN
 #include <caffe2/ideep/operators/operator_fallback_ideep.h>
@@ -7,7 +8,7 @@
 
 namespace caffe2 {
 
-template<>
+template <>
 bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
   auto& X = Input(0);
   auto& Y = Input(1);
@@ -257,7 +258,9 @@ OpSchema::Cost CostInferenceForDotProduct(
   CAFFE_ENFORCE_EQ(out[0].dims().size(), 1);
 
   struct OpSchema::Cost c = PointwiseCostInference<2>(def, in);
-  c.bytes_written = out[0].dims(0) * sizeof(out[0].data_type());
+  auto const& out_0_element_size_byte =
+      DataTypeToTypeMeta(out[0].data_type()).itemsize();
+  c.bytes_written = out[0].dims(0) * out_0_element_size_byte;
   c.params_bytes = 0;
   return c;
 }
@@ -379,10 +382,12 @@ bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
 }
 
 // L2
-REGISTER_CPU_OPERATOR(SquaredL2Distance,
-                      SquaredL2DistanceOp<float, CPUContext>);
-REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
-                      SquaredL2DistanceGradientOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(
+    SquaredL2Distance,
+    SquaredL2DistanceOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(
+    SquaredL2DistanceGradient,
+    SquaredL2DistanceGradientOp<float, CPUContext>);
 
 OPERATOR_SCHEMA(SquaredL2Distance)
     .NumInputs(2)
@@ -402,7 +407,8 @@ class GetSquaredL2DistanceGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     return SingleGradientDef(
-        "SquaredL2DistanceGradient", "",
+        "SquaredL2DistanceGradient",
+        "",
         vector<string>{I(0), I(1), GO(0)},
         vector<string>{GI(0), GI(1)});
   }
@@ -762,9 +768,9 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
       replicate = GetArgument(Def(), "replicate").i();
     }
 
-    const auto dot_arg =
-        vector<Argument>{MakeArgument<float>("pad_value", pad_value),
-                         MakeArgument<bool>("replicate", replicate)};
+    const auto dot_arg = vector<Argument>{
+        MakeArgument<float>("pad_value", pad_value),
+        MakeArgument<bool>("replicate", replicate)};
 
     return SingleGradientDef(
         "DotProductWithPaddingGradient",
@@ -775,4 +781,4 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
   }
 };
 REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient);
-}  // namespace caffe2
+} // namespace caffe2
diff --git a/caffe2/operators/fc_inference.cc b/caffe2/operators/fc_inference.cc
index a44c230980c7f..ba1b7122cdc9d 100644
--- a/caffe2/operators/fc_inference.cc
+++ b/caffe2/operators/fc_inference.cc
@@ -1,4 +1,5 @@
 #include "caffe2/operators/fc_inference.h"
+#include "caffe2/core/types.h"
 
 namespace caffe2 {
 std::vector<TensorShape> FCShapeInference(
@@ -51,11 +52,12 @@ OpSchema::Cost CostInferenceForFC(
       ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
       : size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));
 
-  const auto& X = in[0];
+  auto const& X_element_size_byte =
+      DataTypeToTypeMeta(in[0].data_type()).itemsize();
   c.flops = M * N * (2 * K + 1);
-  c.bytes_read = (K * (M + N) + N) * sizeof(X.data_type());
-  c.bytes_written = M * N * sizeof(X.data_type());
-  c.params_bytes = (K * N + N) * sizeof(X.data_type());
+  c.bytes_read = (K * (M + N) + N) * X_element_size_byte;
+  c.bytes_written = M * N * X_element_size_byte;
+  c.params_bytes = (K * N + N) * X_element_size_byte;
   return c;
 }
 
@@ -94,7 +96,11 @@ OpSchema::Cost CostInferenceForFCGradient(
 
   CAFFE_ENFORCE_LT(0, out.size());
   const TensorShape dW = out[0];
+  auto const& dW_element_size_byte =
+      DataTypeToTypeMeta(dW.data_type()).itemsize();
   const TensorShape db = out[1];
+  auto const& db_element_size_byte =
+      DataTypeToTypeMeta(db.data_type()).itemsize();
 
   auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
   const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
@@ -111,15 +117,17 @@ OpSchema::Cost CostInferenceForFCGradient(
   uint64_t size_db = nElemFromDim(db);
 
   c.flops = M * N * (2 * K + 1);
-  c.bytes_written = (size_dW + size_db) * sizeof(float);
+  c.bytes_written =
+      size_dW * dW_element_size_byte + size_db * db_element_size_byte;
   c.params_bytes = (K * N + N) * sizeof(float);
 
   if (out.size() == 3) {
     const TensorShape dX = out[2];
     uint64_t size_dX = nElemFromDim(dX);
-
+    auto const& dX_element_size_byte =
+        DataTypeToTypeMeta(dX.data_type()).itemsize();
     c.flops += 2 * M * N * K;
-    c.bytes_written += size_dX * sizeof(float);
+    c.bytes_written += size_dX * dX_element_size_byte;
   }
   return c;
 }
diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc
index c3eaf05db0e8f..55c73a5be22c4 100644
--- a/caffe2/operators/one_hot_ops.cc
+++ b/caffe2/operators/one_hot_ops.cc
@@ -2,6 +2,7 @@
 
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
+#include "caffe2/core/types.h"
 
 namespace caffe2 {
 
@@ -78,12 +79,21 @@ OpSchema::Cost CostInferenceForBatchOneHot(
   const auto& length = in[1];
   const auto& values = in[2];
 
-  uint64_t nBytesData = nElemFromDim(data) * sizeof(data.data_type());
-  uint64_t nBytesLength = nElemFromDim(length) * sizeof(length.data_type());
-  uint64_t nBytesValues = nElemFromDim(values) * sizeof(values.data_type());
+  auto const& data_element_size_byte =
+      DataTypeToTypeMeta(data.data_type()).itemsize();
+  auto const& length_element_size_byte =
+      DataTypeToTypeMeta(length.data_type()).itemsize();
+  auto const& values_element_size_byte =
+      DataTypeToTypeMeta(values.data_type()).itemsize();
+  auto const& output_element_size_byte =
+      DataTypeToTypeMeta(output.data_type()).itemsize();
+
+  uint64_t nBytesData = nElemFromDim(data) * data_element_size_byte;
+  uint64_t nBytesLength = nElemFromDim(length) * length_element_size_byte;
+  uint64_t nBytesValues = nElemFromDim(values) * values_element_size_byte;
   c.flops = 0;
   c.bytes_read = nBytesData + nBytesLength + nBytesValues;
-  c.bytes_written = nElemFromDim(output) * sizeof(output.data_type());
+  c.bytes_written = nElemFromDim(output) * output_element_size_byte;
   c.params_bytes = 0;
   return c;
 }
@@ -145,15 +155,15 @@ bool BatchBucketOneHotOp<CPUContext>::RunOnDevice() {
     for (int64_t j = 0; j < D; j++) {
       // here we assume the boundary values for each feature are sorted
       int64_t lower_bucket_idx = std::lower_bound(
-                                    boundaries_offset,
-                                    boundaries_offset + lens_data[j],
-                                    input_data[pos]) -
+                                     boundaries_offset,
+                                     boundaries_offset + lens_data[j],
+                                     input_data[pos]) -
           boundaries_offset;
 
       int64_t upper_bucket_idx = std::upper_bound(
-                                    boundaries_offset,
-                                    boundaries_offset + lens_data[j],
-                                    input_data[pos]) -
+                                     boundaries_offset,
+                                     boundaries_offset + lens_data[j],
+                                     input_data[pos]) -
           boundaries_offset;
 
       int64_t bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2;
diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc
index 8b5e116024b81..561da9189b388 100644
--- a/caffe2/operators/utility_ops.cc
+++ b/caffe2/operators/utility_ops.cc
@@ -1,6 +1,7 @@
 #include "caffe2/operators/utility_ops.h"
 #include <cmath>
 #include <iostream>
+#include "caffe2/core/types.h"
 #include "caffe2/utils/eigen_utils.h"
 
 namespace caffe2 {
@@ -34,9 +35,11 @@ OpSchema::Cost CostInferenceForWeightedSum(
   const auto& nElem = nElemFromDim(X0);
   const auto& nInputs = in.size();
   c.flops = (nInputs - 1) * nElem;
-  c.bytes_read = (nInputs / 2) * (nElem + 1) * sizeof(X0.data_type());
-  c.bytes_written = nElem * sizeof(X0.data_type());
-  c.params_bytes = (nInputs / 2) * sizeof(X0.data_type());
+  auto const& X0_element_size_byte =
+      DataTypeToTypeMeta(X0.data_type()).itemsize();
+  c.bytes_read = (nInputs / 2) * (nElem + 1) * X0_element_size_byte;
+  c.bytes_written = nElem * X0_element_size_byte;
+  c.params_bytes = (nInputs / 2) * X0_element_size_byte;
   return c;
 }
 
@@ -48,9 +51,7 @@ REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(SumInt, SumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp<CPUContext>);
-REGISTER_CPU_OPERATOR(
-    ScatterWeightedSum,
-    ScatterWeightedSumOp<CPUContext>);
+REGISTER_CPU_OPERATOR(ScatterWeightedSum, ScatterWeightedSumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Scatter, ScatterOp<CPUContext>);
 
diff --git a/caffe2/python/operator_test/concat_op_cost_test.py b/caffe2/python/operator_test/concat_op_cost_test.py
index 996b330be4947..7dab4d6bd5d1f 100644
--- a/caffe2/python/operator_test/concat_op_cost_test.py
+++ b/caffe2/python/operator_test/concat_op_cost_test.py
@@ -7,33 +7,39 @@
 
 class TestConcatOpCost(TestCase):
     def test_columnwise_concat(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
-        workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=np.int32))
-        concat_op = core.CreateOperator(
-            "Concat",
-            ["input_1", "input_2"],
-            ["output", "split_info"],
-        )
-        workspace.RunOperatorOnce(concat_op)
+        def _test_columnwise_concat_for_type(dtype):
+            workspace.ResetWorkspace()
+            workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype))
+            workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=dtype))
+            concat_op = core.CreateOperator(
+                "Concat",
+                ["input_1", "input_2"],
+                ["output", "split_info"],
+            )
+            workspace.RunOperatorOnce(concat_op)
 
-        output = workspace.FetchBlob("output")
-        self.assertTupleEqual(output.shape, (2, 4))
-        np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
+            output = workspace.FetchBlob("output")
+            self.assertTupleEqual(output.shape, (2, 4))
+            np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
 
-        flops, bytes_written, bytes_read = workspace.GetOperatorCost(
-            concat_op, concat_op.input
-        )
+            flops, bytes_written, bytes_read = workspace.GetOperatorCost(
+                concat_op, concat_op.input
+            )
 
-        self.assertEqual(flops, 0)
-        self.assertEqual(
-            bytes_read,
-            sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
-        )
-        self.assertEqual(
-            bytes_written,
-            sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
-        )
+            self.assertEqual(flops, 0)
+            self.assertEqual(
+                bytes_read,
+                sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
+            )
+            self.assertEqual(
+                bytes_written,
+                sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
+            )
+
+        [
+            _test_columnwise_concat_for_type(t)
+            for t in [np.int64, np.float, np.half, np.int8]
+        ]
 
     def test_split_then_concat(self):
         workspace.ResetWorkspace()
diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py
index afb2065027075..1bf7b607e1b7e 100644
--- a/caffe2/python/workspace_test.py
+++ b/caffe2/python/workspace_test.py
@@ -60,7 +60,7 @@ def testGetOperatorCost(self):
         self.assertTupleEqual(
             op_cost,
             namedtuple("Cost", ["flops", "bytes_written", "bytes_read"])(
-                1152, 256, 2084
+                1152, 256, 4168
             ),
         )
 
diff --git a/caffe2/sgd/adagrad_op.cc b/caffe2/sgd/adagrad_op.cc
index 0de50f03e62d5..0b6f604b48cdb 100644
--- a/caffe2/sgd/adagrad_op.cc
+++ b/caffe2/sgd/adagrad_op.cc
@@ -1,4 +1,5 @@
 #include "adagrad_op.h"
+#include "caffe2/core/types.h"
 
 namespace caffe2 {
 
@@ -23,22 +24,30 @@ static OpSchema::Cost CostInferenceForAdagrad(
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   c.flops = grad_size * 10;
 
+  auto const& moment_element_size_byte =
+      DataTypeToTypeMeta(moment.data_type()).itemsize();
+  auto const& param_element_size_byte =
+      DataTypeToTypeMeta(param.data_type()).itemsize();
+  auto const& grad_element_size_byte =
+      DataTypeToTypeMeta(grad.data_type()).itemsize();
+  auto const& lr_element_size_byte =
+      DataTypeToTypeMeta(lr.data_type()).itemsize();
   uint64_t bytes_written =
-      grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
+      grad_size * param_element_size_byte + moment_element_size_byte;
 
   if (output_size == 3) {
     // also need to output effective learning rate in this case
     // assume it's the same data type as lr
-    bytes_written += grad_size * sizeof(lr.data_type());
+    bytes_written += grad_size * lr_element_size_byte;
   } else if (output_size == 4) {
     // also need to output effective learning rate and updates in this case
     // assume update is the same data type as param
     bytes_written +=
-        grad_size * (sizeof(lr.data_type()) + sizeof(param.data_type()));
+        grad_size * (lr_element_size_byte + param_element_size_byte);
   }
   c.bytes_written = bytes_written;
   c.bytes_read = c.bytes_written +
-      grad_size * (sizeof(grad.data_type()) + sizeof(lr.data_type()));
+      grad_size * (grad_element_size_byte + lr_element_size_byte);
 
   return c;
 }
@@ -102,10 +111,18 @@ static OpSchema::Cost CostInferenceForSparseAdagrad(
   // (optimistically count sqrt as one flop).
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   c.flops = grad_size * 7;
+  auto const& param_element_size_byte =
+      DataTypeToTypeMeta(param.data_type()).itemsize();
+  auto const& moment_element_size_byte =
+      DataTypeToTypeMeta(moment.data_type()).itemsize();
   c.bytes_written =
-      grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
-  c.bytes_read = c.bytes_written + grad_size * sizeof(grad.data_type()) +
-      n * sizeof(indices.data_type());
+      grad_size * (param_element_size_byte + moment_element_size_byte);
+  auto const& grad_element_size_byte =
+      DataTypeToTypeMeta(grad.data_type()).itemsize();
+  auto const& indices_element_size_byte =
+      DataTypeToTypeMeta(indices.data_type()).itemsize();
+  c.bytes_read = c.bytes_written + grad_size * grad_element_size_byte +
+      n * indices_element_size_byte;
 
   return c;
 }
@@ -153,6 +170,16 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
   OpSchema::Cost c;
 
   if (n > 0) {
+    auto const& param_element_size_byte =
+        DataTypeToTypeMeta(param.data_type()).itemsize();
+    auto const& moment_element_size_byte =
+        DataTypeToTypeMeta(moment.data_type()).itemsize();
+    auto const& grad_element_size_byte =
+        DataTypeToTypeMeta(grad.data_type()).itemsize();
+    auto const& indices_element_size_byte =
+        DataTypeToTypeMeta(indices.data_type()).itemsize();
+    auto const& lr_element_size_byte =
+        DataTypeToTypeMeta(lr.data_type()).itemsize();
     auto block_size = grad_size / n;
     if (block_size == 1) {
       // +2: applying weight decay and add to grads
@@ -161,22 +188,22 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
       // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
       c.flops = n * 9;
       c.bytes_written =
-          n * (sizeof(param.data_type()) + sizeof(moment.data_type()));
+          n * (param_element_size_byte + moment_element_size_byte);
       c.bytes_read = c.bytes_written +
           n *
-              (sizeof(grad.data_type()) + sizeof(indices.data_type()) +
-               sizeof(lr.data_type()));
+              (grad_element_size_byte + indices_element_size_byte +
+               lr_element_size_byte);
     } else {
       // 5 per block (not counting index transforms)
       // 8 for each value of a block
       // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
       c.flops = n * (5 + (block_size * 8));
-      c.bytes_written =
-          n * sizeof(moment.data_type()) + n * block_size * (param.data_type());
+      c.bytes_written = n * moment_element_size_byte +
+          n * block_size * param_element_size_byte;
 
-      c.bytes_read = c.bytes_written + n * (sizeof(lr.data_type())) +
+      c.bytes_read = c.bytes_written + n * lr_element_size_byte +
           2 * n * block_size *
-              (sizeof(grad.data_type()) + sizeof(param.data_type()));
+              (grad_element_size_byte + param_element_size_byte);
     }
   }
   return c;

From f3e329cbec5f4f32e195bbe3b8b5b4d2b1323128 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Mon, 30 Aug 2021 13:10:23 -0700
Subject: [PATCH 356/530] Implements the orthogonal parametrization (#62089)

Summary:
Implements an orthogonal / unitary parametrisation.

It does passes the tests and I have trained a couple models with this implementation, so I believe it should be somewhat correct. Now, the implementation is very subtle. I'm tagging nikitaved  and IvanYashchuk as reviewers in case they have comments / they see some room for optimisation of the code, in particular of the `forward` function.

Fixes https://github.com/pytorch/pytorch/issues/42243

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62089

Reviewed By: ezyang

Differential Revision: D30639063

Pulled By: albanD

fbshipit-source-id: 988664f333ac7a75ce71ba44c8d77b986dff2fe6
---
 docs/source/nn.rst                 |   3 +-
 test/test_nn.py                    | 133 +++++++++++++
 torch/nn/utils/parametrizations.py | 298 +++++++++++++++++++++++++++--
 torch/nn/utils/parametrize.py      |  73 ++++---
 4 files changed, 464 insertions(+), 43 deletions(-)

diff --git a/docs/source/nn.rst b/docs/source/nn.rst
index 07ce4db2f48af..6eca9d4b16b6a 100644
--- a/docs/source/nn.rst
+++ b/docs/source/nn.rst
@@ -389,6 +389,7 @@ in :func:`torch.nn.utils.parameterize.register_parametrization`.
     :toctree: generated
     :nosignatures:
 
+    parametrizations.orthogonal
     parametrizations.spectral_norm
 
 Utility functions to parametrize Tensors on existing Modules.
@@ -396,7 +397,7 @@ Note that these functions can be used to parametrize a given Parameter
 or Buffer given a specific function that maps from an input space to the
 parametrized space. They are not parameterizations that would transform
 an object into a parameter. See the
-`Parametrizations <https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html>`__ tutorial
+`Parametrizations tutorial <https://pytorch.org/tutorials/intermediate/parametrizations.html>`_
 for more information on how to implement your own parametrizations.
 
 .. autosummary::
diff --git a/test/test_nn.py b/test/test_nn.py
index c9815dbf2ee0e..c6d0e78044126 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -4518,6 +4518,139 @@ def test_spectral_norm_pickle(self):
         m = pickle.loads(pickle.dumps(m))
         self.assertIsInstance(m, nn.Linear)
 
+    def test_orthogonal_parametrization(self):
+        # Orthogonal implements 6 algorithms (3x parametrizations times 2 options of use_trivialization)
+
+        def assert_is_orthogonal(X):
+            n, k = X.size(-2), X.size(-1)
+            if n < k:
+                X = X.transpose(-2, -1)
+                n, k = k, n
+            Id = torch.eye(k, dtype=X.dtype, device=X.device).expand(*(X.size()[:-2]), k, k)
+            eps = 10 * n * torch.finfo(X.dtype).eps
+            torch.testing.assert_allclose(X.transpose(-2, -1).conj() @ X, Id, atol=eps, rtol=0.)
+
+
+        def assert_weight_allclose_Q(weight, W):
+            # Test that weight is equal to the Q part of the QR decomposition of W
+            # (or of its transpose if the matrix is wide)
+            wide_matrix = W.size(-2) < W.size(-1)
+            if wide_matrix:
+                W = W.transpose(-2, -1)
+            Q, R = torch.linalg.qr(W)
+            Q *= R.diagonal(dim1=-2, dim2=-1).sgn().unsqueeze(-2)
+            if wide_matrix:
+                Q = Q.transpose(-2, -1)
+            torch.testing.assert_allclose(Q, weight, atol=1e-5, rtol=0.)
+
+
+        for shape, dtype, use_linear in product(((4, 4), (5, 3), (3, 5)),  # square/ tall / wide
+                                                (torch.float32, torch.complex64),
+                                                (True, False)):
+            # Conv2d does not support complex yet
+            if not use_linear and dtype.is_complex:
+                continue
+
+            if use_linear:
+                input = torch.randn(3, shape[0], dtype=dtype)
+            else:
+                input = torch.randn(2, 2, shape[0] + 2, shape[1] + 1, dtype=dtype)
+
+            for parametrization, use_trivialization in product(("matrix_exp", "cayley", "householder"),
+                                                               (False, True)):
+                # right_inverse for Cayley and matrix_exp not implemented for use_trivialization=False
+                # See Note [right_inverse expm cayley]
+                can_initialize = use_trivialization or parametrization == "householder"
+
+                # We generate them every time to always start with fresh weights
+                if use_linear:
+                    m = nn.Linear(*shape, dtype=dtype)
+                else:
+                    m = nn.Conv2d(2, 3, shape, dtype=dtype)
+
+                # We do not support householder for complex inputs
+                # See Note [Householder complex]
+                w_init = m.weight.clone()
+                if parametrization == "householder" and m.weight.is_complex():
+                    msg = "householder parametrization does not support complex tensors"
+                    with self.assertRaisesRegex(ValueError, msg):
+                        torch.nn.utils.parametrizations.orthogonal(m,
+                                                                   "weight",
+                                                                   parametrization,
+                                                                   use_trivialization=use_trivialization)
+                    continue
+
+                wide_matrix = w_init.size(-2) < w_init.size(-1)
+                torch.nn.utils.parametrizations.orthogonal(m,
+                                                           "weight",
+                                                           parametrization,
+                                                           use_trivialization=use_trivialization)
+                # Forwards works as expected
+                self.assertEqual(w_init.shape, m.weight.shape)
+                assert_is_orthogonal(m.weight)
+                if can_initialize:
+                    assert_weight_allclose_Q(m.weight, w_init)
+
+                # Intializing with a given orthogonal matrix works
+                X = torch.randn_like(m.weight)
+                if wide_matrix:
+                    X = X.transpose(-2, -1)
+                w_new = torch.linalg.qr(X).Q
+                if wide_matrix:
+                    w_new = w_new.transpose(-2, -1)
+                if can_initialize:
+                    m.weight = w_new
+                    torch.testing.assert_allclose(w_new, m.weight, atol=1e-5, rtol=0.)
+                else:
+                    msg = "assign to the matrix exponential or the Cayley parametrization"
+                    with self.assertRaisesRegex(NotImplementedError, msg):
+                        m.weight = w_new
+
+                # Intializing with a non-orthogonal matrix makes m.weight be the Q part of the given matrix
+                w_new = torch.randn_like(m.weight)
+                if can_initialize:
+                    m.weight = w_new
+                    assert_weight_allclose_Q(m.weight, w_new)
+                else:
+                    msg = "assign to the matrix exponential or the Cayley parametrization"
+                    with self.assertRaisesRegex(NotImplementedError, msg):
+                        m.weight = w_new
+
+                opt = torch.optim.SGD(m.parameters(), lr=0.1)
+                for _ in range(2):
+                    opt.zero_grad()
+                    m(input).norm().backward()
+                    grad = m.parametrizations.weight.original.grad
+                    self.assertIsNotNone(grad)
+                    # We do not update the upper triangular part of the matrix if tall tril if wide
+                    if grad.size(-2) >= grad.size(-1):
+                        zeros_grad = grad.triu(1)
+                    else:
+                        zeros_grad = grad.tril(-1)
+                    self.assertEqual(zeros_grad, torch.zeros_like(zeros_grad))
+                    # The gradient in the diagonal can only be imaginary because a skew-Hermitian
+                    # matrix has imaginary diagonal
+                    diag_grad = grad.diagonal(dim1=-2, dim2=-1)
+                    if grad.is_complex():
+                        diag_grad = diag_grad.real
+                    self.assertEqual(diag_grad, torch.zeros_like(diag_grad))
+                    opt.step()
+                    assert_is_orthogonal(m.weight)
+
+    def test_orthogonal_errors(self):
+        m = nn.Linear(3, 4)
+        with self.assertRaisesRegex(ValueError, "has to be one of"):
+            torch.nn.utils.parametrizations.orthogonal(m, "weight", "foo")
+
+        with self.assertRaisesRegex(ValueError, "Expected a matrix"):
+            torch.nn.utils.parametrizations.orthogonal(m, "bias")
+
+        torch.nn.utils.parametrizations.orthogonal(m, "weight")
+        with self.assertRaisesRegex(ValueError, "matrices of shape"):
+            m.weight = torch.randn(5, 5)
+        torch.nn.utils.parametrize.remove_parametrizations(m, "weight")
+
+
     def test_threshold_int(self):
         x = torch.tensor([-3, -2, -1, 0, 1, 2, 3])
         expected = torch.tensor([99, 99, 99, 99, 1, 2, 3])
diff --git a/torch/nn/utils/parametrizations.py b/torch/nn/utils/parametrizations.py
index de3d5c7144f9e..de67aa814f39c 100644
--- a/torch/nn/utils/parametrizations.py
+++ b/torch/nn/utils/parametrizations.py
@@ -1,10 +1,286 @@
+from enum import Enum, auto
+
 import torch
+from torch import Tensor
 from ..utils import parametrize
 from ..modules import Module
 from .. import functional as F
 
 from typing import Optional
 
+
+def _is_orthogonal(Q, eps=None):
+    n, k = Q.size(-2), Q.size(-1)
+    Id = torch.eye(k, dtype=Q.dtype, device=Q.device)
+    # A reasonable eps, but not too large
+    eps = 10. * n * torch.finfo(Q.dtype).eps
+    return torch.allclose(Q.transpose(-2, -1).conj() @ Q, Id, atol=eps)
+
+
+def _make_orthogonal(A):
+    """ Assume that A is a tall matrix.
+    Compute the Q factor s.t. A = QR (A may be complex) and diag(R) is real and non-negative
+    """
+    X, tau = torch.geqrf(A)
+    Q = torch.linalg.householder_product(X, tau)
+    # The diagonal of X is the diagonal of R (which is always real) so we normalise by its signs
+    Q *= X.diagonal(dim1=-2, dim2=-1).sgn().unsqueeze(-2)
+    return Q
+
+
+class _OrthMaps(Enum):
+    matrix_exp = auto()
+    cayley = auto()
+    householder = auto()
+
+
+class _Orthogonal(Module):
+    base: Tensor
+
+    def __init__(self,
+                 weight,
+                 orthogonal_map: _OrthMaps,
+                 *,
+                 use_trivialization=True) -> None:
+        super().__init__()
+
+        # Note [Householder complex]
+        # For complex tensors, it is not possible to compute the tensor `tau` necessary for
+        # linalg.householder_product from the reflectors.
+        # To see this, note that the reflectors have a shape like:
+        # 0 0 0
+        # * 0 0
+        # * * 0
+        # which, for complex matrices, give n(n-1) (real) parameters. Now, you need n^2 parameters
+        # to parametrize the unitary matrices. Saving tau on its own does not work either, because
+        # not every combination of `(A, tau)` gives a unitary matrix, meaning that if we optimise
+        # them as independent tensors we would not maintain the constraint
+        # An equivalent reasoning holds for rectangular matrices
+        if weight.is_complex() and orthogonal_map == _OrthMaps.householder:
+            raise ValueError("The householder parametrization does not support complex tensors.")
+
+        self.shape = weight.shape
+        self.orthogonal_map = orthogonal_map
+        if use_trivialization:
+            self.register_buffer("base", None)
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        n, k = X.size(-2), X.size(-1)
+        transposed = n < k
+        if transposed:
+            X = X.transpose(-2, -1)
+            n, k = k, n
+        # Here n > k and X is a tall matrix
+        if self.orthogonal_map == _OrthMaps.matrix_exp or self.orthogonal_map == _OrthMaps.cayley:
+            # We just need n x k - k(k-1)/2 parameters
+            X = X.tril()
+            if n != k:
+                # Embed into a square matrix
+                X = torch.cat([X, X.new_zeros(n, n - k).expand(*X.shape[:-2], -1, -1)], dim=-1)
+            A = X - X.transpose(-2, -1).conj()
+            # A is skew-symmetric (or skew-hermitian)
+            if self.orthogonal_map == _OrthMaps.matrix_exp:
+                Q = torch.matrix_exp(A)
+            elif self.orthogonal_map == _OrthMaps.cayley:
+                # Computes the Cayley retraction (I+A/2)(I-A/2)^{-1}
+                Id = torch.eye(n, dtype=A.dtype, device=A.device)
+                Q = torch.linalg.solve(torch.add(Id, A, alpha=-0.5), torch.add(Id, A, alpha=0.5))
+            # Q is now orthogonal (or unitary) of size (..., n, n)
+            if n != k:
+                Q = Q[..., :k]
+            # Q is now the size of the X (albeit perhaps transposed)
+        else:
+            # X is real here, as we do not support householder with complex numbers
+            A = X.tril(diagonal=-1)
+            tau = 2. / (1. + (A * A).sum(dim=-2))
+            Q = torch.linalg.householder_product(A, tau)
+            # The diagonal of X is 1's and -1's
+            # We do not want to differentiate through this or update the diagonal of X hence the casting
+            Q = Q * X.diagonal(dim1=-2, dim2=-1).int().unsqueeze(-2)
+
+        if hasattr(self, "base"):
+            Q = self.base @ Q
+        if transposed:
+            Q = Q.transpose(-2, -1)
+        return Q
+
+    @torch.autograd.no_grad()
+    def right_inverse(self, Q: torch.Tensor) -> torch.Tensor:
+        if Q.shape != self.shape:
+            raise ValueError(f"Expected a matrix or batch of matrices of shape {self.shape}. "
+                             f"Got a tensor of shape {Q.shape}.")
+
+        Q_init = Q
+        n, k = Q.size(-2), Q.size(-1)
+        transpose = n < k
+        if transpose:
+            Q = Q.transpose(-2, -1)
+            n, k = k, n
+
+        # We always make sure to always copy Q in every path
+        if not hasattr(self, "base"):
+            # Note [right_inverse expm cayley]
+            # If we do not have use_trivialization=True, we just implement the inverse of the forward
+            # map for the Householder. To see why, think that for the Cayley map,
+            # we would need to find the matrix X \in R^{n x k} such that:
+            # Y = torch.cat([X.tril(), X.new_zeros(n, n - k).expand(*X.shape[:-2], -1, -1)], dim=-1)
+            # A = Y - Y.transpose(-2, -1).conj()
+            # cayley(A)[:, :k]
+            # gives the original tensor. It is not clear how to do this.
+            # Perhaps via some algebraic manipulation involving the QR like that of
+            # Corollary 2.2 in Edelman, Arias and Smith?
+            if self.orthogonal_map == _OrthMaps.cayley or self.orthogonal_map == _OrthMaps.matrix_exp:
+                raise NotImplementedError("It is not possible to assign to the matrix exponential "
+                                          "or the Cayley parametrizations when use_trivialization=False.")
+
+            # If parametrization == _OrthMaps.householder, make Q orthogonal via the QR decomposition.
+            # Here Q is always real because we do not support householder and complex matrices.
+            # See note [Householder complex]
+            A, tau = torch.geqrf(Q)
+            # We want to have a decomposition X = QR with diag(R) > 0, as otherwise we could
+            # decompose an orthogonal matrix Q as Q = (-Q)@(-Id), which is a valid QR decomposition
+            # The diagonal of Q is the diagonal of R from the qr decomposition
+            A.diagonal(dim1=-2, dim2=-1).sign_()
+            # Equality with zero is ok because LAPACK returns exactly zero when it does not want
+            # to use a particular reflection
+            A.diagonal(dim1=-2, dim2=-1)[tau == 0.] *= -1
+            return A.transpose(-2, -1) if transpose else A
+        else:
+            if n == k:
+                # We check whether Q is orthogonal
+                if not _is_orthogonal(Q):
+                    Q = _make_orthogonal(Q)
+                else:  # Is orthogonal
+                    Q = Q.clone()
+            else:
+                # Complete Q into a full n x n orthogonal matrix
+                N = torch.randn(*(Q.size()[:-2] + (n, n - k)), dtype=Q.dtype, device=Q.device)
+                Q = torch.cat([Q, N], dim=-1)
+                Q = _make_orthogonal(Q)
+            self.base = Q
+
+            # It is necessary to return the -Id, as we use the diagonal for the
+            # Householder parametrization. Using -Id makes:
+            # householder(torch.zeros(m,n)) == torch.eye(m,n)
+            # Poor man's version of eye_like
+            neg_Id = torch.zeros_like(Q_init)
+            neg_Id.diagonal(dim1=-2, dim2=-1).fill_(-1.)
+            return neg_Id
+
+
+def orthogonal(module: Module,
+               name: str = 'weight',
+               orthogonal_map: Optional[str] = None,
+               *,
+               use_trivialization: bool = True) -> Module:
+    r"""Applies an orthogonal or unitary parametrization to a matrix or a batch of matrices.
+
+    Letting :math:`\mathbb{K}` be :math:`\mathbb{R}` or :math:`\mathbb{C}`, the parametrized
+    matrix :math:`Q \in \mathbb{K}^{m \times n}` is **orthogonal** as
+
+    .. math::
+
+        \begin{align*}
+            Q^{\text{H}}Q &= \mathrm{I}_n \mathrlap{\qquad \text{if }m \geq n}\\
+            QQ^{\text{H}} &= \mathrm{I}_m \mathrlap{\qquad \text{if }m < n}
+        \end{align*}
+
+    where :math:`Q^{\text{H}}` is the conjugate transpose when :math:`Q` is complex
+    and the transpose when :math:`Q` is real-valued, and
+    :math:`\mathrm{I}_n` is the `n`-dimensional identity matrix.
+    In plain words, :math:`Q` will have orthonormal columns whenever :math:`m \geq n`
+    and orthonormal rows otherwise.
+
+    If the tensor has more than two dimensions, we consider it as a batch of matrices of shape `(..., m, n)`.
+
+    The matrix :math:`Q` may be parametrized via three different ``orthogonal_map`` in terms of the original tensor:
+
+    - ``"matrix_exp"``/``"cayley"``:
+      the :func:`~torch.matrix_exp` :math:`Q = \exp(A)` and the `Cayley map`_
+      :math:`Q = (\mathrm{I}_n + A/2)(\mathrm{I}_n - A/2)^{-1}` are applied to a skew-symmetric
+      :math:`A` to give an orthogonal matrix.
+    - ``"householder"``: computes a product of Householder reflectors
+      (:func:`~torch.linalg.householder_product`).
+
+    ``"matrix_exp"``/``"cayley"`` often make the parametrized weight converge faster than
+    ``"householder"``, but they are slower to compute for very thin or very wide matrices.
+
+    If ``use_trivialization=True`` (default), the parametrization implements the "Dynamic Trivialization Framework",
+    where an extra matrix :math:`B \in \mathbb{K}^{n \times n}` is stored under
+    ``module.parametrizations.weight[0].base``. This helps the
+    convergence of the parametrized layer at the expense of some extra memory use.
+    See `Trivializations for Gradient-Based Optimization on Manifolds`_ .
+
+    Initial value of :math:`Q`:
+    If the original tensor is not parametrized and ``use_trivialization=True`` (default), the initial value
+    of :math:`Q` is that of the original tensor if it is orthogonal (or unitary in the complex case)
+    and it is orthogonalized via the QR decomposition otherwise (see :func:`torch.linalg.qr`).
+    Same happens when it is not parametrized and ``orthogonal_map="householder"`` even when ``use_trivialization=False``.
+    Otherwise, the initial value is the result of the composition of all the registered
+    parametrizations applied to the original tensor.
+
+    .. note::
+        This function is implemented using the parametrization functionality
+        in :func:`~torch.nn.utils.parametrize.register_parametrization`.
+
+
+    .. _`Cayley map`: https://en.wikipedia.org/wiki/Cayley_transform#Matrix_map
+    .. _`Trivializations for Gradient-Based Optimization on Manifolds`: https://arxiv.org/abs/1909.09501
+
+    Args:
+        module (nn.Module): module on which to register the parametrization.
+        name (str, optional): name of the tensor to make orthogonal. Default: ``"weight"``.
+        orthogonal_map (str, optional): One of the following: ``"matrix_exp"``, ``"cayley"``, ``"householder"``.
+            Default: ``"matrix_exp"`` if the matrix is square or complex, ``"householder"`` otherwise.
+        use_trivialization (bool, optional): whether to use the dynamic trivialization framework.
+            Default: ``True``.
+
+    Returns:
+        The original module with an orthogonal parametrization registered to the specified
+        weight
+
+    Example::
+
+        >>> orth_linear = orthogonal(nn.Linear(20, 40))
+        >>> orth_linear
+        ParametrizedLinear(
+        in_features=20, out_features=40, bias=True
+        (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+            (0): _Orthogonal()
+            )
+        )
+        )
+        >>> Q = orth_linear.weight
+        >>> torch.dist(Q.T @ Q, torch.eye(20))
+        tensor(4.9332e-07)
+    """
+    weight = getattr(module, name, None)
+    if not isinstance(weight, Tensor):
+        raise ValueError(
+            "Module '{}' has no parameter ot buffer with name '{}'".format(module, name)
+        )
+
+    # We could implement this for 1-dim tensors as the maps on the sphere
+    # but I believe it'd bite more people than it'd help
+    if weight.ndim < 2:
+        raise ValueError("Expected a matrix or batch of matrices. "
+                         f"Got a tensor of {weight.ndim} dimensions.")
+
+    if orthogonal_map is None:
+        orthogonal_map = "matrix_exp" if weight.size(-2) == weight.size(-1) or weight.is_complex() else "householder"
+
+    orth_enum = getattr(_OrthMaps, orthogonal_map, None)
+    if orth_enum is None:
+        raise ValueError('orthogonal_map has to be one of "matrix_exp", "cayley", "householder". '
+                         f'Got: {orthogonal_map}')
+    orth = _Orthogonal(weight,
+                       orth_enum,
+                       use_trivialization=use_trivialization)
+    parametrize.register_parametrization(module, name, orth, unsafe=True)
+    return module
+
+
 class _SpectralNorm(Module):
     def __init__(
         self,
@@ -147,8 +423,8 @@ def spectral_norm(module: Module,
     .. _`Spectral Normalization for Generative Adversarial Networks`: https://arxiv.org/abs/1802.05957
 
     .. note::
-        This function is implemented using the new parametrization functionality
-        in :func:`torch.nn.utils.parametrize.register_parametrization`. It is a
+        This function is implemented using the parametrization functionality
+        in :func:`~torch.nn.utils.parametrize.register_parametrization`. It is a
         reimplementation of :func:`torch.nn.utils.spectral_norm`.
 
     .. note::
@@ -165,13 +441,13 @@ def spectral_norm(module: Module,
 
     Args:
         module (nn.Module): containing module
-        name (str, optional): name of weight parameter
+        name (str, optional): name of weight parameter. Default: ``"weight"``.
         n_power_iterations (int, optional): number of power iterations to
-            calculate spectral norm
+            calculate spectral norm. Default: ``1``.
         eps (float, optional): epsilon for numerical stability in
-            calculating norms
-        dim (int, optional): dimension corresponding to number of outputs,
-            the default is ``0``, except for modules that are instances of
+            calculating norms. Default: ``1e-12``.
+        dim (int, optional): dimension corresponding to number of outputs.
+            Default: ``0``, except for modules that are instances of
             ConvTranspose{1,2,3}d, when it is ``1``
 
     Returns:
@@ -193,13 +469,11 @@ def spectral_norm(module: Module,
         >>> torch.linalg.matrix_norm(snm.weight, 2)
         tensor(1.0000, grad_fn=<CopyBackwards>)
     """
-    if not hasattr(module, name):
+    weight = getattr(module, name, None)
+    if not isinstance(weight, Tensor):
         raise ValueError(
-            "Module '{}' has no attribute with name '{}'".format(module, name)
+            "Module '{}' has no parameter or buffer with name '{}'".format(module, name)
         )
-    # getattr should get the correct parametrized weight if there
-    # is already an parametrization registered
-    weight = getattr(module, name)
 
     if dim is None:
         if isinstance(module, (torch.nn.ConvTranspose1d,
diff --git a/torch/nn/utils/parametrize.py b/torch/nn/utils/parametrize.py
index 332fe762b8309..d8f2a947352cf 100644
--- a/torch/nn/utils/parametrize.py
+++ b/torch/nn/utils/parametrize.py
@@ -129,8 +129,11 @@ def __init__(
             new = original
             for module in reversed(self):  # type: ignore[call-overload]
                 if hasattr(module, "right_inverse"):
-                    new = module.right_inverse(new)
-                # else, we assume that right_inverse is the identity
+                    try:
+                        new = module.right_inverse(new)
+                    except NotImplementedError:
+                        pass
+                # else, or if it throws, we assume that right_inverse is the identity
 
         if not isinstance(new, Tensor) and not isinstance(new, collections.abc.Sequence):
             raise ValueError("'right_inverse' must return a Tensor or a Sequence of tensors (list, tuple...). "
@@ -209,7 +212,9 @@ def right_inverse(self, value: Tensor) -> None:
             for module in reversed(self):  # type: ignore[call-overload]
                 if hasattr(module, "right_inverse"):
                     value = module.right_inverse(value)
-                # else we assume that right_inverse is the identity
+                else:
+                    raise RuntimeError(f"parametrization {type(module).__name__} does not implement "
+                                       "right_inverse.")
             if self.is_tensor:
                 # These exceptions should only throw when a right_inverse function does not
                 # return the same dtype for every input, which should most likely be caused by a bug
@@ -372,16 +377,12 @@ def register_parametrization(
 
         def right_inverse(self, X: Tensor) -> Union[Tensor, Sequence[Tensor]]
 
-    If this method is not implemented, it defaults to the identity.
     This method is called on the unparametrized tensor when the first parametrization
-    is registered.
+    is registered to compute the initial value of the original tensor.
+    If this method is not implemented, the original tensor will be just the unparametrized tensor.
 
-    In most situations, ``right_inverse`` will be a function such that
-    ``forward(right_inverse(X)) == X`` (see
-    `right inverse <https://en.wikipedia.org/wiki/Inverse_function#Right_inverses>`_).
-    Sometimes, when the parametrization is not surjective, it may be reasonable
-    to relax this.
-    This may be used to initialize the tensor, as shown in the example below.
+    If all the parametrizations registered on a tensor implement `right_inverse` it is possible
+    to initialize a parametrized tensor by assigning to it, as shown in the example below.
 
     It is possible for the first parametrization to depend on several inputs.
     This may be implemented returning a tuple of tensors from ``right_inverse``
@@ -397,6 +398,14 @@ def right_inverse(self, X: Tensor) -> Union[Tensor, Sequence[Tensor]]
         If unsafe=True, then right_inverse will be called if the tensor is not parametrized,
         and nothing will be called otherwise.
 
+    .. note::
+
+        In most situations, ``right_inverse`` will be a function such that
+        ``forward(right_inverse(X)) == X`` (see
+        `right inverse <https://en.wikipedia.org/wiki/Inverse_function#Right_inverses>`_).
+        Sometimes, when the parametrization is not surjective, it may be reasonable
+        to relax this.
+
     .. warning::
 
         If a parametrization depends on several inputs, :func:`~register_parametrization`
@@ -483,25 +492,29 @@ def right_inverse(self, X: Tensor) -> Union[Tensor, Sequence[Tensor]]
                     f"parametrization(module.{tensor_name}).shape: {X.shape}"
                 )
             if hasattr(parametrization, "right_inverse"):
-                Z = parametrization.right_inverse(X)  # type: ignore[operator]
-                if not isinstance(Z, Tensor):
-                    raise ValueError(
-                        f"parametrization.right_inverse must return a tensor. Got: {type(Z).__name__}"
-                    )
-                if Z.dtype != Y.dtype:
-                    raise ValueError(
-                        "The tensor returned by parametrization.right_inverse must have the same dtype "
-                        f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n"
-                        f"module.{tensor_name}.dtype: {Y.dtype}\n"
-                        f"returned dtype: {Z.dtype}"
-                    )
-                if Z.shape != Y.shape:
-                    raise ValueError(
-                        "The tensor returned by parametrization.right_inverse must have the same shape "
-                        f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n"
-                        f"module.{tensor_name}.shape: {Y.shape}\n"
-                        f"returned shape: {Z.shape}"
-                    )
+                try:
+                    Z = parametrization.right_inverse(X)  # type: ignore[operator]
+                except NotImplementedError:
+                    pass
+                else:
+                    if not isinstance(Z, Tensor):
+                        raise ValueError(
+                            f"parametrization.right_inverse must return a tensor. Got: {type(Z).__name__}"
+                        )
+                    if Z.dtype != Y.dtype:
+                        raise ValueError(
+                            "The tensor returned by parametrization.right_inverse must have the same dtype "
+                            f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n"
+                            f"module.{tensor_name}.dtype: {Y.dtype}\n"
+                            f"returned dtype: {Z.dtype}"
+                        )
+                    if Z.shape != Y.shape:
+                        raise ValueError(
+                            "The tensor returned by parametrization.right_inverse must have the same shape "
+                            f"as module.{tensor_name}, unless the `unsafe` flag is enabled.\n"
+                            f"module.{tensor_name}.shape: {Y.shape}\n"
+                            f"returned shape: {Z.shape}"
+                        )
             # else right_inverse is assumed to be the identity
 
         # add the new parametrization to the parametrization list

From b9933f08b985f9105e00804d5c99016841bd4cc7 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Mon, 30 Aug 2021 13:26:00 -0700
Subject: [PATCH 357/530] Fix type annotation in tools/nightly.py (#64202)

Summary:
`tempfile.TemporaryDirectory` is a generic only in python-3.9 and above

Workaround by wrapping type annotation in quotes

Fixes https://github.com/pytorch/pytorch/issues/64017

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64202

Reviewed By: janeyx99

Differential Revision: D30644215

Pulled By: malfet

fbshipit-source-id: 3c16240b9fa899bd4d572c1732a7d87d3dd0fbd5
---
 tools/nightly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/nightly.py b/tools/nightly.py
index 0b387e3b32dcf..7a46a011d232a 100755
--- a/tools/nightly.py
+++ b/tools/nightly.py
@@ -324,7 +324,7 @@ def deps_install(deps: List[str], existing_env: bool, env_opts: List[str]) -> No
 
 
 @timed("Installing pytorch nightly binaries")
-def pytorch_install(url: str) -> tempfile.TemporaryDirectory[str]:
+def pytorch_install(url: str) -> "tempfile.TemporaryDirectory[str]":
     """"Install pytorch into a temporary directory"""
     pytdir = tempfile.TemporaryDirectory()
     cmd = ["conda", "create", "--yes", "--no-deps", "--prefix", pytdir.name, url]

From 85df73658ca38e894542e649bd053f269e77880a Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Mon, 30 Aug 2021 13:29:51 -0700
Subject: [PATCH 358/530] Make name() part of IMethod interface (#63995)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63995

JIT methods already have name() in their interface, and Py methods have names in their implementation.  I'm adding this for a particular case where someone tried to use name() on a JIT method that we're replacing with an IMethod.

Test Plan: add case to imethod API test

Reviewed By: suo

Differential Revision: D30559401

fbshipit-source-id: 76236721f5cd9a9d9d488ddba12bfdd01d679a2c
---
 test/cpp/api/imethod.cpp               | 3 +++
 torch/csrc/api/include/torch/imethod.h | 2 ++
 torch/csrc/deploy/deploy.h             | 4 ++++
 torch/csrc/jit/api/method.h            | 2 +-
 4 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/test/cpp/api/imethod.cpp b/test/cpp/api/imethod.cpp
index 8673e55fb5629..b8c12c649fd19 100644
--- a/test/cpp/api/imethod.cpp
+++ b/test/cpp/api/imethod.cpp
@@ -28,6 +28,9 @@ TEST(IMethodTest, CallMethod) {
   auto pyModel = package.load_pickle("model", "model.pkl");
   torch::deploy::PythonMethodWrapper pyMethod(pyModel, "forward");
 
+  EXPECT_EQ(scriptMethod.name(), "forward");
+  EXPECT_EQ(pyMethod.name(), "forward");
+
   auto input = torch::ones({10, 20});
   auto outputPy = pyMethod({input});
   auto outputScript = scriptMethod({input});
diff --git a/torch/csrc/api/include/torch/imethod.h b/torch/csrc/api/include/torch/imethod.h
index af010785a8016..5ab9b83888214 100644
--- a/torch/csrc/api/include/torch/imethod.h
+++ b/torch/csrc/api/include/torch/imethod.h
@@ -28,6 +28,8 @@ class TORCH_API IMethod {
       std::vector<c10::IValue> args,
       const IValueMap& kwargs = IValueMap()) const = 0;
 
+  virtual const std::string& name() const = 0;
+
   // Returns an ordered list of argument names, possible in both
   // script and python methods.  This is a more portable dependency
   // than a ScriptMethod FunctionSchema, which has more information
diff --git a/torch/csrc/deploy/deploy.h b/torch/csrc/deploy/deploy.h
index 20364797edd8a..f34e4bc5fdbcc 100644
--- a/torch/csrc/deploy/deploy.h
+++ b/torch/csrc/deploy/deploy.h
@@ -232,6 +232,10 @@ class PythonMethodWrapper : public torch::IMethod {
       std::string method_name)
       : model_(std::move(model)), method_name_(std::move(method_name)) {}
 
+  const std::string& name() const override {
+    return method_name_;
+  }
+
   c10::IValue operator()(
       std::vector<c10::IValue> args,
       const IValueMap& kwargs = IValueMap()) const override {
diff --git a/torch/csrc/jit/api/method.h b/torch/csrc/jit/api/method.h
index bcd44a1df343a..3fcc4421891a0 100644
--- a/torch/csrc/jit/api/method.h
+++ b/torch/csrc/jit/api/method.h
@@ -46,7 +46,7 @@ struct TORCH_API Method : public torch::IMethod {
     return function_->graph();
   }
 
-  const std::string& name() const {
+  const std::string& name() const override {
     return function_->name();
   }
 

From 9035a1cb4d6fd927b04d8491cd0e8e073ee22025 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Mon, 30 Aug 2021 13:55:19 -0700
Subject: [PATCH 359/530] .github: Adding configuration for docs_test (#64201)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64201

Adds docs_test to our existing test matrix for github actions

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D30646765

Pulled By: seemethere

fbshipit-source-id: 946adae01ff1f1f7ebe626e408e161b77b19a011
---
 .github/scripts/generate_ci_workflows.py                    | 2 ++
 .github/scripts/generate_pytorch_test_matrix.py             | 2 ++
 .github/templates/linux_ci_workflow.yml.j2                  | 2 +-
 .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml          | 2 +-
 .../generated-linux-bionic-py3.8-gcc9-coverage.yml          | 2 +-
 .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml          | 2 +-
 .../generated-linux-xenial-cuda11.3-py3.6-gcc7.yml          | 2 +-
 .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml   | 2 +-
 .../generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 2 +-
 .jenkins/pytorch/test.sh                                    | 6 ++++++
 10 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index f1819dbac589d..3ae63051c327a 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -148,6 +148,7 @@ class CIWorkflow:
     enable_nogpu_no_avx_test: YamlShellBool = "''"
     enable_nogpu_no_avx2_test: YamlShellBool = "''"
     enable_slow_test: YamlShellBool = "''"
+    enable_docs_test: YamlShellBool = "''"
 
     def __post_init__(self) -> None:
         if self.is_libtorch:
@@ -266,6 +267,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         test_runner_type=LINUX_CPU_TEST_RUNNER,
         on_pull_request=True,
         enable_doc_jobs=True,
+        enable_docs_test=1,
         num_test_shards=2,
         ciflow_config=CIFlowConfig(
             enabled=True,
diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py
index 75df57cfa2f89..7dc29097ab83a 100755
--- a/.github/scripts/generate_pytorch_test_matrix.py
+++ b/.github/scripts/generate_pytorch_test_matrix.py
@@ -55,6 +55,8 @@ def main() -> None:
         configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
     if os.getenv('ENABLE_SLOW_TEST'):
         configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
+    if os.getenv('ENABLE_DOCS_TEST'):
+        configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
     matrix = {
         'include': [
             {
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 7d9020790710e..e7681b049464a 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -254,6 +254,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }}
       ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }}
       ENABLE_SLOW_TEST: !{{ enable_slow_test }}
+      ENABLE_DOCS_TEST: !{{ enable_docs_test }}
       NUM_TEST_SHARDS: !{{ num_test_shards }}
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -346,7 +347,6 @@ jobs:
           env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
-          BUILD_ENVIRONMENT: !{{ build_environment }}-${{ matrix.config }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index c51f8f047e986..0c5096146273c 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -230,6 +230,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX_TEST: ''
       ENABLE_NOGPU_NO_AVX2_TEST: ''
       ENABLE_SLOW_TEST: ''
+      ENABLE_DOCS_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -325,7 +326,6 @@ jobs:
           env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
-          BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7-${{ matrix.config }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 014b1d1162d07..536274b7df5c0 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -230,6 +230,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX_TEST: ''
       ENABLE_NOGPU_NO_AVX2_TEST: ''
       ENABLE_SLOW_TEST: ''
+      ENABLE_DOCS_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -325,7 +326,6 @@ jobs:
           env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
-          BUILD_ENVIRONMENT: linux-bionic-py3.8-gcc9-coverage-${{ matrix.config }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 76b973eebce24..42d10cb4782ce 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -230,6 +230,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX_TEST: 1
       ENABLE_NOGPU_NO_AVX2_TEST: 1
       ENABLE_SLOW_TEST: 1
+      ENABLE_DOCS_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -325,7 +326,6 @@ jobs:
           env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
-          BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7-${{ matrix.config }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 8114bd541fdb3..8c3b8d40d7651 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -230,6 +230,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX_TEST: ''
       ENABLE_NOGPU_NO_AVX2_TEST: ''
       ENABLE_SLOW_TEST: ''
+      ENABLE_DOCS_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -325,7 +326,6 @@ jobs:
           env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
-          BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 6cc391ba15991..cbbfa5981cb44 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -230,6 +230,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX_TEST: ''
       ENABLE_NOGPU_NO_AVX2_TEST: ''
       ENABLE_SLOW_TEST: ''
+      ENABLE_DOCS_TEST: 1
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -325,7 +326,6 @@ jobs:
           env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
-          BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc5.4-${{ matrix.config }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 7b947790902ec..fc43fa74c8bfd 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -228,6 +228,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX_TEST: ''
       ENABLE_NOGPU_NO_AVX2_TEST: ''
       ENABLE_SLOW_TEST: ''
+      ENABLE_DOCS_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -323,7 +324,6 @@ jobs:
           env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
-          BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 5014f603e4bb9..9710d3aafb35b 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -494,6 +494,10 @@ test_torch_deploy() {
   assert_git_not_dirty
 }
 
+test_docs_test() {
+  .jenkins/pytorch/docs-test.sh
+}
+
 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
   (cd test && python -c "import torch; print(torch.__config__.show())")
   (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@@ -532,6 +536,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
 elif [[ "${BUILD_ENVIRONMENT}" == *distributed* ]]; then
   test_distributed
   test_rpc
+elif [[ "${TEST_CONFIG}" = docs_test ]]; then
+  test_docs_test
 else
   install_torchvision
   install_monkeytype

From 09e53c0cfe81a40c32610f0cb76b3072e3bfca02 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Mon, 30 Aug 2021 13:55:19 -0700
Subject: [PATCH 360/530] .github: Adding configuration for backwards_compat
 (#64204)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64204

Adds backwards_compat to our existing test matrix for github actions

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D30646764

Pulled By: seemethere

fbshipit-source-id: f0da6027e29fab03aff058cb13466fae5dcf3678
---
 .github/scripts/generate_ci_workflows.py                        | 2 ++
 .github/scripts/generate_pytorch_test_matrix.py                 | 2 ++
 .github/templates/linux_ci_workflow.yml.j2                      | 1 +
 .../workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml    | 1 +
 .../workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml    | 1 +
 .../workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml    | 1 +
 .../workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml    | 1 +
 .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml       | 1 +
 .../generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml     | 1 +
 9 files changed, 11 insertions(+)

diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index 3ae63051c327a..467d13d0dc45d 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -149,6 +149,7 @@ class CIWorkflow:
     enable_nogpu_no_avx2_test: YamlShellBool = "''"
     enable_slow_test: YamlShellBool = "''"
     enable_docs_test: YamlShellBool = "''"
+    enable_backwards_compat_test: YamlShellBool = "''"
 
     def __post_init__(self) -> None:
         if self.is_libtorch:
@@ -268,6 +269,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         on_pull_request=True,
         enable_doc_jobs=True,
         enable_docs_test=1,
+        enable_backwards_compat_test=1,
         num_test_shards=2,
         ciflow_config=CIFlowConfig(
             enabled=True,
diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py
index 7dc29097ab83a..cb71f588ece5e 100755
--- a/.github/scripts/generate_pytorch_test_matrix.py
+++ b/.github/scripts/generate_pytorch_test_matrix.py
@@ -57,6 +57,8 @@ def main() -> None:
         configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
     if os.getenv('ENABLE_DOCS_TEST'):
         configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
+    if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'):
+        configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
     matrix = {
         'include': [
             {
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index e7681b049464a..d7be808898476 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -255,6 +255,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }}
       ENABLE_SLOW_TEST: !{{ enable_slow_test }}
       ENABLE_DOCS_TEST: !{{ enable_docs_test }}
+      ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }}
       NUM_TEST_SHARDS: !{{ num_test_shards }}
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 0c5096146273c..f34765c98160b 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -231,6 +231,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX2_TEST: ''
       ENABLE_SLOW_TEST: ''
       ENABLE_DOCS_TEST: ''
+      ENABLE_BACKWARDS_COMPAT_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 536274b7df5c0..6162b3cac1604 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -231,6 +231,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX2_TEST: ''
       ENABLE_SLOW_TEST: ''
       ENABLE_DOCS_TEST: ''
+      ENABLE_BACKWARDS_COMPAT_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 42d10cb4782ce..2fe24a515ea2a 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -231,6 +231,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX2_TEST: 1
       ENABLE_SLOW_TEST: 1
       ENABLE_DOCS_TEST: ''
+      ENABLE_BACKWARDS_COMPAT_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 8c3b8d40d7651..59b0e2535b3bb 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -231,6 +231,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX2_TEST: ''
       ENABLE_SLOW_TEST: ''
       ENABLE_DOCS_TEST: ''
+      ENABLE_BACKWARDS_COMPAT_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index cbbfa5981cb44..1fa72f51255dd 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -231,6 +231,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX2_TEST: ''
       ENABLE_SLOW_TEST: ''
       ENABLE_DOCS_TEST: 1
+      ENABLE_BACKWARDS_COMPAT_TEST: 1
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index fc43fa74c8bfd..246d5cabd86de 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -229,6 +229,7 @@ jobs:
       ENABLE_NOGPU_NO_AVX2_TEST: ''
       ENABLE_SLOW_TEST: ''
       ENABLE_DOCS_TEST: ''
+      ENABLE_BACKWARDS_COMPAT_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge

From 65050ec9247ef4566d035bfe3d0c58eb6e7f091b Mon Sep 17 00:00:00 2001
From: Daya Khudia <dskhudia@fb.com>
Date: Mon, 30 Aug 2021 13:58:47 -0700
Subject: [PATCH 361/530] Back out "[JIT] Add aten::slice optimization"

Summary:
Original commit changeset: d12ee39f6828
build-break
overriding_review_checks_triggers_an_audit_and_retroactive_review
Oncall Short Name: dskhudia

Test Plan: Local run succeeds

Differential Revision: D30633990

fbshipit-source-id: 91cf7cc0ad7e47d919347c2a1527688e062e0c62
---
 test/jit/test_peephole.py                     | 74 +-------------
 .../csrc/jit/passes/peephole_list_idioms.cpp  | 97 ++++++-------------
 torch/csrc/jit/passes/peephole_list_idioms.h  |  8 --
 3 files changed, 31 insertions(+), 148 deletions(-)

diff --git a/test/jit/test_peephole.py b/test/jit/test_peephole.py
index ecb4a06dfe0b9..23de44807761c 100644
--- a/test/jit/test_peephole.py
+++ b/test/jit/test_peephole.py
@@ -2,7 +2,7 @@
 from torch.testing._internal.jit_utils import JitTestCase, RUN_CUDA, _inline_everything
 from torch import nn
 from torch.testing import FileCheck
-from typing import Callable, List
+from typing import List
 
 import unittest
 
@@ -721,75 +721,3 @@ def foo():
         self.run_pass("peephole", foo.graph)
         FileCheck().check("DictConstruct").check("len").run(foo.graph)
         self.assertEqual(foo(), 1)
-
-    def test_peephole_slice_all_three_args(self):
-        def foo(x: int):
-            return [1, 2, x, 4, 5, 6, 7][-5:6:2]
-
-        graph = torch.jit.script(foo).graph
-        self.run_pass("peephole", graph)
-        FileCheck().check_not("aten::slice").run(graph)
-        self.checkScript(foo, (3, ))
-
-    def test_peephole_slice_one_empty_arg(self):
-        def check_helper(fn: Callable[[int], None]) -> None:
-            graph = torch.jit.script(fn).graph
-            self.run_pass("peephole", graph)
-            FileCheck().check_not("aten::slice").run(graph)
-            self.checkScript(fn, (3, ))
-
-        def foo(x: int):
-            return [1, 2, x, 4, 5, 6, 7][1::2]
-
-        check_helper(foo)
-
-        def foo(x: int):
-            return [1, 2, x, 4, 5, 6, 7][:5:3]
-
-        check_helper(foo)
-
-        def foo(x: int):
-            return [1, 2, x, 4, 5, 6, 7][0:4]
-
-        check_helper(foo)
-
-    def test_peephole_slice_two_empty_args(self):
-        def check_helper(fn: Callable[[int], None]) -> None:
-            graph = torch.jit.script(fn).graph
-            self.run_pass("peephole", graph)
-            FileCheck().check_not("aten::slice").run(graph)
-            self.checkScript(fn, (3, ))
-
-        def foo(x: int):
-            return [1, 2, x, 4, 5, 6, 7][::2]
-
-        check_helper(foo)
-
-        def foo(x: int):
-            return [1, 2, x, 4, 5, 6, 7][:5]
-
-        check_helper(foo)
-
-        def foo(x: int):
-            return [1, 2, x, 4, 5, 6, 7][1:]
-
-        check_helper(foo)
-
-    def test_peephole_slice_optimization_not_applied_list_modified(self):
-        @torch.jit.script
-        def foo():
-            li = [1, 2, 3, 4, 5, 6, 7]
-            li[0] = 0
-            return li[2:5]
-
-        self.run_pass("peephole", foo.graph)
-        FileCheck().check("aten::slice").run(foo.graph)
-
-    def test_peephole_slice_optimization_not_applied_non_const_args(self):
-        @torch.jit.script
-        def foo(x: int, y: int):
-            li = [1, 2, 3, 4, 5, 6, 7]
-            return li[x:y]
-
-        self.run_pass("peephole", foo.graph)
-        FileCheck().check("aten::slice").run(foo.graph)
diff --git a/torch/csrc/jit/passes/peephole_list_idioms.cpp b/torch/csrc/jit/passes/peephole_list_idioms.cpp
index ec3d249b8b1be..f33f388259d20 100644
--- a/torch/csrc/jit/passes/peephole_list_idioms.cpp
+++ b/torch/csrc/jit/passes/peephole_list_idioms.cpp
@@ -7,9 +7,7 @@
 #include <torch/csrc/jit/passes/peephole_list_idioms.h>
 #include <torch/csrc/jit/passes/value_refinement_utils.h>
 #include <torch/csrc/jit/runtime/graph_executor.h>
-#include <torch/csrc/jit/runtime/slice_indices_adjust.h>
 #include <torch/csrc/utils/memory.h>
-#include <limits>
 
 namespace torch {
 namespace jit {
@@ -59,7 +57,7 @@ struct ListLenRefiner {
       }
 
       auto first_input = n->input(0);
-      if (first_input->type()->castRaw<ListType>() &&
+      if (first_input->type()->cast<ListType>() &&
           !mutated_lists_.count(first_input)) {
         if (!li_with_len_use.count(first_input)) {
           li_with_len_use.insert(first_input);
@@ -174,7 +172,7 @@ struct PeepholeOptimizeListIdiomsImpl {
 
  private:
   void checkForMutatedList(Value* v) {
-    if (v->type()->castRaw<ListType>() && aliasDb_->hasWriters(v)) {
+    if (v->type()->cast<ListType>() && aliasDb_->hasWriters(v)) {
       mutated_lists_.insert(v);
     }
   }
@@ -193,43 +191,6 @@ struct PeepholeOptimizeListIdiomsImpl {
     }
   }
 
-  bool optimizeSlice(Node* slice_node, Node* list_construct_node) {
-    auto start_val = toIValue(slice_node->input(1));
-    auto end_val = toIValue(slice_node->input(2));
-    auto step_val = toIValue(slice_node->input(3));
-
-    // All args must be constant to apply this optimization.
-    if (start_val == c10::nullopt || end_val == c10::nullopt ||
-        step_val == c10::nullopt) {
-      return false;
-    }
-
-    int64_t start = start_val->isInt() ? start_val->to<int64_t>()
-                                       : std::numeric_limits<int64_t>::max();
-    int64_t end = end_val->isInt() ? end_val->to<int64_t>()
-                                   : std::numeric_limits<int64_t>::max();
-    int64_t step = step_val->isInt() ? step_val->to<int64_t>() : 1;
-
-    size_t list_size = list_construct_node->inputs().size();
-    size_t num_values = slice_indices_adjust(list_size, &start, &end, step);
-
-    WithInsertPoint guard(slice_node);
-    auto slice_list_construct =
-        graph_->insertNode(graph_->create(prim::ListConstruct));
-    slice_list_construct->output()->setType(slice_node->output()->type());
-    for (size_t i = start, j = 0; j < num_values; ++j) {
-      slice_list_construct->addInput(list_construct_node->input(i));
-      i += step;
-    }
-
-    slice_node->output()->replaceAllUsesWith(slice_list_construct->output());
-    if (mutated_lists_.count(slice_node->output())) {
-      mutated_lists_.insert(slice_list_construct->output());
-    }
-
-    return true;
-  }
-
   bool runBlock(Block* block) {
     bool changed = false;
     for (Node* node : block->nodes()) {
@@ -239,7 +200,7 @@ struct PeepholeOptimizeListIdiomsImpl {
 
       // only optimizing list ops
       if (node->inputs().size() == 0 ||
-          !node->input(0)->type()->castRaw<ListType>()) {
+          !node->input(0)->type()->cast<ListType>()) {
         continue;
       }
 
@@ -250,33 +211,36 @@ struct PeepholeOptimizeListIdiomsImpl {
         continue;
       }
 
-      auto list_creation_node = first_input->node();
-      if (list_creation_node->kind() != prim::ListConstruct) {
-        continue;
-      }
-
       if (node->kind() == aten::len) {
-        WithInsertPoint guard(node);
-        node->output()->replaceAllUsesWith(graph_->insertConstant(
-            static_cast<int64_t>(first_input->node()->inputs().size())));
-        changed = true;
+        if (first_input->node()->kind() == prim::ListConstruct) {
+          WithInsertPoint guard(node);
+          node->output()->replaceAllUsesWith(graph_->insertConstant(
+              static_cast<int64_t>(first_input->node()->inputs().size())));
+          changed = true;
+        }
       } else if (node->kind() == aten::__getitem__) {
-        if (auto index = toIValue(node->input(1))) {
-          size_t list_size = list_creation_node->inputs().size();
-          if (auto norm_index = normalizeIndex(index->toInt(), list_size)) {
-            node->output()->replaceAllUsesWith(
-                list_creation_node->input(*norm_index));
-            changed = true;
+        auto list_creation_node = first_input->node();
+        if (list_creation_node->kind() == prim::ListConstruct) {
+          if (auto index = toIValue(node->input(1))) {
+            size_t list_size = list_creation_node->inputs().size();
+            if (auto norm_index = normalizeIndex(index->toInt(), list_size)) {
+              node->output()->replaceAllUsesWith(
+                  list_creation_node->input(*norm_index));
+              changed = true;
+            }
           }
         }
       } else if (node->kind() == prim::ListUnpack) {
-        // if sizes are unequal it's a runtime error
-        if (list_creation_node->inputs().size() != node->outputs().size()) {
-          continue;
-        }
-        for (size_t i = 0; i < node->outputs().size(); ++i) {
-          node->output(i)->replaceAllUsesWith(list_creation_node->input(i));
-          changed = true;
+        auto list_creation_node = first_input->node();
+        if (list_creation_node->kind() == prim::ListConstruct) {
+          // if sizes are unequal it's a runtime error
+          if (list_creation_node->inputs().size() != node->outputs().size()) {
+            continue;
+          }
+          for (size_t i = 0; i < node->outputs().size(); ++i) {
+            node->output(i)->replaceAllUsesWith(list_creation_node->input(i));
+            changed = true;
+          }
         }
       } else if (node->kind() == aten::add) {
         if (node->inputs().size() != 2) {
@@ -287,7 +251,8 @@ struct PeepholeOptimizeListIdiomsImpl {
         if (mutated_lists_.count(second_input)) {
           continue;
         }
-        if (second_input->node()->kind() != prim::ListConstruct) {
+        if (first_input->node()->kind() != prim::ListConstruct ||
+            second_input->node()->kind() != prim::ListConstruct) {
           continue;
         }
         WithInsertPoint guard(node);
@@ -305,8 +270,6 @@ struct PeepholeOptimizeListIdiomsImpl {
           mutated_lists_.insert(list_construct->output());
         }
         changed = true;
-      } else if (node->kind() == aten::slice) {
-        changed |= optimizeSlice(node, first_input->node());
       }
     }
     return changed;
diff --git a/torch/csrc/jit/passes/peephole_list_idioms.h b/torch/csrc/jit/passes/peephole_list_idioms.h
index d20df9571db01..c8add4849d4ce 100644
--- a/torch/csrc/jit/passes/peephole_list_idioms.h
+++ b/torch/csrc/jit/passes/peephole_list_idioms.h
@@ -51,14 +51,6 @@ namespace jit {
 //
 // This is only applied to lists that are not modified.
 //
-// 5. Slice
-// Given a function like this:
-//     def foo():
-//         return [1, 2, 3, 4, 5][0:2]
-// This pass produces (after deadcode elimination):
-//     def foo():
-//         return [1, 2]
-//
 // Currently this is invoked as part of PeepholeOptimize
 // return true if graph is modified.
 // If `refine_list_len` is true will attempt to refine the len of lists through

From 8f88f797dbff54aa4d2b153e9f0dc87794e4cf38 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Mon, 30 Aug 2021 14:21:39 -0700
Subject: [PATCH 362/530] [quant][graphmode][fx] Add reference quantized conv
 module (#63828)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63828

Added reference quantized conv module for the custom backend flow, the reference quantized module will
have the following code:
```
        w(float) -- quant - dequant \
        x(float) ------------- F.conv2d ---
```
In the full model, we will see
```
        w(float) -- quant - *dequant \
        x -- quant --- *dequant --  *F.conv2d --- *quant - dequant
```
and the backend should be able to fuse the ops with `*` into a quantized linear

Test Plan:
python test/test_quantization.py TestQuantizeFx.test_conv_linear_reference

Imported from OSS

Reviewed By: vkuzo

Differential Revision: D30504749

fbshipit-source-id: e1d8c43a0e0d6d9ea2375b8ca59a9c0f455514fb
---
 .../core/test_quantized_module.py             |  84 +++----
 test/quantization/fx/test_quantize_fx.py      |  68 +++--
 .../quantized/_reference/__init__.py          |   1 -
 .../quantized/_reference/modules/__init__.py  |   8 -
 .../quantized/_reference/modules/conv_relu.py |  58 -----
 torch/nn/quantized/_reference/modules/conv.py | 237 ++++++++++++------
 .../quantization/fx/quantization_patterns.py  |  30 ++-
 torch/quantization/quantization_mappings.py   |  15 --
 8 files changed, 257 insertions(+), 244 deletions(-)
 delete mode 100644 torch/nn/intrinsic/quantized/_reference/__init__.py
 delete mode 100644 torch/nn/intrinsic/quantized/_reference/modules/__init__.py
 delete mode 100644 torch/nn/intrinsic/quantized/_reference/modules/conv_relu.py

diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py
index bc8a6b397eef8..b0bc78294d9b5 100644
--- a/test/quantization/core/test_quantized_module.py
+++ b/test/quantization/core/test_quantized_module.py
@@ -2,9 +2,7 @@
 import torch.nn as nn
 import torch.nn.intrinsic as nni
 import torch.nn.intrinsic.quantized as nniq
-import torch.nn.intrinsic.quantized._reference as nniqr
 import torch.nn.quantized as nnq
-import torch.nn.quantized._reference as nnqr
 import torch.nn.quantized.dynamic as nnqd
 import torch.quantization
 
@@ -211,12 +209,11 @@ def test_quant_dequant_api(self):
         self.assertEqual(rqr, rqr2)
 
     def _test_conv_api_impl(
-        self, module_name, qconv_module, conv_module, batch_size,
-        in_channels_per_group, input_feature_map_size, out_channels_per_group,
-        groups, kernel_size, stride, padding, padding_mode, dilation,
-        X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, Y_zero_point,
-        use_bias, use_fused, use_channelwise, is_reference
-    ):
+            self, module_name, qconv_module, conv_module, batch_size,
+            in_channels_per_group, input_feature_map_size, out_channels_per_group,
+            groups, kernel_size, stride, padding, padding_mode, dilation,
+            X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, Y_zero_point,
+            use_bias, use_fused, use_channelwise):
         for i in range(len(kernel_size)):
             assume(input_feature_map_size[i] + 2 * padding[i]
                    >= dilation[i] * (kernel_size[i] - 1) + 1)
@@ -245,8 +242,7 @@ def _test_conv_api_impl(
 
         # Test members
         self.assertTrue(module_name == qconv_module._get_name(), module_name + " " + qconv_module._get_name())
-        if not is_reference:
-            self.assertTrue(hasattr(qconv_module, '_packed_params'))
+        self.assertTrue(hasattr(qconv_module, '_packed_params'))
         self.assertTrue(hasattr(qconv_module, 'scale'))
         self.assertTrue(hasattr(qconv_module, 'zero_point'))
 
@@ -275,9 +271,8 @@ def _test_conv_api_impl(
         # For example, the result of round(2.5) + 1 is 3 while round(2.5 + 1) is
         # 4 assuming the rounding mode is round-to-nearest, ties-to-even.
         # skip numerics checking for reference module
-        if not is_reference:
-            np.testing.assert_array_almost_equal(
-                Y_exp.int_repr().numpy(), Y_act.int_repr().numpy(), decimal=0)
+        np.testing.assert_array_almost_equal(
+            Y_exp.int_repr().numpy(), Y_act.int_repr().numpy(), decimal=0)
 
         # Test serialization of quantized Conv Module using state_dict
         model_dict = qconv_module.state_dict()
@@ -297,8 +292,7 @@ def _test_conv_api_impl(
 
         self.assertTrue(dir(loaded_qconv_module) == dir(qconv_module))
         self.assertTrue(module_name == loaded_qconv_module._get_name())
-        if not is_reference:
-            self.assertTrue(hasattr(loaded_qconv_module, '_packed_params'))
+        self.assertTrue(hasattr(loaded_qconv_module, '_packed_params'))
         self.assertTrue(hasattr(loaded_qconv_module, '_weight_bias'))
 
         self.assertEqual(qconv_module.weight(), loaded_qconv_module.weight())
@@ -308,9 +302,8 @@ def _test_conv_api_impl(
         self.assertEqual(qconv_module.zero_point,
                          loaded_qconv_module.zero_point)
         Y_loaded = loaded_qconv_module(X_q)
-        if not is_reference:
-            np.testing.assert_array_almost_equal(
-                Y_exp.int_repr().numpy(), Y_loaded.int_repr().numpy(), decimal=0)
+        np.testing.assert_array_almost_equal(
+            Y_exp.int_repr().numpy(), Y_loaded.int_repr().numpy(), decimal=0)
 
         # Test serialization
         b = io.BytesIO()
@@ -330,9 +323,8 @@ def _test_conv_api_impl(
         self.assertEqual(copied_conv.zero_point,
                          qconv_module.zero_point)
         Y_copied = copied_conv(X_q)
-        if not is_reference:
-            np.testing.assert_array_almost_equal(
-                Y_exp.int_repr().numpy(), Y_copied.int_repr().numpy(), decimal=0)
+        np.testing.assert_array_almost_equal(
+            Y_exp.int_repr().numpy(), Y_copied.int_repr().numpy(), decimal=0)
 
         deepcopied_conv = copy.deepcopy(qconv_module)
         self.assertEqual(deepcopied_conv.bias(), qconv_module.bias())
@@ -340,9 +332,8 @@ def _test_conv_api_impl(
         self.assertEqual(deepcopied_conv.zero_point,
                          qconv_module.zero_point)
         Y_deepcopied = copied_conv(X_q)
-        if not is_reference:
-            np.testing.assert_array_almost_equal(
-                Y_exp.int_repr().numpy(), Y_deepcopied.int_repr().numpy(), decimal=0)
+        np.testing.assert_array_almost_equal(
+            Y_exp.int_repr().numpy(), Y_deepcopied.int_repr().numpy(), decimal=0)
 
         # JIT testing
         self.checkScriptable(
@@ -377,9 +368,8 @@ def test_conv1d_api(self):
             [True, False],  # use_bias
             [True, False],  # use_fused
             [True, False],  # use_channelwise
-            [True, False]  # is_reference
         )
-        for pad_mode, use_bias, use_fused, use_channelwise, is_reference in options:
+        for pad_mode, use_bias, use_fused, use_channelwise in options:
             if torch.backends.quantized.engine == "qnnpack":
                 use_channelwise = False
             batch_size = 2
@@ -407,15 +397,13 @@ def test_conv1d_api(self):
             Y_zero_point = 4
             if torch.backends.quantized.engine == 'qnnpack':
                 use_channelwise = False
-            # (use_fused, is_reference) -> quantized class
+            # use_fused -> quantized class
             class_map = {
-                (True, True): (nniqr.ConvReLU1d, "QuantizedConvReLU1d(Reference)"),
-                (True, False): (nniq.ConvReLU1d, "QuantizedConvReLU1d"),
-                (False, True): (nnqr.Conv1d, "QuantizedConv1d(Reference)"),
-                (False, False): (nnq.Conv1d, "QuantizedConv1d")
+                True: (nniq.ConvReLU1d, "QuantizedConvReLU1d"),
+                False: (nnq.Conv1d, "QuantizedConv1d")
             }
 
-            qconv_cls, module_name = class_map[(use_fused, is_reference)]
+            qconv_cls, module_name = class_map[use_fused]
             qconv_module = qconv_cls(
                 in_channels, out_channels, kernel, stride, pad,
                 dilation, groups, use_bias, padding_mode=pad_mode
@@ -434,7 +422,7 @@ def test_conv1d_api(self):
                 in_channels_per_group, input_feature_map_size,
                 out_channels_per_group, groups, kernel_size, stride, pad, pad_mode,
                 dilation, X_scale, X_zero_point, W_scale, W_zero_point, Y_scale,
-                Y_zero_point, use_bias, use_fused, use_channelwise, is_reference)
+                Y_zero_point, use_bias, use_fused, use_channelwise)
 
     @override_qengines
     def test_conv2d_api(self):
@@ -443,9 +431,8 @@ def test_conv2d_api(self):
             [True, False],  # use_bias
             [True, False],  # use_fused
             [True, False],  # use_channelwise
-            [True, False]  # is_reference
         )
-        for pad_mode, use_bias, use_fused, use_channelwise, is_reference in options:
+        for pad_mode, use_bias, use_fused, use_channelwise in options:
             if torch.backends.quantized.engine == "qnnpack":
                 use_channelwise = False
             batch_size = 2
@@ -475,15 +462,13 @@ def test_conv2d_api(self):
             W_zero_point = [3]
             Y_scale = 5.0
             Y_zero_point = 4
-            # (use_fused, is_reference) -> quantized class
+            # use_fused -> quantized class
             class_map = {
-                (True, True): (nniqr.ConvReLU2d, "QuantizedConvReLU2d(Reference)"),
-                (True, False): (nniq.ConvReLU2d, "QuantizedConvReLU2d"),
-                (False, True): (nnqr.Conv2d, "QuantizedConv2d(Reference)"),
-                (False, False): (nnq.Conv2d, "QuantizedConv2d")
+                True: (nniq.ConvReLU2d, "QuantizedConvReLU2d"),
+                False: (nnq.Conv2d, "QuantizedConv2d")
             }
 
-            qconv_cls, module_name = class_map[(use_fused, is_reference)]
+            qconv_cls, module_name = class_map[use_fused]
             qconv_module = qconv_cls(
                 in_channels, out_channels, kernel_size, stride, padding,
                 dilation, groups, use_bias, padding_mode=pad_mode
@@ -502,7 +487,7 @@ def test_conv2d_api(self):
                 in_channels_per_group, input_feature_map_size,
                 out_channels_per_group, groups, kernel_size, stride, padding,
                 pad_mode, dilation, X_scale, X_zero_point, W_scale, W_zero_point,
-                Y_scale, Y_zero_point, use_bias, use_fused, use_channelwise, is_reference)
+                Y_scale, Y_zero_point, use_bias, use_fused, use_channelwise)
 
     @skipIfNoFBGEMM
     def test_conv3d_api(self):
@@ -510,9 +495,8 @@ def test_conv3d_api(self):
             [True, False],  # use_bias
             [True, False],  # use_fused
             [True, False],  # use_channelwise
-            [True, False]  # is_reference
         )
-        for use_bias, use_fused, use_channelwise, is_reference in options:
+        for use_bias, use_fused, use_channelwise in options:
             if torch.backends.quantized.engine == "qnnpack":
                 use_channelwise = False
             batch_size = 2
@@ -547,16 +531,14 @@ def test_conv3d_api(self):
             W_zero_point = [3]
             Y_scale = 5.0
             Y_zero_point = 4
-            # (use_fused, is_reference) -> quantized class
+            # use_fused -> quantized class
             class_map = {
-                (True, True): (nniqr.ConvReLU3d, "QuantizedConvReLU3d(Reference)"),
-                (True, False): (nniq.ConvReLU3d, "QuantizedConvReLU3d"),
-                (False, True): (nnqr.Conv3d, "QuantizedConv3d(Reference)"),
-                (False, False): (nnq.Conv3d, "QuantizedConv3d")
+                True: (nniq.ConvReLU3d, "QuantizedConvReLU3d"),
+                False: (nnq.Conv3d, "QuantizedConv3d")
             }
 
             with override_quantized_engine('fbgemm'):
-                qconv_cls, module_name = class_map[(use_fused, is_reference)]
+                qconv_cls, module_name = class_map[use_fused]
                 qconv_module = qconv_cls(
                     in_channels, out_channels, kernel_size, stride, padding,
                     dilation, groups, use_bias, padding_mode=pad_mode
@@ -576,7 +558,7 @@ def test_conv3d_api(self):
                     out_channels_per_group, groups, kernel_size, stride, padding,
                     pad_mode, dilation, X_scale, X_zero_point, W_scale,
                     W_zero_point, Y_scale, Y_zero_point, use_bias, use_fused,
-                    use_channelwise, is_reference)
+                    use_channelwise)
 
     def test_pool_api(self):
         """Tests the correctness of the pool module.
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 7ae29e03f6a46..9682da14483df 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -532,7 +532,7 @@ def forward(self, x):
                 Conv1d,
                 conv1d_module_args,
                 (conv1d_input,),
-                ns.call_module(nn.Conv1d if is_reference else nnq.Conv1d),
+                ns.call_module(nnqr.Conv1d if is_reference else nnq.Conv1d),
                 None
             ),
             (
@@ -540,7 +540,7 @@ def forward(self, x):
                 Conv2d,
                 conv2d_module_args,
                 (conv2d_input,),
-                ns.call_module(nn.Conv2d if is_reference else nnq.Conv2d),
+                ns.call_module(nnqr.Conv2d if is_reference else nnq.Conv2d),
                 None
             ),
             (
@@ -548,7 +548,7 @@ def forward(self, x):
                 Conv3d,
                 conv3d_module_args,
                 (conv3d_input,),
-                ns.call_module(nn.Conv3d if is_reference else nnq.Conv3d),
+                ns.call_module(nnqr.Conv3d if is_reference else nnq.Conv3d),
                 None
             ),
             (
@@ -631,11 +631,7 @@ def _get_keys(prefix, is_dynamic):
             qr = result_dict["quantized_reference"]
 
             def checkWeightQParams(model):
-                for module_name in ("conv",):
-                    if hasattr(model, module_name):
-                        self.assertTrue(hasattr(qr.get_submodule(module_name), "_weight_qparams"))
-                        self.assertTrue("Reference" in qr.get_submodule(module_name)._get_name())
-                for module_name in ("linear",):
+                for module_name in ("linear", "conv"):
                     if hasattr(model, module_name):
                         self.assertTrue(hasattr(qr.get_submodule(module_name), "weight_qscheme"))
                         self.assertTrue(hasattr(qr.get_submodule(module_name), "weight_scale"))
@@ -643,19 +639,7 @@ def checkWeightQParams(model):
                         self.assertTrue("Reference" in qr.get_submodule(module_name)._get_name())
 
             def checkSerDeser(model, is_dynamic):
-                for module_name in ("conv",):
-                    if hasattr(model, module_name):
-                        # make sure seralization works
-                        state_dict = copy.deepcopy(model.state_dict())
-                        self.assertTrue(module_name + "._weight_qparams" in state_dict)
-
-                        # check load_state_dict restores states
-                        module = getattr(model, module_name)
-                        prev_scale = module._weight_qparams["scale"]
-                        module._weight_qparams["scale"] = None
-                        model.load_state_dict(state_dict)
-                        self.assertTrue(torch.equal(prev_scale, module._weight_qparams["scale"]))
-                for module_name in ("linear",):
+                for module_name in ("linear", "conv"):
                     if hasattr(model, module_name):
                         # make sure seralization works
                         state_dict = copy.deepcopy(model.state_dict())
@@ -3001,6 +2985,44 @@ def forward(self, x):
             result_ref = m_ref(data)
             self.assertTrue(torch.equal(result, result_ref))
 
+    def test_ref_conv_module(self):
+        """ Make sure the numerics for models with ref conv module
+        matches models with fbgemm/qnnpack module
+        """
+        convs = {
+            1: nn.Conv1d,
+            2: nn.Conv2d,
+            3: nn.Conv3d,
+        }
+
+        class M1(torch.nn.Module):
+            def __init__(self, dim):
+                super().__init__()
+                self.conv = convs[dim](3, 3, 3)
+
+            def forward(self, x):
+                return self.conv(x)
+
+        class M2(torch.nn.Module):
+            def __init__(self, dim):
+                super().__init__()
+                self.conv = convs[dim](3, 3, 3)
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(self.conv(x))
+
+        for dim, M in itertools.product([1, 2, 3], [M1, M2]):
+            m = M(dim).eval()
+            m = prepare_fx(m, {"": default_qconfig})
+            m_copy = copy.deepcopy(m)
+            m = convert_fx(m, is_reference=False)
+            m_ref = convert_fx(m_copy, is_reference=True)
+            data = self.img_data_dict[dim][0][0]
+            result = m(data)
+            result_ref = m_ref(data)
+            self.assertTrue(torch.equal(result, result_ref))
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     """Unit tests for individual ops
@@ -4558,13 +4580,13 @@ def forward(self, x):
             reference_order_check = [
                 ns.call_function(torch.quantize_per_tensor),
                 ns.call_method('dequantize'),
-                ns.call_module(nn.Conv2d),
+                ns.call_module(nnqr.Conv2d),
                 ns.call_function(torch.quantize_per_tensor),
                 ns.call_method('dequantize'),
                 ns.call_module(nn.Sigmoid),
                 ns.call_function(torch.quantize_per_tensor),
                 ns.call_method('dequantize'),
-                ns.call_module(nn.Conv2d),
+                ns.call_module(nnqr.Conv2d),
                 ns.call_function(torch.quantize_per_tensor),
                 ns.call_method('dequantize'),
             ]
diff --git a/torch/nn/intrinsic/quantized/_reference/__init__.py b/torch/nn/intrinsic/quantized/_reference/__init__.py
deleted file mode 100644
index 3d79bdbfe8320..0000000000000
--- a/torch/nn/intrinsic/quantized/_reference/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .modules import *  # noqa: F403
diff --git a/torch/nn/intrinsic/quantized/_reference/modules/__init__.py b/torch/nn/intrinsic/quantized/_reference/modules/__init__.py
deleted file mode 100644
index 33b18d8cf7d3f..0000000000000
--- a/torch/nn/intrinsic/quantized/_reference/modules/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import torch
-from .conv_relu import ConvReLU1d, ConvReLU2d, ConvReLU3d
-
-__all__ = [
-    'ConvReLU1d',
-    'ConvReLU2d',
-    'ConvReLU3d',
-]
diff --git a/torch/nn/intrinsic/quantized/_reference/modules/conv_relu.py b/torch/nn/intrinsic/quantized/_reference/modules/conv_relu.py
deleted file mode 100644
index b0305f6207d95..0000000000000
--- a/torch/nn/intrinsic/quantized/_reference/modules/conv_relu.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import torch
-import torch.nn.quantized._reference as nnqr
-import torch.nn.functional as F
-
-class ConvReLU1d(nnqr.Conv1d):
-    _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU1d
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_dequant = x.dequantize()
-        weight_dequant = self._qweight.dequantize()
-        float_result = F.conv1d(
-            x_dequant, weight_dequant, self._bias, self._conv1d_stride,  # type: ignore[has-type]
-            self._conv1d_padding, self._conv1d_dilation, self.groups)  # type: ignore[has-type]
-        float_result = F.relu(float_result, inplace=True)
-        # NEEDFIX: we don't have dtype in the Linear module APIs right now!
-        result = torch.quantize_per_tensor(
-            float_result, self.scale, self.zero_point, torch.quint8)
-        return result
-
-    def _get_name(self):
-        return "QuantizedConvReLU1d(Reference)"
-
-
-class ConvReLU2d(nnqr.Conv2d):
-    _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU2d
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_dequant = x.dequantize()
-        weight_dequant = self._qweight.dequantize()
-        float_result = F.conv2d(
-            x_dequant, weight_dequant, self._bias, self.stride,
-            self.padding, self.dilation, self.groups)
-        float_result = F.relu(float_result, inplace=True)
-        # NEEDFIX: we don't have dtype in the Linear module APIs right now!
-        result = torch.quantize_per_tensor(
-            float_result, self.scale, self.zero_point, torch.quint8)
-        return result
-
-    def _get_name(self):
-        return "QuantizedConvReLU2d(Reference)"
-
-class ConvReLU3d(nnqr.Conv3d):
-    _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU3d
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_dequant = x.dequantize()
-        weight_dequant = self._qweight.dequantize()
-        float_result = F.conv3d(
-            x_dequant, weight_dequant, self._bias, self.stride,
-            self.padding, self.dilation, self.groups)
-        float_result = F.relu(float_result, inplace=True)
-        # NEEDFIX: we don't have dtype in the Linear module APIs right now!
-        result = torch.quantize_per_tensor(
-            float_result, self.scale, self.zero_point, torch.quint8)
-        return result
-
-    def _get_name(self):
-        return "QuantizedConvReLU3d(Reference)"
diff --git a/torch/nn/quantized/_reference/modules/conv.py b/torch/nn/quantized/_reference/modules/conv.py
index 036f8e46212c5..6b03bb0491ad1 100644
--- a/torch/nn/quantized/_reference/modules/conv.py
+++ b/torch/nn/quantized/_reference/modules/conv.py
@@ -1,42 +1,101 @@
 import torch
-import torch.nn.quantized as nnq
+import torch.nn as nn
 import torch.nn.functional as F
-from typing import Optional
+from typing import Optional, Dict, Any
 from torch.nn.common_types import _size_1_t
-from torch.nn.modules.utils import _single
+from .utils import _quantize_and_dequantize_weight
+from .utils import _save_weight_qparams
+from .utils import _get_weight_qparam_keys
 
-class _ConvNd(nnq._ConvNd):
+class _ConvNd(torch.nn.modules.conv._ConvNd):
     """ A reference version of nn.quantized.Conv2d
         we will not pack the parameters in this module, since weight packing is an
         optimization for quantized backends supported in PyTorch (fbgemm/qnnpack),
         this is useful when user want to use this module in other backends like Glow.
     """
-    __annotations__ = {"_bias": Optional[torch.Tensor]}
+    __annotations__ = {"bias": Optional[torch.Tensor]}
 
     def _save_to_state_dict(self, destination, prefix, keep_vars):
         super()._save_to_state_dict(destination, prefix, keep_vars)
-        destination[prefix + '_qweight'] = self._qweight
-        destination[prefix + '_bias'] = self._bias
+        _save_weight_qparams(
+            destination, prefix, self.weight_qscheme, self.weight_dtype,
+            self.weight_scale, self.weight_zero_point, self.weight_axis)
 
     def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                               missing_keys, unexpected_keys, error_msgs):
-        self._qweight = state_dict[prefix + '_qweight']
-        self._bias = state_dict[prefix + '_bias']
-        state_dict.pop(prefix + '_qweight')
-        state_dict.pop(prefix + '_bias')
+        for key in _get_weight_qparam_keys(state_dict, prefix):
+            setattr(self, key, state_dict[prefix + key])
+            state_dict.pop(prefix + key)
 
         super()._load_from_state_dict(
             state_dict, prefix, local_metadata, False,
             missing_keys, unexpected_keys, error_msgs)
 
-    def _weight_bias(self):
-        return self._qweight, self._bias
-
-    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
-        self._qweight = w
-        self._bias = b
-
-class Conv1d(_ConvNd, nnq.Conv1d):
+    def _init_weight_qparams(self, weight_qparams, device):
+        if weight_qparams is None:
+            weight_qparams = {
+                "qscheme": torch.per_tensor_affine,
+                "dtype": torch.quint8,
+                "scale": 1.0,
+                "zero_point": 0
+            }
+        self.weight_qscheme = weight_qparams["qscheme"]
+        self.weight_dtype = weight_qparams["dtype"]
+        assert self.weight_qscheme in [None, torch.per_tensor_affine, torch.per_channel_affine], \
+            Exception(f"qscheme: {self.weight_qscheme} is not support in reference quantized linear module")
+        if self.weight_qscheme is not None:
+            self.register_buffer(
+                "weight_scale",
+                torch.tensor(weight_qparams["scale"], dtype=torch.float, device=device))
+            self.register_buffer(
+                "weight_zero_point",
+                torch.tensor(weight_qparams["zero_point"], dtype=torch.int, device=device))
+            if self.weight_qscheme == torch.per_channel_affine:
+                self.register_buffer(
+                    "weight_axis",
+                    torch.tensor(weight_qparams["axis"], dtype=torch.int, device=device))
+            else:
+                # added for TorchScriptability, not used
+                self.register_buffer(
+                    "weight_axis", torch.tensor(0, dtype=torch.int, device=device))
+
+    def get_weight(self):
+        """
+        Fake quantize (quantize and dequantize) the weight with
+        the quantization parameters for weight, this is used to
+        simulate the numerics for the quantized weight in a quantized
+        model
+        """
+        # supress mypy warning
+        assert isinstance(self.weight, torch.Tensor)
+        assert isinstance(self.weight_scale, torch.Tensor)
+        assert isinstance(self.weight_zero_point, torch.Tensor)
+        assert isinstance(self.weight_axis, torch.Tensor)
+        return _quantize_and_dequantize_weight(
+            self.weight, self.weight_qscheme,
+            self.weight_dtype, self.weight_scale, self.weight_zero_point, self.weight_axis)
+
+    @staticmethod
+    def from_float(cls, float_conv, weight_qparams):
+        qref_conv = cls(
+            float_conv.in_channels,
+            float_conv.out_channels,
+            float_conv.kernel_size,  # type: ignore[arg-type]
+            float_conv.stride,  # type: ignore[arg-type]
+            float_conv.padding,  # type: ignore[arg-type]
+            float_conv.dilation,  # type: ignore[arg-type]
+            float_conv.groups,
+            float_conv.bias is not None,  # type: ignore[arg-type]
+            float_conv.padding_mode,
+            device=float_conv.weight.device,
+            dtype=float_conv.weight.dtype,
+            weight_qparams=weight_qparams)
+        qref_conv.weight = torch.nn.Parameter(float_conv.weight.detach())
+        if float_conv.bias is not None:
+            qref_conv.bias = torch.nn.Parameter(float_conv.bias.detach())
+        return qref_conv
+
+class Conv1d(_ConvNd, nn.Conv1d):
     def __init__(self,
                  in_channels: int,
                  out_channels: int,
@@ -46,91 +105,107 @@ def __init__(self,
                  dilation: _size_1_t = 1,
                  groups: int = 1,
                  bias: bool = True,
-                 padding_mode: str = 'zeros'):
-        nnq.Conv1d.__init__(
+                 padding_mode: str = "zeros",
+                 device=None,
+                 dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None):
+        nn.Conv1d.__init__(
             self, in_channels, out_channels, kernel_size, stride, padding, dilation,
-            groups, bias, padding_mode)
-        # self.stride, self.padding, self.dilation are 2d tuple since
-        # current quantized conv1d is using Conv2dPackedParams
-        # TODO: we should fix this if we implemenet Conv1dPackedParams
-        self._conv1d_stride = _single(self.stride[0])
-        self._conv1d_padding = _single(self.padding[0])
-        self._conv1d_dilation = _single(self.dilation[0])
+            groups, bias, padding_mode, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_dequant = x.dequantize()
-        weight_dequant = self._qweight.dequantize()
-        float_result = F.conv1d(
-            x_dequant, weight_dequant, self._bias, self._conv1d_stride,
-            self._conv1d_padding, self._conv1d_dilation, self.groups)
-        # NEEDFIX: we don't have dtype in the Linear module APIs right now!
-        result = torch.quantize_per_tensor(
-            float_result, self.scale, self.zero_point, torch.quint8)
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv1d ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv1d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv1d
+        """
+        weight_dequant = self.get_weight()
+        result = F.conv1d(
+            x, weight_dequant, self.bias, self.stride,
+            self.padding, self.dilation, self.groups)
         return result
 
     def _get_name(self):
-        return 'QuantizedConv1d(Reference)'
-
-    @torch.jit.export
-    def __setstate__(self, state):
-        self.in_channels = state[0]
-        self.out_channels = state[1]
-        self.kernel_size = state[2]
-        self.stride = state[3]
-        self.padding = state[4]
-        self.dilation = state[5]
-        self.transposed = state[6]
-        self.output_padding = state[7]
-        self.groups = state[8]
-        self.padding_mode = state[9]
-        self.set_weight_bias(state[10], state[11])
-        self.scale = state[12]
-        self.zero_point = state[13]
-        self.training = state[14]
-        self._conv1d_stride = (self.stride[0],)
-        self._conv1d_padding = (self.padding[0],)
-        self._conv1d_dilation = (self.dilation[0],)
-
-class Conv2d(_ConvNd, nnq.Conv2d):
+        return "QuantizedConv1d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):
+        return _ConvNd.from_float(cls, float_conv, weight_qparams)
+
+class Conv2d(_ConvNd, nn.Conv2d):
     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True,
-                 padding_mode='zeros'):
-        nnq.Conv2d.__init__(
+                 padding_mode='zeros',
+                 device=None,
+                 dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None):
+        nn.Conv2d.__init__(
             self, in_channels, out_channels, kernel_size, stride, padding, dilation,
-            groups, bias, padding_mode)
+            groups, bias, padding_mode, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_dequant = x.dequantize()
-        weight_dequant = self._qweight.dequantize()
-        float_result = F.conv2d(
-            x_dequant, weight_dequant, self._bias, self.stride,
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv2d ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv2d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv2d
+        """
+        weight_dequant = self.get_weight()
+        result = F.conv2d(
+            x, weight_dequant, self.bias, self.stride,
             self.padding, self.dilation, self.groups)
-        # NEEDFIX: we don't have dtype in the Linear module APIs right now!
-        result = torch.quantize_per_tensor(
-            float_result, self.scale, self.zero_point, torch.quint8)
         return result
 
     def _get_name(self):
-        return 'QuantizedConv2d(Reference)'
+        return "QuantizedConv2d(Reference)"
 
-class Conv3d(_ConvNd, nnq.Conv3d):
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):
+        return _ConvNd.from_float(cls, float_conv, weight_qparams)
+
+class Conv3d(_ConvNd, nn.Conv3d):
     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True,
-                 padding_mode='zeros'):
-        nnq.Conv3d.__init__(
+                 padding_mode="zeros",
+                 device=None,
+                 dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None):
+        nn.Conv3d.__init__(
             self, in_channels, out_channels, kernel_size, stride, padding, dilation,
             groups, bias, padding_mode)
+        self._init_weight_qparams(weight_qparams, device)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_dequant = x.dequantize()
-        weight_dequant = self._qweight.dequantize()
-        float_result = F.conv3d(
-            x_dequant, weight_dequant, self._bias, self.stride,
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv3d ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv3d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv3d
+        """
+        weight_dequant = self.get_weight()
+        result = F.conv3d(
+            x, weight_dequant, self.bias, self.stride,
             self.padding, self.dilation, self.groups)
-        # NEEDFIX: we don't have dtype in the Linear module APIs right now!
-        result = torch.quantize_per_tensor(
-            float_result, self.scale, self.zero_point, torch.quint8)
         return result
 
     def _get_name(self):
-        return 'QuantizedConv3d(Reference)'
+        return "QuantizedConv3d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):
+        return _ConvNd.from_float(cls, float_conv, weight_qparams)
diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index 779dfcf07aece..418cae1511c35 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -638,19 +638,22 @@ def convert(self,
                 # and qparam is a dictionary of
                 # {"qscheme": ..., "scale": ..., "zero_point": ...} for per tensor quantization or
                 # {"qscheme": ..., "scale": ..., "zero_point": ..., "axis": ...} for per channel quantization
+                float_conv = self.conv
+                fused_conv = None
                 if isinstance(
-                        self.conv,
+                        float_conv,
                         QAT_CONV_MODULE_CLASSES):
                     # case 1. converting qat conv module to
                     # a float conv module, we need to attch
                     # weight fake_quant to the conv module,
                     # weight fake_quant is assumed to be run during
                     # QAT so we don't need to run it again here
-                    float_conv = self.conv.to_float()
+                    float_conv = self.conv.to_float()  # type: ignore[operator]
                     # change qat conv to conv
                     parent_name, name = _parent_name(self.conv_node.target)
                     setattr(modules[parent_name], name, float_conv)
                     if isinstance(float_conv, torch.nn.intrinsic._FusedModule):
+                        fused_conv = float_conv
                         float_conv = float_conv[0]
                     weight_post_process = self.conv.weight_fake_quant
                 else:
@@ -658,15 +661,28 @@ def convert(self,
                     # to float conv module, we need to attach
                     # weight observer to the conv module and run it
                     # with conv weight
-                    float_conv = self.conv
-                    if isinstance(self.conv, torch.nn.intrinsic._FusedModule):
-                        float_conv = self.conv[0]
+                    if isinstance(float_conv, torch.nn.intrinsic._FusedModule):
+                        fused_conv = float_conv
+                        float_conv = float_conv[0]  # type: ignore[index]
                     assert qconfig is not None
                     weight_post_process = qconfig.weight()
                     # run weight observer
-                    weight_post_process(float_conv.weight)
+                    weight_post_process(float_conv.weight)  # type: ignore[operator]
                 weight_qparams = get_qparam_dict(weight_post_process)
-                _to_reference(float_conv, weight_qparams)
+                # hardcoded for now, TODO: expose the api to user,
+                # we can have a map from module to reference module
+                # and allow user to register new ones
+                qconv_cls = get_static_quant_module_class(
+                    type(float_conv), is_reference=is_reference)
+                ref_conv = qconv_cls.from_float(float_conv, weight_qparams)  # type: ignore[attr-defined]
+                # if the parent is a fused conv (Sequential), we can replace the first
+                # item to ref conv, otherwise we can update
+                # the conv instance in the module tree
+                if fused_conv is not None:
+                    fused_conv[0] = ref_conv
+                else:
+                    parent_name, name = _parent_name(self.conv_node.target)
+                    setattr(modules[parent_name], name, ref_conv)
                 op_out = quantized_graph.create_node(
                     'call_module',
                     self.conv_node.target,
diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py
index 03b177805bac3..6851ba7bd447d 100644
--- a/torch/quantization/quantization_mappings.py
+++ b/torch/quantization/quantization_mappings.py
@@ -7,7 +7,6 @@
 import torch.nn.intrinsic as nni
 import torch.nn.intrinsic.quantized as nniq
 import torch.nn.intrinsic.quantized.dynamic as nniqd
-import torch.nn.intrinsic.quantized._reference as nniqr
 import torch.nn.intrinsic.qat as nniqat
 import torch.nn.quantized as nnq
 import torch.nn.quantized._reference as nnqr
@@ -29,20 +28,6 @@
     nn.Conv1d: nnqr.Conv1d,
     nn.Conv2d: nnqr.Conv2d,
     nn.Conv3d: nnqr.Conv3d,
-    nni.ConvReLU1d: nniqr.ConvReLU1d,
-    nni.ConvReLU2d: nniqr.ConvReLU2d,
-    nni.ConvReLU3d: nniqr.ConvReLU3d,
-    # QAT Modules
-    nnqat.Conv2d: nnqr.Conv2d,
-    nnqat.Conv3d: nnqr.Conv3d,
-    nniqat.ConvBn1d: nnqr.Conv1d,
-    nniqat.ConvBn2d: nnqr.Conv2d,
-    nniqat.ConvBn3d: nnqr.Conv3d,
-    nniqat.ConvBnReLU1d: nniqr.ConvReLU1d,
-    nniqat.ConvBnReLU2d: nniqr.ConvReLU2d,
-    nniqat.ConvBnReLU3d: nniqr.ConvReLU3d,
-    nniqat.ConvReLU2d: nniqr.ConvReLU2d,
-    nniqat.ConvReLU3d: nniqr.ConvReLU3d,
 }
 
 # Default map for swapping float module to quantized ones

From e4fd2ab59ce8645f5ae9477c7724b6af82124b3b Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Mon, 30 Aug 2021 14:46:50 -0700
Subject: [PATCH 363/530] Back out "Added reference tests to ReductionOpInfo"
 (#64183)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64183

Original commit changeset: 6a1f82ac2819

Test Plan: CI

Reviewed By: soulitzer

Differential Revision: D30639835

fbshipit-source-id: e238043c6fbd0453317a9ed219e348298f98aaca
---
 aten/src/ATen/native/cpu/ReduceOpsKernel.cpp  |  29 ++-
 test/test_reductions.py                       | 111 +----------
 .../_internal/common_methods_invocations.py   | 179 +++++-------------
 3 files changed, 61 insertions(+), 258 deletions(-)

diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
index 01ed54e56fc73..89d2fb21fb511 100644
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -163,29 +163,24 @@ static void std_var_kernel_impl(TensorIterator& iter, int64_t correction, bool t
 }
 
 static void prod_kernel_impl(TensorIterator& iter) {
-  // Workaround for the error: '*' in boolean context, suggest '&&' instead
-  // [-Werror=int-in-bool-context]
+  // Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context]
   if (iter.dtype() == ScalarType::Bool) {
     using scalar_t = bool;
     binary_kernel_reduce_vec(
-        iter,
-        [=](scalar_t a, scalar_t b)
-            __ubsan_ignore_undefined__ -> scalar_t { return a && b; },
-        [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
-            __ubsan_ignore_undefined__ { return a && b; },
-        // NOLINTNEXTLINE(bugprone-argument-comment)
-        /*identity=*/1);
+      iter,
+      [=](scalar_t a, scalar_t b) -> scalar_t { return a && b; },
+      [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return a && b; },
+      // NOLINTNEXTLINE(bugprone-argument-comment)
+      /*identity=*/1);
   } else {
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] {
       binary_kernel_reduce_vec(
-          iter,
-          [=](scalar_t a, scalar_t b)
-              __ubsan_ignore_undefined__ -> scalar_t { return a * b; },
-          [=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
-              __ubsan_ignore_undefined__ { return a * b; },
-          // NOLINTNEXTLINE(bugprone-argument-comment)
-          /*identity=*/1);
-    });
+        iter,
+        [=](scalar_t a, scalar_t b) -> scalar_t { return a * b; },
+        [=](Vectorized <scalar_t> a, Vectorized <scalar_t> b) { return a * b; },
+        // NOLINTNEXTLINE(bugprone-argument-comment)
+        /*identity=*/1);
+      });
   }
 }
 
diff --git a/test/test_reductions.py b/test/test_reductions.py
index ca3042b66cf91..a9c667564d118 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -10,7 +10,7 @@
 
 from torch._six import inf, nan
 from torch.testing import (
-    integral_types_and, floating_and_complex_types_and, get_all_dtypes, make_tensor)
+    integral_types_and, floating_and_complex_types_and, make_tensor)
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict,
     IS_WINDOWS)
@@ -296,115 +296,6 @@ def test_empty_tensor_nonempty_slice(self, device, op: ReductionOpInfo):
             result = op(t, *args, dim=dim, **kwargs)
             self.assertEqual(result.shape, _reduced_shape(t.shape, dim))
 
-    def _test_noncontiguous(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_kwargs):
-        """Helper method to test noncontiguous input tensors."""
-        assert not t.is_contiguous()
-
-        t_contig = t.contiguous()
-        for args, kwargs in op.generate_args_kwargs(t_contig, **reduction_kwargs):
-            kwargs.update(reduction_kwargs)
-            result = op(t, *args, **kwargs)
-            expected = op(t_contig, *args, **kwargs)
-            self.assertEqual(result, expected)
-
-    @ops(reduction_ops)
-    def test_noncontiguous_innermost(self, device, dtype, op: ReductionOpInfo):
-        """Tests reducing along noncontiguous innermost dimension."""
-        t = make_tensor((10, 10), device, dtype)
-        self._test_noncontiguous(op, t[:, ::2], dim=1)
-
-    @ops(reduction_ops)
-    def test_noncontiguous_outermost(self, device, dtype, op: ReductionOpInfo):
-        """Tests reducing along noncontiguous outermost dimension."""
-        t = make_tensor((10, 10), device, dtype)
-        self._test_noncontiguous(op, t[::2, :], dim=0)
-
-    @ops(reduction_ops)
-    def test_noncontiguous_all(self, device, dtype, op: ReductionOpInfo):
-        """Tests reducing all dimensions of a noncontiguous tensor."""
-        t = make_tensor((5, 5, 5), device, dtype)
-        self._test_noncontiguous(op, t[::2, ::3, 1:-1:2])
-
-    @ops(reduction_ops)
-    def test_noncontiguous_transposed(self, device, dtype, op: ReductionOpInfo):
-        """Tests reducing a transposed tensor."""
-        t = make_tensor((5, 5), device, dtype)
-        self._test_noncontiguous(op, t.T)
-
-    @ops(reduction_ops)
-    def test_noncontiguous_expanded(self, device, dtype, op: ReductionOpInfo):
-        """Tests reducing a tensor with expanded singleton dimensions."""
-        t = make_tensor((2, 3), device, dtype)
-        self._test_noncontiguous(op, t.unsqueeze(1).expand(-1, 5, -1))
-
-    # NumPy does not support BFloat16 so we don't test that against reference
-    # implementations. We also don't compare dtypes or test for different
-    # keepdim because we already have other tests covering those.
-    # The test_reference_testing in test_ops.py only uses the samples from
-    # sample_inputs_func which do not test as exhaustively as these tests.
-
-    def _test_ref(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_kwargs):
-        """Compares op against op.ref for the given input and reduction kwargs"""
-        for args, kwargs in op.generate_args_kwargs(t, **reduction_kwargs):
-            kwargs.update(reduction_kwargs)
-            result = op(t, *args, **kwargs)
-            expected = op.ref(t.detach().cpu().numpy(), *args, **kwargs)
-            self.assertEqual(result, expected, exact_dtype=False)
-
-    @ops(filter(lambda op: op.ref is not None, reduction_ops),
-         allowed_dtypes=get_all_dtypes(include_bfloat16=False))
-    def test_ref_scalar_input(self, device, dtype, op: ReductionOpInfo):
-        """Compares op against reference for scalar input tensors"""
-        self._test_ref(op, make_tensor([], device, dtype))
-
-    @ops(filter(lambda op: op.ref is not None, reduction_ops),
-         allowed_dtypes=get_all_dtypes(include_bfloat16=False))
-    def test_ref_small_input(self, device, dtype, op: ReductionOpInfo):
-        """Compares op against reference for small input tensors"""
-        t = make_tensor((5, 3, 4, 2), device, dtype, exclude_zero=True)
-        self._test_ref(op, t)
-        for dim in [0, 1, 3] + ([[0, 2], [1, 3]] if op.supports_multiple_dims else []):
-            self._test_ref(op, t, dim=dim)
-
-    @ops(filter(lambda op: op.ref is not None, reduction_ops),
-         allowed_dtypes=[torch.float32])
-    def test_ref_large_input_1D(self, device, dtype, op: ReductionOpInfo):
-        """Compares op against reference for a large 1D input tensor to check stability"""
-        self._test_ref(op, make_tensor((2 ** 20,), device, dtype, low=-1, high=2, exclude_zero=True))
-
-    @ops(filter(lambda op: op.ref is not None, reduction_ops),
-         allowed_dtypes=[torch.float32])
-    def test_ref_large_input_2D(self, device, dtype, op: ReductionOpInfo):
-        """Compares op against reference for a large 2D input tensor to test parallelism"""
-        t = make_tensor((32, 2 ** 16), device, dtype, low=-1, high=2, exclude_zero=True)
-        self._test_ref(op, t, dim=1)
-
-    @ops(filter(lambda op: op.ref is not None, reduction_ops),
-         allowed_dtypes=[torch.float32])
-    def test_ref_large_input_64bit_indexing(self, device, dtype, op: ReductionOpInfo):
-        """Compares op against reference for a very large input tensor that requires 64 bit indexing"""
-        self._test_ref(op, make_tensor((275000000,), device, dtype, low=-1, high=2, exclude_zero=True))
-
-    @ops(filter(lambda op: op.ref is not None, reduction_ops),
-         allowed_dtypes=get_all_dtypes(include_bfloat16=False))
-    def test_ref_duplicate_values(self, device, dtype, op: ReductionOpInfo):
-        """Compares op against reference for input tensors with duplicate values"""
-        t = make_tensor((8, 8), device, dtype, exclude_zero=True)
-        t[::2, ::2] = t[1::2, 1::2]
-        self._test_ref(op, t)
-        self._test_ref(op, t, dim=0)
-        self._test_ref(op, t, dim=1)
-
-    @ops(filter(lambda op: op.ref is not None, reduction_ops),
-         allowed_dtypes=[torch.float32, torch.complex64])
-    def test_ref_extremal_values(self, device, dtype, op: ReductionOpInfo):
-        """Compares op against reference for input tensors with extremal values"""
-        t = make_tensor((10,), device, dtype, exclude_zero=True)
-        extremals = [0, 1] + [nan, inf, -inf] if torch.is_floating_point(t) else []
-        for extremal in extremals:
-            t[5] = extremal
-            self._test_ref(op, t)
-
     ###########################################################################
     # TODO: Legacy tests - port to ReductionOpInfo
     ###########################################################################
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index a3d61b477b4a4..1349a29e9d7fe 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -187,8 +187,6 @@ def _np(t):
                 return tuple(map(to_numpy, x))
             elif isinstance(x, dict):
                 return {k: to_numpy(v) for k, v in x.items()}
-            elif isinstance(x, torch.dtype):
-                return torch.empty(0, dtype=x).numpy().dtype
             elif isinstance(x, (numbers.Number, bool, str)):
                 return x
 
@@ -784,8 +782,8 @@ def _generate_reduction_inputs(device, dtype, requires_grad):
     """Generates input tensors for testing reduction operators"""
     yield make_tensor([], device, dtype, requires_grad=requires_grad)
     yield make_tensor([2], device, dtype, requires_grad=requires_grad)
-    yield make_tensor([3, 5], device, dtype, requires_grad=requires_grad, noncontiguous=True)
-    yield make_tensor([3, 2, 1, 2], device, dtype, requires_grad=requires_grad)
+    yield make_tensor([2, 3], device, dtype, requires_grad=requires_grad, noncontiguous=True)
+    yield make_tensor([3, 2, 1, 5], device, dtype, requires_grad=requires_grad)
 
 
 def _generate_reduction_kwargs(ndim, supports_multiple_dims=True):
@@ -929,8 +927,6 @@ def sample_inputs_func(*args, **kwargs):
         # Override OpInfo defaults and call base class __init__
         kwargs.setdefault('inplace_variant', None)
         kwargs.setdefault('sample_inputs_func', sample_inputs_func)
-        kwargs.setdefault('default_test_dtypes', (
-            torch.uint8, torch.int64, torch.float16, torch.bfloat16, torch.float32, torch.complex64))
         super(ReductionOpInfo, self).__init__(name, **kwargs)
 
         self.identity = identity
@@ -4084,6 +4080,38 @@ def generator():
 
     return list(generator())
 
+def sample_inputs_prod(op_info, device, dtype, requires_grad):
+    def make_arg(shape):
+        # shrink values to be in the interval [-1, +1] for better precision in gradgradcheck
+        return make_tensor(shape, device, dtype, low=-1, high=+1, requires_grad=requires_grad)
+
+    def prod_single_zero():
+        result = make_arg(2 * (S,))
+        with torch.no_grad():
+            result[0, 1] = 0
+        return result
+
+    # will not be needed once OpInfo tests support Iterables
+    def sample_generator():
+        for sample in sample_inputs_cumprod(op_info, device, dtype, requires_grad):
+            yield SampleInput(sample.input)  # only Tensor, ignore other inputs
+            yield sample
+            sample.kwargs['keepdim'] = True
+            yield sample
+        yield SampleInput(prod_single_zero())
+        yield SampleInput(make_arg((3, 3, 3)), args=(1,))
+        yield SampleInput(make_arg((3, 3, 3)), args=(1,), kwargs={'keepdim': True})
+
+        # test zero scalar tensor
+        zero = make_arg(())
+        with torch.no_grad():
+            zero.zero_()
+        yield SampleInput(zero)
+        yield SampleInput(zero, args=(0,))
+        yield SampleInput(zero, args=(0,), kwargs={'keepdim': True})
+
+    return list(sample_generator())
+
 
 def sample_inputs_nextafter(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -5493,53 +5521,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
     return op(input.triu() if upper else input.tril(), upper)
 
 
-def reference_reduction_numpy(f, supports_keepdims=True):
-    """Wraps a NumPy reduction operator.
-
-    The wrapper function will forward dim and keepdim kwargs to the wrapped
-    function as the NumPy equivalent axis and keepdims kwargs.
-
-    Args:
-        f: NumPy reduction operator to wrap
-        supports_keepdims (bool, optional): Whether the NumPy operator accepts
-            keepdims parameter. If it does not, the wrapper will manually unsqueeze
-            the reduced dimensions if it was called with keepdim=True. Defaults to True.
-
-    Returns:
-        Wrapped function
-    """
-    @wraps(f)
-    def wrapper(x: np.ndarray, *args, **kwargs):
-        # Copy keys into a set
-        keys = set(kwargs.keys())
-
-        dim = kwargs.pop('dim', None)
-        keepdim = kwargs.pop('keepdim', False)
-
-        if 'dim' in keys:
-            if x.ndim == 0:
-                # NumPy reductions don't accept dim=0 for scalar inputs
-                for i in dim if isinstance(dim, tuple) else (dim,):
-                    assert i in {0, -1}
-                kwargs['axis'] = None
-            else:
-                kwargs['axis'] = tuple(dim) if isinstance(dim, Sequence) else dim
-
-        if 'keepdim' in keys and supports_keepdims:
-            kwargs['keepdims'] = keepdim
-
-        result = f(x, *args, **kwargs)
-
-        # Unsqueeze reduced dimensions if NumPy does not support keepdims
-        if keepdim and not supports_keepdims and x.ndim > 0:
-            dim = list(range(x.ndim)) if dim is None else dim
-            result = np.expand_dims(result, dim)
-
-        return result
-
-    return wrapper
-
-
 # Operator database (sorted alphabetically)
 op_db: List[OpInfo] = [
     UnaryUfuncInfo('abs',
@@ -7058,6 +7039,15 @@ def wrapper(x: np.ndarray, *args, **kwargs):
            supports_out=False,
            supports_forward_ad=True,
            sample_inputs_func=sample_inputs_max_min_reduction_no_dim,),
+    # TODO(@heitorschueroff) Add test for dtype kwarg
+    OpInfo('mean',
+           dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
+           assert_autodiffed=True,
+           supports_forward_ad=True,
+           sample_inputs_func=sample_inputs_reduction,
+           # Need to skip out test because one of the overload for mean does not support it
+           # TODO(@heitorschueroff) fix this when implementing ReductionInfo
+           skips=(SkipInfo('TestCommon', 'test_out'),)),
     OpInfo('quantile',
            dtypes=floating_types(),
            sample_inputs_func=sample_inputs_reduction_quantile),
@@ -8900,7 +8890,6 @@ def wrapper(x: np.ndarray, *args, **kwargs):
         supports_autograd=False,
         result_dtype=torch.bool,
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
-        ref=reference_reduction_numpy(np.all),
         skips=(
             # FIXME: does not support passing keepdim without dim
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -8908,8 +8897,7 @@ def wrapper(x: np.ndarray, *args, **kwargs):
             SkipInfo('TestReductions', 'test_dim_none'),
             SkipInfo('TestReductions', 'test_dim_none_keepdim'),
             # FIXME: uint8 input returns uint8 instead of bool
-            SkipInfo('TestReductions', 'test_result_dtype',
-                     dtypes=[torch.uint8]),
+            SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]),
         ),
     ),
     ReductionOpInfo(
@@ -8920,7 +8908,6 @@ def wrapper(x: np.ndarray, *args, **kwargs):
         supports_autograd=False,
         result_dtype=torch.bool,
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
-        ref=reference_reduction_numpy(np.any),
         skips=(
             # FIXME: does not support passing keepdim without dim
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -8928,15 +8915,14 @@ def wrapper(x: np.ndarray, *args, **kwargs):
             SkipInfo('TestReductions', 'test_dim_none'),
             SkipInfo('TestReductions', 'test_dim_none_keepdim'),
             # FIXME: uint8 input returns uint8 instead of bool
-            SkipInfo('TestReductions', 'test_result_dtype',
-                     dtypes=[torch.uint8]),
+            SkipInfo('TestReductions', 'test_result_dtype', dtypes=[torch.uint8]),
         ),
     ),
     ReductionOpInfo(
         'amax',
         nan_policy='propagate',
         dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
-        ref=reference_reduction_numpy(np.amax),
+        ref=lambda a, dim=None, keepdim=False, **kwargs: np.amax(a, axis=dim, keepdims=keepdim, **kwargs),
         skips=(
             # FIXME: sum reduces all dimensions when dim=[]
             SkipInfo('TestReductions', 'test_dim_empty'),
@@ -8947,7 +8933,7 @@ def wrapper(x: np.ndarray, *args, **kwargs):
         'amin',
         nan_policy='propagate',
         dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
-        ref=reference_reduction_numpy(np.amin),
+        ref=lambda a, dim=None, keepdim=False, **kwargs: np.amin(a, axis=dim, keepdims=keepdim, **kwargs),
         skips=(
             # FIXME: sum reduces all dimensions when dim=[]
             SkipInfo('TestReductions', 'test_dim_empty'),
@@ -8960,7 +8946,6 @@ def wrapper(x: np.ndarray, *args, **kwargs):
         supports_autograd=False,
         result_dtype=torch.int64,
         dtypes=all_types_and(torch.float16, torch.bfloat16),
-        ref=reference_reduction_numpy(np.argmax, supports_keepdims=False),
         skips=(
             # FIXME: keepdim parameter is ignored when dim=None
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -8973,7 +8958,6 @@ def wrapper(x: np.ndarray, *args, **kwargs):
         supports_autograd=False,
         result_dtype=torch.int64,
         dtypes=all_types_and(torch.float16, torch.bfloat16),
-        ref=reference_reduction_numpy(np.argmin, supports_keepdims=False),
         skips=(
             # FIXME: keepdim parameter is ignored when dim=None
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -8988,7 +8972,6 @@ def wrapper(x: np.ndarray, *args, **kwargs):
         result_dtype=torch.int64,
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
         sample_inputs_func=sample_inputs_reduction_count_nonzero,
-        ref=reference_reduction_numpy(np.count_nonzero),
         skips=(
             # FIXME: count_nonzero does not accept keepdim kwarg
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -9002,35 +8985,6 @@ def wrapper(x: np.ndarray, *args, **kwargs):
             SkipInfo('TestReductions', 'test_dim_empty'),
         ),
     ),
-    ReductionOpInfo(
-        'mean',
-        nan_policy='propagate',
-        supports_out=False,
-        supports_forward_ad=True,
-        assert_autodiffed=True,
-        promotes_int_to_float=True,
-        dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
-        ref=reference_reduction_numpy(np.mean),
-        decorators=(
-            # FIXME: fix precision
-            DecorateInfo(toleranceOverride({
-                torch.float16: tol(atol=1e-05, rtol=1e-02),
-            }), 'TestReductions', 'test_noncontiguous_all'),
-            DecorateInfo(toleranceOverride({
-                torch.float16: tol(atol=1e-05, rtol=1e-02),
-            }), 'TestReductions', 'test_ref_small_input'),
-        ),
-        skips=(
-            # FIXME: prod does not support passing keepdim without passing dim
-            SkipInfo('TestReductions', 'test_dim_default_keepdim'),
-            # FIXME: prod reduces all dimensions when dim=[]
-            SkipInfo('TestReductions', 'test_dim_empty'),
-            SkipInfo('TestReductions', 'test_dim_empty_keepdim'),
-            # FIXME: prod does not support passing None to dim
-            SkipInfo('TestReductions', 'test_dim_none'),
-            SkipInfo('TestReductions', 'test_dim_none_keepdim'),
-        ),
-    ),
     ReductionOpInfo(
         'prod',
         identity=1,
@@ -9041,7 +8995,7 @@ def wrapper(x: np.ndarray, *args, **kwargs):
         gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
         dtypes=all_types_and_complex_and(torch.bool),
         dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
-        ref=reference_reduction_numpy(np.prod),
+        sample_inputs_func=sample_inputs_prod,
         skips=(
             # FIXME: prod does not support passing keepdim without passing dim
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -9051,11 +9005,6 @@ def wrapper(x: np.ndarray, *args, **kwargs):
             # FIXME: prod does not support passing None to dim
             SkipInfo('TestReductions', 'test_dim_none'),
             SkipInfo('TestReductions', 'test_dim_none_keepdim'),
-            # FIXME: improve precision, failing with nan != inf
-            SkipInfo('TestReductions', 'test_ref_small_input',
-                     dtypes=[torch.float16, torch.complex64]),
-            SkipInfo('TestReductions', 'test_ref_duplicate_values',
-                     dtypes=[torch.uint8, torch.float16, torch.complex64]),
         ),
     ),
     ReductionOpInfo(
@@ -9066,22 +9015,6 @@ def wrapper(x: np.ndarray, *args, **kwargs):
         supports_forward_ad=True,
         promotes_int_to_int64=True,
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
-        ref=reference_reduction_numpy(np.sum),
-        decorators=(
-            # FIXME: fix precision
-            DecorateInfo(toleranceOverride({
-                torch.float16: tol(atol=1e-05, rtol=1e-02),
-            }), 'TestReductions', 'test_noncontiguous_all'),
-            DecorateInfo(toleranceOverride({
-                torch.float16: tol(atol=1e-03, rtol=1e-02),
-            }), 'TestReductions', 'test_ref_small_input'),
-            DecorateInfo(toleranceOverride({
-                torch.float32: tol(atol=1e-03, rtol=1e-03),
-            }), 'TestReductions', 'test_ref_large_input_64bit_indexing'),
-            DecorateInfo(toleranceOverride({
-                torch.float16: tol(atol=1e-05, rtol=1e-02),
-            }), 'TestReductions', 'test_ref_duplicate_values'),
-        ),
         skips=(
             # FIXME: sum does not support passing keepdim without passing dim
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),
@@ -9100,22 +9033,6 @@ def wrapper(x: np.ndarray, *args, **kwargs):
         supports_out=False,
         promotes_int_to_int64=True,
         dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16),
-        ref=reference_reduction_numpy(np.nansum),
-        decorators=(
-            # FIXME: fix precision
-            DecorateInfo(toleranceOverride({
-                torch.float16: tol(atol=1e-05, rtol=1e-02),
-            }), 'TestReductions', 'test_noncontiguous_all'),
-            DecorateInfo(toleranceOverride({
-                torch.float16: tol(atol=1e-03, rtol=1e-02),
-            }), 'TestReductions', 'test_ref_small_input'),
-            DecorateInfo(toleranceOverride({
-                torch.float32: tol(atol=1e-03, rtol=1e-03),
-            }), 'TestReductions', 'test_ref_large_input_64bit_indexing'),
-            DecorateInfo(toleranceOverride({
-                torch.float16: tol(atol=1e-05, rtol=1e-02),
-            }), 'TestReductions', 'test_ref_duplicate_values'),
-        ),
         skips=(
             # FIXME: nansum does not support passing keepdim without passing dim
             SkipInfo('TestReductions', 'test_dim_default_keepdim'),

From c3464e78a461c6275e9fbbe3dfa72ca3983cb4df Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Mon, 30 Aug 2021 14:56:35 -0700
Subject: [PATCH 364/530] Revert D30561459: Fix bytes_written and bytes_read

Test Plan: revert-hammer

Differential Revision:
D30561459 (https://github.com/pytorch/pytorch/commit/e98173ff3423247c597e21c923c8f47470ef07ab)

Original commit changeset: 976fa5167097

fbshipit-source-id: 43f4c234ca400820fe6db5b4f37a25e14dc4b0dd
---
 caffe2/core/operator_schema.h                 |  17 +--
 caffe2/operators/batch_matmul_op.cc           | 113 ++++++++----------
 caffe2/operators/concat_split_op.cc           |  15 +--
 caffe2/operators/conv_pool_op_base.h          |  15 +--
 caffe2/operators/distance_op.cc               |  28 ++---
 caffe2/operators/fc_inference.cc              |  22 ++--
 caffe2/operators/one_hot_ops.cc               |  30 ++---
 caffe2/operators/utility_ops.cc               |  13 +-
 .../operator_test/concat_op_cost_test.py      |  54 ++++-----
 caffe2/python/workspace_test.py               |   2 +-
 caffe2/sgd/adagrad_op.cc                      |  55 +++------
 11 files changed, 140 insertions(+), 224 deletions(-)

diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h
index 0d048eb8d26e9..64f5ef3ed883a 100644
--- a/caffe2/core/operator_schema.h
+++ b/caffe2/core/operator_schema.h
@@ -6,13 +6,12 @@
 #include <initializer_list>
 #include <ostream>
 #include <set>
-#include <unordered_map>
 #include <vector>
+#include <unordered_map>
 
 #include "c10/util/Registry.h"
 #include "caffe2/core/common.h"
 #include "caffe2/core/logging.h"
-#include "caffe2/core/types.h"
 #include "caffe2/proto/caffe2_pb.h"
 #include "caffe2/utils/filler.h"
 #include "caffe2/utils/proto_utils.h"
@@ -274,8 +273,8 @@ class TORCH_API OpSchema {
   OpSchema&
   Arg(const char* name, const char* description, bool required = false);
 
-#define DECLARE_STANDARD_ARG(name, str) \
-  static const char* Arg_##name;        \
+#define DECLARE_STANDARD_ARG(name, str)     \
+  static const char* Arg_##name; \
   OpSchema& Arg##name(const char* description);
 
   DECLARE_STANDARD_ARG(IsTest, is_test)
@@ -340,9 +339,7 @@ class TORCH_API OpSchema {
     return inplace_enforced_(x, y);
   }
 
-  TORCH_API friend std::ostream& operator<<(
-      std::ostream& out,
-      const OpSchema& schema);
+  TORCH_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema);
 
   const std::vector<Argument>& args() const {
     return args_;
@@ -565,10 +562,8 @@ OpSchema::Cost PointwiseCostInference(
   }
 
   c.flops = nElemX * OpsPerPoint;
-  auto const& X_element_size_byte =
-      DataTypeToTypeMeta(X.data_type()).itemsize();
-  c.bytes_read = nElemRead * X_element_size_byte;
-  c.bytes_written = nElemX * X_element_size_byte;
+  c.bytes_read = nElemRead * sizeof(X.data_type());
+  c.bytes_written = nElemX * sizeof(X.data_type());
   return c;
 }
 
diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc
index 205acf74f1572..32799ced10671 100644
--- a/caffe2/operators/batch_matmul_op.cc
+++ b/caffe2/operators/batch_matmul_op.cc
@@ -1,7 +1,6 @@
 #include "caffe2/operators/batch_matmul_op.h"
 
 #include "caffe2/core/operator_schema.h"
-#include "caffe2/core/types.h"
 
 namespace caffe2 {
 
@@ -117,13 +116,9 @@ OpSchema::Cost CostInferenceForBatchMatMul(
     K = in[0].dims(ndims_A - 1);
   }
 
-  auto const& A_element_size_byte =
-      DataTypeToTypeMeta(A.data_type()).itemsize();
-  auto const& Y_element_size_byte =
-      DataTypeToTypeMeta(Y.data_type()).itemsize();
   c.flops = 2 * nElemY * K;
-  c.bytes_read = (nElemA + nElemB) * A_element_size_byte;
-  c.bytes_written = nElemY * Y_element_size_byte;
+  c.bytes_read = (nElemA + nElemB) * sizeof(A.data_type());
+  c.bytes_written = nElemY * sizeof(Y.data_type());
   c.params_bytes = 0;
   return c;
 }
@@ -185,76 +180,72 @@ class GetBatchMatMulGradient : public GradientMakerBase {
     auto no_trans_arg = vector<Argument>();
     auto trans_a_arg = vector<Argument>{MakeArgument<int>("trans_a", 1)};
     auto trans_b_arg = vector<Argument>{MakeArgument<int>("trans_b", 1)};
-    auto trans_both_arg = vector<Argument>{
-        MakeArgument<int>("trans_a", 1), MakeArgument<int>("trans_b", 1)};
+    auto trans_both_arg = vector<Argument>{MakeArgument<int>("trans_a", 1),
+                                           MakeArgument<int>("trans_b", 1)};
 
     if (trans_a) {
       if (trans_b) {
         // A'B':
         // dA = B'G', dB = G'A'
-        return vector<OperatorDef>{
-            CreateOperatorDef(
-                "BatchMatMul",
-                "",
-                vector<string>{I(1), GO(0)},
-                vector<string>{GI(0)},
-                trans_both_arg),
-            CreateOperatorDef(
-                "BatchMatMul",
-                "",
-                vector<string>{GO(0), I(0)},
-                vector<string>{GI(1)},
-                trans_both_arg)};
+        return vector<OperatorDef>{CreateOperatorDef(
+                                       "BatchMatMul",
+                                       "",
+                                       vector<string>{I(1), GO(0)},
+                                       vector<string>{GI(0)},
+                                       trans_both_arg),
+                                   CreateOperatorDef(
+                                       "BatchMatMul",
+                                       "",
+                                       vector<string>{GO(0), I(0)},
+                                       vector<string>{GI(1)},
+                                       trans_both_arg)};
       } else {
         // A'B:
         // dA = BG', dB = AG
-        return vector<OperatorDef>{
-            CreateOperatorDef(
-                "BatchMatMul",
-                "",
-                vector<string>{I(1), GO(0)},
-                vector<string>{GI(0)},
-                trans_b_arg),
-            CreateOperatorDef(
-                "BatchMatMul",
-                "",
-                vector<string>{I(0), GO(0)},
-                vector<string>{GI(1)},
-                no_trans_arg)};
+        return vector<OperatorDef>{CreateOperatorDef(
+                                       "BatchMatMul",
+                                       "",
+                                       vector<string>{I(1), GO(0)},
+                                       vector<string>{GI(0)},
+                                       trans_b_arg),
+                                   CreateOperatorDef(
+                                       "BatchMatMul",
+                                       "",
+                                       vector<string>{I(0), GO(0)},
+                                       vector<string>{GI(1)},
+                                       no_trans_arg)};
       }
     } else {
       if (trans_b) {
         // AB':
         // dA = GB, dB = G'A
-        return vector<OperatorDef>{
-            CreateOperatorDef(
-                "BatchMatMul",
-                "",
-                vector<string>{GO(0), I(1)},
-                vector<string>{GI(0)},
-                no_trans_arg),
-            CreateOperatorDef(
-                "BatchMatMul",
-                "",
-                vector<string>{GO(0), I(0)},
-                vector<string>{GI(1)},
-                trans_a_arg)};
+        return vector<OperatorDef>{CreateOperatorDef(
+                                       "BatchMatMul",
+                                       "",
+                                       vector<string>{GO(0), I(1)},
+                                       vector<string>{GI(0)},
+                                       no_trans_arg),
+                                   CreateOperatorDef(
+                                       "BatchMatMul",
+                                       "",
+                                       vector<string>{GO(0), I(0)},
+                                       vector<string>{GI(1)},
+                                       trans_a_arg)};
       } else {
         // AB:
         // dA = GB', dB = A'G
-        return vector<OperatorDef>{
-            CreateOperatorDef(
-                "BatchMatMul",
-                "",
-                vector<string>{GO(0), I(1)},
-                vector<string>{GI(0)},
-                trans_b_arg),
-            CreateOperatorDef(
-                "BatchMatMul",
-                "",
-                vector<string>{I(0), GO(0)},
-                vector<string>{GI(1)},
-                trans_a_arg)};
+        return vector<OperatorDef>{CreateOperatorDef(
+                                       "BatchMatMul",
+                                       "",
+                                       vector<string>{GO(0), I(1)},
+                                       vector<string>{GI(0)},
+                                       trans_b_arg),
+                                   CreateOperatorDef(
+                                       "BatchMatMul",
+                                       "",
+                                       vector<string>{I(0), GO(0)},
+                                       vector<string>{GI(1)},
+                                       trans_a_arg)};
       }
     }
   }
diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc
index 8aa9e282adb84..8eceb5ab4a577 100644
--- a/caffe2/operators/concat_split_op.cc
+++ b/caffe2/operators/concat_split_op.cc
@@ -101,12 +101,9 @@ OpSchema::Cost CostInferenceForSplit(
   CAFFE_ENFORCE_GT(in.size(), 0);
   struct OpSchema::Cost cost;
   cost.flops = 0;
-  auto const& input_0_element_size_byte =
-      DataTypeToTypeMeta(in[0].data_type()).itemsize();
-  auto const& input_1_element_size_byte =
-      (in.size() > 1) ? DataTypeToTypeMeta(in[1].data_type()).itemsize() : 0;
-  auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte;
-  auto split_bytes_count = nElemFromDim(in[1]) * input_1_element_size_byte;
+  auto input_bytes_count = nElemFromDim(in[0]) * sizeof(in[0].data_type());
+  auto split_bytes_count =
+      (in.size() == 1) ? 0 : nElemFromDim(in[1]) * sizeof(in[1].data_type());
   // There can be two input blobs:
   // (1) actual tensor to be split
   // (2) lengths of outputs along split axis
@@ -332,13 +329,11 @@ OpSchema::Cost CostInferenceForConcat(
   }
   auto split_info_bytes_count = in.size() * sizeof(int);
 
-  auto const& input_0_element_size_byte =
-      DataTypeToTypeMeta(in[0].data_type()).itemsize();
   struct OpSchema::Cost cost;
   cost.flops = 0;
-  cost.bytes_read = nElemRead * input_0_element_size_byte;
+  cost.bytes_read = nElemRead * sizeof(in[0].data_type());
   cost.bytes_written =
-      size * input_0_element_size_byte + split_info_bytes_count;
+      size * sizeof(in[0].data_type()) + split_info_bytes_count;
   cost.params_bytes = 0;
   return cost;
 }
diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h
index b356ef952d79c..25bd99a92e50f 100644
--- a/caffe2/operators/conv_pool_op_base.h
+++ b/caffe2/operators/conv_pool_op_base.h
@@ -7,7 +7,6 @@
 #include "caffe2/core/context.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
-#include "caffe2/core/types.h"
 #include "caffe2/proto/caffe2_legacy.pb.h"
 #include "caffe2/utils/math.h"
 
@@ -520,20 +519,14 @@ class ConvPoolOpBase : public Operator<Context> {
     uint64_t nElemW = nElemFromDim(W);
     uint64_t nElemBias = inputs.size() > 2 ? nElemFromDim(inputs[2]) : 0;
 
-    auto const& X_elemenet_size_byte =
-        DataTypeToTypeMeta(X.data_type()).itemsize();
-    auto const& Y_element_size_byte =
-        DataTypeToTypeMeta(Y.data_type()).itemsize();
-    auto const& W_element_size_byte =
-        DataTypeToTypeMeta(W.data_type()).itemsize();
-
     // grouping is NOT properly handled yet
     c.flops = N * Y_t * Y_h * Y_w * kernel_t * kernel_w * kernel_h *
         in_channels * out_channels * 2;
-    c.bytes_read = (nElemX + nElemW + nElemBias) * X_elemenet_size_byte;
-    c.bytes_written = N * out_channels * Y_t * Y_h * Y_w * Y_element_size_byte;
+    c.bytes_read = (nElemX + nElemW + nElemBias) * sizeof(X.data_type());
+    c.bytes_written =
+        N * out_channels * Y_t * Y_h * Y_w * sizeof(Y.data_type());
     c.params_bytes = out_channels * in_channels * kernel_t * kernel_h *
-        kernel_w * W_element_size_byte;
+        kernel_w * sizeof(W.data_type());
     return c;
   }
 
diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc
index 9ea8eea5a2725..1529534d8fb2e 100644
--- a/caffe2/operators/distance_op.cc
+++ b/caffe2/operators/distance_op.cc
@@ -1,5 +1,4 @@
 #include "caffe2/operators/distance_op.h"
-#include "caffe2/core/types.h"
 #include "caffe2/utils/eigen_utils.h"
 #ifdef CAFFE2_USE_MKLDNN
 #include <caffe2/ideep/operators/operator_fallback_ideep.h>
@@ -8,7 +7,7 @@
 
 namespace caffe2 {
 
-template <>
+template<>
 bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
   auto& X = Input(0);
   auto& Y = Input(1);
@@ -258,9 +257,7 @@ OpSchema::Cost CostInferenceForDotProduct(
   CAFFE_ENFORCE_EQ(out[0].dims().size(), 1);
 
   struct OpSchema::Cost c = PointwiseCostInference<2>(def, in);
-  auto const& out_0_element_size_byte =
-      DataTypeToTypeMeta(out[0].data_type()).itemsize();
-  c.bytes_written = out[0].dims(0) * out_0_element_size_byte;
+  c.bytes_written = out[0].dims(0) * sizeof(out[0].data_type());
   c.params_bytes = 0;
   return c;
 }
@@ -382,12 +379,10 @@ bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
 }
 
 // L2
-REGISTER_CPU_OPERATOR(
-    SquaredL2Distance,
-    SquaredL2DistanceOp<float, CPUContext>);
-REGISTER_CPU_OPERATOR(
-    SquaredL2DistanceGradient,
-    SquaredL2DistanceGradientOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(SquaredL2Distance,
+                      SquaredL2DistanceOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
+                      SquaredL2DistanceGradientOp<float, CPUContext>);
 
 OPERATOR_SCHEMA(SquaredL2Distance)
     .NumInputs(2)
@@ -407,8 +402,7 @@ class GetSquaredL2DistanceGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     return SingleGradientDef(
-        "SquaredL2DistanceGradient",
-        "",
+        "SquaredL2DistanceGradient", "",
         vector<string>{I(0), I(1), GO(0)},
         vector<string>{GI(0), GI(1)});
   }
@@ -768,9 +762,9 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
       replicate = GetArgument(Def(), "replicate").i();
     }
 
-    const auto dot_arg = vector<Argument>{
-        MakeArgument<float>("pad_value", pad_value),
-        MakeArgument<bool>("replicate", replicate)};
+    const auto dot_arg =
+        vector<Argument>{MakeArgument<float>("pad_value", pad_value),
+                         MakeArgument<bool>("replicate", replicate)};
 
     return SingleGradientDef(
         "DotProductWithPaddingGradient",
@@ -781,4 +775,4 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
   }
 };
 REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient);
-} // namespace caffe2
+}  // namespace caffe2
diff --git a/caffe2/operators/fc_inference.cc b/caffe2/operators/fc_inference.cc
index ba1b7122cdc9d..a44c230980c7f 100644
--- a/caffe2/operators/fc_inference.cc
+++ b/caffe2/operators/fc_inference.cc
@@ -1,5 +1,4 @@
 #include "caffe2/operators/fc_inference.h"
-#include "caffe2/core/types.h"
 
 namespace caffe2 {
 std::vector<TensorShape> FCShapeInference(
@@ -52,12 +51,11 @@ OpSchema::Cost CostInferenceForFC(
       ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
       : size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));
 
-  auto const& X_element_size_byte =
-      DataTypeToTypeMeta(in[0].data_type()).itemsize();
+  const auto& X = in[0];
   c.flops = M * N * (2 * K + 1);
-  c.bytes_read = (K * (M + N) + N) * X_element_size_byte;
-  c.bytes_written = M * N * X_element_size_byte;
-  c.params_bytes = (K * N + N) * X_element_size_byte;
+  c.bytes_read = (K * (M + N) + N) * sizeof(X.data_type());
+  c.bytes_written = M * N * sizeof(X.data_type());
+  c.params_bytes = (K * N + N) * sizeof(X.data_type());
   return c;
 }
 
@@ -96,11 +94,7 @@ OpSchema::Cost CostInferenceForFCGradient(
 
   CAFFE_ENFORCE_LT(0, out.size());
   const TensorShape dW = out[0];
-  auto const& dW_element_size_byte =
-      DataTypeToTypeMeta(dW.data_type()).itemsize();
   const TensorShape db = out[1];
-  auto const& db_element_size_byte =
-      DataTypeToTypeMeta(db.data_type()).itemsize();
 
   auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
   const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
@@ -117,17 +111,15 @@ OpSchema::Cost CostInferenceForFCGradient(
   uint64_t size_db = nElemFromDim(db);
 
   c.flops = M * N * (2 * K + 1);
-  c.bytes_written =
-      size_dW * dW_element_size_byte + size_db * db_element_size_byte;
+  c.bytes_written = (size_dW + size_db) * sizeof(float);
   c.params_bytes = (K * N + N) * sizeof(float);
 
   if (out.size() == 3) {
     const TensorShape dX = out[2];
     uint64_t size_dX = nElemFromDim(dX);
-    auto const& dX_element_size_byte =
-        DataTypeToTypeMeta(dX.data_type()).itemsize();
+
     c.flops += 2 * M * N * K;
-    c.bytes_written += size_dX * dX_element_size_byte;
+    c.bytes_written += size_dX * sizeof(float);
   }
   return c;
 }
diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc
index 55c73a5be22c4..c3eaf05db0e8f 100644
--- a/caffe2/operators/one_hot_ops.cc
+++ b/caffe2/operators/one_hot_ops.cc
@@ -2,7 +2,6 @@
 
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
-#include "caffe2/core/types.h"
 
 namespace caffe2 {
 
@@ -79,21 +78,12 @@ OpSchema::Cost CostInferenceForBatchOneHot(
   const auto& length = in[1];
   const auto& values = in[2];
 
-  auto const& data_element_size_byte =
-      DataTypeToTypeMeta(data.data_type()).itemsize();
-  auto const& length_element_size_byte =
-      DataTypeToTypeMeta(length.data_type()).itemsize();
-  auto const& values_element_size_byte =
-      DataTypeToTypeMeta(values.data_type()).itemsize();
-  auto const& output_element_size_byte =
-      DataTypeToTypeMeta(output.data_type()).itemsize();
-
-  uint64_t nBytesData = nElemFromDim(data) * data_element_size_byte;
-  uint64_t nBytesLength = nElemFromDim(length) * length_element_size_byte;
-  uint64_t nBytesValues = nElemFromDim(values) * values_element_size_byte;
+  uint64_t nBytesData = nElemFromDim(data) * sizeof(data.data_type());
+  uint64_t nBytesLength = nElemFromDim(length) * sizeof(length.data_type());
+  uint64_t nBytesValues = nElemFromDim(values) * sizeof(values.data_type());
   c.flops = 0;
   c.bytes_read = nBytesData + nBytesLength + nBytesValues;
-  c.bytes_written = nElemFromDim(output) * output_element_size_byte;
+  c.bytes_written = nElemFromDim(output) * sizeof(output.data_type());
   c.params_bytes = 0;
   return c;
 }
@@ -155,15 +145,15 @@ bool BatchBucketOneHotOp<CPUContext>::RunOnDevice() {
     for (int64_t j = 0; j < D; j++) {
       // here we assume the boundary values for each feature are sorted
       int64_t lower_bucket_idx = std::lower_bound(
-                                     boundaries_offset,
-                                     boundaries_offset + lens_data[j],
-                                     input_data[pos]) -
+                                    boundaries_offset,
+                                    boundaries_offset + lens_data[j],
+                                    input_data[pos]) -
           boundaries_offset;
 
       int64_t upper_bucket_idx = std::upper_bound(
-                                     boundaries_offset,
-                                     boundaries_offset + lens_data[j],
-                                     input_data[pos]) -
+                                    boundaries_offset,
+                                    boundaries_offset + lens_data[j],
+                                    input_data[pos]) -
           boundaries_offset;
 
       int64_t bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2;
diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc
index 561da9189b388..8b5e116024b81 100644
--- a/caffe2/operators/utility_ops.cc
+++ b/caffe2/operators/utility_ops.cc
@@ -1,7 +1,6 @@
 #include "caffe2/operators/utility_ops.h"
 #include <cmath>
 #include <iostream>
-#include "caffe2/core/types.h"
 #include "caffe2/utils/eigen_utils.h"
 
 namespace caffe2 {
@@ -35,11 +34,9 @@ OpSchema::Cost CostInferenceForWeightedSum(
   const auto& nElem = nElemFromDim(X0);
   const auto& nInputs = in.size();
   c.flops = (nInputs - 1) * nElem;
-  auto const& X0_element_size_byte =
-      DataTypeToTypeMeta(X0.data_type()).itemsize();
-  c.bytes_read = (nInputs / 2) * (nElem + 1) * X0_element_size_byte;
-  c.bytes_written = nElem * X0_element_size_byte;
-  c.params_bytes = (nInputs / 2) * X0_element_size_byte;
+  c.bytes_read = (nInputs / 2) * (nElem + 1) * sizeof(X0.data_type());
+  c.bytes_written = nElem * sizeof(X0.data_type());
+  c.params_bytes = (nInputs / 2) * sizeof(X0.data_type());
   return c;
 }
 
@@ -51,7 +48,9 @@ REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(SumInt, SumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp<CPUContext>);
-REGISTER_CPU_OPERATOR(ScatterWeightedSum, ScatterWeightedSumOp<CPUContext>);
+REGISTER_CPU_OPERATOR(
+    ScatterWeightedSum,
+    ScatterWeightedSumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Scatter, ScatterOp<CPUContext>);
 
diff --git a/caffe2/python/operator_test/concat_op_cost_test.py b/caffe2/python/operator_test/concat_op_cost_test.py
index 7dab4d6bd5d1f..996b330be4947 100644
--- a/caffe2/python/operator_test/concat_op_cost_test.py
+++ b/caffe2/python/operator_test/concat_op_cost_test.py
@@ -7,39 +7,33 @@
 
 class TestConcatOpCost(TestCase):
     def test_columnwise_concat(self):
-        def _test_columnwise_concat_for_type(dtype):
-            workspace.ResetWorkspace()
-            workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype))
-            workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=dtype))
-            concat_op = core.CreateOperator(
-                "Concat",
-                ["input_1", "input_2"],
-                ["output", "split_info"],
-            )
-            workspace.RunOperatorOnce(concat_op)
-
-            output = workspace.FetchBlob("output")
-            self.assertTupleEqual(output.shape, (2, 4))
-            np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
+        workspace.ResetWorkspace()
+        workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
+        workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=np.int32))
+        concat_op = core.CreateOperator(
+            "Concat",
+            ["input_1", "input_2"],
+            ["output", "split_info"],
+        )
+        workspace.RunOperatorOnce(concat_op)
 
-            flops, bytes_written, bytes_read = workspace.GetOperatorCost(
-                concat_op, concat_op.input
-            )
+        output = workspace.FetchBlob("output")
+        self.assertTupleEqual(output.shape, (2, 4))
+        np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
 
-            self.assertEqual(flops, 0)
-            self.assertEqual(
-                bytes_read,
-                sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
-            )
-            self.assertEqual(
-                bytes_written,
-                sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
-            )
+        flops, bytes_written, bytes_read = workspace.GetOperatorCost(
+            concat_op, concat_op.input
+        )
 
-        [
-            _test_columnwise_concat_for_type(t)
-            for t in [np.int64, np.float, np.half, np.int8]
-        ]
+        self.assertEqual(flops, 0)
+        self.assertEqual(
+            bytes_read,
+            sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
+        )
+        self.assertEqual(
+            bytes_written,
+            sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
+        )
 
     def test_split_then_concat(self):
         workspace.ResetWorkspace()
diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py
index 1bf7b607e1b7e..afb2065027075 100644
--- a/caffe2/python/workspace_test.py
+++ b/caffe2/python/workspace_test.py
@@ -60,7 +60,7 @@ def testGetOperatorCost(self):
         self.assertTupleEqual(
             op_cost,
             namedtuple("Cost", ["flops", "bytes_written", "bytes_read"])(
-                1152, 256, 4168
+                1152, 256, 2084
             ),
         )
 
diff --git a/caffe2/sgd/adagrad_op.cc b/caffe2/sgd/adagrad_op.cc
index 0b6f604b48cdb..0de50f03e62d5 100644
--- a/caffe2/sgd/adagrad_op.cc
+++ b/caffe2/sgd/adagrad_op.cc
@@ -1,5 +1,4 @@
 #include "adagrad_op.h"
-#include "caffe2/core/types.h"
 
 namespace caffe2 {
 
@@ -24,30 +23,22 @@ static OpSchema::Cost CostInferenceForAdagrad(
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   c.flops = grad_size * 10;
 
-  auto const& moment_element_size_byte =
-      DataTypeToTypeMeta(moment.data_type()).itemsize();
-  auto const& param_element_size_byte =
-      DataTypeToTypeMeta(param.data_type()).itemsize();
-  auto const& grad_element_size_byte =
-      DataTypeToTypeMeta(grad.data_type()).itemsize();
-  auto const& lr_element_size_byte =
-      DataTypeToTypeMeta(lr.data_type()).itemsize();
   uint64_t bytes_written =
-      grad_size * param_element_size_byte + moment_element_size_byte;
+      grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
 
   if (output_size == 3) {
     // also need to output effective learning rate in this case
     // assume it's the same data type as lr
-    bytes_written += grad_size * lr_element_size_byte;
+    bytes_written += grad_size * sizeof(lr.data_type());
   } else if (output_size == 4) {
     // also need to output effective learning rate and updates in this case
     // assume update is the same data type as param
     bytes_written +=
-        grad_size * (lr_element_size_byte + param_element_size_byte);
+        grad_size * (sizeof(lr.data_type()) + sizeof(param.data_type()));
   }
   c.bytes_written = bytes_written;
   c.bytes_read = c.bytes_written +
-      grad_size * (grad_element_size_byte + lr_element_size_byte);
+      grad_size * (sizeof(grad.data_type()) + sizeof(lr.data_type()));
 
   return c;
 }
@@ -111,18 +102,10 @@ static OpSchema::Cost CostInferenceForSparseAdagrad(
   // (optimistically count sqrt as one flop).
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   c.flops = grad_size * 7;
-  auto const& param_element_size_byte =
-      DataTypeToTypeMeta(param.data_type()).itemsize();
-  auto const& moment_element_size_byte =
-      DataTypeToTypeMeta(moment.data_type()).itemsize();
   c.bytes_written =
-      grad_size * (param_element_size_byte + moment_element_size_byte);
-  auto const& grad_element_size_byte =
-      DataTypeToTypeMeta(grad.data_type()).itemsize();
-  auto const& indices_element_size_byte =
-      DataTypeToTypeMeta(indices.data_type()).itemsize();
-  c.bytes_read = c.bytes_written + grad_size * grad_element_size_byte +
-      n * indices_element_size_byte;
+      grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
+  c.bytes_read = c.bytes_written + grad_size * sizeof(grad.data_type()) +
+      n * sizeof(indices.data_type());
 
   return c;
 }
@@ -170,16 +153,6 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
   OpSchema::Cost c;
 
   if (n > 0) {
-    auto const& param_element_size_byte =
-        DataTypeToTypeMeta(param.data_type()).itemsize();
-    auto const& moment_element_size_byte =
-        DataTypeToTypeMeta(moment.data_type()).itemsize();
-    auto const& grad_element_size_byte =
-        DataTypeToTypeMeta(grad.data_type()).itemsize();
-    auto const& indices_element_size_byte =
-        DataTypeToTypeMeta(indices.data_type()).itemsize();
-    auto const& lr_element_size_byte =
-        DataTypeToTypeMeta(lr.data_type()).itemsize();
     auto block_size = grad_size / n;
     if (block_size == 1) {
       // +2: applying weight decay and add to grads
@@ -188,22 +161,22 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
       // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
       c.flops = n * 9;
       c.bytes_written =
-          n * (param_element_size_byte + moment_element_size_byte);
+          n * (sizeof(param.data_type()) + sizeof(moment.data_type()));
       c.bytes_read = c.bytes_written +
           n *
-              (grad_element_size_byte + indices_element_size_byte +
-               lr_element_size_byte);
+              (sizeof(grad.data_type()) + sizeof(indices.data_type()) +
+               sizeof(lr.data_type()));
     } else {
       // 5 per block (not counting index transforms)
       // 8 for each value of a block
       // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
       c.flops = n * (5 + (block_size * 8));
-      c.bytes_written = n * moment_element_size_byte +
-          n * block_size * param_element_size_byte;
+      c.bytes_written =
+          n * sizeof(moment.data_type()) + n * block_size * (param.data_type());
 
-      c.bytes_read = c.bytes_written + n * lr_element_size_byte +
+      c.bytes_read = c.bytes_written + n * (sizeof(lr.data_type())) +
           2 * n * block_size *
-              (grad_element_size_byte + param_element_size_byte);
+              (sizeof(grad.data_type()) + sizeof(param.data_type()));
     }
   }
   return c;

From ad4848565e1d9f4d408c60614f213acb52035181 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Mon, 30 Aug 2021 15:03:15 -0700
Subject: [PATCH 365/530] Enable Half, BFloat16, and Complex dtypes for coo-coo
 sparse matmul [CUDA] (#59980)

Summary:
This PR enables Half, BFloat16, ComplexFloat, and ComplexDouble support for matrix-matrix multiplication of COO sparse matrices.
The change is applied only to CUDA 11+ builds.

`cusparseSpGEMM` also supports `CUDA_C_16F` (complex float16) and `CUDA_C_16BF` (complex bfloat16). PyTorch also supports the complex float16 dtype (`ScalarType::ComplexHalf`), but there is no convenient dispatch, so this dtype is omitted in this PR.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/59980

Reviewed By: ngimel

Differential Revision: D29699456

Pulled By: cpuhrsch

fbshipit-source-id: 407ae53392acb2f92396a62a57cbaeb0fe6e950b
---
 aten/src/ATen/cuda/CUDADataType.h             | 61 +++++++++++++++++++
 .../ATen/native/sparse/cuda/SparseMatMul.cu   | 54 ++++++++++------
 test/test_sparse.py                           | 44 +++++++------
 torch/testing/_internal/common_cuda.py        |  1 +
 torch/utils/hipify/cuda_to_hip_mappings.py    | 31 +++++-----
 5 files changed, 139 insertions(+), 52 deletions(-)
 create mode 100644 aten/src/ATen/cuda/CUDADataType.h

diff --git a/aten/src/ATen/cuda/CUDADataType.h b/aten/src/ATen/cuda/CUDADataType.h
new file mode 100644
index 0000000000000..71c9af9af8aac
--- /dev/null
+++ b/aten/src/ATen/cuda/CUDADataType.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <c10/core/ScalarType.h>
+
+#include <cuda.h>
+#include <library_types.h>
+
+namespace at {
+namespace cuda {
+
+template <typename scalar_t>
+cudaDataType getCudaDataType() {
+  TORCH_INTERNAL_ASSERT(false, "Cannot convert type ", typeid(scalar_t).name(), " to cudaDataType.")
+}
+
+template<> cudaDataType getCudaDataType<at::Half>() {
+  return CUDA_R_16F;
+}
+template<> cudaDataType getCudaDataType<float>() {
+  return CUDA_R_32F;
+}
+template<> cudaDataType getCudaDataType<double>() {
+  return CUDA_R_64F;
+}
+template<> cudaDataType getCudaDataType<c10::complex<c10::Half>>() {
+  return CUDA_C_16F;
+}
+template<> cudaDataType getCudaDataType<c10::complex<float>>() {
+  return CUDA_C_32F;
+}
+template<> cudaDataType getCudaDataType<c10::complex<double>>() {
+  return CUDA_C_64F;
+}
+
+// HIP doesn't define integral types
+#ifndef __HIP_PLATFORM_HCC__
+template<> cudaDataType getCudaDataType<uint8_t>() {
+  return CUDA_R_8U;
+}
+template<> cudaDataType getCudaDataType<int8_t>() {
+  return CUDA_R_8I;
+}
+template<> cudaDataType getCudaDataType<int>() {
+  return CUDA_R_32I;
+}
+#endif
+
+#if !defined(__HIP_PLATFORM_HCC__) && defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+template<> cudaDataType getCudaDataType<int16_t>() {
+  return CUDA_R_16I;
+}
+template<> cudaDataType getCudaDataType<int64_t>() {
+  return CUDA_R_64I;
+}
+template<> cudaDataType getCudaDataType<at::BFloat16>() {
+  return CUDA_R_16BF;
+}
+#endif
+
+} // namespace cuda
+} // namespace at
diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
index d5f31a1980bac..a08c93d1d71bd 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
@@ -16,6 +16,7 @@
 #include <THC/THCThrustAllocator.cuh>
 
 #include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDADataType.h>
 #include <ATen/cuda/CUDAUtils.h>
 #include <cusparse.h>
 #include <ATen/native/sparse/cuda/SparseCUDABlas.h>
@@ -118,14 +119,7 @@ struct csrMatrixRef {
         nnz_{nnz},
         size_{size} {
     #if IS_CUSPARSE11_AVAILABLE()
-      cudaDataType cuda_data_type;
-      if ( std::is_same<float, scalar_t>::value ) {
-        cuda_data_type = CUDA_R_32F;
-      } else if ( std::is_same<double, scalar_t>::value) {
-        cuda_data_type = CUDA_R_64F;
-      } else {
-        TORCH_CHECK(false, "Tensor types must be either float32 or float64");
-      }
+      cudaDataType cuda_data_type = at::cuda::getCudaDataType<scalar_t>();
       TORCH_CUDASPARSE_CHECK(cusparseCreateCsr(
         &description_,
         this->size(0),
@@ -192,8 +186,14 @@ struct CusparseMatrixMultiplyOp {
   cusparseSpGEMMDescr_t spgemmDesc;
 
   CusparseMatrixMultiplyOp() {
-    static_assert(std::is_same<float, scalar_t>::value || std::is_same<double, scalar_t>::value,
-      "cusparse csr sparse-sparse MM only supports data type of float and double.");
+    static_assert(
+      std::is_same<c10::Half, scalar_t>::value ||
+          std::is_same<c10::BFloat16, scalar_t>::value ||
+          std::is_same<float, scalar_t>::value ||
+          std::is_same<double, scalar_t>::value ||
+          std::is_same<c10::complex<float>, scalar_t>::value ||
+          std::is_same<c10::complex<double>, scalar_t>::value,
+      "cusparseSpGEMM only supports data type of half, bfloat16, float, double and complex float, double.");
     // SpGEMM Computation
     TORCH_CUDASPARSE_CHECK(cusparseSpGEMM_createDescr(&spgemmDesc));
   }
@@ -212,14 +212,6 @@ struct CusparseMatrixMultiplyOp {
 
     const int B_num_cols = B.size(1);
 
-    cudaDataType computeType;
-    if ( std::is_same<float, scalar_t>::value ) {
-      computeType = CUDA_R_32F;
-    } else if ( std::is_same<double, scalar_t>::value) {
-      computeType = CUDA_R_64F;
-    } else {
-      TORCH_CHECK(false, "Tensor types must be either float32 or float64");
-    }
     csrOutput out({A.size(0), B.size(1)});
 
     out.csr_pointers_ = at::empty({out.size(0) + 1}, output_indices.options().dtype(kInt));
@@ -252,6 +244,16 @@ struct CusparseMatrixMultiplyOp {
     cusparseSpMatDescr_t matC = C.description_;
     //--------------------------------------------------------------------------
 
+    cudaDataType computeType = at::cuda::getCudaDataType<scalar_t>();
+
+    // If a specific GPU model does not provide native support for a given data type,
+    // the routine returns CUSPARSE_STATUS_ARCH_MISMATCH error
+    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
+    TORCH_CHECK(prop->major >= 5 && !((10*prop->major + prop->minor) < 53 && computeType == CUDA_R_16F),
+        "sparse_mm: CUDA Float16 requires compute capability >= 53 (current: ", prop->major, prop->minor, ")");
+    TORCH_CHECK(!(prop->major < 8 && computeType == CUDA_R_16BF),
+        "sparse_mm: CUDA BFloat16 requires compute capability >= 80 (current: ", prop->major, prop->minor, ")");
+
     // ask bufferSize1 bytes for external memory
     TORCH_CUDASPARSE_CHECK(cusparseSpGEMM_workEstimation(
         handle,
@@ -646,8 +648,14 @@ void sparse_sparse_matmul_cuda_kernel(
     const Tensor& mat1,
     const Tensor& mat2) {
 
-  static_assert(std::is_same<float, scalar_t>::value || std::is_same<double, scalar_t>::value,
-    "sparse_sparse_matmul_cuda_kernel only supports float and double value types");
+  static_assert(
+    std::is_same<c10::Half, scalar_t>::value ||
+        std::is_same<c10::BFloat16, scalar_t>::value ||
+        std::is_same<float, scalar_t>::value ||
+        std::is_same<double, scalar_t>::value ||
+        std::is_same<c10::complex<float>, scalar_t>::value ||
+        std::is_same<c10::complex<double>, scalar_t>::value,
+    "sparse_sparse_matmul_cuda_kernel only supports data type of half, bfloat16, float, double and complex float, double.");
 
   Tensor mat1_indices_ = mat1._indices().contiguous();
   Tensor mat1_values = mat1._values().contiguous();
@@ -775,9 +783,15 @@ Tensor sparse_sparse_matmul_cuda(const Tensor& mat1_, const Tensor& mat2_) {
   auto output = at::native::empty_like(mat1_);
   output.sparse_resize_and_clear_({mat1_.size(0), mat2_.size(1)}, mat1_.sparse_dim(), 0);
 
+#if IS_CUSPARSE11_AVAILABLE()
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, mat1_.scalar_type(), "sparse_matmul", [&] {
+    sparse_sparse_matmul_cuda_kernel<scalar_t>(output, mat1_.coalesce(), mat2_.coalesce());
+  });
+#else
   AT_DISPATCH_FLOATING_TYPES(mat1_.scalar_type(), "sparse_matmul", [&] {
     sparse_sparse_matmul_cuda_kernel<scalar_t>(output, mat1_.coalesce(), mat2_.coalesce());
   });
+#endif
   return output;
 }
 
diff --git a/test/test_sparse.py b/test/test_sparse.py
index 333f29f13138e..aaf045c4b0ea0 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -12,8 +12,12 @@
 from torch.testing._internal.common_cuda import TEST_CUDA, _get_torch_cuda_version
 from numbers import Number
 from typing import Dict, Any
+from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes
+from torch.testing._internal.common_cuda import \
+    (SM53OrLater, SM80OrLater, CUDA11OrLater)
 from torch.testing._internal.common_device_type import \
-    (instantiate_device_type_tests, ops, dtypes, dtypesIfCPU, onlyCPU, onlyCUDA, deviceCountAtLeast)
+    (instantiate_device_type_tests, ops, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride,
+     deviceCountAtLeast)
 from torch.testing._internal.common_methods_invocations import \
     (sparse_unary_ufuncs)
 
@@ -3217,8 +3221,13 @@ def sparse_log(x):
     # TODO: Check after why ROCm's cusparseXcsrgemm2Nnz function doesn't return the same nnz value as CUDA
     @skipIfRocm
     @coalescedonoff
-    @dtypes(torch.double)
-    @dtypesIfCPU(torch.double, torch.cdouble)
+    @dtypes(*get_all_complex_dtypes(),
+            *get_all_fp_dtypes(include_half=False, include_bfloat16=False))
+    @dtypesIfCUDA(*(get_all_complex_dtypes() if CUDA11OrLater else ()),
+                  *get_all_fp_dtypes(
+                      include_half=(CUDA11OrLater and SM53OrLater),
+                      include_bfloat16=(CUDA11OrLater and SM80OrLater)))
+    @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2})
     def test_sparse_matmul(self, device, dtype, coalesced):
         """
         This function test `torch.sparse.mm` when both the mat1 and mat2 are sparse tensors.
@@ -3328,22 +3337,23 @@ def test_sparse_matmul(sparse_dims, nnz, shape_a, shape_b):
             r2 = torch.sparse.mm(a, b)
             self.assertEqual(r1, r2)
 
-            a.requires_grad_(True)
-            b.requires_grad_(True)
+            if dtype in [torch.double, torch.cdouble]:
+                a.requires_grad_(True)
+                b.requires_grad_(True)
 
-            # check autograd support on sparse matmul
-            def fn(D1, D2):
-                return torch.sparse.mm(D1, D2).to_dense()
+                # check autograd support on sparse matmul
+                def fn(D1, D2):
+                    return torch.sparse.mm(D1, D2).to_dense()
 
-            if a.is_cuda:
-                # For cuda, `nondet_tol` is set with `1e-5`
-                # This is because cuSparse sometimes returns approximate zero values like `~e-323`
-                # TODO: Check this cuSparse issue.
-                # This happens when you do chain multiplication `torch.sparse.mm` operations
-                gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5)
-            else:
-                gradcheck(fn, (a, b), check_sparse_nnz=True)
-            grad_with_custom_sparsity_pattern_test_helper(sparse_dims, nnz, shape_a, shape_b)
+                if a.is_cuda:
+                    # For cuda, `nondet_tol` is set with `1e-5`
+                    # This is because cuSparse sometimes returns approximate zero values like `~e-323`
+                    # TODO: Check this cuSparse issue.
+                    # This happens when you do chain multiplication `torch.sparse.mm` operations
+                    gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5)
+                else:
+                    gradcheck(fn, (a, b), check_sparse_nnz=True)
+                grad_with_custom_sparsity_pattern_test_helper(sparse_dims, nnz, shape_a, shape_b)
 
         def test_error_cases():
             def fn(sparse_dims, nnz, shape_a, shape_b):
diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py
index 5d0849bb8407d..36e7f8a178577 100644
--- a/torch/testing/_internal/common_cuda.py
+++ b/torch/testing/_internal/common_cuda.py
@@ -20,6 +20,7 @@
 CUDA9 = torch.version.cuda and torch.version.cuda.startswith('9.')
 SM53OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (5, 3)
 SM60OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (6, 0)
+SM80OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0)
 
 TEST_MAGMA = TEST_CUDA
 if TEST_CUDA:
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 558acc24ef3c8..6b60516efe322 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -554,6 +554,7 @@
         ),
         ("device_functions.h", ("hip/device_functions.h", CONV_INCLUDE, API_RUNTIME)),
         ("driver_types.h", ("hip/driver_types.h", CONV_INCLUDE, API_RUNTIME)),
+        ("library_types.h", ("hip/library_types.h", CONV_INCLUDE, API_RUNTIME)),
         ("cuComplex.h", ("hip/hip_complex.h", CONV_INCLUDE, API_RUNTIME)),
         ("cuda_fp16.h", ("hip/hip_fp16.h", CONV_INCLUDE, API_RUNTIME)),
         (
@@ -3786,21 +3787,21 @@
             ),
         ),
         ("cudaDataType_t", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("cudaDataType", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_16F", ("hipR16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_16F", ("hipC16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_32F", ("hipR32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_32F", ("hipC32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_64F", ("hipR64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_64F", ("hipC64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_8I", ("hipR8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_8I", ("hipC8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_8U", ("hipR8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_8U", ("hipC8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_32I", ("hipR32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_32I", ("hipC32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_32U", ("hipR32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_32U", ("hipC32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("cudaDataType", ("hipDataType", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_16F", ("HIP_R_16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_16F", ("HIP_C_16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_32F", ("HIP_R_32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_32F", ("HIP_C_32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_64F", ("HIP_R_64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_64F", ("HIP_C_64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_8I", ("HIP_R_8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_8I", ("HIP_C_8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_8U", ("HIP_R_8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_8U", ("HIP_C_8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_32I", ("HIP_R_32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_32I", ("HIP_C_32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_32U", ("HIP_R_32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_32U", ("HIP_C_32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
         (
             "MAJOR_VERSION",
             ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED),

From a7ae73a2380c3e45394998d2d1d9bceb14f2ee55 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Mon, 30 Aug 2021 15:03:40 -0700
Subject: [PATCH 366/530] BUG Fixes regression for nllloss gradcheck (#64203)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/64163

This PR includes the fix and the opinfo from https://github.com/pytorch/pytorch/pull/63854/ for non-regression testing.

cc albanD mruberry jbschlosser

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64203

Reviewed By: albanD

Differential Revision: D30647522

Pulled By: jbschlosser

fbshipit-source-id: 2974d299763505908fa93532aca2bd5d5b71f2e9
---
 aten/src/ATen/native/cuda/Loss.cu             | 10 ++--
 .../_internal/common_methods_invocations.py   | 47 ++++++++++++++++++-
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu
index ac9c3c0d8130f..2087f19dd3486 100644
--- a/aten/src/ATen/native/cuda/Loss.cu
+++ b/aten/src/ATen/native/cuda/Loss.cu
@@ -1,4 +1,5 @@
 #include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/Dispatch.h>
 #include <ATen/cuda/CUDAApplyUtils.cuh>
@@ -207,7 +208,7 @@ __global__ void nll_loss_forward_reduce_cuda_kernel_1d(
     bool size_average,
     int n_classes,
     int64_t ignore_index) {
-  CUDA_KERNEL_ASSERT(threadIdx.x == 0 && threadIdx.y == 0 & threadIdx.z == 0);
+  CUDA_KERNEL_ASSERT(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);
 
   int t = static_cast<int>(*target);
   if (t != static_cast<int>(ignore_index)) {
@@ -263,7 +264,7 @@ __global__ void nll_loss_forward_reduce_cuda_kernel_2d(
     *total_weight = static_cast<scalar_t>(total_weight_acc);
     if (size_average && nframe == 0) {
       // Mean reduction on empty tensors produces NaN
-      *output = std::numeric_limits<double>::quiet_NaN();
+      *output = std::numeric_limits<scalar_t>::quiet_NaN();
     } else if (size_average && total_weight_acc != 0) {
       *output = static_cast<scalar_t>(output_acc / total_weight_acc);
     } else {
@@ -286,7 +287,7 @@ void nll_loss_forward_out_cuda_template(
 
   auto weight_ = weight.defined() ? weight.contiguous() : weight;
 
-  if (reduction == Reduction::None & n_dims == 2) {
+  if (reduction == Reduction::None && n_dims == 2) {
     output.resize_({batch_size});
     if (batch_size == 0) {
       // This guards from unnecessary operations and launching CUDA kernel with
@@ -365,7 +366,8 @@ void nll_loss_forward_out_cuda_template(
               target.scalar_type(),
               "nll_loss_forward_reduce_cuda_kernel_2d_index",
               [&] {
-                nll_loss_forward_reduce_cuda_kernel_2d<scalar_t, float, index_t>
+                using accscalar_t = at::acc_type<scalar_t, /*is_cuda*/true>;
+                nll_loss_forward_reduce_cuda_kernel_2d<scalar_t, accscalar_t, index_t>
                     <<<1,
                        NLL_LOSS_THREADS,
                        0,
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 1349a29e9d7fe..52e8d73c6200e 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -13,7 +13,7 @@
 from torch._six import inf
 import collections.abc
 
-from typing import Any, Callable, List, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, Dict
 
 from torch.testing import \
     (make_non_contiguous, floating_types, floating_types_and, complex_types,
@@ -5221,6 +5221,36 @@ def sample_inputs_grid_sample(op_info, device, dtype, requires_grad, **kwargs):
 
     return sample_inputs
 
+def sample_inputs_nll_loss(op_info, device, dtype, requires_grad, **kwargs):
+    batch_size, num_classes = shape = (2, 3)
+
+    input_shape_and_kwargs: List[Tuple[Tuple[int, ...], Dict[str, Any]]] = [
+        ((*shape, 1), dict()),
+        ((*shape, 1, 2), dict()),
+        ((*shape, 1, 2, 3), dict()),
+        (shape, dict(weight=make_tensor((num_classes,), device=device, dtype=dtype).abs())),
+        (shape, dict(ignore_index=num_classes // 2)),
+        (shape, dict(reduction="sum")),
+        (shape, dict(reduction="mean")),
+    ]
+
+    sample_inputs = []
+    for input_shape, kwargs in input_shape_and_kwargs:
+        input = make_tensor(input_shape, device=device, dtype=dtype, requires_grad=requires_grad)
+
+        target = make_tensor(
+            (batch_size, *input_shape[2:]),
+            low=0,
+            high=num_classes,
+            device=device,
+            dtype=torch.long,
+            requires_grad=requires_grad
+        )
+
+        sample_inputs.append(SampleInput(input, args=(target,), kwargs=kwargs))
+
+    return sample_inputs
+
 foreach_unary_op_db: List[OpInfo] = [
     ForeachFuncInfo('exp'),
     ForeachFuncInfo('acos'),
@@ -9044,6 +9074,21 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
             SkipInfo('TestReductions', 'test_dim_none_keepdim'),
         ),
     ),
+    OpInfo(
+        "nn.functional.nll_loss",
+        ref=_NOTHING,
+        dtypesIfCPU=floating_types_and(torch.bfloat16),
+        dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
+        supports_out=False,
+        sample_inputs_func=sample_inputs_nll_loss,
+        skips=(
+            SkipInfo(
+                "TestJit",
+                "test_variant_consistency_jit",
+                dtypes=(torch.float32,),
+            ),
+        ),
+    ),
 ]
 
 # Common operator groupings

From 5401159b8f8988c7dc7489d44e71192bb679cf85 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Mon, 30 Aug 2021 15:58:50 -0700
Subject: [PATCH 367/530] OpInfo for nn.functional.interpolate (#61956)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61956

Each mode goes through a different implementation so they are listed as
different variants.

Test Plan: - run tests

Reviewed By: malfet

Differential Revision: D30013751

Pulled By: zou3519

fbshipit-source-id: 4253b40b55667d7486ef2d98b441c13d807ab292
---
 .../_internal/common_methods_invocations.py   | 114 ++++++++++++++++++
 1 file changed, 114 insertions(+)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 52e8d73c6200e..04db52b2e607b 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -2535,6 +2535,48 @@ def sample_inputs_hardswish(self, device, dtype, requires_grad):
                requires_grad=requires_grad, low=-5, high=5)) for _ in range(1, N)]
     return tensors
 
+def sample_inputs_interpolate(mode, self, device, dtype, requires_grad):
+    N, C = 2, 3
+    D = 4
+    S = 3
+    L = 5
+
+    align_corners_options: Tuple[Any, ...] = (None,)
+    if mode in ('linear', 'bilinear', 'bicubic', 'trilinear'):
+        align_corners_options = (True, False, None)
+    ranks_for_mode = {
+        'nearest': [1, 2, 3],
+        'linear': [1],
+        'bilinear': [2],
+        'bicubic': [2],
+        'trilinear': [3],
+        'area': [1, 2, 3]
+    }
+
+    def shape(size, rank, with_batch_channel=True):
+        if with_batch_channel:
+            return tuple([N, C] + ([size] * rank))
+        return tuple([size] * rank)
+
+    make_arg = partial(make_tensor, device=device, dtype=dtype,
+                       requires_grad=requires_grad, low=-1, high=1)
+
+    sample_inputs = []
+    for align_corners in align_corners_options:
+        for rank in ranks_for_mode[mode]:
+            sample_inputs.extend([
+                SampleInput(make_arg(shape(D, rank)),
+                            args=(shape(S, rank, False), None, mode, align_corners)),
+                SampleInput(make_arg(shape(D, rank)),
+                            args=(shape(L, rank, False), None, mode, align_corners)),
+                SampleInput(make_arg(shape(D, rank)),
+                            args=(None, 1.7, mode, align_corners)),
+                SampleInput(make_arg(shape(D, rank)),
+                            args=(None, 0.6, mode, align_corners)),
+            ])
+
+    return sample_inputs
+
 def sample_inputs_gelu(self, device, dtype, requires_grad):
     N = 5
     tensors = [SampleInput(make_tensor((N * 2, N * 2), device=device, dtype=dtype,
@@ -7227,6 +7269,78 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            dtypes=floating_types_and(torch.half),
            sample_inputs_func=sample_inputs_nn_unfold,
            skips=(
+               # JIT alias info internal asserts here
+               SkipInfo('TestJit', 'test_variant_consistency_jit'),
+           ),
+           supports_out=False),
+    OpInfo('nn.functional.interpolate',
+           aten_name="interpolate",
+           variant_test_name='nearest',
+           supports_autograd=True,
+           dtypesIfCPU=floating_types_and(torch.uint8),
+           dtypesIfCUDA=floating_types_and(torch.half, torch.uint8),
+           sample_inputs_func=partial(sample_inputs_interpolate, 'nearest'),
+           skips=(
+               # JIT alias info internal asserts here
+               SkipInfo('TestJit', 'test_variant_consistency_jit'),
+           ),
+           supports_out=False),
+    OpInfo('nn.functional.interpolate',
+           aten_name="interpolate",
+           variant_test_name='linear',
+           supports_autograd=True,
+           dtypesIfCUDA=floating_types_and(torch.half),
+           sample_inputs_func=partial(sample_inputs_interpolate, 'linear'),
+           skips=(
+               # JIT alias info internal asserts here
+               SkipInfo('TestJit', 'test_variant_consistency_jit'),
+           ),
+           supports_out=False),
+    OpInfo('nn.functional.interpolate',
+           aten_name="interpolate",
+           variant_test_name='bilinear',
+           supports_autograd=True,
+           dtypesIfCUDA=floating_types_and(torch.half),
+           gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
+           sample_inputs_func=partial(sample_inputs_interpolate, 'bilinear'),
+           skips=(
+               # JIT alias info internal asserts here
+               SkipInfo('TestJit', 'test_variant_consistency_jit'),
+           ),
+           supports_out=False),
+    OpInfo('nn.functional.interpolate',
+           aten_name="interpolate",
+           variant_test_name='bicubic',
+           supports_autograd=True,
+           dtypesIfCUDA=floating_types_and(torch.half),
+           sample_inputs_func=partial(sample_inputs_interpolate, 'bicubic'),
+           gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
+           skips=(
+               # JIT alias info internal asserts here
+               SkipInfo('TestJit', 'test_variant_consistency_jit'),
+           ),
+           supports_out=False),
+    OpInfo('nn.functional.interpolate',
+           aten_name="interpolate",
+           variant_test_name='trilinear',
+           supports_autograd=True,
+           dtypesIfCUDA=floating_types_and(torch.half),
+           gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
+           sample_inputs_func=partial(sample_inputs_interpolate, 'trilinear'),
+           skips=(
+               # JIT alias info internal asserts here
+               SkipInfo('TestJit', 'test_variant_consistency_jit'),
+           ),
+           supports_out=False),
+    OpInfo('nn.functional.interpolate',
+           aten_name="interpolate",
+           variant_test_name='area',
+           supports_autograd=True,
+           dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16),
+           sample_inputs_func=partial(sample_inputs_interpolate, 'area'),
+           gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
+           skips=(
+               # JIT alias info internal asserts here
                SkipInfo('TestJit', 'test_variant_consistency_jit'),
            ),
            supports_out=False),

From 1f16c22dc8251f01627ee73ad1ef69bd18e51447 Mon Sep 17 00:00:00 2001
From: Harut Movsisyan <harutm@fb.com>
Date: Mon, 30 Aug 2021 16:16:45 -0700
Subject: [PATCH 368/530] [Static Runtime] Implement aten::cumsum out variant
 (#64159)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64159

Test Plan:
Confirm out variant is called for both versions:

```
> buck run //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1
```

Reviewed By: mikeiovine

Differential Revision: D30622819

fbshipit-source-id: a2c8c7f969dae5f507718fb3d513e1fb4f026736
---
 benchmarks/static_runtime/test_scripts.h      | 10 +++++++++
 .../static_runtime/test_static_runtime.cc     | 22 +++++++++++++++++++
 torch/csrc/jit/runtime/static/ops.cpp         | 22 +++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 7fdb113c4ed45..e26437fe4a6f9 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -801,3 +801,13 @@ const std::string cat_script = R"IR(
       %ret: Tensor = aten::cat(%ten_list2, %dim)
       return (%ret)
 )IR";
+
+const auto cumsum_script = R"JIT(
+   def forward(self, a: Tensor, dim: int):
+      return torch.cumsum(a, dim).clone()
+)JIT";
+
+const auto cumsum_script_dtype = R"JIT(
+   def forward(self, a: Tensor, dim: int, dtype: int):
+      return torch.cumsum(a, dim, dtype=dtype).clone()
+)JIT";
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index b7201baa1e182..aa5cd35e38e56 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1311,3 +1311,25 @@ TEST(StaticRuntime, IndividualOps_Cat) {
   std::vector<IValue> args1{c, d, 1};
   testStaticRuntime(cat_script, args0, args1);
 }
+
+
+TEST(StaticRuntime, IndividualOps_Cumsum) {
+  auto a = at::randn({2, 3});
+  std::vector<IValue> args0{a, 0};
+  testStaticRuntime(cumsum_script, args0);
+
+  auto b = at::randn({4, 3});
+  std::vector<IValue> args1{b, 1};
+  testStaticRuntime(cumsum_script, args0, args1);
+}
+
+TEST(StaticRuntime, IndividualOps_CumsumDtype) {
+  auto a = at::randn({1, 2});
+  auto dtype = at::ScalarType::Float;
+  std::vector<IValue> args0{a, 0, dtype};
+  testStaticRuntime(cumsum_script_dtype, args0);
+
+  auto b = at::randn({3, 4});
+  std::vector<IValue> args1{b, 1, dtype};
+  testStaticRuntime(cumsum_script_dtype, args0, args1);
+}
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index cf91f33a28c26..a73872b540258 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1733,6 +1733,28 @@ REGISTER_OPERATOR_FUNCTOR(aten::cat, aten_cat, [](Node* n) -> SROperator {
   };
 });
 
+REGISTER_OPERATOR_FUNCTOR(aten::cumsum, aten_cumsum, [](Node* n) -> SROperator {
+  if (!n->matches(torch::schema(
+          "aten::cumsum(Tensor self, int dim, ScalarType? dtype=None) -> Tensor"))) {
+    LogAndDumpSchema(n);
+    return nullptr;
+  }
+  return [](ProcessedNode* p_node) {
+    const auto& input = p_node->Input(0).toTensor();
+    const auto dim = p_node->Input(1).toInt();
+    const auto dtype = p_node->Input(2).toOptional<c10::ScalarType>();
+
+    if (p_node->Output(0).isNone()) {
+      p_node->Output(0) = at::cpu::cumsum(input, dim, dtype);
+      return;
+    }
+
+    auto& output = p_node->Output(0).toTensor();
+    fastResizeToZero(output);
+    at::cpu::cumsum_out(output, input, dim, dtype);
+  };
+});
+
 namespace {
 
 void check_cat_no_zero_dim(const std::vector<at::Tensor>& tensors) {

From ebc0aacf83a0446ed798a96059c05da815c73d3d Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Mon, 30 Aug 2021 18:36:33 -0700
Subject: [PATCH 369/530] [nnc] Fix half2float conversion and re-enable float16
 (#64199)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64199

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30643865

Pulled By: bertmaher

fbshipit-source-id: 9de6adca53bd08839328cbaf6364f7de9550264b
---
 test/test_jit_fuser_te.py                  | 44 ++++++++++++++++------
 test/test_tensorexpr.py                    |  1 -
 torch/csrc/jit/passes/tensorexpr_fuser.cpp |  2 +-
 torch/csrc/jit/tensorexpr/half_support.h   | 40 ++++++++++++++++++++
 torch/csrc/jit/tensorexpr/ir_verifier.cpp  | 12 ++++++
 5 files changed, 86 insertions(+), 13 deletions(-)

diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index 6d2432aa151f8..918cc702d83d6 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -94,8 +94,7 @@ def setUp(self):
             torch.bool,
         ]
         self.fp_dtypes = [
-            # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
-            # torch.float16,
+            torch.float16,
             torch.float32,
             torch.float64,
         ]
@@ -1130,8 +1129,7 @@ def foo(x):
         dtypes = [
             torch.bool,
             torch.int,
-            # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
-            # torch.float16,
+            torch.float16,
             torch.float32,
             torch.float64,
         ]
@@ -1146,6 +1144,9 @@ def forward(self, x):
 
         bad_dtypes = []
         for dtype, output_dtype, device, size in product(dtypes, dtypes, self.devices, sizes):
+            # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
+            if dtype == torch.float16 and device == "cpu":
+                continue
             if dtype == output_dtype:
                 continue
 
@@ -1201,18 +1202,16 @@ def test_isnan(self):
             torch.int16,
             torch.int32,
             torch.int64,
-            # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
-            # torch.float16,
+            torch.float16,
             torch.float32,
             torch.float64,
             torch.bool,
         ]
 
         for inp, device, dtype in product(inputs, self.devices, dtypes):
-            # TODO
-            if dtype == torch.float16 and not LLVM_ENABLED:
+            # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
+            if dtype == torch.float16 and device == "cpu":
                 continue
-
             inp = inp.to(device=device, dtype=dtype)
             try:
                 f = torch.jit.trace(lambda x: x.isnan(), (inp,))
@@ -1272,6 +1271,9 @@ def apply(fn):
         gpu_only = {torch.erf, torch.erfc}
         sizes = [(1,), (2,), (4, 4)]
         for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes):
+            # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
+            if dtype == torch.float16 and device == "cpu":
+                continue
             if op in gpu_only and device == "cpu":
                 continue
             try:
@@ -1323,6 +1325,8 @@ def apply(fn):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, binary_ops, devices):
+            if dtype == torch.float16 and device == "cpu":
+                continue
             try:
                 x = self.data_for(dtype, device)
                 y = self.data_for(dtype, device)
@@ -1373,6 +1377,8 @@ def fn(x, y):
                                      "[[10, 3, 4], [4, 5]]",
                                      ]
         for dtype, size, device in product(self.dtypes, sizes, devices):
+            if dtype == torch.float16 and device == "cpu":
+                continue
             try:
                 size_x, size_y = size
                 x = self.data_for(dtype, device, size=size_x)
@@ -1417,6 +1423,8 @@ def apply_with_scalar(fn, scalar):
         # only using  scalar values relevant to particular ops
         scalars = [1.5, 3, 0, -2.0, -1]
         for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars):
+            if dtype == torch.float16 and device == "cpu":
+                continue
             try:
                 x = self.data_for(dtype, device)
                 fn = apply_with_scalar(op, scalar)
@@ -1449,6 +1457,8 @@ def apply_with_scalar(fn, scalar):
         # only using  scalar values relevant to particular ops
         scalars = [1.5, 3, -2.0, -1]  # skip 0
         for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars):
+            if dtype == torch.float16 and device == "cpu":
+                continue
             try:
                 x = self.data_for(dtype, device)
                 fn = apply_with_scalar(op, scalar)
@@ -1484,6 +1494,8 @@ def apply_with_scalar(fn, scalar):
         # only using  scalar values relevant to particular ops
         scalars = [1.5, 3, 0, -2.0, -1]
         for dtype, op, device, scalar in product(dtypes, binary_ops, self.devices, scalars):
+            if dtype == torch.float16 and device == "cpu":
+                continue
             try:
                 x = self.data_for(dtype, device)
                 fn = apply_with_scalar(op, scalar)
@@ -1512,6 +1524,8 @@ def apply(fn):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, ternary_ops, devices):
+            if dtype == torch.float16 and device == "cpu":
+                continue
             try:
                 x = self.data_for(dtype, device)
                 y = self.data_for(dtype, device)
@@ -1541,6 +1555,8 @@ def apply(fn):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, ternary_ops, devices):
+            if dtype == torch.float16 and device == "cpu":
+                continue
             try:
                 x = self.data_for(dtype, device, size=[5, 3, 128, 128])
                 y = self.data_for(dtype, device, size=[3])
@@ -1572,6 +1588,8 @@ def apply(fn):
             torch.cat,
         ]
         for dtype, op, device in product(self.dtypes, list_ops, devices):
+            if dtype == torch.float16 and device == "cpu":
+                continue
             try:
                 x = self.data_for(dtype, device, size=[5, 4, 1, 7])
                 y = self.data_for(dtype, device, size=[5, 4, 1, 7])
@@ -1603,6 +1621,8 @@ def apply(fn):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, ops, devices):
+            if dtype == torch.float16 and device == "cpu":
+                continue
             try:
                 cond = self.data_for(torch.bool, device)
                 x = self.data_for(dtype, device)
@@ -1768,7 +1788,10 @@ def test_type_as_cat(self):
         with inline_fusion_groups():
             def eager(x, y):
                 return torch.cat((x, y.type_as(x)), dim=1)
-            for dtype1, dtype2 in product(self.dtypes, self.dtypes):
+            dtypes = self.dtypes.copy()
+            # CPU fuser doesn't support float16.
+            dtypes.remove(torch.float16)
+            for dtype1, dtype2 in product(dtypes, dtypes):
                 x = torch.randint(2, (1, 13,)).to(dtype1)
                 zero = torch.tensor([[0]]).to(dtype2)
                 one = torch.tensor([[1]]).to(dtype2)
@@ -1936,7 +1959,6 @@ def bn_neither(i, x):
             for fn in [bn, bn_no_weight, bn_no_bias, bn_neither]:
                 test(fn, (i, x))
 
-
 works_list = [
     '__radd__',
     '__rdiv__',
diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py
index 47c7e689aa6a4..366c262ad7c1d 100644
--- a/test/test_tensorexpr.py
+++ b/test/test_tensorexpr.py
@@ -1222,7 +1222,6 @@ def bias_gelu(bias, y):
             x = warmup_and_run_forward(traced, a, b)
             self.assertLastGraphAllFused()
 
-    @unittest.skip("float16 is not supported yet.")
     def test_half_bn_relu(self):
         devices = ["cuda"] if torch.cuda.is_available() else []
 
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index 1d5128c7e71e2..a3e37072a032d 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -966,7 +966,7 @@ class TensorExprFuser {
         // but on top of that Float16 has a few kinks on LLVM.  Thus, on CPU we
         // additionally disable it until we either move to a more stable version
         // or find workarounds.
-        if (*st == c10::ScalarType::Half) {
+        if (*st == c10::ScalarType::Half && *device == c10::kCPU) {
           return false;
         }
 
diff --git a/torch/csrc/jit/tensorexpr/half_support.h b/torch/csrc/jit/tensorexpr/half_support.h
index eaf74d3c79d82..674af8a764928 100644
--- a/torch/csrc/jit/tensorexpr/half_support.h
+++ b/torch/csrc/jit/tensorexpr/half_support.h
@@ -128,6 +128,46 @@ class HalfRewriter : public IRMutator {
     return v;
   }
 
+  template <typename T>
+  ExprPtr mutateArithmetic(T v) {
+    IRMutator::mutate(v);
+    if (v->dtype().scalar_type() == c10::kHalf) {
+      v->set_dtype(v->dtype().cloneWithScalarType(c10::kFloat));
+    }
+    return v;
+  }
+
+  ExprPtr mutate(AddPtr v) override {
+    return mutateArithmetic(v);
+  }
+  ExprPtr mutate(SubPtr v) override {
+    return mutateArithmetic(v);
+  }
+  ExprPtr mutate(MulPtr v) override {
+    return mutateArithmetic(v);
+  }
+  ExprPtr mutate(DivPtr v) override {
+    return mutateArithmetic(v);
+  }
+  ExprPtr mutate(MaxPtr v) override {
+    return mutateArithmetic(v);
+  }
+  ExprPtr mutate(MinPtr v) override {
+    return mutateArithmetic(v);
+  }
+  ExprPtr mutate(CompareSelectPtr v) override {
+    return mutateArithmetic(v);
+  }
+  ExprPtr mutate(BroadcastPtr v) override {
+    return mutateArithmetic(v);
+  }
+  ExprPtr mutate(IfThenElsePtr v) override {
+    return mutateArithmetic(v);
+  }
+  ExprPtr mutate(IntrinsicsPtr v) override {
+    return mutateArithmetic(v);
+  }
+
  private:
   std::unordered_set<ExprPtr> inserted_half_casts_;
   std::unordered_map<VarPtr, VarPtr> var_map;
diff --git a/torch/csrc/jit/tensorexpr/ir_verifier.cpp b/torch/csrc/jit/tensorexpr/ir_verifier.cpp
index f7adbdee93992..f31a935291c33 100644
--- a/torch/csrc/jit/tensorexpr/ir_verifier.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_verifier.cpp
@@ -119,7 +119,19 @@ void IRVerifier::visit(IfThenElsePtr v) {
 }
 
 void IRVerifier::visit(IntrinsicsPtr v) {
+  if (v->op_type() == kIsNan) {
+    if (v->dtype().scalar_type() != c10::kInt) {
+      throw malformed_ir("bad dtype in intrinsic arg");
+    }
+    IRVisitor::visit(v);
+    return;
+  }
   // TODO: add a check for OpArgCount and op_type
+  for (auto const& param : v->params()) {
+    if (param->dtype() != v->dtype()) {
+      throw malformed_ir("bad dtype in intrinsic arg");
+    }
+  }
   IRVisitor::visit(v);
 }
 

From 4bd03b02424d93b72f15e28c542ede13f88ea929 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Mon, 30 Aug 2021 18:39:50 -0700
Subject: [PATCH 370/530] Add python mode (#63496)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63496

This PR adds a (private) enable_python_mode context manager.
(see torch/utils/_python_dispatch.py).
enable_python_mode accepts the type of a __torch_dispatch__ object
as its argument. Whenever an operator gets called inside of the
context manager, it dispatches to the __torch_dispatch__ of
the passed-in type.

Example usage:
```
with enable_python_mode(LoggingTensor):
    z = torch.empty([])
    assert isinstance(z, LoggingTensor)
```

There are quite a few changes that were made to support this.

First, we added TorchDispatchTypeObject, a C++ struct that represents the
type of a `__torch_dispatch__` object (e.g. LoggingTensor).
It holds both the PyObject* representing the class and a PyInterpreter*
so we know which Python interpreter it came from.

Next, we updated the concrete_dispatch_fn in python_variable.cpp to accept
a `const std::shared_ptr<TorchDispatchTypeObject>&` argument. When this
is null, dispatching happens as usual. When it is non-null, we prepend
the TorchDispatchTypeObject's PyObject* to the overloaded args list so that
it is considered first for dispatch.

To get that to work, we changed how `handle_torch_dispatch_no_python_arg_parser`
works. The "overloaded args list" previously only consisted of Tensor PyObjects,
but now it can have types in addition to Tensors!
- We renamed `append_overloaded_arg` to `append_overloaded_arg`
- We added a new `append_overloaded_type` that appends a type to
overloaded_args
- We added special handling in `handle_torch_dispatch_no_python_arg_parser`
and `append_overloaded_arg` to handle types in addition to Tensors.

Then, there is PythonMode and PythonModeTLS.
- We reuse the DispatchKey::Python dispatch key as a mode key
- We use PythonMode::enter and PythonMode::exit to enable/disable
DispatchKey::Python and set the PythonModeTLS.
- PythonModeTLS stores a TorchDispatchTypeObject as metadata.
- PythonMode is in libtorch_python, and PythonModeTLS is in ATen.
This split is due to the libtorch_python library boundary (because we need
to save TLS in ATen/ThreadLocalState)
- We modify the PythonFallbackKernel to look up
the relevant TorchDispatchTypeObject (if Python Mode is active) and
dispatch using it.

There are two more miscellaneous changes:
- internal_new_from_data (torch/csrc/utils/tensor_new.cpp) gets an
exclude guard. enable_python_mode currently does not handle
torch.tensor and the exclude guard is to prevent a bug.

Future:
- This PR does not allow for the nesting of Python modes. In the future we
should be able to enable this with a more sane no_dispatch API and by changing
the TLS to a stack. For now I did not need this for CompositeImplicitAutograd testing.

Test Plan: - new tests

Reviewed By: malfet, albanD

Differential Revision: D30543236

Pulled By: zou3519

fbshipit-source-id: ef5444d96a5a957d1657b7e37dce80f9a497d452
---
 aten/src/ATen/PythonModeTLS.cpp             | 26 +++++++
 aten/src/ATen/PythonModeTLS.h               | 17 +++++
 aten/src/ATen/ThreadLocalState.cpp          |  3 +
 aten/src/ATen/ThreadLocalState.h            |  3 +
 aten/src/ATen/core/PythonFallbackKernel.cpp | 13 +++-
 c10/core/TensorImpl.cpp                     | 20 ++++-
 c10/core/TensorImpl.h                       | 35 ++++++++-
 test/run_test.py                            |  1 +
 test/test_python_dispatch.py                | 81 ++++++++++++++++++++-
 tools/build_variables.bzl                   |  2 +
 torch/_C/__init__.pyi.in                    |  2 +
 torch/csrc/autograd/init.cpp                | 17 +++++
 torch/csrc/autograd/python_mode.cpp         | 27 +++++++
 torch/csrc/autograd/python_mode.h           | 17 +++++
 torch/csrc/autograd/python_variable.cpp     | 38 ++++++++--
 torch/csrc/utils/python_arg_parser.cpp      | 39 ++++++++--
 torch/csrc/utils/python_arg_parser.h        | 11 ++-
 torch/csrc/utils/tensor_new.cpp             |  1 +
 torch/utils/_python_dispatch.py             | 34 +++++++++
 19 files changed, 366 insertions(+), 21 deletions(-)
 create mode 100644 aten/src/ATen/PythonModeTLS.cpp
 create mode 100644 aten/src/ATen/PythonModeTLS.h
 create mode 100644 torch/csrc/autograd/python_mode.cpp
 create mode 100644 torch/csrc/autograd/python_mode.h
 create mode 100644 torch/utils/_python_dispatch.py

diff --git a/aten/src/ATen/PythonModeTLS.cpp b/aten/src/ATen/PythonModeTLS.cpp
new file mode 100644
index 0000000000000..b53043ca84147
--- /dev/null
+++ b/aten/src/ATen/PythonModeTLS.cpp
@@ -0,0 +1,26 @@
+#include <ATen/PythonModeTLS.h>
+
+namespace at { namespace impl {
+
+thread_local std::shared_ptr<TorchDispatchTypeObject> pythonModeState;
+
+void PythonModeTLS::set_state(const std::shared_ptr<TorchDispatchTypeObject>& state) {
+  pythonModeState = state;
+  if (state) {
+    c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true);
+  } else {
+    PythonModeTLS::reset_state();
+  }
+}
+
+const std::shared_ptr<TorchDispatchTypeObject>& PythonModeTLS::get_state() {
+  return pythonModeState;
+}
+
+void PythonModeTLS::reset_state() {
+  pythonModeState.reset((TorchDispatchTypeObject*)nullptr);
+  c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false);
+}
+
+} // namespace impl
+} // namespace at
diff --git a/aten/src/ATen/PythonModeTLS.h b/aten/src/ATen/PythonModeTLS.h
new file mode 100644
index 0000000000000..be52b182c659b
--- /dev/null
+++ b/aten/src/ATen/PythonModeTLS.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <torch/library.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+
+namespace at {
+namespace impl {
+
+struct TORCH_API PythonModeTLS {
+  static void set_state(const std::shared_ptr<TorchDispatchTypeObject>& state);
+  static const std::shared_ptr<TorchDispatchTypeObject>& get_state();
+  static void reset_state();
+};
+
+} // namespace impl
+} // namespace at
diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp
index 98c2519e045ce..19cfa89967ccb 100644
--- a/aten/src/ATen/ThreadLocalState.cpp
+++ b/aten/src/ATen/ThreadLocalState.cpp
@@ -17,6 +17,7 @@ ThreadLocalState::ThreadLocalState()
   saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks();
 
   bumped_record_all_functions_ = at::checkRecordAllFunctions();
+  python_mode_state_ = at::impl::PythonModeTLS::get_state();
 }
 
 void ThreadLocalState::set_grad_mode(bool enabled) {
@@ -30,6 +31,8 @@ void ThreadLocalState::setThreadLocalState(
   // restore the dispatch key set TLS at the same time.
   c10::AutogradState::set_tls_state(state.autograd_tls_);
 
+  at::impl::PythonModeTLS::set_state(state.python_mode_state_);
+
   at::set_record_function_tls_(state.rf_tls_);
 
   SavedTensorDefaultHooks::set_hooks(
diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h
index 41146912819b4..c99ca6158ffa5 100644
--- a/aten/src/ATen/ThreadLocalState.h
+++ b/aten/src/ATen/ThreadLocalState.h
@@ -6,6 +6,7 @@
 #include <c10/util/ThreadLocalDebugInfo.h>
 
 #include <ATen/record_function.h>
+#include <ATen/PythonModeTLS.h>
 
 namespace at {
 
@@ -40,6 +41,8 @@ class TORCH_API ThreadLocalState {
   // TLS for AutogradModes
   AutogradState autograd_tls_;
 
+  std::shared_ptr<TorchDispatchTypeObject> python_mode_state_;
+
   // TLS for saved tensors default hooks
   std::pair<PyObject*, PyObject*> saved_tensors_default_hooks_;
 
diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp
index 276eabfe458c0..8e77d0952ec75 100644
--- a/aten/src/ATen/core/PythonFallbackKernel.cpp
+++ b/aten/src/ATen/core/PythonFallbackKernel.cpp
@@ -1,9 +1,18 @@
 #include <torch/library.h>
 #include <ATen/core/dispatch/Dispatcher.h>
+#include <ATen/PythonModeTLS.h>
 
 namespace {
 
 void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+  // If Python Mode is active, use its PyInterpreter for dispatch
+  const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state();
+  if (maybe_python_mode_state) {
+    maybe_python_mode_state->pyinterpreter()->dispatch(op, stack, maybe_python_mode_state);
+    return;
+  }
+
+  // Otherwise, find a PyInterpreter on a Tensor
   const auto& schema = op.schema();
   const auto num_arguments = schema.arguments().size();
   // It is safe to dispatch on the very first Tensor with a pyobj_interpreter
@@ -15,7 +24,7 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
     if (ivalue.isTensor()) {
       auto* interpreter = ivalue.unsafeToTensorImpl()->pyobj_interpreter();
       if (interpreter) {
-        interpreter->dispatch(op, stack);
+        interpreter->dispatch(op, stack, nullptr);
         return;
       }
     } else if (ivalue.isTensorList()) {
@@ -24,7 +33,7 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
       for (const auto& nv : ivalue.toListRef()) {
         auto* interpreter = nv.unsafeToTensorImpl()->pyobj_interpreter();
         if (interpreter) {
-          interpreter->dispatch(op, stack);
+          interpreter->dispatch(op, stack, nullptr);
           return;
         }
       }
diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp
index de829c493732d..9a72659711743 100644
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@@ -40,7 +40,8 @@ static c10::intrusive_ptr<TensorImpl> noop_detach_fn(
 static void noop_dispatch_fn(
     const PyInterpreter*,
     const c10::OperatorHandle& op,
-    torch::jit::Stack* stack) {
+    torch::jit::Stack* stack,
+    const std::shared_ptr<TorchDispatchTypeObject>& type) {
   TORCH_INTERNAL_ASSERT(
       0,
       "attempted to dispatch (__torch_dispatch__) an operator on Tensor with nontrivial PyObject after corresponding interpreter died");
@@ -608,6 +609,23 @@ void TensorImpl::copy_tensor_metadata(
   }
 }
 
+TorchDispatchTypeObject::TorchDispatchTypeObject(
+    PyObject* type_object,
+    c10::impl::PyInterpreter* pyinterpreter)
+    : data_(type_object), pyinterpreter_(pyinterpreter) {}
+
+TorchDispatchTypeObject::~TorchDispatchTypeObject() {
+  pyinterpreter_->decref(data_);
+}
+
+c10::impl::PyInterpreter* TorchDispatchTypeObject::pyinterpreter() const {
+  return pyinterpreter_;
+}
+
+PyObject* TorchDispatchTypeObject::ptr() const {
+  return data_;
+}
+
 namespace impl {
 
 namespace {
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index 7051e36b35516..d110a17b46590 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -161,6 +161,9 @@ struct C10_API AutogradMetaInterface {
   virtual ~AutogradMetaInterface();
 };
 
+// forward declared
+struct TorchDispatchTypeObject;
+
 namespace impl {
 
 // Unfortunately, the definition of AutogradMeta lives in a separate
@@ -255,7 +258,8 @@ struct C10_API PyInterpreter {
   using dispatch_sig = void(
       const PyInterpreter*,
       const c10::OperatorHandle&,
-      torch::jit::Stack* stack);
+      torch::jit::Stack* stack,
+      const std::shared_ptr<TorchDispatchTypeObject>& type);
 
   PyInterpreter(
       name_sig* name_fn,
@@ -299,8 +303,9 @@ struct C10_API PyInterpreter {
   // Invoke the Python boxed fallback dispatch to go back into Python
   __ubsan_ignore_function__ void dispatch(
       const c10::OperatorHandle& op,
-      torch::jit::Stack* stack) const {
-    return (*dispatch_fn_)(this, op, stack);
+      torch::jit::Stack* stack,
+      const std::shared_ptr<TorchDispatchTypeObject>& type) const {
+    return (*dispatch_fn_)(this, op, stack, type);
   }
 
   // Disarm this PyInterpreter, making all of its methods noops.
@@ -348,6 +353,30 @@ struct C10_API NamedTensorMetaInterface {
   };
 };
 
+// NOTE [What is TorchDispatchTypeObject?]
+// A TorchDispatchTypeObject represents the type of a Tensor subclass that has
+// a __torch_dispatch__ classmethod. Concretely, it holds the class as a
+// PyObject* and a PyInterpreter* that says which python interpreter the class
+// came from.
+//
+// See NOTE [dispatch_fn's type argument] for more details
+struct C10_API TorchDispatchTypeObject {
+  // Steals a reference to type_object
+  TorchDispatchTypeObject(
+      PyObject* type_object,
+      c10::impl::PyInterpreter* pyinterpreter);
+
+  // Releases the stolen reference to type_object
+  ~TorchDispatchTypeObject();
+
+  c10::impl::PyInterpreter* pyinterpreter() const;
+  PyObject* ptr() const;
+
+ private:
+  PyObject* data_;
+  c10::impl::PyInterpreter* pyinterpreter_;
+};
+
 // NOTE [ Version Counter Sharing ]
 //
 // Every Tensor has a version counter. Version counters are incremented whenever
diff --git a/test/run_test.py b/test/run_test.py
index dd95e13de8e36..615aaf912c314 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -103,6 +103,7 @@
     "test_optim",
     "test_functional_optim",
     "test_pytree",
+    "test_python_dispatch",
     "test_mobile_optimizer",
     "test_set_default_mobile_cpu_allocator",
     "test_xnnpack_integration",
diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py
index 0f5b6b9cbd70e..e474f1f4783f0 100644
--- a/test/test_python_dispatch.py
+++ b/test/test_python_dispatch.py
@@ -1,6 +1,7 @@
 import torch
 from torch.testing._internal.common_utils import TestCase, run_tests
 from torch.utils._pytree import tree_map
+from torch.utils._python_dispatch import enable_python_mode
 
 from typing import Iterator, List
 import logging
@@ -50,7 +51,10 @@ def unwrap(e):
         def wrap(e):
             return LoggingTensor(e) if isinstance(e, torch.Tensor) else e
 
-        rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs)))
+        # no_dispatch is only needed if you use enable_python_mode.
+        # It prevents infinite recursion.
+        with no_dispatch():
+            rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs)))
         logging.getLogger("LoggingTensor").info(f"{func.__module__}.{func.__name__}", args, kwargs, rs)
         return rs
 
@@ -335,6 +339,81 @@ def backward(ctx, grad_output):
 $5 = torch._ops.aten.mul($4, $0)
 $6 = torch._ops.aten.add_($1, $5)''')
 
+    def test_enable_python_mode_error(self) -> None:
+        with self.assertRaisesRegex(ValueError, "__torch_dispatch__"):
+            with enable_python_mode(torch.Tensor):
+                pass
+        z = LoggingTensor(torch.empty([]))
+        with self.assertRaisesRegex(ValueError, "must be the type"):
+            with enable_python_mode(z):
+                pass
+
+    def test_enable_python_mode_basic(self) -> None:
+        with enable_python_mode(LoggingTensor):
+            z = torch.empty([])
+            self.assertTrue(isinstance(z, LoggingTensor))
+
+    def test_enable_python_mode_unrelated_tensors(self) -> None:
+        x = torch.randn([])
+        y = torch.randn([])
+        with enable_python_mode(LoggingTensor):
+            z = x + y
+            self.assertTrue(isinstance(z, LoggingTensor))
+
+    def test_enable_python_mode_subclass_priority(self) -> None:
+        class ErrorA(RuntimeError):
+            pass
+
+        class ErrorB(RuntimeError):
+            pass
+
+        class A(torch.Tensor):
+            @staticmethod
+            def __new__(cls, elem):
+                return torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
+
+            @classmethod
+            def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+                raise ErrorA
+
+        class B(A):
+            @staticmethod
+            def __new__(cls, elem):
+                return torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
+
+            @classmethod
+            def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+                raise ErrorB
+
+        a = A(torch.empty(1))
+        b = B(torch.empty(1))
+        with self.assertRaises(ErrorA):
+            a + a
+
+        # B has precedence over A due to the subclass relationship
+        with self.assertRaises(ErrorB):
+            with enable_python_mode(A):
+                b + b
+        with self.assertRaises(ErrorB):
+            with enable_python_mode(B):
+                a + a
+        with self.assertRaises(ErrorB):
+            with enable_python_mode(B):
+                a + b
+
+    def test_enable_python_mode_respects_no_dispatch(self) -> None:
+        with enable_python_mode(LoggingTensor):
+            z = torch.ones([2, 3])
+            self.assertTrue(isinstance(z, LoggingTensor))
+            with no_dispatch():
+                expected = torch.ones([2, 3])
+                self.assertEqual(z.elem, expected)
+
+    def test_nested_enable_python_mode(self) -> None:
+        with self.assertRaisesRegex(RuntimeError, "has already been set"):
+            with enable_python_mode(LoggingTensor):
+                with enable_python_mode(LoggingTensor):
+                    pass
 
 if __name__ == '__main__':
     run_tests()
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 34846b5d6c7b3..dd89981094d4f 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -666,6 +666,7 @@ libtorch_python_core_sources = [
     "torch/csrc/autograd/init.cpp",
     "torch/csrc/autograd/python_anomaly_mode.cpp",
     "torch/csrc/autograd/python_saved_variable_hooks.cpp",
+    "torch/csrc/autograd/python_mode.cpp",
     "torch/csrc/autograd/python_cpp_function.cpp",
     "torch/csrc/autograd/python_engine.cpp",
     "torch/csrc/autograd/python_function.cpp",
@@ -793,6 +794,7 @@ aten_cpu_source_non_codegen_list = [
     "aten/src/ATen/ParallelNativeTBB.cpp",
     "aten/src/ATen/ParallelOpenMP.cpp",
     "aten/src/ATen/ParallelThreadPoolNative.cpp",
+    "aten/src/ATen/PythonModeTLS.cpp",
     "aten/src/ATen/ScalarOps.cpp",
     "aten/src/ATen/SequenceNumber.cpp",
     "aten/src/ATen/SparseTensorImpl.cpp",
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 3629150d15090..c847e8deced62 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -652,6 +652,8 @@ def __set_forward_AD_enabled(enabled: _bool) -> None: ...
 def __is_forward_AD_enabled() -> _bool: ...
 def _register_default_hooks(pack_hook: Callable, unpack_hook: Callable) -> None: ...
 def _reset_default_hooks() -> None: ...
+def _enter_python_mode(cls: Type) -> None: ...
+def _exit_python_mode() -> None: ...
 
 class _InferenceMode(object):
     def __init__(self, mode: _bool) -> None: ...
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 697ca871f83c5..860aaec466218 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -14,6 +14,7 @@
 #include <torch/csrc/autograd/python_saved_variable_hooks.h>
 #include <torch/csrc/autograd/utils/wrap_outputs.h>
 #include <torch/csrc/autograd/utils/python_arg_parsing.h>
+#include <torch/csrc/autograd/python_mode.h>
 #include <torch/csrc/utils/pycfunction_helpers.h>
 #include <c10/core/ScalarType.h>
 
@@ -494,6 +495,20 @@ static PyObject * python_exit_dual_level(PyObject* _unused, PyObject* args, PyOb
   END_HANDLE_TH_ERRORS
 }
 
+static PyObject * enter_python_mode(PyObject* _unused, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  PythonMode::enter(arg);
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject * exit_python_mode(PyObject* _unused, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  PythonMode::exit();
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
 // autograd methods on torch._C
 static PyMethodDef methods[] = { // NOLINT
   {"_set_grad_enabled", set_grad_enabled, METH_O, nullptr},
@@ -514,6 +529,8 @@ static PyMethodDef methods[] = { // NOLINT
   {"is_anomaly_enabled", is_anomaly_mode_enabled, METH_NOARGS, nullptr},
   {"_enter_dual_level", python_enter_dual_level, METH_NOARGS, nullptr},
   {"_exit_dual_level", castPyCFunctionWithKeywords(python_exit_dual_level), METH_VARARGS | METH_KEYWORDS, nullptr},
+  {"_enter_python_mode", enter_python_mode, METH_O, nullptr},
+  {"_exit_python_mode", exit_python_mode, METH_NOARGS, nullptr},
   {nullptr, nullptr, 0, nullptr}
 };
 
diff --git a/torch/csrc/autograd/python_mode.cpp b/torch/csrc/autograd/python_mode.cpp
new file mode 100644
index 0000000000000..435842631a5bb
--- /dev/null
+++ b/torch/csrc/autograd/python_mode.cpp
@@ -0,0 +1,27 @@
+#include <torch/csrc/autograd/python_mode.h>
+#include <torch/csrc/python_headers.h>
+#include <torch/csrc/autograd/python_variable.h>
+#include <ATen/PythonModeTLS.h>
+#include <c10/core/TensorImpl.h>
+
+namespace torch { namespace autograd {
+
+void PythonMode::enter(PyObject* type) {
+  if (at::impl::PythonModeTLS::get_state()) {
+    TORCH_CHECK(
+        false,
+        "python mode has already been set. We do not yet support nested python ",
+        "mode. Please file us an issue and reset it before setting it again.")
+  }
+  // TorchDispatchTypeObject steals a reference, See NOTE [What is TorchDispatchTypeObject?]
+  Py_INCREF(type);
+  auto state = std::make_shared<c10::TorchDispatchTypeObject>(type, getPyInterpreter());
+  at::impl::PythonModeTLS::set_state(state);
+}
+
+void PythonMode::exit() {
+  TORCH_INTERNAL_ASSERT(at::impl::PythonModeTLS::get_state(), "exiting Python Mode but it wasn't set!");
+  at::impl::PythonModeTLS::reset_state();
+}
+
+}}
diff --git a/torch/csrc/autograd/python_mode.h b/torch/csrc/autograd/python_mode.h
new file mode 100644
index 0000000000000..03da51c1c49e0
--- /dev/null
+++ b/torch/csrc/autograd/python_mode.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <torch/csrc/python_headers.h>
+#include <c10/core/TensorImpl.h>
+
+namespace torch { namespace autograd {
+
+struct TORCH_API PythonMode {
+  // Enter python mode, causing all operators to dispatch to the type's __torch_dispatch__.
+  // `type` is the type of a Tensor subclass that has __torch_dispatch__.
+  static void enter(PyObject* type);
+
+  // Exit the current python mode.
+  static void exit();
+};
+
+}}
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index 50d6eb9ab7e05..abe90105cde2b 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -32,6 +32,7 @@
 
 #include <torch/library.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
+#include <torch/csrc/autograd/python_mode.h>
 
 
 #include <ATen/ATen.h>
@@ -64,7 +65,12 @@ void concrete_decref_fn(const c10::impl::PyInterpreter* self, PyObject* pyobj) {
     return;
 
   pybind11::gil_scoped_acquire gil;
-  if (Py_REFCNT(pyobj) > 1) {
+  // Two possibilities:
+  // 1. We are decref-ing a tensor. Then we must be careful about
+  // PyObject resurrection (this only applies to Tensors, see THPVariable_clear).
+  // 2. We are decref-ing some other Python object. We don't do
+  // PyObject resurrection on non-Tensors, so we just carry on as usual
+  if (THPVariable_Check(pyobj) && Py_REFCNT(pyobj) > 1) {
     // It's still alive!  This can happen if a weak ref resurrected
     // the PyObject without flipping ownership.  At this point it is
     // too late to rescue the object, so just stub out the PyObject
@@ -82,7 +88,11 @@ void concrete_decref_fn(const c10::impl::PyInterpreter* self, PyObject* pyobj) {
 };
 
 c10::intrusive_ptr<TensorImpl> concrete_detach_fn(const c10::impl::PyInterpreter*, const c10::TensorImpl* self);
-void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack);
+void concrete_dispatch_fn(
+    const c10::impl::PyInterpreter*,
+    const c10::OperatorHandle& op,
+    torch::jit::Stack* stack,
+    const std::shared_ptr<TorchDispatchTypeObject>& type);
 
 class PyInterpreterHolder {
  public:
@@ -1491,7 +1501,19 @@ bool isPythonTensor(const Tensor& tensor) {
   return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Python);
 }
 
-void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+// NOTE [dispatch_fn's type argument]
+// `type` is nullable and represents the PythonMode going on.
+// Right now we only support a single PythonMode, but in the future we could
+// change this to a stack of PythonModes.
+//
+// If `type` isn't null, then we consider the type for dispatch by prepending
+// it to the overloaded_args list. `handle_torch_funciton_no_python_arg_parser`
+// is responsible for doing overload resolution.
+void concrete_dispatch_fn(
+    const c10::impl::PyInterpreter*,
+    const c10::OperatorHandle& op,
+    torch::jit::Stack* stack,
+    const std::shared_ptr<TorchDispatchTypeObject>& type) {
   const auto& schema = op.schema();
   const auto num_returns = schema.returns().size();
 
@@ -1568,13 +1590,17 @@ void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHa
   auto args = py::reinterpret_steal<py::object>(PyTuple_New(positional_default_start));
   py::dict kwargs;
 
+  if (type) {
+    append_overloaded_type(&overloaded_args, type->ptr());
+  }
+
   // Find overloaded tensors
   for (int64_t idx = 0; idx < arguments.size(); idx++) {
     const auto& ivalue = arguments[idx];
     if (ivalue.isTensor()) {
       const auto& tensor = ivalue.toTensor();
       if (isPythonTensor(tensor)) {
-        append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr());
+        append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr());
       }
     } else if (ivalue.isList()) {
       const auto& list = ivalue.toListRef();
@@ -1583,7 +1609,7 @@ void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHa
         if (nv.isTensor()) {
           const auto& tensor = nv.toTensor();
           if (isPythonTensor(tensor)) {
-            append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr());
+            append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr());
           }
         }
       }
@@ -1633,7 +1659,7 @@ c10::intrusive_ptr<TensorImpl> concrete_detach_fn(const c10::impl::PyInterpreter
   Tensor self_t = Tensor(c10::intrusive_ptr<c10::TensorImpl, c10::UndefinedTensorImpl>::unsafe_reclaim_from_nonowning(const_cast<c10::TensorImpl*>(self)));
   auto self_p = py::reinterpret_steal<py::object>(THPVariable_Wrap(self_t));
   TORCH_INTERNAL_ASSERT(isPythonTensor(self_t));
-  append_overloaded_arg(&overloaded_args, self_p.ptr());
+  append_overloaded_tensor(&overloaded_args, self_p.ptr());
   auto args = py::reinterpret_steal<py::object>(PyTuple_New(1));
   PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr());
 
diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp
index 6115dcdfbe61a..3ee20c055bf94 100644
--- a/torch/csrc/utils/python_arg_parser.cpp
+++ b/torch/csrc/utils/python_arg_parser.cpp
@@ -200,12 +200,28 @@ auto handle_torch_function(PyObject* self, const std::string& func_name, PyObjec
   return ret.release().ptr();
 }
 
+// Note: [Overloaded args]
+// An overloaded arg may be one of the following:
+// - an instance of an object that has a __torch_function__ method
+// - an instance of an object that has a __torch_dispatch__ classmethod
+// - a class type that has a __torch_dispatch__ classmethod
+//
+// This function returns the type of the arg (if the arg is an instance),
+// otherwise, it returns the arg.
+static PyObject* get_type_of_overloaded_arg(PyObject* obj_or_type) {
+  if (PyType_Check(obj_or_type)) {
+    return obj_or_type;
+  }
+  return (PyObject*)Py_TYPE(obj_or_type);
+}
+
+// See Note: [Overloaded args] for what they hold
 auto handle_torch_function_no_python_arg_parser(const std::vector<py::handle> &overloaded_args, PyObject* args, PyObject* kwargs, const char* func_name, PyObject* torch_api_function, const char* module_name, const char* torch_function_name) -> PyObject* {
   // overloaded_args already all have unique types
   std::vector<py::object> overloaded_types;
   overloaded_types.reserve(overloaded_args.size());
   for (auto &arg : overloaded_args) {
-    overloaded_types.push_back(py::reinterpret_borrow<py::object>((PyObject *) Py_TYPE(arg.ptr())));
+    overloaded_types.push_back(py::reinterpret_borrow<py::object>(get_type_of_overloaded_arg(arg.ptr())));
   }
   py::tuple py_types = py::cast(overloaded_types);
   py::object ret;
@@ -231,7 +247,7 @@ auto handle_torch_function_no_python_arg_parser(const std::vector<py::handle> &o
     ss << "no implementation found for '" << module_name << "." << func_name
        << "' on types that implement " << torch_function_name << ": [";
     for (auto &arg : overloaded_args) {
-      ss << arg.ptr()->ob_type->tp_name;
+      ss << PyObject_Repr(get_type_of_overloaded_arg(arg.ptr()));
       if (!arg.is(overloaded_args.back())) {
         ss << ", ";
       }
@@ -328,10 +344,11 @@ auto handle_torch_function_indexing(PyObject* self, PyObject* index, PyObject* v
  *
  */
 
-void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyObject* obj) {
+static void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyObject* obj, bool obj_is_type) {
   bool class_not_seen_yet = true;
+  PyObject* obj_type = obj_is_type ? obj : (PyObject*)Py_TYPE(obj);
   for (auto &arg : *overloaded_args) {
-    if (Py_TYPE(obj) == Py_TYPE(arg.ptr())) {
+    if (obj_type == get_type_of_overloaded_arg(arg.ptr())) {
       // obj is the same type as another parameter we've seen in a prior
       // iteration of the loop over parameters so we already have an entry
       // with the proper __torch_function__ implementation to call, so skip
@@ -343,7 +360,7 @@ void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyObject* o
   if (class_not_seen_yet) {
     int arg_index = overloaded_args->size();
     for(const auto j : c10::irange(arg_index)) {
-      if (PyObject_IsInstance(obj, (PyObject*)(Py_TYPE((*overloaded_args)[j].ptr())))) {
+      if (PyObject_IsSubclass(obj_type, (PyObject*)(get_type_of_overloaded_arg((*overloaded_args)[j].ptr())))) {
         // obj is a subclass of another object we've seen already so its
         // __torch_function__ should be called first, therefore we
         // insert it into overloaded_args before the superclass
@@ -358,6 +375,14 @@ void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyObject* o
   }
 }
 
+void append_overloaded_tensor(std::vector<py::handle>* overloaded_args, PyObject* obj) {
+  append_overloaded_arg(overloaded_args, obj, /*obj_is_type*/false);
+}
+
+void append_overloaded_type(std::vector<py::handle>* overloaded_args, PyObject* obj) {
+  append_overloaded_arg(overloaded_args, obj, /*obj_is_type*/true);
+}
+
 bool is_tensor_and_append_overloaded(PyObject* obj, std::vector<py::handle>* overloaded_args) {
   if (THPVariable_CheckExact(obj)) {
     // torch.Tensor instances (not subclasses, except for Parameter)
@@ -366,7 +391,7 @@ bool is_tensor_and_append_overloaded(PyObject* obj, std::vector<py::handle>* ove
 
   if (check_has_torch_function(obj)) {
     // tensor subclasses and unrelated objects with __torch_function__
-    append_overloaded_arg(overloaded_args, obj);
+    append_overloaded_tensor(overloaded_args, obj);
     return true;
   } else if (THPVariable_Check(obj)) {
     // tensor subclasses without __torch_function__
@@ -905,7 +930,7 @@ bool FunctionSignature::parse(PyObject* self, PyObject* args, PyObject* kwargs,
 
   int i = 0;
   if (self != nullptr && check_has_torch_function(self)) {
-    append_overloaded_arg(&this->overloaded_args, self);
+    append_overloaded_tensor(&this->overloaded_args, self);
   }
   for (auto& param : params) {
     PyObject* obj = nullptr;
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index d132185ccaefb..6a05807e5a314 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -818,6 +818,15 @@ bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector<py::handle>
  * 'overloaded_args': the vector to append the overloaded args
  * 'obj': the input tensor that is overloaded
  */
-void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyObject* obj);
+void append_overloaded_tensor(std::vector<py::handle>* overloaded_args, PyObject* obj);
+
+/* Given an argument that is definitely a type and is definitely overloaded,
+ * append it to the overloaded arguments list. Use this only with __torch_dispatch__,
+ * where we operate on classes that have a __torch_dispatch__ classmethod.
+ *
+ * 'overloaded_args': the vector to append the overloaded type
+ * 'obj': the input class that has a __torch_dispatch__ classmethod.
+ */
+void append_overloaded_type(std::vector<py::handle>* overloaded_args, PyObject* obj);
 
 } // namespace torch
diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp
index 17d7acc37640c..25e9a5962614f 100644
--- a/torch/csrc/utils/tensor_new.cpp
+++ b/torch/csrc/utils/tensor_new.cpp
@@ -267,6 +267,7 @@ Tensor internal_new_from_data(
   {
     at::AutoDispatchBelowADInplaceOrView guard;  // TODO: remove
     at::tracer::impl::NoTracerDispatchMode tracer_guard;
+    c10::impl::ExcludeDispatchKeyGuard pythonmode_guard(c10::DispatchKey::Python);
     // functorch uses FuncTorchDynamicLayerBackMode as a mode key to wrap all
     // tensors returned from operators in special TensorWrapper tensor extension
     // The problem with this is that TensorWrapper does not have storage so
diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py
new file mode 100644
index 0000000000000..a7cfae10c37c1
--- /dev/null
+++ b/torch/utils/_python_dispatch.py
@@ -0,0 +1,34 @@
+import torch
+import contextlib
+from typing import Iterator
+
+# Context manager that causes all pytorch operators to dispatch to the passed-in
+# type's __torch_dispatch__ function.
+# operation that accepts no tensors but returns a tensor.
+#
+# enable_python_mode is affected by torch._C._DisableTorchDispatch.
+#
+# NB: Calling an operator inside __torch_dispatch__ does go through
+# __torch_dispatch__ again. Please use _DisableTorchDispatch inside
+# __torch_dispatch__ to prevent infinite recursion.
+#
+# TODO: Limitations and things about enable_python_mode we should fix before exposing it:
+# - it currently cannot be nested. This should be simple to implement; we need a
+#   stack of TorchDispatchTypeObjects and the next bullet point.
+# - We need a better user-facing api for torch._C._DisableTorchDispatch that
+#   is able to selectively disable __torch_dispatch__ of a particular class.
+# - It doesn't work with the tensor constructors (torch.tensor, torch.Tensor)
+# - Better name (see https://github.com/pytorch/pytorch/pull/63496#discussion_r694091694)
+@contextlib.contextmanager
+def enable_python_mode(cls) -> Iterator[None]:
+    if not hasattr(cls, '__torch_dispatch__'):
+        raise ValueError('The class passed to enable_python_mode '
+                         'must have a __torch_dispatch__ classmethod')
+    if not isinstance(cls, type) or not issubclass(cls, (torch.Tensor,)):
+        raise ValueError('The argument passed to enable_python_mode '
+                         'must be the type of a Tensor subclass')
+    torch._C._enter_python_mode(cls)
+    try:
+        yield
+    finally:
+        torch._C._exit_python_mode()

From af85bc5ffd1d4ad52e0fed255aa7afe2fdfbc5e2 Mon Sep 17 00:00:00 2001
From: Erjia Guan <erjia@fb.com>
Date: Mon, 30 Aug 2021 18:41:08 -0700
Subject: [PATCH 371/530] Replace group_by_key by group_by IterDataPipe
 (#64220)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64220

Remove `ByKeyGrouperIterDataPipe` due to duplicated functionality.
Fix a bug in `GrouperIterDataPipe` using the existing test.

Test Plan: Imported from OSS

Reviewed By: VitalyFedyunin

Differential Revision: D30650542

Pulled By: ejguan

fbshipit-source-id: 666b4d28282fb4f49f3ff101b8d08be16a50d836
---
 test/test_datapipe.py                       |  22 +++-
 torch/utils/data/datapipes/iter/__init__.py |   4 +-
 torch/utils/data/datapipes/iter/grouping.py | 121 +-------------------
 3 files changed, 23 insertions(+), 124 deletions(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index 86e53fa699142..c35698e057c89 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -299,7 +299,7 @@ def _helper(prior_dp, dp, channel_first=False):
         _helper(cached, datapipe4, channel_first=True)
 
     # TODO(VitalyFedyunin): Generates unclosed buffer warning, need to investigate
-    def test_groupbykey_iterable_datapipe(self):
+    def test_groupby_iterable_datapipe(self):
         temp_dir = self.temp_dir.name
         temp_tarfile_pathname = os.path.join(temp_dir, "test_tar.tar")
         file_list = [
@@ -316,13 +316,25 @@ def test_groupbykey_iterable_datapipe(self):
         datapipe1 = dp.iter.FileLister(temp_dir, '*.tar')
         datapipe2 = dp.iter.FileLoader(datapipe1)
         datapipe3 = dp.iter.TarArchiveReader(datapipe2)
-        datapipe4 = dp.iter.ByKeyGrouper(datapipe3, group_size=2)
 
-        expected_result = [("a.png", "a.json"), ("c.png", "c.json"), ("b.png", "b.json"), ("d.png", "d.json"), (
-            "f.png", "f.json"), ("g.png", "g.json"), ("e.png", "e.json"), ("h.json", "h.txt")]
+        def group_fn(data):
+            filepath, _ = data
+            return os.path.basename(filepath).split(".")[0]
+
+        datapipe4 = dp.iter.Grouper(datapipe3, group_key_fn=group_fn, group_size=2)
+
+        def order_fn(data):
+            data.sort(key=lambda f: f[0], reverse=True)
+            return data
+
+        datapipe5 = dp.iter.Mapper(datapipe4, fn=order_fn)  # type: ignore[var-annotated]
+
+        expected_result = [
+            ("a.png", "a.json"), ("c.png", "c.json"), ("b.png", "b.json"), ("d.png", "d.json"),
+            ("f.png", "f.json"), ("g.png", "g.json"), ("e.png", "e.json"), ("h.txt", "h.json")]
 
         count = 0
-        for rec, expected in zip(datapipe4, expected_result):
+        for rec, expected in zip(datapipe5, expected_result):
             count = count + 1
             self.assertEqual(os.path.basename(rec[0][0]), expected[0])
             self.assertEqual(os.path.basename(rec[1][0]), expected[1])
diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index f302fd3a2b7ea..8478577c8f2aa 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -19,7 +19,7 @@
 from torch.utils.data.datapipes.iter.grouping import (
     BatcherIterDataPipe as Batcher,
     BucketBatcherIterDataPipe as BucketBatcher,
-    ByKeyGrouperIterDataPipe as ByKeyGrouper,
+    GrouperIterDataPipe as Grouper,
 )
 from torch.utils.data.datapipes.iter.httpreader import (
     HTTPReaderIterDataPipe as HttpReader,
@@ -48,12 +48,12 @@
 
 __all__ = ['Batcher',
            'BucketBatcher',
-           'ByKeyGrouper',
            'Collator',
            'Concater',
            'FileLister',
            'FileLoader',
            'Filter',
+           'Grouper',
            'HttpReader',
            'IterableWrapper',
            'LineReader',
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index 5f449489ac756..f47299ce1cf9c 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -1,12 +1,10 @@
-import functools
-import os
 import random
 import warnings
 
 from collections import defaultdict
 
 from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk
-from typing import Any, Callable, Dict, Iterator, List, Optional, Sized, Tuple, TypeVar, DefaultDict
+from typing import Any, Callable, DefaultDict, Iterator, List, Optional, Sized, TypeVar
 
 T_co = TypeVar('T_co', covariant=True)
 
@@ -225,35 +223,6 @@ def __len__(self) -> int:
         raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))
 
 
-# defaut group key is the file pathname without the extension.
-# Assuming the passed in data is a tuple and 1st item is file's pathname.
-def default_group_key_fn(dataitem: Tuple[str, Any]):
-    return os.path.splitext(dataitem[0])[0]
-
-
-def default_sort_data_fn(datalist: List[Tuple[str, Any]]):
-    txt_ext = ['.json', '.jsn', '.txt', '.text']
-
-    def cmp_fn(a: Tuple[str, Any], b: Tuple[str, Any]):
-        a_is_txt = os.path.splitext(a[0])[1] in txt_ext
-        b_is_txt = os.path.splitext(b[0])[1] in txt_ext
-
-        # if a is txt but b is not, b go front
-        if a_is_txt and not b_is_txt:
-            return 1
-        # if a is not txt but b is txt, a go front
-        if not a_is_txt and b_is_txt:
-            return -1
-        # if a and b both are or are not txt, sort in alphabetic order
-        if a[0] < b[0]:
-            return -1
-        elif a[0] > b[0]:
-            return 1
-        return 0
-
-    return sorted(datalist, key=functools.cmp_to_key(cmp_fn))
-
-
 @functional_datapipe('groupby')
 class GrouperIterDataPipe(IterDataPipe):
     # TODO(VtalyFedyunin): Add inline docs and tests (they are partially available in notebooks)
@@ -309,6 +278,9 @@ def __iter__(self):
         for x in self.datapipe:
             key = self.group_key_fn(x)
 
+            buffer_elements[key].append(x)
+            buffer_size += 1
+
             if self.group_size is not None and self.group_size == len(buffer_elements[key]):
                 yield self.wrapper_class(buffer_elements[key])
                 buffer_size -= len(buffer_elements[key])
@@ -319,92 +291,7 @@ def __iter__(self):
                 if result_to_yield is not None:
                     yield self.wrapper_class(result_to_yield)
 
-            buffer_elements[key].append(x)
-            buffer_size += 1
-
         while buffer_size:
             (result_to_yield, buffer_size) = self._remove_biggest_key(buffer_elements, buffer_size)
             if result_to_yield is not None:
                 yield self.wrapper_class(result_to_yield)
-
-
-@functional_datapipe('group_by_key')
-class ByKeyGrouperIterDataPipe(IterDataPipe[list]):
-    r""" :class:`GroupByKeyIterDataPipe`.
-
-    Iterable datapipe to group data from input iterable by keys which are generated from `group_key_fn`,
-    yields a list with `group_size` items in it, each item in the list is a tuple of key and data
-
-    args:
-        datapipe: Iterable datapipe that provides data. (typically str key (eg. pathname) and data stream in tuples)
-        group_size: the size of group
-        max_buffer_size: the max size of stream buffer which is used to store not yet grouped but iterated data
-        group_key_fn: a function which is used to generate group key from the data in the input datapipe
-        sort_data_fn: a function which is used to sort the grouped data before yielding back
-        length: a nominal length of the datapipe
-    """
-    datapipe: IterDataPipe[Tuple[str, Any]]
-    group_size: int
-    max_buffer_size: int
-    group_key_fn: Callable
-    sort_data_fn: Callable
-    curr_buffer_size: int
-    stream_buffer: Dict[str, List[Tuple[str, Any]]]
-    length: int
-
-    def __init__(
-            self,
-            datapipe: IterDataPipe[Tuple[str, Any]],
-            *,
-            group_size: int,
-            max_buffer_size: Optional[int] = None,
-            group_key_fn: Callable = default_group_key_fn,
-            sort_data_fn: Callable = default_sort_data_fn,
-            length: int = -1):
-        super().__init__()
-
-        assert group_size > 0
-        self.datapipe = datapipe
-        self.group_size = group_size
-
-        # default max buffer size is group_size * 10
-        self.max_buffer_size = max_buffer_size if max_buffer_size is not None else group_size * 10
-        assert self.max_buffer_size >= self.group_size
-
-        self.group_key_fn = group_key_fn  # type: ignore[assignment]
-        self.sort_data_fn = sort_data_fn  # type: ignore[assignment]
-        self.curr_buffer_size = 0
-        self.stream_buffer = {}
-        self.length = length
-
-    def __iter__(self) -> Iterator[list]:
-        if self.group_size == 1:
-            for data in self.datapipe:
-                yield [data]
-        else:
-            for data in self.datapipe:
-                key = self.group_key_fn(data)
-                if key not in self.stream_buffer:
-                    self.stream_buffer[key] = []
-                res = self.stream_buffer[key]
-                res.append(data)
-                if len(res) == self.group_size:
-                    yield self.sort_data_fn(res)
-                    del self.stream_buffer[key]
-                    self.curr_buffer_size = self.curr_buffer_size - self.group_size + 1
-                else:
-                    if self.curr_buffer_size == self.max_buffer_size:
-                        raise OverflowError(
-                            "stream_buffer is overflow, please adjust the order of data "
-                            "in the input datapipe or increase the buffer size!")
-                    self.curr_buffer_size = self.curr_buffer_size + 1
-
-            if self.curr_buffer_size > 0:
-                msg = "Not able to group [{}] with group size {}.".format(
-                    ','.join([v[0] for _, vs in self.stream_buffer.items() for v in vs]), str(self.group_size))
-                raise RuntimeError(msg)
-
-    def __len__(self) -> int:
-        if self.length == -1:
-            raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))
-        return self.length

From a49907f984670781a718ef6aa0046709886eae5a Mon Sep 17 00:00:00 2001
From: Erjia Guan <erjia@fb.com>
Date: Mon, 30 Aug 2021 18:41:08 -0700
Subject: [PATCH 372/530] Modify inline doc for DataPipe (#64221)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64221

List of tasks in this PR
- [x]  Add inline doc for DataPipe
- [x] Improve the inline doc
- [x] Expose DataPipe to `datapipes.iter` (`UnBatcher`) Note: `Forker`, `Demux`, `Mux` are exposed in another PR authored by Kevin
- [x] Add correct typing to DataPipe
- [x] Unify the argument to `datapipe` rather than `source_datapipe`

Test Plan: Imported from OSS

Reviewed By: VitalyFedyunin

Differential Revision: D30650541

Pulled By: ejguan

fbshipit-source-id: c09d1b9742b8097d8e645c15947cef80c876877b
---
 torch/utils/data/datapipes/iter/__init__.py   |  2 +
 torch/utils/data/datapipes/iter/callable.py   | 14 ++++---
 .../data/datapipes/iter/combinatorics.py      |  9 ++--
 torch/utils/data/datapipes/iter/combining.py  |  8 ++--
 torch/utils/data/datapipes/iter/filelister.py | 21 +++++-----
 torch/utils/data/datapipes/iter/fileloader.py |  5 ++-
 torch/utils/data/datapipes/iter/grouping.py   | 42 +++++++++++++------
 torch/utils/data/datapipes/iter/httpreader.py | 12 +++---
 torch/utils/data/datapipes/iter/linereader.py |  9 ++--
 .../data/datapipes/iter/routeddecoder.py      |  6 ++-
 torch/utils/data/datapipes/iter/selecting.py  |  7 ++--
 .../utils/data/datapipes/iter/streamreader.py | 14 ++++---
 .../data/datapipes/iter/tararchivereader.py   | 12 +++---
 torch/utils/data/datapipes/iter/utils.py      |  7 ++++
 .../data/datapipes/iter/ziparchivereader.py   | 15 ++++---
 15 files changed, 114 insertions(+), 69 deletions(-)

diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index 8478577c8f2aa..b55bbf6667509 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -20,6 +20,7 @@
     BatcherIterDataPipe as Batcher,
     BucketBatcherIterDataPipe as BucketBatcher,
     GrouperIterDataPipe as Grouper,
+    UnBatcherIterDataPipe as UnBatcher,
 )
 from torch.utils.data.datapipes.iter.httpreader import (
     HTTPReaderIterDataPipe as HttpReader,
@@ -63,6 +64,7 @@
            'Shuffler',
            'StreamReader',
            'TarArchiveReader',
+           'UnBatcher',
            'ZipArchiveReader',
            'Zipper']
 
diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py
index 18f6f17fff156..2c5ca3d024392 100644
--- a/torch/utils/data/datapipes/iter/callable.py
+++ b/torch/utils/data/datapipes/iter/callable.py
@@ -31,14 +31,15 @@ class MapperIterDataPipe(IterDataPipe[T_co]):
     Iterable DataPipe to run a function over each item from the source DataPipe.
     The function can be any regular python function or partial object. Lambda
     function is not recommended as it is not supported by pickle.
-    args:
+
+    Args:
         datapipe: Source Iterable DataPipe
         fn: Function called over each item
         fn_args: Positional arguments for `fn`
         fn_kwargs: Keyword arguments for `fn`
-        nesting_level: Determines which level the fn gets applied to, by default it applies to the top level (= 0)
-        This also accepts -1 as input to apply the function to the lowest nesting level. It currently doesn't support
-        argument < -1.
+        nesting_level: Determines which level the fn gets applied to, by default it applies to the top level (= 0).
+            This also accepts -1 as input to apply the function to the lowest nesting level. It currently doesn't support
+            argument < -1.
     """
     datapipe: IterDataPipe
     fn: Callable
@@ -112,10 +113,11 @@ class CollatorIterDataPipe(MapperIterDataPipe):
 
     Iterable DataPipe to collate samples from datapipe to Tensor(s) by `util_.collate.default_collate`,
     or customized Data Structure by collate_fn.
-    args:
+
+    Args:
         datapipe: Iterable DataPipe being collated
         collate_fn: Customized collate function to collect and combine data or a batch of data.
-                    Default function collates to Tensor(s) based on data type.
+            Default function collates to Tensor(s) based on data type.
         fn_args: Positional arguments for `collate_fn`
         fn_kwargs: Keyword arguments for `collate_fn`
 
diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py
index d1a7dd0368221..4d6fac749729d 100644
--- a/torch/utils/data/datapipes/iter/combinatorics.py
+++ b/torch/utils/data/datapipes/iter/combinatorics.py
@@ -10,10 +10,11 @@ class SamplerIterDataPipe(IterDataPipe[T_co]):
     r""" :class:`SamplerIterDataPipe`.
 
     Iterable DataPipe to generate sample elements.
-    args:
-        datapipe: IterDataPipe sampled from
+
+    Args:
+        datapipe: IterDataPipe to sample from
         sampler: Sampler class to genereate sample elements from input DataPipe.
-                    Default is :class:`SequentialSampler` for IterDataPipe
+            Default is :class:`SequentialSampler` for IterDataPipe
     """
     datapipe: IterDataPipe
     sampler: Sampler
@@ -63,7 +64,7 @@ class ShufflerIterDataPipe(IterDataPipe[T_co]):
     mode (:attr:`num_worker > 0`), `worker_init_fn` is used to set up a random seed
     for each worker process.
 
-    args:
+    Args:
         datapipe: The IterDataPipe being shuffled
         buffer_size: The buffer size for shuffling (default to 10000)
         unbatch_level: Specifies if it necessary to unbatch source data before
diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index 4b28e0926c42b..879e8be27ff0c 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -11,7 +11,8 @@ class ConcaterIterDataPipe(IterDataPipe):
     r""" :class:`ConcaterIterDataPipe`.
 
     Iterable DataPipe to concatenate multiple Iterable DataPipes.
-    args:
+
+    Args:
         datapipes: Iterable DataPipes being concatenated
     """
     datapipes: Tuple[IterDataPipe]
@@ -97,12 +98,13 @@ def __iter__(self):
 
 @functional_datapipe('zip')
 class ZipperIterDataPipe(IterDataPipe[Tuple[T_co]]):
-    r""" :class:`ZipIterDataPipe`.
+    r""" :class:`ZipperIterDataPipe`.
 
     Iterable DataPipe aggregates elements into a tuple from each of
     the input DataPipe. The output DataPipe is stopped when the
     shortest input DataPipe is exhausted.
-    args:
+
+    Args:
         *datapipes: Iterable DataPipes being aggregated
     """
     datapipes: Tuple[IterDataPipe]
diff --git a/torch/utils/data/datapipes/iter/filelister.py b/torch/utils/data/datapipes/iter/filelister.py
index 48fdce9f52ef5..aef147d2d2941 100644
--- a/torch/utils/data/datapipes/iter/filelister.py
+++ b/torch/utils/data/datapipes/iter/filelister.py
@@ -6,11 +6,12 @@ class FileListerIterDataPipe(IterDataPipe[str]):
     r""" :class:`FileListerIterDataPipe`
 
     Iterable DataPipe to load file pathname(s) (path + filename), yield pathname from given disk root dir.
-    args:
-        root : root dir
-        mask : a unix style filter string or string list for filtering file name(s)
-        abspath : whether to return relative pathname or absolute pathname
-        length : a nominal length of the datapipe
+
+    Args:
+        root: Root directory
+        mask: Unix style filter string or string list for filtering file name(s)
+        abspath: Whether to return relative pathname or absolute pathname
+        length: Nominal length of the datapipe
     """
 
     def __init__(
@@ -22,11 +23,11 @@ def __init__(
             abspath: bool = False,
             length: int = -1):
         super().__init__()
-        self.root : str = root
-        self.masks : Union[str, List[str]] = masks
-        self.recursive : bool = recursive
-        self.abspath : bool = abspath
-        self.length : int = length
+        self.root: str = root
+        self.masks: Union[str, List[str]] = masks
+        self.recursive: bool = recursive
+        self.abspath: bool = abspath
+        self.length: int = length
 
     def __iter__(self) -> Iterator[str] :
         yield from get_file_pathnames_from_root(self.root, self.masks, self.recursive, self.abspath)
diff --git a/torch/utils/data/datapipes/iter/fileloader.py b/torch/utils/data/datapipes/iter/fileloader.py
index 2b73e4e156b70..7c048fc054378 100644
--- a/torch/utils/data/datapipes/iter/fileloader.py
+++ b/torch/utils/data/datapipes/iter/fileloader.py
@@ -10,13 +10,14 @@ class FileLoaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]):
 
     Iterable Datapipe to load file streams from given pathnames,
     yield pathname and file stream in a tuple.
-    args:
+
+    Args:
         datapipe: Iterable datapipe that provides pathnames
         mode: An optional string that specifies the mode in which
             the file is opened by `open()`. It defaults to 'b' which
             means open for reading in binary mode. Another option is
             't' for text mode
-        length: a nominal length of the datapipe
+        length: Nominal length of the datapipe
 
     Note:
         The opened file handles will be closed by Python's GC periodly. Users can choose
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index f47299ce1cf9c..aece256d10650 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -30,26 +30,27 @@ def __iter__(self):
 
 
 @functional_datapipe('batch')
-class BatcherIterDataPipe(IterDataPipe[DataChunk[T_co]]):
+class BatcherIterDataPipe(IterDataPipe[DataChunk]):
     r""" :class:`BatcherIterDataPipe`.
 
     Iterable DataPipe to create mini-batches of data. An outer dimension will be added as
     `batch_size` if `drop_last` is set to `True`, or `length % batch_size` for the
     last batch if `drop_last` is set to `False`.
-    args:
+
+    Args:
         datapipe: Iterable DataPipe being batched
         batch_size: The size of each batch
         drop_last: Option to drop the last batch if it's not full
         unbatch_level: Specifies if it necessary to unbatch source data before
             applying new batching rule
     """
-    datapipe: IterDataPipe[T_co]
+    datapipe: IterDataPipe
     batch_size: int
     drop_last: bool
     length: Optional[int]
 
     def __init__(self,
-                 datapipe: IterDataPipe[T_co],
+                 datapipe: IterDataPipe,
                  batch_size: int,
                  drop_last: bool = False,
                  unbatch_level: int = 0,
@@ -66,8 +67,8 @@ def __init__(self,
         self.length = None
         self.wrapper_class = DataChunk
 
-    def __iter__(self) -> Iterator[DataChunk[T_co]]:
-        batch: List[T_co] = []
+    def __iter__(self) -> Iterator[DataChunk]:
+        batch: List = []
         for x in self.datapipe:
             batch.append(x)
             if len(batch) == self.batch_size:
@@ -96,13 +97,16 @@ class UnBatcherIterDataPipe(IterDataPipe):
 
     Iterable DataPipe to undo batching of data. In other words, it flattens the data up to the specified level
     within a batched DataPipe.
-    args:
+
+    Args:
         datapipe: Iterable DataPipe being un-batched
         unbatch_level: Defaults to `1` (only flattening the top level). If set to `2`, it will flatten the top 2 levels,
-        and `-1` will flatten the entire DataPipe.
+            and `-1` will flatten the entire DataPipe.
     """
 
-    def __init__(self, datapipe, unbatch_level: int = 1):
+    def __init__(self,
+                 datapipe: IterDataPipe,
+                 unbatch_level: int = 1):
         self.datapipe = datapipe
         self.unbatch_level = unbatch_level
 
@@ -143,7 +147,8 @@ class BucketBatcherIterDataPipe(IterDataPipe[DataChunk[T_co]]):
     Iterable DataPipe to create mini-batches of data from sorted bucket. An outer
     dimension will be added as `batch_size` if `drop_last` is set to `True`,
     or `length % batch_size` for the last batch if `drop_last` is set to `False`.
-        args:
+
+    Args:
         datapipe: Iterable DataPipe being batched
         batch_size: The size of each batch
         drop_last: Option to drop the last batch if it's not full
@@ -224,8 +229,21 @@ def __len__(self) -> int:
 
 
 @functional_datapipe('groupby')
-class GrouperIterDataPipe(IterDataPipe):
-    # TODO(VtalyFedyunin): Add inline docs and tests (they are partially available in notebooks)
+class GrouperIterDataPipe(IterDataPipe[DataChunk]):
+    r""":class:`GrouperIterDataPipe`.
+
+    Iterable datapipe to group data from input IterDataPipe by keys which are generated from `group_key_fn`,
+    and yield a DataChunk with size ranging from `guaranteed_group_size` to `group_size`.
+
+    Args:
+        datapipe: Iterable datapipe to be grouped
+        group_key_fn: Function used to generate group key from the data of the source datapipe
+        buffer_size: The size of buffer for ungrouped data
+        group_size: The size of each group
+        unbatch_level: Specifies if it necessary to unbatch source data before grouping
+        guaranteed_group_size: The guaranteed minimum group size
+        drop_remaining: Specifies if the group smaller than `guaranteed_group_size` will be dropped from buffer
+    """
     def __init__(self,
                  datapipe: IterDataPipe[T_co],
                  group_key_fn: Callable,
diff --git a/torch/utils/data/datapipes/iter/httpreader.py b/torch/utils/data/datapipes/iter/httpreader.py
index c663a18cdaab8..747b5d567e4cd 100644
--- a/torch/utils/data/datapipes/iter/httpreader.py
+++ b/torch/utils/data/datapipes/iter/httpreader.py
@@ -10,16 +10,18 @@ class HTTPReaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]):
 
     Iterable DataPipe to load file url(s) (http url(s) pointing to file(s)),
     yield file url and IO stream in a tuple
-    args:
-        timeout : timeout for http request
+
+    Args:
+        datapipe: Iterable DataPipe providing urls
+        timeout: Timeout for http request
     """
 
-    def __init__(self, source_datapipe, timeout=None):
-        self.source_datapipe = source_datapipe
+    def __init__(self, datapipe, timeout=None):
+        self.datapipe = datapipe
         self.timeout = timeout
 
     def __iter__(self):
-        for furl in self.source_datapipe:
+        for furl in self.datapipe:
             try:
                 if self.timeout is None:
                     r = urllib.urlopen(furl)
diff --git a/torch/utils/data/datapipes/iter/linereader.py b/torch/utils/data/datapipes/iter/linereader.py
index 2b15b93c9c60a..04b992d647b77 100644
--- a/torch/utils/data/datapipes/iter/linereader.py
+++ b/torch/utils/data/datapipes/iter/linereader.py
@@ -7,12 +7,15 @@ class LineReaderIterDataPipe(IterDataPipe[Tuple[str, str]]):
 
     Iterable DataPipe to load file name and stream as source IterDataPipe
     and yield filename and line(s).
+
+    Args:
+        datapipe: Iterable DataPipe providing file name and string file stream
     """
 
-    def __init__(self, source_datapipe):
-        self.source_datapipe = source_datapipe
+    def __init__(self, datapipe):
+        self.datapipe = datapipe
 
     def __iter__(self):
-        for file_name, stream in self.source_datapipe:
+        for file_name, stream in self.datapipe:
             for line in stream:
                 yield file_name, line
diff --git a/torch/utils/data/datapipes/iter/routeddecoder.py b/torch/utils/data/datapipes/iter/routeddecoder.py
index f149c074e63fe..ea47742f8e80b 100644
--- a/torch/utils/data/datapipes/iter/routeddecoder.py
+++ b/torch/utils/data/datapipes/iter/routeddecoder.py
@@ -6,7 +6,8 @@
     Decoder,
     basichandlers as decoder_basichandlers,
     imagehandler as decoder_imagehandler,
-    extension_extract_fn)
+    extension_extract_fn
+)
 
 
 @functional_datapipe('decode')
@@ -15,7 +16,8 @@ class RoutedDecoderIterDataPipe(IterDataPipe[Tuple[str, Any]]):
 
     Iterable datapipe to decode binary streams from input DataPipe, yield pathname
     and decoded data in a tuple.
-    args:
+
+    Args:
         datapipe: Iterable datapipe that provides pathname and binary stream in tuples
         handlers: Optional user defined decoder handlers. If None, basic and image decoder
             handlers will be set as default. If multiple handles are provided, the priority
diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py
index 83872cebdb53d..a89bfdfb39e5c 100644
--- a/torch/utils/data/datapipes/iter/selecting.py
+++ b/torch/utils/data/datapipes/iter/selecting.py
@@ -11,15 +11,16 @@ class FilterIterDataPipe(MapperIterDataPipe):
     r""" :class:`FilterIterDataPipe`.
 
     Iterable DataPipe to filter elements from datapipe according to filter_fn.
-    args:
+
+    Args:
         datapipe: Iterable DataPipe being filtered
         filter_fn: Customized function mapping an element to a boolean.
         fn_args: Positional arguments for `filter_fn`
         fn_kwargs: Keyword arguments for `filter_fn`
         drop_empty_batches: By default, drops batch if it is empty after filtering instead of keeping an empty list
         nesting_level: Determines which level the fn gets applied to, by default it applies to the top level (= 0).
-        This also accepts -1 as input to apply filtering to the lowest nesting level. It currently doesn't support
-        argument < -1.
+            This also accepts -1 as input to apply filtering to the lowest nesting level.
+            It currently doesn't support argument < -1.
     """
     drop_empty_batches: bool
 
diff --git a/torch/utils/data/datapipes/iter/streamreader.py b/torch/utils/data/datapipes/iter/streamreader.py
index f74efe746a759..197fb8e2b3005 100644
--- a/torch/utils/data/datapipes/iter/streamreader.py
+++ b/torch/utils/data/datapipes/iter/streamreader.py
@@ -7,16 +7,18 @@ class StreamReaderIterDataPipe(IterDataPipe[Tuple[str, bytes]]):
 
     Iterable DataPipe to load IO stream with label name,
     and to yield bytes with label name in a tuple
-    args:
-        chunk : bytes to read from stream on each iteration.
-                If None, stream reads to the EOF.
+
+    Args:
+        datapipe: Iterable DataPipe provides url and byte stream
+        chunk: Number of bytes to be read from stream per iteration.
+            If None, all bytes will be read util the EOF.
     """
-    def __init__(self, source_datapipe, chunk=None):
-        self.source_datapipe = source_datapipe
+    def __init__(self, datapipe, chunk=None):
+        self.datapipe = datapipe
         self.chunk = chunk
 
     def __iter__(self):
-        for (furl, stream) in self.source_datapipe:
+        for furl, stream in self.datapipe:
             while True:
                 d = stream.read(self.chunk)
                 if not d:
diff --git a/torch/utils/data/datapipes/iter/tararchivereader.py b/torch/utils/data/datapipes/iter/tararchivereader.py
index 9145f5f1dbc11..c34583a4d9420 100644
--- a/torch/utils/data/datapipes/iter/tararchivereader.py
+++ b/torch/utils/data/datapipes/iter/tararchivereader.py
@@ -12,9 +12,11 @@ class TarArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
 
     Iterable datapipe to extract tar binary streams from input iterable which contains tuples of
     pathname and tar binary stream, yields pathname and extracted binary stream in a tuple.
-    args:
+
+    Args:
         datapipe: Iterable datapipe that provides pathname and tar binary stream in tuples
-        mode: File mode used by `tarfile.open` to read file object. Mode has to be a string of the form 'filemode[:compression]'
+        mode: File mode used by `tarfile.open` to read file object.
+            Mode has to be a string of the form 'filemode[:compression]'
         length: a nominal length of the datapipe
 
     Note:
@@ -24,13 +26,13 @@ class TarArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
     """
     def __init__(
         self,
-        datapipe : Iterable[Tuple[str, BufferedIOBase]],
+        datapipe: Iterable[Tuple[str, BufferedIOBase]],
         mode: str = "r:*",
-        length : int = -1
+        length: int = -1
     ):
         super().__init__()
         self.datapipe: Iterable[Tuple[str, BufferedIOBase]] = datapipe
-        self.mode = mode
+        self.mode: str = mode
         self.length: int = length
 
     def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]:
diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py
index ee04abc455fba..9ba80e3576f77 100644
--- a/torch/utils/data/datapipes/iter/utils.py
+++ b/torch/utils/data/datapipes/iter/utils.py
@@ -2,6 +2,13 @@
 
 
 class IterableWrapperIterDataPipe(IterDataPipe):
+    r""":class:`IterableWrapperIterDataPipe`.
+
+    Iterable datapipe that wraps an iterable object.
+
+    Args:
+        iterable: Iterable object to be wrapped into an IterDataPipe
+    """
     def __init__(self, iterable):
         self.iterable = iterable
 
diff --git a/torch/utils/data/datapipes/iter/ziparchivereader.py b/torch/utils/data/datapipes/iter/ziparchivereader.py
index e98bd179760c8..881d00598151a 100644
--- a/torch/utils/data/datapipes/iter/ziparchivereader.py
+++ b/torch/utils/data/datapipes/iter/ziparchivereader.py
@@ -13,9 +13,10 @@ class ZipArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
 
     Iterable data pipe to extract zip binary streams from input iterable which contains tuples of
     pathname and zip binary stream, yields pathname and extracted binary stream in a tuple.
-    args:
+
+    Args:
         datapipe: Iterable datapipe that provides pathname and zip binary stream in tuples
-        length: a nominal length of the datapipe
+        length: Nominal length of the datapipe
 
     Note:
         The opened file handles will be closed automatically if the default DecoderDataPipe
@@ -24,12 +25,11 @@ class ZipArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
     """
     def __init__(
             self,
-            datapipe : Iterable[Tuple[str, BufferedIOBase]],
-            length : int = -1):
+            datapipe: Iterable[Tuple[str, BufferedIOBase]],
+            length: int = -1):
         super().__init__()
-        self.datapipe : Iterable[Tuple[str, BufferedIOBase]] = datapipe
-        self.length : int = length
-
+        self.datapipe: Iterable[Tuple[str, BufferedIOBase]] = datapipe
+        self.length: int = length
 
     def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]:
         if not isinstance(self.datapipe, Iterable):
@@ -60,7 +60,6 @@ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]:
                     "Unable to extract files from corrupted zipfile stream {} due to: {}, abort!".format(pathname, e))
                 raise e
 
-
     def __len__(self):
         if self.length == -1:
             raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))

From ffc2612087be1ab469e5a2cd5a1106bf8ec9e753 Mon Sep 17 00:00:00 2001
From: Edward Yang <ezyang@fb.com>
Date: Mon, 30 Aug 2021 19:08:45 -0700
Subject: [PATCH 373/530] Add acc_gpu_kernel_with_scalars and port add to use
 it (#63884)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63884

See https://dev-discuss.pytorch.org/t/cuda-loops-case-study-code-generation-vs-templates/302
for explanation of what's going on here.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D30545296

Pulled By: ezyang

fbshipit-source-id: f0da52153ae63599fe1d57e90e73f50ca2116939
---
 .../ATen/native/cuda/BinaryAddSubKernel.cu    | 51 +++-----------
 aten/src/ATen/native/cuda/Loops.cuh           | 68 +++++++++++++------
 2 files changed, 58 insertions(+), 61 deletions(-)

diff --git a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu
index a07fd663581fe..b1c76e119a78a 100644
--- a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu
+++ b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu
@@ -10,53 +10,20 @@
 
 namespace at { namespace native {
 
-template<typename scalar_t, typename accscalar_t>
+template <typename T>
 struct AddFunctor {
-  AddFunctor(accscalar_t a): alpha(a) {}
-  __device__ __forceinline__ scalar_t operator() (const scalar_t a, const scalar_t b) const {
-    return a + alpha * b;
+  AddFunctor(T alpha) : alpha_(alpha) {}
+  T alpha_;
+  __device__ __forceinline__ T operator()(T a, T b) const __ubsan_ignore_undefined__ {
+    return a + b * alpha_;
   }
-  private:
-    accscalar_t alpha;
-};
-
-template<typename scalar_t, typename accscalar_t, int SCALAR_ARG>
-struct AddScalarFunctor {
-  static_assert(SCALAR_ARG == 1 || SCALAR_ARG == 2, "SCALAR_ARG must be either 1 or 2");
-  AddScalarFunctor(accscalar_t alpha, accscalar_t b): alpha(alpha), b(b) {}
-  __device__ __forceinline__ scalar_t operator() (const scalar_t a) const {
-    return static_cast<scalar_t>(SCALAR_ARG == 1 ? b + alpha * a : a + alpha * b);
-  }
-  private:
-    accscalar_t alpha;
-    accscalar_t b;
 };
 
 void add_kernel_cuda(TensorIteratorBase& iter, const Scalar& alpha_scalar) {
-  if (!isIntegralType(iter.common_dtype(), /* includeBool */ true) && (iter.is_cpu_scalar(1) || iter.is_cpu_scalar(2))) {
-    // if common dtype is half the scalar constant can overflow in half precision, and yet the result can
-    // still be representable in the half dtype. Cast scalar to acc_type to have better accuracy.
-    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() {
-      using accscalar_t = at::acc_type<scalar_t, true>;
-      int scalar_arg = iter.is_cpu_scalar(1) ? 1 : 2;
-      auto b = iter.scalar_value<accscalar_t>(scalar_arg);
-      iter.remove_operand(scalar_arg);
-      const cuda::OptionalCUDAGuard device_guard(device_of(iter.tensor(1)));
-      if (scalar_arg == 1) {
-        AddScalarFunctor<scalar_t, decltype(b), 1> f(alpha_scalar.to<accscalar_t>(), b);
-        gpu_kernel(iter, f);
-      } else {
-        AddScalarFunctor<scalar_t, decltype(b), 2> f(alpha_scalar.to<accscalar_t>(), b);
-        gpu_kernel(iter, f);
-      }
-    });
-  } else {
-    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() {
-      using accscalar_t = at::acc_type<scalar_t, true>;
-      AddFunctor<scalar_t, accscalar_t> f(alpha_scalar.to<accscalar_t>());
-      gpu_kernel_with_scalars(iter, f);
-    });
-  }
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() {
+    using opmath_t = at::opmath_type<scalar_t>;
+    opmath_gpu_kernel_with_scalars<scalar_t>(iter, AddFunctor<opmath_t>(alpha_scalar.to<opmath_t>()));
+  });
 }
 
 static void sub_kernel_cuda(TensorIteratorBase& iter, const Scalar& alpha_scalar) {
diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh
index fde8e86409db7..8849293e20210 100644
--- a/aten/src/ATen/native/cuda/Loops.cuh
+++ b/aten/src/ATen/native/cuda/Loops.cuh
@@ -5,6 +5,7 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/TensorIteratorDynamicCasting.h>
 #include <ATen/cuda/detail/OffsetCalculator.cuh>
+#include <ATen/OpMathType.h>
 
 #include <thrust/tuple.h>
 
@@ -111,49 +112,64 @@ void gpu_kernel(TensorIteratorBase& iter, const func_t& f) {
   gpu_kernel_impl(iter, f);
 }
 
-template<typename func_t>
+template<typename arg1_t, typename arg2_t, typename return_t, typename func_t>
 struct AUnaryFunctor {
   using traits = function_traits<func_t>;
-  using arg1_t = typename traits::template arg<0>::type;
-  using arg2_t = typename traits::template arg<1>::type;
-  using return_t = typename traits::result_type;
+  using opmath_arg1_t = typename traits::template arg<0>::type;
   __device__ return_t operator()(arg2_t b) const {
     return f(a, b);
   }
-  AUnaryFunctor(func_t f_, arg1_t a_): f(f_), a(a_) {}
+  // NB: scalar is stored in higher precision!
+  AUnaryFunctor(func_t f_, opmath_arg1_t a_): f(f_), a(a_) {}
   private:
     func_t f;
-    arg1_t a;
+    opmath_arg1_t a;
 };
 
-template<typename func_t>
+template<typename arg1_t, typename arg2_t, typename return_t, typename func_t>
 struct BUnaryFunctor {
   using traits = function_traits<func_t>;
-  using arg1_t = typename traits::template arg<0>::type;
-  using arg2_t = typename traits::template arg<1>::type;
-  using return_t = typename traits::result_type;
+  using opmath_arg2_t = typename traits::template arg<1>::type;
   __device__ return_t operator()(arg1_t a) const {
     return f(a, b);
   }
-  BUnaryFunctor(func_t f_, arg2_t b_): f(f_), b(b_) {}
+  // NB: scalar is stored in higher precision!
+  BUnaryFunctor(func_t f_, opmath_arg2_t b_): f(f_), b(b_) {}
   private:
     func_t f;
-    arg2_t b;
+    opmath_arg2_t b;
 };
 
-template <typename func_t>
-void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {
+// Though seemingly noop, this inserts casts from arg1_t to func_t's type
+// (which may be higher precision), as well as casts to return_t
+template <typename arg1_t, typename arg2_t, typename return_t, typename func_t>
+struct BinaryFunctor {
+  __device__ return_t operator()(arg1_t a, arg2_t b) const {
+    return f(a, b);
+  }
+  BinaryFunctor(func_t f_): f(f_) {}
+  private:
+    func_t f;
+};
+
+// Unlike gpu_kernel_with_scalars, this allows you to pass a func_t which
+// accepts inputs at higher precision (typically opmath_t), but then
+// ensure that we load from memory at the correct precision (scalar_t)
+// to avoid expensive loads.  For the whole sordid story see
+// https://dev-discuss.pytorch.org/t/cuda-loops-case-study-code-generation-vs-templates/302
+template <typename arg1_t, typename arg2_t = arg1_t, typename return_t = arg1_t, typename func_t>
+void opmath_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {
   TORCH_INTERNAL_ASSERT(iter.ntensors() == 3);
 
   using traits = function_traits<func_t>;
+  using opmath_arg1_t = typename traits::template arg<0>::type;
+  using opmath_arg2_t = typename traits::template arg<1>::type;
   static_assert(
       traits::arity == 2,
       "gpu_kernel_with_scalars only supports two input arguments");
 
-  using arg1_t = typename traits::template arg<0>::type;
-  using arg2_t = typename traits::template arg<1>::type;
   if (iter.is_cpu_scalar(1)) {
-    AUnaryFunctor<func_t> af(f, iter.scalar_value<arg1_t>(1));
+    AUnaryFunctor<arg1_t, arg2_t, return_t, func_t> af(f, iter.scalar_value<opmath_arg1_t>(1));
     iter.remove_operand(1);
     // TODO: When all kernels that use gpu_kernel_with_scalars are
     // ported to structured, this device guard can be deleted.  This
@@ -163,14 +179,28 @@ void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {
     const OptionalDeviceGuard device_guard(device_of(iter.tensor(1)));
     gpu_kernel(iter, af);
   } else if (iter.is_cpu_scalar(2)) {
-    BUnaryFunctor<func_t> bf(f, iter.scalar_value<arg2_t>(2));
+    BUnaryFunctor<arg1_t, arg2_t, return_t, func_t> bf(f, iter.scalar_value<opmath_arg2_t>(2));
     iter.remove_operand(2);
     gpu_kernel(iter, bf);
   } else {
-    gpu_kernel(iter, f);
+    gpu_kernel(iter, BinaryFunctor<arg1_t, arg2_t, return_t, func_t>(f));
   }
 }
 
+// Legacy variant that assumes that func_t has the correct types
+// that we expect to load from memory
+template <typename func_t>
+void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {
+  using traits = function_traits<func_t>;
+  static_assert(
+      traits::arity == 2,
+      "gpu_kernel_with_scalars only supports two input arguments");
+  using arg1_t = typename traits::template arg<0>::type;
+  using arg2_t = typename traits::template arg<1>::type;
+  using return_t = typename traits::result_type;
+  opmath_gpu_kernel_with_scalars<arg1_t, arg2_t, return_t, func_t>(iter, f);
+}
+
 namespace { // functions for `gpu_kernel_multiple_outputs`.
 
 // check the return type is `thrust::tuple`, not `std::tuple`.

From cb7cf823b30dcf623d1bceb76c6e16a899f5dc46 Mon Sep 17 00:00:00 2001
From: CaoE <e.cao@intel.com>
Date: Mon, 30 Aug 2021 19:12:23 -0700
Subject: [PATCH 374/530] add BFloat16 support for fold and unfold on CPU
 (#62880)

Summary:
Add BFloat16 support for fold and unfold operators on CPU.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62880

Reviewed By: iramazanli

Differential Revision: D30576387

Pulled By: zou3519

fbshipit-source-id: c48f6e56702bfea34448db1b3a1634c49c5d8ec8
---
 aten/src/ATen/native/Col2Im.cpp                  |  2 +-
 aten/src/ATen/native/Im2Col.cpp                  |  2 +-
 test/test_nn.py                                  | 16 ++++++++++++++++
 .../_internal/common_methods_invocations.py      |  1 +
 4 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/Col2Im.cpp b/aten/src/ATen/native/Col2Im.cpp
index e1cc31df60f54..7e11b1bdd5b6f 100644
--- a/aten/src/ATen/native/Col2Im.cpp
+++ b/aten/src/ATen/native/Col2Im.cpp
@@ -136,7 +136,7 @@ static void col2im_out_cpu_template(
   output.resize_({batch_size, n_output_plane, output_height, output_width});
   output.zero_();
 
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf,
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
       input.scalar_type(), "col2im_out_cpu", [&] {
         Tensor input_n = Tensor();
         Tensor output_n = Tensor();
diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp
index 0970095a68fa9..586b9612f80f4 100644
--- a/aten/src/ATen/native/Im2Col.cpp
+++ b/aten/src/ATen/native/Im2Col.cpp
@@ -86,7 +86,7 @@ static void im2col_out_cpu_template(
   output.resize_({batch_size, n_output_plane, output_length});
   output.zero_();
 
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf,
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
       input.scalar_type(), "im2col_out_cpu", [&] {
         Tensor input_n;
         Tensor output_n;
diff --git a/test/test_nn.py b/test/test_nn.py
index c6d0e78044126..96321ba183be0 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -17438,14 +17438,30 @@ def test_softshrink_negative(self, device):
             m(input)
 
     def test_fold(self, device):
+        def test_dtype(fn, input, dtype):
+            input = input.detach().clone().to(dtype=dtype).requires_grad_(True)
+            input2 = input.detach().clone().float().requires_grad_(True)
+            out = fn(input)
+            out.sum().backward()
+            out2 = fn(input2)
+            out2.sum().backward()
+            self.assertEqual(out.dtype, dtype)
+            self.assertEqual(input.grad.dtype, dtype)
+            self.assertEqual(out, out2.to(dtype=dtype), atol=0.05, rtol=0)
+            self.assertEqual(input.grad, input2.grad.to(dtype=dtype))
+
         def func(x):
             return F.fold(x, output_size=(4, 5), kernel_size=(2, 2))
+
         seeds = (44, 83, 71, 25, 999)
         for sd in seeds:
             torch.manual_seed(sd)
             x = torch.randn(1, 12, 12, device=device, requires_grad=True)
             gradcheck(func, [x])
             gradgradcheck(func, [x])
+            if device == 'cpu':
+                test_dtype(func, x, torch.bfloat16)
+
 
     def test_logsigmoid_out(self, device):
         # this isn't actually documented, but was broken previously:
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 04db52b2e607b..e7d93807511a5 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -7267,6 +7267,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
     OpInfo('nn.functional.unfold',
            aten_name='im2col',
            dtypes=floating_types_and(torch.half),
+           dtypesIfCPU=floating_types_and(torch.half, torch.bfloat16),
            sample_inputs_func=sample_inputs_nn_unfold,
            skips=(
                # JIT alias info internal asserts here

From c7c711bfb88fcb0ef573125a5a8655c49156055b Mon Sep 17 00:00:00 2001
From: Samantha Andow <samdow@fb.com>
Date: Mon, 30 Aug 2021 19:15:16 -0700
Subject: [PATCH 375/530] Add optional tensor arguments to (#63967)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63435

Adds optional tensor arguments to check handling torch function checks. The only one I didn't do this for in the functional file was `multi_head_attention_forward` since that already took care of some optional tensor arguments but not others so it seemed like arguments were specifically chosen

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63967

Reviewed By: albanD

Differential Revision: D30640441

Pulled By: ezyang

fbshipit-source-id: 5ef9554d2fb6c14779f8f45542ab435fb49e5d0f
---
 torch/nn/functional.py | 72 +++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index c11e261d9b85f..4b0449c8f5672 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -442,10 +442,10 @@ def fractional_max_pool2d_with_indices(
     .. _Fractional MaxPooling:
         http://arxiv.org/abs/1412.6071
     """
-    if has_torch_function_unary(input):
+    if has_torch_function_variadic(input, _random_samples):
         return handle_torch_function(
             fractional_max_pool2d_with_indices,
-            (input,),
+            (input, _random_samples),
             input,
             kernel_size,
             output_size=output_size,
@@ -473,10 +473,10 @@ def _fractional_max_pool2d(
     return_indices: bool = False,
     _random_samples: Optional[Tensor] = None
 ) -> Tensor:
-    if has_torch_function_unary(input):
+    if has_torch_function_variadic(input, _random_samples):
         return handle_torch_function(
             fractional_max_pool2d,
-            (input,),
+            (input, _random_samples),
             input,
             kernel_size,
             output_size=output_size,
@@ -537,10 +537,10 @@ def fractional_max_pool3d_with_indices(
     .. _Fractional MaxPooling:
         http://arxiv.org/abs/1412.6071
     """
-    if has_torch_function_unary(input):
+    if has_torch_function_variadic(input, _random_samples):
         return handle_torch_function(
             fractional_max_pool3d_with_indices,
-            (input,),
+            (input, _random_samples),
             input,
             kernel_size,
             output_size=output_size,
@@ -571,10 +571,10 @@ def _fractional_max_pool3d(
     return_indices: bool = False,
     _random_samples: Optional[Tensor] = None
 ) -> Tensor:
-    if has_torch_function_unary(input):
+    if has_torch_function_variadic(input, _random_samples):
         return handle_torch_function(
             fractional_max_pool3d,
-            (input,),
+            (input, _random_samples),
             input,
             kernel_size,
             output_size=output_size,
@@ -1843,8 +1843,8 @@ def linear(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tens
         - Bias: :math:`(out\_features)`
         - Output: :math:`(N, *, out\_features)`
     """
-    if has_torch_function_variadic(input, weight):
-        return handle_torch_function(linear, (input, weight), input, weight, bias=bias)
+    if has_torch_function_variadic(input, weight, bias):
+        return handle_torch_function(linear, (input, weight, bias), input, weight, bias=bias)
     return torch._C._nn.linear(input, weight, bias)
 
 
@@ -1865,10 +1865,10 @@ def bilinear(input1: Tensor, input2: Tensor, weight: Tensor, bias: Optional[Tens
         - output: :math:`(N, *, H_{out})` where :math:`H_{out}=\text{out\_features}`
           and all but the last dimension are the same shape as the input.
     """
-    if has_torch_function_variadic(input1, input2, weight):
+    if has_torch_function_variadic(input1, input2, weight, bias):
         return handle_torch_function(
             bilinear,
-            (input1, input2, weight),
+            (input1, input2, weight, bias),
             input1, input2, weight,
             bias=bias
         )
@@ -2135,10 +2135,10 @@ def embedding_bag(
         tensor([[ 0.0000,  0.0000,  0.0000],
                 [-0.7082,  3.2145, -2.6251]])
     """
-    if has_torch_function_variadic(input, weight):
+    if has_torch_function_variadic(input, weight, offsets, per_sample_weights):
         return handle_torch_function(
             embedding_bag,
-            (input, weight),
+            (input, weight, offsets, per_sample_weights),
             input,
             weight,
             offsets=offsets,
@@ -2263,10 +2263,10 @@ def batch_norm(
     See :class:`~torch.nn.BatchNorm1d`, :class:`~torch.nn.BatchNorm2d`,
     :class:`~torch.nn.BatchNorm3d` for details.
     """
-    if has_torch_function_unary(input):
+    if has_torch_function_variadic(input, running_mean, running_var, weight, bias):
         return handle_torch_function(
             batch_norm,
-            (input,),
+            (input, running_mean, running_var, weight, bias),
             input,
             running_mean,
             running_var,
@@ -2309,10 +2309,10 @@ def instance_norm(
     See :class:`~torch.nn.InstanceNorm1d`, :class:`~torch.nn.InstanceNorm2d`,
     :class:`~torch.nn.InstanceNorm3d` for details.
     """
-    if has_torch_function_unary(input):
+    if has_torch_function_variadic(input, running_mean, running_var, weight, bias):
         return handle_torch_function(
             instance_norm,
-            (input,),
+            (input, running_mean, running_var, weight, bias),
             input,
             running_mean=running_mean,
             running_var=running_var,
@@ -2340,9 +2340,9 @@ def layer_norm(
 
     See :class:`~torch.nn.LayerNorm` for details.
     """
-    if has_torch_function_unary(input):
+    if has_torch_function_variadic(input, weight, bias):
         return handle_torch_function(
-            layer_norm, (input,), input, normalized_shape, weight=weight, bias=bias, eps=eps
+            layer_norm, (input, weight, bias), input, normalized_shape, weight=weight, bias=bias, eps=eps
         )
     return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
 
@@ -2354,8 +2354,8 @@ def group_norm(
 
     See :class:`~torch.nn.GroupNorm` for details.
     """
-    if has_torch_function_unary(input):
-        return handle_torch_function(group_norm, (input,), input, num_groups, weight=weight, bias=bias, eps=eps)
+    if has_torch_function_variadic(input, weight, bias):
+        return handle_torch_function(group_norm, (input, weight, bias,), input, num_groups, weight=weight, bias=bias, eps=eps)
     _verify_batch_size([input.size(0) * input.size(1) // num_groups, num_groups] + list(input.size()[2:]))
     return torch.group_norm(input, num_groups, weight, bias, eps, torch.backends.cudnn.enabled)
 
@@ -2515,10 +2515,10 @@ def nll_loss(
         >>> output = F.nll_loss(F.log_softmax(input), target)
         >>> output.backward()
     """
-    if has_torch_function_variadic(input, target):
+    if has_torch_function_variadic(input, target, weight):
         return handle_torch_function(
             nll_loss,
-            (input, target),
+            (input, target, weight),
             input,
             target,
             weight=weight,
@@ -2828,10 +2828,10 @@ def cross_entropy(
         >>> loss = F.cross_entropy(input, target)
         >>> loss.backward()
     """
-    if has_torch_function_variadic(input, target):
+    if has_torch_function_variadic(input, target, weight):
         return handle_torch_function(
             cross_entropy,
-            (input, target),
+            (input, target, weight),
             input,
             target,
             weight=weight,
@@ -2887,10 +2887,10 @@ def binary_cross_entropy(
         >>> loss = F.binary_cross_entropy(F.sigmoid(input), target)
         >>> loss.backward()
     """
-    if has_torch_function_variadic(input, target):
+    if has_torch_function_variadic(input, target, weight):
         return handle_torch_function(
             binary_cross_entropy,
-            (input, target),
+            (input, target, weight),
             input,
             target,
             weight=weight,
@@ -2959,10 +2959,10 @@ def binary_cross_entropy_with_logits(
          >>> loss = F.binary_cross_entropy_with_logits(input, target)
          >>> loss.backward()
     """
-    if has_torch_function_variadic(input, target):
+    if has_torch_function_variadic(input, target, weight, pos_weight):
         return handle_torch_function(
             binary_cross_entropy_with_logits,
-            (input, target),
+            (input, target, weight, pos_weight),
             input,
             target,
             weight=weight,
@@ -3243,10 +3243,10 @@ def multilabel_soft_margin_loss(
 
     See :class:`~torch.nn.MultiLabelSoftMarginLoss` for details.
     """
-    if has_torch_function_variadic(input, target):
+    if has_torch_function_variadic(input, target, weight):
         return handle_torch_function(
             multilabel_soft_margin_loss,
-            (input, target),
+            (input, target, weight),
             input,
             target,
             weight=weight,
@@ -3323,10 +3323,10 @@ def multi_margin_loss(
 
     See :class:`~torch.nn.MultiMarginLoss` for details.
     """
-    if has_torch_function_variadic(input, target):
+    if has_torch_function_variadic(input, target, weight):
         return handle_torch_function(
             multi_margin_loss,
-            (input, target),
+            (input, target, weight),
             input,
             target,
             p=p,
@@ -4443,8 +4443,8 @@ def normalize(input: Tensor, p: float = 2.0, dim: int = 1, eps: float = 1e-12, o
         out (Tensor, optional): the output tensor. If :attr:`out` is used, this
                                 operation won't be differentiable.
     """
-    if has_torch_function_unary(input):
-        return handle_torch_function(normalize, (input,), input, p=p, dim=dim, eps=eps, out=out)
+    if has_torch_function_variadic(input, out):
+        return handle_torch_function(normalize, (input, out), input, p=p, dim=dim, eps=eps, out=out)
     if out is None:
         denom = input.norm(p, dim, keepdim=True).clamp_min(eps).expand_as(input)
         return input / denom

From 6b85c99ce562cf81749e5efc49bd835041e43f92 Mon Sep 17 00:00:00 2001
From: Santiago Castro <sacastro@umich.edu>
Date: Mon, 30 Aug 2021 19:17:21 -0700
Subject: [PATCH 376/530] Avoid an unnecessary list creation in `DataChunk`
 (#64111)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64111

Reviewed By: mruberry

Differential Revision: D30639383

Pulled By: ezyang

fbshipit-source-id: 96b243307413c99a67d55d862a71937e1ef210f4
---
 torch/utils/data/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py
index 7a069d61de6cc..609e1a1eb6e2d 100644
--- a/torch/utils/data/dataset.py
+++ b/torch/utils/data/dataset.py
@@ -31,7 +31,7 @@ def __init__(self, items):
         self.items = items
 
     def as_str(self, indent=''):
-        res = indent + "[" + ", ".join([str(i) for i in iter(self)]) + "]"
+        res = indent + "[" + ", ".join(str(i) for i in iter(self)) + "]"
         return res
 
     def __iter__(self) -> Iterator[T]:

From 93f1090267df7e3023017b83f885edbc59e48913 Mon Sep 17 00:00:00 2001
From: oleshp <31859680+oleshp@users.noreply.github.com>
Date: Mon, 30 Aug 2021 19:22:05 -0700
Subject: [PATCH 377/530] Update contribution_guide.rst (#64142)

Summary:
Grammatical update.

Fixes #{issue number}

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64142

Reviewed By: mruberry

Differential Revision: D30639394

Pulled By: ezyang

fbshipit-source-id: cf1a4dfbd8e34b0772f1b09f5d820278e8ef8574
---
 docs/source/community/contribution_guide.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/community/contribution_guide.rst b/docs/source/community/contribution_guide.rst
index 166aa7526e731..7cba558dbdb54 100644
--- a/docs/source/community/contribution_guide.rst
+++ b/docs/source/community/contribution_guide.rst
@@ -200,8 +200,8 @@ Triaging issues
 ~~~~~~~~~~~~~~~
 
 If you feel that an issue could benefit from a particular tag or level
-of complexity comment on the issue and share your opinion. If you
-feel an issue isn't categorized properly comment and let the team know.
+of complexity, comment on the issue and share your opinion. If you
+feel an issue isn't categorized properly, comment and let the team know.
 
 About open source development
 -----------------------------

From 09dfaa0339c944d1c4b64193e0962972affbe0c7 Mon Sep 17 00:00:00 2001
From: leslie-fang-intel <leslie.fang@intel.com>
Date: Mon, 30 Aug 2021 19:28:59 -0700
Subject: [PATCH 378/530] add operation list for AutocastCPU (#63534)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63534

In this PR:
* We have changed the default dtype of `AutocastCPU` from `float16` to `bfloat16` as discussed here `https://github.com/pytorch/pytorch/pull/61002`
* We also update the operation list which needs casting to `lower_precision_fp` or `float32`.

Test Plan: Imported from OSS

Reviewed By: zou3519

Differential Revision: D30644914

Pulled By: ezyang

fbshipit-source-id: 8b93485ba452b3759611e3f0ac88e920fe495ac1
---
 aten/src/ATen/autocast_mode.cpp               | 300 ++++++++++++++++--
 test/run_test.py                              |   1 +
 torch/cpu/amp/autocast_mode.py                |   2 +-
 .../testing/_internal/autocast_test_lists.py  |  14 +-
 4 files changed, 277 insertions(+), 40 deletions(-)

diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp
index 1ac5ad1c88ba6..9f5f486bb7581 100644
--- a/aten/src/ATen/autocast_mode.cpp
+++ b/aten/src/ATen/autocast_mode.cpp
@@ -461,22 +461,22 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
   KERNEL_CPU(ADD_NS(conv1d), "conv1d", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp)
   KERNEL_CPU(ADD_NS(conv2d), "conv2d", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp)
   KERNEL_CPU(ADD_NS(conv3d), "conv3d", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp)
-  KERNEL_CPU(ADD_NS(_log_softmax), "_log_softmax", Tensor (const Tensor &, int64_t, bool), lower_precision_fp)
   KERNEL_CPU(ADD_NS(bmm), "bmm", Tensor (const Tensor &, const Tensor &), lower_precision_fp)
   KERNEL_CPU(ADD_NS(mm), "mm", Tensor (const Tensor &, const Tensor &), lower_precision_fp)
   KERNEL_CPU(ADD_NS(baddbmm), "baddbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
   KERNEL_CPU(ADD_NS(addmm), "addmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
   KERNEL_CPU(ADD_NS(addbmm), "addbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
   KERNEL_CPU(ADD_NS(linear), "linear", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor> &), lower_precision_fp)
+  KERNEL_CPU(ADD_NS(_convolution), "_convolution.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool), lower_precision_fp)
 
   // fp32 cast policy
+  KERNEL_CPU(ADD_NS(conv_transpose1d), "conv_transpose1d", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor> &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(conv_transpose2d), "conv_transpose2d.input", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor> &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32)
   KERNEL_CPU(ADD_NS(conv_transpose3d), "conv_transpose3d.input", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor> &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32)
   KERNEL_CPU(ADD_NS(batch_norm), "batch_norm", Tensor (const Tensor &, const c10::optional<Tensor> &, const c10::optional<Tensor> &, const c10::optional<Tensor> &, const c10::optional<Tensor> &, bool, double, double, bool), fp32)
-  KERNEL_CPU(ADD_NS(max_pool2d), "max_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, bool), fp32)
-  KERNEL_CPU(ADD_NS(adaptive_avg_pool2d), "adaptive_avg_pool2d", Tensor (const Tensor &, IntArrayRef), fp32)
 
-  KERNEL_CPU(ADD_NS(convolution), "convolution", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t), fp32)
   KERNEL_CPU(ADD_NS(dropout), "dropout", Tensor (const Tensor &, double, bool), fp32)
+  KERNEL_CPU(ADD_NS(avg_pool1d), "avg_pool1d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool), fp32)
   KERNEL_CPU(ADD_NS(avg_pool2d), "avg_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional<int64_t>), fp32)
   KERNEL_CPU(ADD_NS(avg_pool3d), "avg_pool3d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional<int64_t>), fp32)
   KERNEL_CPU(ADD_NS(gelu), "gelu", Tensor (const Tensor &), fp32)
@@ -492,45 +492,285 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
   KERNEL_CPU(ADD_NS(upsample_bilinear2d), "upsample_bilinear2d.vec", Tensor (const Tensor &, c10::optional<IntArrayRef>, bool, c10::optional<ArrayRef<double>>), fp32)
   KERNEL_CPU(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional<double>, c10::optional<double>, c10::optional<double>), fp32)
   KERNEL_CPU(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d.vec", Tensor (const Tensor &, c10::optional<IntArrayRef>, bool, c10::optional<ArrayRef<double>>), fp32)
+
   KERNEL_CPU(ADD_NS(binary_cross_entropy), "binary_cross_entropy", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, int64_t), fp32)
   KERNEL_CPU(ADD_NS(binary_cross_entropy_with_logits), "binary_cross_entropy_with_logits", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, const c10::optional<Tensor>&, int64_t), fp32)
-  KERNEL_CPU(ADD_NS(pow), "pow.Tensor_Scalar", Tensor (const Tensor &, const Scalar &), fp32)
-  KERNEL_CPU(ADD_NS(pow), "pow.Tensor_Tensor", Tensor (const Tensor &, const Tensor &), fp32)
-  KERNEL_CPU(ADD_NS(pow), "pow.Scalar", Tensor (const Scalar&, const Tensor &), fp32)
-  KERNEL_CPU(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32)
-  KERNEL_CPU(ADD_NS(reflection_pad1d), "reflection_pad1d", Tensor (const Tensor &, IntArrayRef), fp32)
-  KERNEL_CPU(ADD_NS(std), "std", Tensor (const Tensor &, bool), fp32)
-  KERNEL_CPU(ADD_NS(std), "std.dim", Tensor (const Tensor &, IntArrayRef, bool, bool), fp32)
   KERNEL_CPU(ADD_NS(instance_norm), "instance_norm", Tensor (const Tensor &, const c10::optional<Tensor>&, const c10::optional<Tensor>&, const c10::optional<Tensor>&, const c10::optional<Tensor>&, bool, double, double, bool), fp32)
+  KERNEL_CPU(ADD_NS(grid_sampler), "grid_sampler", Tensor(const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32)
+  KERNEL_CPU(ADD_NS(polar), "polar", Tensor(const Tensor &, const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(multinomial), "multinomial", Tensor(const Tensor &, int64_t, bool, c10::optional<at::Generator>), fp32)
+  KERNEL_CPU(ADD_NS(poisson), "poisson", Tensor(const Tensor &, c10::optional<at::Generator>), fp32)
+  KERNEL_CPU(ADD_NS(fmod), "fmod.Tensor", Tensor(const Tensor &, const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(fmod), "fmod.Scalar", Tensor(const Tensor &, const Scalar &), fp32)
+  KERNEL_CPU(ADD_NS(prod), "prod", Tensor(const Tensor &, c10::optional<at::ScalarType>), fp32)
+  KERNEL_CPU(ADD_NS(prod), "prod.dim_int", Tensor(const Tensor &, int64_t, bool, c10::optional<at::ScalarType>), fp32)
+  KERNEL_CPU(ADD_NS(prod), "prod.dim_Dimname", Tensor(const Tensor &, at::Dimname, bool, c10::optional<at::ScalarType>), fp32)
+  KERNEL_CPU(ADD_NS(quantile), "quantile", Tensor(const Tensor &, const Tensor &, c10::optional<int64_t>, bool), fp32)
+  KERNEL_CPU(ADD_NS(quantile), "quantile.scalar", Tensor(const Tensor &, double, c10::optional<int64_t>, bool), fp32)
+  KERNEL_CPU(ADD_NS(quantile), "quantile.new", Tensor(const Tensor &, const Tensor &, c10::optional<int64_t>, bool, c10::string_view), fp32)
+  KERNEL_CPU(ADD_NS(quantile), "quantile.new_scalar", Tensor(const Tensor &, double, c10::optional<int64_t>, bool, c10::string_view), fp32)
+  KERNEL_CPU(ADD_NS(nanquantile), "nanquantile", Tensor(const Tensor &, const Tensor &, c10::optional<int64_t>, bool), fp32)
+  KERNEL_CPU(ADD_NS(nanquantile), "nanquantile.scalar", Tensor(const Tensor &, double, c10::optional<int64_t>, bool), fp32)
+  KERNEL_CPU(ADD_NS(nanquantile), "nanquantile.new", Tensor(const Tensor &, const Tensor &, c10::optional<int64_t>, bool, c10::string_view), fp32)
+  KERNEL_CPU(ADD_NS(nanquantile), "nanquantile.new_scalar", Tensor(const Tensor &, double, c10::optional<int64_t>, bool, c10::string_view), fp32)
+  KERNEL_CPU(ADD_NS(stft), "stft", Tensor(const Tensor &, int64_t, c10::optional<int64_t>, c10::optional<int64_t>, const c10::optional<Tensor> &, bool, c10::optional<bool>, c10::optional<bool>), fp32)
+  KERNEL_CPU(ADD_NS(cdist), "cdist", Tensor(const Tensor &, const Tensor &, double, c10::optional<int64_t>), fp32)
+  KERNEL_CPU(ADD_NS(cross), "cross", Tensor(const Tensor &, const Tensor &, c10::optional<int64_t>), fp32)
+  KERNEL_CPU(ADD_NS(cumprod), "cumprod", Tensor(const Tensor &, int64_t, c10::optional<at::ScalarType>), fp32)
+  KERNEL_CPU(ADD_NS(cumprod), "cumprod.dimname", Tensor(const Tensor &, at::Dimname, c10::optional<at::ScalarType>), fp32)
+  KERNEL_CPU(ADD_NS(cumsum), "cumsum", Tensor(const Tensor &, int64_t, c10::optional<at::ScalarType>), fp32)
+  KERNEL_CPU(ADD_NS(cumsum), "cumsum.dimname", Tensor(const Tensor &, at::Dimname, c10::optional<at::ScalarType>), fp32)
+  KERNEL_CPU(ADD_NS(diag), "diag", Tensor(const Tensor &, int64_t), fp32)
+  KERNEL_CPU(ADD_NS(diagflat), "diagflat", Tensor(const Tensor &, int64_t), fp32)
+  KERNEL_CPU(ADD_NS(histc), "histc", Tensor(const Tensor &, int64_t, const at::Scalar &, const at::Scalar &), fp32)
+  KERNEL_CPU(ADD_NS(logcumsumexp), "logcumsumexp", Tensor(const Tensor &, int64_t), fp32)
+  KERNEL_CPU(ADD_NS(searchsorted), "searchsorted.Tensor", Tensor(const Tensor &, const Tensor &, bool, bool), fp32)
+  KERNEL_CPU(ADD_NS(searchsorted), "searchsorted.Scalar", Tensor(const Tensor &, const at::Scalar &, bool, bool), fp32)
+  KERNEL_CPU(ADD_NS(trace), "trace", Tensor(const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(tril), "tril", Tensor(const Tensor &, int64_t), fp32)
+  KERNEL_CPU(ADD_NS(triu), "triu", Tensor(const Tensor &, int64_t), fp32)
+  KERNEL_CPU(ADD_NS(vander), "vander", Tensor(const Tensor &, c10::optional<int64_t>, bool), fp32)
+  KERNEL_CPU(ADD_NS(view_as_complex), "view_as_complex", Tensor(const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(cholesky), "cholesky", Tensor(const Tensor &, bool), fp32)
+  KERNEL_CPU(ADD_NS(cholesky_inverse), "cholesky_inverse", Tensor(const Tensor &, bool), fp32)
+  KERNEL_CPU(ADD_NS(cholesky_solve), "cholesky_solve", Tensor(const Tensor &, const Tensor &, bool), fp32)
+  KERNEL_CPU(ADD_NS(dot), "dot", Tensor(const Tensor &, const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(inverse), "inverse", Tensor(const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(lu_solve), "lu_solve", Tensor(const Tensor &, const Tensor &, const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(matrix_rank), "matrix_rank", Tensor(const Tensor &, bool), fp32)
+  KERNEL_CPU(ADD_NS(orgqr), "orgqr", Tensor(const Tensor &, const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(ormqr), "ormqr", Tensor(const Tensor &, const Tensor &, const Tensor &, bool, bool), fp32)
+  KERNEL_CPU(ADD_NS(pinverse), "pinverse", Tensor(const Tensor &, double), fp32)
+  KERNEL_CPU(ADD_NS(vdot), "vdot", Tensor(const Tensor &, const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(im2col), "im2col", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(col2im), "col2im", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(max_pool3d), "max_pool3d", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, bool), fp32)
+  KERNEL_CPU(ADD_NS(max_unpool2d), "max_unpool2d", Tensor(const Tensor &, const Tensor &, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(max_unpool3d), "max_unpool3d", Tensor(const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(adaptive_avg_pool3d), "adaptive_avg_pool3d", Tensor(const Tensor &, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(reflection_pad1d), "reflection_pad1d", Tensor(const Tensor &, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(reflection_pad2d), "reflection_pad2d", Tensor(const Tensor &, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(replication_pad1d), "replication_pad1d", Tensor(const Tensor &, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(replication_pad2d), "replication_pad2d", Tensor(const Tensor &, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(replication_pad3d), "replication_pad3d", Tensor(const Tensor &, IntArrayRef), fp32)
+  KERNEL_CPU(ADD_NS(elu), "elu", Tensor(const Tensor &, const Scalar &, const Scalar &, const Scalar &), fp32)
+  KERNEL_CPU(ADD_NS(hardshrink), "hardshrink", Tensor(const Tensor &, const Scalar &), fp32)
+  KERNEL_CPU(ADD_NS(hardsigmoid), "hardsigmoid", Tensor(const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(hardswish), "hardswish", Tensor(const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(log_sigmoid), "log_sigmoid", Tensor(const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(prelu), "prelu", Tensor(const Tensor &, const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(selu), "selu", Tensor(const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(celu), "celu", Tensor(const Tensor &, const Scalar &), fp32)
+  KERNEL_CPU(ADD_NS(softplus), "softplus", Tensor(const Tensor &, const Scalar &, const Scalar &), fp32)
+  KERNEL_CPU(ADD_NS(softshrink), "softshrink", Tensor(const Tensor &, const Scalar &), fp32)
+  KERNEL_CPU(ADD_NS(group_norm), "group_norm", Tensor(const Tensor &, int64_t, const c10::optional<Tensor> &, const c10::optional<Tensor> &, double, bool), fp32)
+  KERNEL_CPU(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32)
+  KERNEL_CPU(ADD_NS(mse_loss), "mse_loss", Tensor(const Tensor &, const Tensor &, int64_t), fp32)
+  KERNEL_CPU(ADD_NS(ctc_loss), "ctc_loss.IntList", Tensor(const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, int64_t, int64_t, bool), fp32)
+  KERNEL_CPU(ADD_NS(ctc_loss), "ctc_loss.Tensor", Tensor(const Tensor &, const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32)
+  KERNEL_CPU(ADD_NS(kl_div), "kl_div", Tensor(const Tensor &, const Tensor &, int64_t, bool), fp32)
+  KERNEL_CPU(ADD_NS(multilabel_margin_loss), "multilabel_margin_loss", Tensor(const Tensor &, const Tensor &, int64_t), fp32)
+  KERNEL_CPU(ADD_NS(fft_fft), "fft_fft", Tensor(const Tensor &, c10::optional<int64_t>, int64_t, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_ifft), "fft_ifft", Tensor(const Tensor &, c10::optional<int64_t>, int64_t, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_fft2), "fft_fft2", Tensor(const Tensor &, c10::optional<at::IntArrayRef>, at::IntArrayRef, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_ifft2), "fft_ifft2", Tensor(const Tensor &, c10::optional<at::IntArrayRef>, at::IntArrayRef, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_fftn), "fft_fftn", Tensor(const Tensor &, c10::optional<at::IntArrayRef>, c10::optional<at::IntArrayRef>, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_ifftn), "fft_ifftn", Tensor(const Tensor &, c10::optional<at::IntArrayRef>, c10::optional<at::IntArrayRef>, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_rfft), "fft_rfft", Tensor(const Tensor &, c10::optional<int64_t>, int64_t, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_irfft), "fft_irfft", Tensor(const Tensor &, c10::optional<int64_t>, int64_t, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_rfft2), "fft_rfft2", Tensor(const Tensor &, c10::optional<at::IntArrayRef>, at::IntArrayRef, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_irfft2), "fft_irfft2", Tensor(const Tensor &, c10::optional<at::IntArrayRef>, at::IntArrayRef, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_rfftn), "fft_rfftn", Tensor(const Tensor &, c10::optional<at::IntArrayRef>, c10::optional<at::IntArrayRef>, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_irfftn), "fft_irfftn", Tensor(const Tensor &, c10::optional<at::IntArrayRef>, c10::optional<at::IntArrayRef>, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_hfft), "fft_hfft", Tensor(const Tensor &, c10::optional<int64_t>, int64_t, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(fft_ihfft), "fft_ihfft", Tensor(const Tensor &, c10::optional<int64_t>, int64_t, c10::optional<c10::string_view>), fp32)
+  KERNEL_CPU(ADD_NS(conv_tbc), "conv_tbc", Tensor(const Tensor &, const Tensor &, const Tensor &, int64_t), fp32)
+  KERNEL_CPU(ADD_NS(linalg_matrix_norm), "linalg_matrix_norm", Tensor(const Tensor &, const at::Scalar &, at::IntArrayRef, bool, c10::optional<at::ScalarType>), fp32)
+  KERNEL_CPU(ADD_NS(linalg_matrix_norm), "linalg_matrix_norm.str_ord", Tensor(const Tensor &, c10::string_view, at::IntArrayRef, bool, c10::optional<at::ScalarType>), fp32)
+  KERNEL_CPU(ADD_NS(linalg_cond), "linalg_cond", Tensor(const Tensor &, const c10::optional<at::Scalar> &), fp32)
+  KERNEL_CPU(ADD_NS(linalg_cond), "linalg_cond.p_str", Tensor(const Tensor &, c10::string_view), fp32)
+  KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank", Tensor(const Tensor &, const c10::optional<double>, bool), fp32)
+  KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank.tol_tensor", Tensor(const Tensor &, const Tensor &, bool), fp32)
+  KERNEL_CPU(ADD_NS(linalg_solve), "linalg_solve", Tensor(const Tensor &, const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(linalg_cholesky), "linalg_cholesky", Tensor(const Tensor &, bool), fp32)
+  KERNEL_CPU(ADD_NS(linalg_svdvals), "linalg_svdvals", Tensor(const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(linalg_eigvals), "linalg_eigvals", Tensor(const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(linalg_eigvalsh), "linalg_eigvalsh", Tensor(const Tensor &, c10::string_view), fp32)
+  KERNEL_CPU(ADD_NS(linalg_inv), "linalg_inv", Tensor(const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(linalg_householder_product), "linalg_householder_product", Tensor(const Tensor &, const Tensor &), fp32)
+  KERNEL_CPU(ADD_NS(linalg_tensorinv), "linalg_tensorinv", Tensor(const Tensor &, int64_t), fp32)
+  KERNEL_CPU(ADD_NS(linalg_tensorsolve), "linalg_tensorsolve", Tensor(const Tensor &, const Tensor &, c10::optional<at::IntArrayRef>), fp32)
   KERNEL_CPU(ADD_NS(fake_quantize_per_tensor_affine), "fake_quantize_per_tensor_affine", Tensor (const Tensor &, double, int64_t, int64_t, int64_t), fp32)
+  KERNEL_CPU(ADD_NS(glu), "glu", Tensor (const Tensor &, int64_t), fp32)
 
-  // promote
-  KERNEL_CPU(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote)
-  KERNEL_CPU(ADD_NS(stack), "stack", Tensor (TensorList, int64_t), promote)
+  m.impl(TORCH_SELECTIVE_NAME("aten::cummax"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, int64_t),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, int64_t),
+                                 &ADD_NS(cummax)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::cummax.dimname"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, at::Dimname),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, at::Dimname),
+                                 &ADD_NS(cummax)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::cummin"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, int64_t),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, int64_t),
+                                 &ADD_NS(cummin)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::cummin.dimname"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, at::Dimname),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, at::Dimname),
+                                 &ADD_NS(cummin)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::eig"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, bool),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, bool),
+                                 &ADD_NS(eig)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::geqrf"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &),
+                                 std::tuple<Tensor, Tensor> (const Tensor &),
+                                 &ADD_NS(geqrf)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::lstsq"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, const Tensor &),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, const Tensor &),
+                                 &ADD_NS(lstsq)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::_lu_with_info"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor, Tensor> (const Tensor &, bool, bool),
+                                 std::tuple<Tensor, Tensor, Tensor> (const Tensor &, bool, bool),
+                                 &ADD_NS(_lu_with_info)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::lu_unpack"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor, Tensor> (const Tensor &, const Tensor &, bool, bool),
+                                 std::tuple<Tensor, Tensor, Tensor> (const Tensor &, const Tensor &, bool, bool),
+                                 &ADD_NS(lu_unpack)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::qr"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, bool),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, bool),
+                                 &ADD_NS(qr)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::solve"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, const Tensor &),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, const Tensor &),
+                                 &ADD_NS(solve)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::svd"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor, Tensor> (const Tensor &, bool, bool),
+                                 std::tuple<Tensor, Tensor, Tensor> (const Tensor &, bool, bool),
+                                 &ADD_NS(svd)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::symeig"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, bool, bool),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, bool, bool),
+                                 &ADD_NS(symeig)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::triangular_solve"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, const Tensor &, bool, bool, bool),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, const Tensor &, bool, bool, bool),
+                                 &ADD_NS(triangular_solve)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::fractional_max_pool2d"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, IntArrayRef, IntArrayRef, const Tensor &),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, IntArrayRef, IntArrayRef, const Tensor &),
+                                 &ADD_NS(fractional_max_pool2d)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::fractional_max_pool3d"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, IntArrayRef, IntArrayRef, const Tensor &),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, IntArrayRef, IntArrayRef, const Tensor &),
+                                 &ADD_NS(fractional_max_pool3d)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool1d"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, IntArrayRef),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, IntArrayRef),
+                                 &ADD_NS(adaptive_max_pool1d)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool2d"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, IntArrayRef),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, IntArrayRef),
+                                 &ADD_NS(adaptive_max_pool2d)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool3d"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, IntArrayRef),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, IntArrayRef),
+                                 &ADD_NS(adaptive_max_pool3d)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::multilabel_margin_loss_forward"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, const Tensor &, int64_t),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, const Tensor &, int64_t),
+                                 &ADD_NS(multilabel_margin_loss_forward)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::linalg_qr"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, c10::string_view),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, c10::string_view),
+                                 &ADD_NS(linalg_qr)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::linalg_cholesky_ex"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, bool, bool),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, bool, bool),
+                                 &ADD_NS(linalg_cholesky_ex)>::type::call)));
 
-  m.impl(TORCH_SELECTIVE_NAME("aten::topk"),
+  m.impl(TORCH_SELECTIVE_NAME("aten::linalg_svd"),
          TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
-                                 std::tuple<Tensor,Tensor> (const Tensor &, int64_t, int64_t, bool, bool),
-                                 std::tuple<Tensor,Tensor> (const Tensor &, int64_t, int64_t, bool, bool),
-                                 &ADD_NS(topk)>::type::call)));
+                                 std::tuple<Tensor, Tensor, Tensor> (const Tensor &, bool),
+                                 std::tuple<Tensor, Tensor, Tensor> (const Tensor &, bool),
+                                 &ADD_NS(linalg_svd)>::type::call)));
 
-  m.impl(TORCH_SELECTIVE_NAME("aten::sort"),
+  m.impl(TORCH_SELECTIVE_NAME("aten::linalg_eig"),
          TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
-                                 std::tuple<Tensor,Tensor> (const Tensor &, int64_t, bool),
-                                 std::tuple<Tensor,Tensor> (const Tensor &, int64_t, bool),
-                                 &ADD_NS(sort)>::type::call)));
+                                 std::tuple<Tensor, Tensor> (const Tensor &),
+                                 std::tuple<Tensor, Tensor> (const Tensor &),
+                                 &ADD_NS(linalg_eig)>::type::call)));
 
-   m.impl(TORCH_SELECTIVE_NAME("aten::kthvalue"),
+  m.impl(TORCH_SELECTIVE_NAME("aten::linalg_eigh"),
          TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
-                                 std::tuple<Tensor,Tensor> (const Tensor &, int64_t, int64_t, bool),
-                                 std::tuple<Tensor,Tensor> (const Tensor &, int64_t, int64_t, bool),
-                                 &ADD_NS(kthvalue)>::type::call)));
+                                 std::tuple<Tensor, Tensor> (const Tensor &, c10::string_view),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, c10::string_view),
+                                 &ADD_NS(linalg_eigh)>::type::call)));
 
-   m.impl(TORCH_SELECTIVE_NAME("aten::kthvalue.dimname"),
+  m.impl(TORCH_SELECTIVE_NAME("aten::linalg_lstsq"),
          TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
-                                 std::tuple<Tensor,Tensor> (const Tensor &, int64_t, at::Dimname, bool),
-                                 std::tuple<Tensor,Tensor> (const Tensor &, int64_t, at::Dimname, bool),
-                                 &ADD_NS(kthvalue)>::type::call)));
+                                 std::tuple<Tensor, Tensor, Tensor, Tensor> (const Tensor &, const Tensor &, c10::optional<double>, c10::optional<c10::string_view>),
+                                 std::tuple<Tensor, Tensor, Tensor, Tensor> (const Tensor &, const Tensor &, c10::optional<double>, c10::optional<c10::string_view>),
+                                 &ADD_NS(linalg_lstsq)>::type::call)));
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::linalg_inv_ex"),
+         TORCH_FN((&WrapFunction<CastPolicy::fp32, DeviceType::CPU,
+                                 std::tuple<Tensor, Tensor> (const Tensor &, bool),
+                                 std::tuple<Tensor, Tensor> (const Tensor &, bool),
+                                 &ADD_NS(linalg_inv_ex)>::type::call)));
+
+  // promote
+  KERNEL_CPU(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote)
+  KERNEL_CPU(ADD_NS(stack), "stack", Tensor (TensorList, int64_t), promote)
+  KERNEL_CPU(ADD_NS(index_copy), "index_copy", Tensor (const Tensor &, int64_t, const Tensor &, const Tensor &), promote)
+  KERNEL_CPU(ADD_NS(index_copy), "index_copy.dimname", Tensor (const Tensor &, at::Dimname, const Tensor &, const Tensor &), promote)
+
 }
 } // namespace
 } // namespace autocast
diff --git a/test/run_test.py b/test/run_test.py
index 615aaf912c314..77e7f150c16e9 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -75,6 +75,7 @@
     "distributed/test_pg_wrapper",
     "distributed/algorithms/test_join",
     "test_cuda",
+    "test_autocast",
     "test_jit_cuda_fuser",
     "test_cuda_primary_ctx",
     "test_dataloader",
diff --git a/torch/cpu/amp/autocast_mode.py b/torch/cpu/amp/autocast_mode.py
index 08ea200a2bdc4..8c65f727753e2 100644
--- a/torch/cpu/amp/autocast_mode.py
+++ b/torch/cpu/amp/autocast_mode.py
@@ -5,5 +5,5 @@ class autocast(torch.autocast_mode.autocast):
     See :class:`torch.autocast`.
     ``torch.cpu.amp.autocast(args...)`` is equivalent to ``torch.autocast("cpu", args...)``
     """
-    def __init__(self, enabled=True, dtype=torch.float16):
+    def __init__(self, enabled=True, dtype=torch.bfloat16):
         super().__init__("cpu", enabled=enabled, dtype=dtype)
diff --git a/torch/testing/_internal/autocast_test_lists.py b/torch/testing/_internal/autocast_test_lists.py
index 754ccca11ed9d..8350845e4ef19 100644
--- a/torch/testing/_internal/autocast_test_lists.py
+++ b/torch/testing/_internal/autocast_test_lists.py
@@ -307,7 +307,6 @@ def __init__(self, dev):
             ("conv1d", conv_args_fp32[0]),
             ("conv2d", conv_args_fp32[1]),
             ("conv3d", conv_args_fp32[2]),
-            ("log_softmax", pointwise0_fp32 + (0,)),
             ("bmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
                      torch.randn((n, n, n), device=dev, dtype=torch.float32))),
             ("mm", mat0_fp32 + mat1_fp32),
@@ -319,24 +318,22 @@ def __init__(self, dev):
                                     torch.randn((n, n, n), device=dev, dtype=torch.float32))),
         ]
         self.torch_fp32 = [
+            ("conv_transpose1d", conv_args_bf16[0]),
+            ("conv_transpose2d", conv_args_bf16[1]),
             ("conv_transpose3d", conv_args_bf16[2]),
             ("batch_norm", dummy_bf16[2], {"weight": None, "bias": None, "running_mean": torch.rand((n), dtype=torch.float32),
                                            "running_var": torch.rand((n), dtype=torch.float32), "training": False,
                                            "momentum": 0.1, "eps": 1e-5, "cudnn_enabled": False}),
-            ("max_pool2d", dummy_bf16[2], {"kernel_size": (3, 2), "stride": (1, 1)}),
             ("dropout", dummy_bf16[2], {"p": 0.1, "train": False}),
             ("binary_cross_entropy_with_logits", mat0_bf16 + (torch.rand((n, n), device=dev, dtype=torch.bfloat16),)),
-            ("pow", ((pointwise0_bf16[0] + 1.).clamp(0.0, 100.0),) + pointwise1_bf16),
-            ("pow", ((pointwise0_bf16[0] + 1.).clamp(0.0, 100.0),) + (1.7,)),
-            ("instance_norm", dummy_bf16[2], {"weight": None, "bias": None, "running_mean": torch.rand((n), dtype=torch.float32),
-                                              "running_var": torch.rand((n), dtype=torch.float32), "use_input_stats": False,
+            ("instance_norm", dummy_bf16[1], {"weight": None, "bias": None, "running_mean": None,
+                                              "running_var": None, "use_input_stats": True,
                                               "momentum": 0.1, "eps": 1e-5, "cudnn_enabled": False}),
         ]
         self.nn_bf16 = [
             ("linear", mat0_fp32 + mat1_fp32),
         ]
         self.nn_fp32 = [
-            ("adaptive_avg_pool2d", dummy_bf16[2], {"output_size": (3, 2)}),
             ("avg_pool2d", dummy_bf16[2], {"kernel_size": (3, 2), "stride": (1, 1)}),
             ("avg_pool3d", dummy_bf16[3], {"kernel_size": (3, 3, 3), "stride": (1, 1, 1)}),
             ("gelu", dummy_bf16[3]),
@@ -348,9 +345,8 @@ def __init__(self, dev):
             ("upsample_trilinear3d", dummy_bf16[4], {"output_size": (n, n, n), "align_corners": False}),
             ("binary_cross_entropy", (torch.rand((n, n), device=dev, dtype=torch.bfloat16),) +
                                      (torch.rand((n, n), device=dev, dtype=torch.bfloat16),)),
-            ("smooth_l1_loss", mat0_bf16 + mat1_bf16),
             ("reflection_pad1d", dummy_bf16[2], {"padding": (3, 3)}),
-            ("std", dummy_bf16[2]),
+            ("smooth_l1_loss", mat0_bf16 + mat1_bf16),
         ]
         self.torch_need_autocast_promote = [
             ("cat", (pointwise0_bf16 + pointwise1_fp32,)),

From 538647fe1fb94b7822ea3b8bbbd6901961431d60 Mon Sep 17 00:00:00 2001
From: James Reed <jamesreed@fb.com>
Date: Mon, 30 Aug 2021 19:54:50 -0700
Subject: [PATCH 379/530] [WIP][FX] BC guarantees for 1.10 (#63888)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63888

Test Plan: Imported from OSS

Reviewed By: pbelevich

Differential Revision: D30523133

Pulled By: jamesr66a

fbshipit-source-id: b04cc0d842a74862f42ecba98b757310cd2ec7b0
---
 ..._compat-fx_backcompat_class_members.expect |  19 ++
 ...t-fx_backcompat_function_signatures.expect |  70 +++++
 test/test_fx.py                               | 242 ++++++++++++++++++
 test/test_fx_experimental.py                  |   8 +-
 torch/fx/__init__.py                          |  23 +-
 torch/fx/_compatibility.py                    |  34 +++
 torch/fx/_symbolic_trace.py                   |  55 ++--
 torch/fx/annotate.py                          |   3 +-
 torch/fx/experimental/fx_acc/acc_ops.py       |  10 +-
 torch/fx/graph.py                             |  41 ++-
 torch/fx/graph_module.py                      |  33 ++-
 torch/fx/immutable_collections.py             |   4 +
 torch/fx/interpreter.py                       |  24 +-
 torch/fx/node.py                              |  51 +++-
 torch/fx/operator_schemas.py                  |   7 +
 torch/fx/passes/shape_prop.py                 |   8 +-
 torch/fx/passes/split_module.py               |   3 +
 torch/fx/proxy.py                             |  27 +-
 torch/fx/subgraph_rewriter.py                 |  10 +-
 torch/fx/tensor_type.py                       |   6 +-
 torch/quantization/ns/graph_passes.py         |   2 +
 21 files changed, 603 insertions(+), 77 deletions(-)
 create mode 100644 test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
 create mode 100644 test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
 create mode 100644 torch/fx/_compatibility.py

diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
new file mode 100644
index 0000000000000..88e4654b568df
--- /dev/null
+++ b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
@@ -0,0 +1,19 @@
+torch.fx._symbolic_trace.ProxyableClassMeta []
+torch.fx._symbolic_trace.Tracer ['call_module', 'create_arg', 'create_args_for_root', 'is_leaf_module', 'path_of_module', 'trace']
+torch.fx.graph.Graph ['call_function', 'call_method', 'call_module', 'create_node', 'eliminate_dead_code', 'erase_node', 'flatten_inps', 'get_attr', 'graph_copy', 'inserting_after', 'inserting_before', 'lint', 'node_copy', 'nodes', 'output', 'owning_module', 'placeholder', 'print_tabular', 'python_code', 'unflatten_outs']
+torch.fx.graph.PythonCode []
+torch.fx.graph_module.GraphModule ['add_submodule', 'code', 'delete_all_unused_submodules', 'delete_submodule', 'graph', 'recompile', 'to_folder']
+torch.fx.immutable_collections.immutable_dict ['clear', 'pop', 'popitem', 'update']
+torch.fx.immutable_collections.immutable_list ['append', 'clear', 'extend', 'insert', 'pop', 'remove']
+torch.fx.interpreter.Interpreter ['call_function', 'call_method', 'call_module', 'fetch_args_kwargs_from_env', 'fetch_attr', 'get_attr', 'map_nodes_to_values', 'output', 'placeholder', 'run', 'run_node']
+torch.fx.interpreter.Transformer ['call_function', 'call_module', 'get_attr', 'placeholder', 'transform']
+torch.fx.node.Node ['all_input_nodes', 'append', 'args', 'format_node', 'is_impure', 'kwargs', 'next', 'normalized_arguments', 'prepend', 'prev', 'replace_all_uses_with', 'replace_input_with', 'stack_trace', 'update_arg', 'update_kwarg']
+torch.fx.passes.shape_prop.ShapeProp ['propagate', 'run_node']
+torch.fx.passes.shape_prop.TensorMetadata ['dtype', 'is_quantized', 'memory_format', 'q_scale', 'q_zero_point', 'qscheme', 'requires_grad', 'shape', 'stride']
+torch.fx.passes.split_module.Partition []
+torch.fx.proxy.Attribute ['node']
+torch.fx.proxy.GraphAppendingTracer []
+torch.fx.proxy.Proxy ['keys']
+torch.fx.proxy.TraceError []
+torch.fx.proxy.TracerBase ['create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool']
+torch.fx.subgraph_rewriter.Match ['anchor', 'nodes_map']
\ No newline at end of file
diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
new file mode 100644
index 0000000000000..a73fde735bc2d
--- /dev/null
+++ b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
@@ -0,0 +1,70 @@
+torch.fx._symbolic_trace.Tracer.__init__(self, autowrap_modules: Tuple[Callable] = (<module math>,), autowrap_functions: Tuple[Callable, ...] = (,), enable_cpatching: bool = False, param_shapes_constant: bool = False) -> None
+torch.fx._symbolic_trace.Tracer.call_module(self, m: torch.nn.modules.module.Module, forward: Callable[..., Any], args: Tuple[Any, ...], kwargs: Dict[str, Any]) -> Any
+torch.fx._symbolic_trace.Tracer.create_arg(self, a: Any) -> 'Argument'
+torch.fx._symbolic_trace.Tracer.is_leaf_module(self, m: torch.nn.modules.module.Module, module_qualified_name: str) -> bool
+torch.fx._symbolic_trace.Tracer.path_of_module(self, mod: torch.nn.modules.module.Module) -> str
+torch.fx._symbolic_trace.Tracer.trace(self, root: Union[torch.nn.modules.module.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None) -> torch.fx.graph.Graph
+torch.fx._symbolic_trace.symbolic_trace(root: Union[torch.nn.modules.module.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None, enable_cpatching: bool = False) -> torch.fx.graph_module.GraphModule
+torch.fx._symbolic_trace.wrap(fn_or_name: Union[str, Callable])
+torch.fx.graph.Graph.__init__(self, owning_module: Optional[GraphModule] = None, tracer_cls: Optional[Type[Tracer]] = None)
+torch.fx.graph.Graph.call_function(self, the_function: Callable[..., Any], args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node
+torch.fx.graph.Graph.call_method(self, method_name: str, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node
+torch.fx.graph.Graph.call_module(self, module_name: str, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node
+torch.fx.graph.Graph.create_node(self, op: str, target: 'Target', args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, name: Optional[str] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node
+torch.fx.graph.Graph.eliminate_dead_code(self)
+torch.fx.graph.Graph.erase_node(self, to_erase: torch.fx.node.Node) -> None
+torch.fx.graph.Graph.get_attr(self, qualified_name: str, type_expr: Optional[Any] = None) -> torch.fx.node.Node
+torch.fx.graph.Graph.graph_copy(self, g: 'Graph', val_map: Dict[torch.fx.node.Node, torch.fx.node.Node], return_output_node = False) -> 'Optional[Argument]'
+torch.fx.graph.Graph.inserting_after(self, n: Optional[torch.fx.node.Node] = None)
+torch.fx.graph.Graph.inserting_before(self, n: Optional[torch.fx.node.Node] = None)
+torch.fx.graph.Graph.lint(self)
+torch.fx.graph.Graph.node_copy(self, node: torch.fx.node.Node, arg_transform: Callable[[torch.fx.node.Node], Argument] = <function <lambda>>) -> torch.fx.node.Node
+torch.fx.graph.Graph.output(self, result: 'Argument', type_expr: Optional[Any] = None)
+torch.fx.graph.Graph.placeholder(self, name: str, type_expr: Optional[Any] = None) -> torch.fx.node.Node
+torch.fx.graph.Graph.print_tabular(self)
+torch.fx.graph.Graph.python_code(self, root_module: str) -> torch.fx.graph.PythonCode
+torch.fx.graph_module.GraphModule.__init__(self, root: Union[torch.nn.modules.module.Module, Dict[str, Any]], graph: torch.fx.graph.Graph, class_name: str = 'GraphModule')
+torch.fx.graph_module.GraphModule.add_submodule(self, target: str, m: torch.nn.modules.module.Module) -> bool
+torch.fx.graph_module.GraphModule.delete_all_unused_submodules(self) -> None
+torch.fx.graph_module.GraphModule.delete_submodule(self, target: str) -> bool
+torch.fx.graph_module.GraphModule.recompile(self) -> torch.fx.graph.PythonCode
+torch.fx.interpreter.Interpreter.__init__(self, module: torch.fx.graph_module.GraphModule, garbage_collect_values: bool = True)
+torch.fx.interpreter.Interpreter.call_function(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any
+torch.fx.interpreter.Interpreter.call_method(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any
+torch.fx.interpreter.Interpreter.call_module(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any
+torch.fx.interpreter.Interpreter.fetch_args_kwargs_from_env(self, n: torch.fx.node.Node) -> Tuple[Tuple, Dict]
+torch.fx.interpreter.Interpreter.fetch_attr(self, target: str)
+torch.fx.interpreter.Interpreter.get_attr(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any
+torch.fx.interpreter.Interpreter.map_nodes_to_values(self, args: torch.fx.node.Argument, n: torch.fx.node.Node) -> torch.fx.node.Argument
+torch.fx.interpreter.Interpreter.output(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any
+torch.fx.interpreter.Interpreter.placeholder(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any
+torch.fx.interpreter.Interpreter.run(self, *args, initial_env: Optional[Dict[torch.fx.node.Node, Any]] = None) -> Any
+torch.fx.interpreter.Interpreter.run_node(self, n: torch.fx.node.Node) -> Any
+torch.fx.interpreter.Transformer.__init__(self, module)
+torch.fx.interpreter.Transformer.call_function(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any
+torch.fx.interpreter.Transformer.call_module(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any
+torch.fx.interpreter.Transformer.get_attr(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> torch.fx.proxy.Proxy
+torch.fx.interpreter.Transformer.placeholder(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> torch.fx.proxy.Proxy
+torch.fx.interpreter.Transformer.transform(self) -> torch.fx.graph_module.GraphModule
+torch.fx.node.Node.__init__(self, graph: 'Graph', name: str, op: str, target: 'Target', args: Tuple[Argument, ...], kwargs: Dict[str, Argument], return_type: Optional[Any] = None) -> None
+torch.fx.node.Node.append(self, x: 'Node') -> None
+torch.fx.node.Node.format_node(self, placeholder_names: List[str] = None, maybe_return_typename: List[str] = None) -> Optional[str]
+torch.fx.node.Node.prepend(self, x: 'Node') -> None
+torch.fx.node.Node.replace_all_uses_with(self, replace_with: 'Node') -> List[Node]
+torch.fx.node.Node.replace_input_with(self, old_input: 'Node', new_input: 'Node')
+torch.fx.node.Node.update_arg(self, idx: int, arg: torch.fx.node.Argument) -> None
+torch.fx.node.Node.update_kwarg(self, key: str, arg: torch.fx.node.Argument) -> None
+torch.fx.node.map_aggregate(a: torch.fx.node.Argument, fn: Callable[[torch.fx.node.Argument], torch.fx.node.Argument]) -> torch.fx.node.Argument
+torch.fx.node.map_arg(a: torch.fx.node.Argument, fn: Callable[[torch.fx.node.Node], torch.fx.node.Argument]) -> torch.fx.node.Argument
+torch.fx.passes.split_module.split_module(m: torch.fx.graph_module.GraphModule, root_m: torch.nn.modules.module.Module, split_callback: Callable[[torch.fx.node.Node], int])
+torch.fx.proxy.Attribute.__init__(self, root: torch.fx.proxy.Proxy, attr: str)
+torch.fx.proxy.Proxy.__init__(self, node: torch.fx.node.Node, tracer: 'Optional[TracerBase]' = None)
+torch.fx.proxy.Proxy.keys(self)
+torch.fx.proxy.TracerBase.create_arg(self, a: Any) -> torch.fx.node.Argument
+torch.fx.proxy.TracerBase.create_node(self, kind: str, target: torch.fx.node.Target, args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, torch.fx.node.Argument], name: Optional[str] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node
+torch.fx.proxy.TracerBase.create_proxy(self, kind: str, target: torch.fx.node.Target, args: Tuple[Any, ...], kwargs: Dict[str, Any], name: Optional[str] = None, type_expr: Optional[Any] = None, proxy_factory_fn: Callable[[torch.fx.node.Node], Proxy] = None)
+torch.fx.proxy.TracerBase.iter(self, obj: 'Proxy') -> Iterator
+torch.fx.proxy.TracerBase.keys(self, obj: 'Proxy') -> Any
+torch.fx.proxy.TracerBase.proxy(self, node: torch.fx.node.Node) -> 'Proxy'
+torch.fx.proxy.TracerBase.to_bool(self, obj: 'Proxy') -> bool
+torch.fx.subgraph_rewriter.replace_pattern(gm: torch.fx.graph_module.GraphModule, pattern: Callable, replacement: Callable) -> List[torch.fx.subgraph_rewriter.Match]
\ No newline at end of file
diff --git a/test/test_fx.py b/test/test_fx.py
index 47873d7ef9b41..eadcf6cc0b2f4 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -11,6 +11,8 @@
 import sys
 import torch
 import traceback
+import typing
+import types
 import warnings
 import unittest
 from math import sqrt
@@ -31,6 +33,7 @@
 from collections import namedtuple
 
 from torch.fx.proxy import TraceError
+from torch.fx._compatibility import _BACK_COMPAT_OBJECTS, _MARKED_WITH_COMATIBLITY
 
 from fx.test_subgraph_rewriter import TestSubgraphRewriter  # noqa: F401
 from fx.test_dce_pass import TestDCE  # noqa: F401
@@ -3060,6 +3063,245 @@ def test_get_torch_func_signature_exhaustive(self, device, dtype, op):
             assert op.name in known_no_schema or "nn.functional" in op.name
 
 
+class TestFXAPIBackwardCompatibility(JitTestCase):
+    def setUp(self):
+        self.maxDiff = None
+
+    def _fn_to_stable_annotation_str(self, obj):
+        """
+        Unfortunately we have to serialize function signatures manually since
+        serialization for `inspect.Signature` objects is not stable across
+        python versions
+        """
+        fn_name = torch.typename(obj)
+
+        signature = inspect.signature(obj)
+
+        sig_str = f'{fn_name}{signature}'
+
+        arg_strs = []
+        for k, v in signature.parameters.items():
+            maybe_type_annotation = f': {self._annotation_type_to_stable_str(v.annotation, sig_str)}'\
+                if v.annotation is not inspect.Signature.empty else ''
+
+            def default_val_str(val):
+                if isinstance(val, (tuple, list)):
+                    str_pieces = ['(' if isinstance(val, tuple) else '[']
+                    str_pieces.append(', '.join(default_val_str(v) for v in val))
+                    if isinstance(val, tuple) and len(str_pieces) == 2:
+                        str_pieces.append(',')
+                    str_pieces.append(')' if isinstance(val, tuple) else ']')
+                    return ''.join(str_pieces)
+
+                # Need to fix up some default value strings.
+                # First case: modules. Default module `repr` contains the FS path of the module.
+                # Don't leak that
+                if isinstance(val, types.ModuleType):
+                    return f'<module {val.__name__}>'
+
+                # Second case: callables. Callables (such as lambdas) encode their address in
+                # their string repr. Don't do that
+                if callable(val):
+                    return f'<function {val.__name__}>'
+
+                return str(val)
+
+            if v.default is not inspect.Signature.empty:
+                default_val_str = default_val_str(v.default) if not isinstance(v.default, str) else f"'{v.default}'"
+                maybe_default = f' = {default_val_str}'
+            else:
+                maybe_default = ''
+            maybe_stars = ''
+            if v.kind == inspect.Parameter.VAR_POSITIONAL:
+                maybe_stars = '*'
+            elif v.kind == inspect.Parameter.VAR_KEYWORD:
+                maybe_stars = '**'
+            arg_strs.append(f'{maybe_stars}{k}{maybe_type_annotation}{maybe_default}')
+
+        return_annot = f' -> {self._annotation_type_to_stable_str(signature.return_annotation, sig_str)}'\
+            if signature.return_annotation is not inspect.Signature.empty else ''
+
+        return f'{fn_name}({", ".join(arg_strs)}){return_annot}'
+
+    def _annotation_type_to_stable_str(self, t, sig_str):
+        if t is inspect.Signature.empty:
+            return ''
+
+        # Forward ref
+        if isinstance(t, str):
+            return f"'{t}'"
+        if hasattr(typing, 'ForwardRef') and isinstance(t, typing.ForwardRef):
+            return t.__forward_arg__
+        if hasattr(typing, '_ForwardRef') and isinstance(t, typing._ForwardRef):
+            return t.__forward_arg__
+
+        trivial_mappings = {
+            str : 'str',
+            int : 'int',
+            float: 'float',
+            bool: 'bool',
+            torch.dtype: 'torch.dtype',
+            torch.Tensor: 'torch.Tensor',
+            torch.device: 'torch.device',
+            torch.memory_format: 'torch.memory_format',
+            slice: 'slice',
+            torch.nn.Module: 'torch.nn.modules.module.Module',
+            torch.fx.Graph : 'torch.fx.graph.Graph',
+            torch.fx.Node : 'torch.fx.node.Node',
+            torch.fx.Proxy : 'torch.fx.proxy.Proxy',
+            torch.fx.node.Target : 'torch.fx.node.Target',
+            torch.fx.node.Argument : 'torch.fx.node.Argument',
+            torch.fx.graph.PythonCode : 'torch.fx.graph.PythonCode',
+            torch.fx.graph_module.GraphModule: 'torch.fx.graph_module.GraphModule',
+            torch.fx.subgraph_rewriter.Match: 'torch.fx.subgraph_rewriter.Match',
+            Ellipsis : '...',
+            typing.Any: 'Any',
+            type(None): 'NoneType',
+            None: 'None',
+            typing.Iterator: 'Iterator',
+        }
+
+        mapping = trivial_mappings.get(t, None)
+        if mapping:
+            return mapping
+
+        # Handle types with contained types
+        contained = getattr(t, '__args__', None) or []
+
+        # Callables contain a bare List for arguments
+        contained = t if isinstance(t, list) else contained
+
+        # Python 3.8 puts type vars into __args__ for unbound types such as Dict
+        if all(isinstance(ct, typing.TypeVar) for ct in contained):
+            contained = []
+
+        contained_type_annots = [self._annotation_type_to_stable_str(ct, sig_str) for ct in contained]
+        contained_type_str = f'[{", ".join(contained_type_annots)}]' if len(contained_type_annots) > 0 else ''
+
+
+        origin = getattr(t, '__origin__', None)
+        if origin is None:
+            # Unbound types don't have `__origin__` in some Python versions, so fix that up here.
+            origin = t if t in {typing.Tuple, typing.Union, typing.Dict, typing.List, typing.Type, typing.Callable} else origin
+
+        if origin in {tuple, typing.Tuple}:
+            return f'Tuple{contained_type_str}'
+        if origin in {typing.Union}:
+            # Annoying hack to detect Optional
+            if len(contained) == 2 and (contained[0] is type(None)) ^ (contained[1] is type(None)):
+                not_none_param = contained[0] if contained[0] is not type(None) else contained[1]
+                return f'Optional[{self._annotation_type_to_stable_str(not_none_param, sig_str)}]'
+            return f'Union{contained_type_str}'
+        if origin in {dict, typing.Dict}:
+            return f'Dict{contained_type_str}'
+        if origin in {list, typing.List}:
+            return f'List{contained_type_str}'
+        if origin in {type, typing.Type}:
+            return f'Type{contained_type_str}'
+        if isinstance(t, typing.Callable):
+            if len(contained) > 0 and contained[0] is not Ellipsis:
+                return f'Callable[[{", ".join(contained_type_annots[:-1])}], {contained_type_annots[-1]}]'
+            else:
+                return f'Callable{contained_type_str}'
+
+        raise RuntimeError(f'Unrecognized type {t} used in BC-compatible type signature {sig_str}.'
+                           f'Please add support for this type and confirm with the '
+                           f'FX team that your signature change is valid.')
+
+
+    def test_function_back_compat(self):
+        """
+        Test backward compatibility for function signatures with
+        @compatibility(is_backward_compatible=True). Currently this checks for
+        exact signature matches, which may lead to false positives. If this
+        becomes too annoying, we can refine this check to actually parse out
+        the saved schema strings and check if the change is truly backward-
+        incompatible.
+        """
+        signature_strs = []
+
+        for obj in _BACK_COMPAT_OBJECTS:
+            if not isinstance(obj, type):
+                signature_strs.append(self._fn_to_stable_annotation_str(obj))
+
+        signature_strs.sort()
+
+        try:
+            self.assertExpected('\n'.join(signature_strs), 'fx_backcompat_function_signatures')
+        except AssertionError as e:
+            msg = f"{e}\n****** ERROR ******\nAn FX function that has been marked " \
+                  f"as backwards-compatible has experienced a signature change. See the " \
+                  f"above exception context for more information. If this change was " \
+                  f"unintended, please revert it. If it was intended, check with the FX " \
+                  f"team to ensure that the proper deprecation protocols have been followed " \
+                  f"and subsequently --accept the change."
+            raise AssertionError(msg)
+
+    def test_class_member_back_compat(self):
+        """
+        Test backward compatibility for members of classes with
+        @compatibility(is_backward_compatible=True). Currently this checks for
+        exact matches on the publicly visible members of the class.
+        """
+        class_method_strs = []
+
+        for obj in _BACK_COMPAT_OBJECTS:
+            if isinstance(obj, type):
+                public_members = [name for name in obj.__dict__ if not name.startswith('_')]
+                class_method_strs.append(f'{torch.typename(obj)} {sorted(public_members)}')
+
+        class_method_strs.sort()
+
+        try:
+            self.assertExpected('\n'.join(class_method_strs), 'fx_backcompat_class_members')
+        except AssertionError as e:
+            msg = f"{e}\n****** ERROR ******\nAn FX class that has been marked " \
+                  f"as backwards-compatible has experienced change in its public members. See the " \
+                  f"above exception context for more information. If this change was " \
+                  f"unintended, please revert it. If it was intended, check with the FX " \
+                  f"team to ensure that the proper deprecation protocols have been followed " \
+                  f"and subsequently --accept the change."
+            raise AssertionError(msg)
+
+    def test_public_api_surface(self):
+        mod = torch.fx
+
+        non_back_compat_objects = {}
+
+        def check_symbols_have_bc_designation(m, prefix):
+            if not m.__name__.startswith('torch.fx'):
+                return
+            if m.__name__.startswith('torch.fx.experimental'):
+                return
+            for k, v in m.__dict__.items():
+                if v is m:
+                    continue
+                if k.startswith('_'):
+                    continue
+                if isinstance(v, types.ModuleType):
+                    check_symbols_have_bc_designation(v, prefix + [k])
+                elif isinstance(v, type) or isinstance(v, types.FunctionType):
+                    if v not in _MARKED_WITH_COMATIBLITY:
+                        non_back_compat_objects.setdefault(v)
+
+        check_symbols_have_bc_designation(mod, ['torch', 'fx'])
+
+
+        non_back_compat_strs = [torch.typename(obj) for obj in non_back_compat_objects.keys()]
+        # Only want objects in torch.fx
+        non_back_compat_strs = [
+            s for s in non_back_compat_strs if s.startswith('torch.fx') and not s.startswith('torch.fx.experimental')]
+        # Only want objects in public namespaces
+        non_back_compat_strs = [
+            s for s in non_back_compat_strs if all(not atom.startswith('_') for atom in s.split('.'))]
+        non_back_compat_strs.sort()
+
+        if len(non_back_compat_strs) != 0:
+            raise AssertionError(f"Public FX API(s) {non_back_compat_strs} introduced but not given a "
+                                 f"backwards-compatibility classification! Please decorate these "
+                                 f"API(s) with `@torch.fx._compatibility.compatibility` to specify "
+                                 f"BC guarantees.")
+
 class TestFunctionalTracing(JitTestCase):
     IGNORE_FUNCS = ("has_torch_function", "has_torch_function_unary",
                     "has_torch_function_variadic", "handle_torch_function",
diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py
index f000b0af59598..e723ee4622991 100644
--- a/test/test_fx_experimental.py
+++ b/test/test_fx_experimental.py
@@ -32,7 +32,7 @@
     type_matches,
     create_type_hint,
 )
-from torch.fx.passes.shape_prop import extract_tensor_metadata, ShapeProp
+from torch.fx.passes.shape_prop import _extract_tensor_metadata, ShapeProp
 from torch.fx.passes.split_module import split_module
 from torch.testing._internal.common_device_type import (
     ops,
@@ -96,13 +96,13 @@ def forward(self, a, b, c):
         # Fix for now to add type/shape to output
         for node in traced.graph.nodes:
             if node.op == "output":
-                node.meta["tensor_meta"] = extract_tensor_metadata(a)
+                node.meta["tensor_meta"] = _extract_tensor_metadata(a)
         for mod in module_with_submodules.modules():
             if isinstance(mod, GraphModule):
                 for node in mod.graph.nodes:
-                    node.meta["tensor_meta"] = extract_tensor_metadata(a)
+                    node.meta["tensor_meta"] = _extract_tensor_metadata(a)
         for node in module_with_submodules.graph.nodes:
-            node.meta["tensor_meta"] = extract_tensor_metadata(a)
+            node.meta["tensor_meta"] = _extract_tensor_metadata(a)
 
         weights1 = {}
         weights2 = {}
diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py
index 4ff795e632944..6524c2d1b8716 100644
--- a/torch/fx/__init__.py
+++ b/torch/fx/__init__.py
@@ -1,6 +1,4 @@
 r'''
-**This feature is under a Beta release and its API may change.**
-
 FX is a toolkit for developers to use to transform ``nn.Module``
 instances. FX consists of three main components: a **symbolic tracer,**
 an **intermediate representation**, and **Python code generation**. A
@@ -28,12 +26,13 @@ def forward(self, x):
     # High-level intermediate representation (IR) - Graph representation
     print(symbolic_traced.graph)
     """
-    graph(x):
-        %param : [#users=1] = self.param
-        %add_1 : [#users=1] = call_function[target=<built-in function add>](args = (%x, %param), kwargs = {})
-        %linear_1 : [#users=1] = call_module[target=linear](args = (%add_1,), kwargs = {})
-        %clamp_1 : [#users=1] = call_method[target=clamp](args = (%linear_1,), kwargs = {min: 0.0, max: 1.0})
-        return clamp_1
+    graph():
+        %x : [#users=1] = placeholder[target=x]
+        %param : [#users=1] = get_attr[target=param]
+        %add : [#users=1] = call_function[target=operator.add](args = (%x, %param), kwargs = {})
+        %linear : [#users=1] = call_module[target=linear](args = (%add,), kwargs = {})
+        %clamp : [#users=1] = call_method[target=clamp](args = (%linear,), kwargs = {min: 0.0, max: 1.0})
+        return clamp
     """
 
     # Code generation - valid Python code
@@ -41,10 +40,10 @@ def forward(self, x):
     """
     def forward(self, x):
         param = self.param
-        add_1 = x + param;  x = param = None
-        linear_1 = self.linear(add_1);  add_1 = None
-        clamp_1 = linear_1.clamp(min = 0.0, max = 1.0);  linear_1 = None
-        return clamp_1
+        add = x + param;  x = param = None
+        linear = self.linear(add);  add = None
+        clamp = linear.clamp(min = 0.0, max = 1.0);  linear = None
+        return clamp
     """
 
 The **symbolic tracer** performs "symbolic execution" of the Python
diff --git a/torch/fx/_compatibility.py b/torch/fx/_compatibility.py
new file mode 100644
index 0000000000000..2d33813200be2
--- /dev/null
+++ b/torch/fx/_compatibility.py
@@ -0,0 +1,34 @@
+from typing import Any, Dict
+import textwrap
+
+_BACK_COMPAT_OBJECTS : Dict[Any, None] = {}
+_MARKED_WITH_COMATIBLITY : Dict[Any, None] = {}
+
+def compatibility(is_backward_compatible : bool):
+    if is_backward_compatible:
+
+        def mark_back_compat(fn):
+            docstring = textwrap.dedent(getattr(fn, '__doc__', None) or '')
+            docstring += """
+.. note::
+    Backwards-compatibility for this API is guaranteed.
+"""
+            fn.__doc__ = docstring
+            _BACK_COMPAT_OBJECTS.setdefault(fn)
+            _MARKED_WITH_COMATIBLITY.setdefault(fn)
+            return fn
+
+        return mark_back_compat
+    else:
+
+        def mark_not_back_compat(fn):
+            docstring = textwrap.dedent(getattr(fn, '__doc__', None) or '')
+            docstring += """
+.. warning::
+    This API is experimental and is *NOT* backward-compatible.
+"""
+            fn.__doc__ = docstring
+            _MARKED_WITH_COMATIBLITY.setdefault(fn)
+            return fn
+
+        return mark_not_back_compat
diff --git a/torch/fx/_symbolic_trace.py b/torch/fx/_symbolic_trace.py
index 25f739e49f9ad..d38197322fab1 100644
--- a/torch/fx/_symbolic_trace.py
+++ b/torch/fx/_symbolic_trace.py
@@ -12,6 +12,7 @@
 import torch.utils._pytree as pytree
 
 import sys
+from ._compatibility import compatibility
 from .node import Argument, map_aggregate, base_types
 from .graph import Graph, _PyTreeInfo
 from .graph_module import GraphModule
@@ -25,6 +26,7 @@
 
 _proxyable_classes : Dict[Type, None] = {}
 
+@compatibility(is_backward_compatible=True)
 class ProxyableClassMeta(type):
     """
     ProxyableClassMeta allows you to make construction of a given Python class
@@ -157,6 +159,7 @@ def __enter__(self):
     def __exit__(self, type, value, tb):
         sys.setprofile(None)
 
+@compatibility(is_backward_compatible=False)
 class PHBase(object):
     """
     Object representing an input placeholder to `concrete_args`
@@ -166,6 +169,7 @@ def __repr__(self):
 
 PH = PHBase()
 
+@compatibility(is_backward_compatible=True)
 class Tracer(TracerBase):
     # Reference: https://github.com/pytorch/pytorch/issues/54354
     # The first line of this docstring overrides the one Sphinx generates for the
@@ -182,6 +186,11 @@ class Tracer(TracerBase):
     process. The different behaviors that can be overridden are described
     in the docstrings of the methods on this class.
     """
+
+    # Not checking BC on this API because the default value for `autowrap_modules`
+    # includes the local filepath to the `math` module, which would jitter
+    # across machines.
+    @compatibility(is_backward_compatible=True)
     def __init__(self, autowrap_modules: Tuple[ModuleType] = (math, ),
                  autowrap_functions: Tuple[Callable, ...] = (),
                  enable_cpatching: bool = False,
@@ -197,11 +206,19 @@ def __init__(self, autowrap_modules: Tuple[ModuleType] = (math, ),
 
             autowrap_modules (Tuple[ModuleType]): defaults to `(math, )`,
                 Python modules whose functions should be wrapped automatically
-                without needing to use fx.wrap().
+                without needing to use fx.wrap(). Backward-compatibility for
+                this parameter is guaranteed.
 
             autowrap_function (Tuple[Callable, ...]): defaults to `()`,
                 Python functions that should be wrapped automatically without
-                needing to use fx.wrap().
+                needing to use fx.wrap(). Backward compabilibility for this
+                parameter is guaranteed.
+
+            param_shapes_constant (bool): When this flag is set,  calls to shape,
+                size and a few other shape like attributes of a module's parameter
+                will be evaluted directly, rather than returning a new Proxy value
+                for an attribute access. Backward compatibility for this parameter
+                is guaranteed.
 
             enable_cpatching (bool): defaults to `False`,
                 Allows you to enable/disable monkeypatching of torch functions at the
@@ -210,12 +227,9 @@ def __init__(self, autowrap_modules: Tuple[ModuleType] = (math, ),
                 C-level monkeypatching works by directly modifying the PyCFunctionObject*
                 so that calling it returns a different function.
 
-                Turning this on is likely to slow down tracing by 1.5-3x.
-
-            param_shapes_constant (bool): see https://github.com/pytorch/pytorch/issues/61733. When
-            this flag is set,  calls to shape, size and a few other shape like attributes of a module's parameter
-            will be evaluted directly, rather than returning a new Proxy value for an attribute access.
-
+                Turning this on is likely to slow down tracing by 1.5-3x. This
+                parameter is experimental and its backward-compatibility is NOT
+                guaranteed.
         """
 
         super().__init__()
@@ -235,6 +249,7 @@ def __init__(self, autowrap_modules: Tuple[ModuleType] = (math, ),
 
         self.submodule_paths: Optional[Dict[torch.nn.Module, str]] = None
 
+    @compatibility(is_backward_compatible=True)
     def create_arg(self, a: Any) -> 'Argument':
         """
         A method to specify the behavior of tracing when preparing values to
@@ -325,6 +340,7 @@ def create_arg(self, a: Any) -> 'Argument':
 
         return super().create_arg(a)
 
+    @compatibility(is_backward_compatible=True)
     def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool:
         """
         A method to specify whether a given ``nn.Module`` is a "leaf" module.
@@ -346,6 +362,7 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> boo
         """
         return m.__module__.startswith('torch.nn') and not isinstance(m, torch.nn.Sequential)
 
+    @compatibility(is_backward_compatible=True)
     def path_of_module(self, mod : torch.nn.Module) -> str:
         """
         Helper method to find the qualified name of ``mod`` in the Module hierarchy
@@ -372,6 +389,7 @@ def path_of_module(self, mod : torch.nn.Module) -> str:
                     return n
             raise NameError('module is not installed as a submodule')
 
+    @compatibility(is_backward_compatible=True)
     def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args : Tuple[Any, ...], kwargs : Dict[str, Any]) -> Any:
         """
         Method that specifies the behavior of this ``Tracer`` when it encounters
@@ -404,6 +422,8 @@ def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args : Tu
             return forward(*args, **kwargs)
         return self.create_proxy('call_module', module_qualified_name, args, kwargs)
 
+    # This method will be refactored
+    @compatibility(is_backward_compatible=False)
     def create_args_for_root(self, root_fn, is_module, concrete_args=None):
         """
         Create ``placeholder`` nodes corresponding to the signature of the ``root``
@@ -509,8 +529,8 @@ def _module_getattr(self, attr, attr_val, parameter_proxy_cache):
 
         return attr_val
 
-
-    def trace(self, root: Union[torch.nn.Module, Callable], concrete_args: Optional[Dict[str, Any]] = None) -> Graph:
+    @compatibility(is_backward_compatible=True)
+    def trace(self, root: Union[torch.nn.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None) -> Graph:
         """
         Trace ``root`` and return the corresponding FX ``Graph`` representation. ``root``
         can either be an ``nn.Module`` instance or a Python callable.
@@ -524,8 +544,11 @@ def trace(self, root: Union[torch.nn.Module, Callable], concrete_args: Optional[
         Args:
 
             root (Union[Module, Callable]): Either a ``Module`` or a function to be
-                traced through.
-            concrete_args (Optional[Dict[str, any]]): Concrete arguments that should not be treated as Proxies.
+                traced through. Backwards-compatibility for this parameter is
+                guaranteed.
+            concrete_args (Optional[Dict[str, any]]): Concrete arguments that should
+                not be treated as Proxies. This parameter is experimental and
+                its backwards-compatibility is *NOT* guaranteed.
 
         Returns:
 
@@ -772,6 +795,7 @@ def _autowrap_check(patcher : _Patcher, frame_dict : Dict[str, Any], function_id
                 patcher.patch(frame_dict, name, _create_wrapped_func(value))
 
 
+@compatibility(is_backward_compatible=True)
 def wrap(fn_or_name : Union[str, Callable]):
     """
     This function can be called at module-level scope to register fn_or_name as a "leaf function".
@@ -828,9 +852,11 @@ def my_custom_function(x, y):
     _wrapped_fns_to_patch.append((f.f_globals, fn_name))
     return fn_or_name
 
-def symbolic_trace(root : Union[torch.nn.Module, Callable], concrete_args: Optional[Dict[str, Any]] = None,
+@compatibility(is_backward_compatible=True)
+def symbolic_trace(root : Union[torch.nn.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None,
                    enable_cpatching: bool = False) -> GraphModule:
-    """Symbolic tracing API
+    """
+    Symbolic tracing API
 
     Given an ``nn.Module`` or function instance ``root``, this function will return a ``GraphModule``
     constructed by recording operations seen while tracing through ``root``.
@@ -876,7 +902,6 @@ def f(x):
 
     Returns:
         GraphModule: a Module created from the recorded operations from ``root``.
-
     """
     tracer = Tracer(enable_cpatching=enable_cpatching)
     graph = tracer.trace(root, concrete_args)
diff --git a/torch/fx/annotate.py b/torch/fx/annotate.py
index 6e0646a58ec52..032ce14b6ec70 100644
--- a/torch/fx/annotate.py
+++ b/torch/fx/annotate.py
@@ -1,6 +1,7 @@
 from torch.fx.proxy import Proxy
+from ._compatibility import compatibility
 
-
+@compatibility(is_backward_compatible=False)
 def annotate(val, type):
     # val could be either a regular value (not tracing)
     # or fx.Proxy (tracing)
diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py
index 692ca6304910f..1b4b4690f732d 100644
--- a/torch/fx/experimental/fx_acc/acc_ops.py
+++ b/torch/fx/experimental/fx_acc/acc_ops.py
@@ -10,7 +10,7 @@
     register_acc_op_mapping,
     register_custom_acc_mapper_fn,
 )
-from torch.fx.passes.shape_prop import extract_tensor_metadata
+from torch.fx.passes.shape_prop import _extract_tensor_metadata
 
 this_arg_is_optional = True
 
@@ -1134,12 +1134,12 @@ def packed_quantized_linear_mapper(
     with node.graph.inserting_before(node):
         # Insert get_attr nodes for weight and bias
         get_weight = node.graph.get_attr(weight_name)
-        get_weight.meta["tensor_meta"] = extract_tensor_metadata(linear_module.weight())
+        get_weight.meta["tensor_meta"] = _extract_tensor_metadata(linear_module.weight())
 
         get_bias = None
         if linear_module.bias() is not None:
             get_bias = node.graph.get_attr(bias_name)
-            get_bias.meta["tensor_meta"] = extract_tensor_metadata(linear_module.bias())
+            get_bias.meta["tensor_meta"] = _extract_tensor_metadata(linear_module.bias())
 
         # Create kwargs for acc_op.quantized_linear
         kwargs = {
@@ -1182,12 +1182,12 @@ def packed_quantized_conv2d_mapper(
     with node.graph.inserting_before(node):
         # Insert get_attr nodes for weight and bias
         get_weight = node.graph.get_attr(weight_name)
-        get_weight.meta["tensor_meta"] = extract_tensor_metadata(conv_module.weight())
+        get_weight.meta["tensor_meta"] = _extract_tensor_metadata(conv_module.weight())
 
         get_bias = None
         if conv_module.bias() is not None:
             get_bias = node.graph.get_attr(bias_name)
-            get_bias.meta["tensor_meta"] = extract_tensor_metadata(conv_module.bias())
+            get_bias.meta["tensor_meta"] = _extract_tensor_metadata(conv_module.bias())
 
         # Create kwargs for acc_op.conv
         kwargs = {
diff --git a/torch/fx/graph.py b/torch/fx/graph.py
index 1ee6f05f79809..29ffc416715a7 100644
--- a/torch/fx/graph.py
+++ b/torch/fx/graph.py
@@ -1,6 +1,7 @@
 from .node import Node, Argument, Target, map_arg, _type_repr, _get_qualified_name
 import torch.utils._pytree as pytree
 from . import _pytree as fx_pytree
+from ._compatibility import compatibility
 
 from typing import TYPE_CHECKING, Callable, Any, List, Dict, NamedTuple, Optional, Tuple, Set, FrozenSet, Type
 from dataclasses import dataclass
@@ -175,9 +176,12 @@ def _is_illegal_name(self, name: str, obj: Any) -> bool:
         return False
 
 
+@compatibility(is_backward_compatible=True)
 @dataclass
 class PythonCode:
-    """Represents all the information necessary to exec or save a graph as Python code."""
+    """
+    Represents all the information necessary to exec or save a graph as Python code.
+    """
     # Python source code for the forward function definition.
     src: str
     # Values in global scope during exection of `src_def`.
@@ -240,6 +244,7 @@ class _PyTreeInfo(NamedTuple):
     in_spec: pytree.TreeSpec
     out_spec: Optional[pytree.TreeSpec]
 
+@compatibility(is_backward_compatible=True)
 class Graph:
     """
     ``Graph`` is the main data structure used in the FX Intermediate Representation.
@@ -283,6 +288,8 @@ def forward(self, x):
 
     For the semantics of operations represented in the ``Graph``, please see :class:`Node`.
     """
+
+    @compatibility(is_backward_compatible=True)
     def __init__(self, owning_module: Optional["GraphModule"] = None, tracer_cls: Optional[Type["Tracer"]] = None):
         """
         Construct an empty Graph.
@@ -299,6 +306,11 @@ def __init__(self, owning_module: Optional["GraphModule"] = None, tracer_cls: Op
 
     @property
     def owning_module(self):
+        """
+        Return the module that owns this ``GraphModule``, if there is one,
+        ``None`` if there is no owning module or if there are multiple owning
+        modules.
+        """
         return self._owning_module
 
     @owning_module.setter
@@ -322,6 +334,7 @@ def nodes(self) -> _node_list:
         """
         return _node_list(self)
 
+    @compatibility(is_backward_compatible=True)
     def graph_copy(self, g : 'Graph', val_map : Dict[Node, Node], return_output_node=False) -> 'Optional[Argument]':
         """
         Copy all nodes from a given graph into ``self``.
@@ -354,7 +367,7 @@ def __deepcopy__(self, memo=None) -> 'Graph':
         from the default implementation. This uses graph_copy to copy the nodes
         in an iterative way, rather than recursive. It also populates the
         memoization table to prevent unnecessary copies (e.g. references to
-        nodes or other parts of the Graph from a custom GraphModule implementation
+        nodes or other parts of the Graph from a custom GraphModule implementation.
         """
         memo = memo if memo else {}
         g = Graph(tracer_cls=self._tracer_cls)
@@ -364,6 +377,7 @@ def __deepcopy__(self, memo=None) -> 'Graph':
         g.output(output_val, type_expr=getattr(old_output_val, 'type', None))
         return g
 
+    @compatibility(is_backward_compatible=True)
     def create_node(self, op: str, target: 'Target',
                     args: Optional[Tuple['Argument', ...]] = None,
                     kwargs: Optional[Dict[str, 'Argument']] = None,
@@ -410,10 +424,12 @@ def create_node(self, op: str, target: 'Target',
         self._len += 1
         return n
 
+    @compatibility(is_backward_compatible=False)
     def flatten_inps(self, *args):
         flat_args, args_spec = pytree.tree_flatten(args)
         return flat_args
 
+    @compatibility(is_backward_compatible=False)
     def unflatten_outs(self, out):
         if self._pytree_info is None:
             return out
@@ -422,6 +438,7 @@ def unflatten_outs(self, out):
         assert(self._pytree_info.out_spec is not None)
         return pytree.tree_unflatten(out, self._pytree_info.out_spec)
 
+    @compatibility(is_backward_compatible=True)
     def erase_node(self, to_erase : Node) -> None:
         """
         Erases a ``Node`` from the ``Graph``. Throws an exception if
@@ -448,6 +465,7 @@ def erase_node(self, to_erase : Node) -> None:
         assert isinstance(new_kwargs, dict)
         to_erase.kwargs = new_kwargs
 
+    @compatibility(is_backward_compatible=True)
     def inserting_before(self, n: Optional[Node] = None):
         """Set the point at which create_node and companion methods will insert into the graph.
         When used within a 'with' statement, this will temporary set the insert point and
@@ -470,6 +488,7 @@ def inserting_before(self, n: Optional[Node] = None):
         assert n.graph == self, "Node to insert before is not in graph."
         return _InsertPoint(self, n.prepend)
 
+    @compatibility(is_backward_compatible=True)
     def inserting_after(self, n: Optional[Node] = None):
         """Set the point at which create_node and companion methods will insert into the graph.
         When used within a 'with' statement, this will temporary set the insert point and
@@ -492,7 +511,7 @@ def inserting_after(self, n: Optional[Node] = None):
         assert n.graph == self, "Node to insert after is not in graph."
         return _InsertPoint(self, n.append)
 
-    # sugar for create_node when you know the op
+    @compatibility(is_backward_compatible=True)
     def placeholder(self, name: str, type_expr: Optional[Any] = None) -> Node:
         """
         Insert a ``placeholder`` node into the Graph. A ``placeholder`` represents
@@ -514,6 +533,7 @@ def placeholder(self, name: str, type_expr: Optional[Any] = None) -> Node:
         """
         return self.create_node('placeholder', name, type_expr=type_expr)
 
+    @compatibility(is_backward_compatible=True)
     def get_attr(self, qualified_name: str, type_expr: Optional[Any] = None) -> Node:
         """
         Insert a ``get_attr`` node into the Graph. A ``get_attr`` ``Node`` represents the
@@ -571,6 +591,7 @@ def _get_attr_reference_exists(mod: torch.nn.Module, qualified_name: str) -> boo
                           "necessary buffer")
         return self.create_node('get_attr', qualified_name, type_expr=type_expr)
 
+    @compatibility(is_backward_compatible=True)
     def call_module(self,
                     module_name: str,
                     args: Optional[Tuple['Argument', ...]] = None,
@@ -615,6 +636,7 @@ def call_module(self,
                           "necessary submodule")
         return self.create_node('call_module', module_name, args, kwargs, type_expr=type_expr)
 
+    @compatibility(is_backward_compatible=True)
     def call_method(self,
                     method_name: str,
                     args: Optional[Tuple['Argument', ...]] = None,
@@ -649,6 +671,7 @@ def call_method(self,
         """
         return self.create_node('call_method', method_name, args, kwargs, type_expr=type_expr)
 
+    @compatibility(is_backward_compatible=True)
     def call_function(self,
                       the_function: Callable[..., Any],
                       args: Optional[Tuple['Argument', ...]] = None,
@@ -684,6 +707,7 @@ def call_function(self,
         """
         return self.create_node('call_function', the_function, args, kwargs, type_expr=type_expr)
 
+    @compatibility(is_backward_compatible=True)
     def node_copy(self, node: Node, arg_transform: Callable[[Node], 'Argument'] = lambda x: x) -> Node:
         """
         Copy a node from one graph into another. ``arg_transform`` needs to transform arguments from
@@ -714,6 +738,7 @@ def node_copy(self, node: Node, arg_transform: Callable[[Node], 'Argument'] = la
         result_node.meta = copy.copy(node.meta)
         return result_node
 
+    @compatibility(is_backward_compatible=True)
     def output(self, result: 'Argument', type_expr: Optional[Any] = None):
         """
         Insert an ``output`` ``Node`` into the ``Graph``. An ``output`` node represents
@@ -745,6 +770,7 @@ def _target_to_str(self, target : Target) -> str:
         op = _snake_case(op)
         return op
 
+    @compatibility(is_backward_compatible=True)
     def python_code(self, root_module: str) -> PythonCode:
         """
         Turn this ``Graph`` into valid Python code.
@@ -995,7 +1021,7 @@ def forward({', '.join(orig_args)}){maybe_return_annotation[0]}:
 
     def __str__(self) -> str:
         """
-        Print a human-readable (not machine-readable) string representation
+        Return a human-readable (not machine-readable) string representation
         of this Graph
         """
         placeholder_names : List[str] = []
@@ -1011,10 +1037,12 @@ def __str__(self) -> str:
                 s += '\n    ' + node_str
         return s
 
+    @compatibility(is_backward_compatible=True)
     def print_tabular(self):
         """
         Prints the intermediate representation of the graph in tabular
-        format.
+        format. Note that this API requires the ``tabulate`` module to be
+        installed.
         """
         try:
             from tabulate import tabulate
@@ -1027,6 +1055,7 @@ def print_tabular(self):
         print(tabulate(node_specs,
               headers=['opcode', 'name', 'target', 'args', 'kwargs']))
 
+    @compatibility(is_backward_compatible=True)
     def lint(self):
         """
         Runs various checks on this Graph to make sure it is well-formed. In
@@ -1097,6 +1126,7 @@ def check_arg(arg : Node, n : Optional[Node] = None) -> None:
                         else:
                             m_itr = new_m_itr
 
+    @compatibility(is_backward_compatible=True)
     def eliminate_dead_code(self):
         """
         Remove all dead code from the graph, based on each node's number of
@@ -1124,7 +1154,6 @@ def forward(self, x):
 
             def forward(self, x):
                 return x + self.attr_1
-
         """
         # Lint the graph first to make sure its topologically sorted, otherwise
         # DCE below will not behave as expected.
diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
index c91857342ffcd..e7750db9353bd 100644
--- a/torch/fx/graph_module.py
+++ b/torch/fx/graph_module.py
@@ -6,6 +6,7 @@
 import linecache
 from typing import Type, Dict, List, Any, Union, Optional, Set
 from .graph import Graph, _is_from_torch, _custom_builtins, PythonCode
+from ._compatibility import compatibility
 from torch.package import Importer, sys_importer
 import copy
 import itertools
@@ -17,9 +18,9 @@
 
 # Normal exec loses the source code, however we can work with
 # the linecache module to recover it.
-# Using exec_with_source will add it to our local cache
+# Using _exec_with_source will add it to our local cache
 # and then tools like TorchScript will be able to get source info.
-class EvalCacheLoader(object):
+class _EvalCacheLoader(object):
     def __init__(self):
         self.eval_cache = {}
         self.next_id = 0
@@ -62,10 +63,10 @@ def _get_key(self):
         self.next_id += 1
         return key
 
-_loader = EvalCacheLoader()
+_loader = _EvalCacheLoader()
 
 
-def exec_with_source(src: str, globals: Dict[str, Any]):
+def _exec_with_source(src: str, globals: Dict[str, Any]):
     key = _loader.cache(src, globals)
     exec(compile(src, key, 'exec'), globals)
 
@@ -73,7 +74,7 @@ def exec_with_source(src: str, globals: Dict[str, Any]):
 def _forward_from_src(src: str, globals: Dict[str, Any]):
     # avoid mutating the passed in dict
     globals_copy = globals.copy()
-    exec_with_source(src, globals_copy)
+    _exec_with_source(src, globals_copy)
     forward_fn = globals_copy['forward']
     del globals_copy['forward']
     return forward_fn
@@ -95,7 +96,7 @@ def _format_import_block(globals: Dict[str, Any], importer: Importer):
     return '\n'.join(import_strs)
 
 
-def reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Module:
+def _reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Module:
     # BC: attribute name was changed from `code` to `_code` to facilitate
     # making `code` into a property and adding a docstring to it
     fn_src = body.get('_code') or body['code']
@@ -103,14 +104,14 @@ def reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Mod
     return _deserialize_graph_module(forward, body)
 
 
-def reduce_package_graph_module(
+def _reduce_package_graph_module(
     importer: PackageImporter, body: Dict[Any, Any], generated_module_name: str
 ) -> torch.nn.Module:
     forward = importer.import_module(generated_module_name).forward
     return _deserialize_graph_module(forward, body)
 
 
-def reduce_deploy_graph_module(
+def _reduce_deploy_graph_module(
     importer: PackageImporter, body: Dict[Any, Any], import_block: str
 ) -> torch.nn.Module:
     ns = dict()
@@ -219,6 +220,7 @@ def _assign_attr(from_obj: Any, to_module: torch.nn.Module, target: str):
     else:
         setattr(to_module, field, from_obj)
 
+@compatibility(is_backward_compatible=True)
 class GraphModule(torch.nn.Module):
     """
     GraphModule is an nn.Module generated from an fx.Graph. Graphmodule has a
@@ -231,7 +233,6 @@ class GraphModule(torch.nn.Module):
         regenerated. However, if you edit the contents of the ``graph`` without reassigning
         the ``graph`` attribute itself, you must call ``recompile()`` to update the generated
         code.
-
     """
     def __new__(cls: 'Type[GraphModule]', *args, **kwargs):
         # each instance of a graph module needs its own forward method
@@ -243,6 +244,7 @@ class GraphModuleImpl(cls):  # type: ignore[misc, valid-type]
             pass
         return super().__new__(GraphModuleImpl)
 
+    @compatibility(is_backward_compatible=True)
     def __init__(self,
                  root: Union[torch.nn.Module, Dict[str, Any]],
                  graph: Graph,
@@ -266,7 +268,6 @@ def __init__(self,
             class_name (str): ``name`` denotes the name of this GraphModule for debugging purposes. If it's unset, all
                 error messages will report as originating from ``GraphModule``. It may be helpful to set this
                 to ``root``'s original name or a name that makes sense within the context of your transform.
-
         """
         super().__init__()
         self.__class__.__name__ = class_name
@@ -334,6 +335,7 @@ def graph(self, g : Graph) -> None:
         g.owning_module = self
         self.recompile()
 
+    @compatibility(is_backward_compatible=False)
     def to_folder(self, folder: Union[str, os.PathLike], module_name : str = "FxModule"):
         """Dumps out module to ``folder`` with ``module_name`` so that it can be
         imported with ``from <folder> import <module_name>``
@@ -398,6 +400,7 @@ def _gen_model_repr(module_name: str, module: torch.nn.Module) -> Optional[str]:
             warnings.warn("Was not able to save the following children modules as reprs -"
                           f"saved as pickled files instead: {blobified_modules}")
 
+    @compatibility(is_backward_compatible=True)
     def add_submodule(self, target: str, m: torch.nn.Module) -> bool:
         """
         Adds the given submodule to ``self``.
@@ -418,7 +421,6 @@ def add_submodule(self, target: str, m: torch.nn.Module) -> bool:
                 denoted by ``target`` must either a) not exist yet,
                 or b) reference an ``nn.Module`` (not a parameter or
                 other attribute)
-
         """
         *prefix, field = target.split('.')
         mod: torch.nn.Module = self
@@ -439,6 +441,7 @@ def add_submodule(self, target: str, m: torch.nn.Module) -> bool:
         mod.add_module(field, m)
         return True
 
+    @compatibility(is_backward_compatible=True)
     def delete_submodule(self, target: str) -> bool:
         """
         Deletes the given submodule from ``self``.
@@ -481,6 +484,7 @@ def delete_submodule(self, target: str) -> bool:
         delattr(mod, target_submod)
         return True
 
+    @compatibility(is_backward_compatible=True)
     def delete_all_unused_submodules(self) -> None:
         """
         Deletes all unused submodules from ``self``.
@@ -535,6 +539,7 @@ def code(self) -> str:
             raise RuntimeError('Code has not been generated! Please report a bug to PyTorch')
         return self._code
 
+    @compatibility(is_backward_compatible=True)
     def recompile(self) -> PythonCode:
         """
         Recompile this GraphModule from its ``graph`` attribute. This should be
@@ -613,7 +618,7 @@ def __reduce_deploy__(self, importer: Importer):
 
         python_code = self.recompile()
         import_block = _format_import_block(python_code.globals, importer)
-        return (reduce_deploy_graph_module, (dict_without_graph, import_block))
+        return (_reduce_deploy_graph_module, (dict_without_graph, import_block))
 
     def __reduce_package__(self, exporter: PackageExporter):
         dict_without_graph = self.__dict__.copy()
@@ -625,7 +630,7 @@ def __reduce_package__(self, exporter: PackageExporter):
         import_block = _format_import_block(python_code.globals, exporter.importer)
         module_code = import_block + self.code
         exporter.save_source_string(generated_module_name, module_code)
-        return (reduce_package_graph_module, (dict_without_graph, generated_module_name))
+        return (_reduce_package_graph_module, (dict_without_graph, generated_module_name))
 
     def __reduce__(self):
         """
@@ -639,7 +644,7 @@ def __reduce__(self):
         python_code = self.recompile()
         import_block = _format_import_block(python_code.globals, sys_importer)
         del dict_without_graph['_graph']
-        return (reduce_graph_module, (dict_without_graph, import_block))
+        return (_reduce_graph_module, (dict_without_graph, import_block))
 
     # because __reduce__ is defined for serialization,
     # we need to define deepcopy otherwise it will call __reduce__
diff --git a/torch/fx/immutable_collections.py b/torch/fx/immutable_collections.py
index 459c30e745dfd..1093a07c8d229 100644
--- a/torch/fx/immutable_collections.py
+++ b/torch/fx/immutable_collections.py
@@ -1,3 +1,4 @@
+from ._compatibility import compatibility
 
 _help_mutation = """\
 If you are attempting to modify the kwargs or args of a torch.fx.Node object,
@@ -20,5 +21,8 @@ def _create_immutable_container(base, mutable_functions):
                                               'clear', 'extend', 'insert', 'pop', 'remove'])
 immutable_list.__reduce__ = lambda self: (immutable_list, (tuple(iter(self)),))
 
+compatibility(is_backward_compatible=True)(immutable_list)
+
 immutable_dict = _create_immutable_container(dict, ['__delitem__', '__setitem__', 'clear', 'pop', 'popitem', 'update'])
 immutable_dict.__reduce__ = lambda self: (immutable_dict, (iter(self.items()),))
+compatibility(is_backward_compatible=True)(immutable_dict)
diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index 20dcf62e0c3cb..64233b4cf18b6 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -3,8 +3,10 @@
 from .node import Argument, Node, Target, map_arg, map_aggregate
 from .proxy import Proxy
 from ._symbolic_trace import Tracer
+from ._compatibility import compatibility
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
+@compatibility(is_backward_compatible=True)
 class Interpreter:
     """
     An Interpreter executes an FX graph Node-by-Node. This pattern
@@ -59,6 +61,7 @@ def fn(x):
             execution. This can be disabled to, for example, examine all of the intermediate
             values in the execution by looking at the ``Interpreter.env`` attribute.
     """
+    @compatibility(is_backward_compatible=True)
     def __init__(self, module : GraphModule, garbage_collect_values : bool = True):
         assert isinstance(module, GraphModule)
         self.module = module
@@ -84,6 +87,7 @@ def register_last_uses(n : Node, user : Node):
                 map_arg(node.args, lambda n: register_last_uses(n, node))
                 map_arg(node.kwargs, lambda n: register_last_uses(n, node))
 
+    @compatibility(is_backward_compatible=True)
     def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None) -> Any:
         """
         Run `module` via interpretation and return the result.
@@ -123,6 +127,7 @@ def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None) -> Any:
                 output_val = self.env[node]
                 return output_val
 
+    @compatibility(is_backward_compatible=True)
     def run_node(self, n : Node) -> Any:
         """
         Run a specific node ``n`` and return the result.
@@ -142,7 +147,7 @@ def run_node(self, n : Node) -> Any:
         return getattr(self, n.op)(n.target, args, kwargs)
 
     # Main Node running APIs
-
+    @compatibility(is_backward_compatible=True)
     def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
         """
         Execute a ``placeholder`` node. Note that this is stateful:
@@ -168,6 +173,7 @@ def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : D
         else:
             return next(self.args_iter)
 
+    @compatibility(is_backward_compatible=True)
     def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
         """
         Execute a ``get_attr`` node. Will retrieve an attribute
@@ -186,6 +192,7 @@ def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict
         assert isinstance(target, str)
         return self.fetch_attr(target)
 
+    @compatibility(is_backward_compatible=True)
     def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
         """
         Execute a ``call_function`` node and return the result.
@@ -205,6 +212,7 @@ def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs :
         # Execute the function and return the result
         return target(*args, **kwargs)
 
+    @compatibility(is_backward_compatible=True)
     def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
         """
         Execute a ``call_method`` node and return the result.
@@ -226,6 +234,7 @@ def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : D
         assert isinstance(target, str)
         return getattr(self_obj, target)(*args_tail, **kwargs)
 
+    @compatibility(is_backward_compatible=True)
     def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
         """
         Execute a ``call_module`` node and return the result.
@@ -248,6 +257,7 @@ def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : D
 
         return submod(*args, **kwargs)
 
+    @compatibility(is_backward_compatible=True)
     def output(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
         """
         Execute an ``output`` node. This really just retrieves
@@ -266,7 +276,7 @@ def output(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[s
         return args[0]
 
     # Helper methods
-
+    @compatibility(is_backward_compatible=True)
     def fetch_attr(self, target : str):
         """
         Fetch an attribute from the ``Module`` hierarchy of ``self.module``.
@@ -285,6 +295,7 @@ def fetch_attr(self, target : str):
             attr_itr = getattr(attr_itr, atom)
         return attr_itr
 
+    @compatibility(is_backward_compatible=True)
     def fetch_args_kwargs_from_env(self, n : Node) -> Tuple[Tuple, Dict]:
         """
         Fetch the concrete values of ``args`` and ``kwargs`` of node ``n``
@@ -302,6 +313,7 @@ def fetch_args_kwargs_from_env(self, n : Node) -> Tuple[Tuple, Dict]:
         assert isinstance(kwargs, dict)
         return args, kwargs
 
+    @compatibility(is_backward_compatible=True)
     def map_nodes_to_values(self, args : Argument, n : Node) -> Argument:
         """
         Recursively descend through ``args`` and look up the concrete value
@@ -319,6 +331,7 @@ def load_arg(n_arg : Node) -> Any:
             return self.env[n_arg]
         return map_arg(args, load_arg)
 
+@compatibility(is_backward_compatible=True)
 class Transformer(Interpreter):
     """
     ``Transformer`` is a special type of interpreter that produces a
@@ -357,6 +370,8 @@ def fn(x):
     Args:
         module (GraphModule): The ``Module`` to be transformed.
     """
+
+    @compatibility(is_backward_compatible=True)
     def __init__(self, module):
         super().__init__(module)
         self.new_graph = Graph()
@@ -371,6 +386,7 @@ def is_leaf_module(self, _, __) -> bool:
         self.tracer = TransformerTracer(self.new_graph)
         self.tracer.root = module
 
+    @compatibility(is_backward_compatible=True)
     def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy:
         """
         Execute a ``placeholder`` node. In ``Transformer``, this is
@@ -387,6 +403,7 @@ def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : D
         assert isinstance(target, str)
         return Proxy(self.new_graph.placeholder(target), self.tracer)
 
+    @compatibility(is_backward_compatible=True)
     def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy:
         """
         Execute a ``get_attr`` node. In ``Transformer``, this is
@@ -403,16 +420,19 @@ def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict
         assert isinstance(target, str)
         return Proxy(self.new_graph.get_attr(target), self.tracer)
 
+    @compatibility(is_backward_compatible=True)
     def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
         # Override so that the leaf module policy from `self.tracer` is respected.
         assert isinstance(target, str)
         submod = self.fetch_attr(target)
         return self.tracer.call_module(submod, submod.forward, args, kwargs)
 
+    @compatibility(is_backward_compatible=True)
     def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
         # Override so that functions that were wrapped are still wrapped.
         return self.tracer.create_proxy('call_function', target, args, kwargs)
 
+    @compatibility(is_backward_compatible=True)
     def transform(self) -> GraphModule:
         """
         Transform ``self.module`` and return the transformed
diff --git a/torch/fx/node.py b/torch/fx/node.py
index 8c4faf7d4fa27..61dfba7acb03f 100644
--- a/torch/fx/node.py
+++ b/torch/fx/node.py
@@ -1,5 +1,6 @@
 # Nodes represent a definition of a value in our graph of operators.
 from typing import TYPE_CHECKING, Union, Callable, Any, Tuple, List, Optional, Dict, Set
+from ._compatibility import compatibility
 from .immutable_collections import immutable_dict, immutable_list
 import torch
 import builtins
@@ -85,6 +86,7 @@ def _format_arg(arg) -> str:
     else:
         return str(arg)
 
+@compatibility(is_backward_compatible=True)
 class Node:
     """
     ``Node`` is the data structure that represents individual operations within
@@ -112,9 +114,37 @@ class Node:
     - ``output`` contains the output of the traced function in its ``args[0]`` attribute. This corresponds to the "return" statement
       in the Graph printout.
     """
+
+    @compatibility(is_backward_compatible=True)
     def __init__(self, graph: 'Graph', name: str, op: str, target: 'Target',
                  args: Tuple['Argument', ...], kwargs: Dict[str, 'Argument'],
                  return_type : Optional[Any] = None) -> None:
+        """
+        Instantiate an instance of ``Node``. Note: most often, you want to use the
+        Graph APIs, i.e. ``Graph.call_module``, ``Graph.call_method``, etc. rather
+        than instantiating a ``Node`` directly.
+
+        Args:
+            graph (Graph): The ``Graph`` to which this ``Node`` should belong.
+
+            name (str): The name to which the output of this ``Node`` should be assigned
+
+            op (str): The opcode for this ``Node``. Can be one of 'placeholder',
+                'call_method', 'call_module', 'call_function', 'get_attr',
+                'output'
+
+            target ('Target'): The target this op should call. See the broader
+                ``Node`` docstring for more details.
+
+            args (Tuple['Argument']): The args to be passed to ``target``
+
+            kwargs (Dict[str, 'Argument']): The kwargs to be passed to ``target``
+
+            return_type (Optional[Any]): The python type expression representing the
+                type of the output of this node. This field can be used for
+                annotation of values in the generated code or for other types
+                of analyses.
+        """
         self.graph = graph
         self.name = name  # unique name of value being created
         assert op in ['placeholder', 'call_method', 'call_module', 'call_function', 'get_attr', 'output', 'root']
@@ -187,6 +217,7 @@ def prev(self) -> 'Node':
         """
         return self._prev
 
+    @compatibility(is_backward_compatible=True)
     def prepend(self, x: 'Node') -> None:
         """
         Insert x before this node in the list of nodes in the graph. Example::
@@ -205,6 +236,7 @@ def prepend(self, x: 'Node') -> None:
         p._next, x._prev = x, p
         x._next, self._prev = self, x
 
+    @compatibility(is_backward_compatible=True)
     def append(self, x: 'Node') -> None:
         """
         Insert x after this node in the list of nodes in the graph.
@@ -279,6 +311,7 @@ def all_input_nodes(self) -> List['Node']:
         """
         return list(self._input_nodes.keys())
 
+    @compatibility(is_backward_compatible=True)
     def update_arg(self, idx : int, arg : Argument) -> None:
         """
         Update an existing positional argument to contain the new value
@@ -293,6 +326,7 @@ def update_arg(self, idx : int, arg : Argument) -> None:
         args[idx] = arg
         self.args = tuple(args)
 
+    @compatibility(is_backward_compatible=True)
     def update_kwarg(self, key : str, arg : Argument) -> None:
         """
         Update an existing keyword argument to contain the new value
@@ -365,6 +399,7 @@ def _pretty_print_target(self, target):
                 return f'operator.{target.__name__}'
         return _get_qualified_name(target)
 
+    @compatibility(is_backward_compatible=True)
     def format_node(self,
                     placeholder_names: List[str] = None,
                     maybe_return_typename: List[str] = None) -> Optional[str]:
@@ -420,6 +455,7 @@ def format_node(self,
                    f'{self.op}[target={self._pretty_print_target(self.target)}](' \
                    f'args = {_format_arg(self.args)}, kwargs = {_format_arg(self.kwargs)})'
 
+    @compatibility(is_backward_compatible=True)
     def replace_all_uses_with(self, replace_with : 'Node') -> List['Node']:
         """
         Replace all uses of ``self`` in the Graph with the Node ``replace_with``.
@@ -449,6 +485,7 @@ def maybe_replace_node(n : Node) -> Node:
         assert len(self.users) == 0
         return to_process
 
+    @compatibility(is_backward_compatible=False)
     def is_impure(self):
         """
         Returns whether this op is impure, i.e. if its op is a placeholder or
@@ -478,6 +515,7 @@ def is_impure(self):
 
         return False
 
+    @compatibility(is_backward_compatible=False)
     def normalized_arguments(
             self, root : torch.nn.Module, arg_types : Optional[Tuple[Any]] = None,
             kwarg_types : Optional[Dict[str, Any]] = None,
@@ -513,7 +551,7 @@ def normalized_arguments(
 
         return None
 
-
+    @compatibility(is_backward_compatible=True)
     def replace_input_with(self, old_input: 'Node', new_input: 'Node'):
         """
         Loop through input nodes of ``self``, and replace all instances of
@@ -523,7 +561,6 @@ def replace_input_with(self, old_input: 'Node', new_input: 'Node'):
 
             old_input (Node): The old input node to be replaced.
             new_input (Node): The new input node to replace ``old_input``.
-
         """
         def maybe_replace_node(n : Node) -> Node:
             return new_input if n == old_input else n
@@ -535,13 +572,19 @@ def maybe_replace_node(n : Node) -> Node:
         self.__update_args_kwargs(new_args, new_kwargs)
 
 
+@compatibility(is_backward_compatible=True)
 def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument:
-    """ Apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys. """
+    """
+    Apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys.
+    """
     assert callable(fn), "torch.fx.map_arg(a, fn): fn must be a callable"
     return map_aggregate(a, lambda x: fn(x) if isinstance(x, Node) else x)
 
+@compatibility(is_backward_compatible=True)
 def map_aggregate(a: Argument, fn: Callable[[Argument], Argument]) -> Argument:
-    """ Apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys. """
+    """
+    Apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys.
+    """
     if isinstance(a, tuple):
         return tuple(map_aggregate(elem, fn) for elem in a)
     elif isinstance(a, list):
diff --git a/torch/fx/operator_schemas.py b/torch/fx/operator_schemas.py
index 5f61ebe718ff1..ac559b19530c7 100644
--- a/torch/fx/operator_schemas.py
+++ b/torch/fx/operator_schemas.py
@@ -6,7 +6,9 @@
 import warnings
 from typing import Any, Callable, Dict, List, Optional, Tuple, NamedTuple, cast
 from torch._jit_internal import boolean_dispatched
+from ._compatibility import compatibility
 
+@compatibility(is_backward_compatible=False)
 class ArgsKwargsPair(NamedTuple):
     """
     Simple named tuple for wrapping args/kwargs pairs.
@@ -76,6 +78,7 @@ def _torchscript_schema_to_signature(ts_schema : torch._C.FunctionSchema) -> ins
 
     return inspect.Signature(parameters, return_annotation=return_type)
 
+@compatibility(is_backward_compatible=False)
 def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature]]:
     """
     Given an operator on the `torch` namespace, return a list of `inspect.Signature`
@@ -103,6 +106,7 @@ def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature
 
     return signatures
 
+@compatibility(is_backward_compatible=False)
 def create_type_hint(x):
     try:
         if isinstance(x, list) or isinstance(x, tuple):
@@ -130,6 +134,7 @@ def ret_type(x):
         pass
     return x
 
+@compatibility(is_backward_compatible=False)
 def type_matches(signature_type : Any, argument_type : Any):
     sig_origin_type = getattr(signature_type, '__origin__', signature_type)
 
@@ -177,6 +182,7 @@ def is_homogeneous_tuple(t):
 
     return False
 
+@compatibility(is_backward_compatible=False)
 def normalize_function(
         target: Callable, args: Tuple[Any], kwargs : Optional[Dict[str, Any]] = None, arg_types : Optional[Tuple[Any]] = None,
         kwarg_types : Optional[Dict[str, Any]] = None,
@@ -272,6 +278,7 @@ def normalize_function(
 
     return new_args_and_kwargs
 
+@compatibility(is_backward_compatible=False)
 def normalize_module(
         root: torch.nn.Module, target: str, args: Tuple[Any], kwargs : Optional[Dict[str, Any]] = None,
         normalize_to_only_use_kwargs : bool = False) -> Optional[ArgsKwargsPair]:
diff --git a/torch/fx/passes/shape_prop.py b/torch/fx/passes/shape_prop.py
index 6f0f72d38c75f..816fbe7aaac6c 100644
--- a/torch/fx/passes/shape_prop.py
+++ b/torch/fx/passes/shape_prop.py
@@ -2,7 +2,9 @@
 import torch.fx
 from torch.fx.node import Node, map_aggregate
 from typing import Any, Tuple, NamedTuple, Optional
+from torch.fx._compatibility import compatibility
 
+@compatibility(is_backward_compatible=True)
 class TensorMetadata(NamedTuple):
     # TensorMetadata is a structure containing pertinent information
     # about a tensor within a PyTorch program.
@@ -20,7 +22,7 @@ class TensorMetadata(NamedTuple):
     q_scale : Optional[float]
     q_zero_point : Optional[int]
 
-def extract_tensor_metadata(result : torch.Tensor) -> TensorMetadata:
+def _extract_tensor_metadata(result : torch.Tensor) -> TensorMetadata:
     """
     Extract a TensorMetadata NamedTuple describing `result`.
     """
@@ -58,7 +60,7 @@ def extract_tensor_metadata(result : torch.Tensor) -> TensorMetadata:
     return TensorMetadata(
         shape, dtype, requires_grad, stride, memory_format, is_quantized, qscheme, q_scale, q_zero_point)
 
-
+@compatibility(is_backward_compatible=True)
 class ShapeProp(torch.fx.Interpreter):
     """
     Execute an FX graph Node-by-Node and
@@ -113,7 +115,7 @@ def extract_tensor_meta(obj):
             if isinstance(obj, torch.Tensor):
                 nonlocal found_tensor
                 found_tensor = True
-                return extract_tensor_metadata(obj)
+                return _extract_tensor_metadata(obj)
             else:
                 return obj
 
diff --git a/torch/fx/passes/split_module.py b/torch/fx/passes/split_module.py
index 989ec92777cc3..c42af7e9c2d9b 100644
--- a/torch/fx/passes/split_module.py
+++ b/torch/fx/passes/split_module.py
@@ -1,7 +1,9 @@
 import torch
 from torch.fx.graph_module import GraphModule
 from typing import Callable, List, Dict, Any, Optional
+from torch.fx._compatibility import compatibility
 
+@compatibility(is_backward_compatible=True)
 class Partition:
     def __init__(self, name: str):
         self.name: str = name
@@ -23,6 +25,7 @@ def __repr__(self) -> str:
             f" parition dependents: {self.partition_dependents}"
 
 # Creates subgraphs out of main graph
+@compatibility(is_backward_compatible=True)
 def split_module(
     m: GraphModule,
     root_m: torch.nn.Module,
diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py
index c0b83bc5c3734..61b039f8b7219 100644
--- a/torch/fx/proxy.py
+++ b/torch/fx/proxy.py
@@ -7,11 +7,14 @@
 from .graph import magic_methods, reflectable_magic_methods, Graph
 from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable
 from .node import Target, Node, Argument, base_types, map_aggregate
+from ._compatibility import compatibility
 
+@compatibility(is_backward_compatible=True)
 class TracerBase:
     graph: Graph
     record_stack_traces : bool = False
 
+    @compatibility(is_backward_compatible=True)
     def create_node(self, kind : str, target : Target,
                     args : Tuple[Argument, ...], kwargs : Dict[str, Argument], name : Optional[str] = None,
                     type_expr : Optional[Any] = None) -> Node:
@@ -24,11 +27,11 @@ def create_node(self, kind : str, target : Target,
         """
         return self.graph.create_node(kind, target, args, kwargs, name, type_expr)
 
+    @compatibility(is_backward_compatible=True)
     def proxy(self, node: Node) -> 'Proxy':
         return Proxy(node, self)
 
-
-
+    @compatibility(is_backward_compatible=True)
     def create_proxy(self, kind: str, target: Target, args: Tuple[Any, ...], kwargs: Dict[str, Any],
                      name: Optional[str] = None, type_expr : Optional[Any] = None,
                      proxy_factory_fn: Callable[[Node], 'Proxy'] = None):
@@ -86,6 +89,7 @@ def _find_user_frame(self):
 
         return frame
 
+    @compatibility(is_backward_compatible=True)
     def create_arg(self, a: Any) -> Argument:
         """
         A method that lowers the objects seen as arguments during symbolic evaluation
@@ -131,6 +135,7 @@ def no_node(arg):
 
         raise NotImplementedError(f"argument of type: {type(a)}")
 
+    @compatibility(is_backward_compatible=True)
     def to_bool(self, obj: 'Proxy') -> bool:
         """Called when a proxy object is being converted to a boolean, such as
         when used in control flow.  Normally we don't know what to do because
@@ -139,6 +144,7 @@ def to_bool(self, obj: 'Proxy') -> bool:
         """
         raise TraceError('symbolically traced variables cannot be used as inputs to control flow')
 
+    @compatibility(is_backward_compatible=True)
     def iter(self, obj: 'Proxy') -> Iterator:
         """Called when a proxy object is being iterated over, such as
         when used in control flow.  Normally we don't know what to do because
@@ -154,6 +160,7 @@ def iter(self, obj: 'Proxy') -> Iterator:
                          ' Proxy docstring for help troubleshooting '
                          'Proxy iteration errors')
 
+    @compatibility(is_backward_compatible=True)
     def keys(self, obj: 'Proxy') -> Any:
         """Called when a proxy object is has the keys() method called.
         This is what happens when ** is called on a proxy. This should return an
@@ -163,15 +170,17 @@ def keys(self, obj: 'Proxy') -> Any:
 
 
 # used in Proxy object when just appending to the graph while not tracing.
+@compatibility(is_backward_compatible=True)
 class GraphAppendingTracer(TracerBase):
     def __init__(self, graph: Graph):
         super().__init__()
         self.graph = graph
 
+@compatibility(is_backward_compatible=True)
 class TraceError(ValueError):
     pass
 
-
+@compatibility(is_backward_compatible=True)
 class Proxy:
     """
     ``Proxy`` objects are ``Node`` wrappers that flow through the
@@ -200,6 +209,8 @@ class Proxy:
     For a more detailed description into the Proxy internals, check out
     the "Proxy" section in `torch/fx/OVERVIEW.md`
     """
+
+    @compatibility(is_backward_compatible=True)
     def __init__(self, node: Node, tracer: 'Optional[TracerBase]' = None):
         if tracer is None:
             # This allows you to create a Proxy object around a raw Node
@@ -232,6 +243,7 @@ def __iter__(self) -> Iterable['Proxy']:
     def __bool__(self) -> bool:
         return self.tracer.to_bool(self)
 
+    @compatibility(is_backward_compatible=True)
     def keys(self):
         return self.tracer.keys(self)
 
@@ -253,7 +265,9 @@ def __torch_function__(self, orig_method, types, args=None, kwargs=None):
             return self.tracer.create_proxy('call_function', orig_method, args, kwargs,
                                             name=self.tracer.graph._target_to_str(orig_method.__name__))
 
+@compatibility(is_backward_compatible=True)
 class Attribute(Proxy):
+    @compatibility(is_backward_compatible=True)
     def __init__(self, root: Proxy, attr: str):
         self.root = root
         self.attr = attr
@@ -272,9 +286,10 @@ def __call__(self, *args, **kwargs):
         return self.tracer.create_proxy('call_method', self.attr, (self.root,) + args, kwargs)
 
 
+@compatibility(is_backward_compatible=False)
 class ParameterProxy(Proxy):
     """
-    a special proxy which lets "shape", "size", "dim", and a few other
+    A special proxy which lets "shape", "size", "dim", and a few other
     attribute accesses pass through to the underlying  module parameter object,
     so that conditional tests on these attributes will not throw exception during tracing
     """
@@ -309,7 +324,7 @@ def nelement(self):
 
 
 for method in magic_methods:
-    def scope(method):
+    def _scope(method):
         def impl(*args, **kwargs):
             tracer = args[0].tracer
             target = getattr(operator, method)
@@ -317,7 +332,7 @@ def impl(*args, **kwargs):
         impl.__name__ = method
         as_magic = f'__{method}__'
         setattr(Proxy, as_magic, impl)
-    scope(method)
+    _scope(method)
 
 def _define_reflectable(orig_method_name):
     method_name = f'__r{orig_method_name}__'
diff --git a/torch/fx/subgraph_rewriter.py b/torch/fx/subgraph_rewriter.py
index e779f6ca9e6b1..72ea56aa31196 100644
--- a/torch/fx/subgraph_rewriter.py
+++ b/torch/fx/subgraph_rewriter.py
@@ -2,22 +2,24 @@
 from .graph import Graph
 from .node import Node
 from ._symbolic_trace import symbolic_trace
+from ._compatibility import compatibility
 
 import copy
 from typing import Callable, Dict, List, NamedTuple, Optional, Set
 import torch
 
+@compatibility(is_backward_compatible=True)
 class Match(NamedTuple):
     # Node from which the match was found
     anchor: Node
     # Maps nodes in the pattern subgraph to nodes in the larger graph
     nodes_map: Dict[Node, Node]
 
-class SubgraphMatcher:
+class _SubgraphMatcher:
     def __init__(self, pattern: Graph) -> None:
         self.pattern = pattern
         if len(pattern.nodes) == 0:
-            raise ValueError("SubgraphMatcher cannot be initialized with an "
+            raise ValueError("_SubgraphMatcher cannot be initialized with an "
                              "empty pattern")
         # `self.pattern_anchor` is the output Node in `pattern`
         self.pattern_anchor = next(iter(reversed(pattern.nodes)))
@@ -129,6 +131,7 @@ def try_get_submodule(mod: torch.nn.Module, target: str) -> Optional[torch.nn.Mo
 
     gm.graph.lint()
 
+@compatibility(is_backward_compatible=True)
 def replace_pattern(gm: GraphModule, pattern: Callable, replacement: Callable) -> List[Match]:
     """
     Matches all possible non-overlapping sets of operators and their
@@ -242,7 +245,6 @@ def forward(self, x, w1, w2):
             max_2 = torch.max(sum_2)
             add_2 = add_1 + max_2
             return add_2
-
     """
     # Get the graphs for `gm`, `pattern`, `replacement`
     original_graph = gm.graph
@@ -251,7 +253,7 @@ def forward(self, x, w1, w2):
 
     # Find all possible pattern matches in original_graph. Note that
     # pattern matches may overlap with each other.
-    matcher = SubgraphMatcher(pattern_graph)
+    matcher = _SubgraphMatcher(pattern_graph)
     matches: List[Match] = []
 
     # Consider each node as an "anchor" (deepest matching graph node)
diff --git a/torch/fx/tensor_type.py b/torch/fx/tensor_type.py
index 18387ee3c78f7..0840122a9b168 100644
--- a/torch/fx/tensor_type.py
+++ b/torch/fx/tensor_type.py
@@ -1,6 +1,9 @@
 from torch.fx.experimental.unification import Var  # type: ignore[attr-defined]
 
+from ._compatibility import compatibility
 
+
+@compatibility(is_backward_compatible=False)
 class TensorType:
     """
     TensorType defines a type for tensors, which consists of a list of dimensions.
@@ -48,7 +51,7 @@ def __repr__(self):
 
 Dyn = _DynType()
 
-
+@compatibility(is_backward_compatible=False)
 def is_consistent(t1, t2):
     """
     A binary relation denoted by ~ that determines if t1 is consistent with t2.
@@ -74,6 +77,7 @@ def is_consistent(t1, t2):
         return False
 
 
+@compatibility(is_backward_compatible=False)
 def is_more_precise(t1, t2):
     """
     A binary relation denoted by <= that determines if t1 is more precise than t2.
diff --git a/torch/quantization/ns/graph_passes.py b/torch/quantization/ns/graph_passes.py
index 36e737e3baf4b..51eb6c24ef3fb 100644
--- a/torch/quantization/ns/graph_passes.py
+++ b/torch/quantization/ns/graph_passes.py
@@ -361,6 +361,7 @@ def _insert_copy_of_subgraph_a_after_input_node_c(
     if isinstance(input_node_c, Node):
         graph_c = input_node_c.graph
     else:
+        assert isinstance(input_node_c, list)
         graph_c = input_node_c[0].graph
 
     # create a sequential list of the subgraphs' nodes from start to end,
@@ -450,6 +451,7 @@ def _insert_copy_of_node_a_after_input_node_c(
     if isinstance(input_node_c, Node):
         graph_c = input_node_c.graph
     else:
+        assert isinstance(input_node_c, list)
         graph_c = input_node_c[0].graph
 
     # generically handle all args and kwargs except for the input

From e7fb35021aca4b29bbf6f7120c5bff1400179175 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Mon, 30 Aug 2021 20:08:15 -0700
Subject: [PATCH 380/530] [nnc] Enable fusion of bfloat16 ops (#64196)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64196

Test Plan: Imported from OSS

Reviewed By: ZolotukhinM

Differential Revision: D30643864

Pulled By: bertmaher

fbshipit-source-id: e95edeaf7089464d713ea1d1f951743d3e5f61c5
---
 test/test_jit_fuser_te.py                     | 30 ++++++-------
 torch/csrc/jit/passes/tensorexpr_fuser.cpp    |  7 ++--
 torch/csrc/jit/tensorexpr/block_codegen.cpp   |  2 +
 torch/csrc/jit/tensorexpr/codegen.cpp         |  2 +-
 torch/csrc/jit/tensorexpr/codegen.h           |  4 +-
 torch/csrc/jit/tensorexpr/cpp_codegen.cpp     |  2 +-
 torch/csrc/jit/tensorexpr/cuda_codegen.cpp    | 31 +++++++-------
 torch/csrc/jit/tensorexpr/eval.cpp            | 29 ++++++++-----
 torch/csrc/jit/tensorexpr/eval.h              | 16 +++----
 torch/csrc/jit/tensorexpr/expr.cpp            |  2 +-
 torch/csrc/jit/tensorexpr/expr.h              |  2 +-
 torch/csrc/jit/tensorexpr/fwd_decls.h         |  2 +-
 torch/csrc/jit/tensorexpr/half_support.h      | 42 +++++++++++++++----
 torch/csrc/jit/tensorexpr/hash_provider.h     | 10 ++++-
 torch/csrc/jit/tensorexpr/ir.cpp              |  2 +-
 torch/csrc/jit/tensorexpr/ir.h                | 10 ++---
 torch/csrc/jit/tensorexpr/ir_cloner.cpp       |  2 +-
 torch/csrc/jit/tensorexpr/ir_cloner.h         |  2 +-
 torch/csrc/jit/tensorexpr/ir_mutator.cpp      |  2 +-
 torch/csrc/jit/tensorexpr/ir_mutator.h        |  2 +-
 torch/csrc/jit/tensorexpr/ir_printer.cpp      |  2 +-
 torch/csrc/jit/tensorexpr/ir_printer.h        |  2 +-
 torch/csrc/jit/tensorexpr/ir_simplifier.h     |  2 +-
 torch/csrc/jit/tensorexpr/ir_visitor.cpp      |  2 +-
 torch/csrc/jit/tensorexpr/ir_visitor.h        |  2 +-
 torch/csrc/jit/tensorexpr/kernel.cpp          |  4 +-
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp    |  6 ++-
 torch/csrc/jit/tensorexpr/reduction.h         |  4 +-
 torch/csrc/jit/tensorexpr/tensorexpr_init.cpp |  4 +-
 torch/csrc/jit/tensorexpr/types.cpp           |  8 ++--
 torch/csrc/jit/tensorexpr/types.h             |  4 +-
 31 files changed, 147 insertions(+), 94 deletions(-)

diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index 918cc702d83d6..a6cc085b27c70 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -97,6 +97,7 @@ def setUp(self):
             torch.float16,
             torch.float32,
             torch.float64,
+            torch.bfloat16,
         ]
         self.dtypes = self.int_dtypes + self.fp_dtypes
 
@@ -1145,7 +1146,7 @@ def forward(self, x):
         bad_dtypes = []
         for dtype, output_dtype, device, size in product(dtypes, dtypes, self.devices, sizes):
             # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             if dtype == output_dtype:
                 continue
@@ -1210,7 +1211,7 @@ def test_isnan(self):
 
         for inp, device, dtype in product(inputs, self.devices, dtypes):
             # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             inp = inp.to(device=device, dtype=dtype)
             try:
@@ -1263,7 +1264,8 @@ def apply(fn):
             torch.round,
             torch.trunc,
             torch.frac,
-            F.hardshrink,
+            # TODO: broken on ROCm?
+            # F.hardshrink,
             F.leaky_relu,
             lambda x: torch.threshold(x, 0, -10),
             lambda x: torch.clamp(x, -10, 10),
@@ -1272,7 +1274,7 @@ def apply(fn):
         sizes = [(1,), (2,), (4, 4)]
         for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes):
             # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             if op in gpu_only and device == "cpu":
                 continue
@@ -1325,7 +1327,7 @@ def apply(fn):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, binary_ops, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device)
@@ -1377,7 +1379,7 @@ def fn(x, y):
                                      "[[10, 3, 4], [4, 5]]",
                                      ]
         for dtype, size, device in product(self.dtypes, sizes, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 size_x, size_y = size
@@ -1423,7 +1425,7 @@ def apply_with_scalar(fn, scalar):
         # only using  scalar values relevant to particular ops
         scalars = [1.5, 3, 0, -2.0, -1]
         for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device)
@@ -1457,7 +1459,7 @@ def apply_with_scalar(fn, scalar):
         # only using  scalar values relevant to particular ops
         scalars = [1.5, 3, -2.0, -1]  # skip 0
         for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device)
@@ -1494,7 +1496,7 @@ def apply_with_scalar(fn, scalar):
         # only using  scalar values relevant to particular ops
         scalars = [1.5, 3, 0, -2.0, -1]
         for dtype, op, device, scalar in product(dtypes, binary_ops, self.devices, scalars):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device)
@@ -1524,7 +1526,7 @@ def apply(fn):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, ternary_ops, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device)
@@ -1555,7 +1557,7 @@ def apply(fn):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, ternary_ops, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device, size=[5, 3, 128, 128])
@@ -1588,7 +1590,7 @@ def apply(fn):
             torch.cat,
         ]
         for dtype, op, device in product(self.dtypes, list_ops, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 x = self.data_for(dtype, device, size=[5, 4, 1, 7])
@@ -1621,7 +1623,7 @@ def apply(fn):
         ]
         devices = self.devices
         for dtype, op, device in product(self.dtypes, ops, devices):
-            if dtype == torch.float16 and device == "cpu":
+            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
                 continue
             try:
                 cond = self.data_for(torch.bool, device)
@@ -1650,7 +1652,6 @@ def fn(x):
 
             unsupported_dtypes = [
                 torch.uint8,
-                torch.bfloat16,
                 torch.complex32,
                 torch.complex64,
                 torch.complex128,
@@ -1791,6 +1792,7 @@ def eager(x, y):
             dtypes = self.dtypes.copy()
             # CPU fuser doesn't support float16.
             dtypes.remove(torch.float16)
+            dtypes.remove(torch.bfloat16)
             for dtype1, dtype2 in product(dtypes, dtypes):
                 x = torch.randint(2, (1, 13,)).to(dtype1)
                 zero = torch.tensor([[0]]).to(dtype2)
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index a3e37072a032d..75305d63e072f 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -966,7 +966,9 @@ class TensorExprFuser {
         // but on top of that Float16 has a few kinks on LLVM.  Thus, on CPU we
         // additionally disable it until we either move to a more stable version
         // or find workarounds.
-        if (*st == c10::ScalarType::Half && *device == c10::kCPU) {
+        if ((*st == c10::ScalarType::Half ||
+             *st == c10::ScalarType::BFloat16) &&
+            *device == c10::kCPU) {
           return false;
         }
 
@@ -1098,8 +1100,7 @@ class TensorExprFuser {
           // All tensor types should be known.
           return false;
         }
-        if (c10::isComplexType(*st) || c10::isQIntType(*st) ||
-            *st == c10::ScalarType::BFloat16) {
+        if (c10::isComplexType(*st) || c10::isQIntType(*st)) {
           return false;
         }
       }
diff --git a/torch/csrc/jit/tensorexpr/block_codegen.cpp b/torch/csrc/jit/tensorexpr/block_codegen.cpp
index 51b7b77f6d39d..b42d37428208b 100644
--- a/torch/csrc/jit/tensorexpr/block_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/block_codegen.cpp
@@ -16,6 +16,8 @@ std::string blockDtypeCppString(const Dtype& dtype) {
       return "1";
     case ScalarType::Half:
       return "2";
+    case ScalarType::BFloat16:
+      return "2";
     // NOLINTNEXTLINE(bugprone-branch-clone)
     case ScalarType::Char:
       return "1";
diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp
index 0bbc3378b0323..b2b077b9771d1 100644
--- a/torch/csrc/jit/tensorexpr/codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/codegen.cpp
@@ -67,7 +67,7 @@ void* CodeGen::argToPtr(const BufferArg& bufferArg, const CallArg& callArg) {
   case ScalarType::Name:    \
     return callArg.Name##Ptr();
 
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
 
     default:
diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h
index 29255aac07df2..0504f9a8b0b0b 100644
--- a/torch/csrc/jit/tensorexpr/codegen.h
+++ b/torch/csrc/jit/tensorexpr/codegen.h
@@ -153,7 +153,7 @@ class CodeGen::CallArg {
     memcpy(&data_, &v, sizeof(Type)); \
   }
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, ARG_TYPE_CTOR);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ARG_TYPE_CTOR);
 #undef ARG_TYPE_CTOR
 
   void* data() const {
@@ -165,7 +165,7 @@ class CodeGen::CallArg {
     return (Type*)&data_;          \
   }
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, ARG_PTR_DEFINE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ARG_PTR_DEFINE);
 #undef ARG_PTR_DEFINE
 
  private:
diff --git a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp
index 20795e43dd57b..6c02f7f7e09df 100644
--- a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp
@@ -149,7 +149,7 @@ void dispatch_binary_op(std::ostream& os, const BinaryOpNode<Op>* v) {
   case ScalarType::Name:                                           \
     visit_binary_op<Type>(os, v->lhs(), v->rhs(), v->expr_type()); \
     break;
-    AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       throw unsupported_dtype();
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index 30d42075189fb..c23eda31204de 100644
--- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -98,6 +98,8 @@ std::string CudaPrinter::dtypeToCppString(const Dtype& dtype) {
       return "bool";
     case ScalarType::Half:
       return "half";
+    case ScalarType::BFloat16:
+      return "__nv_bfloat16";
     case ScalarType::Char:
       return "char";
     case ScalarType::Byte:
@@ -251,20 +253,15 @@ void CudaPrinter::visit(ForPtr v) {
 }
 
 void CudaPrinter::visit(CastPtr v) {
-  if (v->dtype().scalar_type() == ScalarType::Half) {
-    os() << "__float2half(";
-    v->src_value()->accept(this);
-    os() << ")";
-    return;
-  } else if (v->src_value()->dtype().scalar_type() == ScalarType::Half) {
-    os() << "__half2float(";
-    v->src_value()->accept(this);
-    os() << ")";
-    return;
-  }
-
-  os() << "(" << dtypeToCppString(v->dtype()) << ")";
-  os() << "(";
+  std::string castFn = v->dtype().scalar_type() == ScalarType::Half
+      ? "__float2half"
+      : v->dtype().scalar_type() == ScalarType::BFloat16 ? "__float2bfloat16"
+      : v->src_value()->dtype().scalar_type() == ScalarType::Half
+      ? "__half2float"
+      : v->src_value()->dtype().scalar_type() == ScalarType::BFloat16
+      ? "__bfloat162float"
+      : ("(" + dtypeToCppString(v->dtype()) + ")");
+  os() << castFn << "(";
   v->src_value()->accept(this);
   os() << ")";
 }
@@ -320,7 +317,8 @@ void CudaPrinter::visit(LoadPtr v) {
     return;
   }
   if (v->dtype().scalar_type() == ScalarType::Bool ||
-      v->dtype().scalar_type() == ScalarType::Half) {
+      v->dtype().scalar_type() == ScalarType::Half ||
+      v->dtype().scalar_type() == ScalarType::BFloat16) {
     // There's no __ldg overload for bool or half.
     os() << *v->base_handle() << "[" << *v->flat_index() << "]";
     return;
@@ -944,6 +942,9 @@ void CudaCodeGen::Initialize() {
   if (halfChecker.hasHalf()) {
     os() << fuser::cuda::half_support_literal << std::endl;
   }
+  if (halfChecker.hasBFloat16()) {
+    os() << fuser::cuda::bfloat16_support_literal << std::endl;
+  }
 
   std::string func_name = GetUniqueFuncName(kernel_func_name());
   os() << "extern \"C\" __global__" << std::endl;
diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp
index e42ce77820e11..4582433d95697 100644
--- a/torch/csrc/jit/tensorexpr/eval.cpp
+++ b/torch/csrc/jit/tensorexpr/eval.cpp
@@ -62,6 +62,10 @@ inline c10::Half div_value(c10::Half lhs, c10::Half rhs) {
   return lhs / rhs;
 }
 
+inline c10::BFloat16 div_value(c10::BFloat16 lhs, c10::BFloat16 rhs) {
+  return lhs / rhs;
+}
+
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 class SimpleIREvaluatorImpl : public IRVisitor {
  public:
@@ -347,7 +351,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   case ScalarType::Name:                               \
     value_ = binary_op<Type>(lhs_v, rhs_v, expr_type); \
     break;
-      AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       case ScalarType::Bool:
         value_ = binary_op<unsigned char>(lhs_v, rhs_v, expr_type);
@@ -370,7 +374,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   case ScalarType::Name:                                                    \
     value = compare_select_op<T, Type>(lhs, rhs, retval1, retval2, cmp_op); \
     break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -402,7 +406,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
     value_ = compare_select_op_helper<Type>(           \
         lhs_v, rhs_v, ret_val1_v, ret_val2_v, cmp_op); \
     break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -413,7 +417,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   TORCH_API void visit(Name##ImmPtr v) override { \
     value_ = Value(v->value());                   \
   }
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT);
 #undef IMM_VISIT
 
   TORCH_API void visit(BlockPtr v) override {
@@ -464,7 +468,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   case ScalarType::Name:                                           \
     this->value_ = Value(castValues<SrcType, Type>(src_dtype, v)); \
     break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DST_TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DST_TYPE_CASE);
 #undef DST_TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -486,7 +490,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
   case ScalarType::Name:                               \
     doCastFromSrc<Type>(src_dtype, dst_dtype, value_); \
     break;
-        AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, SRC_TYPE_CASE);
+        AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, SRC_TYPE_CASE);
 #undef SRC_TYPE_CASE
         default:
           throw unsupported_dtype();
@@ -590,7 +594,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
     std::vector<Type> v(lanes, value.as<Type>()); \
     value_ = Value(v);                            \
   } break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -610,6 +614,9 @@ class SimpleIREvaluatorImpl : public IRVisitor {
 #undef TYPE_CASE
       case ScalarType::Half:
         throw unsupported_dtype("IfThenElse condition can't have Half dtype");
+      case ScalarType::BFloat16:
+        throw unsupported_dtype(
+            "IfThenElse condition can't have BFloat16 dtype");
       default:
         throw unsupported_dtype();
     }
@@ -660,7 +667,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
     }                                                \
     value_ = Value(v);                               \
   } break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -693,7 +700,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
       ptr##Name[index[i]] = value[i];                           \
     }                                                           \
   } break;
-      AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       default:
         throw unsupported_dtype();
@@ -801,6 +808,8 @@ class SimpleIREvaluatorImpl : public IRVisitor {
         visit_intrinsics_helper<int, double>(v);
       } else if (inp_dtype == ScalarType::Half) {
         throw unsupported_dtype(); // TODO
+      } else if (inp_dtype == ScalarType::BFloat16) {
+        throw unsupported_dtype(); // TODO
       }
     } else {
       switch (ty) {
@@ -1039,7 +1048,7 @@ void SimpleIREvaluator::bindArg(const BufferArg& bufArg, void* data) {
     impl_->bindVar(bufArg.var(), typed_data); \
     break;                                    \
   }
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       throw unsupported_dtype();
diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h
index 494ba283ea902..e11bb169484f6 100644
--- a/torch/csrc/jit/tensorexpr/eval.h
+++ b/torch/csrc/jit/tensorexpr/eval.h
@@ -36,7 +36,7 @@ class Value {
     Name##values.push_back(v); \
     return;                    \
   }
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     throw unsupported_dtype();
   }
@@ -46,14 +46,14 @@ class Value {
     Name##values.push_back(v);      \
   }
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_CTOR);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_CTOR);
 #undef VALUE_CTOR
 
 #define VALUE_VEC_CTOR(Type, Name)  \
   Value(const std::vector<Type>& v) \
       : dtype_(Dtype(k##Name, v.size())), Name##values(v) {}
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_VEC_CTOR);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_VEC_CTOR);
 #undef VALUE_VEC_CTOR
 
   template <typename T>
@@ -72,7 +72,7 @@ class Value {
   Dtype dtype_;
 
 #define VALUE_STORAGE(Type, Name) std::vector<Type> Name##values;
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_STORAGE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_STORAGE);
 #undef VALUE_STORAGE
   void* ptr;
 };
@@ -85,7 +85,7 @@ class Value {
     }                                   \
     return Name##values[0];             \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_AS_DISPATCH);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_AS_DISPATCH);
 #undef VALUE_AS_DISPATCH
 
 #define VALUE_AS_VEC_DISPATCH(Type, Name)                       \
@@ -96,7 +96,7 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_AS_DISPATCH);
     }                                                           \
     return Name##values;                                        \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, VALUE_AS_VEC_DISPATCH);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, VALUE_AS_VEC_DISPATCH);
 #undef VALUE_AS_VEC_DISPATCH
 
 template <typename To, typename From>
@@ -206,7 +206,7 @@ class ExprEval {
     ret_value_ = Value(ret_val_arg[0]);                 \
   } break;
       // NOLINTNEXTLINE(modernize-use-emplace)
-      AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       case ScalarType::Bool: {
         // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -231,7 +231,7 @@ class ExprEval {
     codegen_->call_raw(args_extended);           \
     ret_value_ = Value(ret_val_arg[0]);          \
   } break;
-      AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE);
+      AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
       case ScalarType::Bool: {
         // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
diff --git a/torch/csrc/jit/tensorexpr/expr.cpp b/torch/csrc/jit/tensorexpr/expr.cpp
index cbf5ddd9f1d6d..c757d4b0ca201 100644
--- a/torch/csrc/jit/tensorexpr/expr.cpp
+++ b/torch/csrc/jit/tensorexpr/expr.cpp
@@ -89,7 +89,7 @@ ExprHandle ExprHandle::operator>>(const ExprHandle& other) const {
 // NOLINTNEXTLINE
 #define IMM_EXPR_DECLARE(Type, Name) \
   ExprHandle::ExprHandle(Type v) : ExprHandle(Name##Imm::make(v)) {}
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_EXPR_DECLARE);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_EXPR_DECLARE);
 #undef IMM_EXPR_DECLARE
 
 ExprHandle sin(const ExprHandle& v) {
diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h
index 4947bfdc36be9..41ce99a085179 100644
--- a/torch/csrc/jit/tensorexpr/expr.h
+++ b/torch/csrc/jit/tensorexpr/expr.h
@@ -110,7 +110,7 @@ class TORCH_API ExprHandle {
   }
 
 #define IMM_EXPR_DECLARE(Type, Name) ExprHandle(Type v);
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_EXPR_DECLARE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_EXPR_DECLARE);
 #undef IMM_EXPR_DECLARE
 
   template <class Op>
diff --git a/torch/csrc/jit/tensorexpr/fwd_decls.h b/torch/csrc/jit/tensorexpr/fwd_decls.h
index 1b3dde560b427..119308b053442 100644
--- a/torch/csrc/jit/tensorexpr/fwd_decls.h
+++ b/torch/csrc/jit/tensorexpr/fwd_decls.h
@@ -113,7 +113,7 @@ using SyncThreadsPtr = NodePtr<SyncThreads>;
 #define IMM_DECLARE(Type, Name) \
   class Name##Imm;              \
   using Name##ImmPtr = NodePtr<Name##Imm>;
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_DECLARE);
 #undef IMM_DECLARE
 
 } // namespace tensorexpr
diff --git a/torch/csrc/jit/tensorexpr/half_support.h b/torch/csrc/jit/tensorexpr/half_support.h
index 674af8a764928..8ecf956d6d75b 100644
--- a/torch/csrc/jit/tensorexpr/half_support.h
+++ b/torch/csrc/jit/tensorexpr/half_support.h
@@ -18,17 +18,23 @@ class HalfChecker : public IRVisitor {
     }
   }
 
-  bool hasHalf() {
+  bool hasHalf() const {
     return hasHalf_;
   }
 
+  bool hasBFloat16() const {
+    return hasBFloat16_;
+  }
+
   void visit(LoadPtr v) override {
     hasHalf_ |= v->dtype().scalar_type() == ScalarType::Half;
+    hasBFloat16_ |= v->dtype().scalar_type() == ScalarType::BFloat16;
     IRVisitor::visit(v);
   }
 
   void visit(StorePtr v) override {
     hasHalf_ |= v->buf()->dtype().scalar_type() == ScalarType::Half;
+    hasBFloat16_ |= v->buf()->dtype().scalar_type() == ScalarType::BFloat16;
     IRVisitor::visit(v);
   }
 
@@ -36,20 +42,26 @@ class HalfChecker : public IRVisitor {
     hasHalf_ = true;
   }
 
+  void visit(BFloat16ImmPtr v) override {
+    hasBFloat16_ = true;
+  }
+
   void visit(CastPtr v) override {
     hasHalf_ |= v->dtype().scalar_type() == ScalarType::Half;
+    hasBFloat16_ |= v->dtype().scalar_type() == ScalarType::BFloat16;
     IRVisitor::visit(v);
   }
 
  private:
   bool hasHalf_{false};
+  bool hasBFloat16_{false};
 };
 
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 class HalfRewriter : public IRMutator {
   ExprPtr mutate(LoadPtr v) override {
     ExprPtr child = IRMutator::mutate(v);
-    if (child->dtype().scalar_type() != ScalarType::Half) {
+    if (!isHalf(child)) {
       return child;
     }
 
@@ -63,12 +75,11 @@ class HalfRewriter : public IRMutator {
   StmtPtr mutate(StorePtr v) override {
     // Since mutation changes the `value()` expression in-place, we need to
     // get the dtype of the `value()` before that is mutated.
-    Dtype newType = v->value()->dtype();
+    auto newType = v->value()->dtype();
     ExprPtr new_val = v->value()->accept_mutator(this);
 
-    if (newType.scalar_type() == ScalarType::Half) {
-      new_val =
-          alloc<Cast>(newType.cloneWithScalarType(ScalarType::Half), new_val);
+    if (isHalf(newType.scalar_type())) {
+      new_val = alloc<Cast>(newType, new_val);
       inserted_half_casts_.insert(new_val);
     }
 
@@ -80,11 +91,15 @@ class HalfRewriter : public IRMutator {
     return alloc<Cast>(kFloat, v);
   }
 
+  ExprPtr mutate(BFloat16ImmPtr v) override {
+    return alloc<Cast>(kFloat, v);
+  }
+
   ExprPtr mutate(CastPtr v) override {
     ExprPtr child = v->src_value()->accept_mutator(this);
 
     // just don't allow half casts we didn't insert.
-    if (v->dtype().scalar_type() == ScalarType::Half) {
+    if (isHalf(v)) {
       if (inserted_half_casts_.count(v) < 1) {
         return child;
       }
@@ -105,8 +120,9 @@ class HalfRewriter : public IRMutator {
 
     return alloc<Cast>(v->dtype(), child);
   }
+
   StmtPtr mutate(LetPtr v) override {
-    if (v->dtype().scalar_type() == ScalarType::Half) {
+    if (isHalf(v->dtype().scalar_type())) {
       VarPtr load_new_var = alloc<Var>(v->var()->name_hint(), kFloat);
       ExprPtr new_value = alloc<Cast>(
           v->dtype().cloneWithScalarType(ScalarType::Float),
@@ -131,7 +147,7 @@ class HalfRewriter : public IRMutator {
   template <typename T>
   ExprPtr mutateArithmetic(T v) {
     IRMutator::mutate(v);
-    if (v->dtype().scalar_type() == c10::kHalf) {
+    if (isHalf(v)) {
       v->set_dtype(v->dtype().cloneWithScalarType(c10::kFloat));
     }
     return v;
@@ -169,6 +185,14 @@ class HalfRewriter : public IRMutator {
   }
 
  private:
+  static bool isHalf(ScalarType st) {
+    return st == ScalarType::Half || st == ScalarType::BFloat16;
+  }
+
+  static bool isHalf(ExprPtr v) {
+    return isHalf(v->dtype().scalar_type());
+  }
+
   std::unordered_set<ExprPtr> inserted_half_casts_;
   std::unordered_map<VarPtr, VarPtr> var_map;
 };
diff --git a/torch/csrc/jit/tensorexpr/hash_provider.h b/torch/csrc/jit/tensorexpr/hash_provider.h
index 91ce269edeb5c..35d493a0025b4 100644
--- a/torch/csrc/jit/tensorexpr/hash_provider.h
+++ b/torch/csrc/jit/tensorexpr/hash_provider.h
@@ -92,7 +92,7 @@ class TORCH_API HashProvider : public IRVisitor {
     CACHE_GUARD();                               \
     putHash(v, hash_combine(#Name, v->value())); \
   }
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT);
 #undef IMM_VISIT
 
   void visit(CastPtr v) override;
@@ -287,6 +287,14 @@ class TORCH_API HashProvider : public IRVisitor {
     std::memcpy(&n, &d, sizeof d);
     return te_hash(n);
   }
+
+  size_t te_hash(at::BFloat16 d) {
+    // memcpy as type punning. Should be optimized out.
+    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+    int16_t n;
+    std::memcpy(&n, &d, sizeof d);
+    return te_hash(n);
+  }
 };
 
 } // namespace tensorexpr
diff --git a/torch/csrc/jit/tensorexpr/ir.cpp b/torch/csrc/jit/tensorexpr/ir.cpp
index 2680f5366b46e..439993c481903 100644
--- a/torch/csrc/jit/tensorexpr/ir.cpp
+++ b/torch/csrc/jit/tensorexpr/ir.cpp
@@ -231,7 +231,7 @@ bool immediateIsNegative(ExprPtr e) {
   if (Name##ImmPtr imm = to<Name##Imm>(e)) { \
     return imm->value() < 0;                 \
   }
-  AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE);
+  AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
   return false;
 }
diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h
index 1218082e6af98..65a362ef023fe 100644
--- a/torch/csrc/jit/tensorexpr/ir.h
+++ b/torch/csrc/jit/tensorexpr/ir.h
@@ -320,7 +320,7 @@ class Min : public BinaryOpNode<Min> {
    private:                                                   \
     Type value_;                                              \
   };
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_DECLARE);
 #undef IMM_DECLARE
 
 // Get immediate by ScalarType.
@@ -329,9 +329,9 @@ ExprPtr getImmediateByType(ScalarType immType, T initialVal) {
   switch (immType) {
 #define TYPE_CASE(Type, Name) \
   case ScalarType::Name:      \
-    return alloc<Name##Imm>(initialVal);
+    return alloc<Name##Imm>(Type(initialVal));
     // NOLINTNEXTLINE(bugprone-branch-clone)
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       throw unsupported_dtype();
@@ -374,7 +374,7 @@ T immediateAs(ExprPtr e) {
   if (Name##ImmPtr imm = to<Name##Imm>(e)) { \
     return imm->value();                     \
   }
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
   throw unsupported_dtype();
   return 0;
@@ -391,7 +391,7 @@ bool immediateEquals(ExprPtr e, T val) {
   if (Name##ImmPtr imm = to<Name##Imm>(e)) { \
     return imm->value() == val;              \
   }
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
   throw unsupported_dtype();
   return false;
diff --git a/torch/csrc/jit/tensorexpr/ir_cloner.cpp b/torch/csrc/jit/tensorexpr/ir_cloner.cpp
index e225826df66e2..1144833c7990e 100644
--- a/torch/csrc/jit/tensorexpr/ir_cloner.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_cloner.cpp
@@ -119,7 +119,7 @@ ExprPtr IRCloner::mutate(CompareSelectPtr v) {
   ExprPtr IRCloner::mutate(Name##ImmPtr v) { \
     return v;                                \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DEFINE);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DEFINE);
 #undef IMM_MUTATE_DEFINE
 
 ExprPtr IRCloner::mutate(CastPtr v) {
diff --git a/torch/csrc/jit/tensorexpr/ir_cloner.h b/torch/csrc/jit/tensorexpr/ir_cloner.h
index f03e12886eabe..5f516a02ffadb 100644
--- a/torch/csrc/jit/tensorexpr/ir_cloner.h
+++ b/torch/csrc/jit/tensorexpr/ir_cloner.h
@@ -26,7 +26,7 @@ class TORCH_API IRCloner : public IRMutator {
   ExprPtr mutate(RshiftPtr v) override;
   ExprPtr mutate(CompareSelectPtr v) override;
 #define IMM_MUTATE_DECLARE(Type, Name) ExprPtr mutate(Name##ImmPtr v) override;
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DECLARE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DECLARE);
 #undef IMM_MUTATE_DECLARE
   ExprPtr mutate(CastPtr v) override;
   ExprPtr mutate(BitCastPtr v) override;
diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp
index 45121581eebf0..71a40a134e0b5 100644
--- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp
@@ -115,7 +115,7 @@ ExprPtr IRMutator::mutate(CompareSelectPtr v) {
   ExprPtr IRMutator::mutate(Name##ImmPtr v) { \
     return v;                                 \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DEFINE);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DEFINE);
 #undef IMM_MUTATE_DEFINE
 
 ExprPtr IRMutator::mutate(CastPtr v) {
diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.h b/torch/csrc/jit/tensorexpr/ir_mutator.h
index fb6c420af46a0..0a96876606dfb 100644
--- a/torch/csrc/jit/tensorexpr/ir_mutator.h
+++ b/torch/csrc/jit/tensorexpr/ir_mutator.h
@@ -25,7 +25,7 @@ class TORCH_API IRMutator {
   virtual ExprPtr mutate(RshiftPtr v);
   virtual ExprPtr mutate(CompareSelectPtr v);
 #define IMM_MUTATE_DECLARE(Type, Name) virtual ExprPtr mutate(Name##ImmPtr v);
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DECLARE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_MUTATE_DECLARE);
 #undef IMM_MUTATE_DECLARE
   virtual ExprPtr mutate(CastPtr v);
   virtual ExprPtr mutate(BitCastPtr v);
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp
index ca90d9995e0d2..4a10c282e60b1 100644
--- a/torch/csrc/jit/tensorexpr/ir_printer.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp
@@ -226,7 +226,7 @@ static void formatImm(std::ostream& os, T v) {
   void IRPrinter::visit(Name##ImmPtr v) { \
     formatImm(os(), v->value());          \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT);
 #undef IMM_PRINT_VISIT
 
 void IRPrinter::visit(CastPtr v) {
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h
index 327119dcc74e6..fb357a8fb79fa 100644
--- a/torch/csrc/jit/tensorexpr/ir_printer.h
+++ b/torch/csrc/jit/tensorexpr/ir_printer.h
@@ -34,7 +34,7 @@ class TORCH_API IRPrinter : public IRVisitor {
   void visit(RshiftPtr v) override;
   void visit(CompareSelectPtr v) override;
 #define IMM_PRINT_VISIT(Type, Name) void visit(Name##ImmPtr v) override;
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT);
 #undef IMM_PRINT_VISIT
   void visit(CastPtr v) override;
   void visit(BitCastPtr v) override;
diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.h b/torch/csrc/jit/tensorexpr/ir_simplifier.h
index 1df8b5d8f3501..11d004f395ed1 100644
--- a/torch/csrc/jit/tensorexpr/ir_simplifier.h
+++ b/torch/csrc/jit/tensorexpr/ir_simplifier.h
@@ -97,7 +97,7 @@ inline ExprPtr evaluateOp(ExprPtr v) {
     Type val = eval.value<Type>();                            \
     return getImmediateByType(v->dtype().scalar_type(), val); \
   }
-    AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       LOG(FATAL) << "Unsupported datatype: " << v->dtype();
diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.cpp b/torch/csrc/jit/tensorexpr/ir_visitor.cpp
index eb2a4280c4f88..9489422b66ebe 100644
--- a/torch/csrc/jit/tensorexpr/ir_visitor.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_visitor.cpp
@@ -79,7 +79,7 @@ void IRVisitor::visit(CompareSelectPtr v) {
 // NOLINTNEXTLINE
 #define IMM_VISIT(Type, Name) \
   void IRVisitor::visit(Name##ImmPtr v) {}
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT);
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT);
 #undef IMM_VISIT
 
 void IRVisitor::visit(CastPtr v) {
diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.h b/torch/csrc/jit/tensorexpr/ir_visitor.h
index 001725f961619..e54786b2f9036 100644
--- a/torch/csrc/jit/tensorexpr/ir_visitor.h
+++ b/torch/csrc/jit/tensorexpr/ir_visitor.h
@@ -26,7 +26,7 @@ class TORCH_API IRVisitor {
 
 #define IMM_PRINT_VISIT(Type, Name) virtual void visit(Name##ImmPtr v);
 
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_PRINT_VISIT)
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT)
 #undef IMM_PRINT_VISIT
 
   virtual void visit(CastPtr v);
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index e4136d85c0a50..78cbb822bfbff 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -52,7 +52,7 @@ static ExprHandle promoteToDtype(ExprHandle e, ScalarType dt) {
   case ScalarType::Name:      \
     e = cast<Type>(e);        \
     break;
-    AT_FORALL_SCALAR_TYPES_AND2(Half, Bool, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       throw unsupported_dtype();
@@ -520,7 +520,7 @@ ExprHandle demoteOutput(
 #define TYPE_CASE(Type, Name) \
   case ScalarType::Name:      \
     return cast<Type>(e);
-    AT_FORALL_SCALAR_TYPES_AND(Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     case ScalarType::Bool:
       return cast<bool>(e);
diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index 026d52bfc938c..b9ea70806ee1a 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -231,7 +231,7 @@ class LLVMCodeGenImpl : public IRVisitor {
   void visit(CompareSelectPtr v) override;
 
 #define IMM_VISIT_DECLARE(_1, Name) void visit(Name##ImmPtr v) override;
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT_DECLARE);
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_VISIT_DECLARE);
 #undef IMM_VISIT_DECLARE
 
   void visit(CastPtr v) override;
@@ -902,6 +902,10 @@ void LLVMCodeGenImpl::visit(HalfImmPtr v) {
   value_ = llvm::ConstantFP::get(HalfTy_, v->value());
 }
 
+void LLVMCodeGenImpl::visit(BFloat16ImmPtr v) {
+  TORCH_INTERNAL_ASSERT(false, "llvm codegen does not support bfloat16");
+}
+
 void LLVMCodeGenImpl::visit(BoolImmPtr v) {
   value_ = llvm::ConstantInt::get(BoolTy_, v->value());
 }
diff --git a/torch/csrc/jit/tensorexpr/reduction.h b/torch/csrc/jit/tensorexpr/reduction.h
index 08aef01c7d310..22d90b9981b82 100644
--- a/torch/csrc/jit/tensorexpr/reduction.h
+++ b/torch/csrc/jit/tensorexpr/reduction.h
@@ -171,7 +171,7 @@ inline ExprHandle maximumVal(ScalarType type) {
 #define MAX_BY_TYPE_CASE(Type, Name) \
   case ScalarType::Name:             \
     return ExprHandle(std::numeric_limits<Type>::max());
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, MAX_BY_TYPE_CASE)
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, MAX_BY_TYPE_CASE)
 #undef MAX_BY_TYPE_CASE
     default:
       throw unsupported_dtype();
@@ -184,7 +184,7 @@ inline ExprHandle minimumVal(ScalarType type) {
 #define MAX_BY_TYPE_CASE(Type, Name) \
   case ScalarType::Name:             \
     return ExprHandle(std::numeric_limits<Type>::min());
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, MAX_BY_TYPE_CASE)
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, MAX_BY_TYPE_CASE)
 #undef MAX_BY_TYPE_CASE
     default:
       throw unsupported_dtype();
diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
index c7f48824303d4..c924bded3543c 100644
--- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
+++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
@@ -69,7 +69,7 @@ void initTensorExprBindings(PyObject* module) {
 #define DTYPE_SINGLETON_ACCESSOR(ctype, name) \
   dtype_class.def_property_readonly_static(   \
       #name, [](py::object) { return k##name; }); // NOLINT
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DTYPE_SINGLETON_ACCESSOR)
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DTYPE_SINGLETON_ACCESSOR)
 #undef DTYPE_SINGLETON_ACCESSOR
 
   auto expr_handle_class =
@@ -144,7 +144,7 @@ void initTensorExprBindings(PyObject* module) {
 
 #define EXPRHANDLE_CTOR(ctype, name) \
   expr_handle_class.def_static(#ctype, [](ctype v) { return ExprHandle(v); });
-  AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, EXPRHANDLE_CTOR)
+  AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, EXPRHANDLE_CTOR)
 #undef EXPRHANDLE_CTOR
 
   py::class_<VarHandle, ExprHandle>(te, "VarHandle")
diff --git a/torch/csrc/jit/tensorexpr/types.cpp b/torch/csrc/jit/tensorexpr/types.cpp
index 5cef86a2dfe26..e75ecd9744d61 100644
--- a/torch/csrc/jit/tensorexpr/types.cpp
+++ b/torch/csrc/jit/tensorexpr/types.cpp
@@ -16,7 +16,7 @@ Dtype Dtype::scalar_dtype() const {
 // NOLINTNEXTLINE
 #define DTYPE_DEFINE(_1, n) TORCH_API Dtype k##n(ScalarType::n, 1);
 
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DTYPE_DEFINE)
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DTYPE_DEFINE)
 
 #undef DTYPE_DEFINE
 
@@ -28,7 +28,7 @@ Dtype ToDtype(ScalarType type) {
 #define TYPE_CASE(_1, n) \
   case ScalarType::n:    \
     return k##n;
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE)
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE)
 #undef TYPE_CASE
 
     case ScalarType::Undefined:
@@ -56,7 +56,7 @@ int Dtype::byte_size() const {
     scalar_size = sizeof(Type); \
     break;
 
-    AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, TYPE_CASE);
+    AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TYPE_CASE);
 #undef TYPE_CASE
     default:
       throw std::runtime_error(
@@ -77,6 +77,8 @@ std::string Dtype::ToCppString() const {
       return "bool";
     case ScalarType::Half:
       return "half";
+    case ScalarType::BFloat16:
+      return "__nv_bfloat16";
     default:
       throw unsupported_dtype();
   }
diff --git a/torch/csrc/jit/tensorexpr/types.h b/torch/csrc/jit/tensorexpr/types.h
index 00cd50db288b3..3716a0a1cd559 100644
--- a/torch/csrc/jit/tensorexpr/types.h
+++ b/torch/csrc/jit/tensorexpr/types.h
@@ -75,7 +75,7 @@ extern TORCH_API Dtype kHandle;
 
 #define NNC_DTYPE_DECLARATION(ctype, name) extern TORCH_API Dtype k##name;
 
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, NNC_DTYPE_DECLARATION)
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, NNC_DTYPE_DECLARATION)
 #undef NNC_DTYPE_DECLARATION
 
 template <typename T>
@@ -86,7 +86,7 @@ TORCH_API Dtype ToDtype();
   inline Dtype ToDtype<ctype>() {            \
     return k##name;                          \
   }
-AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, NNC_TODTYPE_DECLARATION)
+AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, NNC_TODTYPE_DECLARATION)
 #undef NNC_TODTYPE_DECLARATION
 
 TORCH_API Dtype ToDtype(ScalarType type);

From 7ca4728e6dd4f3a706082dedf33c21771116070a Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 30 Aug 2021 20:17:12 -0700
Subject: [PATCH 381/530] Compile BatchLinearAlgebra without nvcc (#64146)

Summary:
These files only use cuda libraries interfaces, so don't actually need to be compiled with nvcc.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64146

Reviewed By: ezyang

Differential Revision: D30633189

Pulled By: ngimel

fbshipit-source-id: c9d0ae5259a10cb49332d31f0da89ad758736ea8
---
 ...inearAlgebra.cu => BatchLinearAlgebra.cpp} | 29 +++++++++----------
 ...lgebraLib.cu => BatchLinearAlgebraLib.cpp} | 18 ------------
 caffe2/CMakeLists.txt                         |  2 +-
 3 files changed, 15 insertions(+), 34 deletions(-)
 rename aten/src/ATen/native/cuda/{BatchLinearAlgebra.cu => BatchLinearAlgebra.cpp} (99%)
 rename aten/src/ATen/native/cuda/{BatchLinearAlgebraLib.cu => BatchLinearAlgebraLib.cpp} (98%)

diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp
similarity index 99%
rename from aten/src/ATen/native/cuda/BatchLinearAlgebra.cu
rename to aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp
index 4e806f000c5ae..7fdc55d818084 100644
--- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu
+++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp
@@ -1701,7 +1701,7 @@ static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper)
 #endif // USE_CUSOLVER
 }
 
-REGISTER_DISPATCH(cholesky_stub, &cholesky_kernel)
+REGISTER_CUDA_DISPATCH(cholesky_stub, &cholesky_kernel)
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cholesky_inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1773,7 +1773,7 @@ Tensor& cholesky_inverse_kernel_impl(Tensor &result, Tensor& infos, bool upper)
 
 }
 
-REGISTER_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl);
+REGISTER_CUDA_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl);
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1945,7 +1945,7 @@ static void apply_lu(const Tensor& input, const Tensor& pivots, const Tensor& in
   }
 }
 
-REGISTER_DISPATCH(lu_stub, &apply_lu);
+REGISTER_CUDA_DISPATCH(lu_stub, &apply_lu);
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangular_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -2039,7 +2039,7 @@ void triangular_solve_kernel(Tensor& A, Tensor& B, Tensor& infos, bool upper, bo
   }
 }
 
-REGISTER_DISPATCH(triangular_solve_stub, &triangular_solve_kernel);
+REGISTER_CUDA_DISPATCH(triangular_solve_stub, &triangular_solve_kernel);
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ orgqr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -2057,7 +2057,7 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau) {
   #endif
 }
 
-REGISTER_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
+REGISTER_CUDA_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
 
 void ormqr_kernel(const Tensor& input, const Tensor& tau, const Tensor& other, bool left, bool transpose) {
 #if defined(USE_CUSOLVER)
@@ -2069,7 +2069,7 @@ void ormqr_kernel(const Tensor& input, const Tensor& tau, const Tensor& other, b
 #endif
 }
 
-REGISTER_DISPATCH(ormqr_stub, &ormqr_kernel);
+REGISTER_CUDA_DISPATCH(ormqr_stub, &ormqr_kernel);
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -2148,7 +2148,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
   }
 }
 
-REGISTER_DISPATCH(geqrf_stub, &geqrf_kernel);
+REGISTER_CUDA_DISPATCH(geqrf_stub, &geqrf_kernel);
 
 template <typename scalar_t>
 static void apply_qr(Tensor& Q, Tensor& R, int64_t q_size_minus_2, int64_t r_size_minus_1, int64_t n_columns,
@@ -2423,7 +2423,7 @@ void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, c
 #endif
 }
 
-REGISTER_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel);
+REGISTER_CUDA_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel);
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -2513,7 +2513,7 @@ std::tuple<Tensor, Tensor> eig_kernel_impl(const Tensor& self, bool& eigenvector
   return std::tuple<Tensor, Tensor>(out_eigvals, out_eigvecs);
 }
 
-REGISTER_DISPATCH(eig_stub, &eig_kernel_impl);
+REGISTER_CUDA_DISPATCH(eig_stub, &eig_kernel_impl);
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -2599,7 +2599,7 @@ void linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& infos,
   });
 }
 
-REGISTER_DISPATCH(linalg_eig_stub, &linalg_eig_kernel);
+REGISTER_CUDA_DISPATCH(linalg_eig_stub, &linalg_eig_kernel);
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ svd ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -2671,8 +2671,7 @@ AT_ERROR("svd: MAGMA library not found in "
 
 std::tuple<Tensor, Tensor, Tensor> _svd_helper_cuda_legacy(const Tensor& self, bool some, bool compute_uv) {
   std::vector<int64_t> infos(batchCount(self), 0);
-  int64_t m = self.size(-2), n = self.size(-1);
-  int64_t k = std::min(m, n);
+  int64_t m = self.size(-2);
 
   char jobchar = compute_uv ? (some ? 'S' : 'A') : 'N';
 
@@ -2922,13 +2921,13 @@ static void lu_solve_trans_dispatch(const Tensor& b, const Tensor& lu, const Ten
   }
 }
 
-REGISTER_DISPATCH(lu_solve_trans_stub, &lu_solve_trans_dispatch);
+REGISTER_CUDA_DISPATCH(lu_solve_trans_stub, &lu_solve_trans_dispatch);
 
 static void lu_solve_dispatch(const Tensor& b, const Tensor& lu, const Tensor& pivots) {
   lu_solve_trans_dispatch(b, lu, pivots, 'N');
 }
 
-REGISTER_DISPATCH(lu_solve_stub, &lu_solve_dispatch);
+REGISTER_CUDA_DISPATCH(lu_solve_stub, &lu_solve_dispatch);
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -3112,7 +3111,7 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul
   }
 }
 
-REGISTER_DISPATCH(lstsq_stub, &lstsq_kernel);
+REGISTER_CUDA_DISPATCH(lstsq_stub, &lstsq_kernel);
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ legacy_lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp
similarity index 98%
rename from aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu
rename to aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp
index bb9af142955f0..13d67e571e7dc 100644
--- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu
+++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp
@@ -143,10 +143,6 @@ static void apply_triangular_solve_batched(Tensor& A, Tensor& B, bool upper, boo
   cublasDiagType_t diag = unitriangular ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
   cublasSideMode_t side = CUBLAS_SIDE_LEFT;
 
-  auto A_data = A.data_ptr<scalar_t>();
-  auto B_data = B.data_ptr<scalar_t>();
-  auto A_mat_stride = matrixStride(A);
-  auto B_mat_stride = matrixStride(B);
   auto batch_size = cuda_int_cast(batchCount(A), "batch_size");
   auto m = cuda_int_cast(A.size(-2), "m");
   auto n = cuda_int_cast(A.size(-1), "n");
@@ -329,8 +325,6 @@ Tensor& _linalg_inv_out_helper_cuda_lib(Tensor& result, Tensor& infos_getrf, Ten
   result.zero_();
   result.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(1);
 
-  const int batch_size = cuda_int_cast(batchCount(result), "batchCount");
-
   if (result.dim() > 2) {
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cuda", [&]{
       apply_batched_inverse_lib<scalar_t>(
@@ -435,10 +429,6 @@ inline static void _apply_svd_lib_gesvdjBatched(const Tensor& self, Tensor& U, T
   auto U_data = U.data_ptr<scalar_t>();
   auto S_data = S.data_ptr<value_t>();
   auto VT_data = VT.data_ptr<scalar_t>();
-  auto self_stride = matrixStride(self);
-  auto U_stride = matrixStride(U);
-  auto S_stride = S.size(-1);
-  auto VT_stride = matrixStride(VT);
 
   int batchsize = cuda_int_cast(batchCount(self), "batch size");
   int m = cuda_int_cast(self.size(-2), "m");
@@ -481,7 +471,6 @@ std::tuple<Tensor, Tensor, Tensor> _svd_helper_cuda_lib(const Tensor& self, bool
   at::Tensor infos = at::zeros({batch_size}, self.options().dtype(at::kInt));
   const int64_t m = self.size(-2);
   const int64_t n = self.size(-1);
-  const int64_t k = std::min(m, n);
 
   Tensor U_working_copy, S_working_copy, VT_working_copy;
   std::tie(U_working_copy, S_working_copy, VT_working_copy) = \
@@ -686,11 +675,7 @@ inline static void apply_cholesky_cusolver_potrsBatched(Tensor& self_working_cop
   const int64_t nrhs = self_working_copy.size(-1);
   const int64_t lda = std::max<int64_t>(1, n);
   const int64_t batch_size = batchCount(self_working_copy);
-  const int64_t self_matrix_stride = matrixStride(self_working_copy);
-  scalar_t* self_working_copy_ptr = self_working_copy.data_ptr<scalar_t>();
 
-  const scalar_t* A_ptr = A_column_major_copy.data_ptr<scalar_t>();
-  const int64_t A_matrix_stride = matrixStride(A_column_major_copy);
   const int64_t ldb = std::max<int64_t>(1, A_column_major_copy.size(-1));
 
   int* infos_ptr = infos.data_ptr<int>();
@@ -882,8 +867,6 @@ void geqrf_cusolver(const Tensor& input, const Tensor& tau) {
 */
 template <typename scalar_t>
 static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& other, bool left, bool transpose) {
-  using value_t = typename c10::scalar_value_type<scalar_t>::type;
-
   auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
   auto trans = transpose ? (input.is_complex() ? CUBLAS_OP_C : CUBLAS_OP_T) : CUBLAS_OP_N;
 
@@ -957,7 +940,6 @@ void ormqr_cusolver(const Tensor& input, const Tensor& tau, const Tensor& other,
 */
 template <typename scalar_t>
 inline static void apply_orgqr(Tensor& self, const Tensor& tau) {
-  using value_t = typename c10::scalar_value_type<scalar_t>::type;
   auto self_data = self.data_ptr<scalar_t>();
   auto tau_data = tau.data_ptr<scalar_t>();
   auto self_matrix_stride = matrixStride(self);
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 1662a92268d37..8b403a7c4014e 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -175,7 +175,7 @@ endif()
 if(BUILD_SPLIT_CUDA)
   # Splitting the source files that'll be in torch_cuda between torch_cuda_cu and torch_cuda_cpp
   foreach(tmp ${Caffe2_GPU_SRCS})
-    if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*")
+    if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*")
       # Currently, torch_cuda_cu will have all the .cu files in aten, as well as some others that depend on those files
       list(APPEND Caffe2_GPU_SRCS_CU ${tmp})
     else()

From bc9277dca3a40d99147d4a1a3e0160a4a8e91f9f Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Mon, 30 Aug 2021 20:53:50 -0700
Subject: [PATCH 382/530] [Pytorch lite predictor] Use KinetoEdgeCPUProfiler
 for operator profiling. (#63367)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63367

This diff changes the way operator profiling is done in lite predictor
benchmarking binary.
Instead of using custom callbacks it uses KinetoEdgeCPUProfiler to profile
events and then generate operator level metric from it.
Since KinetoEvents do not contain cpu clock time, now we report only wallclock
time.
This unifies various profiling effort that we have for benchmarking purpose. In
production we will still use observer based mechanism, but the advantage of
using kineto profiler is that we get few other things for free, such as:
- chrome trace generation.
- operator level memory profiling (to be added)
- flop counts (to be added)

Furthermore possible we can use python post processing script to parse chrome
trace and generate output similar to torch.profiler. (To be done)

Test Plan:
aibench run
Model without debug info:
https://www.internalfb.com/intern/aibench/details/219598441154763
Model with debug info and `--print_module_info true` (see Operator summary has now module hierarchy information).
https://www.internalfb.com/intern/aibench/details/617154236292985

Reviewed By: raziel

Differential Revision: D30327514

fbshipit-source-id: 3bb2f2daaaedfb04bd6f5d9c91292783f9c4344f
---
 test/cpp/jit/test_lite_interpreter.cpp  | 165 ------------------------
 tools/build_variables.bzl               |   4 +-
 torch/csrc/jit/mobile/debug_info.cpp    |  15 ++-
 torch/csrc/jit/mobile/import.cpp        |   3 +
 torch/csrc/jit/mobile/interpreter.cpp   |   3 +
 torch/csrc/jit/mobile/module.cpp        |   3 +-
 torch/csrc/jit/mobile/module.h          |  11 +-
 torch/csrc/jit/mobile/profiler_edge.cpp |  45 ++++++-
 torch/csrc/jit/mobile/profiler_edge.h   |   5 +
 9 files changed, 72 insertions(+), 182 deletions(-)

diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp
index 3bd2becd8779d..8fb5fe2c4ec11 100644
--- a/test/cpp/jit/test_lite_interpreter.cpp
+++ b/test/cpp/jit/test_lite_interpreter.cpp
@@ -455,171 +455,6 @@ TEST(LiteInterpreterTest, BuiltinFunction) {
   AT_ASSERT(str == expected);
 }
 
-#if !defined FB_XPLAT_BUILD
-TEST(LiteInterpreterTest, ModuleInfoBasic) {
-  Module m("M");
-  m.define(R"JIT(
-    def forward(self, x):
-      return 2 * x
-  )JIT");
-
-  std::stringstream ss;
-  m._save_for_mobile(ss, {}, true);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  std::unordered_set<std::string> module_debug_info_set;
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      if (!module_info.empty() &&
-          (module_info.find("debug_handle") == std::string::npos)) {
-        module_debug_info_set.insert(module_info);
-      }
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-
-  AT_ASSERT(module_debug_info_set.count("top(M)::<unknown>.aten::mul"));
-}
-
-TEST(LiteInterpreterTest, NotSaveModuleInfo) {
-  Module m("M");
-  m.define(R"JIT(
-    def forward(self, x):
-      return x + 5
-  )JIT");
-
-  std::stringstream ss;
-  m._save_for_mobile(ss);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      AT_ASSERT(
-          module_info.empty() ||
-          (module_info.find("debug_handle") != std::string::npos));
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-}
-
-TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) {
-  Module a("A");
-  a.define(R"JIT(
-    def forward(self, x):
-      return 2 * x + 5
-  )JIT");
-  Module b("B");
-  b.register_module("A0", a);
-  b.define(R"JIT(
-    def forward(self, x):
-      return self.A0.forward(x) + 1
-  )JIT");
-
-  std::stringstream ss;
-  b._save_for_mobile(ss, {}, true);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  std::set<std::string> module_debug_info_set;
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      if (!module_info.empty() &&
-          (module_info.find("debug_handle") == std::string::npos)) {
-        module_debug_info_set.insert(module_info);
-      }
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-
-  AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(B)::<unknown>.A0(A)::forward.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(B)::<unknown>.A0(A)::forward.aten::mul"));
-}
-
-TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) {
-  Module a("A");
-  a.define(R"JIT(
-    def forward(self, x):
-      return x + 1
-  )JIT");
-  Module b("B");
-  b.define(R"JIT(
-    def forward(self, x):
-      return x + 2
-  )JIT");
-  Module c("C");
-  c.register_module("A0", a);
-  c.register_module("B0", b);
-  c.define(R"JIT(
-    def forward(self, x):
-      return self.A0.forward(x) + self.B0.forward(x)
-  )JIT");
-
-  std::stringstream ss;
-  c._save_for_mobile(ss, {}, true);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  std::set<std::string> module_debug_info_set;
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      if (!module_info.empty() &&
-          (module_info.find("debug_handle") == std::string::npos)) {
-        module_debug_info_set.insert(module_info);
-      }
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-
-  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(C)::<unknown>.A0(A)::forward.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(C)::<unknown>.B0(B)::forward.aten::add"));
-}
-
-TEST(LiteInterpreterTest, GetRuntimeByteCodeVersion) {
-  auto runtime_bytecode_version = _get_runtime_bytecode_version();
-  AT_ASSERT(
-      runtime_bytecode_version ==
-      caffe2::serialize::kMaxSupportedBytecodeVersion);
-}
-
-/**
- * The test below is disarmed for FB internal xplat builds since
- * BUCK requires us to pass in the script_module_v4.ptl file in
- * as a resource dependency of the build rule for this file, and
- * we would need to access it via the C++ Resources API instead
- * of directly reading from disk (which is what the open source
- * build/run does).
- */
-TEST(LiteInterpreterTest, GetByteCodeVersion) {
-  std::string filePath(__FILE__);
-  auto test_model_file_v4 =
-      filePath.substr(0, filePath.find_last_of("/\\") + 1);
-  test_model_file_v4.append("script_module_v4.ptl");
-
-  auto version_v4 = _get_model_bytecode_version(test_model_file_v4);
-  AT_ASSERT(version_v4 == 4);
-}
-#endif // !defined(FB_XPLAT_BUILD)
-
 namespace {
 
 void compareModelOutput(
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index dd89981094d4f..e0c43d2f8e97f 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -319,7 +319,7 @@ core_sources_full_mobile_no_backend_interface = [
     "torch/csrc/jit/testing/hooks_for_testing.cpp",
     "torch/csrc/utils/tensor_flatten.cpp",
     "torch/csrc/utils/variadic.cpp",
-] + libtorch_profiler_sources
+]
 
 core_sources_full_mobile = core_sources_full_mobile_no_backend_interface + [
     "torch/csrc/jit/backends/backend_debug_info.cpp",
@@ -337,7 +337,7 @@ core_sources_full = core_sources_full_mobile + [
     "torch/csrc/jit/tensorexpr/external_functions_codegen.cpp",
 ]
 
-libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources)
+libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources + libtorch_profiler_sources)
 
 # These files are the only ones that are supported on Windows.
 libtorch_distributed_base_sources = [
diff --git a/torch/csrc/jit/mobile/debug_info.cpp b/torch/csrc/jit/mobile/debug_info.cpp
index 41ce3c6d46d52..a75ffe16c61f5 100644
--- a/torch/csrc/jit/mobile/debug_info.cpp
+++ b/torch/csrc/jit/mobile/debug_info.cpp
@@ -13,6 +13,12 @@ namespace jit {
 
 namespace {
 
+C10_ALWAYS_INLINE std::string debugHandlesNotFoundMessage(
+    const std::string& debug_handles_string) {
+  return "Debug info for handle(s): " + debug_handles_string +
+      ", was not found.";
+}
+
 std::pair<std::vector<StackEntry>, std::string> getStackTraceWithModuleHierarchy(
     const DebugInfoTuple& source_callstack,
     const std::string& caller_name) {
@@ -152,8 +158,7 @@ std::string MobileDebugTable::getModuleHierarchyInfo(
     const std::string& top_module_type_name) const {
   const auto it = callstack_ptr_map_.find(debug_handle);
   if (it == callstack_ptr_map_.end()) {
-    return "Module info for handle, " + std::to_string(debug_handle) +
-        ", not found.";
+    return debugHandlesNotFoundMessage(std::to_string(debug_handle));
   }
   return (getStackTraceWithModuleHierarchy(
               {it->second}, "top", top_module_type_name))
@@ -172,8 +177,7 @@ std::string MobileDebugTable::getSourceDebugString(
     const std::string& top_module_type_name) const {
   const auto it = callstack_ptr_map_.find(debug_handle);
   if (it == callstack_ptr_map_.end()) {
-    return "Debug info for handle, " + std::to_string(debug_handle) +
-        ", not found.";
+    return debugHandlesNotFoundMessage(std::to_string(debug_handle));
   }
   return (getStackTraceWithModuleHierarchy(
               {it->second}, "top", top_module_type_name))
@@ -208,8 +212,7 @@ std::pair<std::string, std::string> MobileDebugTable::
       debug_handles_string += std::to_string(debug_handle);
     }
     debug_handles_string += "}";
-    debug_handles_string =
-        "Debug info for handles: " + debug_handles_string + ", was not found.";
+    debug_handles_string = debugHandlesNotFoundMessage(debug_handles_string);
     return {debug_handles_string, debug_handles_string};
   }
   return (getStackTraceWithModuleHierarchy(
diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp
index 6a548103f6965..99be225255ffb 100644
--- a/torch/csrc/jit/mobile/import.cpp
+++ b/torch/csrc/jit/mobile/import.cpp
@@ -517,12 +517,15 @@ mobile::Module BytecodeDeserializer::deserialize(
   auto bvals = std::move(*readArchive("bytecode", mcu).toTuple()).elements();
 
   c10::optional<std::vector<IValue>> debug_handles;
+  bool has_debug_handles{false};
   if (reader_->hasRecord("mobile_debug_handles.pkl")) {
     debug_handles =
         readArchive("mobile_debug_handles", mcu).toTuple()->elements();
+    has_debug_handles = true;
   }
   parseMethods(bvals, debug_handles, *mcu);
   auto m = mobile::Module(readArchive("data", mcu).toObject(), mcu);
+  m.setHasDebugHandles(has_debug_handles);
 #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
   MobileDebugTable debug_table = MobileDebugTable(reader_, compilation_unit_);
   m.setDebugTable(std::move(debug_table));
diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp
index 02e7c35792693..ab558cd2bf5e0 100644
--- a/torch/csrc/jit/mobile/interpreter.cpp
+++ b/torch/csrc/jit/mobile/interpreter.cpp
@@ -57,6 +57,9 @@ bool InterpreterState::run(Stack& stack) {
       auto inst_with_handle = code_->instructions_with_handles_.at(pc);
       Instruction inst = inst_with_handle.instruction;
       DebugHandle debug_handle = inst_with_handle.debug_handle;
+      // If no valid debug handle found then just log pc.
+      // This is possible when we did not save debug handles
+      debug_handle = debug_handle == -1 ? pc : debug_handle;
 
       // std::cout << "RUNNING " << pc << " "
       //           << code_->instructions_with_handles_[pc].instruction;
diff --git a/torch/csrc/jit/mobile/module.cpp b/torch/csrc/jit/mobile/module.cpp
index c04d9f74b7378..c74ca138d848a 100644
--- a/torch/csrc/jit/mobile/module.cpp
+++ b/torch/csrc/jit/mobile/module.cpp
@@ -145,8 +145,7 @@ std::string Module::getCallStack(const int64_t debug_handle) const {
 // We really need to change this part, so in the next step for profiling support
 // for delegates, the first thing will be to rewrite how profiling is done
 // for lite interpreter.
-std::string Module::get_forward_method_debug_info(size_t pc) const {
-  auto debug_handle = find_method("forward")->get_debug_handle(pc);
+std::string Module::get_forward_method_debug_info(int64_t debug_handle) const {
 #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
   return getDebugTable().getModuleHierarchyInfo(
       debug_handle, getTopModuleTypeName(*this));
diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h
index 73637aa4584a0..6102aa517df66 100644
--- a/torch/csrc/jit/mobile/module.h
+++ b/torch/csrc/jit/mobile/module.h
@@ -78,7 +78,7 @@ class TORCH_API Module {
   }
   const std::vector<at::Tensor> parameters() const;
   const std::map<std::string, at::Tensor> named_parameters() const;
-  std::string get_forward_method_debug_info(size_t pc) const;
+  std::string get_forward_method_debug_info(int64_t debug_handle) const;
   std::string getModuleHierarchy(const int64_t debug_handle) const;
   std::string getCallStack(const int64_t debug_handle) const;
   /// Enables "training" mode.
@@ -115,11 +115,20 @@ class TORCH_API Module {
     return debug_table_;
   }
 
+  void setHasDebugHandles(bool has_debug_handles) {
+    has_debug_handles_ = has_debug_handles;
+  }
+
+  bool hasDebugHandles() const {
+    return has_debug_handles_;
+  }
+
  private:
   c10::intrusive_ptr<c10::ivalue::Object> object_;
   std::unordered_map<std::string, std::string> metadata_;
   std::shared_ptr<CompilationUnit> cu_;
   MobileDebugTable debug_table_;
+  bool has_debug_handles_;
 };
 } // namespace mobile
 } // namespace jit
diff --git a/torch/csrc/jit/mobile/profiler_edge.cpp b/torch/csrc/jit/mobile/profiler_edge.cpp
index bcd5a6258ee7c..162e43f0982a6 100644
--- a/torch/csrc/jit/mobile/profiler_edge.cpp
+++ b/torch/csrc/jit/mobile/profiler_edge.cpp
@@ -2,7 +2,6 @@
 #include <string>
 #include <vector>
 
-namespace profiler = torch::autograd::profiler;
 namespace torch {
 namespace jit {
 namespace mobile {
@@ -27,17 +26,26 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
   if (with_modules || with_stack) {
     auto post_processing = [this, with_stack, with_modules](
                                std::vector<profiler::KinetoEvent>& events) {
+      std::string no_debug_info("Model was not saved with debug information");
       for (auto& e : events) {
         if (with_modules) {
           // Since KinetoEvents's module hierarchy takes vector of strings we
           // just construct a temporary vector using one string element
-          e.moduleHierarchy(std::vector<std::string>(
-              {this->m_.getModuleHierarchy(e.debugHandle())}));
+          if (this->m_.hasDebugHandles()) {
+            e.moduleHierarchy(std::vector<std::string>(
+                {this->m_.getModuleHierarchy(e.debugHandle())}));
+          } else {
+            e.moduleHierarchy(std::vector<std::string>({no_debug_info}));
+          }
         } else if (with_stack) {
           // Since KinetoEvents's stack trace takes vector of strings we just
           // construct a temporary vector using one string element
-          e.stack(std::vector<std::string>(
-              {this->m_.getCallStack(e.debugHandle())}));
+          if (this->m_.hasDebugHandles()) {
+            e.stack(std::vector<std::string>(
+                {this->m_.getCallStack(e.debugHandle())}));
+          } else {
+            e.stack(std::vector<std::string>({no_debug_info}));
+          }
         }
       }
     };
@@ -55,8 +63,33 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
   trace_file_name_ = fname;
 }
 
+const std::unique_ptr<profiler::ProfilerResult>& KinetoEdgeCPUProfiler::
+    disableProfiler() {
+  TORCH_CHECK(
+      !profiler_result_,
+      "KinetoEdgeCPUProfiler already disabled. "
+      "To get list of events use getProfilerResults()");
+  profiler_result_ = profiler::disableProfiler();
+  return profiler_result_;
+}
+
+const std::unique_ptr<profiler::ProfilerResult>& KinetoEdgeCPUProfiler::
+    getProfilerResult() {
+  TORCH_CHECK(
+      profiler_result_,
+      "KinetoEdgeCPUProfiler has not been disabled. "
+      "use disableProfiler() API first, which returns the ProfilerResult.");
+  return profiler_result_;
+}
+
 KinetoEdgeCPUProfiler::~KinetoEdgeCPUProfiler() {
-  profiler::disableProfiler()->save(trace_file_name_);
+  if (!trace_file_name_.empty()) {
+    if (profiler_result_) {
+      profiler_result_->save(trace_file_name_);
+    } else {
+      profiler::disableProfiler()->save(trace_file_name_);
+    }
+  }
 }
 } // namespace mobile
 } // namespace jit
diff --git a/torch/csrc/jit/mobile/profiler_edge.h b/torch/csrc/jit/mobile/profiler_edge.h
index a245034e34f9b..ef37e01ed4c71 100644
--- a/torch/csrc/jit/mobile/profiler_edge.h
+++ b/torch/csrc/jit/mobile/profiler_edge.h
@@ -2,6 +2,7 @@
 #include <torch/csrc/autograd/profiler_kineto.h>
 #include <torch/csrc/jit/mobile/module.h>
 
+namespace profiler = torch::autograd::profiler;
 namespace torch {
 namespace jit {
 namespace mobile {
@@ -53,6 +54,9 @@ class TORCH_API KinetoEdgeCPUProfiler {
       const bool with_flops = false,
       const bool with_modules = false);
 
+  const std::unique_ptr<profiler::ProfilerResult>& disableProfiler();
+  const std::unique_ptr<profiler::ProfilerResult>& getProfilerResult();
+
   ~KinetoEdgeCPUProfiler();
 
  private:
@@ -62,6 +66,7 @@ class TORCH_API KinetoEdgeCPUProfiler {
    */
   const mobile::Module& m_;
   std::string trace_file_name_;
+  std::unique_ptr<profiler::ProfilerResult> profiler_result_;
 };
 } // namespace mobile
 } // namespace jit

From a3d6dae319f03e9b5450c875349dbd65cb437767 Mon Sep 17 00:00:00 2001
From: Facebook Community Bot <facebook-github-bot@users.noreply.github.com>
Date: Mon, 30 Aug 2021 21:31:11 -0700
Subject: [PATCH 383/530] Automated submodule update: FBGEMM (#64213)

Summary:
This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).

New submodule commit: https://github.com/pytorch/FBGEMM/commit/9d69998df6236d6714aa37ae6142a2a2d4fb2bf6

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64213

Test Plan: Ensure that CI jobs succeed on GitHub before landing.

Reviewed By: jspark1105

Differential Revision: D30647878

fbshipit-source-id: b903b39441b4e28dda7eab226ac874e2227e750a
---
 third_party/fbgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/fbgemm b/third_party/fbgemm
index 9f4078a7bb92b..e6f80ee6570bb 160000
--- a/third_party/fbgemm
+++ b/third_party/fbgemm
@@ -1 +1 @@
-Subproject commit 9f4078a7bb92b88cdcfc913398ffade158160c91
+Subproject commit e6f80ee6570bb8a7ed15a5ad0d496fdfb8927470

From 3c15822f5f4ab616eb6a519a0ff9b82fc7a3dc63 Mon Sep 17 00:00:00 2001
From: Harut Movsisyan <harutm@fb.com>
Date: Tue, 31 Aug 2021 00:49:39 -0700
Subject: [PATCH 384/530] [Static Runtime] Implement aten::nonzero out variant
 (#64126)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64126

Test Plan:
Confirm out variant is called:

```
> buck run //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1
```

Reviewed By: mikeiovine

Differential Revision: D30617729

fbshipit-source-id: 752749638c8f467815efa57021cb3de5c728ab1b
---
 benchmarks/static_runtime/test_scripts.h      |  6 ++++++
 .../static_runtime/test_static_runtime.cc     |  9 +++++++-
 torch/csrc/jit/runtime/static/ops.cpp         | 21 +++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index e26437fe4a6f9..37bb222f6a3d4 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -752,6 +752,12 @@ const auto append_tensor_script = R"JIT(
       return lst
 )JIT";
 
+const auto nonzero_tensor = R"JIT(
+  def forward(self, input: Tensor):
+      a = torch.nonzero(input).clone()
+      return (a)
+)JIT";
+
 const std::string quantize_script = R"IR(
   graph(%input: Tensor, %weights: Tensor):
       %scale: float = prim::Constant[value=1.]()
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index aa5cd35e38e56..8e498dbbc664e 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1312,7 +1312,6 @@ TEST(StaticRuntime, IndividualOps_Cat) {
   testStaticRuntime(cat_script, args0, args1);
 }
 
-
 TEST(StaticRuntime, IndividualOps_Cumsum) {
   auto a = at::randn({2, 3});
   std::vector<IValue> args0{a, 0};
@@ -1333,3 +1332,11 @@ TEST(StaticRuntime, IndividualOps_CumsumDtype) {
   std::vector<IValue> args1{b, 1, dtype};
   testStaticRuntime(cumsum_script_dtype, args0, args1);
 }
+
+TEST(StaticRuntime, IndividualOps_Nonzero) {
+  auto a = at::randint(0, 2, {2, 3});
+  testStaticRuntime(nonzero_tensor, {a});
+
+  auto b = at::randint(0, 2, {4, 3, 2});
+  testStaticRuntime(nonzero_tensor, {a}, {b});
+}
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index a73872b540258..0cc38b0812f11 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1755,6 +1755,27 @@ REGISTER_OPERATOR_FUNCTOR(aten::cumsum, aten_cumsum, [](Node* n) -> SROperator {
   };
 });
 
+REGISTER_OPERATOR_FUNCTOR(
+    aten::nonzero,
+    aten_nonzero,
+    [](Node* n) -> SROperator {
+      if (!n->matches(torch::schema("aten::nonzero(Tensor self) -> Tensor"))) {
+        LogAndDumpSchema(n);
+        return nullptr;
+      }
+      return [](ProcessedNode* p_node) {
+        const auto& input = p_node->Input(0).toTensor();
+        if (p_node->Output(0).isNone()) {
+          p_node->Output(0) = at::native::nonzero_cpu(input);
+          return;
+        }
+
+        auto& output = p_node->Output(0).toTensor();
+        fastResizeToZero(output);
+        at::native::nonzero_out_cpu(input, output);
+      };
+    });
+
 namespace {
 
 void check_cat_no_zero_dim(const std::vector<at::Tensor>& tensors) {

From 67cb131458bc299c5362cefa021d527852b3f683 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Tue, 31 Aug 2021 07:36:53 -0700
Subject: [PATCH 385/530] Revert D30327514: [Pytorch lite predictor] Use
 KinetoEdgeCPUProfiler for operator profiling.

Test Plan: revert-hammer

Differential Revision:
D30327514 (https://github.com/pytorch/pytorch/commit/bc9277dca3a40d99147d4a1a3e0160a4a8e91f9f)

Original commit changeset: 3bb2f2daaaed

fbshipit-source-id: 0b2aa7c57d08de77c9aaa75e546a7d0938610f64
---
 test/cpp/jit/test_lite_interpreter.cpp  | 165 ++++++++++++++++++++++++
 tools/build_variables.bzl               |   4 +-
 torch/csrc/jit/mobile/debug_info.cpp    |  15 +--
 torch/csrc/jit/mobile/import.cpp        |   3 -
 torch/csrc/jit/mobile/interpreter.cpp   |   3 -
 torch/csrc/jit/mobile/module.cpp        |   3 +-
 torch/csrc/jit/mobile/module.h          |  11 +-
 torch/csrc/jit/mobile/profiler_edge.cpp |  45 +------
 torch/csrc/jit/mobile/profiler_edge.h   |   5 -
 9 files changed, 182 insertions(+), 72 deletions(-)

diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp
index 8fb5fe2c4ec11..3bd2becd8779d 100644
--- a/test/cpp/jit/test_lite_interpreter.cpp
+++ b/test/cpp/jit/test_lite_interpreter.cpp
@@ -455,6 +455,171 @@ TEST(LiteInterpreterTest, BuiltinFunction) {
   AT_ASSERT(str == expected);
 }
 
+#if !defined FB_XPLAT_BUILD
+TEST(LiteInterpreterTest, ModuleInfoBasic) {
+  Module m("M");
+  m.define(R"JIT(
+    def forward(self, x):
+      return 2 * x
+  )JIT");
+
+  std::stringstream ss;
+  m._save_for_mobile(ss, {}, true);
+  mobile::Module bc = _load_for_mobile(ss);
+
+  std::unordered_set<std::string> module_debug_info_set;
+  size_t pc = 0;
+  while (true) {
+    try {
+      std::string module_info = bc.get_forward_method_debug_info(pc);
+      if (!module_info.empty() &&
+          (module_info.find("debug_handle") == std::string::npos)) {
+        module_debug_info_set.insert(module_info);
+      }
+      ++pc;
+    } catch (const std::exception& e) {
+      break;
+    }
+  }
+
+  AT_ASSERT(module_debug_info_set.count("top(M)::<unknown>.aten::mul"));
+}
+
+TEST(LiteInterpreterTest, NotSaveModuleInfo) {
+  Module m("M");
+  m.define(R"JIT(
+    def forward(self, x):
+      return x + 5
+  )JIT");
+
+  std::stringstream ss;
+  m._save_for_mobile(ss);
+  mobile::Module bc = _load_for_mobile(ss);
+
+  size_t pc = 0;
+  while (true) {
+    try {
+      std::string module_info = bc.get_forward_method_debug_info(pc);
+      AT_ASSERT(
+          module_info.empty() ||
+          (module_info.find("debug_handle") != std::string::npos));
+      ++pc;
+    } catch (const std::exception& e) {
+      break;
+    }
+  }
+}
+
+TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) {
+  Module a("A");
+  a.define(R"JIT(
+    def forward(self, x):
+      return 2 * x + 5
+  )JIT");
+  Module b("B");
+  b.register_module("A0", a);
+  b.define(R"JIT(
+    def forward(self, x):
+      return self.A0.forward(x) + 1
+  )JIT");
+
+  std::stringstream ss;
+  b._save_for_mobile(ss, {}, true);
+  mobile::Module bc = _load_for_mobile(ss);
+
+  std::set<std::string> module_debug_info_set;
+  size_t pc = 0;
+  while (true) {
+    try {
+      std::string module_info = bc.get_forward_method_debug_info(pc);
+      if (!module_info.empty() &&
+          (module_info.find("debug_handle") == std::string::npos)) {
+        module_debug_info_set.insert(module_info);
+      }
+      ++pc;
+    } catch (const std::exception& e) {
+      break;
+    }
+  }
+
+  AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(B)::<unknown>.A0(A)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(B)::<unknown>.A0(A)::forward.aten::mul"));
+}
+
+TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) {
+  Module a("A");
+  a.define(R"JIT(
+    def forward(self, x):
+      return x + 1
+  )JIT");
+  Module b("B");
+  b.define(R"JIT(
+    def forward(self, x):
+      return x + 2
+  )JIT");
+  Module c("C");
+  c.register_module("A0", a);
+  c.register_module("B0", b);
+  c.define(R"JIT(
+    def forward(self, x):
+      return self.A0.forward(x) + self.B0.forward(x)
+  )JIT");
+
+  std::stringstream ss;
+  c._save_for_mobile(ss, {}, true);
+  mobile::Module bc = _load_for_mobile(ss);
+
+  std::set<std::string> module_debug_info_set;
+  size_t pc = 0;
+  while (true) {
+    try {
+      std::string module_info = bc.get_forward_method_debug_info(pc);
+      if (!module_info.empty() &&
+          (module_info.find("debug_handle") == std::string::npos)) {
+        module_debug_info_set.insert(module_info);
+      }
+      ++pc;
+    } catch (const std::exception& e) {
+      break;
+    }
+  }
+
+  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.A0(A)::forward.aten::add"));
+  AT_ASSERT(module_debug_info_set.count(
+      "top(C)::<unknown>.B0(B)::forward.aten::add"));
+}
+
+TEST(LiteInterpreterTest, GetRuntimeByteCodeVersion) {
+  auto runtime_bytecode_version = _get_runtime_bytecode_version();
+  AT_ASSERT(
+      runtime_bytecode_version ==
+      caffe2::serialize::kMaxSupportedBytecodeVersion);
+}
+
+/**
+ * The test below is disarmed for FB internal xplat builds since
+ * BUCK requires us to pass in the script_module_v4.ptl file in
+ * as a resource dependency of the build rule for this file, and
+ * we would need to access it via the C++ Resources API instead
+ * of directly reading from disk (which is what the open source
+ * build/run does).
+ */
+TEST(LiteInterpreterTest, GetByteCodeVersion) {
+  std::string filePath(__FILE__);
+  auto test_model_file_v4 =
+      filePath.substr(0, filePath.find_last_of("/\\") + 1);
+  test_model_file_v4.append("script_module_v4.ptl");
+
+  auto version_v4 = _get_model_bytecode_version(test_model_file_v4);
+  AT_ASSERT(version_v4 == 4);
+}
+#endif // !defined(FB_XPLAT_BUILD)
+
 namespace {
 
 void compareModelOutput(
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index e0c43d2f8e97f..dd89981094d4f 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -319,7 +319,7 @@ core_sources_full_mobile_no_backend_interface = [
     "torch/csrc/jit/testing/hooks_for_testing.cpp",
     "torch/csrc/utils/tensor_flatten.cpp",
     "torch/csrc/utils/variadic.cpp",
-]
+] + libtorch_profiler_sources
 
 core_sources_full_mobile = core_sources_full_mobile_no_backend_interface + [
     "torch/csrc/jit/backends/backend_debug_info.cpp",
@@ -337,7 +337,7 @@ core_sources_full = core_sources_full_mobile + [
     "torch/csrc/jit/tensorexpr/external_functions_codegen.cpp",
 ]
 
-libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources + libtorch_profiler_sources)
+libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources)
 
 # These files are the only ones that are supported on Windows.
 libtorch_distributed_base_sources = [
diff --git a/torch/csrc/jit/mobile/debug_info.cpp b/torch/csrc/jit/mobile/debug_info.cpp
index a75ffe16c61f5..41ce3c6d46d52 100644
--- a/torch/csrc/jit/mobile/debug_info.cpp
+++ b/torch/csrc/jit/mobile/debug_info.cpp
@@ -13,12 +13,6 @@ namespace jit {
 
 namespace {
 
-C10_ALWAYS_INLINE std::string debugHandlesNotFoundMessage(
-    const std::string& debug_handles_string) {
-  return "Debug info for handle(s): " + debug_handles_string +
-      ", was not found.";
-}
-
 std::pair<std::vector<StackEntry>, std::string> getStackTraceWithModuleHierarchy(
     const DebugInfoTuple& source_callstack,
     const std::string& caller_name) {
@@ -158,7 +152,8 @@ std::string MobileDebugTable::getModuleHierarchyInfo(
     const std::string& top_module_type_name) const {
   const auto it = callstack_ptr_map_.find(debug_handle);
   if (it == callstack_ptr_map_.end()) {
-    return debugHandlesNotFoundMessage(std::to_string(debug_handle));
+    return "Module info for handle, " + std::to_string(debug_handle) +
+        ", not found.";
   }
   return (getStackTraceWithModuleHierarchy(
               {it->second}, "top", top_module_type_name))
@@ -177,7 +172,8 @@ std::string MobileDebugTable::getSourceDebugString(
     const std::string& top_module_type_name) const {
   const auto it = callstack_ptr_map_.find(debug_handle);
   if (it == callstack_ptr_map_.end()) {
-    return debugHandlesNotFoundMessage(std::to_string(debug_handle));
+    return "Debug info for handle, " + std::to_string(debug_handle) +
+        ", not found.";
   }
   return (getStackTraceWithModuleHierarchy(
               {it->second}, "top", top_module_type_name))
@@ -212,7 +208,8 @@ std::pair<std::string, std::string> MobileDebugTable::
       debug_handles_string += std::to_string(debug_handle);
     }
     debug_handles_string += "}";
-    debug_handles_string = debugHandlesNotFoundMessage(debug_handles_string);
+    debug_handles_string =
+        "Debug info for handles: " + debug_handles_string + ", was not found.";
     return {debug_handles_string, debug_handles_string};
   }
   return (getStackTraceWithModuleHierarchy(
diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp
index 99be225255ffb..6a548103f6965 100644
--- a/torch/csrc/jit/mobile/import.cpp
+++ b/torch/csrc/jit/mobile/import.cpp
@@ -517,15 +517,12 @@ mobile::Module BytecodeDeserializer::deserialize(
   auto bvals = std::move(*readArchive("bytecode", mcu).toTuple()).elements();
 
   c10::optional<std::vector<IValue>> debug_handles;
-  bool has_debug_handles{false};
   if (reader_->hasRecord("mobile_debug_handles.pkl")) {
     debug_handles =
         readArchive("mobile_debug_handles", mcu).toTuple()->elements();
-    has_debug_handles = true;
   }
   parseMethods(bvals, debug_handles, *mcu);
   auto m = mobile::Module(readArchive("data", mcu).toObject(), mcu);
-  m.setHasDebugHandles(has_debug_handles);
 #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
   MobileDebugTable debug_table = MobileDebugTable(reader_, compilation_unit_);
   m.setDebugTable(std::move(debug_table));
diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp
index ab558cd2bf5e0..02e7c35792693 100644
--- a/torch/csrc/jit/mobile/interpreter.cpp
+++ b/torch/csrc/jit/mobile/interpreter.cpp
@@ -57,9 +57,6 @@ bool InterpreterState::run(Stack& stack) {
       auto inst_with_handle = code_->instructions_with_handles_.at(pc);
       Instruction inst = inst_with_handle.instruction;
       DebugHandle debug_handle = inst_with_handle.debug_handle;
-      // If no valid debug handle found then just log pc.
-      // This is possible when we did not save debug handles
-      debug_handle = debug_handle == -1 ? pc : debug_handle;
 
       // std::cout << "RUNNING " << pc << " "
       //           << code_->instructions_with_handles_[pc].instruction;
diff --git a/torch/csrc/jit/mobile/module.cpp b/torch/csrc/jit/mobile/module.cpp
index c74ca138d848a..c04d9f74b7378 100644
--- a/torch/csrc/jit/mobile/module.cpp
+++ b/torch/csrc/jit/mobile/module.cpp
@@ -145,7 +145,8 @@ std::string Module::getCallStack(const int64_t debug_handle) const {
 // We really need to change this part, so in the next step for profiling support
 // for delegates, the first thing will be to rewrite how profiling is done
 // for lite interpreter.
-std::string Module::get_forward_method_debug_info(int64_t debug_handle) const {
+std::string Module::get_forward_method_debug_info(size_t pc) const {
+  auto debug_handle = find_method("forward")->get_debug_handle(pc);
 #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
   return getDebugTable().getModuleHierarchyInfo(
       debug_handle, getTopModuleTypeName(*this));
diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h
index 6102aa517df66..73637aa4584a0 100644
--- a/torch/csrc/jit/mobile/module.h
+++ b/torch/csrc/jit/mobile/module.h
@@ -78,7 +78,7 @@ class TORCH_API Module {
   }
   const std::vector<at::Tensor> parameters() const;
   const std::map<std::string, at::Tensor> named_parameters() const;
-  std::string get_forward_method_debug_info(int64_t debug_handle) const;
+  std::string get_forward_method_debug_info(size_t pc) const;
   std::string getModuleHierarchy(const int64_t debug_handle) const;
   std::string getCallStack(const int64_t debug_handle) const;
   /// Enables "training" mode.
@@ -115,20 +115,11 @@ class TORCH_API Module {
     return debug_table_;
   }
 
-  void setHasDebugHandles(bool has_debug_handles) {
-    has_debug_handles_ = has_debug_handles;
-  }
-
-  bool hasDebugHandles() const {
-    return has_debug_handles_;
-  }
-
  private:
   c10::intrusive_ptr<c10::ivalue::Object> object_;
   std::unordered_map<std::string, std::string> metadata_;
   std::shared_ptr<CompilationUnit> cu_;
   MobileDebugTable debug_table_;
-  bool has_debug_handles_;
 };
 } // namespace mobile
 } // namespace jit
diff --git a/torch/csrc/jit/mobile/profiler_edge.cpp b/torch/csrc/jit/mobile/profiler_edge.cpp
index 162e43f0982a6..bcd5a6258ee7c 100644
--- a/torch/csrc/jit/mobile/profiler_edge.cpp
+++ b/torch/csrc/jit/mobile/profiler_edge.cpp
@@ -2,6 +2,7 @@
 #include <string>
 #include <vector>
 
+namespace profiler = torch::autograd::profiler;
 namespace torch {
 namespace jit {
 namespace mobile {
@@ -26,26 +27,17 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
   if (with_modules || with_stack) {
     auto post_processing = [this, with_stack, with_modules](
                                std::vector<profiler::KinetoEvent>& events) {
-      std::string no_debug_info("Model was not saved with debug information");
       for (auto& e : events) {
         if (with_modules) {
           // Since KinetoEvents's module hierarchy takes vector of strings we
           // just construct a temporary vector using one string element
-          if (this->m_.hasDebugHandles()) {
-            e.moduleHierarchy(std::vector<std::string>(
-                {this->m_.getModuleHierarchy(e.debugHandle())}));
-          } else {
-            e.moduleHierarchy(std::vector<std::string>({no_debug_info}));
-          }
+          e.moduleHierarchy(std::vector<std::string>(
+              {this->m_.getModuleHierarchy(e.debugHandle())}));
         } else if (with_stack) {
           // Since KinetoEvents's stack trace takes vector of strings we just
           // construct a temporary vector using one string element
-          if (this->m_.hasDebugHandles()) {
-            e.stack(std::vector<std::string>(
-                {this->m_.getCallStack(e.debugHandle())}));
-          } else {
-            e.stack(std::vector<std::string>({no_debug_info}));
-          }
+          e.stack(std::vector<std::string>(
+              {this->m_.getCallStack(e.debugHandle())}));
         }
       }
     };
@@ -63,33 +55,8 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
   trace_file_name_ = fname;
 }
 
-const std::unique_ptr<profiler::ProfilerResult>& KinetoEdgeCPUProfiler::
-    disableProfiler() {
-  TORCH_CHECK(
-      !profiler_result_,
-      "KinetoEdgeCPUProfiler already disabled. "
-      "To get list of events use getProfilerResults()");
-  profiler_result_ = profiler::disableProfiler();
-  return profiler_result_;
-}
-
-const std::unique_ptr<profiler::ProfilerResult>& KinetoEdgeCPUProfiler::
-    getProfilerResult() {
-  TORCH_CHECK(
-      profiler_result_,
-      "KinetoEdgeCPUProfiler has not been disabled. "
-      "use disableProfiler() API first, which returns the ProfilerResult.");
-  return profiler_result_;
-}
-
 KinetoEdgeCPUProfiler::~KinetoEdgeCPUProfiler() {
-  if (!trace_file_name_.empty()) {
-    if (profiler_result_) {
-      profiler_result_->save(trace_file_name_);
-    } else {
-      profiler::disableProfiler()->save(trace_file_name_);
-    }
-  }
+  profiler::disableProfiler()->save(trace_file_name_);
 }
 } // namespace mobile
 } // namespace jit
diff --git a/torch/csrc/jit/mobile/profiler_edge.h b/torch/csrc/jit/mobile/profiler_edge.h
index ef37e01ed4c71..a245034e34f9b 100644
--- a/torch/csrc/jit/mobile/profiler_edge.h
+++ b/torch/csrc/jit/mobile/profiler_edge.h
@@ -2,7 +2,6 @@
 #include <torch/csrc/autograd/profiler_kineto.h>
 #include <torch/csrc/jit/mobile/module.h>
 
-namespace profiler = torch::autograd::profiler;
 namespace torch {
 namespace jit {
 namespace mobile {
@@ -54,9 +53,6 @@ class TORCH_API KinetoEdgeCPUProfiler {
       const bool with_flops = false,
       const bool with_modules = false);
 
-  const std::unique_ptr<profiler::ProfilerResult>& disableProfiler();
-  const std::unique_ptr<profiler::ProfilerResult>& getProfilerResult();
-
   ~KinetoEdgeCPUProfiler();
 
  private:
@@ -66,7 +62,6 @@ class TORCH_API KinetoEdgeCPUProfiler {
    */
   const mobile::Module& m_;
   std::string trace_file_name_;
-  std::unique_ptr<profiler::ProfilerResult> profiler_result_;
 };
 } // namespace mobile
 } // namespace jit

From eee054e6ead98fb872b264e092955b87964db75d Mon Sep 17 00:00:00 2001
From: Kevin Tse <ktse@fb.com>
Date: Tue, 31 Aug 2021 08:07:23 -0700
Subject: [PATCH 386/530] [DataPipe] implementing fork() (#63649)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63649

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30493945

Pulled By: NivekT

fbshipit-source-id: 40db7d4134facd266d86bc0dc2edf2729c4e5842
---
 test/test_datapipe.py                        | 110 ++++++++++++++++++-
 torch/utils/data/datapipes/iter/__init__.py  |   1 +
 torch/utils/data/datapipes/iter/combining.py | 108 +++++++++++++++++-
 3 files changed, 209 insertions(+), 10 deletions(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index c35698e057c89..842e4424e9169 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -591,6 +591,105 @@ def test_concat_datapipe(self):
 
         self.assertEqual(list(concat_dp), list(range(10)) + list(range(5)))
 
+
+    def test_fork_datapipe(self):
+        input_dp = IDP(range(10))
+
+        # Test Case: making sure all child DataPipe shares the same reference
+        dp1, dp2, dp3 = input_dp.fork(num_instances=3)
+        self.assertTrue(all(n1 is n2 for n1, n2 in zip(dp1, dp2)))
+        self.assertTrue(all(n1 is n3 for n1, n3 in zip(dp1, dp3)))
+
+        # Test Case: one child DataPipe yields all value at a time
+        output1, output2, output3 = list(dp1), list(dp2), list(dp3)
+        self.assertEqual(list(range(10)), output1)
+        self.assertEqual(list(range(10)), output2)
+        self.assertEqual(list(range(10)), output3)
+
+        # Test Case: two child DataPipes yield value together
+        dp1, dp2 = input_dp.fork(num_instances=2)
+        output = []
+        for n1, n2 in zip(dp1, dp2):
+            output.append((n1, n2))
+        self.assertEqual([(i, i) for i in range(10)], output)
+
+        # Test Case: one child DataPipe yields all value first, but buffer_size = 5 being too small
+        dp1, dp2 = input_dp.fork(num_instances=2, buffer_size=5)
+        it1 = iter(dp1)
+        for _ in range(5):
+            next(it1)
+        with self.assertRaises(BufferError):
+            next(it1)
+
+        # Test Case: two child DataPipes yield value together with buffer size 1
+        dp1, dp2 = input_dp.fork(num_instances=2, buffer_size=1)
+        output = []
+        for n1, n2 in zip(dp1, dp2):
+            output.append((n1, n2))
+        self.assertEqual([(i, i) for i in range(10)], output)
+
+        # Test Case: make sure logic related to slowest_ptr is working properly
+        dp1, dp2, dp3 = input_dp.fork(num_instances=3)
+        output1, output2 , output3 = [], [], []
+        for i, (n1, n2) in enumerate(zip(dp1, dp2)):
+            output1.append(n1)
+            output2.append(n2)
+            if i == 4:  # yield all of dp3 when halfway through dp1, dp2
+                output3 = list(dp3)
+                break
+        self.assertEqual(list(range(5)), output1)
+        self.assertEqual(list(range(5)), output2)
+        self.assertEqual(list(range(10)), output3)
+
+        # Test Case: DataPipe doesn't reset if this pipe hasn't been read
+        dp1, dp2 = input_dp.fork(num_instances=2)
+        i1, i2 = iter(dp1), iter(dp2)
+        output2 = []
+        for i, n2 in enumerate(i2):
+            output2.append(n2)
+            if i == 4:
+                i1 = iter(dp1)  # Doesn't reset because i1 hasn't been read
+        self.assertEqual(list(range(10)), output2)
+
+        # Test Case: DataPipe reset when some of it have been read
+        dp1, dp2 = input_dp.fork(num_instances=2)
+        i1, i2 = iter(dp1), iter(dp2)
+        output1, output2 = [], []
+        for i, (n1, n2) in enumerate(zip(i1, i2)):
+            output1.append(n1)
+            output2.append(n2)
+            if i == 4:
+                with warnings.catch_warnings(record=True) as wa:
+                    i1 = iter(dp1)  # Reset both all child DataPipe
+                    self.assertEqual(len(wa), 1)
+                    self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted")
+        self.assertEqual(list(range(5)) + list(range(10)), output1)
+        self.assertEqual(list(range(5)) + list(range(10)), output2)
+
+        # Test Case: DataPipe reset, even when some other child DataPipes are not read
+        dp1, dp2, dp3 = input_dp.fork(num_instances=3)
+        output1, output2 = list(dp1), list(dp2)
+        self.assertEqual(list(range(10)), output1)
+        self.assertEqual(list(range(10)), output2)
+        output1, output2 = list(dp1), list(dp2)
+        with warnings.catch_warnings(record=True) as wa:
+            self.assertEqual(list(range(10)), list(dp1))  # Resets even though dp3 has not been read
+            self.assertEqual(len(wa), 1)
+            self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted")
+        output3 = []
+        for i, n3 in enumerate(dp3):
+            output3.append(n3)
+            if i == 4:
+                with warnings.catch_warnings(record=True) as wa:
+                    output1 = list(dp1)  # Resets even though dp3 is only partially read
+                    self.assertEqual(len(wa), 1)
+                    self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted")
+                self.assertEqual(list(range(5)), output3)
+                self.assertEqual(list(range(10)), output1)
+                break
+        self.assertEqual(list(range(10)), list(dp3))  # dp3 has to read from the start again
+
+
     def test_map_datapipe(self):
         input_dp = IDP(range(10))
 
@@ -1333,24 +1432,25 @@ def test_simple_traverse(self):
         expected: Dict[Any, Any] = {mapped_dp: {numbers_dp: {}}}
         self.assertEqual(expected, graph)
 
-    # TODO(VitalyFedyunin): This test is incorrect because of 'buffer' nature
-    # of the fork fake implementation, update fork first and fix this test too
     @skipIfNoDill
     def test_traverse_forked(self):
         numbers_dp = NumbersDataset(size=50)
-        dp0, dp1, dp2 = numbers_dp.fork(3)
+        dp0, dp1, dp2 = numbers_dp.fork(num_instances=3)
         dp0_upd = dp0.map(lambda x: x * 10)
         dp1_upd = dp1.filter(lambda x: x % 3 == 1)
         combined_dp = dp0_upd.mux(dp1_upd, dp2)
         graph = torch.utils.data.graph.traverse(combined_dp)
-        expected = {combined_dp: {dp0_upd: {dp0: {}}, dp1_upd: {dp1: {}}, dp2: {}}}
+        expected = {combined_dp: {dp0_upd: {dp0: {dp0.main_datapipe: {dp0.main_datapipe.main_datapipe: {}}}},
+                                  dp1_upd: {dp1: {dp1.main_datapipe: {dp1.main_datapipe.main_datapipe: {}}}},
+                                  dp2: {dp2.main_datapipe: {dp2.main_datapipe.main_datapipe: {}}}}}
         self.assertEqual(expected, graph)
 
 
 class TestSharding(TestCase):
+
     def _get_pipeline(self):
         numbers_dp = NumbersDataset(size=10)
-        dp0, dp1 = numbers_dp.fork(2)
+        dp0, dp1 = numbers_dp.fork(num_instances=2)
         dp0_upd = dp0.map(lambda x: x * 10)
         dp1_upd = dp1.filter(lambda x: x % 3 == 1)
         combined_dp = dp0_upd.mux(dp1_upd)
diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index b55bbf6667509..b460d4d77cf5a 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -8,6 +8,7 @@
 )
 from torch.utils.data.datapipes.iter.combining import (
     ConcaterIterDataPipe as Concater,
+    ForkerIterDataPipe as Forker,
     ZipperIterDataPipe as Zipper,
 )
 from torch.utils.data.datapipes.iter.filelister import (
diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index 879e8be27ff0c..85b37324992fc 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -1,7 +1,9 @@
 import functools
+import warnings
 
 from torch.utils.data import IterDataPipe, functional_datapipe
-from typing import Iterator, Optional, Sized, Tuple, TypeVar
+from typing import Any, Iterator, Optional, Sized, Tuple, TypeVar, Deque
+from collections import deque
 
 T_co = TypeVar('T_co', covariant=True)
 
@@ -46,6 +48,7 @@ def __len__(self) -> int:
 # This is fake class to show API, going to be replaced by the copy from torchdata
 # TODO(VitalyFedyunin): Replace with valid version, documentation and tests
 class IterateBuffer(IterDataPipe):
+
     def __init__(self, buffer):
         self.buffer = buffer
 
@@ -56,11 +59,106 @@ def __iter__(self):
 
 @functional_datapipe('fork')
 class ForkerIterDataPipe(IterDataPipe):
+    r""" :class:`ForkerIterDataPipe`.
+
+        Iterable DataPipe to create multiple instances of the same Iterable DataPipe.
+        args:
+            datapipe: Iterable DataPipe being copied
+            num_instances: number of instances of the datapipe to create
+            buffer_size: this restricts how far ahead the leading child DataPipe
+             can read relative to the slowest child DataPipe
+    """
+    def __new__(cls, datapipe: IterDataPipe, num_instances: int, buffer_size: int = 1000):
+        container = _ForkerIterDataPipe(datapipe, num_instances, buffer_size)
+        return [_ChildDataPipe(container, i) for i in range(num_instances)]
 
-    def __new__(cls, datapipe, instances):
-        result = []
-        buffer = list(datapipe)
-        return [IterateBuffer(buffer) for i in range(instances)]
+
+class _ForkerIterDataPipe(IterDataPipe):
+    r""" :class:`_ForkerIterDataPipe`.
+
+        Container to hold instance-specific information on behalf of ForkerIterDataPipe. It tracks
+        the state of its child DataPipes, maintains the buffer, and yields the next value
+        as requested by the child DataPipes.
+    """
+    def __init__(self, datapipe: IterDataPipe, num_instances: int, buffer_size: int = 1000):
+        self.main_datapipe = datapipe
+        self._datapipe_iterator: Optional[Iterator[Any]] = None
+        self.num_instances = num_instances
+        self.buffer: Deque = deque()
+        self.buffer_size = buffer_size
+        self.child_pointers = [0] * num_instances  # Indicate the indices of the next element to get
+        self.slowest_ptr = 0
+        self.leading_ptr = 0
+        self.end_ptr: Optional[int] = None
+
+    def get_next_element_by_instance(self, instance_id: int):
+        if self._datapipe_iterator is None:
+            self._datapipe_iterator = iter(self.main_datapipe)
+        while self.end_ptr is None or self.child_pointers[instance_id] < self.end_ptr:
+            if not self.buffer or self.child_pointers[instance_id] > self.leading_ptr:
+                self.leading_ptr = self.child_pointers[instance_id]
+                if self.leading_ptr - self.slowest_ptr + 1 > self.buffer_size:
+                    raise BufferError("ForkerIterDataPipe buffer overflow," +
+                                      f"buffer size {self.buffer_size} is insufficient.")
+                try:
+                    self.buffer.append(next(self._datapipe_iterator))
+                    self.child_pointers[instance_id] += 1
+                    yield self.buffer[-1]
+                except StopIteration:
+                    self.end_ptr = self.leading_ptr
+            else:  # Child pointer is slower than or equal to the leading_ptr
+                buffer_index = self.child_pointers[instance_id] - self.slowest_ptr
+                return_val = self.buffer[buffer_index]
+                self.child_pointers[instance_id] += 1
+                if self.child_pointers[instance_id] - 1 == self.slowest_ptr:
+                    new_min = min(self.child_pointers)  # Can optimize by avoiding the call to min()
+                    if self.slowest_ptr < new_min:
+                        self.slowest_ptr = new_min
+                        self.buffer.popleft()
+                yield return_val
+
+    def is_instance_started(self, instance_id: int) -> bool:
+        return self.child_pointers[instance_id] != 0
+
+    def is_every_instance_exhausted(self) -> bool:
+        return all(self.end_ptr == ptr for ptr in self.child_pointers)
+
+    def reset(self):
+        self._datapipe_iterator = iter(self.main_datapipe)
+        self.buffer = deque()
+        self.child_pointers = [0] * self.num_instances
+        self.slowest_ptr = 0
+        self.leading_ptr = 0
+        self.end_ptr = None
+
+class _ChildDataPipe(IterDataPipe):
+    r""" :class:`_ChildDataPipe`.
+
+        Iteratable Datapipe that is a child of a main DataPipe. The instance of this class
+        will pass its instance_id to get the next value from its main DataPipe.
+        args:
+            main_datapipe: Main DataPipe with a method 'get_next_element_by_instance(instance_id)'
+            instance_id: integer identifier of this instance
+    """
+    def __init__(self, main_datapipe, instance_id: int):
+        required_attrs = ["get_next_element_by_instance", "is_instance_started", "is_every_instance_exhausted", "reset"]
+        required_ops = [getattr(main_datapipe, attr) for attr in required_attrs]
+        if any(not callable(op) for op in required_ops):
+            raise NotImplementedError(f"Main Datapipe must have methods {required_attrs} implemented.")
+        self.main_datapipe = main_datapipe
+        self.instance_id = instance_id
+
+    def __iter__(self):
+        if self.main_datapipe.is_instance_started(self.instance_id):  # Only reset if the DataPipe started to read
+            if not self.main_datapipe.is_every_instance_exhausted():
+                warnings.warn("Some child DataPipes are not exhausted when __iter__ is called. We are resetting "
+                              "the buffer and each child DataPipe will read from the start again.", UserWarning)
+            self.main_datapipe.reset()
+        # We want to separate the code for reset and yield, so that 'reset' exeutes before __next__ is called
+        return self.get_generator_by_instance(self.instance_id)
+
+    def get_generator_by_instance(self, instance_id: int):
+        yield from self.main_datapipe.get_next_element_by_instance(self.instance_id)
 
 
 @functional_datapipe('demux')

From 0deb7a0bc08bf4e65f346f2956de4e5227f3a12e Mon Sep 17 00:00:00 2001
From: Kevin Tse <ktse@fb.com>
Date: Tue, 31 Aug 2021 08:07:23 -0700
Subject: [PATCH 387/530] [DataPipe] implementing demux() (#63650)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63650

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30493944

Pulled By: NivekT

fbshipit-source-id: 0aa06dee8c7fb1744975b8f6a0694b90c11ef80d
---
 test/test_datapipe.py                        | 99 ++++++++++++++++++++
 torch/utils/data/datapipes/iter/__init__.py  |  2 +
 torch/utils/data/datapipes/iter/combining.py | 94 +++++++++++++++++--
 3 files changed, 187 insertions(+), 8 deletions(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index 842e4424e9169..b6e3513622e13 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -690,6 +690,105 @@ def test_fork_datapipe(self):
         self.assertEqual(list(range(10)), list(dp3))  # dp3 has to read from the start again
 
 
+    def test_demux_datapipe(self):
+        input_dp = IDP(range(10))
+
+        # Test Case: split into 2 DataPipes and output them one at a time
+        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2)
+        output1, output2 = list(dp1), list(dp2)
+        self.assertEqual(list(range(0, 10, 2)), output1)
+        self.assertEqual(list(range(1, 10, 2)), output2)
+
+        # Test Case: split into 2 DataPipes and output them together
+        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2)
+        output = []
+        for n1, n2 in zip(dp1, dp2):
+            output.append((n1, n2))
+        self.assertEqual([(i, i + 1) for i in range(0, 10, 2)], output)
+
+        # Test Case: values of the same classification are lumped together, and buffer_size = 3 being too small
+        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: 0 if x >= 5 else 1, buffer_size=4)
+        it1 = iter(dp1)
+        with self.assertRaises(BufferError):
+            next(it1)  # Buffer raises because first 5 elements all belong to the a different child
+
+        # Test Case: values of the same classification are lumped together, and buffer_size = 5 is just enough
+        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: 0 if x >= 5 else 1, buffer_size=5)
+        output1, output2 = list(dp1), list(dp2)
+        self.assertEqual(list(range(5, 10)), output1)
+        self.assertEqual(list(range(0, 5)), output2)
+
+        # Test Case: classifer returns a value outside of [0, num_instance - 1]
+        dp = input_dp.demux(num_instances=1, classifier_fn=lambda x: x % 2)
+        it = iter(dp[0])
+        with self.assertRaises(ValueError):
+            next(it)
+            next(it)
+
+        # Test Case: DataPipe doesn't reset when it has not been read
+        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2)
+        i1 = iter(dp1)
+        output2 = []
+        i = 0
+        for i, n2 in enumerate(dp2):
+            output2.append(n2)
+            if i == 4:
+                i1 = iter(dp1)
+        self.assertEqual(list(range(1, 10, 2)), output2)
+
+        # Test Case: DataPipe reset when some of it has been read
+        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2)
+        output1, output2 = [], []
+        for n1, n2 in zip(dp1, dp2):
+            output1.append(n1)
+            output2.append(n2)
+            if n1 == 4:
+                break
+        with warnings.catch_warnings(record=True) as wa:
+            i1 = iter(dp1)  # Reset all child DataPipes
+            self.assertEqual(len(wa), 1)
+            self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted")
+        for n1, n2 in zip(dp1, dp2):
+            output1.append(n1)
+            output2.append(n2)
+        self.assertEqual([0, 2, 4] + list(range(0, 10, 2)), output1)
+        self.assertEqual([1, 3, 5] + list(range(1, 10, 2)), output2)
+
+        # Test Case: DataPipe reset, even when not all child DataPipes are exhausted
+        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2)
+        output1 = list(dp1)
+        self.assertEqual(list(range(0, 10, 2)), output1)
+        with warnings.catch_warnings(record=True) as wa:
+            self.assertEqual(list(range(0, 10, 2)), list(dp1))  # Reset even when dp2 is not read
+            self.assertEqual(len(wa), 1)
+            self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted")
+        output2 = []
+        for i, n2 in enumerate(dp2):
+            output2.append(n2)
+            if i == 1:
+                self.assertEqual(list(range(1, 5, 2)), output2)
+                with warnings.catch_warnings(record=True) as wa:
+                    self.assertEqual(list(range(0, 10, 2)), list(dp1))  # Can reset even when dp2 is partially read
+                    self.assertEqual(len(wa), 1)
+                    self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted")
+                break
+        output2 = list(dp2)  # output2 has to read from beginning again
+        self.assertEqual(list(range(1, 10, 2)), output2)
+
+        # Test Case: drop_none = True
+        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2 if x % 5 != 0 else None,
+                                  drop_none=True)
+        self.assertEqual([2, 4, 6, 8], list(dp1))
+        self.assertEqual([1, 3, 7, 9], list(dp2))
+
+        # Test Case: drop_none = False
+        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2 if x % 5 != 0 else None,
+                                  drop_none=False)
+        it1 = iter(dp1)
+        with self.assertRaises(ValueError):
+            next(it1)
+
+
     def test_map_datapipe(self):
         input_dp = IDP(range(10))
 
diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index b460d4d77cf5a..d4baef788ecca 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -8,7 +8,9 @@
 )
 from torch.utils.data.datapipes.iter.combining import (
     ConcaterIterDataPipe as Concater,
+    DemultiplexerIterDataPipe as Demultiplexer,
     ForkerIterDataPipe as Forker,
+    MultiplexerIterDataPipe as Multiplexer,
     ZipperIterDataPipe as Zipper,
 )
 from torch.utils.data.datapipes.iter.filelister import (
diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index 85b37324992fc..f44db96c15743 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -1,8 +1,7 @@
-import functools
 import warnings
 
 from torch.utils.data import IterDataPipe, functional_datapipe
-from typing import Any, Iterator, Optional, Sized, Tuple, TypeVar, Deque
+from typing import Any, Callable, Iterator, List, Optional, Sized, Tuple, TypeVar, Deque
 from collections import deque
 
 T_co = TypeVar('T_co', covariant=True)
@@ -163,14 +162,93 @@ def get_generator_by_instance(self, instance_id: int):
 
 @functional_datapipe('demux')
 class DemultiplexerIterDataPipe(IterDataPipe):
+    r""" :class:`DemultiplexerIterDataPipe`.
 
-    def __new__(cls, datapipe, instances, classifier_fn):
-        result = []
-        buffer = list(datapipe)
+        Iterable DataPipe to split the input DataPipe into multiple child DataPipes, using the given
+        classification function. A list of the child DataPipes is returned from this operation.
+        args:
+            datapipe: Iterable DataPipe being filtered
+            num_instances: number of instances of the DataPipe to create
+            classifier_fn: a function that maps values to an integer within the range [0, num_instances - 1] or None
+            drop_none: defaults to False, if True, the function will skip over elements classified as None
+            buffer_size: this defines the maximum number of inputs that the buffer can hold across all child
+                DataPipes while waiting for their values to be yielded
+    """
+    def __new__(cls, datapipe: IterDataPipe, num_instances: int,
+                classifier_fn: Callable[[T_co], int], drop_none: bool = False, buffer_size: int = 1000):
+        container = _DemultiplexerIterDataPipe(datapipe, num_instances, classifier_fn, drop_none, buffer_size)
+        return [_ChildDataPipe(container, i) for i in range(num_instances)]
+
+
+class _DemultiplexerIterDataPipe(IterDataPipe):
+    r""" :class:`_DemultiplexerIterDataPipe`.
+
+        Container to hold instance-specific information on behalf of DemultiplexerIterDataPipe. It tracks
+        the state of its child DataPipes, maintains the buffer, classifies and yields the next correct value
+        as requested by the child DataPipes.
+    """
+
+    def __init__(self, datapipe: IterDataPipe[T_co], num_instances: int,
+                 classifier_fn: Callable[[T_co], int], drop_none: bool, buffer_size: int):
+        self.main_datapipe = datapipe
+        self._datapipe_iterator: Optional[Iterator[Any]] = None
+        self.num_instances = num_instances
+        self.max_buffer_size = buffer_size
+        self.current_buffer_usage = 0
+        self.child_buffers: List[Deque[T_co]] = [deque() for _ in range(num_instances)]
+        self.instance_started: List[bool] = [False] * num_instances
+        self.classifier_fn = classifier_fn
+        self.drop_none = drop_none
+        self.main_datapipe_exhausted = False
+
+    def _find_next(self, instance_id: int) -> T_co:
+        while True:
+            if self._datapipe_iterator is None:
+                raise ValueError("_datapipe_iterator has not been set, likely because this private method is called directly "
+                                 "without invoking get_next_element_by_instance() first.")
+            value = next(self._datapipe_iterator)
+            classification = self.classifier_fn(value)
+            if classification is None and self.drop_none:
+                continue
+            if classification is None or classification >= self.num_instances or classification < 0:
+                raise ValueError(f"Output of the classification fn should be between 0 and {self.num_instances - 1}. " +
+                                 f"{classification} is returned.")
+            if classification == instance_id:
+                return value
+            self.child_buffers[classification].append(value)
+            self.current_buffer_usage += 1
+            if self.current_buffer_usage > self.max_buffer_size:
+                raise BufferError(
+                    f"DemultiplexerIterDataPipe buffer overflow, buffer size {self.max_buffer_size} is insufficient.")
 
-        def filter_fn(classifier_fn, i, x):
-            return classifier_fn(x) == i
-        return [IterateBuffer(buffer).filter(functools.partial(filter_fn, classifier_fn, i)) for i in range(instances)]
+    def get_next_element_by_instance(self, instance_id: int):
+        if self._datapipe_iterator is None:
+            self._datapipe_iterator = iter(self.main_datapipe)
+        stop = False
+        self.instance_started[instance_id] = True
+        while not stop:
+            if self.child_buffers[instance_id]:
+                self.current_buffer_usage -= 1
+                yield self.child_buffers[instance_id].popleft()
+            else:
+                try:
+                    yield self._find_next(instance_id)
+                except StopIteration:
+                    stop = True
+                    self.main_datapipe_exhausted = True
+
+    def is_instance_started(self, instance_id: int) -> bool:
+        return self.instance_started[instance_id]
+
+    def is_every_instance_exhausted(self) -> bool:
+        return self.main_datapipe_exhausted and all(not child_buffer for child_buffer in self.child_buffers)
+
+    def reset(self):
+        self._datapipe_iterator = iter(self.main_datapipe)
+        self.current_buffer_usage = 0
+        self.child_buffers = [deque() for _ in range(self.num_instances)]
+        self.instance_started = [False] * self.num_instances
+        self.main_datapipe_exhausted = False
 
 @functional_datapipe('mux')
 class MultiplexerIterDataPipe(IterDataPipe):

From 0ef8760bf6b3e8098ef42df60f1e451234151f32 Mon Sep 17 00:00:00 2001
From: Kevin Tse <ktse@fb.com>
Date: Tue, 31 Aug 2021 08:07:23 -0700
Subject: [PATCH 388/530] [DataPipe] implementing __len__ for fork (no valid
 length for demux) (#64215)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64215

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30648672

Pulled By: NivekT

fbshipit-source-id: 4780f2f6a79ae15a4009092475e7d92f96dd09a2
---
 test/test_datapipe.py                        | 13 +++++++++++++
 torch/utils/data/datapipes/iter/combining.py | 15 ++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index b6e3513622e13..4e37f41565226 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -689,6 +689,12 @@ def test_fork_datapipe(self):
                 break
         self.assertEqual(list(range(10)), list(dp3))  # dp3 has to read from the start again
 
+        # Test Case: Each DataPipe inherits the source datapipe's length
+        dp1, dp2, dp3 = input_dp.fork(num_instances=3)
+        self.assertEqual(len(input_dp), len(dp1))
+        self.assertEqual(len(input_dp), len(dp2))
+        self.assertEqual(len(input_dp), len(dp3))
+
 
     def test_demux_datapipe(self):
         input_dp = IDP(range(10))
@@ -788,6 +794,13 @@ def test_demux_datapipe(self):
         with self.assertRaises(ValueError):
             next(it1)
 
+        # Test Case: __len__ not implemented
+        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2)
+        with self.assertRaises(TypeError):
+            len(dp1)  # It is not implemented as we do not know length for each child in advance
+        with self.assertRaises(TypeError):
+            len(dp2)
+
 
     def test_map_datapipe(self):
         input_dp = IDP(range(10))
diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index f44db96c15743..a837c5bb101c7 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -61,7 +61,8 @@ class ForkerIterDataPipe(IterDataPipe):
     r""" :class:`ForkerIterDataPipe`.
 
         Iterable DataPipe to create multiple instances of the same Iterable DataPipe.
-        args:
+
+        Args:
             datapipe: Iterable DataPipe being copied
             num_instances: number of instances of the datapipe to create
             buffer_size: this restricts how far ahead the leading child DataPipe
@@ -90,6 +91,9 @@ def __init__(self, datapipe: IterDataPipe, num_instances: int, buffer_size: int
         self.leading_ptr = 0
         self.end_ptr: Optional[int] = None
 
+    def __len__(self):
+        return len(self.main_datapipe)
+
     def get_next_element_by_instance(self, instance_id: int):
         if self._datapipe_iterator is None:
             self._datapipe_iterator = iter(self.main_datapipe)
@@ -135,7 +139,8 @@ class _ChildDataPipe(IterDataPipe):
 
         Iteratable Datapipe that is a child of a main DataPipe. The instance of this class
         will pass its instance_id to get the next value from its main DataPipe.
-        args:
+
+        Args:
             main_datapipe: Main DataPipe with a method 'get_next_element_by_instance(instance_id)'
             instance_id: integer identifier of this instance
     """
@@ -156,6 +161,9 @@ def __iter__(self):
         # We want to separate the code for reset and yield, so that 'reset' exeutes before __next__ is called
         return self.get_generator_by_instance(self.instance_id)
 
+    def __len__(self):
+        return len(self.main_datapipe)
+
     def get_generator_by_instance(self, instance_id: int):
         yield from self.main_datapipe.get_next_element_by_instance(self.instance_id)
 
@@ -166,7 +174,8 @@ class DemultiplexerIterDataPipe(IterDataPipe):
 
         Iterable DataPipe to split the input DataPipe into multiple child DataPipes, using the given
         classification function. A list of the child DataPipes is returned from this operation.
-        args:
+
+        Args:
             datapipe: Iterable DataPipe being filtered
             num_instances: number of instances of the DataPipe to create
             classifier_fn: a function that maps values to an integer within the range [0, num_instances - 1] or None

From ca8dd296ee42fd68b8c9360d10916e02e009eeff Mon Sep 17 00:00:00 2001
From: Kushashwa Ravi Shrimali <kushashwaravishrimali@gmail.com>
Date: Tue, 31 Aug 2021 09:45:09 -0700
Subject: [PATCH 389/530] Add OpInfo for `nn.functional.cosine_similarity`
 (#62959)

Summary:
Please see https://github.com/facebookresearch/functorch/issues/78 and https://github.com/pytorch/pytorch/issues/54261.

Notes:

* Some redundant tests from `test_nn.py` have been removed. I'm unsure about precision checks if they can be removed as well.
* Broadcasting is also checked in the OpInfo for `cosine_similarity`.

cc: mruberry zou3519 Chillee

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62959

Reviewed By: heitorschueroff

Differential Revision: D30520176

Pulled By: zou3519

fbshipit-source-id: 14e902eb4bcce875edab28a1669a2ea021052b9b
---
 test/test_nn.py                               | 20 --------------
 .../_internal/common_methods_invocations.py   | 27 +++++++++++++++++++
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 96321ba183be0..5008c7256acf7 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -9617,25 +9617,6 @@ def test_huber_loss_zero_delta():
         test_huber_loss_zero_delta()
 
     def test_cosine_similarity(self):
-        input1 = torch.randn(4, 4, requires_grad=True)
-        input2 = torch.randn(4, 4, requires_grad=True)
-        self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y), (input1, input2)))
-
-        input1 = torch.randn(4, 5, 6, requires_grad=True)
-        input2 = torch.randn(4, 5, 6, requires_grad=True)
-        self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y, dim=0), (input1, input2)))
-        self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y, dim=-1), (input1, input2)))
-
-        input1 = torch.randn((), requires_grad=True)
-        input2 = torch.randn((), requires_grad=True)
-        self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y, dim=0), (input1, input2)))
-        self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y, dim=-1), (input1, input2)))
-
-        # Check broadcasting
-        input1 = torch.randn(2, 1, 3, requires_grad=True)
-        input2 = torch.randn(1, 2, 3, requires_grad=True)
-        self.assertTrue(gradcheck(lambda x, y: F.cosine_similarity(x, y, dim=-1), (input1, input2)))
-
         # Check cosine_similarity input/output shapes
         input_size = (1, 3, 2, 1)
         expected_size = (1, 2, 1)
@@ -9662,7 +9643,6 @@ def test_cosine_similarity(self):
         with self.assertRaises(RuntimeError):
             F.cosine_similarity(input1, input2)
 
-
         # Check type promotion, issue #61454
         input = torch.tensor(12.)
         out = F.cosine_similarity(input.to(torch.int8), input, dim=-1)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index e7d93807511a5..3579310dc68c9 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -1249,6 +1249,26 @@ def sample_inputs_linalg_norm(op_info, device, dtype, requires_grad):
                             dim=(0, 1))))
         return inputs
 
+def sample_inputs_cosine_similarity(op_info, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    # Ordered as input_shape, dict of dim and eps
+    cases: Tuple[tuple, dict] = (  # type: ignore[assignment]
+        ((S, S), {'dim': 1}),
+        ((S, 2), {'dim': -1}),
+        ((S,), {'dim': 0, 'eps': 0.5}),
+        ((), {'dim': 0}),
+        ((S, S, M), {'dim': 2}),
+        ((S, S), {})
+    )
+
+    def generator():
+        for input_shape, kwargs in cases:
+            yield SampleInput(make_arg(input_shape), args=(make_arg(input_shape),), kwargs=kwargs)
+        # Test for Broadcasting
+        yield SampleInput(make_arg((1, 2, 3)), args=(make_arg((2, 1, 3)),), kwargs={'dim': -1})
+
+    return list(generator())
 
 def sample_inputs_nn_activation_relu(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -7175,6 +7195,13 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                # FIXME: aminmax does not check for safe casting to output
                SkipInfo('TestCommon', 'test_out'),
            )),
+    OpInfo('nn.functional.cosine_similarity',
+           aten_name="cosine_similarity",
+           dtypes=floating_types_and(torch.bfloat16),
+           dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
+           supports_out=False,
+           supports_forward_ad=True,
+           sample_inputs_func=sample_inputs_cosine_similarity),
     OpInfo('nn.functional.adaptive_avg_pool2d',
            dtypes=floating_types(),
            dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16),

From b9275a40034377a99f245cde36f63462a9dc0995 Mon Sep 17 00:00:00 2001
From: Raghuraman Krishnamoorthi <raghuraman@fb.com>
Date: Tue, 31 Aug 2021 09:45:28 -0700
Subject: [PATCH 390/530] [ao][docs] Add description of qconfig and qengine to
 quantization page (#63582)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63582

Current quantization docs do not define qconfig and qengine. Added text to define these concepts before they are used.
ghstack-source-id: 137051719

Test Plan: Imported from OSS

Reviewed By: HDCharles

Differential Revision: D30658656

fbshipit-source-id: a45a0fcdf685ca1c3f5c3506337246a430f8f506
---
 docs/source/quantization.rst | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst
index eb6c74c72facd..7053ca68d920e 100644
--- a/docs/source/quantization.rst
+++ b/docs/source/quantization.rst
@@ -35,6 +35,13 @@ that perform all or part of the computation in lower precision. Higher-level
 APIs are provided that incorporate typical workflows of converting FP32 model
 to lower precision with minimal accuracy loss.
 
+Quantization requires users to be aware of three concepts:
+
+#. Quantization Config (Qconfig): Specifies how weights and activations are to be quantized. Qconfig is needed to create a quantized model.
+#. Backend: Refers to kernels that support quantization, usually with different numerics.
+#. Quantization engine (torch.backends.quantization.engine): When a quantized model is executed, the qengine specifies which backend is to be used for execution. It is important to ensure that the qengine is consistent with the Qconfig.
+
+
 Natively supported backends
 ---------------------------
 
@@ -45,7 +52,8 @@ Today, PyTorch supports the following backends for running quantized operators e
 * ARM CPUs (typically found in mobile/embedded devices), via
   `qnnpack` (`<https://github.com/pytorch/QNNPACK>`_).
 
-The corresponding implementation is chosen automatically based on the PyTorch build mode.
+The corresponding implementation is chosen automatically based on the PyTorch build mode, though users
+have the option to override this by setting `torch.backends.quantization.engine` to `fbgemm` or `qnnpack`.
 
 .. note::
 
@@ -58,7 +66,7 @@ The corresponding implementation is chosen automatically based on the PyTorch bu
 
 
 When preparing a quantized model, it is necessary to ensure that qconfig
-and the qengine used for quantized computations match the backend on which
+and the engine used for quantized computations match the backend on which
 the model will be executed. The qconfig controls the type of observers used
 during the quantization passes. The qengine controls whether `fbgemm` or
 `qnnpack` specific packing function is used when packing weights for linear

From 83e28a7d281c91a6d1a12b86bd5fb212dd424a85 Mon Sep 17 00:00:00 2001
From: Saketh Are <saketh.are@gmail.com>
Date: Tue, 31 Aug 2021 10:59:57 -0700
Subject: [PATCH 391/530] Use stacklevel for floordiv deprecation warnings
 (#64034)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/60548

`Tensor.__floordiv__` was indirectly deprecated by deprecation of `torch.floor_divide` (see https://github.com/pytorch/pytorch/issues/43874). Deprecating it directly provides clearer feedback.

Repro:
```
import torch
x = torch.tensor(0)
x // 1
```

Before this change, a deprecation warning was triggered within the C++ implementation of floor_divide:
```
UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:571.)
  return torch.floor_divide(self, other)
```

After this change, the warning instead cites the user's offending line of Python code:
```
UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
  x // 1
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64034

Reviewed By: mruberry

Differential Revision: D30658010

Pulled By: saketh-are

fbshipit-source-id: b0e6c5008d741897509d102f4a89efb47de4aa2a
---
 test/test_binary_ufuncs.py |  4 ++--
 test/test_sparse.py        |  4 ++--
 torch/_tensor.py           | 14 ++++++++++++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py
index 1e9e804ab86d1..2695ab6a86115 100644
--- a/test/test_binary_ufuncs.py
+++ b/test/test_binary_ufuncs.py
@@ -1622,7 +1622,7 @@ def test_floor_divide_tensor(self, device, dtype):
         x = torch.randn(10, device=device).mul(30).to(dtype)
         y = torch.arange(1, 11, dtype=dtype, device=device)
 
-        with self.assertWarnsOnceRegex(UserWarning, "floor_divide"):
+        with self.assertWarnsOnceRegex(UserWarning, "__floordiv__"):
             z = x // y
         z_alt = torch.trunc(x.double() / y.double()).to(dtype)
 
@@ -1634,7 +1634,7 @@ def test_floor_divide_tensor(self, device, dtype):
     def test_floor_divide_scalar(self, device, dtype):
         x = torch.randn(100, device=device).mul(10).to(dtype)
 
-        with self.assertWarnsOnceRegex(UserWarning, "floor_divide"):
+        with self.assertWarnsOnceRegex(UserWarning, "__floordiv__"):
             z = x // 3
         z_alt = torch.tensor([math.trunc(v.item() / 3.) for v in x], dtype=x.dtype, device=device)
 
diff --git a/test/test_sparse.py b/test/test_sparse.py
index aaf045c4b0ea0..8fa32edbc5e8a 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -1562,7 +1562,7 @@ def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v, dtype, device,
         self.assertEqual(self.safeToDense(y1), expected)
         self.assertEqual(self.safeToDense(y2), expected)
 
-        with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'):
+        with self.assertWarnsOnceRegex(UserWarning, '__floordiv__'):
             y1 = x1 // 37.5
         y2 = x1.clone()
         with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'):
@@ -2915,7 +2915,7 @@ def test_div_by_sparse_error(self, device):
                                / torch.tensor(1., device=device).to_sparse())
 
     def test_floor_divide_by_sparse_error(self, device):
-        self.assertRaisesRegex(RuntimeError, 'Sparse floor division requires',
+        self.assertRaisesRegex(RuntimeError, 'Sparse division requires',
                                lambda: torch.tensor(1., device=device).to_sparse()
                                // torch.tensor(1., device=device).to_sparse())
 
diff --git a/torch/_tensor.py b/torch/_tensor.py
index b4cee9aa2a32c..e7bc4ed9165a2 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -582,11 +582,21 @@ def __rpow__(self, other):
 
     @_wrap_type_error_to_not_implemented
     def __floordiv__(self, other):
-        return torch.floor_divide(self, other)
+        warnings.warn("__floordiv__ is deprecated, and its behavior will change in a future version of pytorch. "
+                      "It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). "
+                      "This results in incorrect rounding for negative values. "
+                      "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), "
+                      "or for actual floor division, use torch.div(a, b, rounding_mode='floor').", stacklevel=3)
+        return torch.div(self, other, rounding_mode='trunc')
 
     @_wrap_type_error_to_not_implemented
     def __rfloordiv__(self, other):
-        return torch.floor_divide(other, self)
+        warnings.warn("__rfloordiv__ is deprecated, and its behavior will change in a future version of pytorch. "
+                      "It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). "
+                      "This results in incorrect rounding for negative values. "
+                      "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), "
+                      "or for actual floor division, use torch.div(a, b, rounding_mode='floor').", stacklevel=3)
+        return torch.div(other, self, rounding_mode='trunc')
 
     @_wrap_type_error_to_not_implemented
     def __rlshift__(self, other):

From 845bc89811f59822fe585cf44e774857adefcff7 Mon Sep 17 00:00:00 2001
From: Shiyan Deng <dsy842974287@fb.com>
Date: Tue, 31 Aug 2021 11:29:07 -0700
Subject: [PATCH 392/530] [fx2trt] Add acc_ops.sign and converter for it
 (#63876)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63876

Add `acc_ops.sign` which maps from `torch.sign`.

Add a plugin (not support dynamic shape currently) for `acc_ops.sign`. The plugin calls `at::sign` directly.

Test Plan: buck test mode/opt -c python.package_style=inplace -c fbcode.nvcc_arch=a100 caffe2/torch/fb/fx2trt:test_unary_ops

Reviewed By: yinghai

Differential Revision: D30518081

fbshipit-source-id: a0b9e6c30deac0b04b8cb09a162579e229985330
---
 .../fx2trt/converters/acc_ops_converters.py    |  1 -
 torch/fx/experimental/fx2trt/fx2trt.py         | 18 +++++++++++-------
 torch/fx/experimental/fx_acc/acc_ops.py        |  6 ++++++
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
index ba370b2b067d4..e101b6b7f22ff 100644
--- a/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
+++ b/torch/fx/experimental/fx2trt/converters/acc_ops_converters.py
@@ -1098,7 +1098,6 @@ def acc_ops_clamp(network, target, args, kwargs, name):
 
     return input_val
 
-
 @tensorrt_converter(acc_ops.tuple_construct)
 def acc_ops_tuple_construct(network, target, args, kwargs, name):
     return kwargs["tensors"]
diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py
index 72497a7d2aafc..f1d17e701790d 100644
--- a/torch/fx/experimental/fx2trt/fx2trt.py
+++ b/torch/fx/experimental/fx2trt/fx2trt.py
@@ -4,6 +4,7 @@
 import tensorrt as trt
 import torch
 import torch.fx
+from torch.fx.node import _get_qualified_name
 
 
 # Borrowed from torch2trt
@@ -226,14 +227,15 @@ def __init__(
         else:
             self.network = self.builder.create_network()
 
+        missing_ops = self.validate_conversion()
+        if missing_ops:
+            warnings.warn("Interpretation will fail due to missing operations \n"
+                          + "\n".join(f"{i}" for i in missing_ops))
+
         self.optimization_profiles: Optional[List] = None
         self.input_specs = input_specs
         self.input_specs_iter = 0
         self.validate_input_specs()
-        missing_ops = self.validate_conversion
-        if not missing_ops:
-            warnings.warn("Interpretation may fail due to missing operations \n"
-                          + "\n".join(f"{i}" for i in missing_ops))
         self._cur_node_name: Optional[str] = None
         self._input_names: List[str] = []
         self._output_names: List[str] = []
@@ -299,13 +301,15 @@ def validate_conversion(self):
         missing_converter = set()
 
         for node in self.module.graph.nodes:
-            if node.op in ["call_function", "call_method"] and not CONVERTERS.get(node.target):
-                missing_converter.add(f"{node.op} {node.target}")
+            if node.op == "call_function" and not CONVERTERS.get(node.target):
+                missing_converter.add(f"{node.op} {_get_qualified_name(node.target)}")
+            elif node.op == "call_method" and not CONVERTERS.get(node.target):
+                missing_converter.add(f"{node.op} torch.Tensor.{node.target}")
             elif node.op == "call_module":
                 submod = self.fetch_attr(node.target)
                 submod_type = getattr(submod, "_base_class_origin", type(submod))
                 if not CONVERTERS.get(submod_type):
-                    missing_converter.add(f"{node.op} {submod_type}")
+                    missing_converter.add(f"{node.op} {torch.typename(submod_type)}")
 
         return missing_converter
 
diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py
index 1b4b4690f732d..b10d35edd5baa 100644
--- a/torch/fx/experimental/fx_acc/acc_ops.py
+++ b/torch/fx/experimental/fx_acc/acc_ops.py
@@ -95,6 +95,12 @@ def avg_pool2d(
     return nn.functional.avg_pool2d(**locals())
 
 
+@register_acc_op_mapping(op_and_target=("call_function", torch.sign))
+@register_acc_op
+def sign(*, input):
+    return torch.sign(input)
+
+
 @register_acc_op
 def size(*, input):
     return input.size()

From 3a46edb8d8fa1fdb120102a9af4517c08864c580 Mon Sep 17 00:00:00 2001
From: Vasiliy Kuznetsov <vasiliy@fb.com>
Date: Tue, 31 Aug 2021 12:09:59 -0700
Subject: [PATCH 393/530] ns for fx: make layer types more readable (#64270)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64270

Before this PR, layer types were populated by doing
`str(module_instance)` and `str(function)`. This resulted
in moderately readable strings for modules, and poorly readable
strings for functions.

This PR switches the logic to use `torch.typename` utility instead.
The results are significantly more readable.

Example function type:

```
# before
'<built-in method linear of PyCapsule object at 0x7fe9b20ce7b0>'

# after
'torch._ops.quantized.PyCapsule.linear'
```

Example module type:

```
# before
"<class 'torch.nn.quantized.modules.conv.Conv2d'>"

# after
'torch.nn.quantized.modules.conv.Conv2d'
```

Test Plan:
Manually inspect NS results for modules and functions, verify they are
more readable.

Manually inspect NS results for modules and functions, verify they are
more readable.

Imported from OSS

Differential Revision:
D30669545
D30669545

Reviewed By: jerryzh168

Pulled By: vkuzo

fbshipit-source-id: 60959e5cafa0a4992b083bf99f5d8260f9acdac0
---
 torch/quantization/ns/utils.py        | 6 +++---
 torch/quantization/ns/weight_utils.py | 6 ++++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/torch/quantization/ns/utils.py b/torch/quantization/ns/utils.py
index 678f60a00c8cc..62397d0de0f94 100644
--- a/torch/quantization/ns/utils.py
+++ b/torch/quantization/ns/utils.py
@@ -317,15 +317,15 @@ def get_arg_indices_of_inputs_to_log(node: Node) -> List[int]:
 def get_target_type_str(node: Node, gm: GraphModule) -> str:
     """
     Returns a string representation of the type of the function or module
-    pointed to by this node, or '' for other op types.
+    pointed to by this node, or '' for other node types.
     """
     target_type = ""
     if node.op in ("call_function", "call_method"):
-        target_type = str(node.target)
+        target_type = torch.typename(node.target)
     elif node.op == "call_module":
         assert isinstance(node.target, str)
         target_mod = getattr_from_fqn(gm, node.target)
-        target_type = str(type(target_mod))
+        target_type = torch.typename(target_mod)
     return target_type
 
 
diff --git a/torch/quantization/ns/weight_utils.py b/torch/quantization/ns/weight_utils.py
index 724cdc7a40ae6..36e183efe1d8e 100644
--- a/torch/quantization/ns/weight_utils.py
+++ b/torch/quantization/ns/weight_utils.py
@@ -231,6 +231,8 @@ def extract_weight_from_node(
         op_to_type_to_weight_extraction_fn = get_op_to_type_to_weight_extraction_fn()
 
     ref_node_type = get_target_type_str(node, gm)
+    # for extracting weights, these are always the same
+    prev_node_type = ref_node_type
 
     if node.op == 'call_function':
         function_mapping = op_to_type_to_weight_extraction_fn['call_function']
@@ -241,7 +243,7 @@ def extract_weight_from_node(
                     'type': res_type,
                     'values': [weight],
                     'prev_node_name': node.name,
-                    'prev_node_target_type': str(node.target),
+                    'prev_node_target_type': prev_node_type,
                     'ref_node_name': node.name,
                     'ref_node_target_type': ref_node_type,
                     'index_within_arg': 0,
@@ -261,7 +263,7 @@ def extract_weight_from_node(
                     'type': res_type,
                     'values': [weight],
                     'prev_node_name': node.name,
-                    'prev_node_target_type': str(type(mod)),
+                    'prev_node_target_type': prev_node_type,
                     'ref_node_name': node.name,
                     'ref_node_target_type': ref_node_type,
                     'index_within_arg': 0,

From 347ef69529ae2bc38878ed97345b8fc8039dfa56 Mon Sep 17 00:00:00 2001
From: Raghuraman Krishnamoorthi <raghuraman@fb.com>
Date: Tue, 31 Aug 2021 12:22:13 -0700
Subject: [PATCH 394/530] [ao][docs] Clarify operator support for quantization
 (#63270)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63270

Add table to quantization main page showing supported modules
for static and dynamic quantization.
ghstack-source-id: 137087204

Test Plan: Imported from OSS

Reviewed By: HDCharles

Differential Revision: D30658654

fbshipit-source-id: a82c998e1db6370596d5b0ca4c7cc96c1c90f30e
---
 docs/source/quantization.rst | 41 +++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst
index 7053ca68d920e..a86368ef8d660 100644
--- a/docs/source/quantization.rst
+++ b/docs/source/quantization.rst
@@ -147,16 +147,13 @@ The following table compares the differences between Eager Mode Quantization and
 +-----------------+-------------------+-------------------+
 
 
-Eager Mode Quantization
-^^^^^^^^^^^^^^^^^^^^^^^
-
-There are three types of quantization supported in Eager Mode Quantization:
+There are three types of quantization supported:
 
 1. dynamic quantization (weights quantized with activations read/stored in
    floating point and quantized for compute.)
 2. static quantization (weights quantized, activations quantized, calibration
    required post training)
-3. quantization aware training (weights quantized, activations quantized,
+3. static quantization aware training (weights quantized, activations quantized,
    quantization numerics modeled during training)
 
 Please see our `Introduction to Quantization on Pytorch
@@ -164,6 +161,40 @@ Please see our `Introduction to Quantization on Pytorch
 for a more comprehensive overview of the tradeoffs between these quantization
 types.
 
+Operator coverage varies between dynamic and static quantization and is captured in the table below.
+Note that for FX quantization, the corresponding functionals are also supported.
+
++---------------------------+-------------------+--------------------+
+|                           |Static             | Dynamic            |
+|                           |Quantization       | Quantization       |
++---------------------------+-------------------+--------------------+
+| | nn.Linear               | | Y               | | Y                |
+| | nn.Conv1d/2d/3d         | | Y               | | N                |
++---------------------------+-------------------+--------------------+
+| | nn.LSTM                 | | N               | | Y                |
+| | nn.GRU                  | | N               | | Y                |
++---------------------------+-------------------+--------------------+
+| | nn.RNNCell              | | N               | | Y                |
+| | nn.GRUCell              | | N               | | Y                |
+| | nn.LSTMCell             | | N               | | Y                |
++---------------------------+-------------------+--------------------+
+|nn.EmbeddingBag            | Y (activations    |                    |
+|                           | are in fp32)      | Y                  |
++---------------------------+-------------------+--------------------+
+|nn.Embedding               | Y                 | N                  |
++---------------------------+-------------------+--------------------+
+|nn.MultiheadAttention      |Not Supported      | Not supported      |
++---------------------------+-------------------+--------------------+
+|Activations                |Broadly supported  | Un-changed,        |
+|                           |                   | computations       |
+|                           |                   | stay in fp32       |
++---------------------------+-------------------+--------------------+
+
+
+Eager Mode Quantization
+^^^^^^^^^^^^^^^^^^^^^^^
+
+
 Dynamic Quantization
 ~~~~~~~~~~~~~~~~~~~~
 

From 555171a273ef7bbea65b517508141192c83c95c5 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Tue, 31 Aug 2021 12:50:11 -0700
Subject: [PATCH 395/530] .circleci: Remove migrated jobs, move docs builds
 (#64222)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64222

Removes both backwards_compat as well as docs_test from the general
gcc5.4 config and moves the docs build from being run on every PR to
only being run on master.

We can remove docs builds when we migrate the docs push job (including
all secrets associated with that)

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D30650953

Pulled By: seemethere

fbshipit-source-id: ac11da6a551a6c81f3dc1d47fd81846cbfe9975a
---
 .../cimodel/data/pytorch_build_definitions.py | 29 ++-----------------
 .circleci/config.yml                          | 28 ++++++++++--------
 2 files changed, 18 insertions(+), 39 deletions(-)

diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py
index d7b20158759d0..305bbb4d354bb 100644
--- a/.circleci/cimodel/data/pytorch_build_definitions.py
+++ b/.circleci/cimodel/data/pytorch_build_definitions.py
@@ -214,7 +214,7 @@ def gen_docs_configs(xenial_parent_config):
         HiddenConf(
             "pytorch_python_doc_build",
             parent_build=xenial_parent_config,
-            filters=gen_filter_dict(branches_list=r"/.*/",
+            filters=gen_filter_dict(branches_list=["master"],
                                     tags_list=RC_PATTERN),
         )
     )
@@ -230,7 +230,7 @@ def gen_docs_configs(xenial_parent_config):
         HiddenConf(
             "pytorch_cpp_doc_build",
             parent_build=xenial_parent_config,
-            filters=gen_filter_dict(branches_list=r"/.*/",
+            filters=gen_filter_dict(branches_list=["master"],
                                     tags_list=RC_PATTERN),
         )
     )
@@ -241,13 +241,6 @@ def gen_docs_configs(xenial_parent_config):
             branch="master",
         )
     )
-
-    configs.append(
-        HiddenConf(
-            "pytorch_doc_test",
-            parent_build=xenial_parent_config
-        )
-    )
     return configs
 
 
@@ -396,24 +389,6 @@ def instantiate_configs(only_slow_gradcheck):
         if cuda_version == "10.2" and python_version == "3.6" and not is_libtorch and not is_slow_gradcheck:
             c.dependent_tests = gen_dependent_configs(c)
 
-        if (
-            compiler_name == "gcc"
-            and compiler_version == "5.4"
-            and not is_libtorch
-            and not is_vulkan
-            and not is_pure_torch
-            and parallel_backend is None
-        ):
-            bc_breaking_check = Conf(
-                "backward-compatibility-check",
-                [],
-                is_xla=False,
-                restrict_phases=["test"],
-                is_libtorch=False,
-                is_important=True,
-                parent_build=c,
-            )
-            c.dependent_tests.append(bc_breaking_check)
 
         if (
             compiler_name != "clang"
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 1bb32b5cc0a3d..324e5fdd9b3cb 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7112,7 +7112,8 @@ workflows:
       - pytorch_python_doc_build:
           filters:
             branches:
-              only: /.*/
+              only:
+                - master
             tags:
               only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
           requires:
@@ -7132,7 +7133,8 @@ workflows:
       - pytorch_cpp_doc_build:
           filters:
             branches:
-              only: /.*/
+              only:
+                - master
             tags:
               only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
           requires:
@@ -7149,16 +7151,6 @@ workflows:
           name: pytorch_cpp_doc_push
           requires:
             - pytorch_cpp_doc_build
-      - pytorch_doc_test:
-          requires:
-            - pytorch_linux_xenial_py3_6_gcc5_4_build
-      - pytorch_linux_test:
-          name: pytorch_linux_backward_compatibility_check_test
-          requires:
-            - pytorch_linux_xenial_py3_6_gcc5_4_build
-          build_environment: "pytorch-linux-backward-compatibility-check-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-          resource_class: large
       - pytorch_linux_test:
           name: pytorch_linux_pytorch_linux_xenial_py3_6_gcc5_4_distributed_test
           requires:
@@ -9386,6 +9378,18 @@ workflows:
       - docker_build_job:
           name: "docker-pytorch-linux-xenial-py3.6-gcc7"
           image_name: "pytorch-linux-xenial-py3.6-gcc7"
+      - pytorch_linux_build:
+          name: pytorch_linux_xenial_py3_6_gcc5_4_build
+          requires:
+            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
+          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-build"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
+      - pytorch_python_doc_build:
+          requires:
+            - pytorch_linux_xenial_py3_6_gcc5_4_build
+      - pytorch_cpp_doc_build:
+          requires:
+            - pytorch_linux_xenial_py3_6_gcc5_4_build
       - pytorch_linux_build:
           name: pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_build
           requires:

From 1c2b5e59ae53ef3042ad1b5cf9aa275391c92971 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Tue, 31 Aug 2021 12:51:20 -0700
Subject: [PATCH 396/530] Remove ref to test_distributed_fork (#64197)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64197

Removes this line as test is gone.
ghstack-source-id: 136986275

Test Plan: CI

Reviewed By: walterddr

Differential Revision: D30642929

fbshipit-source-id: a0c7dfdfb35a4a7f7ec1b881dbea53d85136012c
---
 test/run_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/run_test.py b/test/run_test.py
index 77e7f150c16e9..d0871fa2a0d30 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -345,7 +345,6 @@
 ]
 
 DISTRIBUTED_TESTS = [
-    "distributed/test_distributed_fork",
     "distributed/test_distributed_spawn",
 ]
 

From 8d08b103be936d78d5d4ed90c0547aeccb8ce166 Mon Sep 17 00:00:00 2001
From: Michael Carilli <mcarilli@gmail.com>
Date: Tue, 31 Aug 2021 13:29:39 -0700
Subject: [PATCH 397/530] [CUDA graphs] Prototype API and documentation
 (#63269)

Summary:
RFC: https://github.com/pytorch/pytorch/issues/61880

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63269

Reviewed By: mruberry

Differential Revision: D30596643

Pulled By: ngimel

fbshipit-source-id: b1f8061406364b667e2c2d4d30fbce1f0d8456be
---
 aten/src/ATen/cuda/CUDAGraphsUtils.cuh |  13 +
 aten/src/ATen/native/cudnn/Conv_v7.cpp |   4 +
 c10/cuda/CUDACachingAllocator.cpp      |   2 +
 docs/source/cuda.rst                   |  11 +
 docs/source/notes/cuda.rst             | 453 ++++++++++++++++++++++++-
 test/test_cuda.py                      | 116 +++++--
 torch/_C/__init__.pyi.in               |   9 +-
 torch/csrc/cuda/Graph.cpp              |  25 +-
 torch/cuda/__init__.py                 |   3 +-
 torch/cuda/graphs.py                   | 408 ++++++++++++++++++++++
 torch/cuda/streams.py                  |   5 -
 11 files changed, 996 insertions(+), 53 deletions(-)
 create mode 100644 torch/cuda/graphs.py

diff --git a/aten/src/ATen/cuda/CUDAGraphsUtils.cuh b/aten/src/ATen/cuda/CUDAGraphsUtils.cuh
index c25ba88a6537c..9d42ed759939b 100644
--- a/aten/src/ATen/cuda/CUDAGraphsUtils.cuh
+++ b/aten/src/ATen/cuda/CUDAGraphsUtils.cuh
@@ -42,5 +42,18 @@ inline void assertNotCapturing(std::string attempt) {
               status);
 }
 
+inline void errorIfCapturingCudnnBenchmark(std::string version_specific) {
+  auto status = currentStreamCaptureStatus();
+  TORCH_CHECK(status == CaptureStatus::None,
+              "Current cudaStreamCaptureStatus: ",
+              status,
+              "\nCapturing ",
+              version_specific,
+              "is prohibited. Possible causes of this error:\n"
+              "1. No warmup iterations occurred before capture.\n"
+              "2. The convolutions you're trying to capture use dynamic shapes, "
+              "in which case capturing them is generally prohibited.");
+}
+
 } // namespace cuda
 } // namespace at
diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp
index 7d16f0a9a910f..27863d060d2dd 100644
--- a/aten/src/ATen/native/cudnn/Conv_v7.cpp
+++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp
@@ -11,6 +11,7 @@
 #include <ATen/ATen.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/Config.h>
+#include <ATen/cuda/CUDAGraphsUtils.cuh>
 #include <ATen/cuda/Exceptions.h>
 #include <ATen/native/cudnn/ConvShared.h>
 
@@ -292,6 +293,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
     } else {
       size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
       Workspace ws(max_ws_size);
+      at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind");
       AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionForwardAlgorithmEx(
           args.handle,
           args.idesc.desc(), args.input.data_ptr(),
@@ -362,6 +364,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
     } else {
       size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
       Workspace ws(max_ws_size);
+      at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind");
       AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardDataAlgorithmEx(
           args.handle,
           args.wdesc.desc(), args.weight.data_ptr(),
@@ -434,6 +437,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
     } else {
       size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
       Workspace ws(max_ws_size);
+      at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind");
       AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardFilterAlgorithmEx(
           args.handle,
           args.idesc.desc(), args.input.data_ptr(),
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 055375352ee08..659fea351d467 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -308,6 +308,8 @@ cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) {
   } else {
     // It's ok to capture cudaMallocs, as long as we never cudaFree those
     // addresses before replay.
+    // Capturing cudaMalloc behaves nicely: it gives the graph new VA,
+    // but is ignored (won't leakily allocate new memory) in replays.
     at::cuda::CUDAStreamCaptureModeGuard g{cudaStreamCaptureModeRelaxed};
     return cudaMalloc(p, size);
   }
diff --git a/docs/source/cuda.rst b/docs/source/cuda.rst
index d4783c867b82a..75029332aa481 100644
--- a/docs/source/cuda.rst
+++ b/docs/source/cuda.rst
@@ -71,6 +71,17 @@ Streams and events
     Stream
     Event
 
+Graphs (prototype)
+------------------
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    graph_pool_handle
+    CUDAGraph
+    graph
+    make_graphed_callables
+
 Memory management
 -----------------
 .. autosummary::
diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst
index 264017f0203cc..5d7c0ea48f669 100644
--- a/docs/source/notes/cuda.rst
+++ b/docs/source/notes/cuda.rst
@@ -262,7 +262,7 @@ have the same stream-semantics relationship as any group of ops::
 BC note: Using grads on the default stream
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-In prior versions of Pytorch (1.9 and earlier), the autograd engine always synced
+In prior versions of PyTorch (1.9 and earlier), the autograd engine always synced
 the default stream with all backward ops, so the following pattern::
 
     with torch.cuda.stream(s):
@@ -270,7 +270,7 @@ the default stream with all backward ops, so the following pattern::
     use grads
 
 was safe as long as ``use grads`` happened on the default stream.
-In present Pytorch, that pattern is no longer safe. If ``backward()``
+In present PyTorch, that pattern is no longer safe. If ``backward()``
 and ``use grads`` are in different stream contexts, you must sync the streams::
 
     with torch.cuda.stream(s):
@@ -513,3 +513,452 @@ by GIL of Python interpreter.
 
 If you use :class:`~torch.nn.parallel.DistributedDataParallel`, you could use
 `torch.distributed.launch` utility to launch your program, see :ref:`distributed-launch`.
+
+.. _cuda-graph-semantics:
+
+CUDA Graphs
+-----------
+
+A CUDA graph is a record of the work (mostly kernels and their arguments) that a
+CUDA stream and its dependent streams perform.
+For general principles and details on the underlying CUDA API, see
+`Getting Started with CUDA Graphs`_ and the
+`Graphs section`_ of the CUDA C Programming Guide.
+
+PyTorch supports the construction of CUDA graphs using `stream capture`_, which puts a
+CUDA stream in *capture mode*. CUDA work issued to a capturing stream doesn't actually
+run on the GPU. Instead, the work is recorded in a graph.
+
+After capture, the graph can be *launched* to run the GPU work as many times as needed.
+Each replay runs the same kernels with the same arguments. For pointer arguments this
+means the same memory addresses are used.
+By filling input memory with new data (e.g., from a new batch) before each replay,
+you can rerun the same work on new data.
+
+Why CUDA Graphs?
+^^^^^^^^^^^^^^^^
+
+Replaying a graph sacrifices the dynamic flexibility of typical eager execution in exchange for
+**greatly reduced CPU overhead**. A graph's arguments and kernels are fixed, so a graph replay
+skips all layers of argument setup and kernel dispatch, including Python, C++, and CUDA driver
+overheads. Under the hood, a replay submits the entire graph's work to the GPU with
+a single call to `cudaGraphLaunch`_.  Kernels in a replay also execute slightly faster
+on the GPU, but eliding CPU overhead is the main benefit.
+
+You should try CUDA graphs if all or part of your network is graph-safe (usually this means
+static shapes and static control flow, but see the other :ref:`constraints<capture-constraints>`)
+and you suspect its runtime is at least somewhat CPU-limited.
+
+.. _Getting Started with CUDA Graphs:
+    https://developer.nvidia.com/blog/cuda-graphs/
+.. _Graphs section:
+    https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-graphs
+.. _stream capture:
+    https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture
+.. _cudaGraphLaunch:
+    https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597
+
+PyTorch API
+^^^^^^^^^^^
+
+.. warning::
+    This API is a prototype and may change in future releases.
+
+PyTorch exposes graphs via a raw :class:`torch.cuda.CUDAGraph` class
+and two convenience wrappers,
+:class:`torch.cuda.graph` and
+:class:`torch.cuda.make_graphed_callables`.
+
+:class:`torch.cuda.graph` is a simple, versatile context manager that
+captures CUDA work in its context.
+Before capture, warm up the workload to be captured by running
+a few eager iterations. Warmup must occur on a side stream.
+Because the graph reads from and writes to the same memory addresses in every
+replay, you must maintain long-lived references to tensors that hold
+input and output data during capture.
+To run the graph on new input data, copy new data to the capture's input tensor(s),
+replay the graph, then read the new output from the capture's output tensor(s).
+Example::
+
+    g = torch.cuda.CUDAGraph()
+
+    # Placeholder input used for capture
+    static_input = torch.empty((5,), device="cuda")
+
+    # Warmup before capture
+    s = torch.cuda.Stream()
+    s.wait_stream(torch.cuda.current_stream())
+    with torch.cuda.stream(s):
+        for _ in range(3):
+            static_output = static_input * 2
+    torch.cuda.current_stream().wait_stream(s)
+
+    # Captures the graph
+    # To allow capture, automatically sets a side stream as the current stream in the context
+    with torch.cuda.graph(g):
+        static_output = static_input * 2
+
+    # Fills the graph's input memory with new data to compute on
+    static_input.copy_(torch.full((5,), 3, device="cuda"))
+    g.replay()
+    # static_output holds the results
+    print(static_output)  # full of 3 * 2 = 6
+
+    # Fills the graph's input memory with more data to compute on
+    static_input.copy_(torch.full((5,), 4, device="cuda"))
+    g.replay()
+    print(static_output)  # full of 4 * 2 = 8
+
+See
+:ref:`Whole-network capture<whole-network-capture>`,
+:ref:`Usage with torch.cuda.amp<graphs-with-amp>`, and
+:ref:`Usage with multiple streams<multistream-capture>`
+for realistic and advanced patterns.
+
+:class:`~torch.cuda.make_graphed_callables` is more sophisticated.
+:class:`~torch.cuda.make_graphed_callables` accepts Python functions and
+:class:`torch.nn.Module`\s. For each passed function or Module,
+it creates separate graphs of the forward-pass and backward-pass work. See
+:ref:`Partial-network capture<partial-network-capture>`.
+
+.. _capture-constraints:
+
+Constraints
+~~~~~~~~~~~
+
+A set of ops is *capturable* if it doesn't violate any of the following constraints.
+
+Constraints apply to all work in a
+:class:`torch.cuda.graph` context and all work in the forward and backward passes
+of any callable you pass to :func:`torch.cuda.make_graphed_callables`.
+
+Violating any of these will likely cause a runtime error:
+
+* Capture must occur on a non-default stream. (This is only a concern if you use the raw
+  :meth:`CUDAGraph.capture_begin<torch.cuda.CUDAGraph.capture_begin>` and
+  :meth:`CUDAGraph.capture_end<torch.cuda.CUDAGraph.capture_end>` calls.
+  :class:`~torch.cuda.graph` and
+  :func:`~torch.cuda.make_graphed_callables` set a side stream for you.)
+* Ops that sychronize the CPU with the GPU (e.g., ``.item()`` calls) are prohibited.
+* CUDA RNG ops are allowed, but must use default generators. For example, explicitly constructing a
+  new :class:`torch.Generator` instance and passing it as the ``generator`` argument to an RNG function
+  is prohibited.
+
+Violating any of these will likely cause silent numerical errors or undefined behavior:
+
+* Within a process, only one capture may be underway at a time.
+* No non-captured CUDA work may run in this process (on any thread) while capture is underway.
+* CPU work is not captured. If the captured ops include CPU work, that work will be elided during replay.
+* Every replay reads from and writes to the same (virtual) memory addresses.
+* Dynamic control flow (based on CPU or GPU data) is prohibited.
+* Dynamic shapes are prohibited. The graph assumes every tensor in the captured op sequence
+  has the same size and layout in every replay.
+* Using multiple streams in a capture is allowed, but there are :ref:`restrictions<multistream-capture>`.
+
+Non-constraints
+~~~~~~~~~~~~~~~
+
+* Once captured, the graph may be replayed on any stream.
+
+.. _whole-network-capture:
+
+Whole-network capture
+^^^^^^^^^^^^^^^^^^^^^^
+
+If your entire network is capturable, you can capture and replay an entire iteration::
+
+    N, D_in, H, D_out = 640, 4096, 2048, 1024
+    model = torch.nn.Sequential(torch.nn.Linear(D_in, H),
+                                torch.nn.Dropout(p=0.2),
+                                torch.nn.Linear(H, D_out),
+                                torch.nn.Dropout(p=0.1)).cuda()
+    loss_fn = torch.nn.MSELoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+
+    # Placeholders used for capture
+    static_input = torch.randn(N, D_in, device='cuda')
+    static_target = torch.randn(N, D_out, device='cuda')
+
+    # warmup
+    # Uses static_input and static_target here for convenience,
+    # but in a real setting, because the warmup includes optimizer.step()
+    # you must use a few batches of real data.
+    s = torch.cuda.Stream()
+    s.wait_stream(torch.cuda.current_stream())
+    with torch.cuda.stream(s):
+        for i in range(3):
+            optimizer.zero_grad(set_to_none=True)
+            y_pred = model(static_input)
+            loss = loss_fn(y_pred, static_target)
+            loss.backward()
+            optimizer.step()
+    torch.cuda.current_stream().wait_stream(s)
+
+    # capture
+    g = torch.cuda.CUDAGraph()
+    # Sets grads to None before capture, so backward() will create
+    # .grad attributes with allocations from the graph's private pool
+    optimizer.zero_grad(set_to_none=True)
+    with torch.cuda.graph(g):
+        static_y_pred = model(static_input)
+        static_loss = loss_fn(static_y_pred, static_target)
+        static_loss.backward()
+        optimizer.step()
+
+    real_inputs = [torch.rand_like(static_input) for _ in range(10)]
+    real_targets = [torch.rand_like(static_target) for _ in range(10)]
+
+    for data, target in zip(real_inputs, real_targets):
+        # Fills the graph's input memory with new data to compute on
+        static_input.copy_(data)
+        static_target.copy_(target)
+        # replay() includes forward, backward, and step.
+        # You don't even need to call optimizer.zero_grad() between iterations
+        # because the captured backward refills static .grad tensors in place.
+        g.replay()
+        # Params have been updated. static_y_pred, static_loss, and .grad
+        # attributes hold values from computing on this iteration's data.
+
+.. _partial-network-capture:
+
+Partial-network capture
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If some of your network is unsafe to capture (e.g., due to dynamic control flow,
+dynamic shapes, CPU syncs, or essential CPU-side logic), you can run the unsafe
+part(s) eagerly and use :func:`torch.cuda.make_graphed_callables` to graph only
+the capture-safe part(s).
+
+By default, callables returned by :func:`~torch.cuda.make_graphed_callables`
+are autograd-aware, and can be used in the training loop as direct replacements
+for the functions or :class:`nn.Module<torch.nn.Module>`\ s you passed.
+
+:func:`~torch.cuda.make_graphed_callables` internally creates
+:class:`~torch.cuda.CUDAGraph` objects, runs warmup iterations, and maintains
+static inputs and outputs as needed.  Therefore (unlike with
+:class:`torch.cuda.graph`) you don't need to handle those manually.
+
+In the following example, data-dependent dynamic control flow means the
+network isn't capturable end-to-end, but
+:func:`~torch.cuda.make_graphed_callables`
+lets us capture and run graph-safe sections as graphs regardless::
+
+    N, D_in, H, D_out = 640, 4096, 2048, 1024
+
+    module1 = torch.nn.Linear(D_in, H).cuda()
+    module2 = torch.nn.Linear(H, D_out).cuda()
+    module3 = torch.nn.Linear(H, D_out).cuda()
+
+    loss_fn = torch.nn.MSELoss()
+    optimizer = torch.optim.SGD(chain(module1.parameters() +
+                                      module2.parameters() +
+                                      module3.parameters()),
+                                lr=0.1)
+
+    # Sample inputs used for capture
+    # requires_grad state of sample inputs must match
+    # requires_grad state of real inputs each callable will see.
+    x = torch.randn(N, D_in, device='cuda')
+    h = torch.randn(N, H, device='cuda', requires_grad=True)
+
+    module1 = torch.cuda.make_graphed_callables(module1, (x,))
+    module2 = torch.cuda.make_graphed_callables(module2, (h,))
+    module3 = torch.cuda.make_graphed_callables(module3, (h,))
+
+    real_inputs = [torch.rand_like(x) for _ in range(10)]
+    real_targets = [torch.randn(N, D_out, device="cuda") for _ in range(10)]
+
+    for data, target in zip(real_inputs, real_targets):
+        optimizer.zero_grad(set_to_none=True)
+
+        tmp = module1(data)  # forward ops run as a graph
+
+        if tmp.sum().item() > 0:
+            tmp = module2(tmp)  # forward ops run as a graph
+        else:
+            tmp = module3(tmp)  # forward ops run as a graph
+
+        loss = loss_fn(tmp, y)
+        # module2's or module3's (whichever was chosen) backward ops,
+        # as well as module1's backward ops, run as graphs
+        loss.backward()
+        optimizer.step()
+
+.. _graphs-with-amp:
+
+Usage with torch.cuda.amp
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For typical optimizers, :meth:`GradScaler.step<torch.cuda.amp.GradScaler.step>` syncs
+the CPU with the GPU, which is prohibited during capture. To avoid errors, either use
+:ref:`partial-network capture<partial-network-capture>`, or (if forward, loss,
+and backward are capture-safe) capture forward, loss, and backward but not the
+optimizer step::
+
+    # warmup
+    # In a real setting, use a few batches of real data.
+    s = torch.cuda.Stream()
+    s.wait_stream(torch.cuda.current_stream())
+    with torch.cuda.stream(s):
+        for i in range(3):
+            optimizer.zero_grad(set_to_none=True)
+            with torch.cuda.amp.autocast():
+                y_pred = model(static_input)
+                loss = loss_fn(y_pred, static_target)
+            scaler.scale(loss).backward()
+            scaler.step(optimizer)
+            scaler.update()
+    torch.cuda.current_stream().wait_stream(s)
+
+    # capture
+    g = torch.cuda.CUDAGraph()
+    optimizer.zero_grad(set_to_none=True)
+    with torch.cuda.graph(g):
+        with torch.cuda.amp.autocast():
+            static_y_pred = model(static_input)
+            static_loss = loss_fn(static_y_pred, static_target)
+        scaler.scale(static_loss).backward()
+        # don't capture scaler.step(optimizer) or scaler.update()
+
+    real_inputs = [torch.rand_like(static_input) for _ in range(10)]
+    real_targets = [torch.rand_like(static_target) for _ in range(10)]
+
+    for data, target in zip(real_inputs, real_targets):
+        static_input.copy_(data)
+        static_target.copy_(target)
+        g.replay()
+        # Runs scaler.step and scaler.update eagerly
+        scaler.step(optimizer)
+        scaler.update()
+
+.. _multistream-capture:
+
+Usage with multiple streams
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Capture mode automatically propagates to any streams that sync with a capturing stream.
+Within capture, you may expose parallelism by issuing calls to different streams,
+but the overall stream dependency DAG must branch out from the
+initial capturing stream after capture begins and rejoin the initial stream
+before capture ends::
+
+    with torch.cuda.graph(g):
+        # at context manager entrance, torch.cuda.current_stream()
+        # is the initial capturing stream
+
+        # INCORRECT (does not branch out from or rejoin initial stream)
+        with torch.cuda.stream(s):
+            cuda_work()
+
+        # CORRECT:
+        # branches out from initial stream
+        s.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(s):
+            cuda_work()
+        # rejoins initial stream before capture ends
+        torch.cuda.current_stream().wait_stream(s)
+
+.. note::
+
+    To avoid confusion for power users looking at replays in nsight systems or nvprof:
+    Unlike eager execution, the graph interprets a nontrivial stream DAG in capture
+    as a hint, not a command. During replay, the graph may reorganize independent ops
+    onto different streams or enqueue them in a different order (while respecting your
+    original DAG's overall dependencies).
+
+Usage with DistributedDataParallel
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+NCCL < 2.9.6
+~~~~~~~~~~~~
+
+NCCL versions earlier than 2.9.6 don't allow collectives to be captured.
+You must use :ref:`partial-network capture<partial-network-capture>`,
+which defers allreduces to happen outside graphed sections of backward.
+
+Call :func:`~torch.cuda.make_graphed_callables` on graphable network sections
+*before* wrapping the network with DDP.
+
+NCCL >= 2.9.6
+~~~~~~~~~~~~~
+
+NCCL versions 2.9.6 or later allow collectives in the graph.
+Approaches that capture an :ref:`entire backward pass<whole-network-capture>`
+are a viable option, but need three setup steps.
+
+1. Disable DDP's internal async error handling::
+
+    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
+    torch.distributed.init_process_group(...)
+
+2. Before full-backward capture, DDP must be constructed in a side-stream context::
+
+    with torch.cuda.stream(s):
+        model = DistributedDataParallel(model)
+
+3. Your warmup must run at least 11 DDP-enabled eager iterations before capture.
+
+.. _graph-memory-management:
+
+Graph memory management
+^^^^^^^^^^^^^^^^^^^^^^^
+
+A captured graph acts on the same virtual addresses every time it replays.
+If PyTorch frees the memory, a later replay can hit an illegal memory access.
+If PyTorch reassigns the memory to new tensors, the replay can corrupt the values
+seen by those tensors.  Therefore, the virtual addresses used by the graph must be
+reserved for the graph across replays. The PyTorch caching allocator achieves this
+by detecting when capture is underway and satisfying the capture's allocations
+from a graph-private memory pool. The private pool stays alive until its
+:class:`~torch.cuda.CUDAGraph` object and all tensors created during capture
+go out of scope.
+
+Private pools are maintained automatically. By default, the allocator creates a
+separate private pool for each capture. If you capture multiple graphs,
+this conservative approach ensures graph replays never corrupt each other's values,
+but sometimes needlessly wastes memory.
+
+To economize the memory stashed in private pools, :class:`torch.cuda.graph`
+and :func:`torch.cuda.make_graphed_callables` optionally allow different
+captures to share the same private pool.
+It's safe for a set of graphs to share a private pool if you know they'll always
+be replayed in the same order they were captured,
+and never be replayed concurrently.
+
+Sharing memory across captures with torch.cuda.graph
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:class:`torch.cuda.graph`'s ``pool`` argument is a hint to use a particular private pool,
+and can be used to share memory across graphs as shown::
+
+    g1 = torch.cuda.CUDAGraph()
+    g2 = torch.cuda.CUDAGraph()
+
+    # (create static inputs for g1 and g2, run warmups of their workloads...)
+
+    # Captures g1
+    with torch.cuda.graph(g1):
+        static_out_1 = g1_workload(static_in_1)
+
+    # Captures g2, hinting that g2 may share a memory pool with g1
+    with torch.cuda.graph(g2, pool=g1.pool()):
+        static_out_2 = g2_workload(static_in_2)
+
+    static_in_1.copy_(real_data_1)
+    static_in_2.copy_(real_data_2)
+    g1.replay()
+    g2.replay()
+
+Sharing memory across captures with torch.cuda.make_graphed_callables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+With :func:`torch.cuda.make_graphed_callables`, if you want to graph several
+callables and you know they'll always run in the same order (and never concurrently)
+pass them as a tuple in the same order they'll run in the live workload, and
+:func:`~torch.cuda.make_graphed_callables` will capture their graphs using a shared
+private pool.
+
+If, in the live workload, your callables will run in an order that occasionally changes,
+or if they'll run concurrently, passing them as a tuple to a single invocation of
+:func:`~torch.cuda.make_graphed_callables` is not allowed. Instead, you must call
+:func:`~torch.cuda.make_graphed_callables` separately for each one.
diff --git a/test/test_cuda.py b/test/test_cuda.py
index e90cb1703c06e..70f5a6ee4f586 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -3089,7 +3089,7 @@ def test_graph_capture_simple(self):
 
         with torch.cuda.stream(s):
             a = torch.full((1000,), 1, device="cuda")
-            g = torch.cuda._Graph()
+            g = torch.cuda.CUDAGraph()
             torch.cuda.empty_cache()
             g.capture_begin()
             b = a
@@ -3125,7 +3125,7 @@ def run(op, kwargs):
             with torch.cuda.stream(stream):
                 torch.cuda.manual_seed(5)
 
-                g = torch.cuda._Graph()
+                g = torch.cuda.CUDAGraph()
                 torch.cuda.empty_cache()
                 g.capture_begin()
                 graph_out = graph_in
@@ -3212,7 +3212,7 @@ def run(module, op, args, kwargs):
             with torch.cuda.stream(stream):
                 torch.cuda.manual_seed(5)
 
-                g = torch.cuda._Graph()
+                g = torch.cuda.CUDAGraph()
                 torch.cuda.empty_cache()
                 if (module == "torch"):
                     g.capture_begin()
@@ -3279,14 +3279,14 @@ def func_with_temps(t, val):
         s = torch.cuda.Stream()
 
         for share_mem in ("Don't share", "via pool()", "via graph_pool_handle()"):
-            g0 = torch.cuda._Graph()
-            g1 = torch.cuda._Graph()
+            g0 = torch.cuda.CUDAGraph()
+            g1 = torch.cuda.CUDAGraph()
 
             a = torch.ones((size,), device="cuda")
 
             s.wait_stream(torch.cuda.current_stream())
             with torch.cuda.stream(s):
-                g0_args = (torch.cuda._graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else ()
+                g0_args = (torch.cuda.graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else ()
                 g0.capture_begin(*g0_args)
                 b = a.clone()
                 for _ in range(5):
@@ -3343,8 +3343,8 @@ def func_with_temps(t, val):
         s = torch.cuda.Stream()
 
         for share_mem in ("Don't share", "via pool()", "via graph_pool_handle()"):
-            g0 = torch.cuda._Graph()
-            g1 = torch.cuda._Graph()
+            g0 = torch.cuda.CUDAGraph()
+            g1 = torch.cuda.CUDAGraph()
 
             s0 = torch.cuda.Stream()
             s1 = torch.cuda.Stream()
@@ -3353,7 +3353,7 @@ def func_with_temps(t, val):
 
             s.wait_stream(torch.cuda.current_stream())
             with torch.cuda.stream(s):
-                g0_args = (torch.cuda._graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else ()
+                g0_args = (torch.cuda.graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else ()
                 g0.capture_begin(*g0_args)
                 b = a.clone()
                 for _ in range(5):
@@ -3407,13 +3407,13 @@ def test_graph_three_successive(self):
         for share_mem in ("Don't share", "via pool()", "via graph_pool_handle()"):
             a = torch.ones((size,), device="cuda")
 
-            g0 = torch.cuda._Graph()
-            g1 = torch.cuda._Graph()
-            g2 = torch.cuda._Graph()
+            g0 = torch.cuda.CUDAGraph()
+            g1 = torch.cuda.CUDAGraph()
+            g2 = torch.cuda.CUDAGraph()
 
             s.wait_stream(torch.cuda.current_stream())
             with torch.cuda.stream(s):
-                g0_args = (torch.cuda._graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else ()
+                g0_args = (torch.cuda.graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else ()
                 g0.capture_begin(*g0_args)
                 b = a.clone()
                 c = b + 1
@@ -3499,7 +3499,7 @@ def test_graph_memory_stats_and_use_result_after_destroy_graph(self):
                 delta_active_blocks = 1  # We only check the large pool, which isn't affected by rng offset holder
                 delta_active_bytes = numel * elem
 
-            g = torch.cuda._Graph()
+            g = torch.cuda.CUDAGraph()
             s.wait_stream(torch.cuda.current_stream())
             with torch.cuda.stream(s):
                 # Allocation stat estimates assume input is created on the same stream as capture_begin()
@@ -3573,7 +3573,7 @@ def test_graph_record_stream(self):
         s0 = torch.cuda.Stream()
         s1 = torch.cuda.Stream()
         s2 = torch.cuda.Stream()
-        g = torch.cuda._Graph()
+        g = torch.cuda.CUDAGraph()
 
         torch.cuda.synchronize()
         with torch.cuda.stream(s0):
@@ -3620,7 +3620,7 @@ def test_graph_cudnn_dropout(self):
 
         y = model(x)
 
-        g = torch.cuda._Graph()
+        g = torch.cuda.CUDAGraph()
         s = torch.cuda.Stream()
         s.wait_stream(torch.cuda.current_stream())
         with torch.cuda.stream(s):
@@ -3638,7 +3638,7 @@ def test_graph_grad_scaling(self):
         torch.cuda.empty_cache()
 
         scaler = torch.cuda.amp.GradScaler(init_scale=4.)
-        g = torch.cuda._Graph()
+        g = torch.cuda.CUDAGraph()
         s = torch.cuda.Stream()
 
         weight = torch.ones((100,), device="cuda", requires_grad=True)
@@ -3646,18 +3646,15 @@ def test_graph_grad_scaling(self):
         static_input = torch.ones_like(weight)
         static_grad = torch.ones_like(weight)
 
-        s.wait_stream(torch.cuda.current_stream())
-        with torch.cuda.stream(s):
-            # warmup
-            loss = (weight.half() * static_input).sum()
-            scaler.scale(loss).backward()
-            opt.zero_grad(set_to_none=True)
-            # capture
-            g.capture_begin()
+        # warmup
+        loss = (weight.half() * static_input).sum()
+        scaler.scale(loss).backward()
+        opt.zero_grad(set_to_none=True)
+
+        # capture
+        with torch.cuda.graph(g):
             loss = (weight.half() * static_input).sum()
             scaler.scale(loss).backward()
-            g.capture_end()
-        torch.cuda.current_stream().wait_stream(s)
 
         input_vals = [5, 20000, 5, 40000]
         # If the scale gets updated properly, these are the scale, growth tracker,
@@ -3678,6 +3675,71 @@ def test_graph_grad_scaling(self):
             self.assertEqual(scaler._scale, scale)
             self.assertEqual(scaler._growth_tracker, growth_tracker)
 
+    @unittest.skipIf((not TEST_CUDA) or
+                     TEST_WITH_ROCM or
+                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    def test_graph_make_graphed_callables(self):
+        torch.manual_seed(5)
+        torch.cuda.manual_seed(5)
+
+        N, D_in, H, D_out = 640, 4096, 2048, 1024
+
+        models = []
+        for _ in range(2):
+            model_section1 = torch.nn.Sequential(torch.nn.Linear(D_in, H),
+                                                 torch.nn.Dropout(p=0.1)).cuda()
+            model_section2 = torch.nn.Sequential(torch.nn.Linear(H, D_out),
+                                                 torch.nn.Dropout(p=0.2)).cuda()
+            models.append(torch.nn.Sequential(model_section1, model_section2))
+
+        model_graphed = models[0]
+        model_control = models[1]
+
+        model_graphed.load_state_dict(model_control.state_dict())
+
+        opt_graphed = torch.optim.SGD(model_graphed.parameters(), lr=0.1)
+        opt_control = torch.optim.SGD(model_control.parameters(), lr=0.1)
+
+        x = torch.randn(N, D_in, device='cuda')
+        h = torch.randn(N, H, device='cuda', requires_grad=True)
+        y_pred = torch.randn(N, D_out, device='cuda', requires_grad=True)
+        y = torch.randn(N, D_out, device='cuda')
+
+        loss_fn_control = torch.nn.functional.mse_loss
+        relu_control = torch.nn.functional.relu
+
+        # This is a good stress test. It graphs four callables: two Modules and two python functions.
+        model_graphed[0], model_graphed[1], relu_graphed, loss_fn_graphed = \
+            torch.cuda.make_graphed_callables((model_graphed[0], model_graphed[1], relu_control, loss_fn_control),
+                                              ((x,), (h,), (y_pred,), (y_pred, y)))
+
+        real_inputs = [torch.rand_like(x) for _ in range(10)]
+        real_targets = [torch.rand_like(y) for _ in range(10)]
+
+        for m, opt, relu, loss_fn in zip((model_graphed, model_control),
+                                         (opt_graphed, opt_control),
+                                         (relu_graphed, relu_control),
+                                         (loss_fn_graphed, loss_fn_control)):
+            # Resets RNC states before iterations for graphed and ungraphed models,
+            # so dropout math should be bitwise identical for both.
+            torch.manual_seed(5)
+            torch.cuda.manual_seed(5)
+            for data, target in zip(real_inputs, real_targets):
+                opt.zero_grad(set_to_none=True)
+                y_pred = m(data)
+                y_pred = relu(y_pred)
+                loss = loss_fn(y_pred, target)
+                loss.backward()
+                opt.step()
+
+        for p, pc in zip(model_graphed.parameters(), model_control.parameters()):
+            self.assertEqual(p, pc)
+
+        # We graphed the models in training mode. Eval should still run ungraphed.
+        model_graphed.eval()
+        model_control.eval()
+        self.assertEqual(model_graphed(real_inputs[0]), model_control(real_inputs[0]))
+
     def test_batch_norm_gather_stats(self):
         input = torch.randn(1, 3, 3, 3, device='cuda')
         mean, invstd = torch.batch_norm_gather_stats(
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index c847e8deced62..352edbee6bc5e 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -888,8 +888,13 @@ class _CudaEventBase:
     def ipc_handle(self) -> bytes: ...
 
 # Defined in torch/csrc/cuda/Graph.cpp
-class _CudaGraphBase:
-    ...
+class _CUDAGraph:
+    def capture_begin(self,
+                      pool: Optional[Tuple[_int, _int]]=...) -> None: ...
+    def capture_end(self) -> None: ...
+    def replay(self) -> None: ...
+    def reset(self) -> None: ...
+    def pool(self) -> Tuple[_int, _int]: ...
 
 def _graph_pool_handle() -> Tuple[_int, _int]: ...
 
diff --git a/torch/csrc/cuda/Graph.cpp b/torch/csrc/cuda/Graph.cpp
index 123abb9666ee5..beacefa3f8878 100644
--- a/torch/csrc/cuda/Graph.cpp
+++ b/torch/csrc/cuda/Graph.cpp
@@ -23,36 +23,29 @@ void THCPGraph_init(PyObject *module) {
   auto torch_C_m = py::handle(module).cast<py::module>();
 
   torch_C_m
-      .def("_graph_pool_handle", &::at::cuda::graph_pool_handle);
+      .def("_graph_pool_handle",
+           &::at::cuda::graph_pool_handle);
 
-  shared_ptr_class_<::at::cuda::CUDAGraph>(torch_C_m, "_CudaGraphBase")
+  shared_ptr_class_<::at::cuda::CUDAGraph>
+      (torch_C_m,
+       "_CUDAGraph")
       .def(py::init<>())
       // I'm not sure this is the correct order of all the arguments. Pybind11 docs
       // aren't clear. But it works.
       .def("capture_begin",
            &::at::cuda::CUDAGraph::capture_begin,
            py::call_guard<py::gil_scoped_release>(),
-           R"(``capture_begin`` begins Cuda graph capture on the current stream.)",
            py::arg("pool") = c10::cuda::MempoolId_t{0, 0})
       .def("capture_end",
            &::at::cuda::CUDAGraph::capture_end,
-           py::call_guard<py::gil_scoped_release>(),
-           R"(``capture_end`` ends Cuda graph capture on the current stream.
-           After ``capture_end``, ``replay`` may be called on this instance.)")
+           py::call_guard<py::gil_scoped_release>())
       .def("replay",
            &::at::cuda::CUDAGraph::replay,
-           py::call_guard<py::gil_scoped_release>(),
-           R"(``replay`` replays the Cuda graph captured by this instance.)")
-      // reset is called in __del__ on the Python side
-      // (see class Graph in torch/cuda/streams.py for reasons and caveats)
+           py::call_guard<py::gil_scoped_release>())
       .def("reset",
            &::at::cuda::CUDAGraph::reset,
-           py::call_guard<py::gil_scoped_release>(),
-           R"(``reset`` deletes the graph currently held by this instance.)")
+           py::call_guard<py::gil_scoped_release>())
       .def("pool",
            &::at::cuda::CUDAGraph::pool,
-           py::call_guard<py::gil_scoped_release>(),
-           R"(``pool`` retrieves the id of this graph's memory pool.
-           This id can optionally be passed to another graph's capture_begin,
-           which hints that other graph may share the same memory pool.)");
+           py::call_guard<py::gil_scoped_release>());
 }
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index d5a9cbb52f34f..924782de8d024 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -16,7 +16,8 @@
 import threading
 from typing import List, Optional, Tuple, Union, Any
 from ._utils import _get_device_index, _dummy_type
-from .streams import Stream, Event, _Graph, _graph_pool_handle
+from .graphs import CUDAGraph, graph_pool_handle, graph, make_graphed_callables
+from .streams import Stream, Event
 from .. import device as _device
 import torch._C
 
diff --git a/torch/cuda/graphs.py b/torch/cuda/graphs.py
new file mode 100644
index 0000000000000..ff8a07f989f9d
--- /dev/null
+++ b/torch/cuda/graphs.py
@@ -0,0 +1,408 @@
+import gc
+import torch
+
+from ._utils import _dummy_type
+
+
+if not hasattr(torch._C, '_CudaStreamBase'):
+    # Define dummy base classes
+    torch._C.__dict__['_CUDAGraph'] = _dummy_type('_CUDAGraph')
+    torch._C.__dict__['_graph_pool_handle'] = _dummy_type('_graph_pool_handle')
+
+from torch._C import _CUDAGraph  # noqa: F401
+from torch._C import _graph_pool_handle
+
+
+# Python shim helps Sphinx process docstrings more reliably.
+def graph_pool_handle():
+    r"""
+    Returns an opaque token representing the id of a graph memory pool.
+    See :ref:`Graph memory management<graph-memory-management>`.
+
+    .. warning::
+        This API is a prototype and may change in future releases.
+    """
+    return _graph_pool_handle()
+
+
+# Python shim helps Sphinx process docstrings more reliably.
+class CUDAGraph(torch._C._CUDAGraph):
+    r"""
+    Wrapper around a CUDA graph.
+
+    .. warning::
+        This API is a prototype and may change in future releases.
+    """
+    def __new__(cls):
+        return super(CUDAGraph, cls).__new__(cls)
+
+    def __init__(self):
+        super(CUDAGraph, self).__init__()
+
+    def capture_begin(self, pool=None):
+        r"""
+        Begins capturing CUDA work on the current stream.
+
+        Typically, you shouldn't call ``capture_begin`` yourself.
+        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
+        which call ``capture_begin`` internally.
+
+        Arguments:
+            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
+                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
+                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
+        """
+        # I'm not sure if pybind11 converts a None arg to the default defined on the C++ side,
+        # so I'm not taking any chances.
+        if pool is None:
+            super(CUDAGraph, self).capture_begin()
+        else:
+            super(CUDAGraph, self).capture_begin(pool)
+
+    def capture_end(self):
+        r"""
+        Ends CUDA graph capture on the current stream.
+        After ``capture_end``, ``replay`` may be called on this instance.
+
+        Typically, you shouldn't call ``capture_end`` yourself.
+        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
+        which call ``capture_end`` internally.
+        """
+        super(CUDAGraph, self).capture_end()
+
+    def replay(self):
+        r"""
+        Replays the CUDA work captured by this graph.
+        """
+        super(CUDAGraph, self).replay()
+
+    def reset(self):
+        r"""
+        Deletes the graph currently held by this instance.
+        """
+        super(CUDAGraph, self).reset()
+
+    def pool(self):
+        r"""
+        Returns an opaque token representing the id of this graph's memory pool.
+        This id can optionally be passed to another graph's ``capture_begin``,
+        which hints the other graph may share the same memory pool.
+        """
+        return super(CUDAGraph, self).pool()
+
+
+class graph(object):
+    r"""
+    Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph`
+    object for later replay.
+
+    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
+    detailed use, and constraints.
+
+    Arguments:
+        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
+        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
+            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
+            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
+        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
+            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
+
+    .. note::
+        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
+        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.
+
+    .. warning::
+        This API is a prototype and may change in future releases.
+    """
+    default_capture_stream = None
+
+    def __init__(self,
+                 cuda_graph,
+                 pool=None,
+                 stream=None):
+        # Lazy-init of default_capture_stream helps avoid circular-import errors.
+        # Not thread safe, but graphs already have the general (explicitly documented)
+        # restriction that only one capture may be underway at a time in the process.
+        if self.__class__.default_capture_stream is None:
+            self.__class__.default_capture_stream = torch.cuda.Stream()
+
+        self.pool = () if pool is None else (pool,)
+        self.capture_stream = stream if stream is not None else self.__class__.default_capture_stream
+        assert self.capture_stream is not None
+        self.stream_ctx = torch.cuda.stream(self.capture_stream)
+        self.cuda_graph = cuda_graph
+
+    def __enter__(self):
+        # Free as much memory as we can for the graph
+        torch.cuda.synchronize()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        # Stackoverflow seems comfortable with this pattern
+        # https://stackoverflow.com/questions/26635684/calling-enter-and-exit-manually#39172487
+        self.stream_ctx.__enter__()
+
+        self.cuda_graph.capture_begin(*self.pool)
+
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.cuda_graph.capture_end()
+        self.stream_ctx.__exit__(exc_type, exc_value, traceback)
+        # returning None should propagate exceptions from either capture_end or stream_ctx.__exit__()
+
+
+def make_graphed_callables(callables, sample_args):
+    r"""
+    Accepts callables (functions or :class:`nn.Module<torch.nn.Module>`\ s)
+    and returns graphed versions.
+
+    Each graphed callable's forward pass runs its source callable's
+    forward CUDA work as a CUDA graph inside a single autograd node.
+
+    The graphed callable's forward pass also appends
+    a backward node to the autograd graph. During backward, this node runs the
+    callable's backward work as a CUDA graph.
+
+    Therefore, each graphed callable should be a drop-in replacement for its source callable
+    in an autograd-enabled training loop.
+
+    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.
+
+    If you pass a tuple of several callables, their captures will use the same memory pool.
+    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.
+
+    Arguments:
+        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
+            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
+            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
+            they'll run in the live workload.
+        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
+            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
+            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
+
+    .. note::
+        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
+        that's expected for the corresponding real input in the training loop.
+
+    .. warning::
+        This API is a prototype and may change in future releases.
+
+    .. warning::
+        ``sample_args`` for each callable must be a tuple of Tensors. Other types and keyword args
+        are not allowed.
+
+    .. warning::
+        Returned callables do not support higher order differentiation (e.g., double backward).
+
+    .. warning::
+        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
+        may be trainable. Buffers must have ``requires_grad=False``.
+
+    .. warning::
+        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
+        you may not add or remove any of that Module's parameters or buffers.
+
+    .. warning::
+        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
+        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
+        through :func:`~torch.cuda.make_graphed_callables` is allowed.
+
+    .. warning::
+        When running a graphed callable, you must pass its arguments in the same order and format
+        they appeared in that callable's ``sample_args``.
+
+    .. warning::
+        All Tensor outputs of graphed callables must require grad.
+    """
+    just_one_callable = False
+
+    if not isinstance(callables, tuple):
+        just_one_callable = True
+        callables = (callables,)
+        sample_args = (sample_args,)
+
+    for c, args in zip(callables, sample_args):
+        if isinstance(c, torch.nn.Module):
+            assert len(c._backward_hooks) == 0 and len(c._forward_hooks) == 0 and len(c._forward_pre_hooks) == 0, \
+                "Modules must not have hooks registered at the time they are passed. However, registering hooks " + \
+                "on modules after passing them through make_graphed_callables is allowed."
+            assert all(b.requires_grad is False for b in c.buffers()), "In any :class:`~torch.nn.Module` passed to " + \
+                ":func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have " + \
+                "``requires_grad=False``."
+        assert all(isinstance(arg, torch.Tensor) for arg in args), "In the prototype API, sample_args " + \
+            "for each callable must be a tuple of Tensors. Other types and keyword args are not allowed."
+
+
+    # If a callable is an nn.Module, its graph's full input surface is the args the user explicitly
+    # passes to forward (ie, its sample_args) AND the module's parameter attributes.
+    per_callable_len_user_args = [len(args) for args in sample_args]
+    per_callable_module_params = [tuple(c.parameters()) if isinstance(c, torch.nn.Module) else ()
+                                  for c in callables]
+    per_callable_static_input_surfaces = [sample_args[i] + per_callable_module_params[i]
+                                          for i in range(len(callables))]
+
+    fwd_graphs = [torch.cuda.CUDAGraph() for _ in range(len(callables))]
+    bwd_graphs = [torch.cuda.CUDAGraph() for _ in range(len(callables))]
+
+    mempool = graph_pool_handle()
+
+    # Warmup
+    # Hopefully prevents cudnn benchmarking and other lazy-initialization cuda work
+    # from ending up in any captures.
+    torch.cuda.synchronize()
+    with torch.cuda.stream(torch.cuda.Stream()):
+        for func, args, static_input_surface in zip(callables,
+                                                    sample_args,
+                                                    per_callable_static_input_surfaces):
+            for _ in range(3):
+                outputs = func(*args)
+                outputs = (outputs,) if isinstance(outputs, torch.Tensor) else outputs
+                grad_inputs = torch.autograd.grad(outputs=outputs,
+                                                  inputs=tuple(i for i in static_input_surface if i.requires_grad),
+                                                  grad_outputs=tuple(torch.empty_like(o) for o in outputs),
+                                                  only_inputs=True,
+                                                  allow_unused=False)
+            del outputs, grad_inputs
+    torch.cuda.synchronize()
+
+    # All captures here share a mempool. To avoid replays corrupting each other's memory,
+    # the safest approach is to capture all passes in the same order they'll run:
+    # fwd 1, fwd 2, ... fwd N, then bwd N, bwd N-1, ... bwd 1.
+
+    # Capture forward graphs
+    per_callable_static_outputs = []
+    per_callable_output_was_tensor = []
+    for func, args, fwd_graph in zip(callables,
+                                     sample_args,
+                                     fwd_graphs):
+        with torch.cuda.graph(fwd_graph, pool=mempool):
+            outputs = func(*args)
+
+        # Assumes model output is a tensor or tuple of tensors
+        if isinstance(outputs, torch.Tensor):
+            per_callable_output_was_tensor.append(True)
+            outputs = (outputs,)
+        else:
+            per_callable_output_was_tensor.append(False)
+
+        per_callable_static_outputs.append(outputs)
+
+    # Capture backward graphs in reverse order
+    per_callable_static_grad_outputs = []
+    per_callable_static_grad_inputs = []
+    for static_input_surface, static_outputs, bwd_graph, module_params in \
+            zip(reversed(per_callable_static_input_surfaces),
+                reversed(per_callable_static_outputs),
+                reversed(bwd_graphs),
+                reversed(per_callable_module_params)):
+
+        # For now, assumes all static_outputs require grad
+        assert all(o.requires_grad for o in static_outputs), "Outputs of graphed callables must require grad."
+        static_grad_outputs = tuple(torch.empty_like(o) for o in static_outputs)
+
+        with torch.cuda.graph(bwd_graph, pool=mempool):
+            grad_inputs = torch.autograd.grad(outputs=static_outputs,
+                                              inputs=tuple(i for i in static_input_surface if i.requires_grad),
+                                              grad_outputs=static_grad_outputs,
+                                              only_inputs=True,
+                                              allow_unused=False)
+
+        # Constructs a tuple suitable for returning from Graphed.backward:
+        # Pads out the actually-needed grads with Nones in gradient slots for inputs that don't require grad.
+        # I couldn't think of a slick one-liner for this pattern.
+        static_grad_inputs = []
+        grad_idx = 0
+        for arg in static_input_surface:
+            if arg.requires_grad:
+                static_grad_inputs.append(grad_inputs[grad_idx])
+                grad_idx += 1
+            else:
+                static_grad_inputs.append(None)  # type: ignore[arg-type]
+        static_grad_inputs = tuple(static_grad_inputs)  # type: ignore[assignment]
+
+        per_callable_static_grad_outputs.append(static_grad_outputs)
+        per_callable_static_grad_inputs.append(static_grad_inputs)
+
+    # Reverses the most recent two lists
+    per_callable_static_grad_outputs = list(reversed(per_callable_static_grad_outputs))
+    per_callable_static_grad_inputs = list(reversed(per_callable_static_grad_inputs))
+    # Now for every per_callable list, per_callable_*[i] holds the stuff for the ith callable.
+
+    def make_graphed_autograd_function(fwd_graph,
+                                       bwd_graph,
+                                       module_params,
+                                       len_user_args,
+                                       output_was_tensor,
+                                       static_input_surface,
+                                       static_outputs,
+                                       static_grad_outputs,
+                                       static_grad_inputs):
+        class Graphed(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, *inputs):
+                # At this stage, only the user args may (potentially) be new tensors.
+                for i in range(len_user_args):
+                    if static_input_surface[i].data_ptr() != inputs[i].data_ptr():
+                        static_input_surface[i].copy_(inputs[i])
+                fwd_graph.replay()
+                assert isinstance(static_outputs, tuple)
+                return tuple(o.detach() for o in static_outputs)
+
+            @staticmethod
+            @torch.autograd.function.once_differentiable
+            def backward(ctx, *grads):
+                for g, grad in zip(static_grad_outputs, grads):
+                    if g is None:
+                        assert grad is None
+                    else:
+                        # don't copy if autograd gods have been kind and the
+                        # incoming grad is already in the right place
+                        if g.data_ptr() != grad.data_ptr():
+                            g.copy_(grad)
+                bwd_graph.replay()
+
+                # Input args that didn't require grad expect a None gradient.
+                assert isinstance(static_grad_inputs, tuple)
+                return tuple(b.detach() if b is not None else b for b in static_grad_inputs)
+
+        def functionalized(*user_args):
+            # Runs the autograd function with inputs == all inputs to the graph that might require grad
+            # (explicit user args + module parameters)
+            # Assumes module params didn't change since capture.
+            out = Graphed.apply(*(user_args + module_params))
+            return out[0] if output_was_tensor else out
+
+        return functionalized
+
+    # Put together the final graphed callables
+    ret = []
+    for i, func in enumerate(callables):
+        graphed = make_graphed_autograd_function(fwd_graphs[i],
+                                                 bwd_graphs[i],
+                                                 per_callable_module_params[i],
+                                                 per_callable_len_user_args[i],
+                                                 per_callable_output_was_tensor[i],
+                                                 per_callable_static_input_surfaces[i],
+                                                 per_callable_static_outputs[i],
+                                                 per_callable_static_grad_outputs[i],
+                                                 per_callable_static_grad_inputs[i])
+
+        if isinstance(func, torch.nn.Module):
+            def make_graphed_forward(func, graph_training_state, graphed, orig_fwd):
+                def new_fwd(*user_args):
+                    # If the module's training-or-eval state matches what we graphed,
+                    # run the graph, otherwise run the original forward method
+                    if func.training == graph_training_state:
+                        return graphed(*user_args)
+                    else:
+                        return orig_fwd(*user_args)
+                return new_fwd
+            func.forward = make_graphed_forward(func, func.training, graphed, func.forward)  # type: ignore[assignment]
+            ret.append(func)
+        else:
+            ret.append(graphed)
+
+    if just_one_callable:
+        return ret[0]
+
+    return tuple(ret)
diff --git a/torch/cuda/streams.py b/torch/cuda/streams.py
index 0f983728f630a..2b4cc479e095f 100644
--- a/torch/cuda/streams.py
+++ b/torch/cuda/streams.py
@@ -8,8 +8,6 @@
     # Define dummy base classes
     torch._C.__dict__['_CudaStreamBase'] = _dummy_type('_CudaStreamBase')
     torch._C.__dict__['_CudaEventBase'] = _dummy_type('_CudaEventBase')
-    torch._C.__dict__['_CudaGraphBase'] = _dummy_type('_CudaGraphBase')
-    torch._C.__dict__['_graph_pool_handle'] = _dummy_type('_graph_pool_handle')
 
 class Stream(torch._C._CudaStreamBase):
     r"""Wrapper around a CUDA stream.
@@ -226,6 +224,3 @@ def __repr__(self):
             return '<torch.cuda.Event {0:#x}>'.format(self._as_parameter_.value)
         else:
             return '<torch.cuda.Event uninitialized>'
-
-_Graph = torch._C._CudaGraphBase
-_graph_pool_handle = torch._C._graph_pool_handle

From 13484084a64df5f2c5deea26b8cc2f30833038f3 Mon Sep 17 00:00:00 2001
From: Rishi Puri <puririshi98@berkeley.edu>
Date: Tue, 31 Aug 2021 13:47:29 -0700
Subject: [PATCH 398/530] fix syntax error in bfloat16 PR (#64122)

Summary:
fixes prior syntax error from PR ngimel

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64122

Reviewed By: H-Huang

Differential Revision: D30643596

Pulled By: ngimel

fbshipit-source-id: 0a2d5a40fb6dc7339cd03112e57ef0e1bf8a000e
---
 test/test_cuda.py      | 40 ++++++++++++++++++++++++++++++++++++++--
 torch/cuda/__init__.py |  3 ++-
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/test/test_cuda.py b/test/test_cuda.py
index 70f5a6ee4f586..6f742ec59f931 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -46,12 +46,15 @@
 TEST_LARGE_TENSOR = TEST_CUDA
 TEST_MEDIUM_TENSOR = TEST_CUDA
 TEST_CUDNN = TEST_CUDA
+TEST_BF16 = False
 if TEST_CUDA:
     torch.ones(1).cuda()  # initialize cuda context
     TEST_CUDNN = TEST_CUDA and (TEST_WITH_ROCM or
                                 torch.backends.cudnn.is_acceptable(torch.tensor(1., device=torch.device('cuda:0'))))
     TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9
     TEST_MEDIUM_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 6e9
+    TEST_BF16 = torch.cuda.is_bf16_supported()
+
 
 types = [
     torch.FloatTensor,
@@ -2707,9 +2710,9 @@ def cast(val, to_type):
 
         if add_kwargs is None:
             add_kwargs = {}
-
+        fast_dtype = torch.bfloat16 if run_as_type == torch.bfloat16 else torch.float16
         self.assertFalse(torch.is_autocast_enabled())
-        with torch.autocast('cuda', ):
+        with torch.autocast('cuda', dtype=fast_dtype):
             self.assertTrue(torch.is_autocast_enabled())
 
             out_type = out_type if out_type is not None else run_as_type
@@ -2784,6 +2787,27 @@ def test_autocast_torch_fp16(self):
                 if not skip_test:
                     self._run_autocast_outofplace(op, args, torch.float16)
 
+    @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
+    def test_autocast_torch_bf16(self):
+        with torch.backends.cudnn.flags(enabled=True, deterministic=True):
+            for op_with_args in self.autocast_lists.torch_fp16:
+                skip_test = False
+                op, args = op_with_args[0], op_with_args[1]
+                if len(op_with_args) == 3:
+                    skip_test = op_with_args[2]  # TEST_WITH_ROCM
+                should_error_from_not_implemented = 'cudnn' in op or 'prelu' in op or 'thnn' in op \
+                    or 'fused' in op or 'gru' in op or op == '_thnn_fused_lstm_cell' or op == 'lstm_cell'
+                if not skip_test:
+                    if should_error_from_not_implemented:
+                        with self.assertRaises(RuntimeError, msg=str(op) + ' should not be supported for bfloat16!'):
+                            self._run_autocast_outofplace(op, args, torch.bfloat16)
+                    else:
+                        if torch.cuda.is_bf16_supported():
+                            self._run_autocast_outofplace(op, args, torch.bfloat16)
+                        else:
+                            with self.assertRaisesRegex(RuntimeError, 'Device does not support bfloat16'):
+                                self._run_autocast_outofplace(op, args, torch.bfloat16)
+
     @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
     def test_autocast_torch_fp32(self):
         for op_with_args in self.autocast_lists.torch_fp32:
@@ -2806,6 +2830,18 @@ def test_autocast_nn_fp16(self):
             for op, args in self.autocast_lists.nn_fp16:
                 self._run_autocast_outofplace(op, args, torch.float16, module=torch._C._nn)
 
+
+
+    @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
+    def test_autocast_nn_bf16(self):
+        with torch.backends.cudnn.flags(enabled=True, deterministic=True):
+            for op, args in self.autocast_lists.nn_fp16:
+                if torch.cuda.is_bf16_supported():
+                    self._run_autocast_outofplace(op, args, torch.bfloat16, module=torch._C._nn)
+                else:
+                    with self.assertRaisesRegex(RuntimeError, 'Device does not support bfloat16'):
+                        self._run_autocast_outofplace(op, args, torch.bfloat16, module=torch._C._nn)
+
     @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
     def test_autocast_nn_fp32(self):
         for op, args in self.autocast_lists.nn_fp32:
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 924782de8d024..80d9e108643b4 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -83,7 +83,8 @@ def is_bf16_supported():
     r"""Returns a bool indicating if the current CUDA device supports dtype bfloat16"""
     cu_vers = torch.version.cuda
     if cu_vers is not None:
-        cuda_maj_decide = int(cu_vers.split(',')[0]) >= 11
+        cuda_maj_decide = int(cu_vers.split('.')[0]) >= 11
+
     else:
         cuda_maj_decide = False
     return torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 8 and cuda_maj_decide

From 9a0456939b4ce6173b32714513faceed102b229c Mon Sep 17 00:00:00 2001
From: zhouzhuojie <zhouzhuojie@gmail.com>
Date: Tue, 31 Aug 2021 13:48:28 -0700
Subject: [PATCH 399/530] Try the forked checkout action with retry (#64120)

Summary:
Fixes #{issue number}

The main difference is:
https://github.com/zhouzhuojie/checkout/commit/ffc6f93ad4b6e3cdcdd1a34e8c896765002f9b34

Can test multiple times in this PR to see if it works, will make the `retry` number configurable if it's usable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64120

Reviewed By: malfet

Differential Revision: D30656099

Pulled By: zhouzhuojie

fbshipit-source-id: a89932196bb0c44e412a34664ed6a061b02ef92e
---
 .github/templates/bazel_ci_workflow.yml.j2    |  2 +-
 .github/templates/linux_ci_workflow.yml.j2    | 10 +++++-----
 .github/templates/windows_ci_workflow.yml.j2  |  6 +++---
 .github/workflows/build_linux_conda.yml       |  6 +++---
 .github/workflows/build_linux_libtorch.yml    |  6 +++---
 .github/workflows/build_linux_wheels.yml      |  6 +++---
 .github/workflows/create_release.yml          |  2 +-
 ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml |  4 ++--
 ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml |  4 ++--
 ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml |  8 ++++----
 ...rated-linux-bionic-py3.8-gcc9-coverage.yml |  8 ++++----
 ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml |  8 ++++----
 ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml |  8 ++++----
 .../generated-linux-xenial-py3.6-gcc5.4.yml   | 10 +++++-----
 ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml |  4 ++--
 ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml |  4 ++--
 ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml |  8 ++++----
 ...rated-periodic-win-vs2019-cuda11.1-py3.yml |  6 +++---
 .../generated-win-vs2019-cpu-py3.yml          |  6 +++---
 .../generated-win-vs2019-cuda10.1-py3.yml     |  6 +++---
 .../generated-win-vs2019-cuda11.3-py3.yml     |  6 +++---
 .github/workflows/lint.yml                    | 20 +++++++++----------
 .../workflows/push_nightly_docker_ghcr.yml    |  2 +-
 .github/workflows/run_torchbench.yml          |  4 ++--
 .github/workflows/test_tools.yml              |  2 +-
 tools/test/test_extract_scripts.py            |  2 +-
 26 files changed, 79 insertions(+), 79 deletions(-)

diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2
index f4e0034a0f5d6..e9907ed679e3f 100644
--- a/.github/templates/bazel_ci_workflow.yml.j2
+++ b/.github/templates/bazel_ci_workflow.yml.j2
@@ -41,7 +41,7 @@ on:
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index d7be808898476..d5de86b1bbcfd 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -86,7 +86,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -163,7 +163,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
@@ -270,7 +270,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -308,7 +308,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
@@ -470,7 +470,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2
index 84a30bda92a36..38c346c1134f8 100644
--- a/.github/templates/windows_ci_workflow.yml.j2
+++ b/.github/templates/windows_ci_workflow.yml.j2
@@ -95,7 +95,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: recursive
           path: pytorch-${{ github.run_id }}
@@ -177,7 +177,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -206,7 +206,7 @@ jobs:
         working-directory: pytorch-${{ github.run_id }}
     steps:
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: recursive
           path: pytorch-${{ github.run_id }}
diff --git a/.github/workflows/build_linux_conda.yml b/.github/workflows/build_linux_conda.yml
index 2037f0c1cf561..536a18771831e 100644
--- a/.github/workflows/build_linux_conda.yml
+++ b/.github/workflows/build_linux_conda.yml
@@ -16,7 +16,7 @@ jobs:
       image: python:3.9
     steps:
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating build matrix
         id: set-matrix
         run: |
@@ -57,12 +57,12 @@ jobs:
       - name: Clean runner workspace
         run: rm -rf "$GITHUB_WORKSPACE"
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           path: pytorch
           submodules: recursive
       - name: Clone pytorch/builder
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           repository: pytorch/builder
           path: builder
diff --git a/.github/workflows/build_linux_libtorch.yml b/.github/workflows/build_linux_libtorch.yml
index 9d4964a8594b1..9321c6ac8bf88 100644
--- a/.github/workflows/build_linux_libtorch.yml
+++ b/.github/workflows/build_linux_libtorch.yml
@@ -16,7 +16,7 @@ jobs:
       image: python:3.9
     steps:
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating build matrix
         id: set-matrix
         run: |
@@ -51,12 +51,12 @@ jobs:
       - name: Clean runner workspace
         run: rm -rf "$GITHUB_WORKSPACE"
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           path: pytorch
           submodules: recursive
       - name: Clone pytorch/builder
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           repository: pytorch/builder
           path: builder
diff --git a/.github/workflows/build_linux_wheels.yml b/.github/workflows/build_linux_wheels.yml
index c32eee6892033..15a38f6cee0fe 100644
--- a/.github/workflows/build_linux_wheels.yml
+++ b/.github/workflows/build_linux_wheels.yml
@@ -16,7 +16,7 @@ jobs:
       image: python:3.9
     steps:
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating build matrix
         id: set-matrix
         run: |
@@ -46,12 +46,12 @@ jobs:
       - name: Clean runner workspace
         run: rm -rf "$GITHUB_WORKSPACE"
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           path: pytorch
           submodules: recursive
       - name: Clone pytorch/builder
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           repository: pytorch/builder
           path: builder
diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
index fa65168a4709c..4cd0568be5aad 100644
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@@ -14,7 +14,7 @@ jobs:
     name: Create Release
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: 'recursive'
       - name: Fake name for PRs
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 72a9c4effeee3..27b21bc6b523b 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -60,7 +60,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -140,7 +140,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 937a531c977e5..b90a497441802 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -60,7 +60,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -140,7 +140,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index f34765c98160b..274e68e38bdb4 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -60,7 +60,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -140,7 +140,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
@@ -246,7 +246,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -284,7 +284,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 6162b3cac1604..e03a019fbe7b9 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -60,7 +60,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -140,7 +140,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
@@ -246,7 +246,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -284,7 +284,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 2fe24a515ea2a..6aea843037eae 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -60,7 +60,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -140,7 +140,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
@@ -246,7 +246,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -284,7 +284,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 59b0e2535b3bb..1fdae9d1a0320 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -60,7 +60,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -140,7 +140,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
@@ -246,7 +246,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -284,7 +284,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 1fa72f51255dd..e932b488d0c71 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -60,7 +60,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -140,7 +140,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
@@ -246,7 +246,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -284,7 +284,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
@@ -458,7 +458,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 233144210dbcd..89deda0704df1 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -60,7 +60,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -135,7 +135,7 @@ jobs:
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 09989ef516a7e..63b462a19cf25 100644
--- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -58,7 +58,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -138,7 +138,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 246d5cabd86de..768146ee8cab2 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -58,7 +58,7 @@ jobs:
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
@@ -138,7 +138,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
@@ -244,7 +244,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -282,7 +282,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
           submodules: recursive
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
index 6c87f40accd64..aaf2d26e05af5 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
@@ -56,7 +56,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: recursive
           path: pytorch-${{ github.run_id }}
@@ -143,7 +143,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -169,7 +169,7 @@ jobs:
         working-directory: pytorch-${{ github.run_id }}
     steps:
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: recursive
           path: pytorch-${{ github.run_id }}
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index 30f328ae71fdd..08656c9dd99d8 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -56,7 +56,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: recursive
           path: pytorch-${{ github.run_id }}
@@ -135,7 +135,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -161,7 +161,7 @@ jobs:
         working-directory: pytorch-${{ github.run_id }}
     steps:
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: recursive
           path: pytorch-${{ github.run_id }}
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index 72dd21dce3899..2a8570d1e84b1 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -58,7 +58,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: recursive
           path: pytorch-${{ github.run_id }}
@@ -145,7 +145,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -171,7 +171,7 @@ jobs:
         working-directory: pytorch-${{ github.run_id }}
     steps:
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: recursive
           path: pytorch-${{ github.run_id }}
diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index eb6e02fb5c2f3..7235db7f3c9f5 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -58,7 +58,7 @@ jobs:
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: recursive
           path: pytorch-${{ github.run_id }}
@@ -145,7 +145,7 @@ jobs:
       - name: Install dependencies
         run: pip install typing-extensions
       - name: Clone pytorch/pytorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Generating test matrix
         id: set-matrix
         run: .github/scripts/generate_pytorch_test_matrix.py
@@ -171,7 +171,7 @@ jobs:
         working-directory: pytorch-${{ github.run_id }}
     steps:
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           submodules: recursive
           path: pytorch-${{ github.run_id }}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index f036bc17d2a2d..f69d2b01ab35a 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -16,7 +16,7 @@ jobs:
           python-version: 3.x
           architecture: x64
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Install requirements
         id: requirements
         run: pip3 install -r requirements.txt --user
@@ -101,7 +101,7 @@ jobs:
           python-version: 3.x
           architecture: x64
       - name: Fetch PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow us to use git merge-base
       - name: Run clang-format
@@ -140,7 +140,7 @@ jobs:
           python-version: 2.x
           architecture: x64
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Attempt to run setup.py
         run: |
           if ! python2 setup.py | grep -q "Python 2 has reached end-of-life and is no longer supported by PyTorch."; then
@@ -159,7 +159,7 @@ jobs:
           python-version: 3.x
           architecture: x64
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Install requirements
         id: requirements
         run: |
@@ -168,7 +168,7 @@ jobs:
         run: |
           pip3 install Jinja2==3.0.1 --user
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Regenerate workflows
         id: generate_workflows
         run: .github/scripts/generate_ci_workflows.py
@@ -238,7 +238,7 @@ jobs:
       - name: Setup Node
         uses: actions/setup-node@v2
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Install markdown-toc
         run: npm install -g markdown-toc
       - name: Regenerate ToCs and check that they didn't change
@@ -274,7 +274,7 @@ jobs:
           python-version: 3.x
           architecture: x64
       - name: Fetch PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 2 # to allow us to use github.event.pull_request.head.sha
       - name: Prepare output dir with HEAD commit SHA
@@ -326,7 +326,7 @@ jobs:
       image: ghcr.io/pytorch/cilint-clang-tidy:d8f0c777964d0dd8a147360de80aed1a13eb613a
     steps:
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # to allow tools/linter/clang_tidy.py to do its thing
       - name: Prepare output dir with HEAD commit SHA
@@ -412,7 +412,7 @@ jobs:
           python-version: 3.x
           architecture: x64
       - name: Fetch PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Install dependencies
         run: |
           set -eux
@@ -434,7 +434,7 @@ jobs:
           python-version: 3.8
           architecture: x64
       - name: Fetch PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       - name: Install dependencies
         run: |
           set -eux
diff --git a/.github/workflows/push_nightly_docker_ghcr.yml b/.github/workflows/push_nightly_docker_ghcr.yml
index 311aa94601d6a..892cb5c17aa86 100644
--- a/.github/workflows/push_nightly_docker_ghcr.yml
+++ b/.github/workflows/push_nightly_docker_ghcr.yml
@@ -14,7 +14,7 @@ jobs:
       GHCR_PAT: ${{ secrets.GHCR_PAT }}
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           ref: master
       - name: Build and upload nightly docker
diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml
index 0ae189e99f06a..786d25f4e3b0f 100644
--- a/.github/workflows/run_torchbench.yml
+++ b/.github/workflows/run_torchbench.yml
@@ -18,11 +18,11 @@ jobs:
     timeout-minutes: 720
     steps:
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           path: pytorch
       - name: Checkout TorchBench
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           repository: pytorch/benchmark
           path: benchmark
diff --git a/.github/workflows/test_tools.yml b/.github/workflows/test_tools.yml
index 19a0fd9d4e7e7..afc790bb10e2b 100644
--- a/.github/workflows/test_tools.yml
+++ b/.github/workflows/test_tools.yml
@@ -16,7 +16,7 @@ jobs:
           python-version: 3.x
           architecture: x64
       - name: Checkout PyTorch
-        uses: actions/checkout@v2
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           fetch-depth: 0 # deep clone, to allow us to use git log
       - name: Install dependencies
diff --git a/tools/test/test_extract_scripts.py b/tools/test/test_extract_scripts.py
index 29802517963b3..3126893c4bb39 100644
--- a/tools/test/test_extract_scripts.py
+++ b/tools/test/test_extract_scripts.py
@@ -20,7 +20,7 @@ def test_extract_none(self) -> None:
         self.assertEqual(
             extract_scripts.extract({
                 'name': 'Checkout PyTorch',
-                'uses': 'actions/checkout@v2',
+                'uses': 'zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9',
             }),
             None,
         )

From 491bf7cb7474d82e4349ea4687b544840e591b50 Mon Sep 17 00:00:00 2001
From: Kevin Tse <ktse@fb.com>
Date: Tue, 31 Aug 2021 13:55:59 -0700
Subject: [PATCH 400/530] [DataPipe] adding description, __len__, tests for
 mux() (#64224)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64224

cc VitalyFedyunin ejguan

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30651551

Pulled By: NivekT

fbshipit-source-id: f8af98ba71a592900b992a8077432062ec57bb48
---
 test/test_datapipe.py                        | 42 ++++++++++++++++++++
 torch/utils/data/datapipes/iter/combining.py | 33 +++++++++++----
 2 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index 4e37f41565226..24d0ce20d63dd 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -354,6 +354,15 @@ def test_demux_mux_datapipe(self):
         n = n1.mux(n2, n3)
         self.assertEqual(list(range(10)), list(n))
 
+        # Test Case: Uneven DataPipes
+        source_numbers = list(range(0, 10)) + [10, 12]
+        numbers_dp = IDP(source_numbers)
+        n1, n2 = numbers_dp.demux(2, lambda x: x % 2)
+        self.assertEqual([0, 2, 4, 6, 8, 10, 12], list(n1))
+        self.assertEqual([1, 3, 5, 7, 9], list(n2))
+        n = n1.mux(n2)
+        self.assertEqual(source_numbers, list(n))
+
 
 class FileLoggerSimpleHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
     def __init__(self, *args, logfile=None, **kwargs):
@@ -1221,6 +1230,39 @@ def fn(item, dtype=torch.float, *, sum=False):
                 map_dp[index], torch.tensor(input_dp[index], dtype=torch.int).sum()
             )
 
+    def test_mux_datapipe(self):
+
+        # Test Case: Elements are yielded one at a time from each DataPipe, until they are all exhausted
+        input_dp1 = IDP(range(4))
+        input_dp2 = IDP(range(4, 8))
+        input_dp3 = IDP(range(8, 12))
+        output_dp = input_dp1.mux(input_dp2, input_dp3)
+        expected_output = [0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11]
+        self.assertEqual(len(expected_output), len(output_dp))
+        self.assertEqual(expected_output, list(output_dp))
+
+        # Test Case: Uneven input Data Pipes
+        input_dp1 = IDP([1, 2, 3, 4])
+        input_dp2 = IDP([10])
+        input_dp3 = IDP([100, 200, 300])
+        output_dp = input_dp1.mux(input_dp2, input_dp3)
+        expected_output = [1, 10, 100, 2, 200, 3, 300, 4]
+        self.assertEqual(len(expected_output), len(output_dp))
+        self.assertEqual(expected_output, list(output_dp))
+
+        # Test Case: Empty Data Pipe
+        input_dp1 = IDP([0, 1, 2, 3])
+        input_dp2 = IDP([])
+        output_dp = input_dp1.mux(input_dp2)
+        self.assertEqual(len(input_dp1), len(output_dp))
+        self.assertEqual(list(input_dp1), list(output_dp))
+
+        # Test Case: raises TypeError when __len__ is called and an input doesn't have __len__
+        input_dp1 = IDP(range(10))
+        input_dp_no_len = IDP_NoLen(range(10))
+        output_dp = input_dp1.mux(input_dp_no_len)
+        with self.assertRaises(TypeError):
+            len(output_dp)
 
 # Metaclass conflict for Python 3.6
 # Multiple inheritance with NamedTuple is not supported for Python 3.9
diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index a837c5bb101c7..ed1256fa1e757 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -1,7 +1,7 @@
 import warnings
 
 from torch.utils.data import IterDataPipe, functional_datapipe
-from typing import Any, Callable, Iterator, List, Optional, Sized, Tuple, TypeVar, Deque
+from typing import Any, Callable, Iterator, List, Optional, Set, Sized, Tuple, TypeVar, Deque
 from collections import deque
 
 T_co = TypeVar('T_co', covariant=True)
@@ -261,24 +261,41 @@ def reset(self):
 
 @functional_datapipe('mux')
 class MultiplexerIterDataPipe(IterDataPipe):
+    r""" :class:`MultiplexerIterDataPipe`.
 
+        Iterable DataPipe that yields one element at a time from each input Iterable DataPipe
+        (i.e. one element from the 1st input DataPipe, then one element from the 2nd DataPipe in the next iteration,
+        and so on). It skips over DataPipes that are exhausted, and ends when all input DataPipes are exhausted.
+
+        Args:
+            datapipes: Iterable DataPipes that will take turn to yield their elements, until they are all exhausted
+    """
     def __init__(self, *datapipes):
         self.datapipes = datapipes
+        self.length: Optional[int] = None
 
     def __iter__(self):
         iterators = [iter(x) for x in self.datapipes]
-        finished = {}
-        had_more = True
-        while had_more:
-            had_more = False
+        finished: Set[int] = set()
+        while len(finished) < len(iterators):
             for i in range(len(iterators)):
                 if i not in finished:
                     try:
-                        value = iterators[i].__next__()
-                        had_more = True
+                        value = next(iterators[i])
                         yield value
                     except StopIteration:
-                        finished[i] = 1
+                        finished.add(i)
+
+    def __len__(self):
+        if self.length is not None:
+            if self.length == -1:
+                raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))
+            return self.length
+        if all(isinstance(dp, Sized) for dp in self.datapipes):
+            self.length = sum(len(dp) for dp in self.datapipes)
+        else:
+            self.length = -1
+        return len(self)
 
 
 @functional_datapipe('zip')

From 6c8cb9bd76299be00d787be2ba02daed0b3921a6 Mon Sep 17 00:00:00 2001
From: Kevin Tse <ktse@fb.com>
Date: Tue, 31 Aug 2021 13:55:59 -0700
Subject: [PATCH 401/530] [DataPipe] export fork, mux, demux for public usage
 (#64279)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64279

cc VitalyFedyunin ejguan

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30671971

Pulled By: NivekT

fbshipit-source-id: 056ac12ef7183b254d1eec341145594639e47ef6
---
 torch/utils/data/datapipes/iter/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index d4baef788ecca..26d715d310234 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -54,14 +54,17 @@
            'BucketBatcher',
            'Collator',
            'Concater',
+           'Demultiplexer',
            'FileLister',
            'FileLoader',
            'Filter',
+           'Forker',
            'Grouper',
            'HttpReader',
            'IterableWrapper',
            'LineReader',
            'Mapper',
+           'Multiplexer',
            'RoutedDecoder',
            'Sampler',
            'Shuffler',

From 0457a85d459479881ad07e84a8e9f53bf82bb48d Mon Sep 17 00:00:00 2001
From: Richard Zou <rzou@fb.com>
Date: Tue, 31 Aug 2021 14:53:01 -0700
Subject: [PATCH 402/530] Revert D30543236: Add python mode

Test Plan: revert-hammer

Differential Revision:
D30543236 (https://github.com/pytorch/pytorch/commit/4bd03b02424d93b72f15e28c542ede13f88ea929)

Original commit changeset: ef5444d96a5a

fbshipit-source-id: b0042ac2c22765fa11d6d00bf751f6a4489eb6d8
---
 aten/src/ATen/PythonModeTLS.cpp             | 26 -------
 aten/src/ATen/PythonModeTLS.h               | 17 -----
 aten/src/ATen/ThreadLocalState.cpp          |  3 -
 aten/src/ATen/ThreadLocalState.h            |  3 -
 aten/src/ATen/core/PythonFallbackKernel.cpp | 13 +---
 c10/core/TensorImpl.cpp                     | 20 +----
 c10/core/TensorImpl.h                       | 35 +--------
 test/run_test.py                            |  1 -
 test/test_python_dispatch.py                | 81 +--------------------
 tools/build_variables.bzl                   |  2 -
 torch/_C/__init__.pyi.in                    |  2 -
 torch/csrc/autograd/init.cpp                | 17 -----
 torch/csrc/autograd/python_mode.cpp         | 27 -------
 torch/csrc/autograd/python_mode.h           | 17 -----
 torch/csrc/autograd/python_variable.cpp     | 38 ++--------
 torch/csrc/utils/python_arg_parser.cpp      | 39 ++--------
 torch/csrc/utils/python_arg_parser.h        | 11 +--
 torch/csrc/utils/tensor_new.cpp             |  1 -
 torch/utils/_python_dispatch.py             | 34 ---------
 19 files changed, 21 insertions(+), 366 deletions(-)
 delete mode 100644 aten/src/ATen/PythonModeTLS.cpp
 delete mode 100644 aten/src/ATen/PythonModeTLS.h
 delete mode 100644 torch/csrc/autograd/python_mode.cpp
 delete mode 100644 torch/csrc/autograd/python_mode.h
 delete mode 100644 torch/utils/_python_dispatch.py

diff --git a/aten/src/ATen/PythonModeTLS.cpp b/aten/src/ATen/PythonModeTLS.cpp
deleted file mode 100644
index b53043ca84147..0000000000000
--- a/aten/src/ATen/PythonModeTLS.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <ATen/PythonModeTLS.h>
-
-namespace at { namespace impl {
-
-thread_local std::shared_ptr<TorchDispatchTypeObject> pythonModeState;
-
-void PythonModeTLS::set_state(const std::shared_ptr<TorchDispatchTypeObject>& state) {
-  pythonModeState = state;
-  if (state) {
-    c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true);
-  } else {
-    PythonModeTLS::reset_state();
-  }
-}
-
-const std::shared_ptr<TorchDispatchTypeObject>& PythonModeTLS::get_state() {
-  return pythonModeState;
-}
-
-void PythonModeTLS::reset_state() {
-  pythonModeState.reset((TorchDispatchTypeObject*)nullptr);
-  c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false);
-}
-
-} // namespace impl
-} // namespace at
diff --git a/aten/src/ATen/PythonModeTLS.h b/aten/src/ATen/PythonModeTLS.h
deleted file mode 100644
index be52b182c659b..0000000000000
--- a/aten/src/ATen/PythonModeTLS.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#pragma once
-
-#include <c10/macros/Macros.h>
-#include <torch/library.h>
-#include <ATen/core/dispatch/Dispatcher.h>
-
-namespace at {
-namespace impl {
-
-struct TORCH_API PythonModeTLS {
-  static void set_state(const std::shared_ptr<TorchDispatchTypeObject>& state);
-  static const std::shared_ptr<TorchDispatchTypeObject>& get_state();
-  static void reset_state();
-};
-
-} // namespace impl
-} // namespace at
diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp
index 19cfa89967ccb..98c2519e045ce 100644
--- a/aten/src/ATen/ThreadLocalState.cpp
+++ b/aten/src/ATen/ThreadLocalState.cpp
@@ -17,7 +17,6 @@ ThreadLocalState::ThreadLocalState()
   saved_tensors_default_hooks_ = SavedTensorDefaultHooks::get_hooks();
 
   bumped_record_all_functions_ = at::checkRecordAllFunctions();
-  python_mode_state_ = at::impl::PythonModeTLS::get_state();
 }
 
 void ThreadLocalState::set_grad_mode(bool enabled) {
@@ -31,8 +30,6 @@ void ThreadLocalState::setThreadLocalState(
   // restore the dispatch key set TLS at the same time.
   c10::AutogradState::set_tls_state(state.autograd_tls_);
 
-  at::impl::PythonModeTLS::set_state(state.python_mode_state_);
-
   at::set_record_function_tls_(state.rf_tls_);
 
   SavedTensorDefaultHooks::set_hooks(
diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h
index c99ca6158ffa5..41146912819b4 100644
--- a/aten/src/ATen/ThreadLocalState.h
+++ b/aten/src/ATen/ThreadLocalState.h
@@ -6,7 +6,6 @@
 #include <c10/util/ThreadLocalDebugInfo.h>
 
 #include <ATen/record_function.h>
-#include <ATen/PythonModeTLS.h>
 
 namespace at {
 
@@ -41,8 +40,6 @@ class TORCH_API ThreadLocalState {
   // TLS for AutogradModes
   AutogradState autograd_tls_;
 
-  std::shared_ptr<TorchDispatchTypeObject> python_mode_state_;
-
   // TLS for saved tensors default hooks
   std::pair<PyObject*, PyObject*> saved_tensors_default_hooks_;
 
diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp
index 8e77d0952ec75..276eabfe458c0 100644
--- a/aten/src/ATen/core/PythonFallbackKernel.cpp
+++ b/aten/src/ATen/core/PythonFallbackKernel.cpp
@@ -1,18 +1,9 @@
 #include <torch/library.h>
 #include <ATen/core/dispatch/Dispatcher.h>
-#include <ATen/PythonModeTLS.h>
 
 namespace {
 
 void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
-  // If Python Mode is active, use its PyInterpreter for dispatch
-  const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state();
-  if (maybe_python_mode_state) {
-    maybe_python_mode_state->pyinterpreter()->dispatch(op, stack, maybe_python_mode_state);
-    return;
-  }
-
-  // Otherwise, find a PyInterpreter on a Tensor
   const auto& schema = op.schema();
   const auto num_arguments = schema.arguments().size();
   // It is safe to dispatch on the very first Tensor with a pyobj_interpreter
@@ -24,7 +15,7 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
     if (ivalue.isTensor()) {
       auto* interpreter = ivalue.unsafeToTensorImpl()->pyobj_interpreter();
       if (interpreter) {
-        interpreter->dispatch(op, stack, nullptr);
+        interpreter->dispatch(op, stack);
         return;
       }
     } else if (ivalue.isTensorList()) {
@@ -33,7 +24,7 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
       for (const auto& nv : ivalue.toListRef()) {
         auto* interpreter = nv.unsafeToTensorImpl()->pyobj_interpreter();
         if (interpreter) {
-          interpreter->dispatch(op, stack, nullptr);
+          interpreter->dispatch(op, stack);
           return;
         }
       }
diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp
index 9a72659711743..de829c493732d 100644
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@@ -40,8 +40,7 @@ static c10::intrusive_ptr<TensorImpl> noop_detach_fn(
 static void noop_dispatch_fn(
     const PyInterpreter*,
     const c10::OperatorHandle& op,
-    torch::jit::Stack* stack,
-    const std::shared_ptr<TorchDispatchTypeObject>& type) {
+    torch::jit::Stack* stack) {
   TORCH_INTERNAL_ASSERT(
       0,
       "attempted to dispatch (__torch_dispatch__) an operator on Tensor with nontrivial PyObject after corresponding interpreter died");
@@ -609,23 +608,6 @@ void TensorImpl::copy_tensor_metadata(
   }
 }
 
-TorchDispatchTypeObject::TorchDispatchTypeObject(
-    PyObject* type_object,
-    c10::impl::PyInterpreter* pyinterpreter)
-    : data_(type_object), pyinterpreter_(pyinterpreter) {}
-
-TorchDispatchTypeObject::~TorchDispatchTypeObject() {
-  pyinterpreter_->decref(data_);
-}
-
-c10::impl::PyInterpreter* TorchDispatchTypeObject::pyinterpreter() const {
-  return pyinterpreter_;
-}
-
-PyObject* TorchDispatchTypeObject::ptr() const {
-  return data_;
-}
-
 namespace impl {
 
 namespace {
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index d110a17b46590..7051e36b35516 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -161,9 +161,6 @@ struct C10_API AutogradMetaInterface {
   virtual ~AutogradMetaInterface();
 };
 
-// forward declared
-struct TorchDispatchTypeObject;
-
 namespace impl {
 
 // Unfortunately, the definition of AutogradMeta lives in a separate
@@ -258,8 +255,7 @@ struct C10_API PyInterpreter {
   using dispatch_sig = void(
       const PyInterpreter*,
       const c10::OperatorHandle&,
-      torch::jit::Stack* stack,
-      const std::shared_ptr<TorchDispatchTypeObject>& type);
+      torch::jit::Stack* stack);
 
   PyInterpreter(
       name_sig* name_fn,
@@ -303,9 +299,8 @@ struct C10_API PyInterpreter {
   // Invoke the Python boxed fallback dispatch to go back into Python
   __ubsan_ignore_function__ void dispatch(
       const c10::OperatorHandle& op,
-      torch::jit::Stack* stack,
-      const std::shared_ptr<TorchDispatchTypeObject>& type) const {
-    return (*dispatch_fn_)(this, op, stack, type);
+      torch::jit::Stack* stack) const {
+    return (*dispatch_fn_)(this, op, stack);
   }
 
   // Disarm this PyInterpreter, making all of its methods noops.
@@ -353,30 +348,6 @@ struct C10_API NamedTensorMetaInterface {
   };
 };
 
-// NOTE [What is TorchDispatchTypeObject?]
-// A TorchDispatchTypeObject represents the type of a Tensor subclass that has
-// a __torch_dispatch__ classmethod. Concretely, it holds the class as a
-// PyObject* and a PyInterpreter* that says which python interpreter the class
-// came from.
-//
-// See NOTE [dispatch_fn's type argument] for more details
-struct C10_API TorchDispatchTypeObject {
-  // Steals a reference to type_object
-  TorchDispatchTypeObject(
-      PyObject* type_object,
-      c10::impl::PyInterpreter* pyinterpreter);
-
-  // Releases the stolen reference to type_object
-  ~TorchDispatchTypeObject();
-
-  c10::impl::PyInterpreter* pyinterpreter() const;
-  PyObject* ptr() const;
-
- private:
-  PyObject* data_;
-  c10::impl::PyInterpreter* pyinterpreter_;
-};
-
 // NOTE [ Version Counter Sharing ]
 //
 // Every Tensor has a version counter. Version counters are incremented whenever
diff --git a/test/run_test.py b/test/run_test.py
index d0871fa2a0d30..55b2f3841d7b5 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -104,7 +104,6 @@
     "test_optim",
     "test_functional_optim",
     "test_pytree",
-    "test_python_dispatch",
     "test_mobile_optimizer",
     "test_set_default_mobile_cpu_allocator",
     "test_xnnpack_integration",
diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py
index e474f1f4783f0..0f5b6b9cbd70e 100644
--- a/test/test_python_dispatch.py
+++ b/test/test_python_dispatch.py
@@ -1,7 +1,6 @@
 import torch
 from torch.testing._internal.common_utils import TestCase, run_tests
 from torch.utils._pytree import tree_map
-from torch.utils._python_dispatch import enable_python_mode
 
 from typing import Iterator, List
 import logging
@@ -51,10 +50,7 @@ def unwrap(e):
         def wrap(e):
             return LoggingTensor(e) if isinstance(e, torch.Tensor) else e
 
-        # no_dispatch is only needed if you use enable_python_mode.
-        # It prevents infinite recursion.
-        with no_dispatch():
-            rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs)))
+        rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs)))
         logging.getLogger("LoggingTensor").info(f"{func.__module__}.{func.__name__}", args, kwargs, rs)
         return rs
 
@@ -339,81 +335,6 @@ def backward(ctx, grad_output):
 $5 = torch._ops.aten.mul($4, $0)
 $6 = torch._ops.aten.add_($1, $5)''')
 
-    def test_enable_python_mode_error(self) -> None:
-        with self.assertRaisesRegex(ValueError, "__torch_dispatch__"):
-            with enable_python_mode(torch.Tensor):
-                pass
-        z = LoggingTensor(torch.empty([]))
-        with self.assertRaisesRegex(ValueError, "must be the type"):
-            with enable_python_mode(z):
-                pass
-
-    def test_enable_python_mode_basic(self) -> None:
-        with enable_python_mode(LoggingTensor):
-            z = torch.empty([])
-            self.assertTrue(isinstance(z, LoggingTensor))
-
-    def test_enable_python_mode_unrelated_tensors(self) -> None:
-        x = torch.randn([])
-        y = torch.randn([])
-        with enable_python_mode(LoggingTensor):
-            z = x + y
-            self.assertTrue(isinstance(z, LoggingTensor))
-
-    def test_enable_python_mode_subclass_priority(self) -> None:
-        class ErrorA(RuntimeError):
-            pass
-
-        class ErrorB(RuntimeError):
-            pass
-
-        class A(torch.Tensor):
-            @staticmethod
-            def __new__(cls, elem):
-                return torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
-
-            @classmethod
-            def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
-                raise ErrorA
-
-        class B(A):
-            @staticmethod
-            def __new__(cls, elem):
-                return torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
-
-            @classmethod
-            def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
-                raise ErrorB
-
-        a = A(torch.empty(1))
-        b = B(torch.empty(1))
-        with self.assertRaises(ErrorA):
-            a + a
-
-        # B has precedence over A due to the subclass relationship
-        with self.assertRaises(ErrorB):
-            with enable_python_mode(A):
-                b + b
-        with self.assertRaises(ErrorB):
-            with enable_python_mode(B):
-                a + a
-        with self.assertRaises(ErrorB):
-            with enable_python_mode(B):
-                a + b
-
-    def test_enable_python_mode_respects_no_dispatch(self) -> None:
-        with enable_python_mode(LoggingTensor):
-            z = torch.ones([2, 3])
-            self.assertTrue(isinstance(z, LoggingTensor))
-            with no_dispatch():
-                expected = torch.ones([2, 3])
-                self.assertEqual(z.elem, expected)
-
-    def test_nested_enable_python_mode(self) -> None:
-        with self.assertRaisesRegex(RuntimeError, "has already been set"):
-            with enable_python_mode(LoggingTensor):
-                with enable_python_mode(LoggingTensor):
-                    pass
 
 if __name__ == '__main__':
     run_tests()
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index dd89981094d4f..34846b5d6c7b3 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -666,7 +666,6 @@ libtorch_python_core_sources = [
     "torch/csrc/autograd/init.cpp",
     "torch/csrc/autograd/python_anomaly_mode.cpp",
     "torch/csrc/autograd/python_saved_variable_hooks.cpp",
-    "torch/csrc/autograd/python_mode.cpp",
     "torch/csrc/autograd/python_cpp_function.cpp",
     "torch/csrc/autograd/python_engine.cpp",
     "torch/csrc/autograd/python_function.cpp",
@@ -794,7 +793,6 @@ aten_cpu_source_non_codegen_list = [
     "aten/src/ATen/ParallelNativeTBB.cpp",
     "aten/src/ATen/ParallelOpenMP.cpp",
     "aten/src/ATen/ParallelThreadPoolNative.cpp",
-    "aten/src/ATen/PythonModeTLS.cpp",
     "aten/src/ATen/ScalarOps.cpp",
     "aten/src/ATen/SequenceNumber.cpp",
     "aten/src/ATen/SparseTensorImpl.cpp",
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 352edbee6bc5e..01fdf9e12500a 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -652,8 +652,6 @@ def __set_forward_AD_enabled(enabled: _bool) -> None: ...
 def __is_forward_AD_enabled() -> _bool: ...
 def _register_default_hooks(pack_hook: Callable, unpack_hook: Callable) -> None: ...
 def _reset_default_hooks() -> None: ...
-def _enter_python_mode(cls: Type) -> None: ...
-def _exit_python_mode() -> None: ...
 
 class _InferenceMode(object):
     def __init__(self, mode: _bool) -> None: ...
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 860aaec466218..697ca871f83c5 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -14,7 +14,6 @@
 #include <torch/csrc/autograd/python_saved_variable_hooks.h>
 #include <torch/csrc/autograd/utils/wrap_outputs.h>
 #include <torch/csrc/autograd/utils/python_arg_parsing.h>
-#include <torch/csrc/autograd/python_mode.h>
 #include <torch/csrc/utils/pycfunction_helpers.h>
 #include <c10/core/ScalarType.h>
 
@@ -495,20 +494,6 @@ static PyObject * python_exit_dual_level(PyObject* _unused, PyObject* args, PyOb
   END_HANDLE_TH_ERRORS
 }
 
-static PyObject * enter_python_mode(PyObject* _unused, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  PythonMode::enter(arg);
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-static PyObject * exit_python_mode(PyObject* _unused, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  PythonMode::exit();
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
 // autograd methods on torch._C
 static PyMethodDef methods[] = { // NOLINT
   {"_set_grad_enabled", set_grad_enabled, METH_O, nullptr},
@@ -529,8 +514,6 @@ static PyMethodDef methods[] = { // NOLINT
   {"is_anomaly_enabled", is_anomaly_mode_enabled, METH_NOARGS, nullptr},
   {"_enter_dual_level", python_enter_dual_level, METH_NOARGS, nullptr},
   {"_exit_dual_level", castPyCFunctionWithKeywords(python_exit_dual_level), METH_VARARGS | METH_KEYWORDS, nullptr},
-  {"_enter_python_mode", enter_python_mode, METH_O, nullptr},
-  {"_exit_python_mode", exit_python_mode, METH_NOARGS, nullptr},
   {nullptr, nullptr, 0, nullptr}
 };
 
diff --git a/torch/csrc/autograd/python_mode.cpp b/torch/csrc/autograd/python_mode.cpp
deleted file mode 100644
index 435842631a5bb..0000000000000
--- a/torch/csrc/autograd/python_mode.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include <torch/csrc/autograd/python_mode.h>
-#include <torch/csrc/python_headers.h>
-#include <torch/csrc/autograd/python_variable.h>
-#include <ATen/PythonModeTLS.h>
-#include <c10/core/TensorImpl.h>
-
-namespace torch { namespace autograd {
-
-void PythonMode::enter(PyObject* type) {
-  if (at::impl::PythonModeTLS::get_state()) {
-    TORCH_CHECK(
-        false,
-        "python mode has already been set. We do not yet support nested python ",
-        "mode. Please file us an issue and reset it before setting it again.")
-  }
-  // TorchDispatchTypeObject steals a reference, See NOTE [What is TorchDispatchTypeObject?]
-  Py_INCREF(type);
-  auto state = std::make_shared<c10::TorchDispatchTypeObject>(type, getPyInterpreter());
-  at::impl::PythonModeTLS::set_state(state);
-}
-
-void PythonMode::exit() {
-  TORCH_INTERNAL_ASSERT(at::impl::PythonModeTLS::get_state(), "exiting Python Mode but it wasn't set!");
-  at::impl::PythonModeTLS::reset_state();
-}
-
-}}
diff --git a/torch/csrc/autograd/python_mode.h b/torch/csrc/autograd/python_mode.h
deleted file mode 100644
index 03da51c1c49e0..0000000000000
--- a/torch/csrc/autograd/python_mode.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#pragma once
-
-#include <torch/csrc/python_headers.h>
-#include <c10/core/TensorImpl.h>
-
-namespace torch { namespace autograd {
-
-struct TORCH_API PythonMode {
-  // Enter python mode, causing all operators to dispatch to the type's __torch_dispatch__.
-  // `type` is the type of a Tensor subclass that has __torch_dispatch__.
-  static void enter(PyObject* type);
-
-  // Exit the current python mode.
-  static void exit();
-};
-
-}}
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index abe90105cde2b..50d6eb9ab7e05 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -32,7 +32,6 @@
 
 #include <torch/library.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
-#include <torch/csrc/autograd/python_mode.h>
 
 
 #include <ATen/ATen.h>
@@ -65,12 +64,7 @@ void concrete_decref_fn(const c10::impl::PyInterpreter* self, PyObject* pyobj) {
     return;
 
   pybind11::gil_scoped_acquire gil;
-  // Two possibilities:
-  // 1. We are decref-ing a tensor. Then we must be careful about
-  // PyObject resurrection (this only applies to Tensors, see THPVariable_clear).
-  // 2. We are decref-ing some other Python object. We don't do
-  // PyObject resurrection on non-Tensors, so we just carry on as usual
-  if (THPVariable_Check(pyobj) && Py_REFCNT(pyobj) > 1) {
+  if (Py_REFCNT(pyobj) > 1) {
     // It's still alive!  This can happen if a weak ref resurrected
     // the PyObject without flipping ownership.  At this point it is
     // too late to rescue the object, so just stub out the PyObject
@@ -88,11 +82,7 @@ void concrete_decref_fn(const c10::impl::PyInterpreter* self, PyObject* pyobj) {
 };
 
 c10::intrusive_ptr<TensorImpl> concrete_detach_fn(const c10::impl::PyInterpreter*, const c10::TensorImpl* self);
-void concrete_dispatch_fn(
-    const c10::impl::PyInterpreter*,
-    const c10::OperatorHandle& op,
-    torch::jit::Stack* stack,
-    const std::shared_ptr<TorchDispatchTypeObject>& type);
+void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack);
 
 class PyInterpreterHolder {
  public:
@@ -1501,19 +1491,7 @@ bool isPythonTensor(const Tensor& tensor) {
   return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Python);
 }
 
-// NOTE [dispatch_fn's type argument]
-// `type` is nullable and represents the PythonMode going on.
-// Right now we only support a single PythonMode, but in the future we could
-// change this to a stack of PythonModes.
-//
-// If `type` isn't null, then we consider the type for dispatch by prepending
-// it to the overloaded_args list. `handle_torch_funciton_no_python_arg_parser`
-// is responsible for doing overload resolution.
-void concrete_dispatch_fn(
-    const c10::impl::PyInterpreter*,
-    const c10::OperatorHandle& op,
-    torch::jit::Stack* stack,
-    const std::shared_ptr<TorchDispatchTypeObject>& type) {
+void concrete_dispatch_fn(const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack) {
   const auto& schema = op.schema();
   const auto num_returns = schema.returns().size();
 
@@ -1590,17 +1568,13 @@ void concrete_dispatch_fn(
   auto args = py::reinterpret_steal<py::object>(PyTuple_New(positional_default_start));
   py::dict kwargs;
 
-  if (type) {
-    append_overloaded_type(&overloaded_args, type->ptr());
-  }
-
   // Find overloaded tensors
   for (int64_t idx = 0; idx < arguments.size(); idx++) {
     const auto& ivalue = arguments[idx];
     if (ivalue.isTensor()) {
       const auto& tensor = ivalue.toTensor();
       if (isPythonTensor(tensor)) {
-        append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr());
+        append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr());
       }
     } else if (ivalue.isList()) {
       const auto& list = ivalue.toListRef();
@@ -1609,7 +1583,7 @@ void concrete_dispatch_fn(
         if (nv.isTensor()) {
           const auto& tensor = nv.toTensor();
           if (isPythonTensor(tensor)) {
-            append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr());
+            append_overloaded_arg(&overloaded_args, py::cast(tensor).ptr());
           }
         }
       }
@@ -1659,7 +1633,7 @@ c10::intrusive_ptr<TensorImpl> concrete_detach_fn(const c10::impl::PyInterpreter
   Tensor self_t = Tensor(c10::intrusive_ptr<c10::TensorImpl, c10::UndefinedTensorImpl>::unsafe_reclaim_from_nonowning(const_cast<c10::TensorImpl*>(self)));
   auto self_p = py::reinterpret_steal<py::object>(THPVariable_Wrap(self_t));
   TORCH_INTERNAL_ASSERT(isPythonTensor(self_t));
-  append_overloaded_tensor(&overloaded_args, self_p.ptr());
+  append_overloaded_arg(&overloaded_args, self_p.ptr());
   auto args = py::reinterpret_steal<py::object>(PyTuple_New(1));
   PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr());
 
diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp
index 3ee20c055bf94..6115dcdfbe61a 100644
--- a/torch/csrc/utils/python_arg_parser.cpp
+++ b/torch/csrc/utils/python_arg_parser.cpp
@@ -200,28 +200,12 @@ auto handle_torch_function(PyObject* self, const std::string& func_name, PyObjec
   return ret.release().ptr();
 }
 
-// Note: [Overloaded args]
-// An overloaded arg may be one of the following:
-// - an instance of an object that has a __torch_function__ method
-// - an instance of an object that has a __torch_dispatch__ classmethod
-// - a class type that has a __torch_dispatch__ classmethod
-//
-// This function returns the type of the arg (if the arg is an instance),
-// otherwise, it returns the arg.
-static PyObject* get_type_of_overloaded_arg(PyObject* obj_or_type) {
-  if (PyType_Check(obj_or_type)) {
-    return obj_or_type;
-  }
-  return (PyObject*)Py_TYPE(obj_or_type);
-}
-
-// See Note: [Overloaded args] for what they hold
 auto handle_torch_function_no_python_arg_parser(const std::vector<py::handle> &overloaded_args, PyObject* args, PyObject* kwargs, const char* func_name, PyObject* torch_api_function, const char* module_name, const char* torch_function_name) -> PyObject* {
   // overloaded_args already all have unique types
   std::vector<py::object> overloaded_types;
   overloaded_types.reserve(overloaded_args.size());
   for (auto &arg : overloaded_args) {
-    overloaded_types.push_back(py::reinterpret_borrow<py::object>(get_type_of_overloaded_arg(arg.ptr())));
+    overloaded_types.push_back(py::reinterpret_borrow<py::object>((PyObject *) Py_TYPE(arg.ptr())));
   }
   py::tuple py_types = py::cast(overloaded_types);
   py::object ret;
@@ -247,7 +231,7 @@ auto handle_torch_function_no_python_arg_parser(const std::vector<py::handle> &o
     ss << "no implementation found for '" << module_name << "." << func_name
        << "' on types that implement " << torch_function_name << ": [";
     for (auto &arg : overloaded_args) {
-      ss << PyObject_Repr(get_type_of_overloaded_arg(arg.ptr()));
+      ss << arg.ptr()->ob_type->tp_name;
       if (!arg.is(overloaded_args.back())) {
         ss << ", ";
       }
@@ -344,11 +328,10 @@ auto handle_torch_function_indexing(PyObject* self, PyObject* index, PyObject* v
  *
  */
 
-static void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyObject* obj, bool obj_is_type) {
+void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyObject* obj) {
   bool class_not_seen_yet = true;
-  PyObject* obj_type = obj_is_type ? obj : (PyObject*)Py_TYPE(obj);
   for (auto &arg : *overloaded_args) {
-    if (obj_type == get_type_of_overloaded_arg(arg.ptr())) {
+    if (Py_TYPE(obj) == Py_TYPE(arg.ptr())) {
       // obj is the same type as another parameter we've seen in a prior
       // iteration of the loop over parameters so we already have an entry
       // with the proper __torch_function__ implementation to call, so skip
@@ -360,7 +343,7 @@ static void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyOb
   if (class_not_seen_yet) {
     int arg_index = overloaded_args->size();
     for(const auto j : c10::irange(arg_index)) {
-      if (PyObject_IsSubclass(obj_type, (PyObject*)(get_type_of_overloaded_arg((*overloaded_args)[j].ptr())))) {
+      if (PyObject_IsInstance(obj, (PyObject*)(Py_TYPE((*overloaded_args)[j].ptr())))) {
         // obj is a subclass of another object we've seen already so its
         // __torch_function__ should be called first, therefore we
         // insert it into overloaded_args before the superclass
@@ -375,14 +358,6 @@ static void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyOb
   }
 }
 
-void append_overloaded_tensor(std::vector<py::handle>* overloaded_args, PyObject* obj) {
-  append_overloaded_arg(overloaded_args, obj, /*obj_is_type*/false);
-}
-
-void append_overloaded_type(std::vector<py::handle>* overloaded_args, PyObject* obj) {
-  append_overloaded_arg(overloaded_args, obj, /*obj_is_type*/true);
-}
-
 bool is_tensor_and_append_overloaded(PyObject* obj, std::vector<py::handle>* overloaded_args) {
   if (THPVariable_CheckExact(obj)) {
     // torch.Tensor instances (not subclasses, except for Parameter)
@@ -391,7 +366,7 @@ bool is_tensor_and_append_overloaded(PyObject* obj, std::vector<py::handle>* ove
 
   if (check_has_torch_function(obj)) {
     // tensor subclasses and unrelated objects with __torch_function__
-    append_overloaded_tensor(overloaded_args, obj);
+    append_overloaded_arg(overloaded_args, obj);
     return true;
   } else if (THPVariable_Check(obj)) {
     // tensor subclasses without __torch_function__
@@ -930,7 +905,7 @@ bool FunctionSignature::parse(PyObject* self, PyObject* args, PyObject* kwargs,
 
   int i = 0;
   if (self != nullptr && check_has_torch_function(self)) {
-    append_overloaded_tensor(&this->overloaded_args, self);
+    append_overloaded_arg(&this->overloaded_args, self);
   }
   for (auto& param : params) {
     PyObject* obj = nullptr;
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index 6a05807e5a314..d132185ccaefb 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -818,15 +818,6 @@ bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector<py::handle>
  * 'overloaded_args': the vector to append the overloaded args
  * 'obj': the input tensor that is overloaded
  */
-void append_overloaded_tensor(std::vector<py::handle>* overloaded_args, PyObject* obj);
-
-/* Given an argument that is definitely a type and is definitely overloaded,
- * append it to the overloaded arguments list. Use this only with __torch_dispatch__,
- * where we operate on classes that have a __torch_dispatch__ classmethod.
- *
- * 'overloaded_args': the vector to append the overloaded type
- * 'obj': the input class that has a __torch_dispatch__ classmethod.
- */
-void append_overloaded_type(std::vector<py::handle>* overloaded_args, PyObject* obj);
+void append_overloaded_arg(std::vector<py::handle>* overloaded_args, PyObject* obj);
 
 } // namespace torch
diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp
index 25e9a5962614f..17d7acc37640c 100644
--- a/torch/csrc/utils/tensor_new.cpp
+++ b/torch/csrc/utils/tensor_new.cpp
@@ -267,7 +267,6 @@ Tensor internal_new_from_data(
   {
     at::AutoDispatchBelowADInplaceOrView guard;  // TODO: remove
     at::tracer::impl::NoTracerDispatchMode tracer_guard;
-    c10::impl::ExcludeDispatchKeyGuard pythonmode_guard(c10::DispatchKey::Python);
     // functorch uses FuncTorchDynamicLayerBackMode as a mode key to wrap all
     // tensors returned from operators in special TensorWrapper tensor extension
     // The problem with this is that TensorWrapper does not have storage so
diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py
deleted file mode 100644
index a7cfae10c37c1..0000000000000
--- a/torch/utils/_python_dispatch.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import torch
-import contextlib
-from typing import Iterator
-
-# Context manager that causes all pytorch operators to dispatch to the passed-in
-# type's __torch_dispatch__ function.
-# operation that accepts no tensors but returns a tensor.
-#
-# enable_python_mode is affected by torch._C._DisableTorchDispatch.
-#
-# NB: Calling an operator inside __torch_dispatch__ does go through
-# __torch_dispatch__ again. Please use _DisableTorchDispatch inside
-# __torch_dispatch__ to prevent infinite recursion.
-#
-# TODO: Limitations and things about enable_python_mode we should fix before exposing it:
-# - it currently cannot be nested. This should be simple to implement; we need a
-#   stack of TorchDispatchTypeObjects and the next bullet point.
-# - We need a better user-facing api for torch._C._DisableTorchDispatch that
-#   is able to selectively disable __torch_dispatch__ of a particular class.
-# - It doesn't work with the tensor constructors (torch.tensor, torch.Tensor)
-# - Better name (see https://github.com/pytorch/pytorch/pull/63496#discussion_r694091694)
-@contextlib.contextmanager
-def enable_python_mode(cls) -> Iterator[None]:
-    if not hasattr(cls, '__torch_dispatch__'):
-        raise ValueError('The class passed to enable_python_mode '
-                         'must have a __torch_dispatch__ classmethod')
-    if not isinstance(cls, type) or not issubclass(cls, (torch.Tensor,)):
-        raise ValueError('The argument passed to enable_python_mode '
-                         'must be the type of a Tensor subclass')
-    torch._C._enter_python_mode(cls)
-    try:
-        yield
-    finally:
-        torch._C._exit_python_mode()

From c2da103fe69d493dfce48b7075e56643ac8fce58 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Tue, 31 Aug 2021 17:19:11 -0700
Subject: [PATCH 403/530] Discover new tests in run_tests.py (#64246)

Summary:
Introduce `discover_tests` function that globs for all Python files
starting with `test_` in test folder excluding subfolders which are
executed differently

Fixes https://github.com/pytorch/pytorch/issues/64178

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64246

Reviewed By: walterddr, seemethere

Differential Revision: D30661652

Pulled By: malfet

fbshipit-source-id: a52e78ec717b6846add267579dd8d9ae75326bf9
---
 test/run_test.py | 220 +++++++++++++++++------------------------------
 1 file changed, 81 insertions(+), 139 deletions(-)
 mode change 100755 => 100644 test/run_test.py

diff --git a/test/run_test.py b/test/run_test.py
old mode 100755
new mode 100644
index 55b2f3841d7b5..5953919b16323
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -50,145 +50,87 @@
     )
 
 
-TESTS = [
-    "test_import_time",
-    "test_public_bindings",
-    "test_type_hints",
-    "test_ao_sparsity",
-    "test_autograd",
-    "benchmark_utils/test_benchmark_utils",
-    "test_binary_ufuncs",
-    "test_buffer_protocol",
-    "test_bundled_inputs",
-    "test_complex",
-    "test_cpp_api_parity",
-    "test_cpp_extensions_aot_no_ninja",
-    "test_cpp_extensions_aot_ninja",
-    "test_cpp_extensions_jit",
-    "distributed/test_c10d_common",
-    "distributed/test_c10d_gloo",
-    "distributed/test_c10d_nccl",
-    "distributed/test_jit_c10d",
-    "distributed/test_c10d_spawn_gloo",
-    "distributed/test_c10d_spawn_nccl",
-    "distributed/test_store",
-    "distributed/test_pg_wrapper",
-    "distributed/algorithms/test_join",
-    "test_cuda",
-    "test_autocast",
-    "test_jit_cuda_fuser",
-    "test_cuda_primary_ctx",
-    "test_dataloader",
-    "test_datapipe",
-    "distributed/test_data_parallel",
-    "distributed/test_distributed_spawn",
-    "distributions/test_constraints",
-    "distributions/test_distributions",
-    "test_dispatch",
-    "test_foreach",
-    "test_indexing",
-    "test_jit",
-    "test_linalg",
-    "test_logging",
-    "test_mkldnn",
-    "test_model_dump",
-    "test_module_init",
-    "test_modules",
-    "test_multiprocessing",
-    "test_multiprocessing_spawn",
-    "distributed/test_nccl",
-    "test_native_functions",
-    "test_numba_integration",
-    "test_nn",
-    "test_ops",
-    "test_optim",
-    "test_functional_optim",
-    "test_pytree",
-    "test_mobile_optimizer",
-    "test_set_default_mobile_cpu_allocator",
-    "test_xnnpack_integration",
-    "test_vulkan",
-    "test_sparse",
-    "test_sparse_csr",
-    "test_quantization",
-    "test_pruning_op",
-    "test_spectral_ops",
-    "test_serialization",
-    "test_shape_ops",
-    "test_show_pickle",
-    "test_sort_and_select",
-    "test_tensor_creation_ops",
-    "test_testing",
-    "test_torch",
-    "test_type_info",
-    "test_unary_ufuncs",
-    "test_utils",
-    "test_view_ops",
-    "test_vmap",
-    "test_namedtuple_return_api",
-    "test_numpy_interop",
-    "test_jit_profiling",
-    "test_jit_legacy",
-    "test_jit_fuser_legacy",
-    "test_tensorboard",
-    "test_namedtensor",
-    "test_reductions",
-    "test_type_promotion",
-    "test_jit_disabled",
-    "test_function_schema",
-    "test_overrides",
-    "test_jit_fuser_te",
-    "test_tensorexpr",
-    "test_tensorexpr_pybind",
-    "test_openmp",
-    "test_profiler",
-    "distributed/test_launcher",
-    "distributed/nn/jit/test_instantiator",
-    "distributed/rpc/test_faulty_agent",
-    "distributed/rpc/test_tensorpipe_agent",
-    "distributed/rpc/cuda/test_tensorpipe_agent",
-    "test_determination",
-    "test_futures",
-    "test_fx",
-    "test_fx_experimental",
-    "test_functional_autograd_benchmark",
-    "test_package",
-    "test_license",
-    "distributed/pipeline/sync/skip/test_api",
-    "distributed/pipeline/sync/skip/test_gpipe",
-    "distributed/pipeline/sync/skip/test_inspect_skip_layout",
-    "distributed/pipeline/sync/skip/test_leak",
-    "distributed/pipeline/sync/skip/test_portal",
-    "distributed/pipeline/sync/skip/test_stash_pop",
-    "distributed/pipeline/sync/skip/test_tracker",
-    "distributed/pipeline/sync/skip/test_verify_skippables",
-    "distributed/pipeline/sync/test_balance",
-    "distributed/pipeline/sync/test_bugs",
-    "distributed/pipeline/sync/test_checkpoint",
-    "distributed/pipeline/sync/test_copy",
-    "distributed/pipeline/sync/test_deferred_batch_norm",
-    "distributed/pipeline/sync/test_dependency",
-    "distributed/pipeline/sync/test_inplace",
-    "distributed/pipeline/sync/test_microbatch",
-    "distributed/pipeline/sync/test_phony",
-    "distributed/pipeline/sync/test_pipe",
-    "distributed/pipeline/sync/test_pipeline",
-    "distributed/pipeline/sync/test_stream",
-    "distributed/pipeline/sync/test_transparency",
-    "distributed/pipeline/sync/test_worker",
-    "distributed/optim/test_zero_redundancy_optimizer",
-    "distributed/elastic/timer/api_test",
-    "distributed/elastic/timer/local_timer_example",
-    "distributed/elastic/timer/local_timer_test",
-    "distributed/elastic/events/lib_test",
-    "distributed/elastic/metrics/api_test",
-    "distributed/elastic/utils/logging_test",
-    "distributed/elastic/utils/util_test",
-    "distributed/elastic/utils/distributed_test",
-    "distributed/elastic/multiprocessing/api_test",
-    "distributed/_sharding_spec/test_sharding_spec",
-    "distributed/_sharded_tensor/test_sharded_tensor",
-]
+def discover_tests(
+        base_dir: Optional[pathlib.Path] = None,
+        blocklisted_patterns: Optional[List[str]] = None,
+        blocklisted_tests: Optional[List[str]] = None,
+        extra_tests: Optional[List[str]] = None) -> List[str]:
+    """
+    Searches for all python files starting with test_ excluding one specified by patterns
+    """
+    def skip_test_p(name: str) -> bool:
+        rc = False
+        if blocklisted_patterns is not None:
+            rc |= any(name.startswith(pattern) for pattern in blocklisted_patterns)
+        if blocklisted_tests is not None:
+            rc |= name in blocklisted_tests
+        return rc
+    cwd = pathlib.Path(__file__).resolve().parent if base_dir is None else base_dir
+    all_py_files = list(cwd.glob('**/test_*.py'))
+    rc = [str(fname.relative_to(cwd))[:-3] for fname in all_py_files]
+    # Invert slashes on Windows
+    if sys.platform == "win32":
+        rc = [name.replace('\\', '/') for name in rc]
+    rc = [test for test in rc if not skip_test_p(test)]
+    if extra_tests is not None:
+        rc += extra_tests
+    return sorted(rc)
+
+
+TESTS = discover_tests(
+    blocklisted_patterns=[
+        'ao',
+        'bottleneck_test',
+        'custom_backend',
+        'custom_operator',
+        'fx',        # executed by test_fx.py
+        'jit',      # executed by test_jit.py
+        'mobile',
+        'onnx',
+        'package',  # executed by test_package.py
+        'quantization',  # executed by test_quantization.py
+    ],
+    blocklisted_tests=[
+        'test_bundled_images',
+        'test_cpp_extensions_aot',
+        'test_gen_backend_stubs',
+        'test_jit_fuser',
+        'test_jit_simple',
+        'test_jit_string',
+        'test_kernel_launch_checks',
+        'test_metal',
+        'test_nnapi',
+        'test_python_dispatch',
+        'test_segment_reductions',
+        'test_static_runtime',
+        'test_throughput_benchmark',
+        'test_typing',
+        "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks",
+        "distributed/algorithms/quantization/test_quantization",
+        "distributed/bin/test_script",
+        "distributed/elastic/multiprocessing/bin/test_script",
+        "distributed/launcher/bin/test_script",
+        "distributed/launcher/bin/test_script_init_method",
+        "distributed/launcher/bin/test_script_is_torchelastic_launched",
+        "distributed/launcher/bin/test_script_local_rank",
+        "distributed/test_c10d_spawn",
+        'distributions/test_transforms',
+        'distributions/test_utils',
+    ],
+    extra_tests=[
+        "test_cpp_extensions_aot_ninja",
+        "test_cpp_extensions_aot_no_ninja",
+        "distributed/elastic/timer/api_test",
+        "distributed/elastic/timer/local_timer_example",
+        "distributed/elastic/timer/local_timer_test",
+        "distributed/elastic/events/lib_test",
+        "distributed/elastic/metrics/api_test",
+        "distributed/elastic/utils/logging_test",
+        "distributed/elastic/utils/util_test",
+        "distributed/elastic/utils/distributed_test",
+        "distributed/elastic/multiprocessing/api_test",
+    ]
+)
 
 # Tests need to be run with pytest.
 USE_PYTEST_LIST = [

From 44fcb00a569231be09419a97e6933152d64d92b7 Mon Sep 17 00:00:00 2001
From: Jay Leverett <jayleverett@fb.com>
Date: Tue, 31 Aug 2021 17:28:42 -0700
Subject: [PATCH 404/530] Fix redundant class definition in GraphModule
 singleton constructor (#64274)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/63883

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64274

Reviewed By: jamesr66a

Differential Revision: D30675970

Pulled By: jayleverett

fbshipit-source-id: e74ef2a28013f0fa7c58d14f38e66cfe48d26b74
---
 test/test_fx.py          | 13 +++++++++++++
 torch/fx/graph_module.py |  8 ++++++++
 2 files changed, 21 insertions(+)

diff --git a/test/test_fx.py b/test/test_fx.py
index eadcf6cc0b2f4..f4e4ab203a7bc 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -188,6 +188,19 @@ def forward(self, A, b=4, *args, c=5, **kwargs):
         t = T()
         symbolic_trace(t)
 
+        # test for issue described at https://github.com/pytorch/pytorch/issues/63883
+        class M3(torch.nn.Module):
+            def forward(self, x):
+                return torch.relu(x)
+
+        m3 = M3()
+        gm3 = symbolic_trace(m3)
+        new_instance = gm3.__new__(type(gm3))
+        new_instance.__init__(gm3, gm3.graph)
+
+        x = torch.randn(5, 3)
+        torch.testing.assert_allclose(new_instance(x), torch.relu(x))
+
     def test_custom_import(self):
         graph = torch.fx.Graph()
         a = graph.placeholder('x')
diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
index e7750db9353bd..89685bf3953a0 100644
--- a/torch/fx/graph_module.py
+++ b/torch/fx/graph_module.py
@@ -240,6 +240,14 @@ def __new__(cls: 'Type[GraphModule]', *args, **kwargs):
         # it is a subclass of the user-defined class, the only difference
         # is an extra layer to install the forward method
 
+        # address issue described at https://github.com/pytorch/pytorch/issues/63883
+        # in other words, traverse class hierarchy to fix the redundant class definition problem
+        for t in cls.__mro__:
+            c = t.__qualname__.split('.')[-1]
+            if c != 'GraphModuleImpl':
+                cls = t
+                break
+
         class GraphModuleImpl(cls):  # type: ignore[misc, valid-type]
             pass
         return super().__new__(GraphModuleImpl)

From 9e25634833735f26f7090d0e5341d6ad38b7eebb Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Tue, 31 Aug 2021 17:32:00 -0700
Subject: [PATCH 405/530] [TensorExpr] Move declaration of buildErrorMessage to
 exception.h (#64301)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64301

Test Plan: Imported from OSS

Reviewed By: navahgar, huiguoo

Differential Revision: D30678215

Pulled By: ZolotukhinM

fbshipit-source-id: 599c83b3890450a0fb6526815f037eec9563661c
---
 torch/csrc/jit/tensorexpr/exceptions.h | 2 ++
 torch/csrc/jit/tensorexpr/kernel.h     | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/tensorexpr/exceptions.h b/torch/csrc/jit/tensorexpr/exceptions.h
index cf23bbc2289c4..7194dfe166aa8 100644
--- a/torch/csrc/jit/tensorexpr/exceptions.h
+++ b/torch/csrc/jit/tensorexpr/exceptions.h
@@ -84,6 +84,8 @@ class malformed_ir : public std::runtime_error {
             "MALFORMED IR: " + err + " - " + std::to_string(stmt)) {}
 };
 
+TORCH_API std::string buildErrorMessage(const std::string& s);
+
 } // namespace tensorexpr
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h
index bdb9802ccdc3a..4b92b020fce31 100644
--- a/torch/csrc/jit/tensorexpr/kernel.h
+++ b/torch/csrc/jit/tensorexpr/kernel.h
@@ -300,8 +300,6 @@ TORCH_API void annotateInputShapes(
 TORCH_API std::shared_ptr<Graph> removeUnusedSelfArgument(
     const std::shared_ptr<Graph>& graph);
 
-TORCH_API std::string buildErrorMessage(const std::string& s);
-
 } // namespace tensorexpr
 } // namespace jit
 } // namespace torch

From 5ecb966e0ff383d65531c8f6de23e704b9cafc54 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Tue, 31 Aug 2021 17:33:11 -0700
Subject: [PATCH 406/530] Add ciflow-tracking issue to pytorch-probot (#64125)

Summary:
Doesn't do anything yet...

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64125

Reviewed By: zhouzhuojie

Differential Revision: D30620283

Pulled By: malfet

fbshipit-source-id: 91869d35c1b70a55e32261d2c32fb0136ec33960
---
 .github/pytorch-probot.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
index c1e1218b76091..627b2648ad426 100644
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@@ -1 +1,2 @@
 tracking_issue: 24422
+ciflow_tracking_issue: 64124

From 23da90ab84d4fa6006e8f65e6d795d2016d37f3b Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Tue, 31 Aug 2021 17:38:42 -0700
Subject: [PATCH 407/530] .github: Consolidate linux setup / teardown (#64229)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64229

Consolidates linux setup / teardown into easy to use jinja2 macros

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra

Test Plan: Imported from OSS

Reviewed By: zhouzhuojie, driazati

Differential Revision: D30683810

Pulled By: seemethere

fbshipit-source-id: 2578630df3e212fb79392a699090553baef44cc2
---
 .github/scripts/display_ec2_information.sh    |  14 --
 .github/templates/bazel_ci_workflow.yml.j2    |  23 +--
 .github/templates/common.yml.j2               |  67 +++++++-
 .github/templates/linux_ci_workflow.yml.j2    | 128 ++------------
 ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml |  73 ++++++--
 ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml |  73 ++++++--
 ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 116 ++++++++++---
 ...rated-linux-bionic-py3.8-gcc9-coverage.yml | 116 ++++++++++---
 ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml | 116 ++++++++++---
 ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml | 116 ++++++++++---
 .../generated-linux-xenial-py3.6-gcc5.4.yml   | 157 ++++++++++++++----
 ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml |  75 +++++++--
 ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml |  73 ++++++--
 ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 116 ++++++++++---
 ...rated-periodic-win-vs2019-cuda11.1-py3.yml |  22 ++-
 .../generated-win-vs2019-cpu-py3.yml          |  22 ++-
 .../generated-win-vs2019-cuda10.1-py3.yml     |  22 ++-
 .../generated-win-vs2019-cuda11.3-py3.yml     |  22 ++-
 18 files changed, 991 insertions(+), 360 deletions(-)
 delete mode 100755 .github/scripts/display_ec2_information.sh

diff --git a/.github/scripts/display_ec2_information.sh b/.github/scripts/display_ec2_information.sh
deleted file mode 100755
index be47418966025..0000000000000
--- a/.github/scripts/display_ec2_information.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-function get_ec2_metadata() {
-    # Pulled from instance metadata endpoint for EC2
-    # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-    category=$1
-    curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-}
-
-echo "ami-id: $(get_ec2_metadata ami-id)"
-echo "instance-id: $(get_ec2_metadata instance-id)"
-echo "instance-type: $(get_ec2_metadata instance-type)"
diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2
index e9907ed679e3f..7f9d5230e0d9f 100644
--- a/.github/templates/bazel_ci_workflow.yml.j2
+++ b/.github/templates/bazel_ci_workflow.yml.j2
@@ -31,21 +31,8 @@ on:
       NUM_TEST_SHARDS: !{{ num_test_shards }}
       CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Checkout PyTorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-        with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
-          submodules: recursive
-      !{{ common.display_ec2_information() }}
+      !{{ common.setup_ec2_linux() }}
+      !{{ common.checkout_pytorch("recursive") }}
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -158,9 +145,5 @@ on:
           path:
             test-reports-*.zip
       !{{ common.upload_test_statistics(build_environment) }}
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Prune all of the docker images
-          docker system prune -af
+      !{{ common.teardown_ec2_linux() }}
 {%- endblock %}
diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2
index bf72898d04c25..f9296e017a1cf 100644
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@@ -2,7 +2,16 @@
       - name: Display EC2 information
         shell: bash
         run: |
-          .github/scripts/display_ec2_information.sh
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
 {%- endmacro -%}
 
 {%- macro parse_ref() -%}
@@ -29,3 +38,59 @@
           python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
 {%- endmacro -%}
+
+{%- macro setup_ec2_linux() -%}
+      !{{ display_ec2_information() }}
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+{%- endmacro -%}
+
+{%- macro teardown_ec2_linux() -%}
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+{%- endmacro -%}
+
+{%- macro checkout_pytorch(submodules) -%}
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: !{{ submodules }}
+{%- endmacro -%}
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index d5de86b1bbcfd..49b6d7dd68c77 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -72,25 +72,8 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-          rm -f ~/.ssh/authorized_keys
-      - name: Checkout PyTorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-        with:
-          # deep clone, to allow use of git merge-base
-          fetch-depth: 0
-      !{{ common.display_ec2_information() }}
+      !{{ common.setup_ec2_linux() }}
+      !{{ common.checkout_pytorch("false") }}
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -145,35 +128,11 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: !{{ build_environment }}-build
     steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-          rm -f ~/.ssh/authorized_keys
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Checkout PyTorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-        with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
-          submodules: recursive
-      !{{ common.display_ec2_information() }}
+      !{{ common.setup_ec2_linux() }}
+      !{{ common.checkout_pytorch("recursive") }}
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Build PyTorch
         run: |
           docker run \
@@ -228,6 +187,7 @@ jobs:
           path:
             artifacts.zip
       {%- endif %}
+      !{{ common.teardown_ec2_linux() }}
       - name: Hold runner for 2 hours or until ssh sessions have drained
         # Always hold for active ssh sessions
         if: always()
@@ -290,29 +250,8 @@ jobs:
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
       CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-          rm -f ~/.ssh/authorized_keys
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Checkout PyTorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-        with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
-          submodules: recursive
-      !{{ common.display_ec2_information() }}
+      !{{ common.setup_ec2_linux() }}
+      !{{ common.checkout_pytorch("recursive") }}
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -343,9 +282,6 @@ jobs:
       - name: Output disk space left
         run: |
           sudo df -H
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
           PR_NUMBER: ${{ github.event.pull_request.number }}
@@ -428,17 +364,7 @@ jobs:
             test-reports-*.zip
       !{{ common.parse_ref() }}
       !{{ common.upload_test_statistics(build_environment) }}
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-          # Prune all of the docker images
-          docker system prune -af
+      !{{ common.teardown_ec2_linux() }}
 {% endblock %}
 {%- endif -%}
 {%- if enable_doc_jobs %}
@@ -452,35 +378,11 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       DOCS_TYPE: ${{ matrix.docs_type }}
     steps:
-      - name: Log in to ECR
-        run: |
-          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
-          bash /tmp/ecr-login.sh
-          rm /tmp/ecr-login.sh
-      - name: Chown workspace
-        run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - name: Clean workspace
-        run: |
-          rm -rf "${GITHUB_WORKSPACE:?}/*"
-          rm -f ~/.ssh/authorized_keys
-      - name: "[FB EMPLOYEES] Enables SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Checkout PyTorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-        with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
-          submodules: recursive
-      !{{ common.display_ec2_information() }}
+      !{{ common.setup_ec2_linux() }}
+      !{{ common.checkout_pytorch("recursive") }}
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
         name: Download PyTorch Build Artifacts
         with:
@@ -542,13 +444,5 @@ jobs:
           name: docs_${{ matrix.docs_type }}
           path: docs_${{ matrix.docs_type }}.zip
           if-no-files-found: error
-      - name: Hold runner for 2 hours or until ssh sessions have drained
-        # Always hold for active ssh sessions
-        if: always()
-        run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
-        if: always()
-        run: |
-          # Prune all of the docker images
-          docker system prune -af
+      !{{ common.teardown_ec2_linux() }}
 {%- endif -%}
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 27b21bc6b523b..e621bee2ad666 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -46,12 +46,27 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -59,15 +74,19 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
+          submodules: false
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -122,12 +141,27 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: libtorch-linux-xenial-cuda10.2-py3.6-gcc7-build
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -139,21 +173,18 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Build PyTorch
         run: |
           docker run \
@@ -201,6 +232,24 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
       - name: Clean up docker images
         if: always()
         run: |
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
index b90a497441802..9daf916ae2642 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -46,12 +46,27 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -59,15 +74,19 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
+          submodules: false
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -122,12 +141,27 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-build
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -139,21 +173,18 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Build PyTorch
         run: |
           docker run \
@@ -201,6 +232,24 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
       - name: Clean up docker images
         if: always()
         run: |
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 274e68e38bdb4..4821c1e306715 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -46,12 +46,27 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -59,15 +74,19 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
+          submodules: false
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -122,12 +141,27 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-build
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -139,21 +173,18 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Build PyTorch
         run: |
           docker run \
@@ -212,6 +243,24 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
       - name: Clean up docker images
         if: always()
         run: |
@@ -266,15 +315,30 @@ jobs:
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
       CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Clean workspace
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
@@ -283,15 +347,15 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -322,9 +386,6 @@ jobs:
       - name: Output disk space left
         run: |
           sudo df -H
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
           PR_NUMBER: ${{ github.event.pull_request.number }}
@@ -422,10 +483,17 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
+      - name: Kill containers, clean up images
         if: always()
         run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index e03a019fbe7b9..79edf0d741950 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -46,12 +46,27 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -59,15 +74,19 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
+          submodules: false
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -122,12 +141,27 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: linux-bionic-py3.8-gcc9-coverage-build
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -139,21 +173,18 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Build PyTorch
         run: |
           docker run \
@@ -212,6 +243,24 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
       - name: Clean up docker images
         if: always()
         run: |
@@ -266,15 +315,30 @@ jobs:
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
       CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Clean workspace
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
@@ -283,15 +347,15 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -322,9 +386,6 @@ jobs:
       - name: Output disk space left
         run: |
           sudo df -H
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
           PR_NUMBER: ${{ github.event.pull_request.number }}
@@ -426,10 +487,17 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
+      - name: Kill containers, clean up images
         if: always()
         run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 6aea843037eae..316da3604fc91 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -46,12 +46,27 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -59,15 +74,19 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
+          submodules: false
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -122,12 +141,27 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-build
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -139,21 +173,18 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Build PyTorch
         run: |
           docker run \
@@ -212,6 +243,24 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
       - name: Clean up docker images
         if: always()
         run: |
@@ -266,15 +315,30 @@ jobs:
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
       CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Clean workspace
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
@@ -283,15 +347,15 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -322,9 +386,6 @@ jobs:
       - name: Output disk space left
         run: |
           sudo df -H
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
           PR_NUMBER: ${{ github.event.pull_request.number }}
@@ -422,10 +483,17 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
+      - name: Kill containers, clean up images
         if: always()
         run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 1fdae9d1a0320..6c9e67d380c29 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -46,12 +46,27 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -59,15 +74,19 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
+          submodules: false
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -122,12 +141,27 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-build
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -139,21 +173,18 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Build PyTorch
         run: |
           docker run \
@@ -212,6 +243,24 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
       - name: Clean up docker images
         if: always()
         run: |
@@ -266,15 +315,30 @@ jobs:
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
       CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Clean workspace
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
@@ -283,15 +347,15 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -322,9 +386,6 @@ jobs:
       - name: Output disk space left
         run: |
           sudo df -H
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
           PR_NUMBER: ${{ github.event.pull_request.number }}
@@ -422,10 +483,17 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
+      - name: Kill containers, clean up images
         if: always()
         run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index e932b488d0c71..2337b4f5bf429 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -46,12 +46,27 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -59,15 +74,19 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
+          submodules: false
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -122,12 +141,27 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: linux-xenial-py3.6-gcc5.4-build
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -139,21 +173,18 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Build PyTorch
         run: |
           docker run \
@@ -212,6 +243,24 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
       - name: Clean up docker images
         if: always()
         run: |
@@ -266,15 +315,30 @@ jobs:
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
       CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Clean workspace
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
@@ -283,15 +347,15 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -322,9 +386,6 @@ jobs:
       - name: Output disk space left
         run: |
           sudo df -H
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
           PR_NUMBER: ${{ github.event.pull_request.number }}
@@ -422,13 +483,20 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
+      - name: Kill containers, clean up images
         if: always()
         run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
 
   pytorch_doc_build:
     runs-on: linux.2xlarge
@@ -440,12 +508,27 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       DOCS_TYPE: ${{ matrix.docs_type }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -453,25 +536,22 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
-      - name: "[FB EMPLOYEES] Enables SSH (Click me for login details)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
         name: Download PyTorch Build Artifacts
         with:
@@ -537,8 +617,17 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
+      - name: Kill containers, clean up images
         if: always()
         run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 89deda0704df1..17dc3a6742d73 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -46,12 +46,27 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -59,15 +74,19 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
+          submodules: false
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -125,24 +144,47 @@ jobs:
       NUM_TEST_SHARDS: 1
       CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -272,8 +314,21 @@ jobs:
           python3 -m pip install -r requirements.txt
           python3 -m pip install boto3==1.16.34
           python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
-      - name: Clean up docker images
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
         if: always()
         run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 63b462a19cf25..2f5cab7538601 100644
--- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -44,12 +44,27 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -57,15 +72,19 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
+          submodules: false
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -120,12 +139,27 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-build
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -137,21 +171,18 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Build PyTorch
         run: |
           docker run \
@@ -199,6 +230,24 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
       - name: Clean up docker images
         if: always()
         run: |
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 768146ee8cab2..8c81ab1717221 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -44,12 +44,27 @@ jobs:
     outputs:
       docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -57,15 +72,19 @@ jobs:
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
           rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
           # deep clone, to allow use of git merge-base
           fetch-depth: 0
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
+          submodules: false
       - name: Calculate docker image tag
         id: calculate-tag
         run: |
@@ -120,12 +139,27 @@ jobs:
       DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
       JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.6-gcc7-build
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
@@ -137,21 +171,18 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Build PyTorch
         run: |
           docker run \
@@ -210,6 +241,24 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
       - name: Clean up docker images
         if: always()
         run: |
@@ -264,15 +313,30 @@ jobs:
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
       CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
     steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Log in to ECR
         run: |
           aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
           bash /tmp/ecr-login.sh
           rm /tmp/ecr-login.sh
       - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
         run: |
           # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)/../":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
       - name: Clean workspace
         run: |
           rm -rf "${GITHUB_WORKSPACE:?}/*"
@@ -281,15 +345,15 @@ jobs:
         uses: seemethere/add-github-ssh-key@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
-          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
           submodules: recursive
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          .github/scripts/display_ec2_information.sh
       - name: Pull docker image
         run: |
           docker pull "${DOCKER_IMAGE}"
@@ -320,9 +384,6 @@ jobs:
       - name: Output disk space left
         run: |
           sudo df -H
-      - name: Preserve github env variables for use in docker
-        run: |
-          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
       - name: Test PyTorch
         env:
           PR_NUMBER: ${{ github.event.pull_request.number }}
@@ -420,10 +481,17 @@ jobs:
         # Always hold for active ssh sessions
         if: always()
         run: .github/scripts/wait_for_ssh_to_drain.sh
-      - name: Clean up docker images
+      - name: Kill containers, clean up images
         if: always()
         run: |
-          # Ensure the working directory gets chowned back to the current user
-          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
index aaf2d26e05af5..306e93aca7990 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
@@ -65,7 +65,16 @@ jobs:
       - name: Display EC2 information
         shell: bash
         run: |
-          .github/scripts/display_ec2_information.sh
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -178,7 +187,16 @@ jobs:
       - name: Display EC2 information
         shell: bash
         run: |
-          .github/scripts/display_ec2_information.sh
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
         uses: seemethere/add-github-ssh-key@v1
         with:
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index 08656c9dd99d8..f79cad7b04c00 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -65,7 +65,16 @@ jobs:
       - name: Display EC2 information
         shell: bash
         run: |
-          .github/scripts/display_ec2_information.sh
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -170,7 +179,16 @@ jobs:
       - name: Display EC2 information
         shell: bash
         run: |
-          .github/scripts/display_ec2_information.sh
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
         uses: seemethere/add-github-ssh-key@v1
         with:
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index 2a8570d1e84b1..35c6cede0eefe 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -67,7 +67,16 @@ jobs:
       - name: Display EC2 information
         shell: bash
         run: |
-          .github/scripts/display_ec2_information.sh
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -180,7 +189,16 @@ jobs:
       - name: Display EC2 information
         shell: bash
         run: |
-          .github/scripts/display_ec2_information.sh
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
         uses: seemethere/add-github-ssh-key@v1
         with:
diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index 7235db7f3c9f5..4bfc5654186f7 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -67,7 +67,16 @@ jobs:
       - name: Display EC2 information
         shell: bash
         run: |
-          .github/scripts/display_ec2_information.sh
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: Install Visual Studio 2019 toolchain
         shell: powershell
         run: |
@@ -180,7 +189,16 @@ jobs:
       - name: Display EC2 information
         shell: bash
         run: |
-          .github/scripts/display_ec2_information.sh
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
         uses: seemethere/add-github-ssh-key@v1
         with:

From 0b9cdeb2956d3dd281a04d30e8cee55bf2101ba9 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Tue, 31 Aug 2021 17:38:42 -0700
Subject: [PATCH 408/530] .circleci: Remove already migrated CUDA configs
 (#64231)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64231

This migrates over the CUDA 11.1 and CUDA 10.2 configs that we had
previously migrated to GHA

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra

Test Plan: Imported from OSS

Reviewed By: zhouzhuojie

Differential Revision: D30683811

Pulled By: seemethere

fbshipit-source-id: 71b0761461557d871c26eb02f665a2e4d9b1d9fb
---
 .circleci/cimodel/data/pytorch_build_data.py | 18 ------
 .circleci/config.yml                         | 58 --------------------
 2 files changed, 76 deletions(-)

diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py
index 5a85674d74fe9..df0cfa0027554 100644
--- a/.circleci/cimodel/data/pytorch_build_data.py
+++ b/.circleci/cimodel/data/pytorch_build_data.py
@@ -47,17 +47,6 @@
                     # ]),
                 ]),
             ]),
-            ("11.1", [
-                ("3.8", [
-                    ("shard_test", [XImportant(True)]),
-                    # UNCOMMENT THE BELOW TO REENABLE LIBTORCH
-                    # ("libtorch", [
-                    #     (True, [
-                    #         ('build_only', [X(True)]),
-                    #     ]),
-                    # ]),
-                ]),
-            ]),
         ]),
     ]),
     ("bionic", [
@@ -74,13 +63,6 @@
                 ]),
             ]),
         ]),
-        ("cuda", [
-            ("10.2", [
-                ("3.9", [
-                    ("shard_test", [XImportant(True)]),
-                ]),
-            ]),
-        ]),
         ("rocm", [
             ("3.9", [
                 ("3.6", [
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 324e5fdd9b3cb..9989f1a289b7d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7407,35 +7407,6 @@ workflows:
           build_environment: "pytorch-linux-pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_distributed-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
           resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build
-          requires:
-            - "docker-pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7"
-          build_environment: "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7"
-      - pytorch_linux_test:
-          name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test1
-          requires:
-            - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build
-          build_environment: "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-test1"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7"
-          use_cuda_docker_runtime: "1"
-          resource_class: gpu.medium
-      - pytorch_linux_test:
-          name: pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test2
-          requires:
-            - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build
-          build_environment: "pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-test2"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7"
-          use_cuda_docker_runtime: "1"
-          resource_class: gpu.medium
-      - pytorch_linux_test:
-          name: pytorch_linux_pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_distributed_test
-          requires:
-            - pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_build
-          build_environment: "pytorch-linux-pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_distributed-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7"
-          resource_class: large
       - pytorch_linux_build:
           name: pytorch_linux_bionic_py3_6_clang9_noarch_build
           requires:
@@ -7475,35 +7446,6 @@ workflows:
           build_environment: "pytorch-vulkan-linux-bionic-py3.6-clang9-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9"
           resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build
-          requires:
-            - "docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7"
-          build_environment: "pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7"
-      - pytorch_linux_test:
-          name: pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_test1
-          requires:
-            - pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build
-          build_environment: "pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7-test1"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7"
-          use_cuda_docker_runtime: "1"
-          resource_class: gpu.medium
-      - pytorch_linux_test:
-          name: pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_test2
-          requires:
-            - pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build
-          build_environment: "pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7-test2"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7"
-          use_cuda_docker_runtime: "1"
-          resource_class: gpu.medium
-      - pytorch_linux_test:
-          name: pytorch_linux_pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_distributed_test
-          requires:
-            - pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_build
-          build_environment: "pytorch-linux-pytorch_linux_bionic_cuda10_2_cudnn7_py3_9_gcc7_distributed-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7"
-          resource_class: large
       - pytorch_linux_build:
           name: pytorch_linux_bionic_rocm3_9_py3_6_build
           requires:

From 09e610e36d0106410e37e129fd0cd5749c74ad5f Mon Sep 17 00:00:00 2001
From: Ray Peng <ruipeng@fb.com>
Date: Tue, 31 Aug 2021 17:45:50 -0700
Subject: [PATCH 409/530] [Static Runtime] Out version for softmax (#64243)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64243

Test Plan:
```
> buck run //caffe2/benchmarks/static_runtime:static_runtime_cpptest -- --v=1
...
V0830 16:35:22.524479 613839 impl.cpp:1410] Switch to out variant for node: %5 : Tensor = aten::softmax(%a.1, %dim.1, %dtype.1)
...
[       OK ] StaticRuntime.IndividualOps_Softmax (803 ms)
```

Reviewed By: hlu1

Differential Revision: D30656149

fbshipit-source-id: 115b7b4a75448fd6a5c526808080ca9a4251302c
---
 benchmarks/static_runtime/test_scripts.h      | 10 ++++++++
 .../static_runtime/test_static_runtime.cc     | 16 +++++++++++++
 torch/csrc/jit/runtime/static/ops.cpp         | 24 +++++++++++++++++++
 3 files changed, 50 insertions(+)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 37bb222f6a3d4..99b73db79f3d1 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -676,6 +676,16 @@ const auto argmin_with_keep_dim_script = R"JIT(
       return torch.argmin(a, dim, True).clone()
 )JIT";
 
+const auto softmax_script = R"JIT(
+  def forward(self, a: Tensor, dim: int):
+      return torch.softmax(a, dim).clone()
+)JIT";
+
+const auto softmax_script_with_dtype = R"JIT(
+  def forward(self, a: Tensor, dim: int, dtype: int):
+      return torch.softmax(a, dim, dtype=dtype).clone()
+)JIT";
+
 const auto getitem_dict_tensor_script = R"JIT(
   def forward(self, key: Tensor):
       d = {key: 1}
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 8e498dbbc664e..16941dab84760 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1,3 +1,4 @@
+#include <c10/core/ScalarType.h>
 #include <gtest/gtest.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/ir/irparser.h>
@@ -1083,6 +1084,21 @@ TEST(StaticRuntime, IndividualOps_Argmin) {
   testStaticRuntime(argmin_with_keep_dim_script, args_a, args_b);
 }
 
+TEST(StaticRuntime, IndividualOps_Softmax) {
+  auto a = at::randn({2, 3});
+  auto b = at::randn({3, 3, 3});
+
+  testStaticRuntime(softmax_script, {a, 0});
+  testStaticRuntime(softmax_script, {a, 1});
+
+  testStaticRuntime(softmax_script, {b, 0});
+  testStaticRuntime(softmax_script, {b, 1});
+  testStaticRuntime(softmax_script, {b, 2});
+
+  testStaticRuntime(softmax_script_with_dtype, {a, 1, at::ScalarType::Float});
+  testStaticRuntime(softmax_script_with_dtype, {b, 1, at::ScalarType::Float});
+}
+
 TEST(StaticRuntime, IndividualOps_GetItem_Dict) {
   int int_key = 0;
   std::string str_key = "str";
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 0cc38b0812f11..7ede15c524296 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -15,6 +15,7 @@
 #include <ATen/native/quantized/cpu/fbgemm_utils.h>
 #include <ATen/native/quantized/cpu/qembeddingbag.h>
 #include <ATen/native/quantized/cpu/qembeddingbag_prepack.h>
+#include <c10/core/ScalarType.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/runtime/static/impl.h>
@@ -1338,6 +1339,29 @@ REGISTER_OPERATOR_FUNCTOR(aten::argmin, aten_argmin, [](Node* n) -> SROperator {
   };
 });
 
+REGISTER_OPERATOR_FUNCTOR(aten::softmax, aten_softmax, [](Node* n) -> SROperator {
+  if (!n->matches(torch::schema(
+          "aten::softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor"))) {
+    LogAndDumpSchema(n);
+    return nullptr;
+  }
+  return [](ProcessedNode* p_node) {
+    const auto& in_t = p_node->Input(0).toTensor();
+    const auto& dim = p_node->Input(1).toInt();
+    const auto& dtype = p_node->Input(2).toOptional<c10::ScalarType>();
+    if (p_node->Output(0).isNone()) {
+      p_node->Output(0) = at::native::softmax(in_t, dim, dtype);
+    } else {
+      auto& out_t = p_node->Output(0).toTensor();
+      fastResizeToZero(out_t);
+
+      auto half_to_float = in_t.scalar_type() == at::ScalarType::Half &&
+          dtype == at::ScalarType::Float;
+      at::cpu::_softmax_out(out_t, in_t, dim, half_to_float);
+    }
+  };
+});
+
 REGISTER_OPERATOR_FUNCTOR(
     static_runtime::layer_norm,
     aten_layer_norm,

From 0733582087b08dc9b3136768a1e4a704fda5b5bb Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Tue, 31 Aug 2021 17:51:55 -0700
Subject: [PATCH 410/530] Use the correct overloaded name to skip boxed
 autograd not implemented kernel registration (#64182)

Summary:
Some internal use_count tests are failing for `dequantize_self` because we only compare the skip list with the base name `dequantize` when we should be comparing with the full name including the overload

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64182

Reviewed By: albanD

Differential Revision: D30639909

Pulled By: soulitzer

fbshipit-source-id: d4d22dd1a5c8f7180251ce7739830764cce6f151
---
 tools/autograd/gen_variable_type.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py
index d0a9048df47f3..8591a6800605c 100644
--- a/tools/autograd/gen_variable_type.py
+++ b/tools/autograd/gen_variable_type.py
@@ -411,9 +411,9 @@ def gen_variable_type_func(
         if fn.info is None and not get_base_name(f) in RESET_GRAD_ACCUMULATOR \
                 and not get_base_name(f) in DONT_REQUIRE_DERIVATIVE \
                 and len(gen_differentiable_outputs(fn)) > 0 \
-                and not get_base_name(f) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE \
-                and not get_base_name(f) in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT \
-                and not get_base_name(f) in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT:
+                and not cpp.name(f.func) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE \
+                and not type_wrapper_name(f) in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT \
+                and not type_wrapper_name(f) in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT:
             # NOTE: [ Registering AutogradNotImplemented boxed kernel ]
             #
             # When there is no derivatives.yaml entry, we register a generic boxed

From b23e4f6086f3174ae66243d77c032aef876c0246 Mon Sep 17 00:00:00 2001
From: Edward Yang <ezyang@fb.com>
Date: Tue, 31 Aug 2021 17:55:23 -0700
Subject: [PATCH 411/530] Convert mul to use opmath_gpu_kernel_with_scalars
 (#64019)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64019

Note that previously the functor operated on scalar_t and
this modifies it to operate on opmath_t, but this is not
a problem as half precision was implemented by performing the
compute in float anyway.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D30575282

Pulled By: ezyang

fbshipit-source-id: cc6900ef996e755740afe48f9cb4d0366858dd47
---
 .../ATen/native/cuda/BinaryMulDivKernel.cu    | 45 +++++--------------
 1 file changed, 10 insertions(+), 35 deletions(-)

diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
index da615fe12221b..e6a5300780e57 100644
--- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
+++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
@@ -14,16 +14,6 @@
 
 namespace at { namespace native {
 
-template<typename scalar_t, typename accscalar_t>
-struct MulScalarFunctor {
-    MulScalarFunctor(accscalar_t b_): b(b_) {}
-    __device__ scalar_t operator() (scalar_t a) const {
-      return a * b;
-    }
-  private:
-    accscalar_t b;
-};
-
 template<typename scalar_t>
 struct DivFunctor {
   __device__ scalar_t operator() (scalar_t a, scalar_t b) const {
@@ -31,9 +21,9 @@ struct DivFunctor {
   }
 };
 
-template<typename scalar_t>
+template<typename T>
 struct MulFunctor {
-  __device__ scalar_t operator() (scalar_t a, scalar_t b) const {
+  __device__ T operator() (T a, T b) const {
     return a * b;
   }
 };
@@ -53,11 +43,11 @@ void div_true_kernel_cuda(TensorIteratorBase& iter) {
     // scalar, compute a * reciprocal(b). Note that this may lose one bit of
     // precision compared to computing the division.
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_true_cuda", [&]() {
-      using accscalar_t = at::acc_type<scalar_t, true>;
-      auto inv_b = accscalar_t(1.0) / iter.scalar_value<accscalar_t>(2);
+      using opmath_t = at::opmath_type<scalar_t>;
+      auto inv_b = opmath_t(1.0) / iter.scalar_value<opmath_t>(2);
       iter.remove_operand(2);
-      MulScalarFunctor<scalar_t, decltype(inv_b)> f(inv_b);
-      gpu_kernel(iter, f);
+      gpu_kernel(iter, BUnaryFunctor<scalar_t, scalar_t, scalar_t, MulFunctor<opmath_t>>(
+        MulFunctor<opmath_t>(), inv_b));
     });
   } else {
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_true_cuda", [&]() {
@@ -180,25 +170,10 @@ void div_floor_kernel_cuda(TensorIteratorBase& iter) {
 }
 
 void mul_kernel_cuda(TensorIteratorBase& iter) {
-  if (!isIntegralType(iter.common_dtype(), /*includeBool*/ true) &&
-    (iter.is_cpu_scalar(1) || iter.is_cpu_scalar(2))) {
-    //if common dtype is half the scalar constant can overflow in half precision, and yet the result can
-    //still be representable in the half dtype. Cast scalar to acc_type to have better accuracy
-    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "mul_cuda", [&]() {
-      using accscalar_t = at::acc_type<scalar_t, true>;
-      int scalar_arg = iter.is_cpu_scalar(1) ? 1 : 2;
-      auto b = iter.scalar_value<accscalar_t>(scalar_arg);
-      iter.remove_operand(scalar_arg);
-      const cuda::OptionalCUDAGuard device_guard(device_of(iter.tensor(1)));
-      MulScalarFunctor<scalar_t, decltype(b)> f(b);
-      gpu_kernel(iter, f);
-    });
-  } else {
-    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() {
-      MulFunctor<scalar_t> f;
-      gpu_kernel_with_scalars(iter, f);
-    });
-  }
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() {
+    using opmath_t = at::opmath_type<scalar_t>;
+    opmath_gpu_kernel_with_scalars<scalar_t>(iter, MulFunctor<opmath_t>());
+  });
 }
 
 REGISTER_DISPATCH(div_true_stub, &div_true_kernel_cuda);

From c59970db6b7831c34053ecd5f86ce688bf68df7d Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Tue, 31 Aug 2021 18:22:23 -0700
Subject: [PATCH 412/530] [caffe2][easy] Save heap allocation in ConcatOp
 (#63529)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63529

Output() takes an IntArrayRef, so we can just use a std::initializer_list (stack-allocated array) instead of std::vector here.
ghstack-source-id: 137085908

Test Plan: existing CI

Reviewed By: mruberry

Differential Revision: D29687400

fbshipit-source-id: 9f2a7c6679f2552c098bb1bf7befaca18e0e5d4d
---
 caffe2/operators/concat_split_op.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/caffe2/operators/concat_split_op.h b/caffe2/operators/concat_split_op.h
index bbe355e50420f..f1e8f10d4d3dc 100644
--- a/caffe2/operators/concat_split_op.h
+++ b/caffe2/operators/concat_split_op.h
@@ -282,7 +282,7 @@ bool ConcatOp<Context>::RunOnDevice() {
   // We can override default options(Context::GetDeviceType())
   // by explicitly passing in device type we want
   Tensor* split = Output(
-      1, std::vector<int64_t>(1, InputSize()), at::dtype<int>().device(CPU));
+      1, at::IntArrayRef({InputSize()}), at::dtype<int>().device(CPU));
   int* axis_data = split->template mutable_data<int>();
   auto& input_zero = Input(0);
   int adj_size = input_zero.dim() + (add_axis_ ? 1 : 0);

From 6bb4b5d150ab51ed15d15ed270471848bb84d4e3 Mon Sep 17 00:00:00 2001
From: Matti Picus <matti.picus@gmail.com>
Date: Tue, 31 Aug 2021 18:54:44 -0700
Subject: [PATCH 413/530] disallow empty named dims list to flatten(names,
 name) (#61953)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/61137 by raising an error if an empty tuple is passed in for the names:
```
>>> torch.empty((2, 3), names=['a', 'b']).flatten((), 'abc')
RuntimeError: flatten(tensor, dims, out_dim): dims cannot be empty
```

or from the original issue:
```
>>> torch.empty((2, 3)).flatten((), 'abc')
RuntimeError: flatten(tensor, dims, out_dim): dims cannot be empty
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/61953

Reviewed By: iramazanli

Differential Revision: D30574571

Pulled By: malfet

fbshipit-source-id: e606e84458a8dd66e5da6d0eb1a260f37b4ce91b
---
 aten/src/ATen/native/TensorShape.cpp | 2 ++
 test/test_namedtensor.py             | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 1dc2a270c44c2..edbfa2329a02d 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -2042,6 +2042,8 @@ Tensor flatten(const Tensor& self, Dimname start_dim, Dimname end_dim, Dimname o
 
 Tensor flatten(const Tensor& self, DimnameList dims, Dimname out_dim) {
   auto positions = dimnames_to_positions(self, dims);
+  TORCH_CHECK(positions.size() > 0,
+      "flatten(tensor, dims, out_dim): dims cannot be empty");
   for (const auto i : c10::irange(positions.size() - 1)) {
     if (positions[i] + 1 == positions[i + 1]) continue;
     TORCH_CHECK(positions[i] + 1 == positions[i + 1],
diff --git a/test/test_namedtensor.py b/test/test_namedtensor.py
index b5e7aac402abb..2c6d2d80a2266 100644
--- a/test/test_namedtensor.py
+++ b/test/test_namedtensor.py
@@ -1072,6 +1072,11 @@ def test_flatten(self):
         with self.assertRaisesRegex(RuntimeError, "must be consecutive in"):
             tensor.flatten(['H', 'D', 'W'], 'features')
 
+    def test_flatten_nodims(self):
+        tensor = torch.empty((2, 3))
+        with self.assertRaisesRegex(RuntimeError, "cannot be empty"):
+            tensor.flatten((), 'abcd')
+
     def test_unflatten(self):
         # test args: tensor, int, namedshape
         self.assertTrue(torch.equal(

From fa5676a41b2c00a4cea37793cf4977cf995ab0c4 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Tue, 31 Aug 2021 20:14:08 -0700
Subject: [PATCH 414/530] Delete some dead code from RRefMessageBase (#64298)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64298

cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23

Test Plan: Imported from OSS

Reviewed By: rohan-varma

Differential Revision: D30676702

Pulled By: pbelevich

fbshipit-source-id: 77dbc0f8064c3518376454ff573d45ed0274956b
---
 torch/csrc/distributed/rpc/rref_proto.cpp | 16 +---------------
 torch/csrc/distributed/rpc/rref_proto.h   |  3 ---
 2 files changed, 1 insertion(+), 18 deletions(-)

diff --git a/torch/csrc/distributed/rpc/rref_proto.cpp b/torch/csrc/distributed/rpc/rref_proto.cpp
index 6f059b1022db0..49e3287f5d778 100644
--- a/torch/csrc/distributed/rpc/rref_proto.cpp
+++ b/torch/csrc/distributed/rpc/rref_proto.cpp
@@ -46,20 +46,6 @@ const RRefId& RRefMessageBase::rrefId() {
   return rrefId_;
 }
 
-c10::intrusive_ptr<Message> RRefMessageBase::toMessageImpl() && {
-  return fromIValues({rrefId_.toIValue()}, type_);
-}
-
-at::IValue RRefMessageBase::fromMessage(
-    const Message& message,
-    MessageType type) {
-  auto values = toIValues(message, type);
-
-  TORCH_INTERNAL_ASSERT(
-      values.size() == 1, "ScriptUserDelete expects 1 IValue from message.");
-  return std::move(values.back());
-}
-
 /////////////////////////// ForkMessageBase //////////////////////////////////
 
 const ForkId& ForkMessageBase::forkId() {
@@ -76,7 +62,7 @@ std::pair<RRefId, ForkId> ForkMessageBase::fromMessage(
   auto ivalues = toIValues(message, type);
 
   TORCH_INTERNAL_ASSERT(
-      ivalues.size() == 2, "ScriptUserDelete expects 2 IValue from message.");
+      ivalues.size() == 2, "ForkMessageBase expects 2 IValue from message.");
 
   return std::make_pair(
       RRefId::fromIValue(ivalues[0]), ForkId::fromIValue(ivalues[1]));
diff --git a/torch/csrc/distributed/rpc/rref_proto.h b/torch/csrc/distributed/rpc/rref_proto.h
index d5a82c21f8632..4ce8066dfe1f7 100644
--- a/torch/csrc/distributed/rpc/rref_proto.h
+++ b/torch/csrc/distributed/rpc/rref_proto.h
@@ -22,9 +22,6 @@ class TORCH_API RRefMessageBase : public RpcCommandBase {
 
   const RRefId& rrefId();
 
-  c10::intrusive_ptr<Message> toMessageImpl() && override;
-  static at::IValue fromMessage(const Message& message, MessageType type);
-
  protected:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   const RRefId rrefId_;

From a87808de931a31c242bca0c2305ec4af67f08ef2 Mon Sep 17 00:00:00 2001
From: Pritam Damania <pritam.damania@fb.com>
Date: Tue, 31 Aug 2021 20:19:55 -0700
Subject: [PATCH 415/530] Fix bug in ShardedTensorMetadata serde. (#63902)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63902

The 'memory_format' field was not being serialized correctly and used
the same encoding for different fields.
ghstack-source-id: 137142406

Test Plan: waitforbuildbot

Reviewed By: bowangbj

Differential Revision: D30527324

fbshipit-source-id: f0f223e2d660ef6e4abae9649d9992acc36e1278
---
 .../_sharded_tensor/test_sharded_tensor.py    | 50 +++++++++++++++++++
 torch/distributed/_sharded_tensor/api.py      | 19 ++++---
 2 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/test/distributed/_sharded_tensor/test_sharded_tensor.py b/test/distributed/_sharded_tensor/test_sharded_tensor.py
index 718b594c831ee..77e35b76f3731 100644
--- a/test/distributed/_sharded_tensor/test_sharded_tensor.py
+++ b/test/distributed/_sharded_tensor/test_sharded_tensor.py
@@ -1,6 +1,8 @@
 from functools import wraps
 import math
 import io
+import itertools
+import pickle
 import sys
 import torch
 import torch.distributed as dist
@@ -123,6 +125,54 @@ def wrapper(self):
         self.destroy_comms()
     return wrapper
 
+class TestShardedTensorMetadata(TestCase):
+    def test_serialize_and_deserialize(self):
+        shard_metadatas = [
+            ShardMetadata(
+                shard_offsets=[0, 0],
+                shard_lengths=[5, 5],
+                placement="rank:0/cuda:0",
+            ),
+            ShardMetadata(
+                shard_offsets=[0, 5],
+                shard_lengths=[5, 5],
+                placement="rank:1/cuda:1",
+            ),
+            ShardMetadata(
+                shard_offsets=[5, 0],
+                shard_lengths=[5, 5],
+                placement="rank:2/cuda:2",
+            ),
+            ShardMetadata(
+                shard_offsets=[5, 5],
+                shard_lengths=[5, 5],
+                placement="rank:3/cuda:3",
+            )
+        ]
+
+        dtypes = [
+            torch.float, torch.double, torch.cfloat, torch.cdouble, torch.half,
+            torch.bfloat16, torch.uint8, torch.int8, torch.short, torch.int,
+            torch.long, torch.bool]
+
+        layouts = [torch.strided, torch.sparse_coo]
+        requires_grads = [True, False]
+        memory_formats = [torch.contiguous_format, torch.channels_last, torch.preserve_format]
+        pin_memories = [True, False]
+
+        for tensor_properties_input in itertools.product(dtypes, layouts, requires_grads, memory_formats, pin_memories):
+            dtype, layout, requires_grad, memory_format, pin_memory = tensor_properties_input
+
+            expected_st_metadata = _sharded_tensor.ShardedTensorMetadata(
+                shard_metadatas,
+                (10, 10),
+                _sharded_tensor.TensorProperties(dtype, layout, requires_grad, memory_format, pin_memory)
+            )
+
+            pickled_obj = pickle.dumps(expected_st_metadata)
+            st_metadata = pickle.loads(pickled_obj)
+            self.assertEqual(expected_st_metadata, st_metadata)
+
 class TestCreateTensorFromParams(TestCase):
     @sandcastle_skip_if(torch.cuda.device_count() < 1, 'CUDA GPU is needed')
     def test_empty(self):
diff --git a/torch/distributed/_sharded_tensor/api.py b/torch/distributed/_sharded_tensor/api.py
index 3b7476dc25bcf..d6b7a54732445 100644
--- a/torch/distributed/_sharded_tensor/api.py
+++ b/torch/distributed/_sharded_tensor/api.py
@@ -70,6 +70,13 @@ class TensorProperties(object):
     memory_format: torch.memory_format = field(default=torch.contiguous_format)
     pin_memory: bool = False
 
+
+class MEM_FORMAT_ENCODING(Enum):
+    TORCH_CONTIGUOUS_FORMAT = 0
+    TORCH_CHANNELS_LAST = 1
+    TORCH_PRESERVE_FORMAT = 2
+
+
 @dataclass
 class ShardedTensorMetadata(object):
     """
@@ -93,11 +100,11 @@ def __getstate__(self):
         # Since torch.memory_format cannot be pickled!
         memory_format = self.tensor_properties.memory_format
         if memory_format == torch.contiguous_format:
-            mem_format_encoding = 0
+            mem_format_encoding = MEM_FORMAT_ENCODING.TORCH_CONTIGUOUS_FORMAT
         elif memory_format == torch.channels_last:
-            mem_format_encoding = 1
+            mem_format_encoding = MEM_FORMAT_ENCODING.TORCH_CHANNELS_LAST
         elif memory_format == torch.preserve_format:
-            mem_format_encoding = 1
+            mem_format_encoding = MEM_FORMAT_ENCODING.TORCH_PRESERVE_FORMAT
         else:
             raise RuntimeError(f'Invalid torch.memory_format: {memory_format}')
 
@@ -118,11 +125,11 @@ def __setstate__(
     ):
         (self.shards_metadata, self.size, dtype, layout, requires_grad, mem_format_encoding, pin_memory) = state
 
-        if mem_format_encoding == 0:
+        if mem_format_encoding == MEM_FORMAT_ENCODING.TORCH_CONTIGUOUS_FORMAT:
             memory_format = torch.contiguous_format
-        elif mem_format_encoding == 1:
+        elif mem_format_encoding == MEM_FORMAT_ENCODING.TORCH_CHANNELS_LAST:
             memory_format = torch.channels_last
-        elif mem_format_encoding == 2:
+        elif mem_format_encoding == MEM_FORMAT_ENCODING.TORCH_PRESERVE_FORMAT:
             memory_format = torch.preserve_format
         else:
             raise RuntimeError(f'Invalid torch.memory_format encoding: {mem_format_encoding}')

From 8337a3fb3f44a536ba40e895dad7441b2f4a59f2 Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin <mvz@fb.com>
Date: Tue, 31 Aug 2021 20:27:44 -0700
Subject: [PATCH 416/530] [TensorExpr] Wrap error messages with
 buildErrorMessage call. (#64330)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64330

Test Plan: Imported from OSS

Reviewed By: bertmaher

Differential Revision: D30687226

Pulled By: ZolotukhinM

fbshipit-source-id: ade1be2ad6847c6afbba60307ef854696821b4e3
---
 test/cpp/tensorexpr/test_loopnest.cpp         |   2 +-
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp    |  19 +--
 torch/csrc/jit/tensorexpr/loopnest.cpp        | 116 ++++++++++++++----
 .../jit/tensorexpr/mem_dependency_checker.cpp |  19 ++-
 torch/csrc/jit/tensorexpr/registerizer.cpp    |  15 ++-
 5 files changed, 129 insertions(+), 42 deletions(-)

diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index c2b33e2a184d2..b1d59a1dee066 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -3554,7 +3554,7 @@ TEST(LoopNest, DetectInlineRankMismatch) {
   LoopNest l({reshape}, {a, reshape});
   ASSERT_THROWS_WITH(
       l.computeInline(l.getLoopBodyFor(a)),
-      "Placeholder indexed access is inconsistent with its rank");
+      "Number of indices doesn't match buf rank in the fuser.");
 }
 
 TEST(LoopNest, CacheReadsSimple) {
diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index b9ea70806ee1a..6c212e623df21 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -903,7 +903,9 @@ void LLVMCodeGenImpl::visit(HalfImmPtr v) {
 }
 
 void LLVMCodeGenImpl::visit(BFloat16ImmPtr v) {
-  TORCH_INTERNAL_ASSERT(false, "llvm codegen does not support bfloat16");
+  TORCH_INTERNAL_ASSERT(
+      false,
+      buildErrorMessage("Fuser's LLVM codegen does not support bfloat16"));
 }
 
 void LLVMCodeGenImpl::visit(BoolImmPtr v) {
@@ -1535,7 +1537,10 @@ void LLVMCodeGenImpl::emitIsNan(IntrinsicsPtr v) {
   if (!v->param(0)->dtype().is_floating_point()) {
     value_ = toVec(llvm::ConstantInt::get(dstType, 0), v->dtype().lanes());
   } else {
-    TORCH_INTERNAL_ASSERT(v->dtype().scalar_type() == ScalarType::Int);
+    TORCH_INTERNAL_ASSERT(
+        v->dtype().scalar_type() == ScalarType::Int,
+        buildErrorMessage(
+            "Unexpected non-Int dtype of Intrinsics' result value in the fuser."));
     auto is_nan = irb_.CreateFCmpUNO(
         value_, llvm::ConstantFP::get(value_->getType(), 0.));
     if (v->dtype().lanes() > 1) {
@@ -1762,11 +1767,11 @@ void LLVMCodeGenImpl::visit(IntrinsicsPtr v) {
   } else {
     TORCH_INTERNAL_ASSERT(
         false,
-        v,
-        "Unimplemented lowering:",
-        v->op_type(),
-        " for input of dtype",
-        v->dtype().scalar_dtype());
+        buildErrorMessage(
+            std::string("Unimplemented lowering for intrinsic '") +
+            std::to_string(v->op_type()) + "' for input of dtype " +
+            std::to_string(v->dtype().scalar_dtype()) +
+            " in LLVM codegen of the fuser."));
   }
 
   std::vector<llvm::Value*> params;
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index 11020cc2eda08..e67d094065d1a 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -536,7 +536,8 @@ class FunctionInliner : public IRMutator {
         // (since we don't support in-place writes). Resolves issue 52581.
         TORCH_INTERNAL_ASSERT(
             *intValue(i) == 0,
-            "Constant index impression should always be zero");
+            buildErrorMessage(
+                "Unexpected non-zero constant index in inlined buffer in the fuser."));
         producer_index_vars_.push_back(nullptr);
       } else {
         throw std::logic_error("cannot inline Buf with compound indices");
@@ -547,21 +548,25 @@ class FunctionInliner : public IRMutator {
  private:
   ExprPtr mutate_loads(BufPtr buf, std::vector<ExprPtr> dims) {
     std::vector<VarPtr> index_vars;
-    TORCH_INTERNAL_ASSERT(buf->ndim() == producer_index_vars_.size());
+    TORCH_INTERNAL_ASSERT(
+        buf->ndim() == producer_index_vars_.size(),
+        buildErrorMessage(
+            "Dimensions of producer and consumer expressions do not match in inliner in the fuser."));
     for (const auto i : c10::irange(buf->ndim())) {
       VarPtr func_callee_arg = producer_index_vars_.at(i);
       ExprPtr func_caller_param = dims.at(i);
       if (func_callee_arg == nullptr) {
         TORCH_INTERNAL_ASSERT(
             intValue(func_caller_param) && *intValue(func_caller_param) == 0,
-            "We are implicitly assuming that if you have an index of 0, that must also be inlined into an index of 0");
+            buildErrorMessage(
+                "We are implicitly assuming that if you have an index of 0, that must also be inlined into an index of 0"));
         continue;
       }
       if (func_callee_arg == nullptr)
         continue;
       auto iter = inline_mapping_.find(func_callee_arg);
       if (iter != inline_mapping_.end()) {
-        throw std::runtime_error(
+        throw std::logic_error(
             "Duplicated variables: " + func_callee_arg->name_hint());
       }
       // Add a mapping for each function parameter to it's source name.
@@ -603,10 +608,10 @@ class FunctionInliner : public IRMutator {
       return IRMutator::mutate(v);
     }
 
-    if (v->indices().size() != buf->ndim()) {
-      throw malformed_input(
-          "Placeholder indexed access is inconsistent with its rank", v);
-    }
+    TORCH_INTERNAL_ASSERT(
+        v->indices().size() == buf->ndim(),
+        buildErrorMessage(
+            "Number of indices doesn't match buf rank in the fuser."));
     return mutate_loads(buf, v->indices());
   }
 
@@ -646,7 +651,10 @@ class FunctionInliner : public IRMutator {
     if (v == producer_ && !outputs_.count(buf_)) {
       in_producer_ = true;
       producer_ = to<Store>(IRMutator::mutate(v));
-      TORCH_INTERNAL_ASSERT(producer_ != nullptr);
+      TORCH_INTERNAL_ASSERT(
+          producer_,
+          buildErrorMessage(
+              "Producer statement for output buf should remain non-null in the fuser"));
       in_producer_ = false;
       return nullptr;
     } else {
@@ -748,7 +756,10 @@ bool LoopNest::computeInline(BufPtr b) {
     }
   }
 
-  TORCH_INTERNAL_ASSERT(relevant_store);
+  TORCH_INTERNAL_ASSERT(
+      relevant_store,
+      buildErrorMessage(
+          "Cannot find a relevant store to inline a buf in the fuser."));
 
   GRAPH_DEBUG("ComputeInline: Def: ", std::to_string(relevant_store));
   FunctionInliner inliner(relevant_store, output_bufs_);
@@ -772,7 +783,11 @@ void LoopNest::inlineIntermediateBufs(bool allow_duplicated_work) {
     auto input_bufs = getInputBufs();
 
     for (auto buf : intermediate_bufs) {
-      TORCH_INTERNAL_ASSERT(buf_load_store_uses.count(buf));
+      TORCH_INTERNAL_ASSERT(
+          buf_load_store_uses.count(buf),
+          buildErrorMessage(
+              "Could not find uses of buf '" + buf->name_hint() +
+              "' in the fuser."));
       std::vector<BufLoadOrStoreUse>& uses = buf_load_store_uses[buf];
       auto stores = c10::filter(
           uses, [](const BufLoadOrStoreUse& use) { return use.isStore; });
@@ -789,7 +804,11 @@ void LoopNest::inlineIntermediateBufs(bool allow_duplicated_work) {
           }
         } else {
           // If S is not a store, it must be an ExternalCall.
-          TORCH_INTERNAL_ASSERT(to<ExternalCall>(stores[0].s));
+          TORCH_INTERNAL_ASSERT(
+              to<ExternalCall>(stores[0].s),
+              buildErrorMessage(
+                  "Expected stmt: " + std::to_string(stores[0].s) +
+                  "\nto be either a Store or an ExternalCall in the fuser."));
         }
       }
 
@@ -1154,7 +1173,10 @@ bool LoopNest::optimizeConditionals() {
             ifthenelse_exprs.front(), &cond_var, &comp_values, &sub_exprs)) {
       continue;
     }
-    TORCH_INTERNAL_ASSERT(comp_values.size() >= 1);
+    TORCH_INTERNAL_ASSERT(
+        comp_values.size() >= 1,
+        buildErrorMessage(
+            "Expected at least one expression in optimizeConditional in the fuser."));
     comp_values.insert(comp_values.begin(), immLike(comp_values[0], 0));
 
     auto fors = getLoopStmtsFor(store);
@@ -1513,7 +1535,10 @@ void LoopNest::splitWithMask(ForPtr f, int factor, ForPtr* inner) {
 std::vector<ForPtr> LoopNest::distributeLoop(
     ForPtr loop,
     const std::unordered_set<StmtPtr>& pivots) {
-  TORCH_INTERNAL_ASSERT(loop);
+  TORCH_INTERNAL_ASSERT(
+      loop,
+      buildErrorMessage(
+          "Expected non-null loop in distributeLoop in the fuser."));
   auto root = loop->get_parent();
   if (root == nullptr) {
     throw malformed_input("Loop without parent: ", loop);
@@ -1758,7 +1783,10 @@ bool LoopNest::unsafeFuseLoops(
       break;
     }
   }
-  TORCH_INTERNAL_ASSERT(it != root_block->end());
+  TORCH_INTERNAL_ASSERT(
+      it != root_block->end(),
+      buildErrorMessage(
+          "Could not find the given loop in the root stmt in unsafeFuseLoop the fuser."));
   for (auto l : loops) {
     if (*it != l) {
       return false;
@@ -2032,7 +2060,10 @@ std::vector<ForPtr> LoopNest::reorder(
   parent->replace_stmt(loops.front(), empty_block);
   for (size_t i = 1; i < loops.size(); ++i) {
     auto block = to<Block>(loops[i]->get_parent());
-    TORCH_INTERNAL_ASSERT(block);
+    TORCH_INTERNAL_ASSERT(
+        block,
+        buildErrorMessage(
+            "Expected parent stmt to be a non-null Block in reorder transformation the fuser."));
     block->remove_stmt(loops[i]);
   }
 
@@ -2191,9 +2222,13 @@ std::vector<ForPtr> LoopNest::getLoopStmtsInLoopNest(ForPtr f, size_t num) {
   ForPtr curr_for = f;
   loops[0] = curr_for;
   for (size_t i = 1; i < num; ++i) {
-    TORCH_INTERNAL_ASSERT(curr_for->body()->nstmts() == 1);
+    TORCH_INTERNAL_ASSERT(
+        curr_for->body()->nstmts() == 1,
+        buildErrorMessage("Expected a single stmt in the loop body."));
     curr_for = to<For>(curr_for->body()->front());
-    TORCH_INTERNAL_ASSERT(curr_for);
+    TORCH_INTERNAL_ASSERT(
+        curr_for,
+        buildErrorMessage("Expected the only child stmt to be a For loop."));
     loops[i] = curr_for;
   }
   return loops;
@@ -2303,7 +2338,10 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) {
 
   // Find the parent common to all the buffer accesses.
   BlockPtr parent = to<Block>(writes.front()->get_parent());
-  TORCH_INTERNAL_ASSERT(parent);
+  TORCH_INTERNAL_ASSERT(
+      parent,
+      buildErrorMessage(
+          "Expected parent stmt to be a non-null block in compressBuffer in the fuser."));
   for (auto w : writes) {
     parent = Block::getSharedParent(parent, w);
   }
@@ -2325,7 +2363,10 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) {
   // Vector to indicate which dimensions could be compressed away.
   std::vector<bool> dims(buf->dims().size(), true);
   auto check_indices = [&](const std::vector<ExprPtr>& indices) {
-    TORCH_INTERNAL_ASSERT(indices.size() == dims.size());
+    TORCH_INTERNAL_ASSERT(
+        indices.size() == dims.size(),
+        buildErrorMessage(
+            "Expected ranks to match in compressBuffer in the fuser."));
     for (size_t i = 0; i < indices.size(); ++i) {
       auto index_vars = NodeFinder<Var>::find(indices[i]);
       for (auto iv : index_vars) {
@@ -2367,7 +2408,10 @@ void LoopNest::compressBuffer(BufPtr buf, StmtPtr stmt) {
 
   // Modify all access to reflect the removed dims.
   auto get_new_indices = [&](const std::vector<ExprPtr>& indices) {
-    TORCH_INTERNAL_ASSERT(indices.size() == dims.size());
+    TORCH_INTERNAL_ASSERT(
+        indices.size() == dims.size(),
+        buildErrorMessage(
+            "Expected ranks to match in compressBuffer in the fuser."));
     std::vector<ExprPtr> new_indices(indices);
     for (size_t i = 0; i < dims.size(); ++i) {
       if (dims[i]) {
@@ -2573,7 +2617,10 @@ class CacheReplacer : public IRMutator {
 
     // Map indices to call-parameters.
     std::vector<ExprPtr> newIndices;
-    TORCH_INTERNAL_ASSERT(offsets_.size() == v->indices().size());
+    TORCH_INTERNAL_ASSERT(
+        offsets_.size() == v->indices().size(),
+        buildErrorMessage(
+            "Expected ranks to match in CacheReplacer in the fuser."));
     for (size_t i = 0; i < v->indices().size(); ++i) {
       ExprPtr index = v->indices()[i]->accept_mutator(this);
       ExprPtr offset = offsets_[i];
@@ -2595,7 +2642,10 @@ class CacheReplacer : public IRMutator {
 
     // Map indices to call-parameters.
     std::vector<ExprPtr> newIndices;
-    TORCH_INTERNAL_ASSERT(offsets_.size() == v->indices().size());
+    TORCH_INTERNAL_ASSERT(
+        offsets_.size() == v->indices().size(),
+        buildErrorMessage(
+            "Expected ranks to match in CacheReplacer in the fuser."));
     for (size_t i = 0; i < v->indices().size(); ++i) {
       ExprPtr index = v->indices()[i]->accept_mutator(this);
       ExprPtr offset = offsets_[i];
@@ -2643,7 +2693,10 @@ LoopNest::AccessResult LoopNest::cacheAccesses(
     return {nullptr, nullptr};
   }
 
-  TORCH_INTERNAL_ASSERT(bounds_it->second.size() == 1);
+  TORCH_INTERNAL_ASSERT(
+      bounds_it->second.size() == 1,
+      buildErrorMessage(
+          "Unexpected number of bound info entries in cacheAccesses in the fuser."));
   TensorAccessBoundsInfo& info = bounds_it->second[0];
   bool hasReads = info.kind == kLoad || info.kind == kMutate;
   bool hasWrites = info.kind == kStore || info.kind == kMutate;
@@ -2998,7 +3051,10 @@ class RfactorStoreRewriter : public IRMutator {
       return IRMutator::mutate(v);
     }
 
-    TORCH_INTERNAL_ASSERT(old_indices_.size() == v->indices().size());
+    TORCH_INTERNAL_ASSERT(
+        old_indices_.size() == v->indices().size(),
+        buildErrorMessage(
+            "Expected ranks to match in RfactorStoreRewriter in the fuser."));
 
     bool equal_indices = true;
     for (size_t i = 0; i < v->indices().size(); ++i) {
@@ -3032,7 +3088,10 @@ class RfactorStoreRewriter : public IRMutator {
       return IRMutator::mutate(v);
     }
 
-    TORCH_INTERNAL_ASSERT(old_indices_.size() == v->indices().size());
+    TORCH_INTERNAL_ASSERT(
+        old_indices_.size() == v->indices().size(),
+        buildErrorMessage(
+            "Expected ranks to match in RfactorStoreRewriter in the fuser."));
 
     bool equal_indices = true;
     for (size_t i = 0; i < v->indices().size(); ++i) {
@@ -3141,7 +3200,10 @@ bool LoopNest::rfactor(
   //   X[*indexes] = ReduceOp(X[*indexes] + T[*indexes + {reduction_var}],
   //                          reduce_axis={reduction_var})
   BlockPtr b = outer_reduction_for->body();
-  TORCH_INTERNAL_ASSERT(b->nstmts() == 1);
+  TORCH_INTERNAL_ASSERT(
+      b->nstmts() == 1,
+      buildErrorMessage(
+          "Expected to have a single stmt in the block in rfactor transformation in the fuser."));
   StmtPtr first_reduction_loop = b->stmts().front();
   auto rfac_buf_indices = orig_buf_indices;
   rfac_buf_indices.emplace_back(reduction_var);
diff --git a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp
index e1688e37cbe7f..3f77041f1a202 100644
--- a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp
+++ b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp
@@ -76,12 +76,16 @@ std::vector<ExprPtr> AccessInfo::getIndices() const {
 
 void AccessInfo::addDependency(const std::shared_ptr<AccessInfo>& write) {
   auto res = dependencies_.emplace(write->id(), write);
-  TORCH_INTERNAL_ASSERT(res.second);
+  TORCH_INTERNAL_ASSERT(
+      res.second,
+      buildErrorMessage("Duplicate entry in mem dep checker in the fuser."));
 }
 
 void AccessInfo::addDependent(const std::shared_ptr<AccessInfo>& read) {
   auto res = dependents_.emplace(read->id(), read);
-  TORCH_INTERNAL_ASSERT(res.second);
+  TORCH_INTERNAL_ASSERT(
+      res.second,
+      buildErrorMessage("Duplicate entry in mem dep checker in the fuser."));
 }
 
 bool AccessInfo::hasDependency(const std::shared_ptr<AccessInfo>& info) const {
@@ -590,7 +594,10 @@ bool executionSafetyCheck(
   if (aStrides.empty() || oStrides.empty()) {
     return false;
   }
-  TORCH_INTERNAL_ASSERT(info->bounds().size() == other->bounds().size());
+  TORCH_INTERNAL_ASSERT(
+      info->bounds().size() == other->bounds().size(),
+      buildErrorMessage(
+          "Dimension mismatch for two accesses in mem dep checker in the fuser."));
   for (size_t b = 0; b < info->bounds().size(); ++b) {
     ExprPtr aIndexStride = aStrides[b];
     ExprPtr oIndexStride = oStrides[b];
@@ -1150,7 +1157,11 @@ void MemDependencyChecker::visit(FreePtr v) {
 
   VarPtr var = v->buffer_var();
   auto it = intermediates_.find(var);
-  TORCH_INTERNAL_ASSERT(it != intermediates_.end());
+  TORCH_INTERNAL_ASSERT(
+      it != intermediates_.end(),
+      buildErrorMessage(
+          "Expected to find '" + var->name_hint() +
+          "' in intermediate vars in mem dep checker in the fuser."));
 
   IndexBounds bounds = it->second->bounds();
   auto info = std::make_shared<AccessInfo>(
diff --git a/torch/csrc/jit/tensorexpr/registerizer.cpp b/torch/csrc/jit/tensorexpr/registerizer.cpp
index 8684f2aabc810..c4c495762a79d 100644
--- a/torch/csrc/jit/tensorexpr/registerizer.cpp
+++ b/torch/csrc/jit/tensorexpr/registerizer.cpp
@@ -43,8 +43,14 @@ void AccessInfo::addLoad(
 }
 
 void AccessInfo::merge(const std::shared_ptr<AccessInfo>& other) {
-  TORCH_INTERNAL_ASSERT(hash_ == other->hash());
-  TORCH_INTERNAL_ASSERT(indices_.size() == other->indices().size());
+  TORCH_INTERNAL_ASSERT(
+      hash_ == other->hash(),
+      buildErrorMessage(
+          "Expected hashes to match in registerizer in the fuser."));
+  TORCH_INTERNAL_ASSERT(
+      indices_.size() == other->indices().size(),
+      buildErrorMessage(
+          "Expected ranks to match in registerizer in the fuser."));
 
   last_usage_ = other->last_usage();
   for (auto s : other->stores()) {
@@ -68,7 +74,10 @@ void AccessInfo::merge(const std::shared_ptr<AccessInfo>& other) {
 
 bool AccessInfo::overlaps(const std::shared_ptr<AccessInfo>& other) {
   // All accesses to a buf must have the same dimensionality.
-  TORCH_INTERNAL_ASSERT(indices_.size() == other->indices().size());
+  TORCH_INTERNAL_ASSERT(
+      indices_.size() == other->indices().size(),
+      buildErrorMessage(
+          "Expected ranks to match in registerizer in the fuser."));
 
   auto& other_indices = other->indices();
 

From 479fc4e41250a1710e34cc1dcaa6272c89832fb4 Mon Sep 17 00:00:00 2001
From: gmagogsfm <gmagogsfm@gmail.com>
Date: Tue, 31 Aug 2021 21:27:46 -0700
Subject: [PATCH 417/530] Remove outdated warning about RecursiveScriptModule
 not being copiable (#64085)

Summary:
RecursiveScriptModule has its customized `__copy__` and `__deepcopy__` defined. The warning/error  that says it is not copiable is outdated

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64085

Reviewed By: rohan-varma

Differential Revision: D30598623

Pulled By: gmagogsfm

fbshipit-source-id: 0701d8617f42d818bc7b88244caee4cd47fbe976
---
 test/test_jit.py                                           | 5 -----
 torch/distributed/nn/api/remote_module.py                  | 2 ++
 torch/jit/_script.py                                       | 7 -------
 .../_internal/distributed/nn/api/remote_module_test.py     | 3 +--
 4 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/test/test_jit.py b/test/test_jit.py
index d1a170da6f750..e94ed8db922b0 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -391,11 +391,6 @@ def __init__(self, cpu_device_str):
         self.assertFalse(m2.p0.is_cuda)
         self.assertFalse(m2.b0.is_cuda)
 
-    def test_model_save_error(self):
-        with TemporaryFileName() as fname:
-            with self.assertRaisesRegex(pickle.PickleError, "not supported"):
-                torch.save(FooToPickle(), fname)
-
     @unittest.skipIf(not RUN_CUDA, "restore device requires CUDA")
     def test_restore_device_cuda(self):
         class MyModule(torch.jit.ScriptModule):
diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py
index ef26db64dbed8..fb3b160c8ebcc 100644
--- a/torch/distributed/nn/api/remote_module.py
+++ b/torch/distributed/nn/api/remote_module.py
@@ -288,11 +288,13 @@ def get_module_rref(self) -> rpc.RRef[nn.Module]:
         """
         return self.module_rref
 
+    @torch.jit.export
     def __getstate__(self):
         raise RuntimeError(
             "Cannot pickle RemoteModule in python pickler. RemoteModule can only be pickled when using RPC"
         )
 
+    @torch.jit.export
     def __setstate__(self, state):
         raise RuntimeError(
             "Cannot unpickle RemoteModule in python pickler. RemoteModule can only be unpickled when using RPC"
diff --git a/torch/jit/_script.py b/torch/jit/_script.py
index 09801bab938a7..de32e1ab8de37 100644
--- a/torch/jit/_script.py
+++ b/torch/jit/_script.py
@@ -785,13 +785,6 @@ def __setattr__(self, attr, value):
                 # It's fairly trivial to save enough info to warn in this case.
                 return super(RecursiveScriptModule, self).__setattr__(attr, value)
 
-        def __getstate__(self):
-            raise pickle.PickleError(
-                "ScriptModules cannot be deepcopied using copy.deepcopy or saved using torch.save. "
-                + "Mixed serialization of script and non-script modules is not supported. "
-                + "For purely script modules use my_script_module.save(<filename>) instead."
-            )
-
         def __copy__(self):
             return torch.jit._recursive.wrap_cpp_module(copy.copy(self._c))
 
diff --git a/torch/testing/_internal/distributed/nn/api/remote_module_test.py b/torch/testing/_internal/distributed/nn/api/remote_module_test.py
index fb1d5fbbc4f75..997006353bfbd 100644
--- a/torch/testing/_internal/distributed/nn/api/remote_module_test.py
+++ b/torch/testing/_internal/distributed/nn/api/remote_module_test.py
@@ -1,6 +1,5 @@
 #!/usr/bin/python3
 import enum
-import pickle
 from typing import Tuple
 
 import torch
@@ -467,7 +466,7 @@ def test_remote_module_py_pickle_not_supported_script(self):
             dst_worker_name, modes=[ModuleCreationMode.MODULE_CTOR_WITH_INTERFACE]
         ):
             with TemporaryFileName() as fname:
-                with self.assertRaises(pickle.PickleError):
+                with self.assertRaisesRegex(torch.jit.Error, "can only be pickled when using RPC"):
                     torch.save(remote_module, fname)
 
 
From 24e50b8453d861b10e799a3c02cef9d06cf996c6 Mon Sep 17 00:00:00 2001
From: Michael Carilli <mcarilli@gmail.com>
Date: Tue, 31 Aug 2021 21:43:25 -0700
Subject: [PATCH 418/530] [CUDA graphs] hotfix for test_graph_ (#64339)

Summary:
Graphed workloads that try to capture a full backward pass must do warmup on a non-default stream. If warmup happens on the default stream, AccumulateGrad functions might tag themselves to run on the default stream, and therefore won't be capturable.

ngimel and I suspect some test_cuda.py tests run with the default stream as the ambient stream, which breaks `test_graph_grad_scaling` because `test_graph_grad_scaling` does warmup on the ambient stream _assuming_ the ambient stream is a non-default stream.

This PR explicitly sets a side stream for the warmup in `test_graph_grad_scaling`, which is what I should have done all along because it's what the new documentation recommends.

I pushed the PR branch straight to the main pytorch repo because we need to run ci-all on it, and I'm not sure what the requirements are these days.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64339

Reviewed By: mruberry

Differential Revision: D30690711

Pulled By: ngimel

fbshipit-source-id: 91ad75f46a11f311e25bc468ea184e22acdcc25a
---
 test/test_cuda.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/test/test_cuda.py b/test/test_cuda.py
index 6f742ec59f931..33dbade7380b8 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -3683,8 +3683,13 @@ def test_graph_grad_scaling(self):
         static_grad = torch.ones_like(weight)
 
         # warmup
-        loss = (weight.half() * static_input).sum()
-        scaler.scale(loss).backward()
+        s = torch.cuda.Stream()
+        s.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(s):
+            loss = (weight.half() * static_input).sum()
+            scaler.scale(loss).backward()
+        torch.cuda.current_stream().wait_stream(s)
+
         opt.zero_grad(set_to_none=True)
 
         # capture

From 05ecaefbbfa157ad2d6353ef9b2a53892f7ed34c Mon Sep 17 00:00:00 2001
From: Yuchen Huang <hyc@fb.com>
Date: Tue, 31 Aug 2021 22:00:11 -0700
Subject: [PATCH 419/530] [Metal][GPU] Enable metal for simulators and fix test
 failures if possible (#64322)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64322

As title
ghstack-source-id: 137143877

Test Plan:
- `aibench-cli mobile`
- Select iOS -> `y` -> `1` -> `n` -> "--metal_op_test"
- Select all iPhone 6 + iPhone 7 + iPhone 8 and a iPhone X or 11 or 12
```
Benchmark Submitted. Find more details at: https://our.intern.facebook.com/intern/aibench/details/318120612514604
Benchmark Status:
        D10 (https://github.com/pytorch/pytorch/commit/b8256280ce45f02a7e105d3b3db4a547990e683d)AP-12.0.1: DONE
        N71mAP-14.3: DONE
DUMMY latency:
        D10 (https://github.com/pytorch/pytorch/commit/b8256280ce45f02a7e105d3b3db4a547990e683d)AP-12.0.1: 4319.3
        N71mAP-14.3: 8868.51
I0831 16:06:27.210558 605277 ClientSingletonManager.cpp:99] Shutting down Manifold ClientSingletonManager
```

Reviewed By: xta0

Differential Revision: D30147163

fbshipit-source-id: 2de6bbd9bd525e32ca92b2845eb435800855edcc
---
 aten/src/ATen/native/metal/MetalContext.mm                | 6 ------
 .../ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm   | 8 +++++---
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/aten/src/ATen/native/metal/MetalContext.mm b/aten/src/ATen/native/metal/MetalContext.mm
index 80ee55efa591e..f71d35f97a866 100644
--- a/aten/src/ATen/native/metal/MetalContext.mm
+++ b/aten/src/ATen/native/metal/MetalContext.mm
@@ -37,9 +37,6 @@ + (instancetype)sharedInstance {
 - (BOOL)available {
 #if !defined(__APPLE__)
   return false;
-#elif TARGET_IPHONE_SIMULATOR
-  // TODO[T90135707]: Enable Metal on iOS Simulators
-  return false;
 #elif TARGET_OS_IPHONE
   if (!MPSSupportsMTLDevice(_device)) {
     return false;
@@ -47,9 +44,6 @@ - (BOOL)available {
   if ([UIDevice currentDevice].systemVersion.floatValue < 11.0) {
     return false;
   }
-  if (![_device supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily3_v2]) {
-    return false;
-  }
 #elif TARGET_OS_MAC
   if (!MPSSupportsMTLDevice(_device)) {
     return false;
diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm b/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm
index f337e1dfc824e..5e749983c822d 100644
--- a/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm
+++ b/aten/src/ATen/native/metal/mpscnn/tests/MetalOpTestRunner.mm
@@ -76,13 +76,15 @@ - (void)registerTests {
   REG_TEST("test_hardtanh_", test_hardtanh_);
   REG_TEST("test_hardtanh", test_hardtanh);
   REG_TEST("test_reshape", test_reshape);
+  REG_TEST("test_chunk", test_chunk);
+  REG_TEST("test_chunk3", test_chunk3);
+  REG_TEST("test_reflection_pad2d", test_reflection_pad2d);
+#if !TARGET_IPHONE_SIMULATOR
   REG_TEST("test_mean_dim", test_mean_dim);
   REG_TEST("test_mean_dim2", test_mean_dim2);
   REG_TEST("test_mean_dim3", test_mean_dim3);
-  REG_TEST("test_chunk", test_chunk);
   REG_TEST("test_chunk2", test_chunk2);
-  REG_TEST("test_chunk3", test_chunk3);
-  REG_TEST("test_reflection_pad2d", test_reflection_pad2d);
+#endif
 }
 
 - (NSDictionary*)tests {

From 0c4e4e588e2e3308c659f741e7bc5cabf0975c09 Mon Sep 17 00:00:00 2001
From: James Reed <jamesreed@fb.com>
Date: Tue, 31 Aug 2021 22:20:41 -0700
Subject: [PATCH 420/530] [FX] Rename reduce functions back to their old,
 public names (#64324)

Summary:
Unfortunately pickle serializes the names of these functions. Also put them under backward-compatibility enforcement.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64324

Test Plan: Local repro https://fb.workplace.com/groups/3440841732711443/permalink/4018921611570116/

Reviewed By: SplitInfinity, TailofJune

Differential Revision: D30684185

Pulled By: jamesr66a

fbshipit-source-id: 900701220155d15115cd0c07cf7774a2891bd04f
---
 ...mpat-fx_backcompat_function_signatures.expect |  3 +++
 torch/fx/graph_module.py                         | 16 +++++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
index a73fde735bc2d..20d392fa9cbb1 100644
--- a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
+++ b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
@@ -28,6 +28,9 @@ torch.fx.graph_module.GraphModule.add_submodule(self, target: str, m: torch.nn.m
 torch.fx.graph_module.GraphModule.delete_all_unused_submodules(self) -> None
 torch.fx.graph_module.GraphModule.delete_submodule(self, target: str) -> bool
 torch.fx.graph_module.GraphModule.recompile(self) -> torch.fx.graph.PythonCode
+torch.fx.graph_module.reduce_deploy_graph_module(importer: Callable, body: Dict[Any, Any], import_block: str) -> torch.nn.modules.module.Module
+torch.fx.graph_module.reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.modules.module.Module
+torch.fx.graph_module.reduce_package_graph_module(importer: Callable, body: Dict[Any, Any], generated_module_name: str) -> torch.nn.modules.module.Module
 torch.fx.interpreter.Interpreter.__init__(self, module: torch.fx.graph_module.GraphModule, garbage_collect_values: bool = True)
 torch.fx.interpreter.Interpreter.call_function(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any
 torch.fx.interpreter.Interpreter.call_method(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any
diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
index 89685bf3953a0..ca82d49e07cbe 100644
--- a/torch/fx/graph_module.py
+++ b/torch/fx/graph_module.py
@@ -96,7 +96,8 @@ def _format_import_block(globals: Dict[str, Any], importer: Importer):
     return '\n'.join(import_strs)
 
 
-def _reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Module:
+@compatibility(is_backward_compatible=True)
+def reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Module:
     # BC: attribute name was changed from `code` to `_code` to facilitate
     # making `code` into a property and adding a docstring to it
     fn_src = body.get('_code') or body['code']
@@ -104,14 +105,15 @@ def _reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Mo
     return _deserialize_graph_module(forward, body)
 
 
-def _reduce_package_graph_module(
+@compatibility(is_backward_compatible=True)
+def reduce_package_graph_module(
     importer: PackageImporter, body: Dict[Any, Any], generated_module_name: str
 ) -> torch.nn.Module:
     forward = importer.import_module(generated_module_name).forward
     return _deserialize_graph_module(forward, body)
 
-
-def _reduce_deploy_graph_module(
+@compatibility(is_backward_compatible=True)
+def reduce_deploy_graph_module(
     importer: PackageImporter, body: Dict[Any, Any], import_block: str
 ) -> torch.nn.Module:
     ns = dict()
@@ -626,7 +628,7 @@ def __reduce_deploy__(self, importer: Importer):
 
         python_code = self.recompile()
         import_block = _format_import_block(python_code.globals, importer)
-        return (_reduce_deploy_graph_module, (dict_without_graph, import_block))
+        return (reduce_deploy_graph_module, (dict_without_graph, import_block))
 
     def __reduce_package__(self, exporter: PackageExporter):
         dict_without_graph = self.__dict__.copy()
@@ -638,7 +640,7 @@ def __reduce_package__(self, exporter: PackageExporter):
         import_block = _format_import_block(python_code.globals, exporter.importer)
         module_code = import_block + self.code
         exporter.save_source_string(generated_module_name, module_code)
-        return (_reduce_package_graph_module, (dict_without_graph, generated_module_name))
+        return (reduce_package_graph_module, (dict_without_graph, generated_module_name))
 
     def __reduce__(self):
         """
@@ -652,7 +654,7 @@ def __reduce__(self):
         python_code = self.recompile()
         import_block = _format_import_block(python_code.globals, sys_importer)
         del dict_without_graph['_graph']
-        return (_reduce_graph_module, (dict_without_graph, import_block))
+        return (reduce_graph_module, (dict_without_graph, import_block))
 
     # because __reduce__ is defined for serialization,
     # we need to define deepcopy otherwise it will call __reduce__

From 92b31b59aff908fcecb76e5baff5bb52ce62608a Mon Sep 17 00:00:00 2001
From: Richard Zou <rzou@fb.com>
Date: Wed, 1 Sep 2021 07:16:55 -0700
Subject: [PATCH 421/530] Revert D29699456: [pytorch][PR] Enable Half,
 BFloat16, and Complex dtypes for coo-coo sparse matmul [CUDA]

Test Plan: revert-hammer

Differential Revision:
D29699456 (https://github.com/pytorch/pytorch/commit/ad4848565e1d9f4d408c60614f213acb52035181)

Original commit changeset: 407ae53392ac

fbshipit-source-id: b6c70ba8bb28c0c38de47857030b69792a8470de
---
 aten/src/ATen/cuda/CUDADataType.h             | 61 -------------------
 .../ATen/native/sparse/cuda/SparseMatMul.cu   | 54 ++++++----------
 test/test_sparse.py                           | 44 ++++++-------
 torch/testing/_internal/common_cuda.py        |  1 -
 torch/utils/hipify/cuda_to_hip_mappings.py    | 31 +++++-----
 5 files changed, 52 insertions(+), 139 deletions(-)
 delete mode 100644 aten/src/ATen/cuda/CUDADataType.h

diff --git a/aten/src/ATen/cuda/CUDADataType.h b/aten/src/ATen/cuda/CUDADataType.h
deleted file mode 100644
index 71c9af9af8aac..0000000000000
--- a/aten/src/ATen/cuda/CUDADataType.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#pragma once
-
-#include <c10/core/ScalarType.h>
-
-#include <cuda.h>
-#include <library_types.h>
-
-namespace at {
-namespace cuda {
-
-template <typename scalar_t>
-cudaDataType getCudaDataType() {
-  TORCH_INTERNAL_ASSERT(false, "Cannot convert type ", typeid(scalar_t).name(), " to cudaDataType.")
-}
-
-template<> cudaDataType getCudaDataType<at::Half>() {
-  return CUDA_R_16F;
-}
-template<> cudaDataType getCudaDataType<float>() {
-  return CUDA_R_32F;
-}
-template<> cudaDataType getCudaDataType<double>() {
-  return CUDA_R_64F;
-}
-template<> cudaDataType getCudaDataType<c10::complex<c10::Half>>() {
-  return CUDA_C_16F;
-}
-template<> cudaDataType getCudaDataType<c10::complex<float>>() {
-  return CUDA_C_32F;
-}
-template<> cudaDataType getCudaDataType<c10::complex<double>>() {
-  return CUDA_C_64F;
-}
-
-// HIP doesn't define integral types
-#ifndef __HIP_PLATFORM_HCC__
-template<> cudaDataType getCudaDataType<uint8_t>() {
-  return CUDA_R_8U;
-}
-template<> cudaDataType getCudaDataType<int8_t>() {
-  return CUDA_R_8I;
-}
-template<> cudaDataType getCudaDataType<int>() {
-  return CUDA_R_32I;
-}
-#endif
-
-#if !defined(__HIP_PLATFORM_HCC__) && defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-template<> cudaDataType getCudaDataType<int16_t>() {
-  return CUDA_R_16I;
-}
-template<> cudaDataType getCudaDataType<int64_t>() {
-  return CUDA_R_64I;
-}
-template<> cudaDataType getCudaDataType<at::BFloat16>() {
-  return CUDA_R_16BF;
-}
-#endif
-
-} // namespace cuda
-} // namespace at
diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
index a08c93d1d71bd..d5f31a1980bac 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
@@ -16,7 +16,6 @@
 #include <THC/THCThrustAllocator.cuh>
 
 #include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CUDADataType.h>
 #include <ATen/cuda/CUDAUtils.h>
 #include <cusparse.h>
 #include <ATen/native/sparse/cuda/SparseCUDABlas.h>
@@ -119,7 +118,14 @@ struct csrMatrixRef {
         nnz_{nnz},
         size_{size} {
     #if IS_CUSPARSE11_AVAILABLE()
-      cudaDataType cuda_data_type = at::cuda::getCudaDataType<scalar_t>();
+      cudaDataType cuda_data_type;
+      if ( std::is_same<float, scalar_t>::value ) {
+        cuda_data_type = CUDA_R_32F;
+      } else if ( std::is_same<double, scalar_t>::value) {
+        cuda_data_type = CUDA_R_64F;
+      } else {
+        TORCH_CHECK(false, "Tensor types must be either float32 or float64");
+      }
       TORCH_CUDASPARSE_CHECK(cusparseCreateCsr(
         &description_,
         this->size(0),
@@ -186,14 +192,8 @@ struct CusparseMatrixMultiplyOp {
   cusparseSpGEMMDescr_t spgemmDesc;
 
   CusparseMatrixMultiplyOp() {
-    static_assert(
-      std::is_same<c10::Half, scalar_t>::value ||
-          std::is_same<c10::BFloat16, scalar_t>::value ||
-          std::is_same<float, scalar_t>::value ||
-          std::is_same<double, scalar_t>::value ||
-          std::is_same<c10::complex<float>, scalar_t>::value ||
-          std::is_same<c10::complex<double>, scalar_t>::value,
-      "cusparseSpGEMM only supports data type of half, bfloat16, float, double and complex float, double.");
+    static_assert(std::is_same<float, scalar_t>::value || std::is_same<double, scalar_t>::value,
+      "cusparse csr sparse-sparse MM only supports data type of float and double.");
     // SpGEMM Computation
     TORCH_CUDASPARSE_CHECK(cusparseSpGEMM_createDescr(&spgemmDesc));
   }
@@ -212,6 +212,14 @@ struct CusparseMatrixMultiplyOp {
 
     const int B_num_cols = B.size(1);
 
+    cudaDataType computeType;
+    if ( std::is_same<float, scalar_t>::value ) {
+      computeType = CUDA_R_32F;
+    } else if ( std::is_same<double, scalar_t>::value) {
+      computeType = CUDA_R_64F;
+    } else {
+      TORCH_CHECK(false, "Tensor types must be either float32 or float64");
+    }
     csrOutput out({A.size(0), B.size(1)});
 
     out.csr_pointers_ = at::empty({out.size(0) + 1}, output_indices.options().dtype(kInt));
@@ -244,16 +252,6 @@ struct CusparseMatrixMultiplyOp {
     cusparseSpMatDescr_t matC = C.description_;
     //--------------------------------------------------------------------------
 
-    cudaDataType computeType = at::cuda::getCudaDataType<scalar_t>();
-
-    // If a specific GPU model does not provide native support for a given data type,
-    // the routine returns CUSPARSE_STATUS_ARCH_MISMATCH error
-    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
-    TORCH_CHECK(prop->major >= 5 && !((10*prop->major + prop->minor) < 53 && computeType == CUDA_R_16F),
-        "sparse_mm: CUDA Float16 requires compute capability >= 53 (current: ", prop->major, prop->minor, ")");
-    TORCH_CHECK(!(prop->major < 8 && computeType == CUDA_R_16BF),
-        "sparse_mm: CUDA BFloat16 requires compute capability >= 80 (current: ", prop->major, prop->minor, ")");
-
     // ask bufferSize1 bytes for external memory
     TORCH_CUDASPARSE_CHECK(cusparseSpGEMM_workEstimation(
         handle,
@@ -648,14 +646,8 @@ void sparse_sparse_matmul_cuda_kernel(
     const Tensor& mat1,
     const Tensor& mat2) {
 
-  static_assert(
-    std::is_same<c10::Half, scalar_t>::value ||
-        std::is_same<c10::BFloat16, scalar_t>::value ||
-        std::is_same<float, scalar_t>::value ||
-        std::is_same<double, scalar_t>::value ||
-        std::is_same<c10::complex<float>, scalar_t>::value ||
-        std::is_same<c10::complex<double>, scalar_t>::value,
-    "sparse_sparse_matmul_cuda_kernel only supports data type of half, bfloat16, float, double and complex float, double.");
+  static_assert(std::is_same<float, scalar_t>::value || std::is_same<double, scalar_t>::value,
+    "sparse_sparse_matmul_cuda_kernel only supports float and double value types");
 
   Tensor mat1_indices_ = mat1._indices().contiguous();
   Tensor mat1_values = mat1._values().contiguous();
@@ -783,15 +775,9 @@ Tensor sparse_sparse_matmul_cuda(const Tensor& mat1_, const Tensor& mat2_) {
   auto output = at::native::empty_like(mat1_);
   output.sparse_resize_and_clear_({mat1_.size(0), mat2_.size(1)}, mat1_.sparse_dim(), 0);
 
-#if IS_CUSPARSE11_AVAILABLE()
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, mat1_.scalar_type(), "sparse_matmul", [&] {
-    sparse_sparse_matmul_cuda_kernel<scalar_t>(output, mat1_.coalesce(), mat2_.coalesce());
-  });
-#else
   AT_DISPATCH_FLOATING_TYPES(mat1_.scalar_type(), "sparse_matmul", [&] {
     sparse_sparse_matmul_cuda_kernel<scalar_t>(output, mat1_.coalesce(), mat2_.coalesce());
   });
-#endif
   return output;
 }
 
diff --git a/test/test_sparse.py b/test/test_sparse.py
index 8fa32edbc5e8a..fb0a660333583 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -12,12 +12,8 @@
 from torch.testing._internal.common_cuda import TEST_CUDA, _get_torch_cuda_version
 from numbers import Number
 from typing import Dict, Any
-from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes
-from torch.testing._internal.common_cuda import \
-    (SM53OrLater, SM80OrLater, CUDA11OrLater)
 from torch.testing._internal.common_device_type import \
-    (instantiate_device_type_tests, ops, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride,
-     deviceCountAtLeast)
+    (instantiate_device_type_tests, ops, dtypes, dtypesIfCPU, onlyCPU, onlyCUDA, deviceCountAtLeast)
 from torch.testing._internal.common_methods_invocations import \
     (sparse_unary_ufuncs)
 
@@ -3221,13 +3217,8 @@ def sparse_log(x):
     # TODO: Check after why ROCm's cusparseXcsrgemm2Nnz function doesn't return the same nnz value as CUDA
     @skipIfRocm
     @coalescedonoff
-    @dtypes(*get_all_complex_dtypes(),
-            *get_all_fp_dtypes(include_half=False, include_bfloat16=False))
-    @dtypesIfCUDA(*(get_all_complex_dtypes() if CUDA11OrLater else ()),
-                  *get_all_fp_dtypes(
-                      include_half=(CUDA11OrLater and SM53OrLater),
-                      include_bfloat16=(CUDA11OrLater and SM80OrLater)))
-    @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2})
+    @dtypes(torch.double)
+    @dtypesIfCPU(torch.double, torch.cdouble)
     def test_sparse_matmul(self, device, dtype, coalesced):
         """
         This function test `torch.sparse.mm` when both the mat1 and mat2 are sparse tensors.
@@ -3337,23 +3328,22 @@ def test_sparse_matmul(sparse_dims, nnz, shape_a, shape_b):
             r2 = torch.sparse.mm(a, b)
             self.assertEqual(r1, r2)
 
-            if dtype in [torch.double, torch.cdouble]:
-                a.requires_grad_(True)
-                b.requires_grad_(True)
+            a.requires_grad_(True)
+            b.requires_grad_(True)
 
-                # check autograd support on sparse matmul
-                def fn(D1, D2):
-                    return torch.sparse.mm(D1, D2).to_dense()
+            # check autograd support on sparse matmul
+            def fn(D1, D2):
+                return torch.sparse.mm(D1, D2).to_dense()
 
-                if a.is_cuda:
-                    # For cuda, `nondet_tol` is set with `1e-5`
-                    # This is because cuSparse sometimes returns approximate zero values like `~e-323`
-                    # TODO: Check this cuSparse issue.
-                    # This happens when you do chain multiplication `torch.sparse.mm` operations
-                    gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5)
-                else:
-                    gradcheck(fn, (a, b), check_sparse_nnz=True)
-                grad_with_custom_sparsity_pattern_test_helper(sparse_dims, nnz, shape_a, shape_b)
+            if a.is_cuda:
+                # For cuda, `nondet_tol` is set with `1e-5`
+                # This is because cuSparse sometimes returns approximate zero values like `~e-323`
+                # TODO: Check this cuSparse issue.
+                # This happens when you do chain multiplication `torch.sparse.mm` operations
+                gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5)
+            else:
+                gradcheck(fn, (a, b), check_sparse_nnz=True)
+            grad_with_custom_sparsity_pattern_test_helper(sparse_dims, nnz, shape_a, shape_b)
 
         def test_error_cases():
             def fn(sparse_dims, nnz, shape_a, shape_b):
diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py
index 36e7f8a178577..5d0849bb8407d 100644
--- a/torch/testing/_internal/common_cuda.py
+++ b/torch/testing/_internal/common_cuda.py
@@ -20,7 +20,6 @@
 CUDA9 = torch.version.cuda and torch.version.cuda.startswith('9.')
 SM53OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (5, 3)
 SM60OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (6, 0)
-SM80OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0)
 
 TEST_MAGMA = TEST_CUDA
 if TEST_CUDA:
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 6b60516efe322..558acc24ef3c8 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -554,7 +554,6 @@
         ),
         ("device_functions.h", ("hip/device_functions.h", CONV_INCLUDE, API_RUNTIME)),
         ("driver_types.h", ("hip/driver_types.h", CONV_INCLUDE, API_RUNTIME)),
-        ("library_types.h", ("hip/library_types.h", CONV_INCLUDE, API_RUNTIME)),
         ("cuComplex.h", ("hip/hip_complex.h", CONV_INCLUDE, API_RUNTIME)),
         ("cuda_fp16.h", ("hip/hip_fp16.h", CONV_INCLUDE, API_RUNTIME)),
         (
@@ -3787,21 +3786,21 @@
             ),
         ),
         ("cudaDataType_t", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("cudaDataType", ("hipDataType", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_16F", ("HIP_R_16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_16F", ("HIP_C_16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_32F", ("HIP_R_32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_32F", ("HIP_C_32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_64F", ("HIP_R_64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_64F", ("HIP_C_64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_8I", ("HIP_R_8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_8I", ("HIP_C_8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_8U", ("HIP_R_8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_8U", ("HIP_C_8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_32I", ("HIP_R_32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_32I", ("HIP_C_32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_R_32U", ("HIP_R_32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
-        ("CUDA_C_32U", ("HIP_C_32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("cudaDataType", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_16F", ("hipR16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_16F", ("hipC16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_32F", ("hipR32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_32F", ("hipC32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_64F", ("hipR64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_64F", ("hipC64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_8I", ("hipR8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_8I", ("hipC8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_8U", ("hipR8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_8U", ("hipC8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_32I", ("hipR32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_32I", ("hipC32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_R_32U", ("hipR32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
+        ("CUDA_C_32U", ("hipC32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)),
         (
             "MAJOR_VERSION",
             ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED),

From d1f3d85fd80de2166114af1b5e16070a6d33a898 Mon Sep 17 00:00:00 2001
From: Nima Elyasi <nimaelyasi@fb.com>
Date: Wed, 1 Sep 2021 08:47:44 -0700
Subject: [PATCH 422/530] fix GradBucket.is_last() logic (#63768)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63768

passed number of buckets to GradBucket constructor, to check if index is equal to num_buckets - 1 in the .is_last() function.

Test Plan:
buck test mode/dev-nosan //caffe2/test/distributed/algorithms/ddp_comm_hooks:test_ddp_hooks

test output: https://www.internalfb.com/intern/testinfra/testconsole/testrun/8162774375985873/

Reviewed By: SciPioneer, mrshenli

Differential Revision: D30455913

fbshipit-source-id: 8c67ca69cbf191d6e189e09248407eb167bb24b6
---
 .../ddp_comm_hooks/test_ddp_hooks.py          | 30 +++++++++++++++++++
 torch/csrc/distributed/c10d/comm.hpp          |  5 +++-
 torch/csrc/distributed/c10d/reducer.cpp       |  2 ++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
index 7b889fdc3f1bb..3d00712ca5354 100644
--- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
+++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
@@ -177,6 +177,36 @@ def test_ddp_comm_hook_quantize_per_channel_hook(self):
 
         np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
 
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    def test_is_last_hook(self):
+
+        store = dist.FileStore(self.file_name, self.world_size)
+        process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size)
+
+        def hook(flags, bucket):
+            flags.append(bucket.is_last())
+            fut = torch.futures.Future()
+            fut.set_result(bucket.buffer())
+            return fut
+
+        flags = []
+        device_id = gpus_for_rank(self.world_size)[self.rank][0]
+        model = nn.Sequential(
+            nn.Linear(2, 4000, bias=False),
+            *[nn.Linear(4000, 4000, bias=False) for _ in range(10)]
+        )
+        gpu_model = DistributedDataParallel(
+            model.to(device_id),
+            device_ids=[device_id],
+            process_group=process_group,
+        )
+        gpu_model.register_comm_hook(state=flags, hook=hook)
+        input = torch.randn(10, 2)
+        gpu_model(input).sum().backward()
+        self.assertTrue(flags[-1])
+        self.assertFalse(any(flags[:-1]))
+
 
 if __name__ == "__main__":
     assert (
diff --git a/torch/csrc/distributed/c10d/comm.hpp b/torch/csrc/distributed/c10d/comm.hpp
index 9b45795683004..4690c355ce71b 100644
--- a/torch/csrc/distributed/c10d/comm.hpp
+++ b/torch/csrc/distributed/c10d/comm.hpp
@@ -18,12 +18,14 @@ class TORCH_API GradBucket {
  public:
   explicit GradBucket(
       size_t index,
+      size_t bucket_count,
       const at::Tensor& tensor,
       const std::vector<size_t>& offsets,
       const std::vector<size_t>& lengths,
       const std::vector<c10::IntArrayRef>& sizes_vec,
       const std::vector<at::Tensor>& parameters)
       : index_(index),
+        bucket_count_(bucket_count),
         buffer_(tensor),
         offsets_(offsets),
         lengths_(lengths),
@@ -63,11 +65,12 @@ class TORCH_API GradBucket {
 
   // Returns whther this bucket is the last bucket to allreduce in an iteration.
   bool isLast() const {
-    return index_ == 0;
+    return index_ == bucket_count_ - 1;
   }
 
  private:
   size_t index_;
+  size_t bucket_count_;
   at::Tensor buffer_;
 
   // Per-variable info in buffer_.
diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp
index eafc70cc5e30f..91db615181e56 100644
--- a/torch/csrc/distributed/c10d/reducer.cpp
+++ b/torch/csrc/distributed/c10d/reducer.cpp
@@ -472,6 +472,7 @@ std::vector<c10d::GradBucket> Reducer::get_grad_buckets(
     auto variables_for_bucket = get_variables_for_bucket(i, bucket);
     gradBuckets.emplace_back(
         i,
+        buckets_.size(),
         return_zero_tensors ? at::zeros_like(bucket.replicas[0].contents)
                             : bucket.replicas[0].contents,
         bucket.replicas[0].offsets,
@@ -888,6 +889,7 @@ void Reducer::all_reduce_bucket(Bucket& bucket) {
   auto variables_for_bucket = get_variables_for_bucket(next_bucket_, bucket);
   GradBucket grad_bucket(
       next_bucket_,
+      buckets_.size(),
       tensors[0],
       // Since we only support single-process single-device
       // mode, there is always only one replica in the bucket.

From d5bfdd3dac33dfa84e2a511fa79c4ad4e0e6b822 Mon Sep 17 00:00:00 2001
From: Kushashwa Ravi Shrimali <kushashwaravishrimali@gmail.com>
Date: Wed, 1 Sep 2021 08:48:25 -0700
Subject: [PATCH 423/530] OpInfo for `nn.functional.layer_norm` (#63276)

Summary:
Please see https://github.com/facebookresearch/functorch/issues/78 and https://github.com/pytorch/pytorch/issues/54261.

Note:

* This PR also adds a reference test inspired by existing tests in `test_nn.py`.

cc: mruberry zou3519

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63276

Reviewed By: ejguan

Differential Revision: D30452483

Pulled By: zou3519

fbshipit-source-id: 2578d01ca34e031668a41bd284db60c31ae1fba8
---
 test/test_nn.py                               | 26 --------
 .../_internal/common_methods_invocations.py   | 65 +++++++++++++++++++
 2 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 5008c7256acf7..e60ff698ed19b 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -13281,32 +13281,6 @@ def test_LayerNorm_general(self, device):
         if self.device_type == 'cuda':
             self._test_LayerNorm_cuda_half(device)
 
-    @onlyOnCPUAndCUDA
-    def test_LayerNorm_numeric(self, device):
-        def layer_norm_ref(X, gamma, beta, normalized_shape, eps):
-            feature_size = np.prod(normalized_shape)
-            X_view = X.view(-1, feature_size)
-            mean = X_view.mean(dim=-1, keepdim=True)
-            var = X_view.var(dim=-1, unbiased=False, keepdim=True)
-            Y = (X_view - mean) / torch.sqrt(var + eps)
-            Y = Y * gamma.view(-1) + beta.view(-1)
-            return Y.view(*X.size())
-
-        normalized_shape = [256, 256, 144]
-        layer_norm = nn.LayerNorm(normalized_shape).float().to(device)
-        X = torch.rand(2, *normalized_shape, dtype=torch.float32,
-                       device=device)
-
-        Y = layer_norm(X)
-        Y_ref = layer_norm_ref(X, layer_norm.weight.data, layer_norm.bias.data,
-                               normalized_shape, layer_norm.eps)
-        self.assertEqual(Y, Y_ref, rtol=0, atol=1e-5)
-
-        if self.device_type == 'cuda':
-            layer_norm.cpu()
-            Y_cpu = layer_norm(X.cpu())
-            self.assertEqual(Y_cpu, Y, rtol=0, atol=1e-5)
-
     @onlyOnCPUAndCUDA
     def test_GroupNorm_general(self, device):
         self._test_GroupNorm_general(device)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 3579310dc68c9..fe8e36fbe6758 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -2548,6 +2548,42 @@ def generator():
 
     return list(generator())
 
+def sample_inputs_layer_norm(opinfo, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    # Ordered as input shape, normalized_shape and a kwarg dict for eps
+    cases: Tuple[Tuple[int], Tuple[int], dict] = (  # type: ignore[assignment]
+        ((1, 2, 3), (1, 2, 3), {'eps': 0.5}),
+        ((2, 2, 3), (2, 3), {'eps': -0.5}),
+        ((1,), (1,), {}),
+        ((1, 2), (2,), {}),
+        ((0, 1), (1,), {}),
+    )
+
+    def generator():
+        for input_shape, normalized_shape, kwargs in cases:
+            # Shape of weight and bias should be the same as normalized_shape
+            weight = make_arg(normalized_shape)
+            bias = make_arg(normalized_shape)
+            yield SampleInput(
+                make_arg(input_shape),
+                args=(normalized_shape, weight, bias),
+                kwargs=kwargs
+            )
+        # Without any optional args
+        yield SampleInput(make_arg((1, 2)), args=((2,),))
+
+        # TODO: @krshrimali, once to_numpy method in SampleInput class is modified to take None inputs,
+        # enable these inputs; see https://github.com/pytorch/pytorch/pull/63276#discussion_r691950400
+
+        # With weight and a `None` bias
+        # yield SampleInput(make_arg((1, 2)), args=((2,), make_arg((2,)), None))
+
+        # With `None` weight and bias (tests failing for this, see the link above)
+        # yield SampleInput(make_arg((1, 2)), args=((2,), None, make_arg((2,))))
+
+    return list(generator())
+
 def sample_inputs_hardswish(self, device, dtype, requires_grad):
     N = 5
     # make sure we are testing -3 -> 3 range. default is -10 -> 10 so maybe unnecessary ?
@@ -5595,6 +5631,21 @@ def reference_mse_loss(input, target, reduction="mean"):
         return se
 
 
+def reference_layer_norm(inp: np.ndarray, normalized_shape: Tuple[int], weight=None, bias=None, eps=1e-5):
+    feature_size = np.prod(normalized_shape)
+    inp_view = inp.reshape(-1, feature_size)  # type: ignore[call-overload]
+    mean = inp_view.mean(axis=-1, keepdims=True)
+    var = inp_view.var(axis=-1, ddof=0, keepdims=True)
+    Y = (inp_view - mean) / np.sqrt(var + eps)
+    if weight is None and bias is not None:
+        Y = Y + bias.reshape(-1)
+    elif weight is not None and bias is None:
+        Y = Y * weight.reshape(-1)
+    elif weight is not None and bias is not None:
+        Y = Y * weight.reshape(-1) + bias.reshape(-1)
+    return Y.reshape(*inp.shape)
+
+
 def gradcheck_wrapper_hermitian_input(op, input, *args, **kwargs):
     """Gradcheck wrapper for functions that take Hermitian matrices as input.
 
@@ -7235,6 +7286,20 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                SkipInfo('TestJit', 'test_variant_consistency_jit'),
            ),
            supports_out=False,),
+    OpInfo('nn.functional.layer_norm',
+           aten_name='layer_norm',
+           aliases=('layer_norm',),
+           ref=reference_layer_norm,
+           dtypes=floating_types_and(torch.bfloat16),
+           dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
+           supports_out=False,
+           decorators=[
+               DecorateInfo(
+                   toleranceOverride({torch.float32: tol(atol=1e-05, rtol=1e-03)}),
+                   'TestCommon', 'test_reference_testing'
+               ),
+           ],
+           sample_inputs_func=sample_inputs_layer_norm,),
     OpInfo('nn.functional.pad',
            variant_test_name='constant',
            aten_name='constant_pad_nd',

From c4f3f6e62d3201852fc9aac8f736d528a5a36bfe Mon Sep 17 00:00:00 2001
From: Xiang Gao <qasdfgtyuiop@gmail.com>
Date: Wed, 1 Sep 2021 10:17:52 -0700
Subject: [PATCH 424/530] Fixes reduction launch config (#64304)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/48573
See also https://github.com/pytorch/pytorch/pull/64194

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64304

Reviewed By: janeyx99

Differential Revision: D30689600

Pulled By: ngimel

fbshipit-source-id: bf2103ca177fd3b6e27bc0324b81925234483a29
---
 aten/src/ATen/native/cuda/LinearAlgebra.cu | 1 -
 aten/src/ATen/native/cuda/Normalization.cu | 1 -
 aten/src/ATen/native/cuda/Reduce.cuh       | 4 ++--
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu
index b7ecf386c6edc..b4936c069b0b1 100644
--- a/aten/src/ATen/native/cuda/LinearAlgebra.cu
+++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu
@@ -5,7 +5,6 @@
 #include <ATen/native/LinearAlgebra.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/cuda/Loops.cuh>
-#include <ATen/native/cuda/Reduce.cuh>
 #include <ATen/native/SharedReduceOps.h>
 #include <ATen/native/ReduceOps.h>
 
diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu
index 1d4d1cc4bda4e..44e27a95647b1 100644
--- a/aten/src/ATen/native/cuda/Normalization.cu
+++ b/aten/src/ATen/native/cuda/Normalization.cu
@@ -2,7 +2,6 @@
 #include <ATen/native/ReduceOps.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/cuda/Loops.cuh>
-#include <ATen/native/cuda/Reduce.cuh>
 #include <ATen/native/cuda/Resize.cuh>
 #include <ATen/native/cuda/Normalization.cuh>
 #include <c10/cuda/CUDAMathCompat.h>
diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
index b4600454f467d..3be7100483b3c 100644
--- a/aten/src/ATen/native/cuda/Reduce.cuh
+++ b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -989,14 +989,14 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
       // Map block.x to the fastest reducing dimension. It implies:
       //   1. block_x_reduce is required.
       //   2. block.y now max out to num_outputs.
-      dim0 = iter.shape()[0];
+      dim0 = inputs_per_output;
       dim1 = num_outputs;
       fastest_moving_stride = iter.strides(/*arg=*/input_index)[0];
     } else {
       // Map block.x to the fastest non reducing dimension. It implies:
       //   1. block_x_reduce is turned off.
       //   2. block.y now max out to inputs_per_output.
-      dim0 = iter.shape()[iter.num_reduce_dims()];
+      dim0 = num_outputs;
       dim1 = inputs_per_output;
       fastest_moving_stride = iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()];
     }

From 87d8ab6e50314f72ea6730250904fe7ff073cc21 Mon Sep 17 00:00:00 2001
From: Raghavan Raman <raghavanr@fb.com>
Date: Wed, 1 Sep 2021 10:28:02 -0700
Subject: [PATCH 425/530] [nnc] Updated generic error message with info about
 turning off the fuser (#64316)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64316

Test Plan: Imported from OSS

Reviewed By: bertmaher

Differential Revision: D30683942

Pulled By: navahgar

fbshipit-source-id: d86607563672213f99a1436dcf4f5dc28053b713
---
 torch/csrc/jit/tensorexpr/kernel.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index 78cbb822bfbff..f9653aea68840 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -67,10 +67,16 @@ namespace jit {
 namespace tensorexpr {
 
 std::string buildErrorMessage(const std::string& s) {
-  // TODO: Update this generic error message to include details regarding
-  // turning off the fuser.
-  static const std::string generic_error_message = "";
-  return s + " " + generic_error_message;
+  static const std::string generic_error_message =
+      "This error occured in the fuser. You can turn off the fuser with "
+      "torch._C._jit_override_can_fuse_on_cpu(False)";
+  if (s.empty()) {
+    return generic_error_message;
+  }
+  if (s.back() == '.') {
+    return s + " " + generic_error_message;
+  }
+  return s + ". " + generic_error_message;
 }
 
 static int te_cuda_pointwise_loop_levels = -1;

From c6505cc3837eb903f98163e40fad638a1cfeb502 Mon Sep 17 00:00:00 2001
From: Patrick Hu <patrickhu@fb.com>
Date: Wed, 1 Sep 2021 10:49:39 -0700
Subject: [PATCH 426/530] [FX] Fix python code generation for wrapped getattr()
 with default value (#64271)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64271

Closes #60417

Modified emit_node() in fx/graph.py to generate getattr() call with default value when len(node.args) != 2 instead of accessing the attribute.
Added test_torch_fx_getattr() in test/test_fx.py.

Test Plan:
pytest test/test_fx.py

Imported from OSS

Reviewed By: jamesr66a

Differential Revision: D30671265

fbshipit-source-id: f2db9ea47e0cb247547e200684f715aab006c374
---
 test/test_fx.py   | 10 ++++++++++
 torch/fx/graph.py |  6 ++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/test/test_fx.py b/test/test_fx.py
index f4e4ab203a7bc..5220f67ebf309 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -98,6 +98,8 @@ def a_lifted_leaf2(a, b):
 
 wrap('len')
 
+wrap('getattr')
+
 @wrap
 def wrapped_via_decorator(a):
     return a + 1
@@ -942,6 +944,14 @@ def forward(self, x):
         self.assertEqual(traced2(inp), inp + 3.0)
         self.assertIs(len, builtins.len)
 
+    def test_torch_fx_getattr(self):
+        class FXGetattrTest(torch.nn.Module):
+            def forward(self, x):
+                return getattr(x, 'nonexistent_attr', torch.Tensor([2, 3]))
+
+        traced = symbolic_trace(FXGetattrTest())
+        self.assertEqual(traced(torch.rand(3, 4)), torch.Tensor([2, 3]))
+
     def test_sqrt(self):
         class Sqrt1(torch.nn.Module):
             def forward(self, x):
diff --git a/torch/fx/graph.py b/torch/fx/graph.py
index 29ffc416715a7..65e93d0ccc7a1 100644
--- a/torch/fx/graph.py
+++ b/torch/fx/graph.py
@@ -949,11 +949,13 @@ def emit_node(node : Node):
                     return
                 qualified_name = _get_qualified_name(node.target)
                 global_name = add_global(qualified_name, node.target)
+                # special case for getattr: node.args could be 2-argument or 3-argument
+                # 2-argument: attribute access; 3-argument: fall through to attrib function call with default value
                 if global_name == 'getattr' and \
                    isinstance(node.args, tuple) and \
                    isinstance(node.args[1], str) and \
-                   node.args[1].isidentifier():
-                    # pretty print attribute access
+                   node.args[1].isidentifier() and \
+                   len(node.args) == 2:
                     body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(repr(node.args[0]), node.args[1])}')
                     return
                 body.append(f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})')

From 0b48d968952a6183ae122679d624940e5228567f Mon Sep 17 00:00:00 2001
From: Patrick Kan <patrickkan@fb.com>
Date: Wed, 1 Sep 2021 12:20:50 -0700
Subject: [PATCH 427/530] [Bootcamp] Include both python unittest and parser
 parameters in --help and -h flag (#64297)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/45945

Creates a new thread to run -h or --help with unittest.main if the help flag is present, and keeps the add_help default for parameters.

Includes both python unittest and parser parameters in --help and -h flag and will remain up to date since both messages are displayed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64297

Test Plan:
Imported from GitHub

`python test/test_spectral_ops.py --help`

Output:
```
% python test/test_spectral_ops.py --help
usage: test_spectral_ops.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b] [-k TESTNAMEPATTERNS] [tests [tests ...]]

positional arguments:
  tests                a list of any number of test modules, classes and test methods.

optional arguments:
  -h, --help           show this help message and exit
  -v, --verbose        Verbose output
  -q, --quiet          Quiet output
  --locals             Show local variables in tracebacks
  -f, --failfast       Stop on first fail or error
  -c, --catch          Catch Ctrl-C and display results so far
  -b, --buffer         Buffer stdout and stderr during tests
  -k TESTNAMEPATTERNS  Only run tests which match the given substring

Examples:
  test_spectral_ops.py                           - run default set of tests
  test_spectral_ops.py MyTestSuite               - run suite 'MyTestSuite'
  test_spectral_ops.py MyTestCase.testSomething  - run MyTestCase.testSomething
  test_spectral_ops.py MyTestCase                - run all 'test*' test methods
                                       in MyTestCase

usage: test_spectral_ops.py [-h] [--subprocess] [--seed SEED] [--accept] [--jit_executor JIT_EXECUTOR] [--repeat REPEAT]
                            [--test_bailouts] [--save-xml [SAVE_XML]] [--discover-tests] [--log-suffix LOG_SUFFIX]
                            [--run-parallel RUN_PARALLEL] [--import-slow-tests [IMPORT_SLOW_TESTS]]
                            [--import-disabled-tests [IMPORT_DISABLED_TESTS]]

optional arguments:
  -h, --help            show this help message and exit
  --subprocess          whether to run each test in a subprocess
  --seed SEED
  --accept
  --jit_executor JIT_EXECUTOR
  --repeat REPEAT
  --test_bailouts
  --save-xml [SAVE_XML]
  --discover-tests
  --log-suffix LOG_SUFFIX
  --run-parallel RUN_PARALLEL
  --import-slow-tests [IMPORT_SLOW_TESTS]
  --import-disabled-tests [IMPORT_DISABLED_TESTS]
  ```

Also ran some other tests to make sure tests still worked, and other tests with --help or -h flag

Reviewed By: seemethere

Differential Revision: D30677776

Pulled By: PatrickKan

fbshipit-source-id: eb3d6e3fa677137ec703ec3a23808efb99acc896
---
 torch/testing/_internal/common_utils.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 90f3551caae94..0a265b52401b6 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -23,6 +23,7 @@
 import random
 import contextlib
 import shutil
+import threading
 from pathlib import Path
 import socket
 import subprocess
@@ -156,7 +157,7 @@ def _get_test_report_path():
     return os.path.join('test-reports', test_source)
 
 
-parser = argparse.ArgumentParser(add_help=False)
+parser = argparse.ArgumentParser()
 parser.add_argument('--subprocess', action='store_true',
                     help='whether to run each test in a subprocess')
 parser.add_argument('--seed', type=int, default=1234)
@@ -173,6 +174,15 @@ def _get_test_report_path():
 parser.add_argument('--import-slow-tests', type=str, nargs='?', const=SLOW_TESTS_FILE)
 parser.add_argument('--import-disabled-tests', type=str, nargs='?', const=DISABLED_TESTS_FILE)
 
+# Only run when -h or --help flag is active to display both unittest and parser help messages.
+def run_unittest_help(argv):
+    unittest.main(argv=argv)
+
+if '-h' in sys.argv or '--help' in sys.argv:
+    help_thread = threading.Thread(target=run_unittest_help, args=(sys.argv,))
+    help_thread.start()
+    help_thread.join()
+
 args, remaining = parser.parse_known_args()
 if args.jit_executor == 'legacy':
     GRAPH_EXECUTOR = ProfilingMode.LEGACY

From 421d8f86b6def536df18371a5da2f5df4de6e262 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 1 Sep 2021 12:28:23 -0700
Subject: [PATCH 428/530] Add a record scope around
 autograd::engine::evaluate_function (#63619)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63619

Adds a RECORD_FUNCTION with the function that is being valuate as part
of backwards execution. This has been useful in picking up some operations
in the backwards pass that otherwise would not show up, for example custom cpp
functions that use custom C++ code.
ghstack-source-id: 137041723

Test Plan:
CI

benchmark:
buck run mode/opt //scripts/rvarm1/ddp:bench

Reviewed By: albanD

Differential Revision: D30439492

fbshipit-source-id: 955917770cdf2a2edb0303223ace710b668ba388
---
 test/test_autograd.py          |  3 +++
 torch/csrc/autograd/engine.cpp | 13 ++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 364d48807b737..8b3c8bd33af66 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -3005,6 +3005,9 @@ def test_profiler_seq_nr(self):
         found_bwd_add = found_bwd_sum = False
         found_empty = False
         for e in p.function_events:
+            # Ignore record_function user scope.
+            if "autograd::engine::evaluate_function" in e.name:
+                continue
             if e.name == "aten::add":
                 add_seq_nr = e.sequence_nr
                 self.assertFalse(found_add)
diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp
index acd7971aad6a7..4ea002a8312f1 100644
--- a/torch/csrc/autograd/engine.cpp
+++ b/torch/csrc/autograd/engine.cpp
@@ -419,7 +419,18 @@ auto Engine::thread_main(const std::shared_ptr<GraphTask>& graph_task) -> void {
           // callbacks.
           GraphTaskGuard guard(local_graph_task);
           NodeGuard ndguard(task.fn_);
-          evaluate_function(local_graph_task, task.fn_.get(), task.inputs_, local_graph_task->cpu_ready_queue_);
+          {
+            RECORD_FUNCTION(
+                c10::str(
+                    "autograd::engine::evaluate_function: ",
+                    task.fn_.get()->name()),
+                std::vector<c10::IValue>());
+            evaluate_function(
+                local_graph_task,
+                task.fn_.get(),
+                task.inputs_,
+                local_graph_task->cpu_ready_queue_);
+          }
         } catch (std::exception& e) {
           thread_on_exception(local_graph_task, task.fn_, e);
         }

From 468001600cb38423deeec0ba0abc6ca33e3c60e4 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 1 Sep 2021 12:38:39 -0700
Subject: [PATCH 429/530] Back out "Revert D30327514: [Pytorch lite predictor]
 Use KinetoEdgeCPUProfiler for operator profiling." (#64307)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64307

Original commit changeset: 0b2aa7c57d08

Restores original changes.
This diff changes the way operator profiling is done in lite predictor
benchmarking binary.
Instead of using custom callbacks it uses KinetoEdgeCPUProfiler to profile
events and then generate operator level metric from it.
Since KinetoEvents do not contain cpu clock time, now we report only wallclock
time.
This unifies various profiling effort that we have for benchmarking purpose. In
production we will still use observer based mechanism, but the advantage of
using kineto profiler is that we get few other things for free, such as:
chrome trace generation.
operator level memory profiling (to be added)
flop counts (to be added)
Furthermore possible we can use python post processing script to parse chrome
trace and generate output similar to torch.profiler. (To be done)

Furthermore removes some tests from test_lite_interpreter.cpp which were testing module hierarchy in debug info. They should be covered by test_mobile_profiler.cpp.

Test Plan:
aibench run
Model without debug info:
https://www.internalfb.com/intern/aibench/details/219598441154763
Model with debug info and --print_module_info true (see Operator summary has now module hierarchy information).
https://www.internalfb.com/intern/aibench/details/617154236292985

Reviewed By: raziel

Differential Revision: D30680354

fbshipit-source-id: b6ba0d59c510c13d13d9935b1d8051cc82ffa4e9
---
 test/cpp/jit/test_lite_interpreter.cpp  | 319 ------------------------
 tools/build_variables.bzl               |   4 +-
 torch/csrc/jit/mobile/debug_info.cpp    |  15 +-
 torch/csrc/jit/mobile/import.cpp        |   3 +
 torch/csrc/jit/mobile/interpreter.cpp   |   3 +
 torch/csrc/jit/mobile/module.cpp        |   3 +-
 torch/csrc/jit/mobile/module.h          |  11 +-
 torch/csrc/jit/mobile/profiler_edge.cpp |  45 +++-
 torch/csrc/jit/mobile/profiler_edge.h   |   5 +
 9 files changed, 72 insertions(+), 336 deletions(-)

diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp
index 3bd2becd8779d..26100b3b6f508 100644
--- a/test/cpp/jit/test_lite_interpreter.cpp
+++ b/test/cpp/jit/test_lite_interpreter.cpp
@@ -456,144 +456,6 @@ TEST(LiteInterpreterTest, BuiltinFunction) {
 }
 
 #if !defined FB_XPLAT_BUILD
-TEST(LiteInterpreterTest, ModuleInfoBasic) {
-  Module m("M");
-  m.define(R"JIT(
-    def forward(self, x):
-      return 2 * x
-  )JIT");
-
-  std::stringstream ss;
-  m._save_for_mobile(ss, {}, true);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  std::unordered_set<std::string> module_debug_info_set;
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      if (!module_info.empty() &&
-          (module_info.find("debug_handle") == std::string::npos)) {
-        module_debug_info_set.insert(module_info);
-      }
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-
-  AT_ASSERT(module_debug_info_set.count("top(M)::<unknown>.aten::mul"));
-}
-
-TEST(LiteInterpreterTest, NotSaveModuleInfo) {
-  Module m("M");
-  m.define(R"JIT(
-    def forward(self, x):
-      return x + 5
-  )JIT");
-
-  std::stringstream ss;
-  m._save_for_mobile(ss);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      AT_ASSERT(
-          module_info.empty() ||
-          (module_info.find("debug_handle") != std::string::npos));
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-}
-
-TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) {
-  Module a("A");
-  a.define(R"JIT(
-    def forward(self, x):
-      return 2 * x + 5
-  )JIT");
-  Module b("B");
-  b.register_module("A0", a);
-  b.define(R"JIT(
-    def forward(self, x):
-      return self.A0.forward(x) + 1
-  )JIT");
-
-  std::stringstream ss;
-  b._save_for_mobile(ss, {}, true);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  std::set<std::string> module_debug_info_set;
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      if (!module_info.empty() &&
-          (module_info.find("debug_handle") == std::string::npos)) {
-        module_debug_info_set.insert(module_info);
-      }
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-
-  AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(B)::<unknown>.A0(A)::forward.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(B)::<unknown>.A0(A)::forward.aten::mul"));
-}
-
-TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) {
-  Module a("A");
-  a.define(R"JIT(
-    def forward(self, x):
-      return x + 1
-  )JIT");
-  Module b("B");
-  b.define(R"JIT(
-    def forward(self, x):
-      return x + 2
-  )JIT");
-  Module c("C");
-  c.register_module("A0", a);
-  c.register_module("B0", b);
-  c.define(R"JIT(
-    def forward(self, x):
-      return self.A0.forward(x) + self.B0.forward(x)
-  )JIT");
-
-  std::stringstream ss;
-  c._save_for_mobile(ss, {}, true);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  std::set<std::string> module_debug_info_set;
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      if (!module_info.empty() &&
-          (module_info.find("debug_handle") == std::string::npos)) {
-        module_debug_info_set.insert(module_info);
-      }
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-
-  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(C)::<unknown>.A0(A)::forward.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(C)::<unknown>.B0(B)::forward.aten::add"));
-}
-
 TEST(LiteInterpreterTest, GetRuntimeByteCodeVersion) {
   auto runtime_bytecode_version = _get_runtime_bytecode_version();
   AT_ASSERT(
@@ -795,187 +657,6 @@ TEST(LiteInterpreterTest, isCompatibleFail) {
   AT_ASSERT(result.status = ModelCompatibilityStatus::ERROR);
 }
 
-#if !defined FB_XPLAT_BUILD
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-TEST(LiteInterpreterTest, SequentialModuleInfo) {
-  Module a("A");
-  a.define(R"JIT(
-    def forward(self, x):
-      return x + 1
-  )JIT");
-  Module b("B");
-  b.define(R"JIT(
-    def forward(self, x):
-      return x + 2
-  )JIT");
-  Module c("C");
-  c.register_module("A0", a);
-  c.register_module("B0", b);
-  c.define(R"JIT(
-    def forward(self, x):
-      return self.A0.forward(self.B0.forward(x))
-  )JIT");
-
-  std::stringstream ss;
-  c._save_for_mobile(ss, {}, true);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  std::set<std::string> module_debug_info_set;
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      if (!module_info.empty() &&
-          (module_info.find("debug_handle") == std::string::npos)) {
-        module_debug_info_set.insert(module_info);
-      }
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-
-  // class A(nn.Module):
-  //   def __init__(self):
-  //     super(A, self).__init__()
-
-  //   def forward(self, x):
-  //     return x + 1
-
-  // class B(nn.Module):
-  //   def __init__(self):
-  //     super(B, self).__init__()
-
-  //   def forward(self, x):
-  //     return x + 2
-
-  // class C(nn.Module):
-  //   def __init__(self):
-  //     super(C, self).__init__()
-  //     self.A0 = A()
-  //     self.B0 = B()
-
-  //   def forward(self, x):
-  //     return self.A0.forward(self.B0.forward(x))
-
-  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.prim::Return"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(C)::<unknown>.A0(A)::forward.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(C)::<unknown>.B0(B)::forward.aten::add"));
-}
-
-TEST(LiteInterpreterTest, HierarchyModuleInfo) {
-  Module a("A");
-  a.define(R"JIT(
-    def forward(self, x):
-      return x + 1
-  )JIT");
-  Module b("B");
-  b.register_module("A0", a);
-  b.define(R"JIT(
-    def forward(self, x):
-      return self.A0.forward(x) + 1
-  )JIT");
-  Module c("C");
-  c.register_module("B0", b);
-  c.define(R"JIT(
-    def forward(self, x):
-      return self.B0.forward(x) + 1
-  )JIT");
-
-  std::stringstream ss;
-  c._save_for_mobile(ss, {}, true);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  std::set<std::string> module_debug_info_set;
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      if (!module_info.empty() &&
-          (module_info.find("debug_handle") == std::string::npos)) {
-        module_debug_info_set.insert(module_info);
-      }
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-
-  // There are 3 module information strings here.
-  // "top(C).forward": for the add operator in top.
-  // "top(C).B0(B).forward": for the add operator in B0.
-  // "top(C).B0(B).forward.A0(A).forward": for the add operator in A0.
-  AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(C)::<unknown>.B0(B)::forward.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(C)::<unknown>.B0(B)::forward.A0(A)::forward.aten::add"));
-}
-
-TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) {
-  Module a("A");
-  a.define(R"JIT(
-    def forward(self, x):
-      return x + 5
-  )JIT");
-  Module b("B");
-  b.register_module("A0", a);
-  b.register_module("A1", a);
-  b.define(R"JIT(
-    def forward(self, x):
-      return self.A0.forward(x) + self.A1.forward(x)
-  )JIT");
-
-  std::stringstream ss;
-  b._save_for_mobile(ss, {}, true);
-  mobile::Module bc = _load_for_mobile(ss);
-
-  std::set<std::string> module_debug_info_set;
-  size_t pc = 0;
-  while (true) {
-    try {
-      std::string module_info = bc.get_forward_method_debug_info(pc);
-      if (!module_info.empty() &&
-          (module_info.find("debug_handle") == std::string::npos)) {
-        module_debug_info_set.insert(module_info);
-      }
-      ++pc;
-    } catch (const std::exception& e) {
-      break;
-    }
-  }
-
-  // class A(nn.Module):
-  //   def __init__(self):
-  //     super(A, self).__init__()
-
-  //   def forward(self, x):
-  //     return x + 5
-
-  // class B(nn.Module):
-  //   def __init__(self):
-  //     super(B, self).__init__()
-  //     self.A0 = A()
-  //     self.A1 = A()
-
-  //   def forward(self, x):
-  //     return self.A0.forward(x) + self.A1.forward(x)
-
-  // There are 3 module information strings here.
-  // "top(B).forward": for the add operator in top.
-  // "top(B).A0(A).forward": for the add operator in A0.
-  // "top(B).A1(A).forward": for the add operator in A1.
-
-  AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(B)::<unknown>.A0(A)::forward.aten::add"));
-  AT_ASSERT(module_debug_info_set.count(
-      "top(B)::<unknown>.A1(A)::forward.aten::add"));
-}
-#endif // !defined(FB_XPLAT_BUILD)
-
 TEST(LiteInterpreterTest, Eval) {
   std::vector<torch::jit::IValue> inputs;
 
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 34846b5d6c7b3..c4731570e6d77 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -319,7 +319,7 @@ core_sources_full_mobile_no_backend_interface = [
     "torch/csrc/jit/testing/hooks_for_testing.cpp",
     "torch/csrc/utils/tensor_flatten.cpp",
     "torch/csrc/utils/variadic.cpp",
-] + libtorch_profiler_sources
+]
 
 core_sources_full_mobile = core_sources_full_mobile_no_backend_interface + [
     "torch/csrc/jit/backends/backend_debug_info.cpp",
@@ -337,7 +337,7 @@ core_sources_full = core_sources_full_mobile + [
     "torch/csrc/jit/tensorexpr/external_functions_codegen.cpp",
 ]
 
-libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources)
+libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources + libtorch_profiler_sources)
 
 # These files are the only ones that are supported on Windows.
 libtorch_distributed_base_sources = [
diff --git a/torch/csrc/jit/mobile/debug_info.cpp b/torch/csrc/jit/mobile/debug_info.cpp
index 41ce3c6d46d52..a75ffe16c61f5 100644
--- a/torch/csrc/jit/mobile/debug_info.cpp
+++ b/torch/csrc/jit/mobile/debug_info.cpp
@@ -13,6 +13,12 @@ namespace jit {
 
 namespace {
 
+C10_ALWAYS_INLINE std::string debugHandlesNotFoundMessage(
+    const std::string& debug_handles_string) {
+  return "Debug info for handle(s): " + debug_handles_string +
+      ", was not found.";
+}
+
 std::pair<std::vector<StackEntry>, std::string> getStackTraceWithModuleHierarchy(
     const DebugInfoTuple& source_callstack,
     const std::string& caller_name) {
@@ -152,8 +158,7 @@ std::string MobileDebugTable::getModuleHierarchyInfo(
     const std::string& top_module_type_name) const {
   const auto it = callstack_ptr_map_.find(debug_handle);
   if (it == callstack_ptr_map_.end()) {
-    return "Module info for handle, " + std::to_string(debug_handle) +
-        ", not found.";
+    return debugHandlesNotFoundMessage(std::to_string(debug_handle));
   }
   return (getStackTraceWithModuleHierarchy(
               {it->second}, "top", top_module_type_name))
@@ -172,8 +177,7 @@ std::string MobileDebugTable::getSourceDebugString(
     const std::string& top_module_type_name) const {
   const auto it = callstack_ptr_map_.find(debug_handle);
   if (it == callstack_ptr_map_.end()) {
-    return "Debug info for handle, " + std::to_string(debug_handle) +
-        ", not found.";
+    return debugHandlesNotFoundMessage(std::to_string(debug_handle));
   }
   return (getStackTraceWithModuleHierarchy(
               {it->second}, "top", top_module_type_name))
@@ -208,8 +212,7 @@ std::pair<std::string, std::string> MobileDebugTable::
       debug_handles_string += std::to_string(debug_handle);
     }
     debug_handles_string += "}";
-    debug_handles_string =
-        "Debug info for handles: " + debug_handles_string + ", was not found.";
+    debug_handles_string = debugHandlesNotFoundMessage(debug_handles_string);
     return {debug_handles_string, debug_handles_string};
   }
   return (getStackTraceWithModuleHierarchy(
diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp
index 6a548103f6965..99be225255ffb 100644
--- a/torch/csrc/jit/mobile/import.cpp
+++ b/torch/csrc/jit/mobile/import.cpp
@@ -517,12 +517,15 @@ mobile::Module BytecodeDeserializer::deserialize(
   auto bvals = std::move(*readArchive("bytecode", mcu).toTuple()).elements();
 
   c10::optional<std::vector<IValue>> debug_handles;
+  bool has_debug_handles{false};
   if (reader_->hasRecord("mobile_debug_handles.pkl")) {
     debug_handles =
         readArchive("mobile_debug_handles", mcu).toTuple()->elements();
+    has_debug_handles = true;
   }
   parseMethods(bvals, debug_handles, *mcu);
   auto m = mobile::Module(readArchive("data", mcu).toObject(), mcu);
+  m.setHasDebugHandles(has_debug_handles);
 #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
   MobileDebugTable debug_table = MobileDebugTable(reader_, compilation_unit_);
   m.setDebugTable(std::move(debug_table));
diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp
index 02e7c35792693..ab558cd2bf5e0 100644
--- a/torch/csrc/jit/mobile/interpreter.cpp
+++ b/torch/csrc/jit/mobile/interpreter.cpp
@@ -57,6 +57,9 @@ bool InterpreterState::run(Stack& stack) {
       auto inst_with_handle = code_->instructions_with_handles_.at(pc);
       Instruction inst = inst_with_handle.instruction;
       DebugHandle debug_handle = inst_with_handle.debug_handle;
+      // If no valid debug handle found then just log pc.
+      // This is possible when we did not save debug handles
+      debug_handle = debug_handle == -1 ? pc : debug_handle;
 
       // std::cout << "RUNNING " << pc << " "
       //           << code_->instructions_with_handles_[pc].instruction;
diff --git a/torch/csrc/jit/mobile/module.cpp b/torch/csrc/jit/mobile/module.cpp
index c04d9f74b7378..c74ca138d848a 100644
--- a/torch/csrc/jit/mobile/module.cpp
+++ b/torch/csrc/jit/mobile/module.cpp
@@ -145,8 +145,7 @@ std::string Module::getCallStack(const int64_t debug_handle) const {
 // We really need to change this part, so in the next step for profiling support
 // for delegates, the first thing will be to rewrite how profiling is done
 // for lite interpreter.
-std::string Module::get_forward_method_debug_info(size_t pc) const {
-  auto debug_handle = find_method("forward")->get_debug_handle(pc);
+std::string Module::get_forward_method_debug_info(int64_t debug_handle) const {
 #if defined(SYMBOLICATE_MOBILE_DEBUG_HANDLE)
   return getDebugTable().getModuleHierarchyInfo(
       debug_handle, getTopModuleTypeName(*this));
diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h
index 73637aa4584a0..6102aa517df66 100644
--- a/torch/csrc/jit/mobile/module.h
+++ b/torch/csrc/jit/mobile/module.h
@@ -78,7 +78,7 @@ class TORCH_API Module {
   }
   const std::vector<at::Tensor> parameters() const;
   const std::map<std::string, at::Tensor> named_parameters() const;
-  std::string get_forward_method_debug_info(size_t pc) const;
+  std::string get_forward_method_debug_info(int64_t debug_handle) const;
   std::string getModuleHierarchy(const int64_t debug_handle) const;
   std::string getCallStack(const int64_t debug_handle) const;
   /// Enables "training" mode.
@@ -115,11 +115,20 @@ class TORCH_API Module {
     return debug_table_;
   }
 
+  void setHasDebugHandles(bool has_debug_handles) {
+    has_debug_handles_ = has_debug_handles;
+  }
+
+  bool hasDebugHandles() const {
+    return has_debug_handles_;
+  }
+
  private:
   c10::intrusive_ptr<c10::ivalue::Object> object_;
   std::unordered_map<std::string, std::string> metadata_;
   std::shared_ptr<CompilationUnit> cu_;
   MobileDebugTable debug_table_;
+  bool has_debug_handles_;
 };
 } // namespace mobile
 } // namespace jit
diff --git a/torch/csrc/jit/mobile/profiler_edge.cpp b/torch/csrc/jit/mobile/profiler_edge.cpp
index bcd5a6258ee7c..162e43f0982a6 100644
--- a/torch/csrc/jit/mobile/profiler_edge.cpp
+++ b/torch/csrc/jit/mobile/profiler_edge.cpp
@@ -2,7 +2,6 @@
 #include <string>
 #include <vector>
 
-namespace profiler = torch::autograd::profiler;
 namespace torch {
 namespace jit {
 namespace mobile {
@@ -27,17 +26,26 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
   if (with_modules || with_stack) {
     auto post_processing = [this, with_stack, with_modules](
                                std::vector<profiler::KinetoEvent>& events) {
+      std::string no_debug_info("Model was not saved with debug information");
       for (auto& e : events) {
         if (with_modules) {
           // Since KinetoEvents's module hierarchy takes vector of strings we
           // just construct a temporary vector using one string element
-          e.moduleHierarchy(std::vector<std::string>(
-              {this->m_.getModuleHierarchy(e.debugHandle())}));
+          if (this->m_.hasDebugHandles()) {
+            e.moduleHierarchy(std::vector<std::string>(
+                {this->m_.getModuleHierarchy(e.debugHandle())}));
+          } else {
+            e.moduleHierarchy(std::vector<std::string>({no_debug_info}));
+          }
         } else if (with_stack) {
           // Since KinetoEvents's stack trace takes vector of strings we just
           // construct a temporary vector using one string element
-          e.stack(std::vector<std::string>(
-              {this->m_.getCallStack(e.debugHandle())}));
+          if (this->m_.hasDebugHandles()) {
+            e.stack(std::vector<std::string>(
+                {this->m_.getCallStack(e.debugHandle())}));
+          } else {
+            e.stack(std::vector<std::string>({no_debug_info}));
+          }
         }
       }
     };
@@ -55,8 +63,33 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
   trace_file_name_ = fname;
 }
 
+const std::unique_ptr<profiler::ProfilerResult>& KinetoEdgeCPUProfiler::
+    disableProfiler() {
+  TORCH_CHECK(
+      !profiler_result_,
+      "KinetoEdgeCPUProfiler already disabled. "
+      "To get list of events use getProfilerResults()");
+  profiler_result_ = profiler::disableProfiler();
+  return profiler_result_;
+}
+
+const std::unique_ptr<profiler::ProfilerResult>& KinetoEdgeCPUProfiler::
+    getProfilerResult() {
+  TORCH_CHECK(
+      profiler_result_,
+      "KinetoEdgeCPUProfiler has not been disabled. "
+      "use disableProfiler() API first, which returns the ProfilerResult.");
+  return profiler_result_;
+}
+
 KinetoEdgeCPUProfiler::~KinetoEdgeCPUProfiler() {
-  profiler::disableProfiler()->save(trace_file_name_);
+  if (!trace_file_name_.empty()) {
+    if (profiler_result_) {
+      profiler_result_->save(trace_file_name_);
+    } else {
+      profiler::disableProfiler()->save(trace_file_name_);
+    }
+  }
 }
 } // namespace mobile
 } // namespace jit
diff --git a/torch/csrc/jit/mobile/profiler_edge.h b/torch/csrc/jit/mobile/profiler_edge.h
index a245034e34f9b..ef37e01ed4c71 100644
--- a/torch/csrc/jit/mobile/profiler_edge.h
+++ b/torch/csrc/jit/mobile/profiler_edge.h
@@ -2,6 +2,7 @@
 #include <torch/csrc/autograd/profiler_kineto.h>
 #include <torch/csrc/jit/mobile/module.h>
 
+namespace profiler = torch::autograd::profiler;
 namespace torch {
 namespace jit {
 namespace mobile {
@@ -53,6 +54,9 @@ class TORCH_API KinetoEdgeCPUProfiler {
       const bool with_flops = false,
       const bool with_modules = false);
 
+  const std::unique_ptr<profiler::ProfilerResult>& disableProfiler();
+  const std::unique_ptr<profiler::ProfilerResult>& getProfilerResult();
+
   ~KinetoEdgeCPUProfiler();
 
  private:
@@ -62,6 +66,7 @@ class TORCH_API KinetoEdgeCPUProfiler {
    */
   const mobile::Module& m_;
   std::string trace_file_name_;
+  std::unique_ptr<profiler::ProfilerResult> profiler_result_;
 };
 } // namespace mobile
 } // namespace jit

From 03a58a2ba0a18ba4e8d41ad1a8cd8431ac1e5a4b Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Wed, 1 Sep 2021 13:24:11 -0700
Subject: [PATCH 430/530] [Caffe2] Create fewer strings during argument
 fetching (#64285)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64285

With C++14 heterogeneous ordered container lookup, it is no longer necessary to create a `std::string` in order to look up elements of a `CaffeMap` keyed by std::string. Accordingly, this diff reworks the argument-getting operator functions to avoid that in favor of `c10::string_view`.
ghstack-source-id: 137139818
ghstack-source-id: 137139818

Test Plan: buildsizebot iOS apps -- code size win. less strings is probably marginally good for perf but this only happens at setup time anyway.

Reviewed By: dzhulgakov

Differential Revision: D26826676

fbshipit-source-id: ee653b14dc2c528bae8c90f0fc6a7a419cbca1d6
---
 aten/src/ATen/core/function_schema.h |  3 +-
 caffe2/core/operator.cc              |  2 +-
 caffe2/core/operator.h               | 17 ++++----
 caffe2/utils/proto_utils.cc          | 49 ++++++++++++---------
 caffe2/utils/proto_utils.h           | 64 +++++++++++++++++-----------
 5 files changed, 79 insertions(+), 56 deletions(-)

diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h
index a7b514990185b..f4b11fc4a304a 100644
--- a/aten/src/ATen/core/function_schema.h
+++ b/aten/src/ATen/core/function_schema.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <c10/util/StringUtil.h>
+#include <c10/util/string_view.h>
 #include <ATen/core/jit_type.h>
 #include <ATen/core/interned_strings.h>
 #include <ATen/core/ivalue.h>
@@ -272,7 +273,7 @@ struct FunctionSchema {
         });
   }
 
-  c10::optional<int> argumentIndexWithName(const std::string& name) const {
+  c10::optional<int> argumentIndexWithName(c10::string_view name) const {
     for(size_t i = 0; i < arguments().size(); ++i) {
       if(name == arguments()[i].name())
         return i;
diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc
index ca66f7846c300..e25c92a6d6075 100644
--- a/caffe2/core/operator.cc
+++ b/caffe2/core/operator.cc
@@ -831,7 +831,7 @@ std::function<void(const OperatorDef&)> GetOperatorLogger() {
 }
 
 c10::optional<int> OperatorBase::argumentIndexWithName(
-    const std::string& name) const {
+    c10::string_view name) const {
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   return getFunctionSchema().argumentIndexWithName(name);
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
index b840254612929..15d1ead352762 100644
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@@ -15,6 +15,7 @@
 
 #include <c10/macros/Macros.h>
 #include <c10/util/Registry.h>
+#include <c10/util/string_view.h>
 #include <c10/util/typeid.h>
 #include <c10/core/Stream.h>
 #include "caffe2/core/blob.h"
@@ -97,7 +98,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
 
   /** @brief Checks if the operator has an argument of the given name.
    */
-  inline bool HasArgument(const string& name) const {
+  inline bool HasArgument(c10::string_view name) const {
     if (isLegacyOperator()) {
       CAFFE_ENFORCE(operator_def_, "operator_def was null!");
       return ArgumentHelper::HasArgument(*operator_def_, name);
@@ -108,7 +109,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
   // Functions that deal with arguments. Basically, this allows us to map an
   // argument name to a specific type of argument that we are trying to access.
   template <typename T>
-  inline T GetSingleArgument(const string& name, const T& default_value) const {
+  inline T GetSingleArgument(c10::string_view name, const T& default_value) const {
     if (isLegacyOperator()) {
       CAFFE_ENFORCE(operator_def_, "operator_def was null!");
       return ArgumentHelper::GetSingleArgument<OperatorDef, T>(
@@ -126,7 +127,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
   }
 
   template <typename T>
-  inline bool HasSingleArgumentOfType(const string& name) const {
+  inline bool HasSingleArgumentOfType(c10::string_view name) const {
     CAFFE_ENFORCE(operator_def_, "operator_def was null!");
     return ArgumentHelper::HasSingleArgumentOfType<OperatorDef, T>(
         *operator_def_, name);
@@ -141,7 +142,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
 
   template <typename T>
   inline vector<T> GetRepeatedArgument(
-      const string& name,
+      c10::string_view name,
       const vector<T>& default_value = {}) const;
 
   // Get the inputs and outputs as specific types.
@@ -654,7 +655,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     }
   }
 
-  c10::optional<int> argumentIndexWithName(const std::string& name) const;
+  c10::optional<int> argumentIndexWithName(c10::string_view name) const;
 
   // An event used by asynchronous execution.
   std::unique_ptr<Event> event_;
@@ -664,7 +665,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
 
 template <>
 inline NetDef OperatorBase::GetSingleArgument<NetDef>(
-    const std::string& name,
+    c10::string_view name,
     const NetDef& default_value) const {
   if (isLegacyOperator()) {
     CAFFE_ENFORCE(operator_def_, "operator_def was null!");
@@ -756,7 +757,7 @@ inline vector<int16_t> OperatorBase::GetVectorFromIValueList<int16_t>(
 
 template <typename T>
 inline vector<T> OperatorBase::GetRepeatedArgument(
-    const string& name,
+    c10::string_view name,
     const vector<T>& default_value) const {
   if (isLegacyOperator()) {
     CAFFE_ENFORCE(operator_def_, "operator_def was null!");
@@ -778,7 +779,7 @@ inline vector<T> OperatorBase::GetRepeatedArgument(
 // int16_t. We need to load it as List<int64_t> and transform to int16_t.
 template <>
 inline vector<int16_t> OperatorBase::GetRepeatedArgument<int16_t>(
-    const string& name,
+    c10::string_view name,
     const vector<int16_t>& default_value) const {
   if (isLegacyOperator()) {
     CAFFE_ENFORCE(operator_def_, "operator_def was null!");
diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc
index d2aa59e02b63f..db379462e5347 100644
--- a/caffe2/utils/proto_utils.cc
+++ b/caffe2/utils/proto_utils.cc
@@ -323,8 +323,12 @@ C10_EXPORT ArgumentHelper::ArgumentHelper(const NetDef& netdef) {
   }
 }
 
-C10_EXPORT bool ArgumentHelper::HasArgument(const string& name) const {
+C10_EXPORT bool ArgumentHelper::HasArgument(c10::string_view name) const {
+#ifdef CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP
   return arg_map_.count(name);
+#else
+  return arg_map_.count(std::string(name));
+#endif
 }
 
 namespace {
@@ -364,18 +368,19 @@ std::ostream& operator<<(std::ostream& output, const NetDef& n) {
     T, fieldname, enforce_lossless_conversion)                         \
   template <>                                                          \
   C10_EXPORT T ArgumentHelper::GetSingleArgument<T>(                   \
-      const string& name, const T& default_value) const {              \
-    if (arg_map_.count(name) == 0) {                                   \
+      c10::string_view name, const T& default_value) const {           \
+    auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);                      \
+    if (it == arg_map_.end()) {                                         \
       VLOG(1) << "Using default parameter value " << default_value     \
               << " for parameter " << name;                            \
       return default_value;                                            \
     }                                                                  \
     CAFFE_ENFORCE(                                                     \
-        arg_map_.at(name).has_##fieldname(),                           \
+        it->second.has_##fieldname(),                                  \
         "Argument ",                                                   \
         name,                                                          \
         " does not have the right field: expected field " #fieldname); \
-    auto value = arg_map_.at(name).fieldname();                        \
+    auto value = it->second.fieldname();                               \
     if (enforce_lossless_conversion) {                                 \
       auto supportsConversion =                                        \
           SupportsLosslessConversion<decltype(value), T>(value);       \
@@ -391,11 +396,12 @@ std::ostream& operator<<(std::ostream& output, const NetDef& n) {
   }                                                                    \
   template <>                                                          \
   C10_EXPORT bool ArgumentHelper::HasSingleArgumentOfType<T>(          \
-      const string& name) const {                                      \
-    if (arg_map_.count(name) == 0) {                                   \
+      c10::string_view name) const {                                   \
+    auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);                     \
+    if (it == arg_map_.end()) {                                        \
       return false;                                                    \
     }                                                                  \
-    return arg_map_.at(name).has_##fieldname();                        \
+    return it->second.has_##fieldname();                               \
   }
 
 INSTANTIATE_GET_SINGLE_ARGUMENT(float, f, false)
@@ -415,13 +421,14 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(NetDef, n, false)
 #define INSTANTIATE_GET_REPEATED_ARGUMENT(                             \
     T, fieldname, enforce_lossless_conversion)                         \
   template <>                                                          \
-  C10_EXPORT std::vector<T> ArgumentHelper::GetRepeatedArgument<T>(         \
-      const string& name, const std::vector<T>& default_value) const { \
-    if (arg_map_.count(name) == 0) {                                   \
+  C10_EXPORT std::vector<T> ArgumentHelper::GetRepeatedArgument<T>( \
+      c10::string_view name, const std::vector<T>& default_value) const { \
+    auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);                      \
+    if (it == arg_map_.end()) {                                         \
       return default_value;                                            \
     }                                                                  \
-    std::vector<T> values;                                                  \
-    for (const auto& v : arg_map_.at(name).fieldname()) {              \
+    std::vector<T> values;                                           \
+    for (const auto& v : it->second.fieldname()) {                     \
       if (enforce_lossless_conversion) {                               \
         auto supportsConversion =                                      \
             SupportsLosslessConversion<decltype(v), T>(v);             \
@@ -531,7 +538,7 @@ C10_EXPORT bool HasInput(const OperatorDef& op, const std::string& input) {
 // Return the argument index or -1 if it does not exist.
 C10_EXPORT int GetArgumentIndex(
     const google::protobuf::RepeatedPtrField<Argument>& args,
-    const string& name) {
+    c10::string_view name) {
   int index = 0;
   for (const Argument& arg : args) {
     if (arg.name() == name) {
@@ -544,7 +551,7 @@ C10_EXPORT int GetArgumentIndex(
 
 C10_EXPORT const Argument& GetArgument(
     const OperatorDef& def,
-    const string& name) {
+    c10::string_view name) {
   int index = GetArgumentIndex(def.arg(), name);
   if (index != -1) {
     return def.arg(index);
@@ -557,7 +564,7 @@ C10_EXPORT const Argument& GetArgument(
   }
 }
 
-C10_EXPORT const Argument& GetArgument(const NetDef& def, const string& name) {
+C10_EXPORT const Argument& GetArgument(const NetDef& def, c10::string_view name) {
   int index = GetArgumentIndex(def.arg(), name);
   if (index != -1) {
     return def.arg(index);
@@ -572,7 +579,7 @@ C10_EXPORT const Argument& GetArgument(const NetDef& def, const string& name) {
 
 C10_EXPORT const Argument* GetArgumentPtr(
     const OperatorDef& def,
-    const string& name) {
+    c10::string_view name) {
   int index = GetArgumentIndex(def.arg(), name);
   if (index != -1) {
     return &def.arg(index);
@@ -583,7 +590,7 @@ C10_EXPORT const Argument* GetArgumentPtr(
 
 C10_EXPORT const Argument* GetArgumentPtr(
     const NetDef& def,
-    const string& name) {
+    c10::string_view name) {
   int index = GetArgumentIndex(def.arg(), name);
   if (index != -1) {
     return &def.arg(index);
@@ -594,7 +601,7 @@ C10_EXPORT const Argument* GetArgumentPtr(
 
 C10_EXPORT bool GetFlagArgument(
     const google::protobuf::RepeatedPtrField<Argument>& args,
-    const string& name,
+    c10::string_view name,
     bool default_value) {
   int index = GetArgumentIndex(args, name);
   if (index != -1) {
@@ -609,13 +616,13 @@ C10_EXPORT bool GetFlagArgument(
 
 C10_EXPORT bool GetFlagArgument(
     const OperatorDef& def,
-    const string& name,
+    c10::string_view name,
     bool default_value) {
   return GetFlagArgument(def.arg(), name, default_value);
 }
 
 C10_EXPORT bool
-GetFlagArgument(const NetDef& def, const string& name, bool default_value) {
+GetFlagArgument(const NetDef& def, c10::string_view name, bool default_value) {
   return GetFlagArgument(def.arg(), name, default_value);
 }
 
diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h
index 57676982c7851..b5c6b312b3ab3 100644
--- a/caffe2/utils/proto_utils.h
+++ b/caffe2/utils/proto_utils.h
@@ -8,10 +8,18 @@
 #endif  // !CAFFE2_USE_LITE_PROTO
 
 #include <c10/util/Logging.h>
+#include <c10/util/string_view.h>
 
 #include "caffe2/utils/proto_wrap.h"
 #include "caffe2/proto/caffe2_pb.h"
 
+#ifndef C10_ANDROID
+#define CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP
+#define CAFFE2_ARG_MAP_FIND(map, key) map.find(key)
+#else
+#define CAFFE2_ARG_MAP_FIND(map, key) map.find(std::string(key))
+#endif
+
 namespace caffe2 {
 
 using std::string;
@@ -204,40 +212,40 @@ TORCH_API bool HasInput(const OperatorDef& op, const std::string& input);
 class C10_EXPORT ArgumentHelper {
  public:
   template <typename Def>
-  static bool HasArgument(const Def& def, const string& name) {
+  static bool HasArgument(const Def& def, c10::string_view name) {
     return ArgumentHelper(def).HasArgument(name);
   }
 
   template <typename Def, typename T>
   static T GetSingleArgument(
       const Def& def,
-      const string& name,
+      c10::string_view name,
       const T& default_value) {
     return ArgumentHelper(def).GetSingleArgument<T>(name, default_value);
   }
 
   template <typename Def, typename T>
-  static bool HasSingleArgumentOfType(const Def& def, const string& name) {
+  static bool HasSingleArgumentOfType(const Def& def, c10::string_view name) {
     return ArgumentHelper(def).HasSingleArgumentOfType<T>(name);
   }
 
   template <typename Def, typename T>
   static std::vector<T> GetRepeatedArgument(
       const Def& def,
-      const string& name,
+      c10::string_view name,
       const std::vector<T>& default_value = std::vector<T>()) {
     return ArgumentHelper(def).GetRepeatedArgument<T>(name, default_value);
   }
 
   template <typename Def, typename MessageType>
-  static MessageType GetMessageArgument(const Def& def, const string& name) {
+  static MessageType GetMessageArgument(const Def& def, c10::string_view name) {
     return ArgumentHelper(def).GetMessageArgument<MessageType>(name);
   }
 
   template <typename Def, typename MessageType>
   static std::vector<MessageType> GetRepeatedMessageArgument(
       const Def& def,
-      const string& name) {
+      c10::string_view name) {
     return ArgumentHelper(def).GetRepeatedMessageArgument<MessageType>(name);
   }
 
@@ -255,24 +263,25 @@ class C10_EXPORT ArgumentHelper {
 
   explicit ArgumentHelper(const OperatorDef& def);
   explicit ArgumentHelper(const NetDef& netdef);
-  bool HasArgument(const string& name) const;
+  bool HasArgument(c10::string_view name) const;
 
   template <typename T>
-  T GetSingleArgument(const string& name, const T& default_value) const;
+  T GetSingleArgument(c10::string_view name, const T& default_value) const;
   template <typename T>
-  bool HasSingleArgumentOfType(const string& name) const;
+  bool HasSingleArgumentOfType(c10::string_view name) const;
   template <typename T>
   std::vector<T> GetRepeatedArgument(
-      const string& name,
+      c10::string_view name,
       const std::vector<T>& default_value = std::vector<T>()) const;
 
   template <typename MessageType>
-  MessageType GetMessageArgument(const string& name) const {
-    CAFFE_ENFORCE(arg_map_.count(name), "Cannot find parameter named ", name);
+  MessageType GetMessageArgument(c10::string_view name) const {
+    auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);
+    CAFFE_ENFORCE(it != arg_map_.end(), "Cannot find parameter named ", name);
     MessageType message;
-    if (arg_map_.at(name).has_s()) {
+    if (it->second.has_s()) {
       CAFFE_ENFORCE(
-          message.ParseFromString(arg_map_.at(name).s()),
+          message.ParseFromString(it->second.s()),
           "Failed to parse content from the string");
     } else {
       VLOG(1) << "Return empty message for parameter " << name;
@@ -281,42 +290,47 @@ class C10_EXPORT ArgumentHelper {
   }
 
   template <typename MessageType>
-  std::vector<MessageType> GetRepeatedMessageArgument(const string& name) const {
-    CAFFE_ENFORCE(arg_map_.count(name), "Cannot find parameter named ", name);
-    std::vector<MessageType> messages(arg_map_.at(name).strings_size());
+  std::vector<MessageType> GetRepeatedMessageArgument(c10::string_view name) const {
+    auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);
+    CAFFE_ENFORCE(it != arg_map_.end(), "Cannot find parameter named ", name);
+    std::vector<MessageType> messages(it->second.strings_size());
     for (int i = 0; i < messages.size(); ++i) {
       CAFFE_ENFORCE(
-          messages[i].ParseFromString(arg_map_.at(name).strings(i)),
+          messages[i].ParseFromString(it->second.strings(i)),
           "Failed to parse content from the string");
     }
     return messages;
   }
 
  private:
-  std::map<string, Argument> arg_map_;
+  std::map<string, Argument
+#ifdef CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP
+  , std::less<>
+#endif
+  > arg_map_;
 };
 
 // **** Arguments Utils *****
 
 // Helper methods to get an argument from OperatorDef or NetDef given argument
 // name. Throws if argument does not exist.
-TORCH_API const Argument& GetArgument(const OperatorDef& def, const string& name);
-TORCH_API const Argument& GetArgument(const NetDef& def, const string& name);
+TORCH_API const Argument& GetArgument(const OperatorDef& def, c10::string_view name);
+TORCH_API const Argument& GetArgument(const NetDef& def, c10::string_view name);
 // Helper methods to get an argument from OperatorDef or NetDef given argument
 // name. Returns nullptr if argument does not exist.
-TORCH_API const Argument* GetArgumentPtr(const OperatorDef& def, const string& name);
-TORCH_API const Argument* GetArgumentPtr(const NetDef& def, const string& name);
+TORCH_API const Argument* GetArgumentPtr(const OperatorDef& def, c10::string_view name);
+TORCH_API const Argument* GetArgumentPtr(const NetDef& def, c10::string_view name);
 
 // Helper methods to query a boolean argument flag from OperatorDef or NetDef
 // given argument name. If argument does not exist, return default value.
 // Throws if argument exists but the type is not boolean.
 TORCH_API bool GetFlagArgument(
     const OperatorDef& def,
-    const string& name,
+    c10::string_view name,
     bool default_value = false);
 TORCH_API bool GetFlagArgument(
     const NetDef& def,
-    const string& name,
+    c10::string_view name,
     bool default_value = false);
 
 TORCH_API Argument* GetMutableArgument(

From 25e2578967494ee88da66820d861c426eb7a742d Mon Sep 17 00:00:00 2001
From: Tanvir Zaman <motanv@fb.com>
Date: Wed, 1 Sep 2021 13:31:45 -0700
Subject: [PATCH 431/530] Fix bytes_written and bytes_read (#64244)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64244

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64040

In operator cost inference functions, in many places we are using sizeof(x.data_type()). Since data_type() returns a 32 bit integer from [this enum](https://www.internalfb.com/code/fbsource/[15e7ffe4073cf08c61077c7c24a4839504b964a2]/fbcode/caffe2/caffe2/proto/caffe2.proto?lines=20), we are basically always getting 4 for sizeof(x.data_type()) no matter what actual data type x has. Big thanks to Jack Langman for specifically pointing to this bug.

We would instead use the size in bytes based on actual data type.

Test Plan:
Added unit tests BatchMatMulMemCostTest:

buck test //caffe2/caffe2/fb/fbgemm:batch_matmul_op_test -- BatchMatMulMemCostTest

Extended existing unit test test_columnwise_concat for different data types:

buck test //caffe2/caffe2/python/operator_test:concat_op_cost_test -- test_columnwise_concat

Reviewed By: CrazySherman

Differential Revision: D30656698

fbshipit-source-id: d42c0c9a0c5b0ddc5dba39e4994f1f85a5e618bf
---
 caffe2/core/operator_schema.h                 |  17 ++-
 caffe2/operators/batch_matmul_op.cc           | 113 ++++++++++--------
 caffe2/operators/concat_split_op.cc           |  15 ++-
 caffe2/operators/conv_pool_op_base.h          |  15 ++-
 caffe2/operators/distance_op.cc               |  28 +++--
 caffe2/operators/fc_inference.cc              |  22 ++--
 caffe2/operators/one_hot_ops.cc               |  30 +++--
 caffe2/operators/utility_ops.cc               |  13 +-
 .../operator_test/concat_op_cost_test.py      |  54 +++++----
 caffe2/python/workspace_test.py               |   2 +-
 caffe2/sgd/adagrad_op.cc                      |  55 ++++++---
 11 files changed, 224 insertions(+), 140 deletions(-)

diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h
index 64f5ef3ed883a..0d048eb8d26e9 100644
--- a/caffe2/core/operator_schema.h
+++ b/caffe2/core/operator_schema.h
@@ -6,12 +6,13 @@
 #include <initializer_list>
 #include <ostream>
 #include <set>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
 #include "c10/util/Registry.h"
 #include "caffe2/core/common.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/core/types.h"
 #include "caffe2/proto/caffe2_pb.h"
 #include "caffe2/utils/filler.h"
 #include "caffe2/utils/proto_utils.h"
@@ -273,8 +274,8 @@ class TORCH_API OpSchema {
   OpSchema&
   Arg(const char* name, const char* description, bool required = false);
 
-#define DECLARE_STANDARD_ARG(name, str)     \
-  static const char* Arg_##name; \
+#define DECLARE_STANDARD_ARG(name, str) \
+  static const char* Arg_##name;        \
   OpSchema& Arg##name(const char* description);
 
   DECLARE_STANDARD_ARG(IsTest, is_test)
@@ -339,7 +340,9 @@ class TORCH_API OpSchema {
     return inplace_enforced_(x, y);
   }
 
-  TORCH_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema);
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const OpSchema& schema);
 
   const std::vector<Argument>& args() const {
     return args_;
@@ -562,8 +565,10 @@ OpSchema::Cost PointwiseCostInference(
   }
 
   c.flops = nElemX * OpsPerPoint;
-  c.bytes_read = nElemRead * sizeof(X.data_type());
-  c.bytes_written = nElemX * sizeof(X.data_type());
+  auto const& X_element_size_byte =
+      DataTypeToTypeMeta(X.data_type()).itemsize();
+  c.bytes_read = nElemRead * X_element_size_byte;
+  c.bytes_written = nElemX * X_element_size_byte;
   return c;
 }
 
diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc
index 32799ced10671..205acf74f1572 100644
--- a/caffe2/operators/batch_matmul_op.cc
+++ b/caffe2/operators/batch_matmul_op.cc
@@ -1,6 +1,7 @@
 #include "caffe2/operators/batch_matmul_op.h"
 
 #include "caffe2/core/operator_schema.h"
+#include "caffe2/core/types.h"
 
 namespace caffe2 {
 
@@ -116,9 +117,13 @@ OpSchema::Cost CostInferenceForBatchMatMul(
     K = in[0].dims(ndims_A - 1);
   }
 
+  auto const& A_element_size_byte =
+      DataTypeToTypeMeta(A.data_type()).itemsize();
+  auto const& Y_element_size_byte =
+      DataTypeToTypeMeta(Y.data_type()).itemsize();
   c.flops = 2 * nElemY * K;
-  c.bytes_read = (nElemA + nElemB) * sizeof(A.data_type());
-  c.bytes_written = nElemY * sizeof(Y.data_type());
+  c.bytes_read = (nElemA + nElemB) * A_element_size_byte;
+  c.bytes_written = nElemY * Y_element_size_byte;
   c.params_bytes = 0;
   return c;
 }
@@ -180,72 +185,76 @@ class GetBatchMatMulGradient : public GradientMakerBase {
     auto no_trans_arg = vector<Argument>();
     auto trans_a_arg = vector<Argument>{MakeArgument<int>("trans_a", 1)};
     auto trans_b_arg = vector<Argument>{MakeArgument<int>("trans_b", 1)};
-    auto trans_both_arg = vector<Argument>{MakeArgument<int>("trans_a", 1),
-                                           MakeArgument<int>("trans_b", 1)};
+    auto trans_both_arg = vector<Argument>{
+        MakeArgument<int>("trans_a", 1), MakeArgument<int>("trans_b", 1)};
 
     if (trans_a) {
       if (trans_b) {
         // A'B':
         // dA = B'G', dB = G'A'
-        return vector<OperatorDef>{CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{I(1), GO(0)},
-                                       vector<string>{GI(0)},
-                                       trans_both_arg),
-                                   CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{GO(0), I(0)},
-                                       vector<string>{GI(1)},
-                                       trans_both_arg)};
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(1), GO(0)},
+                vector<string>{GI(0)},
+                trans_both_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(0)},
+                vector<string>{GI(1)},
+                trans_both_arg)};
       } else {
         // A'B:
         // dA = BG', dB = AG
-        return vector<OperatorDef>{CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{I(1), GO(0)},
-                                       vector<string>{GI(0)},
-                                       trans_b_arg),
-                                   CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{I(0), GO(0)},
-                                       vector<string>{GI(1)},
-                                       no_trans_arg)};
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(1), GO(0)},
+                vector<string>{GI(0)},
+                trans_b_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(0), GO(0)},
+                vector<string>{GI(1)},
+                no_trans_arg)};
       }
     } else {
       if (trans_b) {
         // AB':
         // dA = GB, dB = G'A
-        return vector<OperatorDef>{CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{GO(0), I(1)},
-                                       vector<string>{GI(0)},
-                                       no_trans_arg),
-                                   CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{GO(0), I(0)},
-                                       vector<string>{GI(1)},
-                                       trans_a_arg)};
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(1)},
+                vector<string>{GI(0)},
+                no_trans_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(0)},
+                vector<string>{GI(1)},
+                trans_a_arg)};
       } else {
         // AB:
         // dA = GB', dB = A'G
-        return vector<OperatorDef>{CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{GO(0), I(1)},
-                                       vector<string>{GI(0)},
-                                       trans_b_arg),
-                                   CreateOperatorDef(
-                                       "BatchMatMul",
-                                       "",
-                                       vector<string>{I(0), GO(0)},
-                                       vector<string>{GI(1)},
-                                       trans_a_arg)};
+        return vector<OperatorDef>{
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{GO(0), I(1)},
+                vector<string>{GI(0)},
+                trans_b_arg),
+            CreateOperatorDef(
+                "BatchMatMul",
+                "",
+                vector<string>{I(0), GO(0)},
+                vector<string>{GI(1)},
+                trans_a_arg)};
       }
     }
   }
diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc
index 8eceb5ab4a577..86d6536b8880d 100644
--- a/caffe2/operators/concat_split_op.cc
+++ b/caffe2/operators/concat_split_op.cc
@@ -101,9 +101,12 @@ OpSchema::Cost CostInferenceForSplit(
   CAFFE_ENFORCE_GT(in.size(), 0);
   struct OpSchema::Cost cost;
   cost.flops = 0;
-  auto input_bytes_count = nElemFromDim(in[0]) * sizeof(in[0].data_type());
-  auto split_bytes_count =
-      (in.size() == 1) ? 0 : nElemFromDim(in[1]) * sizeof(in[1].data_type());
+  auto const& input_0_element_size_byte =
+      DataTypeToTypeMeta(in[0].data_type()).itemsize();
+  auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte;
+  auto split_bytes_count = in.size() > 1
+      ? nElemFromDim(in[1]) * DataTypeToTypeMeta(in[1].data_type()).itemsize()
+      : 0;
   // There can be two input blobs:
   // (1) actual tensor to be split
   // (2) lengths of outputs along split axis
@@ -329,11 +332,13 @@ OpSchema::Cost CostInferenceForConcat(
   }
   auto split_info_bytes_count = in.size() * sizeof(int);
 
+  auto const& input_0_element_size_byte =
+      DataTypeToTypeMeta(in[0].data_type()).itemsize();
   struct OpSchema::Cost cost;
   cost.flops = 0;
-  cost.bytes_read = nElemRead * sizeof(in[0].data_type());
+  cost.bytes_read = nElemRead * input_0_element_size_byte;
   cost.bytes_written =
-      size * sizeof(in[0].data_type()) + split_info_bytes_count;
+      size * input_0_element_size_byte + split_info_bytes_count;
   cost.params_bytes = 0;
   return cost;
 }
diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h
index 25bd99a92e50f..b356ef952d79c 100644
--- a/caffe2/operators/conv_pool_op_base.h
+++ b/caffe2/operators/conv_pool_op_base.h
@@ -7,6 +7,7 @@
 #include "caffe2/core/context.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
+#include "caffe2/core/types.h"
 #include "caffe2/proto/caffe2_legacy.pb.h"
 #include "caffe2/utils/math.h"
 
@@ -519,14 +520,20 @@ class ConvPoolOpBase : public Operator<Context> {
     uint64_t nElemW = nElemFromDim(W);
     uint64_t nElemBias = inputs.size() > 2 ? nElemFromDim(inputs[2]) : 0;
 
+    auto const& X_elemenet_size_byte =
+        DataTypeToTypeMeta(X.data_type()).itemsize();
+    auto const& Y_element_size_byte =
+        DataTypeToTypeMeta(Y.data_type()).itemsize();
+    auto const& W_element_size_byte =
+        DataTypeToTypeMeta(W.data_type()).itemsize();
+
     // grouping is NOT properly handled yet
     c.flops = N * Y_t * Y_h * Y_w * kernel_t * kernel_w * kernel_h *
         in_channels * out_channels * 2;
-    c.bytes_read = (nElemX + nElemW + nElemBias) * sizeof(X.data_type());
-    c.bytes_written =
-        N * out_channels * Y_t * Y_h * Y_w * sizeof(Y.data_type());
+    c.bytes_read = (nElemX + nElemW + nElemBias) * X_elemenet_size_byte;
+    c.bytes_written = N * out_channels * Y_t * Y_h * Y_w * Y_element_size_byte;
     c.params_bytes = out_channels * in_channels * kernel_t * kernel_h *
-        kernel_w * sizeof(W.data_type());
+        kernel_w * W_element_size_byte;
     return c;
   }
 
diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc
index 1529534d8fb2e..9ea8eea5a2725 100644
--- a/caffe2/operators/distance_op.cc
+++ b/caffe2/operators/distance_op.cc
@@ -1,4 +1,5 @@
 #include "caffe2/operators/distance_op.h"
+#include "caffe2/core/types.h"
 #include "caffe2/utils/eigen_utils.h"
 #ifdef CAFFE2_USE_MKLDNN
 #include <caffe2/ideep/operators/operator_fallback_ideep.h>
@@ -7,7 +8,7 @@
 
 namespace caffe2 {
 
-template<>
+template <>
 bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
   auto& X = Input(0);
   auto& Y = Input(1);
@@ -257,7 +258,9 @@ OpSchema::Cost CostInferenceForDotProduct(
   CAFFE_ENFORCE_EQ(out[0].dims().size(), 1);
 
   struct OpSchema::Cost c = PointwiseCostInference<2>(def, in);
-  c.bytes_written = out[0].dims(0) * sizeof(out[0].data_type());
+  auto const& out_0_element_size_byte =
+      DataTypeToTypeMeta(out[0].data_type()).itemsize();
+  c.bytes_written = out[0].dims(0) * out_0_element_size_byte;
   c.params_bytes = 0;
   return c;
 }
@@ -379,10 +382,12 @@ bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
 }
 
 // L2
-REGISTER_CPU_OPERATOR(SquaredL2Distance,
-                      SquaredL2DistanceOp<float, CPUContext>);
-REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
-                      SquaredL2DistanceGradientOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(
+    SquaredL2Distance,
+    SquaredL2DistanceOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(
+    SquaredL2DistanceGradient,
+    SquaredL2DistanceGradientOp<float, CPUContext>);
 
 OPERATOR_SCHEMA(SquaredL2Distance)
     .NumInputs(2)
@@ -402,7 +407,8 @@ class GetSquaredL2DistanceGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     return SingleGradientDef(
-        "SquaredL2DistanceGradient", "",
+        "SquaredL2DistanceGradient",
+        "",
         vector<string>{I(0), I(1), GO(0)},
         vector<string>{GI(0), GI(1)});
   }
@@ -762,9 +768,9 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
       replicate = GetArgument(Def(), "replicate").i();
     }
 
-    const auto dot_arg =
-        vector<Argument>{MakeArgument<float>("pad_value", pad_value),
-                         MakeArgument<bool>("replicate", replicate)};
+    const auto dot_arg = vector<Argument>{
+        MakeArgument<float>("pad_value", pad_value),
+        MakeArgument<bool>("replicate", replicate)};
 
     return SingleGradientDef(
         "DotProductWithPaddingGradient",
@@ -775,4 +781,4 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
   }
 };
 REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient);
-}  // namespace caffe2
+} // namespace caffe2
diff --git a/caffe2/operators/fc_inference.cc b/caffe2/operators/fc_inference.cc
index a44c230980c7f..ba1b7122cdc9d 100644
--- a/caffe2/operators/fc_inference.cc
+++ b/caffe2/operators/fc_inference.cc
@@ -1,4 +1,5 @@
 #include "caffe2/operators/fc_inference.h"
+#include "caffe2/core/types.h"
 
 namespace caffe2 {
 std::vector<TensorShape> FCShapeInference(
@@ -51,11 +52,12 @@ OpSchema::Cost CostInferenceForFC(
       ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
       : size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));
 
-  const auto& X = in[0];
+  auto const& X_element_size_byte =
+      DataTypeToTypeMeta(in[0].data_type()).itemsize();
   c.flops = M * N * (2 * K + 1);
-  c.bytes_read = (K * (M + N) + N) * sizeof(X.data_type());
-  c.bytes_written = M * N * sizeof(X.data_type());
-  c.params_bytes = (K * N + N) * sizeof(X.data_type());
+  c.bytes_read = (K * (M + N) + N) * X_element_size_byte;
+  c.bytes_written = M * N * X_element_size_byte;
+  c.params_bytes = (K * N + N) * X_element_size_byte;
   return c;
 }
 
@@ -94,7 +96,11 @@ OpSchema::Cost CostInferenceForFCGradient(
 
   CAFFE_ENFORCE_LT(0, out.size());
   const TensorShape dW = out[0];
+  auto const& dW_element_size_byte =
+      DataTypeToTypeMeta(dW.data_type()).itemsize();
   const TensorShape db = out[1];
+  auto const& db_element_size_byte =
+      DataTypeToTypeMeta(db.data_type()).itemsize();
 
   auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
   const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
@@ -111,15 +117,17 @@ OpSchema::Cost CostInferenceForFCGradient(
   uint64_t size_db = nElemFromDim(db);
 
   c.flops = M * N * (2 * K + 1);
-  c.bytes_written = (size_dW + size_db) * sizeof(float);
+  c.bytes_written =
+      size_dW * dW_element_size_byte + size_db * db_element_size_byte;
   c.params_bytes = (K * N + N) * sizeof(float);
 
   if (out.size() == 3) {
     const TensorShape dX = out[2];
     uint64_t size_dX = nElemFromDim(dX);
-
+    auto const& dX_element_size_byte =
+        DataTypeToTypeMeta(dX.data_type()).itemsize();
     c.flops += 2 * M * N * K;
-    c.bytes_written += size_dX * sizeof(float);
+    c.bytes_written += size_dX * dX_element_size_byte;
   }
   return c;
 }
diff --git a/caffe2/operators/one_hot_ops.cc b/caffe2/operators/one_hot_ops.cc
index c3eaf05db0e8f..55c73a5be22c4 100644
--- a/caffe2/operators/one_hot_ops.cc
+++ b/caffe2/operators/one_hot_ops.cc
@@ -2,6 +2,7 @@
 
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
+#include "caffe2/core/types.h"
 
 namespace caffe2 {
 
@@ -78,12 +79,21 @@ OpSchema::Cost CostInferenceForBatchOneHot(
   const auto& length = in[1];
   const auto& values = in[2];
 
-  uint64_t nBytesData = nElemFromDim(data) * sizeof(data.data_type());
-  uint64_t nBytesLength = nElemFromDim(length) * sizeof(length.data_type());
-  uint64_t nBytesValues = nElemFromDim(values) * sizeof(values.data_type());
+  auto const& data_element_size_byte =
+      DataTypeToTypeMeta(data.data_type()).itemsize();
+  auto const& length_element_size_byte =
+      DataTypeToTypeMeta(length.data_type()).itemsize();
+  auto const& values_element_size_byte =
+      DataTypeToTypeMeta(values.data_type()).itemsize();
+  auto const& output_element_size_byte =
+      DataTypeToTypeMeta(output.data_type()).itemsize();
+
+  uint64_t nBytesData = nElemFromDim(data) * data_element_size_byte;
+  uint64_t nBytesLength = nElemFromDim(length) * length_element_size_byte;
+  uint64_t nBytesValues = nElemFromDim(values) * values_element_size_byte;
   c.flops = 0;
   c.bytes_read = nBytesData + nBytesLength + nBytesValues;
-  c.bytes_written = nElemFromDim(output) * sizeof(output.data_type());
+  c.bytes_written = nElemFromDim(output) * output_element_size_byte;
   c.params_bytes = 0;
   return c;
 }
@@ -145,15 +155,15 @@ bool BatchBucketOneHotOp<CPUContext>::RunOnDevice() {
     for (int64_t j = 0; j < D; j++) {
       // here we assume the boundary values for each feature are sorted
       int64_t lower_bucket_idx = std::lower_bound(
-                                    boundaries_offset,
-                                    boundaries_offset + lens_data[j],
-                                    input_data[pos]) -
+                                     boundaries_offset,
+                                     boundaries_offset + lens_data[j],
+                                     input_data[pos]) -
           boundaries_offset;
 
       int64_t upper_bucket_idx = std::upper_bound(
-                                    boundaries_offset,
-                                    boundaries_offset + lens_data[j],
-                                    input_data[pos]) -
+                                     boundaries_offset,
+                                     boundaries_offset + lens_data[j],
+                                     input_data[pos]) -
           boundaries_offset;
 
       int64_t bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2;
diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc
index 8b5e116024b81..561da9189b388 100644
--- a/caffe2/operators/utility_ops.cc
+++ b/caffe2/operators/utility_ops.cc
@@ -1,6 +1,7 @@
 #include "caffe2/operators/utility_ops.h"
 #include <cmath>
 #include <iostream>
+#include "caffe2/core/types.h"
 #include "caffe2/utils/eigen_utils.h"
 
 namespace caffe2 {
@@ -34,9 +35,11 @@ OpSchema::Cost CostInferenceForWeightedSum(
   const auto& nElem = nElemFromDim(X0);
   const auto& nInputs = in.size();
   c.flops = (nInputs - 1) * nElem;
-  c.bytes_read = (nInputs / 2) * (nElem + 1) * sizeof(X0.data_type());
-  c.bytes_written = nElem * sizeof(X0.data_type());
-  c.params_bytes = (nInputs / 2) * sizeof(X0.data_type());
+  auto const& X0_element_size_byte =
+      DataTypeToTypeMeta(X0.data_type()).itemsize();
+  c.bytes_read = (nInputs / 2) * (nElem + 1) * X0_element_size_byte;
+  c.bytes_written = nElem * X0_element_size_byte;
+  c.params_bytes = (nInputs / 2) * X0_element_size_byte;
   return c;
 }
 
@@ -48,9 +51,7 @@ REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(SumInt, SumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp<CPUContext>);
-REGISTER_CPU_OPERATOR(
-    ScatterWeightedSum,
-    ScatterWeightedSumOp<CPUContext>);
+REGISTER_CPU_OPERATOR(ScatterWeightedSum, ScatterWeightedSumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Scatter, ScatterOp<CPUContext>);
 
diff --git a/caffe2/python/operator_test/concat_op_cost_test.py b/caffe2/python/operator_test/concat_op_cost_test.py
index 996b330be4947..7dab4d6bd5d1f 100644
--- a/caffe2/python/operator_test/concat_op_cost_test.py
+++ b/caffe2/python/operator_test/concat_op_cost_test.py
@@ -7,33 +7,39 @@
 
 class TestConcatOpCost(TestCase):
     def test_columnwise_concat(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
-        workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=np.int32))
-        concat_op = core.CreateOperator(
-            "Concat",
-            ["input_1", "input_2"],
-            ["output", "split_info"],
-        )
-        workspace.RunOperatorOnce(concat_op)
+        def _test_columnwise_concat_for_type(dtype):
+            workspace.ResetWorkspace()
+            workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype))
+            workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=dtype))
+            concat_op = core.CreateOperator(
+                "Concat",
+                ["input_1", "input_2"],
+                ["output", "split_info"],
+            )
+            workspace.RunOperatorOnce(concat_op)
 
-        output = workspace.FetchBlob("output")
-        self.assertTupleEqual(output.shape, (2, 4))
-        np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
+            output = workspace.FetchBlob("output")
+            self.assertTupleEqual(output.shape, (2, 4))
+            np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
 
-        flops, bytes_written, bytes_read = workspace.GetOperatorCost(
-            concat_op, concat_op.input
-        )
+            flops, bytes_written, bytes_read = workspace.GetOperatorCost(
+                concat_op, concat_op.input
+            )
 
-        self.assertEqual(flops, 0)
-        self.assertEqual(
-            bytes_read,
-            sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
-        )
-        self.assertEqual(
-            bytes_written,
-            sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
-        )
+            self.assertEqual(flops, 0)
+            self.assertEqual(
+                bytes_read,
+                sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
+            )
+            self.assertEqual(
+                bytes_written,
+                sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
+            )
+
+        [
+            _test_columnwise_concat_for_type(t)
+            for t in [np.int64, np.float, np.half, np.int8]
+        ]
 
     def test_split_then_concat(self):
         workspace.ResetWorkspace()
diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py
index afb2065027075..1bf7b607e1b7e 100644
--- a/caffe2/python/workspace_test.py
+++ b/caffe2/python/workspace_test.py
@@ -60,7 +60,7 @@ def testGetOperatorCost(self):
         self.assertTupleEqual(
             op_cost,
             namedtuple("Cost", ["flops", "bytes_written", "bytes_read"])(
-                1152, 256, 2084
+                1152, 256, 4168
             ),
         )
 
diff --git a/caffe2/sgd/adagrad_op.cc b/caffe2/sgd/adagrad_op.cc
index 0de50f03e62d5..0b6f604b48cdb 100644
--- a/caffe2/sgd/adagrad_op.cc
+++ b/caffe2/sgd/adagrad_op.cc
@@ -1,4 +1,5 @@
 #include "adagrad_op.h"
+#include "caffe2/core/types.h"
 
 namespace caffe2 {
 
@@ -23,22 +24,30 @@ static OpSchema::Cost CostInferenceForAdagrad(
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   c.flops = grad_size * 10;
 
+  auto const& moment_element_size_byte =
+      DataTypeToTypeMeta(moment.data_type()).itemsize();
+  auto const& param_element_size_byte =
+      DataTypeToTypeMeta(param.data_type()).itemsize();
+  auto const& grad_element_size_byte =
+      DataTypeToTypeMeta(grad.data_type()).itemsize();
+  auto const& lr_element_size_byte =
+      DataTypeToTypeMeta(lr.data_type()).itemsize();
   uint64_t bytes_written =
-      grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
+      grad_size * param_element_size_byte + moment_element_size_byte;
 
   if (output_size == 3) {
     // also need to output effective learning rate in this case
     // assume it's the same data type as lr
-    bytes_written += grad_size * sizeof(lr.data_type());
+    bytes_written += grad_size * lr_element_size_byte;
   } else if (output_size == 4) {
     // also need to output effective learning rate and updates in this case
     // assume update is the same data type as param
     bytes_written +=
-        grad_size * (sizeof(lr.data_type()) + sizeof(param.data_type()));
+        grad_size * (lr_element_size_byte + param_element_size_byte);
   }
   c.bytes_written = bytes_written;
   c.bytes_read = c.bytes_written +
-      grad_size * (sizeof(grad.data_type()) + sizeof(lr.data_type()));
+      grad_size * (grad_element_size_byte + lr_element_size_byte);
 
   return c;
 }
@@ -102,10 +111,18 @@ static OpSchema::Cost CostInferenceForSparseAdagrad(
   // (optimistically count sqrt as one flop).
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   c.flops = grad_size * 7;
+  auto const& param_element_size_byte =
+      DataTypeToTypeMeta(param.data_type()).itemsize();
+  auto const& moment_element_size_byte =
+      DataTypeToTypeMeta(moment.data_type()).itemsize();
   c.bytes_written =
-      grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
-  c.bytes_read = c.bytes_written + grad_size * sizeof(grad.data_type()) +
-      n * sizeof(indices.data_type());
+      grad_size * (param_element_size_byte + moment_element_size_byte);
+  auto const& grad_element_size_byte =
+      DataTypeToTypeMeta(grad.data_type()).itemsize();
+  auto const& indices_element_size_byte =
+      DataTypeToTypeMeta(indices.data_type()).itemsize();
+  c.bytes_read = c.bytes_written + grad_size * grad_element_size_byte +
+      n * indices_element_size_byte;
 
   return c;
 }
@@ -153,6 +170,16 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
   OpSchema::Cost c;
 
   if (n > 0) {
+    auto const& param_element_size_byte =
+        DataTypeToTypeMeta(param.data_type()).itemsize();
+    auto const& moment_element_size_byte =
+        DataTypeToTypeMeta(moment.data_type()).itemsize();
+    auto const& grad_element_size_byte =
+        DataTypeToTypeMeta(grad.data_type()).itemsize();
+    auto const& indices_element_size_byte =
+        DataTypeToTypeMeta(indices.data_type()).itemsize();
+    auto const& lr_element_size_byte =
+        DataTypeToTypeMeta(lr.data_type()).itemsize();
     auto block_size = grad_size / n;
     if (block_size == 1) {
       // +2: applying weight decay and add to grads
@@ -161,22 +188,22 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
       // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
       c.flops = n * 9;
       c.bytes_written =
-          n * (sizeof(param.data_type()) + sizeof(moment.data_type()));
+          n * (param_element_size_byte + moment_element_size_byte);
       c.bytes_read = c.bytes_written +
           n *
-              (sizeof(grad.data_type()) + sizeof(indices.data_type()) +
-               sizeof(lr.data_type()));
+              (grad_element_size_byte + indices_element_size_byte +
+               lr_element_size_byte);
     } else {
       // 5 per block (not counting index transforms)
       // 8 for each value of a block
       // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
       c.flops = n * (5 + (block_size * 8));
-      c.bytes_written =
-          n * sizeof(moment.data_type()) + n * block_size * (param.data_type());
+      c.bytes_written = n * moment_element_size_byte +
+          n * block_size * param_element_size_byte;
 
-      c.bytes_read = c.bytes_written + n * (sizeof(lr.data_type())) +
+      c.bytes_read = c.bytes_written + n * lr_element_size_byte +
           2 * n * block_size *
-              (sizeof(grad.data_type()) + sizeof(param.data_type()));
+              (grad_element_size_byte + param_element_size_byte);
     }
   }
   return c;

From e322547fe6dd4f0ca9261a1ac2ae7095800b98a1 Mon Sep 17 00:00:00 2001
From: Alban Desmaison <albandes@fb.com>
Date: Wed, 1 Sep 2021 13:34:48 -0700
Subject: [PATCH 432/530] Add forward AD support for custom Functions (#64061)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64061

Test Plan: Imported from OSS

Reviewed By: soulitzer

Differential Revision: D30640868

Pulled By: albanD

fbshipit-source-id: b0e6610430a879074d6d5306443772fc154b431f
---
 test/test_autograd.py                   | 114 +++++++++++++++
 torch/autograd/function.py              |  24 +++
 torch/csrc/autograd/custom_function.cpp | 187 +++++++++++++++++++++++-
 torch/csrc/autograd/custom_function.h   |  12 +-
 torch/csrc/autograd/python_function.cpp |  59 +++++++-
 5 files changed, 385 insertions(+), 11 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 8b3c8bd33af66..ebe3aa5d29e18 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -5494,6 +5494,11 @@ def backward(ctx, foo):
             def vjp(ctx, foo):
                 return foo
 
+        class BadJvp(Function):
+            @staticmethod
+            def forward(ctx, foo):
+                return foo.clone()
+
         inp = torch.rand(1, requires_grad=True)
         with self.assertRaisesRegex(NotImplementedError, "must implement the forward"):
             BadFw.apply(inp)
@@ -5504,6 +5509,115 @@ def vjp(ctx, foo):
         with self.assertRaisesRegex(RuntimeError, "Implementing both 'backward' and 'vjp'"):
             BadBw2.apply(inp).sum().backward()
 
+        with self.assertRaisesRegex(RuntimeError, "must implement the jvp function"):
+            with fwAD.dual_level():
+                d = fwAD.make_dual(inp, torch.rand_like(inp))
+                res = BadJvp.apply(d)
+
+    def test_custom_function_forward_mode_view_checks(self):
+        flag_to_error = {
+            "ok": None,
+            "not_a_view": "jvp is not returning a view",
+            "not_a_view_of_inp": "jvp is not returning a view of the given",
+            "not_a_view_of_inp_base": "jvp is not returning a view of the same base",
+        }
+
+        class ViewFn(Function):
+            @staticmethod
+            def forward(ctx, foo, flag):
+                ctx.flag = flag
+                ctx.size = foo.size()
+                return foo.narrow(0, 0, 2)
+
+            @staticmethod
+            def vjp(ctx, gO):
+                gI = gO.new_zeros(ctx.size)
+                gI.narrow(0, 0, 2).copy_(gO)
+                return gI, None
+
+            @staticmethod
+            def jvp(ctx, gI, _):
+                res = gI.narrow(0, 0, 2)
+                if ctx.flag != "ok":
+                    # Break the view in the gradients!
+                    res = res.clone()
+                if ctx.flag in ["not_a_view_of_inp", "not_a_view_of_inp_base"]:
+                    # Result should be a view, just of the wrong thing
+                    res = res.view_as(res)
+                return res
+
+        inp = torch.rand(4, 4, dtype=torch.double, requires_grad=True)
+
+        for flag, msg in flag_to_error.items():
+            def test_fn(inp):
+                if flag == "not_a_view_of_inp_base":
+                    inp = inp.view_as(inp)
+                return ViewFn.apply(inp, flag)
+
+            if msg is None:
+                gradcheck(test_fn, inp, check_forward_ad=True)
+            else:
+                with self.assertRaisesRegex(RuntimeError, msg):
+                    gradcheck(test_fn, inp, check_forward_ad=True)
+
+    def test_custom_function_forward_mode_inplace_checks(self):
+        class InplaceFn(Function):
+            @staticmethod
+            def forward(ctx, foo, flag):
+                ctx.mark_dirty(foo)
+                ctx.flag = flag
+                foo.mul_(2)
+                return foo
+
+            @staticmethod
+            def vjp(ctx, gO):
+                return 2 * gO, None
+
+            @staticmethod
+            def jvp(ctx, gI, _):
+                if ctx.flag:
+                    # Don't do the change inplace
+                    return 2 * gI
+                else:
+                    gI.mul_(2)
+                    return gI
+
+        inp = torch.rand(4, 4, dtype=torch.double, requires_grad=True)
+
+        def test_fn(inp, flag):
+            inp = inp.clone()
+            return InplaceFn.apply(inp, flag)
+
+        gradcheck(test_fn, (inp, False), check_forward_ad=True)
+
+        with self.assertRaisesRegex(RuntimeError, "inplace custom Function is not modifying the forward mode gradients inplace"):
+            gradcheck(test_fn, (inp, True), check_forward_ad=True)
+
+    def test_custom_function_forward_mode_wrong_formula(self):
+        class UserFn(Function):
+            @staticmethod
+            def forward(ctx, foo, should_fail):
+                ctx.should_fail = should_fail
+                return foo * 2
+
+            @staticmethod
+            def vjp(ctx, gO):
+                return 2 * gO, None
+
+            @staticmethod
+            def jvp(ctx, gI, _):
+                if ctx.should_fail:
+                    # Wrong gradient formula
+                    return 3 * gI
+                else:
+                    return 2 * gI
+
+        inp = torch.rand(10, dtype=torch.double, requires_grad=True)
+        gradcheck(UserFn.apply, (inp, False), check_forward_ad=True)
+
+        with self.assertRaisesRegex(RuntimeError, "Jacobian computed with forward mode mismatch for output 0"):
+            gradcheck(UserFn.apply, (inp, True), check_forward_ad=True)
+
     def test_custom_function_local_inplace(self):
         class MyFn(torch.autograd.Function):
             @staticmethod
diff --git a/torch/autograd/function.py b/torch/autograd/function.py
index 90aeea5f1dfea..909e71959320b 100644
--- a/torch/autograd/function.py
+++ b/torch/autograd/function.py
@@ -198,6 +198,10 @@ def apply(self, *args):
         user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
         return user_fn(self, *args)
 
+    def apply_jvp(self, *args):
+        # _forward_cls is defined by derived class
+        return self._forward_cls.jvp(self, *args)  # type: ignore[attr-defined]
+
 
 class FunctionMeta(type):
     """Function metaclass.
@@ -307,6 +311,26 @@ def backward(ctx: Any, *grad_outputs: Any) -> Any:
     # vjp and backward are alias of each other
     vjp = backward
 
+    @staticmethod
+    def jvp(ctx: Any, *grad_inputs: Any) -> Any:
+        r"""Defines a formula for differentiating the operation with forward mode
+        automatic differentiation.
+        This function is to be overridden by all subclasses.
+        It must accept a context :attr:`ctx` as the first argument, followed by
+        as many inputs as the :func:`forward` got (None will be passed in
+        for non tensor inputs of the forward function),
+        and it should return as many tensors as there were outputs to
+        :func:`forward`. Each argument is the gradient w.r.t the given input,
+        and each returned value should be the gradient w.r.t. the
+        corresponding output. If an output is not a Tensor or the function is not
+        differentiable with respect to that output, you can just pass None as a
+        gradient for that input.
+
+        You can use the :attr:`ctx` object to pass any value from the forward to this
+        functions.
+        """
+        raise NotImplementedError("You must implement the jvp function for custom "
+                                  "autograd.Function to use it with forward mode AD.")
 
 def once_differentiable(fn):
 
diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp
index fdcf9971a0606..1bb4cb836f1e8 100644
--- a/torch/csrc/autograd/custom_function.cpp
+++ b/torch/csrc/autograd/custom_function.cpp
@@ -26,8 +26,175 @@ Variable VariableInfo::zeros(at::OptionalDeviceGuard& device_guard) const {
   }
 }
 
+// This function has two main goals:
+//  1) Use the user-provided jvp function to populate the the outputs' forward gradient
+//  2) Perform error checking to ensure that view and inplace ops are properly handled
+//
+// For 1) we have to:
+//  - Create a variable_list of grad_inputs based on the function inputs
+//  - Call the user jvp function with these to get the grad_outputs
+//  - Set the forward grad field on each output based on these grad_outputs
+//
+// For 2) we want to check the following:
+//  - If an output is a view, then the generated forward grad must be a view as well and
+//    the output's base's forward grad must be the output's forward grad's base.
+//  - If an input was modified inplace (it must be an output as well) we make sure that its
+//    forward grad was also modified inplace and already present on the corresponding output.
+void _process_forward_mode_AD(const variable_list &inputs,
+  std::unordered_map<at::TensorImpl*, size_t> inputs_mapping,
+  const at::ArrayRef<c10::optional<Variable>> raw_outputs,
+  const optional_variable_list &outputs,
+  const std::unordered_set<at::TensorImpl*> &non_differentiable,
+  const std::unordered_set<at::TensorImpl*> &dirty_inputs,
+  _jvp_fn_t jvp_user_function) {
+
+  // TODO handle multiple levels here
+  uint64_t level = 0;
+
+  const auto num_inputs = inputs.size();
+  const auto num_outputs = outputs.size();
+
+  // The tracking info below are used to perform the view and inplace checks.
+  // They are lazily initialized to reduce the cost of this function in the common
+  // case where the user is not using forward mode AD.
+  variable_list input_grads;
+  std::vector<int64_t> grad_versions;
+  std::vector<at::TensorImpl*> grad_impls;
+  std::unordered_map<at::TensorImpl*, size_t> inputs_bases;
+
+  auto init_tracked_info = [&] () {
+    input_grads.resize(num_inputs);
+    grad_versions.resize(num_inputs);
+    grad_impls.resize(num_inputs);
+
+    for (const auto i: c10::irange(num_inputs)) {
+      const auto& inp = inputs[i];
+      if (inp.is_view() && impl::get_view_autograd_meta(inp)->has_fw_view()) {
+        inputs_bases.emplace(impl::get_view_autograd_meta(inp)->get_forward_view().base_.unsafeGetTensorImpl(), i);
+      } else {
+        inputs_bases.emplace(inp.unsafeGetTensorImpl(), i);
+      }
+
+    }
+  };
+
+  bool any_input_has_grad = false;
+  // Extract the input's forward gradients and record any info we will need later
+  for (const auto i : c10::irange(num_inputs)) {
+    const auto& inp = inputs[i];
+    if (!inp.defined()) {
+      continue;
+    }
+    const auto& fw_grad = inp._fw_grad(level);
+    if (fw_grad.defined()) {
+      if (!any_input_has_grad) {
+        any_input_has_grad = true;
+        init_tracked_info();
+      }
+      input_grads[i] = fw_grad;
+      grad_versions[i] = fw_grad._version();
+      grad_impls[i] = fw_grad.unsafeGetTensorImpl();
+    }
+  }
+
+  // If no input has forward grad, nothing to do here
+  if (!any_input_has_grad) {
+    return;
+  }
+
+
+  auto forward_grads = jvp_user_function(inputs, input_grads);
+
+
+  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+  const auto num_forward_grads = forward_grads.size();
+  // contrary to backward mode, we don't allow returning too many gradients
+  TORCH_CHECK(num_forward_grads == num_outputs, "Function's jvp returned "
+              "an invalid number of of forward gradients (expected ", num_outputs,
+              " but got ", num_forward_grads, ")");
+
+  for (const auto i : c10::irange(num_outputs)) {
+    const auto& out = outputs[i].has_value()? outputs[i].value() : at::Tensor();
+    const auto& out_grad = forward_grads[i];
+    if (!out.defined()) {
+      TORCH_CHECK(!out_grad.defined(), "Function's jvp returned a gradient at position ", i, ", but "
+                  " the corresponding forward output is not a differentiable Tensor");
+      continue;
+    }
+
+    TORCH_INTERNAL_ASSERT(raw_outputs[i].has_value());
+    auto out_tensor_impl = raw_outputs[i].value().unsafeGetTensorImpl();
+    bool is_input = inputs_mapping.count(out_tensor_impl) > 0;
+    bool is_modified = dirty_inputs.count(out_tensor_impl) > 0;
+
+    if (is_modified) {
+      TORCH_CHECK(is_input, "Only input Tensors should be given to ctx.mark_dirty(). If a Tensor is not an input, there"
+                 " is no need to pass it to mark_dirty().");
+      auto inp_idx = inputs_mapping[out_tensor_impl];
+      if (grad_impls[inp_idx]) {
+        // If there was already a forward grad for that input
+        // Just make sure that it is modified inplace and returned as-is
+        TORCH_CHECK(out_grad._version() != grad_versions[inp_idx], "An inplace custom Function is not modifying the "
+                    "forward mode gradients inplace. If the forward is modifying an input inplace, then the jvp "
+                    "function must modify the corresponding gradient inplace.")
+        TORCH_CHECK(out_grad.unsafeGetTensorImpl() == grad_impls[inp_idx], "An inplace custom Function is not returning the "
+                    "forward mode gradients as-is. If the forward is modifying an input inplace, then the jvp "
+                    "function must modify the gradient inplace and return it as-is.")
+      } else {
+        // If that Tensor didn't had gradients already, set the newly returned one
+        // We could also use inputs[inp_idx] here as it is the same as out
+        out._set_fw_grad(out_grad, level, /* is_inplace_op */ true);
+      }
+    } else {
+      // At this point, outputs[i] cannot be one of the input (raw_outputs[i] might be but was changed by the backward code)
+      TORCH_INTERNAL_ASSERT(!is_input);
+
+      if (out.is_view() && impl::get_view_autograd_meta(out)->has_fw_view()) {
+        // If the output is a view
+        const auto& out_view_info = impl::get_view_autograd_meta(out)->get_forward_view();
+        if (inputs_bases.count(out_view_info.base_.unsafeGetTensorImpl())) {
+          // And it is a view of an input (either that input is its base or they have a common base)
+          const auto matching_input_idx = inputs_bases[out_view_info.base_.unsafeGetTensorImpl()];
+          const auto& matching_input = inputs[matching_input_idx];
+
+          const auto& matching_input_grad = matching_input._fw_grad(level);
+
+          // If the matching input has a forward grad, the user should have returned a view of that Tensor
+          if (matching_input_grad.defined()) {
+            TORCH_CHECK(out_grad.is_view() && impl::get_view_autograd_meta(out_grad)->has_fw_view(),
+                        "A custom Function's forward is returning a view but the jvp is not returning a view.");
+
+            const auto& out_grad_base = impl::get_view_autograd_meta(out_grad)->get_forward_view().base_;
+            if (matching_input_grad.is_view() && impl::get_view_autograd_meta(matching_input_grad)->has_fw_view()) {
+              // If the matching input's grad is a view, ensure that the out_grad is a view of the same base
+              const auto& matching_input_grad_base = impl::get_view_autograd_meta(matching_input_grad)->get_forward_view().base_;
+              TORCH_CHECK(matching_input_grad_base.unsafeGetTensorImpl() == out_grad_base.unsafeGetTensorImpl(),
+                          "A custom Function is returning a view but the jvp is not returning a view of the same base as "
+                          "the given grad input.");
+            } else {
+              // If the matching input's grad is not a view, then it must be the output gradient's base
+              TORCH_CHECK(matching_input_grad.unsafeGetTensorImpl() == out_grad_base.unsafeGetTensorImpl(),
+                          "A custom Function is returning a view but the jvp is not returning a view of the given grad input.");
+            }
+          } else {
+            // We have a view op where the input didn't have a forward grad but the user returned one for the output
+            // To ensure that we maintain the view/inplace constraints, we consider this as an inplace op
+            // This case CANNOT happen in codegen as all view ops are mapping from one Tensor to one Tensor and so the output
+            // of the view cannot have a forward grad if the base does not.
+            out._set_fw_grad(out_grad, level, /* is_inplace_op */ true);
+            return;
+          }
+
+        }
+      }
+
+      out._set_fw_grad(out_grad, level, /* is_inplace_op */ false);
+    }
+  }
+}
+
 optional_variable_list _process_backward_mode_ad(
-  const std::unordered_set<at::TensorImpl*> &inputs_set,
+  const std::unordered_map<at::TensorImpl*, size_t> &inputs_mapping,
   const std::unordered_set<at::TensorImpl*> &non_differentiable,
   const std::unordered_set<at::TensorImpl*> &dirty_inputs,
   const at::ArrayRef<c10::optional<Variable>> raw_outputs,
@@ -121,7 +288,7 @@ optional_variable_list _process_backward_mode_ad(
     Variable var = raw_outputs[i].value();
 
     auto out_tensor_impl = var.unsafeGetTensorImpl();
-    bool is_input = inputs_set.count(out_tensor_impl) > 0;
+    bool is_input = inputs_mapping.count(out_tensor_impl) > 0;
     bool is_modified = dirty_inputs.count(out_tensor_impl) > 0;
     bool is_differentiable = cdata && non_differentiable.count(out_tensor_impl) == 0
                               && isDifferentiableType(var.scalar_type());
@@ -179,16 +346,20 @@ optional_variable_list _wrap_outputs(const variable_list &input_vars,
   const std::unordered_set<at::TensorImpl*> &non_differentiable,
   const std::unordered_set<at::TensorImpl*> &dirty_inputs,
   const at::ArrayRef<c10::optional<Variable>> raw_outputs,
-  const std::shared_ptr<Node> &cdata) {
+  const std::shared_ptr<Node> &cdata,
+  _jvp_fn_t jvp_user_function) {
 
-  std::unordered_set<at::TensorImpl*> inputs_set;
-  inputs_set.reserve(input_vars.size());
-  for (auto& var : input_vars) {
-    inputs_set.emplace(var.unsafeGetTensorImpl());
+  std::unordered_map<at::TensorImpl*, size_t> inputs_mapping;
+  inputs_mapping.reserve(input_vars.size());
+  for (const auto i: c10::irange(input_vars.size())) {
+    inputs_mapping.emplace(input_vars[i].unsafeGetTensorImpl(), i);
   }
 
-  auto outputs = _process_backward_mode_ad(inputs_set, non_differentiable, dirty_inputs, raw_outputs, cdata);
+  auto outputs = _process_backward_mode_ad(inputs_mapping, non_differentiable, dirty_inputs, raw_outputs, cdata);
 
+  // This must happen after the backward processing as we expect the computations happening here to track
+  // backward mode gradients.
+  _process_forward_mode_AD(input_vars, inputs_mapping, raw_outputs, outputs, non_differentiable, dirty_inputs, jvp_user_function);
 
   return outputs;
 }
diff --git a/torch/csrc/autograd/custom_function.h b/torch/csrc/autograd/custom_function.h
index 376cab693e453..94e62bf7b63c7 100644
--- a/torch/csrc/autograd/custom_function.h
+++ b/torch/csrc/autograd/custom_function.h
@@ -10,13 +10,15 @@
 namespace torch { namespace autograd {
 
 using optional_variable_list = std::vector<c10::optional<Variable>>;
+using _jvp_fn_t = std::function<variable_list(variable_list, variable_list)>;
 
 TORCH_API std::vector<c10::optional<Variable>> _wrap_outputs(
   const variable_list &input_vars,
   const std::unordered_set<at::TensorImpl*> &non_differentiable,
   const std::unordered_set<at::TensorImpl*> &dirty_inputs,
   const at::ArrayRef<c10::optional<Variable>> raw_outputs,
-  const std::shared_ptr<Node> &cdata);
+  const std::shared_ptr<Node> &cdata,
+  _jvp_fn_t jvp_user_function);
 
 TORCH_API void check_variable_result(const Variable& original,
   const Variable& result, std::string hook_name);
@@ -265,12 +267,18 @@ auto Function<T>::apply(Args&&... args) -> std::enable_if_t<std::is_same<X,T>::v
     outputs = T::forward(&node->ctx_, std::forward<Args>(args)...);
   }
 
+  _jvp_fn_t jvp_fn = [](variable_list inputs, variable_list gI) -> variable_list {
+    TORCH_CHECK(false, "jvp is not implemented for the c++ API of custom Function yet.",
+                "Please open a feature request on Github if you need this.");
+  };
+
   auto wrapped_outputs = _wrap_outputs(
     input_vars,
     node->ctx_.get_non_differentiable(),
     node->ctx_.get_and_bump_dirty(),
     to_optional(outputs),
-    is_executable ? node : nullptr);
+    is_executable ? node : nullptr,
+    jvp_fn);
 
   node->output_info_.reserve(wrapped_outputs.size());
   for (auto& output : wrapped_outputs) {
diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp
index 14874186d6f22..eee56f71ed7d8 100644
--- a/torch/csrc/autograd/python_function.cpp
+++ b/torch/csrc/autograd/python_function.cpp
@@ -340,8 +340,61 @@ static void _wrap_outputs(const std::shared_ptr<PyNode>& cdata, THPFunction *sel
     }
   }
 
+  _jvp_fn_t jvp_user_function = [self](variable_list inputs, variable_list grad_inputs) {
+    pybind11::gil_scoped_acquire gil;
+
+    // Massage a C++ variable_list into a Python arguments tuple
+    // Making sure to introduce the proper None for non-Tensor inputs
+    auto num_inputs = self->is_variable_input.size();
+    THPObjectPtr pyInputs(PyTuple_New(num_inputs));
+    if (!pyInputs) throw_python_error();
+    auto var_input_idx = 0;
+    for (const auto i : c10::irange(num_inputs)) {
+      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+      PyObject* input;
+      if (self->is_variable_input[i]) {
+        if (grad_inputs[i].defined() || !self->materialize_grads) {
+          input = THPVariable_Wrap(grad_inputs[i]);
+        } else {
+          input = THPVariable_Wrap(at::zeros_like(inputs[i]));
+        }
+        if (!input) throw_python_error();
+      } else {
+        Py_INCREF(Py_None);
+        input = Py_None;
+      }
+      PyTuple_SET_ITEM(pyInputs.get(), i, input);
+    }
+
+    THPObjectPtr apply_jvp_fn(PyObject_GetAttrString((PyObject*)self, "apply_jvp"));
+    if (!apply_jvp_fn) throw_python_error();
+    THPObjectPtr r(PyObject_CallObject(apply_jvp_fn, pyInputs.get()));
+    if (!r) throw_python_error();
+    ensure_tuple(r);
+
+    // Massage the Python results tuple back into a C++ variable_list
+    // Don't do any check on the number of results here as
+    // it is handled by the caller
+    const int num_outputs = PyTuple_GET_SIZE(r.get());
+    variable_list results;
+    results.reserve(num_outputs);
+    for (int i = 0; i != num_outputs; ++i) {
+      PyObject* output = PyTuple_GET_ITEM(r.get(), i);
+      if (output == Py_None) {
+        results.emplace_back();
+      } else {
+        TORCH_CHECK(THPVariable_Check(output), "expected Variable or None (got ",
+                    THPUtils_typename(output), ") for grad output ", i, ".")
+        results.emplace_back(THPVariable_Unpack(output));
+      }
+    }
+
+    return results;
+  };
+
   // Wrap only the tensor outputs.
-  auto wrapped_outputs = _wrap_outputs(input_vars, non_differentiable, dirty_inputs, raw_output_vars, cdata_if_executable);
+  auto wrapped_outputs = _wrap_outputs(input_vars, non_differentiable, dirty_inputs,
+                                       raw_output_vars, cdata_if_executable, jvp_user_function);
 
   for(const auto i : c10::irange(num_outputs)) {
     PyObject* obj = PyTuple_GetItem(raw_output, i);
@@ -571,6 +624,9 @@ PyObject* process_outputs(PyObject *op_obj, const std::shared_ptr<PyNode>& cdata
   bool is_inplace = static_cast<bool>(grad_fn->dirty_tensors);
   _wrap_outputs(cdata, grad_fn, unpacked.input_vars, raw_output, outputs, is_executable);
   _trace_post_record(node, op_obj, unpacked.input_vars, outputs, is_inplace, unpack_output);
+
+  // It is important that creating the SavedVariables happen after the output wrapping as the
+  // outputs must have their grad_fn/fw_grad properly set before we save them.
   if (is_executable) {
     _save_variables(cdata, grad_fn);
   } else {
@@ -651,6 +707,7 @@ PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs)
   THPObjectPtr tensor_outputs;
   {
     AutoGradMode grad_mode(false);
+    at::AutoFwGradMode fw_grad_mode(false);
     THPObjectPtr forward_fn(PyObject_GetAttrString(cls, "forward"));
     if (!forward_fn) return nullptr;
     tensor_outputs = PyObject_CallObject(forward_fn, ctx_input_tuple);

From 15ff25d1fc212c36cf472f988cf0e709420cd248 Mon Sep 17 00:00:00 2001
From: David Reiss <dreiss@fb.com>
Date: Wed, 1 Sep 2021 13:41:37 -0700
Subject: [PATCH 433/530] Break up "@generated" string so Phabricator shows
 changes

Summary: Created from CodeHub with https://fburl.com/edit-in-codehub

Test Plan:
CI

Sandcastle run

Reviewed By: larryliu0820

Differential Revision: D30701781

fbshipit-source-id: 3acab8b65a327c4ec7da90bc855ecf02f801c40a
---
 tools/autograd/gen_variable_type.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py
index 8591a6800605c..e3f4d5553c34f 100644
--- a/tools/autograd/gen_variable_type.py
+++ b/tools/autograd/gen_variable_type.py
@@ -374,7 +374,7 @@ def gen_variable_type(
     """
     fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
     fm.write('VariableType.h', lambda: {
-        'generated_comment': f'@generated from {template_path}/VariableType.h'
+        'generated_comment': "@" f'generated from {template_path}/VariableType.h'
     })
 
     # NOTE: see Note [Sharded File] at the top of the VariableType.cpp
@@ -385,7 +385,7 @@ def gen_variable_type(
         key_fn=lambda fn: cpp.name(fn.func.func),
         base_env={
             'generated_comment':
-            f'@generated from {template_path}/VariableType.cpp',
+            "@" f'generated from {template_path}/VariableType.cpp',
         },
         env_callable=gen_variable_type_func,
         num_shards=5,

From 86c96542914bf9b3dfda0c7f6373fd13b48c6b97 Mon Sep 17 00:00:00 2001
From: Salil Desai <salilsdesai@fb.com>
Date: Wed, 1 Sep 2021 14:08:02 -0700
Subject: [PATCH 434/530] Update optimize_for_mobile to preserve node's debug
 information (#63106)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63106

Propagate debug info to the re-written nodes in the graph.

Test Plan:
- Clone open source repo and build
- ``` python3 test/test_jit.py TestOptimizeForMobilePreserveDebugInfo ```
- Tests pass

Reviewed By: kimishpatel

Differential Revision: D28654659

fbshipit-source-id: 2d7c87f2fb95a3be53246375f35639bbd97c237e
---
 ...optimize_for_mobile_preserve_debug_info.py | 261 ++++++++++++++++++
 test/test_jit.py                              |   1 +
 torch/csrc/jit/passes/xnnpack_rewrite.cpp     | 165 ++++++++---
 3 files changed, 388 insertions(+), 39 deletions(-)
 create mode 100644 test/jit/test_optimize_for_mobile_preserve_debug_info.py

diff --git a/test/jit/test_optimize_for_mobile_preserve_debug_info.py b/test/jit/test_optimize_for_mobile_preserve_debug_info.py
new file mode 100644
index 0000000000000..c08f3b5838fae
--- /dev/null
+++ b/test/jit/test_optimize_for_mobile_preserve_debug_info.py
@@ -0,0 +1,261 @@
+import torch
+import torch._C
+import torch.backends.xnnpack
+import torch.nn.functional as F
+from torch.testing._internal.jit_utils import JitTestCase
+
+class TestOptimizeForMobilePreserveDebugInfo(JitTestCase):
+    def check_replacement(
+        self,
+        model,
+        replacements,
+        jit_pass,
+    ):
+        """
+        model: Model which optimization is performed on
+        replacements: Dict mapping from nodes' kinds in the optimized model
+            to the kinds of nodes they replaced in the original model
+        jit_pass: Function to perform optimization
+        """
+
+        original_kinds = set(replacements.values())
+        original_source_ranges = {
+            node.kind(): node.sourceRange()
+            for node in model.graph.nodes()
+            if node.kind() in original_kinds
+        }
+
+        jit_pass(model._c)
+
+        for node in model.graph.nodes():
+            if node.kind() in replacements:
+                self.assertEqual(
+                    node.sourceRange(),
+                    original_source_ranges[replacements[node.kind()]],
+                )
+
+    def test_replace_conv1d_with_conv2d(self):
+        class TestConv1d(torch.nn.Module):
+            def __init__(self, weight, bias):
+                super(TestConv1d, self).__init__()
+                self.weight = weight
+                self.bias = bias
+
+            def forward(self, x):
+                return F.conv1d(x, self.weight, self.bias)
+
+        self.check_replacement(
+            model=torch.jit.script(
+                TestConv1d(
+                    weight=torch.rand(3, 3, 3),
+                    bias=torch.rand(3),
+                ),
+            ),
+            replacements={
+                "prim::ListUnpack": "aten::conv1d",
+                "prim::ListConstruct": "aten::conv1d",
+                "aten::unsqueeze": "aten::conv1d",
+                "aten::conv2d": "aten::conv1d",
+                "aten::squeeze": "aten::conv1d",
+            },
+            jit_pass=torch._C._jit_pass_transform_conv1d_to_conv2d,
+        )
+
+    def test_insert_pre_packed_linear_before_inline_and_conv_2d_op(self):
+        class TestPrepackedLinearBeforeInlineAndConv2dOp(torch.nn.Module):
+            def __init__(
+                self,
+                linear_weight,
+                linear_bias,
+                conv2d_weight,
+                conv2d_bias,
+                conv_transpose2d_weight,
+                conv_transpose2d_bias,
+            ):
+                super(
+                    TestPrepackedLinearBeforeInlineAndConv2dOp,
+                    self,
+                ).__init__()
+                self.linear_weight = linear_weight.float()
+                self.linear_bias = linear_bias.float()
+                self.conv2d_weight = conv2d_weight.float()
+                self.conv2d_bias = conv2d_bias.float()
+                self.conv_transpose2d_weight = conv_transpose2d_weight.float()
+                self.conv_transpose2d_bias = conv_transpose2d_bias.float()
+
+            def forward(self, x):
+                linear_res = F.linear(
+                    x.float(),
+                    self.linear_weight,
+                    self.linear_bias,
+                )
+                conv2d_res = F.conv2d(
+                    input=linear_res.unsqueeze(dim=0).float(),
+                    weight=self.conv2d_weight,
+                    bias=self.conv2d_bias,
+                )
+                return F.conv_transpose2d(
+                    input=conv2d_res,
+                    weight=self.conv_transpose2d_weight,
+                    bias=self.conv_transpose2d_bias,
+                )
+
+        minibatch = 1
+        in_channels = 6
+        iH = 4
+        iW = 5
+        out_channels = 6
+        kH = 2
+        kW = 3
+
+        self.check_replacement(
+            model=torch.jit.script(
+                TestPrepackedLinearBeforeInlineAndConv2dOp(
+                    linear_weight=torch.rand(iW, 3),
+                    linear_bias=torch.rand(iW),
+                    conv2d_weight=torch.rand(out_channels, in_channels, kH, kW),
+                    conv2d_bias=torch.rand(out_channels),
+                    conv_transpose2d_weight=torch.rand(
+                        out_channels,
+                        in_channels,
+                        kH,
+                        kW,
+                    ),
+                    conv_transpose2d_bias=torch.rand(out_channels),
+                ),
+            ),
+            replacements={
+                "prepacked::linear_clamp_prepack": "prim::CallFunction",
+                "prepacked::linear_clamp_run": "prim::CallFunction",
+                "prepacked::conv2d_clamp_prepack": "aten::conv2d",
+                "prepacked::conv2d_clamp_run": "aten::conv2d",
+                "prepacked::conv2d_transpose_clamp_prepack":
+                    "aten::conv_transpose2d",
+                "prepacked::conv2d_transpose_clamp_run":
+                    "aten::conv_transpose2d",
+            },
+            jit_pass=torch._C._jit_pass_insert_prepacked_ops,
+        )
+
+    def test_insert_pre_packed_linear_op(self):
+        self.check_replacement(
+            model=torch.jit.trace(torch.nn.Linear(5, 4), torch.rand(3, 2, 5)),
+            replacements={
+                "prepacked::linear_clamp_prepack": "aten::linear",
+                "prepacked::linear_clamp_run": "aten::linear"
+            },
+            jit_pass=torch._C._jit_pass_insert_prepacked_ops,
+        )
+
+    def run_test_fuse_activation_with_pack_ops_linear_conv2d(
+        self,
+        linear_activation,
+        linear_activation_kind,
+        conv2d_activation,
+        conv2d_activation_kind,
+    ):
+        class TestFuseActivationLinearConv2d(torch.nn.Module):
+            def __init__(
+                self,
+                linear_weight,
+                linear_bias,
+                conv2d_weight,
+                conv2d_bias,
+            ):
+                super(TestFuseActivationLinearConv2d, self).__init__()
+                self.linear_weight = linear_weight
+                self.linear_bias = linear_bias
+                self.conv2d_weight = conv2d_weight
+                self.conv2d_bias = conv2d_bias
+
+            def forward(self, x):
+                x = F.linear(
+                    input=x,
+                    weight=self.linear_weight,
+                    bias=self.linear_bias,
+                )
+                x = linear_activation(x)
+                x = F.conv2d(
+                    input=x.unsqueeze(dim=0),
+                    weight=self.conv2d_weight,
+                    bias=self.conv2d_bias,
+                )
+                return conv2d_activation(x)
+
+        linear_in_features = 5
+        linear_out_features = 4
+        conv2d_in_channels = 3
+        conv2d_out_channels = 4
+        conv2d_kernel = 2
+        x_shape = (3, 2, 5)
+
+        model = torch.jit.trace(
+            TestFuseActivationLinearConv2d(
+                linear_weight=torch.nn.Parameter(
+                    data=torch.rand(
+                        linear_out_features,
+                        linear_in_features,
+                    ),
+                    requires_grad=False,
+                ),
+                linear_bias=torch.nn.Parameter(
+                    data=torch.rand(linear_out_features),
+                    requires_grad=False,
+                ),
+                conv2d_weight=torch.rand(
+                    conv2d_out_channels,
+                    conv2d_in_channels,
+                    conv2d_kernel,
+                    conv2d_kernel,
+                ),
+                conv2d_bias=torch.rand(conv2d_out_channels),
+            ),
+            torch.rand(x_shape),
+        )
+
+        torch._C._jit_pass_insert_prepacked_ops(model._c)
+
+        self.check_replacement(
+            model=model,
+            replacements={
+                "prepacked::linear_clamp_prepack":
+                    "prepacked::linear_clamp_prepack",
+                "prepacked::linear_clamp_run": linear_activation_kind,
+                "prepacked::conv2d_clamp_prepack":
+                    "prepacked::conv2d_clamp_prepack",
+                "prepacked::conv2d_clamp_run": conv2d_activation_kind,
+            },
+            jit_pass=torch._C._jit_pass_fuse_clamp_w_prepacked_linear_conv,
+        )
+
+    def test_fuse_activation_with_pack_ops_linear_conv2d_1(self):
+        self.run_test_fuse_activation_with_pack_ops_linear_conv2d(
+            linear_activation=F.hardtanh,
+            linear_activation_kind="aten::hardtanh",
+            conv2d_activation=F.hardtanh_,
+            conv2d_activation_kind="aten::hardtanh_"
+        )
+
+    def test_fuse_activation_with_pack_ops_linear_conv2d_2(self):
+        self.run_test_fuse_activation_with_pack_ops_linear_conv2d(
+            linear_activation=F.hardtanh_,
+            linear_activation_kind="aten::hardtanh_",
+            conv2d_activation=F.hardtanh,
+            conv2d_activation_kind="aten::hardtanh"
+        )
+
+    def test_fuse_activation_with_pack_ops_linear_conv2d_3(self):
+        self.run_test_fuse_activation_with_pack_ops_linear_conv2d(
+            linear_activation=F.relu,
+            linear_activation_kind="aten::relu",
+            conv2d_activation=F.relu_,
+            conv2d_activation_kind="aten::relu_"
+        )
+
+    def test_fuse_activation_with_pack_ops_linear_conv2d_4(self):
+        self.run_test_fuse_activation_with_pack_ops_linear_conv2d(
+            linear_activation=F.relu_,
+            linear_activation_kind="aten::relu_",
+            conv2d_activation=F.relu,
+            conv2d_activation_kind="aten::relu"
+        )
diff --git a/test/test_jit.py b/test/test_jit.py
index e94ed8db922b0..8d1981d772763 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -61,6 +61,7 @@
 from jit.test_parametrization import TestParametrization  # noqa: F401
 from jit.test_attr import TestGetDefaultAttr  # noqa: F401
 from jit.test_aten_pow import TestAtenPow  # noqa: F401
+from jit.test_optimize_for_mobile_preserve_debug_info import TestOptimizeForMobilePreserveDebugInfo  # noqa: F401
 
 # Torch
 from torch import Tensor
diff --git a/torch/csrc/jit/passes/xnnpack_rewrite.cpp b/torch/csrc/jit/passes/xnnpack_rewrite.cpp
index 11210a4ea05b9..9b2cac6e25f9e 100644
--- a/torch/csrc/jit/passes/xnnpack_rewrite.cpp
+++ b/torch/csrc/jit/passes/xnnpack_rewrite.cpp
@@ -26,8 +26,8 @@ namespace {
 void replaceConv1dWithConv2d(std::shared_ptr<Graph>& graph) {
   std::string conv_1d_pattern = R"(
     graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %groups:int):
-        %r = aten::conv1d(%input, %weight, %bias, %stride, %padding, %dilation, %groups)
-        return (%r) )";
+        %res = aten::conv1d(%input, %weight, %bias, %stride, %padding, %dilation, %groups)
+        return (%res) )";
 
   std::string conv_2d_pattern = R"(
     graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %groups:int):
@@ -47,8 +47,24 @@ void replaceConv1dWithConv2d(std::shared_ptr<Graph>& graph) {
         %output : Tensor = aten::squeeze(%output_2d, %two)
         return (%output) )";
 
+  std::vector<std::pair<std::string, std::string>> value_mappings(
+      {{"zero", "res"},
+       {"one", "res"},
+       {"stride_w", "res"},
+       {"stride_2d", "res"},
+       {"padding_w", "res"},
+       {"padding_2d", "res"},
+       {"dilation_w", "res"},
+       {"dilation_2d", "res"},
+       {"two", "res"},
+       {"input_2d", "res"},
+       {"weight_2d", "res"},
+       {"output_2d", "res"},
+       {"output", "res"}});
+
   SubgraphRewriter rewriter;
-  rewriter.RegisterRewritePattern(conv_1d_pattern, conv_2d_pattern);
+  rewriter.RegisterRewritePattern(
+      conv_1d_pattern, conv_2d_pattern, value_mappings);
   rewriter.runOnGraph(graph);
 }
 
@@ -80,8 +96,8 @@ void insertPrePackedLinearOp(std::shared_ptr<Graph>& graph) {
 
   std::string linear_before_inline = R"(
     graph(%linear, %input, %weight, %bias):
-        %r = prim::CallFunction(%linear, %input, %weight, %bias)
-        return (%r))";
+        %res = prim::CallFunction(%linear, %input, %weight, %bias)
+        return (%res))";
   std::string prepacked_ops_pattern_before_inline = R"(
     graph(%linear, %input, %weight, %bias):
         %output_min_max : None = prim::Constant()
@@ -91,8 +107,8 @@ void insertPrePackedLinearOp(std::shared_ptr<Graph>& graph) {
         return (%res))";
   std::string linear_pattern = R"(
     graph(%input, %weight, %bias):
-        %r = aten::linear(%input, %weight, %bias)
-        return (%r))";
+        %res = aten::linear(%input, %weight, %bias)
+        return (%res))";
   std::string prepacked_ops_pattern = R"(
     graph(%input, %weight, %bias):
         %output_min_max : None = prim::Constant()
@@ -112,13 +128,24 @@ void insertPrePackedLinearOp(std::shared_ptr<Graph>& graph) {
     return false;
   };
 
+  std::vector<std::pair<std::string, std::string>> value_mappings(
+      {{"output_min_max", "res"},
+       {"packed_weight_bias", "res"},
+       {"res", "res"}});
+
   SubgraphRewriter linear_call_fn_rewriter;
   linear_call_fn_rewriter.RegisterRewritePattern(
-      linear_before_inline, prepacked_ops_pattern_before_inline);
+      linear_before_inline,
+      prepacked_ops_pattern_before_inline,
+      value_mappings);
   linear_call_fn_rewriter.runOnGraph(graph, filter);
 
+  value_mappings = {
+      {"output_min_max", "res"}, {"packed_weight_bias", "res"}, {"res", "res"}};
+
   SubgraphRewriter linear_rewriter;
-  linear_rewriter.RegisterRewritePattern(linear_pattern, prepacked_ops_pattern);
+  linear_rewriter.RegisterRewritePattern(
+      linear_pattern, prepacked_ops_pattern, value_mappings);
   linear_rewriter.runOnGraph(graph);
 }
 
@@ -128,8 +155,8 @@ void insertPrePackedConv2dOp(std::shared_ptr<Graph>& graph) {
 
   std::string conv_2d_pattern = R"(
     graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %groups:int):
-        %r = aten::conv2d(%input, %weight, %bias, %stride, %padding, %dilation, %groups)
-        return (%r) )";
+        %res = aten::conv2d(%input, %weight, %bias, %stride, %padding, %dilation, %groups)
+        return (%res) )";
 
   std::string prepacked_ops_conv2d_pattern = R"(
     graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %groups:int):
@@ -137,19 +164,24 @@ void insertPrePackedConv2dOp(std::shared_ptr<Graph>& graph) {
         %packed_weight_bias = prepacked::conv2d_clamp_prepack(
             %weight, %bias, %stride, %padding, %dilation, %groups,
             %output_min_max, %output_min_max)
-        %r = prepacked::conv2d_clamp_run(%input, %packed_weight_bias)
-        return (%r) )";
+        %res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias)
+        return (%res) )";
+
+  std::vector<std::pair<std::string, std::string>> value_mappings(
+      {{"output_min_max", "res"},
+       {"packed_weight_bias", "res"},
+       {"res", "res"}});
 
   SubgraphRewriter rewriter;
   rewriter.RegisterRewritePattern(
-      conv_2d_pattern, prepacked_ops_conv2d_pattern);
+      conv_2d_pattern, prepacked_ops_conv2d_pattern, value_mappings);
   rewriter.runOnGraph(graph);
 
   std::string conv_2d_transpose_pattern = R"(
       graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[],
           %output_padding:int[], %groups:int):
-        %r = aten::conv_transpose2d(%input, %weight, %bias, %stride, %padding, %output_padding, %groups, %dilation)
-        return (%r) )";
+        %res = aten::conv_transpose2d(%input, %weight, %bias, %stride, %padding, %output_padding, %groups, %dilation)
+        return (%res) )";
 
   std::string prepacked_ops_conv2d_transpose_pattern = R"(
     graph(%input, %weight, %bias, %stride:int[], %padding:int[], %dilation:int[], %output_padding:int[], %groups:int):
@@ -157,12 +189,17 @@ void insertPrePackedConv2dOp(std::shared_ptr<Graph>& graph) {
         %packed_weight_bias = prepacked::conv2d_transpose_clamp_prepack(
             %weight, %bias, %stride, %padding, %output_padding, %dilation, %groups,
             %output_min_max, %output_min_max)
-        %r = prepacked::conv2d_transpose_clamp_run(%input, %packed_weight_bias)
-        return (%r) )";
+        %res = prepacked::conv2d_transpose_clamp_run(%input, %packed_weight_bias)
+        return (%res) )";
+
+  value_mappings = {
+      {"output_min_max", "res"}, {"packed_weight_bias", "res"}, {"res", "res"}};
 
   SubgraphRewriter transpose_rewriter;
   transpose_rewriter.RegisterRewritePattern(
-      conv_2d_transpose_pattern, prepacked_ops_conv2d_transpose_pattern);
+      conv_2d_transpose_pattern,
+      prepacked_ops_conv2d_transpose_pattern,
+      value_mappings);
   transpose_rewriter.runOnGraph(graph);
 }
 
@@ -182,8 +219,8 @@ void fuseHardtanhWithPackedOps(std::shared_ptr<Graph>& graph) {
         %packed_weight_bias : __torch__.torch.classes.xnnpack.Conv2dOpContext = prepacked::conv2d_clamp_prepack(
             %weight, %bias, %stride, %padding, %dilation, %groups,
             %output_min, %output_max)
-        %r = prepacked::conv2d_clamp_run(%input, %packed_weight_bias)
-        return (%r) )";
+        %res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias)
+        return (%res) )";
 
   std::string linear_prepack_run_hardtanh = R"(
     graph(%input, %weight, %bias, %output_min, %output_max, %dummy_min_max):
@@ -193,8 +230,13 @@ void fuseHardtanhWithPackedOps(std::shared_ptr<Graph>& graph) {
         %res = aten::hardtanh(%linear_res, %output_min, %output_max)
         return (%res))";
 
+  std::vector<std::pair<std::string, std::string>> value_mappings(
+      {{"packed_weight_bias", "packed_weight_bias"}, {"res", "res"}});
+
   rewriter.RegisterRewritePattern(
-      linear_prepack_run_hardtanh, linear_prepack_run_hardtanh_fused);
+      linear_prepack_run_hardtanh,
+      linear_prepack_run_hardtanh_fused,
+      value_mappings);
 
   std::string conv2d_prepack_run_hardtanh = R"(
     graph(%input, %weight, %bias, %stride:int[], %padding:int[],
@@ -203,11 +245,16 @@ void fuseHardtanhWithPackedOps(std::shared_ptr<Graph>& graph) {
             %weight, %bias, %stride, %padding, %dilation, %groups,
             %dummy_min_max, %dummy_min_max)
         %conv2d_res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias)
-        %r = aten::hardtanh(%conv2d_res, %output_min, %output_max)
-        return (%r) )";
+        %res = aten::hardtanh(%conv2d_res, %output_min, %output_max)
+        return (%res) )";
+
+  value_mappings = {
+      {"packed_weight_bias", "packed_weight_bias"}, {"res", "res"}};
 
   rewriter.RegisterRewritePattern(
-      conv2d_prepack_run_hardtanh, conv2d_prepack_run_hardtanh_fused);
+      conv2d_prepack_run_hardtanh,
+      conv2d_prepack_run_hardtanh_fused,
+      value_mappings);
 
   std::string linear_prepack_run_hardtanh_inplace = R"(
     graph(%input, %weight, %bias, %output_min, %output_max, %dummy_min_max):
@@ -224,13 +271,24 @@ void fuseHardtanhWithPackedOps(std::shared_ptr<Graph>& graph) {
             %weight, %bias, %stride, %padding, %dilation, %groups,
             %dummy_min_max, %dummy_min_max)
         %conv2d_res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias)
-        %r = aten::hardtanh_(%conv2d_res, %output_min, %output_max)
-        return (%r) )";
+        %res = aten::hardtanh_(%conv2d_res, %output_min, %output_max)
+        return (%res) )";
+
+  value_mappings = {
+      {"packed_weight_bias", "packed_weight_bias"}, {"res", "res"}};
 
   rewriter.RegisterRewritePattern(
-      linear_prepack_run_hardtanh_inplace, linear_prepack_run_hardtanh_fused);
+      linear_prepack_run_hardtanh_inplace,
+      linear_prepack_run_hardtanh_fused,
+      value_mappings);
+
+  value_mappings = {
+      {"packed_weight_bias", "packed_weight_bias"}, {"res", "res"}};
+
   rewriter.RegisterRewritePattern(
-      conv2d_prepack_run_hardtanh_inplace, conv2d_prepack_run_hardtanh_fused);
+      conv2d_prepack_run_hardtanh_inplace,
+      conv2d_prepack_run_hardtanh_fused,
+      value_mappings);
 
   rewriter.runOnGraph(graph, torch::jit::graph_rewrite_helper::isClampFusable);
 }
@@ -255,8 +313,8 @@ void fuseReluWithPackedOps(std::shared_ptr<Graph>& graph) {
         %packed_weight_bias : __torch__.torch.classes.xnnpack.Conv2dOpContext = prepacked::conv2d_clamp_prepack(
             %weight, %bias, %stride, %padding, %dilation, %groups,
             %output_min, %output_max)
-        %r = prepacked::conv2d_clamp_run(%input, %packed_weight_bias)
-        return (%r) )";
+        %res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias)
+        return (%res) )";
 
   std::string linear_prepack_run_relu = R"(
     graph(%input, %weight, %bias, %dummy_min_max):
@@ -266,8 +324,14 @@ void fuseReluWithPackedOps(std::shared_ptr<Graph>& graph) {
         %res = aten::relu(%linear_res)
         return (%res))";
 
+  std::vector<std::pair<std::string, std::string>> value_mappings(
+      {{"output_min", "packed_weight_bias"},
+       {"output_max", "packed_weight_bias"},
+       {"packed_weight_bias", "packed_weight_bias"},
+       {"res", "res"}});
+
   rewriter.RegisterRewritePattern(
-      linear_prepack_run_relu, linear_prepack_run_relu_fused);
+      linear_prepack_run_relu, linear_prepack_run_relu_fused, value_mappings);
 
   std::string conv2d_prepack_run_relu = R"(
     graph(%input, %weight, %bias, %stride:int[], %padding:int[],
@@ -276,11 +340,17 @@ void fuseReluWithPackedOps(std::shared_ptr<Graph>& graph) {
             %weight, %bias, %stride, %padding, %dilation, %groups,
             %dummy_min_max, %dummy_min_max)
         %conv2d_res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias)
-        %r = aten::relu(%conv2d_res)
-        return (%r) )";
+        %res = aten::relu(%conv2d_res)
+        return (%res) )";
+
+  value_mappings = {
+      {"output_min", "packed_weight_bias"},
+      {"output_max", "packed_weight_bias"},
+      {"packed_weight_bias", "packed_weight_bias"},
+      {"res", "res"}};
 
   rewriter.RegisterRewritePattern(
-      conv2d_prepack_run_relu, conv2d_prepack_run_relu_fused);
+      conv2d_prepack_run_relu, conv2d_prepack_run_relu_fused, value_mappings);
 
   std::string linear_prepack_run_relu_inplace = R"(
     graph(%input, %weight, %bias, %dummy_min_max):
@@ -297,13 +367,30 @@ void fuseReluWithPackedOps(std::shared_ptr<Graph>& graph) {
             %weight, %bias, %stride, %padding, %dilation, %groups,
             %dummy_min_max, %dummy_min_max)
         %conv2d_res = prepacked::conv2d_clamp_run(%input, %packed_weight_bias)
-        %r = aten::relu_(%conv2d_res)
-        return (%r) )";
+        %res = aten::relu_(%conv2d_res)
+        return (%res) )";
+
+  value_mappings = {
+      {"output_min", "packed_weight_bias"},
+      {"output_max", "packed_weight_bias"},
+      {"packed_weight_bias", "packed_weight_bias"},
+      {"res", "res"}};
 
   rewriter.RegisterRewritePattern(
-      linear_prepack_run_relu_inplace, linear_prepack_run_relu_fused);
+      linear_prepack_run_relu_inplace,
+      linear_prepack_run_relu_fused,
+      value_mappings);
+
+  value_mappings = {
+      {"output_min", "packed_weight_bias"},
+      {"output_max", "packed_weight_bias"},
+      {"packed_weight_bias", "packed_weight_bias"},
+      {"res", "res"}};
+
   rewriter.RegisterRewritePattern(
-      conv2d_prepack_run_relu_inplace, conv2d_prepack_run_relu_fused);
+      conv2d_prepack_run_relu_inplace,
+      conv2d_prepack_run_relu_fused,
+      value_mappings);
   rewriter.runOnGraph(graph, torch::jit::graph_rewrite_helper::isClampFusable);
 }
 

From 4aad366111f88010c9e4027d054da2fe357e1bfe Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Wed, 1 Sep 2021 14:19:21 -0700
Subject: [PATCH 435/530] [Static Runtime] Make per-op latency readable by
 FAI-PEP (#64315)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64315

Add a new flag `generate_ai_pep_output` to `StaticRuntime::benchmark`. If set, produces per-op-kind average total latency in milliseconds in a JSON format recognized by [Facebook AI performance evaluation platform (FAI-PEP)](https://github.com/facebook/FAI-PEP).

This is useful for observing the impact of changes that make a big difference for a specific op, but do not affect the overall SR latency by more than a few percent.

Reviewed By: hlu1

Differential Revision: D30679352

fbshipit-source-id: c847fa6ea20774aaf1e7949b11db4421d1f70b7e
---
 torch/csrc/jit/runtime/static/impl.cpp | 29 +++++++++++++++++++++++++-
 torch/csrc/jit/runtime/static/impl.h   |  3 ++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index e22447819ea67..7697613e79573 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -19,6 +19,11 @@
 #include <torch/csrc/jit/runtime/vararg_functions.h>
 #include <stdexcept>
 
+#ifdef FBCODE_CAFFE2
+#include <folly/dynamic.h>
+#include <folly/json.h>
+#endif
+
 namespace torch {
 namespace jit {
 
@@ -873,12 +878,30 @@ c10::IValue StaticRuntime::operator()(
   return std::move(*outputs_[0]);
 }
 
+namespace {
+
+std::string generate_node_time_json(const std::string& kind, float millis) {
+#ifdef FBCODE_CAFFE2
+  folly::dynamic json = folly::dynamic::object();
+  json["type"] = kind;
+  json["metric"] = "latency";
+  json["unit"] = "ms";
+  json["value"] = millis;
+  return folly::toJson(json);
+#else
+  return "";
+#endif
+}
+
+} // namespace
+
 void StaticRuntime::benchmark(
     const std::vector<c10::IValue>& args,
     const std::unordered_map<std::string, c10::IValue>& kwargs,
     const int warmup_runs,
     const int main_runs,
-    bool print_per_node_time) {
+    bool print_per_node_time,
+    bool generate_ai_pep_output) {
   float time_per_iter = benchmark_model(args, kwargs, warmup_runs, main_runs);
   std::cout << "Static runtime ms per iter: " << time_per_iter
             << ". Iters per second: " << 1000.0 / time_per_iter << std::endl;
@@ -916,6 +939,10 @@ void StaticRuntime::benchmark(
     } else {
       std::cout << ")" << std::endl;
     }
+
+    if (generate_ai_pep_output) {
+      LOG(INFO) << "PyTorchObserver " << generate_node_time_json(kind, ms);
+    }
   }
   std::cout << std::setw(15) << results.total_time << " ms. in Total"
             << std::endl;
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
index d8a99f78cad2d..0d2378760f270 100644
--- a/torch/csrc/jit/runtime/static/impl.h
+++ b/torch/csrc/jit/runtime/static/impl.h
@@ -234,7 +234,8 @@ class TORCH_API StaticRuntime {
       const std::unordered_map<std::string, c10::IValue>& kwargs,
       const int warmup_runs,
       const int main_runs,
-      bool print_per_node_time = false);
+      bool print_per_node_time = false,
+      bool generate_ai_pep_output = false);
 
   float benchmark_model(
       const std::vector<c10::IValue>& args,

From 968d7ee46a66316557a04a64333d2810b544ed9b Mon Sep 17 00:00:00 2001
From: Meghan Lele <meghanl@fb.com>
Date: Wed, 1 Sep 2021 14:24:54 -0700
Subject: [PATCH 436/530] [structured] Preserve computed elements from meta
 func to impl (#61746)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61746

**Summary**
This commit introduces a new feature for structured kernels that allows
kernels to declare quantities as "precomputed" in
`native_functions.yaml`, compute them once in the `meta` function and
reuse them again in the `impl`. The names and types of these quantities
are used to generate code for a struct containing them that the `meta`
function must return. In the case of a handful of surveyed kernels
(`all,`, `any`, `avg_pool2d`), these quantities that are used both in
the `meta` and `impl` have the same meaning as certain kernel arguments
and in fact supersede them. Accordingly, the correspondence between a
kernel argument and the precomputed elements that supersede it is also
captured in `native_functions.yaml`. This information is used to unpack
the struct returned by `meta` and pass its contents correctly to the
`impl` function.

The primary goal is to avoid recompute and enhance developer experience
(e.g. sometimes people can forget to compute these elements while
porting a kernel).

Test Plan: Imported from OSS

Reviewed By: tugsbayasgalan

Differential Revision: D30407831

Pulled By: SplitInfinity

fbshipit-source-id: 00975525ea373721fe52d06f75cd4ac91f3dc556
---
 aten/src/ATen/TensorMeta.h                  |  10 ++
 aten/src/ATen/native/AveragePool2d.cpp      | 120 ++++++++++++--------
 aten/src/ATen/native/Pool.h                 |   7 +-
 aten/src/ATen/native/ReduceOps.cpp          |  13 ++-
 aten/src/ATen/native/cpu/AvgPoolKernel.cpp  |  18 +--
 aten/src/ATen/native/cuda/AveragePool2d.cu  | 109 ++++++++++--------
 aten/src/ATen/native/native_functions.yaml  |   8 ++
 tools/codegen/api/structured.py             |  22 +++-
 tools/codegen/dest/register_dispatch_key.py |  24 +++-
 tools/codegen/gen.py                        |  91 ++++++++++++++-
 tools/codegen/model.py                      |  52 +++++++++
 11 files changed, 361 insertions(+), 113 deletions(-)

diff --git a/aten/src/ATen/TensorMeta.h b/aten/src/ATen/TensorMeta.h
index ac295ec9bde79..6a5491ab3d50b 100644
--- a/aten/src/ATen/TensorMeta.h
+++ b/aten/src/ATen/TensorMeta.h
@@ -26,6 +26,16 @@ namespace impl {
 #define TORCH_META_FUNC(name) void structured_##name::meta
 #define TORCH_META_FUNC2(name, overload) void structured_##name##_##overload::meta
 
+// These are versions of TORCH_META_FUNC(2) that include a precompute_out struct as a return value.
+// They should be used when the kernel in question has precomputed values declared in native_functions.yaml and
+// the corresponding implementation should return an instance of the aforementioned struct.
+#define TORCH_PRECOMPUTE_META_FUNC(name) structured_##name::meta_return_ty structured_##name::meta
+#define TORCH_PRECOMPUTE_META_FUNC2(name, overload) structured_##name##_##overload::meta_return_ty structured_##name##_##overload::meta
+
+// Use this to create a precompute struct in a meta function.
+#define TORCH_PRECOMPUTE_STRUCT(name) structured_##name::precompute_out<>
+#define TORCH_PRECOMPUTE_STRUCT2(name, overload) structured_##name##_##overload::precompute_out<>
+
 // Use this to define the prototype for an implementation.  This takes only
 // one argument, which is the name of the dispatch key entry you're
 // implementing.
diff --git a/aten/src/ATen/native/AveragePool2d.cpp b/aten/src/ATen/native/AveragePool2d.cpp
index 2693cc6ba49c5..8f264c007c6be 100644
--- a/aten/src/ATen/native/AveragePool2d.cpp
+++ b/aten/src/ATen/native/AveragePool2d.cpp
@@ -8,59 +8,81 @@ namespace at {
 namespace meta{
 using namespace native;
 
-TORCH_META_FUNC(avg_pool2d) (
-  const Tensor& input,
-  IntArrayRef kernel_size,
-  IntArrayRef stride,
-  IntArrayRef padding,
-  bool ceil_mode,
-  bool count_include_pad,
-  c10::optional<int64_t> divisor_override
-) {
+TORCH_PRECOMPUTE_META_FUNC(avg_pool2d)
+(const Tensor& input,
+ IntArrayRef kernel_size,
+ IntArrayRef stride,
+ IntArrayRef padding,
+ bool ceil_mode,
+ bool count_include_pad,
+ c10::optional<int64_t> divisor_override) {
   // #20866, #22032: Guarantee this for the official C++ API?
   TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
     "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints");
-  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
-  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
+  const int64_t kH = kernel_size[0];
+  const int64_t kW = kernel_size.size() == 1 ? kH : kernel_size[1];
 
   TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
     "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints");
-  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
-  const int dW = stride.empty() ? kW :
-                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
+  const int64_t dH = stride.empty() ? kH : stride[0];
+  const int64_t dW = stride.empty() ? kW : stride.size() == 1 ? dH : stride[1];
 
   TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
     "avg_pool2d: padding must either be a single int, or a tuple of two ints");
-  const int padH = safe_downcast<int, int64_t>(padding[0]);
-  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
+  const int64_t padH = padding[0];
+  const int64_t padW = padding.size() == 1 ? padH : padding[1];
 
   TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,
     "divisor must be not zero");
 
-  /* sizes */
   const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
   const int64_t nInputPlane = input.size(-3);
   const int64_t inputHeight = input.size(-2);
   const int64_t inputWidth = input.size(-1);
 
-  const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, 1, ceil_mode);
-  const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, 1, ceil_mode);
+  const int64_t outputHeight = pooling_output_shape<int64_t>(
+      inputHeight, kH, padH, dH, 1, ceil_mode);
+  const int64_t outputWidth =
+      pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, 1, ceil_mode);
 
   auto memory_format = input.suggest_memory_format();
   pool2d_shape_check(
-    input,
-    kH, kW, dH, dW, padH, padW, 1, 1,
-    nInputPlane,
-    inputHeight, inputWidth,
-    outputHeight, outputWidth, memory_format);
+      input,
+      kH,
+      kW,
+      dH,
+      dW,
+      padH,
+      padW,
+      1,
+      1,
+      nInputPlane,
+      inputHeight,
+      inputWidth,
+      outputHeight,
+      outputWidth,
+      memory_format);
 
   /* resize output */
   if (input.ndimension() == 3) {
-    set_output(0, {nInputPlane, outputHeight, outputWidth}, input.options());
+    set_output(
+        0,
+        {nInputPlane,
+         outputHeight,
+         outputWidth},
+        input.options());
   }
   else {
-    set_output(0, {nbatch, nInputPlane, outputHeight, outputWidth}, input.options().memory_format(memory_format));
+    set_output(
+        0,
+        {nbatch,
+         nInputPlane,
+         outputHeight,
+         outputWidth},
+        input.options().memory_format(memory_format));
   }
+
+  return TORCH_PRECOMPUTE_STRUCT(avg_pool2d)().set_kH(kH).set_kW(kW).set_dH(dH).set_dW(dW).set_padH(padH).set_padW(padW);
 }
 
 TORCH_META_FUNC(avg_pool2d_backward) (
@@ -119,30 +141,30 @@ TORCH_META_FUNC(avg_pool2d_backward) (
 
 namespace native {
 
-TORCH_IMPL_FUNC(avg_pool2d_out_cpu) (
-  const Tensor &input,
-  IntArrayRef kernel_size,
-  IntArrayRef stride,
-  IntArrayRef padding,
-  bool ceil_mode,
-  bool count_include_pad,
-  c10::optional<int64_t> divisor_override,
-  const Tensor &output
-) {
-  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
-  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
-
-  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
-  const int dW = stride.empty() ? kW :
-                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
-
-  const int padH = safe_downcast<int, int64_t>(padding[0]);
-  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
-
+TORCH_IMPL_FUNC(avg_pool2d_out_cpu)
+(const Tensor& input,
+ int64_t kH,
+ int64_t kW,
+ int64_t dH,
+ int64_t dW,
+ int64_t padH,
+ int64_t padW,
+ bool ceil_mode,
+ bool count_include_pad,
+ c10::optional<int64_t> divisor_override,
+ const Tensor& output) {
   avg_pool2d_kernel(
-      kCPU, output, input,
-      kW, kH, dW, dH, padW, padH,
-      count_include_pad, divisor_override);
+      kCPU,
+      output,
+      input,
+      kW,
+      kH,
+      dW,
+      dH,
+      padW,
+      padH,
+      count_include_pad,
+      divisor_override);
 }
 
 TORCH_IMPL_FUNC(avg_pool2d_backward_out_cpu) (
diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h
index 5fe979df2c953..da774911b5737 100644
--- a/aten/src/ATen/native/Pool.h
+++ b/aten/src/ATen/native/Pool.h
@@ -16,10 +16,13 @@ DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel);
 DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel);
 
 // averge pooling has same signature for forward and backward
-using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH,
+using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH,
+    int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, c10::optional<int64_t> divisor_override);
+using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH,
     int dW, int dH, int padW, int padH, bool count_include_pad, c10::optional<int64_t> divisor_override);
+
 DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel);
-DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_backward_kernel);
+DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel);
 
 namespace {
 
diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp
index 6e5a1532bd8d1..620908b5b79bf 100644
--- a/aten/src/ATen/native/ReduceOps.cpp
+++ b/aten/src/ATen/native/ReduceOps.cpp
@@ -109,16 +109,18 @@ void check_all_any(const char* name, const Tensor& self, const Tensor& result) {
   }
 }
 
-TORCH_META_FUNC2(all, dim)(const Tensor& self, int64_t dim, bool keepdim) {
+TORCH_PRECOMPUTE_META_FUNC2(all, dim)(const Tensor& self, int64_t dim, bool keepdim) {
   check_all_any("all", self, maybe_get_output());
   auto out_dtype = get_result_or_bytebool_dtype(self, maybe_get_output());
   resize_reduction(*this, self, dim, keepdim, out_dtype);
+  return TORCH_PRECOMPUTE_STRUCT2(all, dim)().set_dim(maybe_wrap_dim(dim, self.dim()));
 }
 
-TORCH_META_FUNC2(any, dim)(const Tensor& self, int64_t dim, bool keepdim) {
+TORCH_PRECOMPUTE_META_FUNC2(any, dim)(const Tensor& self, int64_t dim, bool keepdim) {
   check_all_any("any", self, maybe_get_output());
   auto out_dtype = get_result_or_bytebool_dtype(self, maybe_get_output());
   resize_reduction(*this, self, dim, keepdim, out_dtype);
+  return TORCH_PRECOMPUTE_STRUCT2(any, dim)().set_dim(maybe_wrap_dim(dim, self.dim()));
 }
 
 void check_argmax_argmin(
@@ -1338,7 +1340,6 @@ Tensor all(const Tensor& self) {
 
 TORCH_IMPL_FUNC(all_out)
 (const Tensor& self, int64_t dim, bool keepdim, const Tensor& result) {
-  dim = maybe_wrap_dim(dim, self.dim());
   auto iter = get_allany_iter(self, result, dim, keepdim);
   auto mut_result = const_cast<Tensor&>(result);
   if (!_dimreduce_return_trivial(mut_result, self, 1, dim, keepdim)) {
@@ -1370,8 +1371,10 @@ Tensor any(const Tensor& self) {
 }
 
 TORCH_IMPL_FUNC(any_out)
-(const Tensor& self, int64_t dim, bool keepdim, const Tensor& result) {
-  dim = maybe_wrap_dim(dim, self.dim());
+(const Tensor& self,
+ int64_t dim,
+ bool keepdim,
+ const Tensor& result) {
   auto iter = get_allany_iter(self, result, dim, keepdim);
   auto mut_result = const_cast<Tensor&>(result);
   if (!_dimreduce_return_trivial(mut_result, self, 0, dim, keepdim)) {
diff --git a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp
index 2aa075f5933bd..2bee0206ff6b5 100644
--- a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp
+++ b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp
@@ -14,9 +14,9 @@ template <typename scalar_t>
 void cpu_avg_pool(
     const Tensor& output_,
     const Tensor& input_,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
+    int64_t kW, int64_t kH,
+    int64_t dW, int64_t dH,
+    int64_t padW, int64_t padH,
     bool count_include_pad,
     c10::optional<int64_t> divisor_override) {
   auto input = input_.contiguous();
@@ -98,9 +98,9 @@ template <typename scalar_t>
 void cpu_avg_pool_channels_last(
     const Tensor& output_,
     const Tensor& input_,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
+    int64_t kW, int64_t kH,
+    int64_t dW, int64_t dH,
+    int64_t padW, int64_t padH,
     bool count_include_pad,
     c10::optional<int64_t> divisor_override) {
   TORCH_CHECK(input_.ndimension() == 4,
@@ -359,9 +359,9 @@ void cpu_avg_pool_backward_channels_last(
 void avg_pool2d_kernel_impl(
     const Tensor& output,
     const Tensor& input,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
+    int64_t kW, int64_t kH,
+    int64_t dW, int64_t dH,
+    int64_t padW, int64_t padH,
     bool count_include_pad,
     c10::optional<int64_t> divisor_override) {
   switch (input.suggest_memory_format()) {
diff --git a/aten/src/ATen/native/cuda/AveragePool2d.cu b/aten/src/ATen/native/cuda/AveragePool2d.cu
index 5de3adc08bee8..df9fcfef64167 100644
--- a/aten/src/ATen/native/cuda/AveragePool2d.cu
+++ b/aten/src/ATen/native/cuda/AveragePool2d.cu
@@ -232,30 +232,31 @@ __global__ void avg_pool2d_backward_out_cuda_frame_nhwc(const int nthreads,
 
 } // anonymous namespace
 
-TORCH_IMPL_FUNC(avg_pool2d_out_cuda) (
-  const Tensor& input_,
-  IntArrayRef kernel_size,
-  IntArrayRef stride,
-  IntArrayRef padding,
-  bool ceil_mode,
-  bool count_include_pad,
-  c10::optional<int64_t> divisor_override,
-  const Tensor& output
-) {
+TORCH_IMPL_FUNC(avg_pool2d_out_cuda)
+(const Tensor& input_,
+ int64_t kH_,
+ int64_t kW_,
+ int64_t dH_,
+ int64_t dW_,
+ int64_t padH_,
+ int64_t padW_,
+ bool ceil_mode,
+ bool count_include_pad,
+ c10::optional<int64_t> divisor_override,
+ const Tensor& output) {
   TensorArg output_arg{ output, "output", 1 };
   TensorArg input_arg{ input_, "input_", 2 };
 
   checkAllSameGPU("avg_pool2d_out_cuda", {output_arg, input_arg});
 
-  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
-  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
+  const int kH = safe_downcast<int, int64_t>(kH_);
+  const int kW = safe_downcast<int, int64_t>(kW_);
 
-  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
-  const int dW = stride.empty() ? kW :
-                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
+  const int dH = safe_downcast<int, int64_t>(dH_);
+  const int dW = safe_downcast<int, int64_t>(dW_);
 
-  const int padH = safe_downcast<int, int64_t>(padding[0]);
-  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
+  const int padH = safe_downcast<int, int64_t>(padH_);
+  const int padW = safe_downcast<int, int64_t>(padW_);
 
   /* sizes */
   const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
@@ -263,8 +264,8 @@ TORCH_IMPL_FUNC(avg_pool2d_out_cuda) (
   const int64_t inputHeight = input_.size(-2);
   const int64_t inputWidth = input_.size(-1);
 
-  const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, 1, ceil_mode);
-  const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, 1, ceil_mode);
+  int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, 1, ceil_mode);
+  int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, 1, ceil_mode);
   const auto memory_format = input_.suggest_memory_format();
 
   Tensor input = input_.contiguous(memory_format);
@@ -289,37 +290,55 @@ TORCH_IMPL_FUNC(avg_pool2d_out_cuda) (
           case MemoryFormat::ChannelsLast: {
             output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::ChannelsLast);
             avg_pool2d_out_cuda_frame_nhwc<scalar_t, accscalar_t>
-                <<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
-                  count,
-                  input_data,
-                  nbatch,
-                  nInputPlane,
-                  inputHeight, inputWidth,
-                  outputHeight, outputWidth,
-                  kH, kW,
-                  dH, dW,
-                  padH, padW,
-                  output_data,
-                  divisor_override_value,
-                  count_include_pad, use_divisor);
+                <<<num_blocks,
+                   num_threads,
+                   0,
+                   at::cuda::getCurrentCUDAStream()>>>(
+                    count,
+                    input_data,
+                    nbatch,
+                    nInputPlane,
+                    inputHeight,
+                    inputWidth,
+                    outputHeight,
+                    outputWidth,
+                    kH,
+                    kW,
+                    dH,
+                    dW,
+                    padH,
+                    padW,
+                    output_data,
+                    divisor_override_value,
+                    count_include_pad,
+                    use_divisor);
             C10_CUDA_KERNEL_LAUNCH_CHECK();
             break;
           }
           case MemoryFormat::Contiguous: {
             avg_pool2d_out_cuda_frame<scalar_t, accscalar_t>
-              <<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
-                count,
-                input_data,
-                nbatch,
-                nInputPlane,
-                inputHeight, inputWidth,
-                outputHeight, outputWidth,
-                kH, kW,
-                dH, dW,
-                padH, padW,
-                output_data,
-                divisor_override_value,
-                count_include_pad, use_divisor);
+                <<<num_blocks,
+                   num_threads,
+                   0,
+                   at::cuda::getCurrentCUDAStream()>>>(
+                    count,
+                    input_data,
+                    nbatch,
+                    nInputPlane,
+                    inputHeight,
+                    inputWidth,
+                    outputHeight,
+                    outputWidth,
+                    kH,
+                    kW,
+                    dH,
+                    dW,
+                    padH,
+                    padW,
+                    output_data,
+                    divisor_override_value,
+                    count_include_pad,
+                    use_divisor);
             C10_CUDA_KERNEL_LAUNCH_CHECK();
             break;
           }
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 688763ea39c13..fae433cd6aae6 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -487,6 +487,8 @@
 - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured: True
+  precomputed:
+  - dim -> int dim
   dispatch:
     CPU, CUDA: all_out
 
@@ -508,6 +510,8 @@
 - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured: True
+  precomputed:
+  - dim -> int dim
   dispatch:
     CPU, CUDA: any_out
 
@@ -8816,6 +8820,10 @@
 - func: avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   structured: True
+  precomputed:
+  - kernel_size -> int kH, int kW
+  - stride -> int dH, int dW
+  - padding -> int padH, int padW
   dispatch:
     CPU: avg_pool2d_out_cpu
     CUDA: avg_pool2d_out_cuda
diff --git a/tools/codegen/api/structured.py b/tools/codegen/api/structured.py
index 4f1437fb6f3ff..6aab794413c64 100644
--- a/tools/codegen/api/structured.py
+++ b/tools/codegen/api/structured.py
@@ -84,7 +84,27 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> List[B
 
 def impl_arguments(g: NativeFunctionsGroup) -> List[Binding]:
     args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
-    args.extend(g.out.func.arguments.non_out)
+
+    if g.out.precomputed:
+        # A list of parameters for the impl function with
+        # certain parameters replaced with precomputed counterparts
+        # as specified in native_functions.yaml.
+        non_out_args_replaced: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = []
+
+        for a in g.out.func.arguments.non_out:
+            if isinstance(a, Argument) and a.name in g.out.precomputed.replace:
+                # If a is in precompute.replace, append the parameters
+                # that should replace it onto non_out_args_replaced.
+                for replacement in g.out.precomputed.replace[a.name]:
+                    non_out_args_replaced.append(replacement)
+            else:
+                # If not, push a as it is.
+                non_out_args_replaced.append(a)
+
+        args.extend(non_out_args_replaced)
+    else:
+        args.extend(g.out.func.arguments.non_out)
+
     args.extend(g.out.func.arguments.out)
     return [r for arg in args for r in argument(arg)]
 
diff --git a/tools/codegen/dest/register_dispatch_key.py b/tools/codegen/dest/register_dispatch_key.py
index 784ee56e765fb..ec3a2e6afc0b1 100644
--- a/tools/codegen/dest/register_dispatch_key.py
+++ b/tools/codegen/dest/register_dispatch_key.py
@@ -584,7 +584,29 @@ def generate_defn(cpp_sig: CppSignature) -> str:
                     method=False
                 )
             )
-            sig_body.append(f"op.meta({meta_exprs});")
+
+            if self.g.out.precomputed:
+                # If this function group has precomputed elements, the meta function
+                # returns a struct containing them which must be saved so that it
+                # can be unpacked when generating code to call the impl.
+                sig_body.append(f"auto precompute = op.meta({meta_exprs});")
+
+                # Put all of the contents of the precompute struct into the context
+                # so that translate will be able to return the correct args for the
+                # call to the impl.
+                for precomputed_elems in self.g.out.precomputed.replace.values():
+                    for arg in precomputed_elems:
+                        context.append(Expr(
+                            expr=f"precompute.{arg.name}",
+                            type=structured.argument_type(arg, binds=arg.name),
+                        ))
+
+                # Add a use of the precompute struct so FB internal compilers don't
+                # complain that there is an unused variable.
+                sig_body.append("(void)precompute;")
+            else:
+                sig_body.append(f"op.meta({meta_exprs});")
+
 
             # After running meta, op.outputs_ is guaranteed to be valid;
             # add it to the context
diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py
index 203b5a99c356c..c986f8311604d 100644
--- a/tools/codegen/gen.py
+++ b/tools/codegen/gen.py
@@ -456,9 +456,98 @@ def compute_meta_function_declaration(g: NativeFunctionsGroup) -> Optional[str]:
         parent_class = g.out.structured_inherits
         if parent_class is None:
             parent_class = "at::impl::MetaBase"
+        meta_return = "void"
+        precomputed = g.out.precomputed if g.structured else None
+
+        if precomputed:
+            # Generate the template declaration with one bool parameter for each
+            # precomputed element. Each parameter is true if the corresponding (in
+            # terms of position) precomputed element has been set.
+            precomputed_elements = [elem for replace_list in precomputed.replace.values() for elem in replace_list]
+            precomputed_template_parameters = [elem.name.upper() for elem in precomputed_elements]
+            precomputed_template_params_str = ", ".join(f"bool {param} = false" for param in precomputed_template_parameters)
+            precompute_template_decl = f"template <{precomputed_template_params_str}>"
+
+            # Generate a string containing declarations of all precomputed elements.
+            precomputed_elements_with_cpp_types = [
+                structured.argument_type(elem, binds=elem.name)
+                for elem in precomputed_elements
+            ]
+
+            precomputed_elements_decl = ";\n".join(
+                f"{elem.cpp_type(strip_ref=True)} {elem.name}" for elem in precomputed_elements_with_cpp_types
+            )
+
+            # Generate "setter" methods for each precomputed element. Each method will return
+            # a new instance of precompute_out with the template parameter that corresponds to
+            # the member set by the method to true (to indicate that it has been set).
+            setter_methods = []
+            for i, elem in enumerate(precomputed_elements):
+                # Generate the signature. The return type will be the same
+                # as the type of `this` but with the template parameter
+                # corresponding to the element set by this method set to true.
+                # The assert generated below will ensure that this template
+                # parameter is false on the type of `this`.
+                return_ty_templates = ", ".join(
+                    precomputed_template_parameters[:i] + ["true"] + precomputed_template_parameters[i + 1:]
+                )
+                return_ty = f"precompute_out<{return_ty_templates}>"
+                elem_cpp_ty = precomputed_elements_with_cpp_types[i].cpp_type(strip_ref=True)
+                signature = f"{return_ty} set_{elem.name}({elem_cpp_ty} value)"
+
+                # Generate an assert which checks that the
+                # template parameter corresponding to the precomputed
+                # element that is set by this method is false on the
+                # class corresponding to the object that `this` points to.
+                # This ensures that each element can be set only once.
+                assert_msg = f"\"{precomputed_elements[i].name} already set\""
+                assert_stmt = f"static_assert({precomputed_template_parameters[i]} == false, {assert_msg});"
+
+                # Generate the new object construction block. All state
+                # except the element that this method sets is copied from the
+                # object that `this` points to. The value for the element that
+                # the method sets is taken from a method parameter.
+                construction_stmts = []
+                construction_stmts.append(f"{return_ty} ret;")
+
+                for j, elem in enumerate(precomputed_elements):
+                    if i == j:
+                        construction_stmts.append(f"ret.{elem.name} = value;")
+                    else:
+                        construction_stmts.append(f"ret.{elem.name} = this->{elem.name};")
+
+                construction_stmts.append("return ret;")
+                construction_block = "\n".join(construction_stmts)
+
+                setter_methods.append(f"""
+                    {signature} {{
+                        {assert_stmt}
+                        {construction_block}
+                    }}
+                """)
+            setter_methods_decl = "\n".join(setter_methods)
+
+            # Meta should return an instance of the struct containing the precomputed elements.
+            meta_return_template_params = ", ".join(["true"] * len(precomputed_template_parameters))
+            # This typedef (actually a using statement) is needed so that TORCH_META_FUNC can reuse the return
+            # type (which has a variable number of template parameters).
+            meta_return_typedef = f"using meta_return_ty = precompute_out <{meta_return_template_params}>;"
+            meta_return = "meta_return_ty"
+            precomputed_decl = f"""
+                {precompute_template_decl}
+                struct TORCH_API precompute_out {{
+                    {setter_methods_decl}
+                    {precomputed_elements_decl};
+            }};"""
+        else:
+            meta_return_typedef = ""
+            precomputed_decl = ""
+
         return f"""\
 struct TORCH_API structured_{name} : public {parent_class} {{
-    void meta({args_str});
+    {precomputed_decl}
+    {meta_return_typedef}
+    {meta_return} meta({args_str});
 }};
 """
 
diff --git a/tools/codegen/model.py b/tools/codegen/model.py
index 4f82b70ee31f2..e604e72d3a1ad 100644
--- a/tools/codegen/model.py
+++ b/tools/codegen/model.py
@@ -229,6 +229,14 @@ class NativeFunction:
     # changes the semantics of set_output to call the parent class.
     structured_inherits: Optional[str]
 
+    # Structured kernels can declare elements as "precomputed". These elements
+    # are returned by the meta function in one struct and passed to the impl
+    # function in lieu of certain kernel arguments that these precomputed
+    # elements supersede. Information about the names and types of these
+    # precomputed elements and how they correspond to kernel arguments is stored
+    # in this member, if applicable.
+    precomputed: Optional['Precompute']
+
     # Argument names whose default  should be excluded from the C++ interface.
     # Intended for resolving overload ambiguities between signatures.
     cpp_no_default_args: Set[str]
@@ -320,6 +328,10 @@ def from_yaml(
         category_override = e.pop('category_override', None)
         assert category_override is None or isinstance(category_override, str), f'not a str: {category_override}'
 
+        precomputed_dict = e.pop('precomputed', None)
+        assert precomputed_dict is None or structured is True
+        precomputed = Precompute.parse(precomputed_dict) if precomputed_dict else None
+
         from tools.codegen.api import cpp
 
         raw_dispatch = e.pop('dispatch', None)
@@ -389,6 +401,7 @@ def from_yaml(
             structured=structured,
             structured_delegate=structured_delegate,
             structured_inherits=structured_inherits,
+            precomputed=precomputed,
             manual_kernel_registration=manual_kernel_registration,
             manual_cpp_binding=manual_cpp_binding,
             python_module=python_module,
@@ -1496,3 +1509,42 @@ def parse_returns(return_decl: str) -> Tuple[Return, ...]:
     if return_decl[0] == '(' and return_decl[-1] == ')':
         return_decl = return_decl[1:-1]
     return tuple(Return.parse(arg) for arg in return_decl.split(', '))
+
+
+# A Precompute instance consists of a map from kernel argument name
+# to the list of Argument instances that should replace that
+# kernel argument in the impl function.
+@dataclass(frozen=True)
+class Precompute:
+    # A map from kernel argument name -> a list of precomputed
+    # elements that replaces/supersedes it.
+    replace: Dict[str, List[Argument]]
+
+    @staticmethod
+    def parse(src: object) -> 'Precompute':
+        assert isinstance(src, list)
+
+        # src is a list of strings of the format:
+        #   {kernel param name} -> {replacement decl}[, {replacement decl}, ...]
+        # Parse this list to get the names of which precomputed elements
+        # should replace which kernel arguments.
+        replace = {}
+        for raw_replace_item in src:
+            assert isinstance(raw_replace_item, str)
+
+            arg, with_list_raw = raw_replace_item.split(' -> ')
+            with_list = with_list_raw.split(',')
+            with_list_args = [Argument.parse(name.strip()) for name in with_list]
+            replace[arg] = with_list_args
+
+        r = Precompute(replace=replace)
+        assert r.to_list() == src, 'r.to_list() != src'
+        return r
+
+    def to_list(self) -> List[str]:
+        replace_list = []
+        for kernel_param, replacement_params in self.replace.items():
+            replacements = ', '.join(str(param) for param in replacement_params)
+            replace_list.append(f'{kernel_param} -> {replacements}')
+
+        return replace_list

From 9495674905053b67247e8e809d5c088f4fa62abc Mon Sep 17 00:00:00 2001
From: Yuchen Huang <hyc@fb.com>
Date: Wed, 1 Sep 2021 14:48:00 -0700
Subject: [PATCH 437/530] [xplat][metal] Add getters and setters for ivars in
 Conv2dOpContext (#57395)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57395

As title
ghstack-source-id: 137223806

(Note: this ignores all push blocking failures!)

Test Plan:
### Lib Build
- `buck build caffe2:aten_metal_prepack`

### Integration Test
- `arc focus2 pp-ops -a ModelRunner`
- Click "Test Person/Hair Segmentation Model"

{F612831435}

- Image Classification Demo

{F614144868}

Reviewed By: xta0

Differential Revision: D28132020

fbshipit-source-id: 73560263a9d14e9ecfa39c69deb158a2ed8cb179
---
 .../ATen/native/metal/MetalPrepackOpContext.h | 107 +++++++++++++-----
 .../ATen/native/metal/ops/MetalConvolution.mm |  28 ++---
 2 files changed, 92 insertions(+), 43 deletions(-)

diff --git a/aten/src/ATen/native/metal/MetalPrepackOpContext.h b/aten/src/ATen/native/metal/MetalPrepackOpContext.h
index e6b3f0b78a518..5976d7af23e53 100644
--- a/aten/src/ATen/native/metal/MetalPrepackOpContext.h
+++ b/aten/src/ATen/native/metal/MetalPrepackOpContext.h
@@ -21,14 +21,14 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder {
  public:
   SerializationTypeConv2dPrePack pack() {
     return std::make_tuple(
-        weight,
-        bias,
-        stride,
-        padding,
-        dilation,
-        groups,
-        output_min,
-        output_max);
+        weight_,
+        bias_,
+        stride_,
+        padding_,
+        dilation_,
+        groups_,
+        output_min_,
+        output_max_);
   }
   Conv2dOpContext() = delete;
   Conv2dOpContext(
@@ -40,32 +40,81 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder {
       int64_t groups,
       const c10::optional<Scalar>& output_min,
       const c10::optional<Scalar>& output_max)
-      : weight(std::move(weight)),
-        bias(std::move(bias)),
-        stride(stride),
-        padding(padding),
-        dilation(dilation),
-        groups(groups),
-        output_min(output_min),
-        output_max(output_max) {}
+      : weight_(std::move(weight)),
+        bias_(std::move(bias)),
+        stride_(stride),
+        padding_(padding),
+        dilation_(dilation),
+        groups_(groups),
+        output_min_(output_min),
+        output_max_(output_max) {}
 
   void release_resources() override {
-    if (releaseCallback) {
-      releaseCallback(conv2dOp);
-      conv2dOp = nullptr;
+    if (releaseCallback_) {
+      releaseCallback_(conv2dOp_);
+      conv2dOp_ = nullptr;
     }
   }
 
-  Tensor weight;
-  c10::optional<Tensor> bias;
-  std::vector<int64_t> stride;
-  std::vector<int64_t> padding;
-  std::vector<int64_t> dilation;
-  int64_t groups;
-  c10::optional<Scalar> output_min;
-  c10::optional<Scalar> output_max;
-  void* conv2dOp = nullptr; // reserved to hold MPSCNNConv2dOp objects
-  std::function<void(void*)> releaseCallback = nullptr;
+  const Tensor& get_weight() const {
+    return weight_;
+  }
+
+  const c10::optional<Tensor>& get_bias() const {
+    return bias_;
+  }
+
+  const std::vector<int64_t>& get_stride() const {
+    return stride_;
+  }
+
+  const std::vector<int64_t>& get_padding() const {
+    return padding_;
+  }
+
+  const std::vector<int64_t>& get_dilation() const {
+    return dilation_;
+  }
+
+  int64_t get_groups() const {
+    return groups_;
+  }
+
+  const c10::optional<Scalar>& get_output_min() const {
+    return output_min_;
+  }
+
+  const c10::optional<Scalar>& get_output_max() const {
+    return output_max_;
+  }
+
+  void set_conv2dOpPtr(void* ptr) {
+      conv2dOp_ = ptr;
+  }
+
+  void* get_conv2dOpPtr() const {
+    return conv2dOp_;
+  }
+
+  void set_releaseCallback(const std::function<void(void*)>& func) {
+    releaseCallback_ = func;
+  }
+
+  std::function<void(void*)>& get_releaseCallback() {
+     return releaseCallback_;
+  }
+
+  private:
+    Tensor weight_;
+    c10::optional<Tensor> bias_;
+    std::vector<int64_t> stride_;
+    std::vector<int64_t> padding_;
+    std::vector<int64_t> dilation_;
+    int64_t groups_;
+    c10::optional<Scalar> output_min_;
+    c10::optional<Scalar> output_max_;
+    std::function<void(void*)> releaseCallback_ = nullptr;
+    void* conv2dOp_ = nullptr; // reserved to hold MPSCNNConv2dOp objects
 };
 
 using SerializationTypeLinearPrePack = std::tuple<
diff --git a/aten/src/ATen/native/metal/ops/MetalConvolution.mm b/aten/src/ATen/native/metal/ops/MetalConvolution.mm
index c726382dde45f..4f07f5f77161d 100644
--- a/aten/src/ATen/native/metal/ops/MetalConvolution.mm
+++ b/aten/src/ATen/native/metal/ops/MetalConvolution.mm
@@ -55,28 +55,28 @@ Tensor conv2d(
 Tensor conv2d(const Tensor& input, Conv2dOpContext& context) {
   MPSImage* X = imageFromTensor(input);
   Conv2DParams params{input.sizes(),
-                      context.weight.sizes(),
-                      context.padding,
-                      context.stride,
-                      context.dilation,
-                      context.groups};
+                      context.get_weight().sizes(),
+                      context.get_padding(),
+                      context.get_stride(),
+                      context.get_dilation(),
+                      context.get_groups()};
   auto outputSize = params.output_sizes();
   if(c10::multiply_integers(outputSize) == 0){
     return makeTensor({outputSize}, input.options());
   }
-  MPSCNNConvOp* op = (__bridge MPSCNNConvOp*)(context.conv2dOp);
-  NeuronType nt = neuronType(context.output_min, context.output_max);
+  MPSCNNConvOp* op = (__bridge MPSCNNConvOp*)(context.get_conv2dOpPtr());
+  NeuronType nt = neuronType(context.get_output_min(), context.get_output_max());
   if (!op) {
-    float* w = context.weight.data_ptr<float>();
-    float* b = context.bias.has_value() ? ((*context.bias).data_ptr<float>())
+    float* w = context.get_weight().data_ptr<float>();
+    float* b = context.get_bias().has_value() ? ((*context.get_bias()).data_ptr<float>())
                                         : nullptr;
     op = [MPSCNNConvOp conv2d:params weights:w bias:b neuronFilter:nt];
-    context.conv2dOp = (void*)CFBridgingRetain(op);
-    context.releaseCallback = ^(void* res) {
+    context.set_conv2dOpPtr((void*)CFBridgingRetain(op));
+    context.set_releaseCallback(^(void* res) {
       if (res) {
         CFBridgingRelease(res);
       }
-    };
+    });
   }
   MetalTensorImplStorage mt{outputSize};
   MetalCommandBuffer* commandBuffer = getCommandBuffer(input);
@@ -86,8 +86,8 @@ Tensor conv2d(const Tensor& input, Conv2dOpContext& context) {
   // fuse hardtanh with convolution
   if (nt == NeuronType::Clamp) {
     MPSImage* Y2 = createTemporaryImage(commandBuffer, [Y1 sizes]);
-    float min = context.output_min.value().toFloat();
-    float max = context.output_max.value().toFloat();
+    float min = context.get_output_min().value().toFloat();
+    float max = context.get_output_max().value().toFloat();
     MPSCNNClampOp* clampOp =
         [MPSCNNClampOp newWithTextures:@[ Y1, Y2 ] Args:@[ @(min), @(max) ]];
     [clampOp encode:commandBuffer.buffer];

From 93bc03622eebb0bab4a79511c61e73b7315bf961 Mon Sep 17 00:00:00 2001
From: zhouzhuojie <zhouzhuojie@gmail.com>
Date: Wed, 1 Sep 2021 14:53:25 -0700
Subject: [PATCH 438/530] Silent rm error for sccache log file (#64388)

Summary:
Sample reporting from dr.ci

![image](https://user-images.githubusercontent.com/658840/131724645-75afa04f-7554-4674-8e7c-cf139c84d994.png)

The `rm` command is not actually running into problems, just need to silent the console output.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64388

Reviewed By: walterddr, malfet, seemethere

Differential Revision: D30704439

Pulled By: zhouzhuojie

fbshipit-source-id: ecd35531decf05b75cef30d08d46635f81112f67
---
 .jenkins/caffe2/common.sh  | 2 +-
 .jenkins/pytorch/common.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.jenkins/caffe2/common.sh b/.jenkins/caffe2/common.sh
index 026cb8349d3d9..168e823ba2cc4 100644
--- a/.jenkins/caffe2/common.sh
+++ b/.jenkins/caffe2/common.sh
@@ -18,7 +18,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
     if which sccache > /dev/null; then
         # Save sccache logs to file
         sccache --stop-server || true
-        rm ~/sccache_error.log || true
+        rm -f ~/sccache_error.log || true
         SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
 
         # Report sccache stats for easier debugging
diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh
index 52b91510c4029..09e814b07d62d 100644
--- a/.jenkins/pytorch/common.sh
+++ b/.jenkins/pytorch/common.sh
@@ -74,7 +74,7 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
   if which sccache > /dev/null; then
     # Save sccache logs to file
     sccache --stop-server > /dev/null  2>&1 || true
-    rm ~/sccache_error.log || true
+    rm -f ~/sccache_error.log || true
     if [[ -n "${SKIP_SCCACHE_INITIALIZATION:-}" ]]; then
       # sccache --start-server seems to hang forever on self hosted runners for GHA
       # so let's just go ahead and skip the --start-server altogether since it seems

From 7ffcf1550374af29f5b3ce316d4e576423945be0 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Wed, 1 Sep 2021 14:56:14 -0700
Subject: [PATCH 439/530] [quant][graphmode][api] Add backend_config_dict to
 prepare_fx api (#64135)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64135

We want to start aligning the api with the design in https://github.com/pytorch/pytorch/wiki/Extending-PyTorch-Quantization-to-Custom-Backends

We plan to gradually move things from `prepare_custom_config_dict` and `convert_custom_config_dict`
to `backend_config_dict` and allow custom backend developer to define their own way of quantizing operators.

Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps

Imported from OSS

Reviewed By: zou3519

Differential Revision: D30699456

fbshipit-source-id: e3c068da8d3da2270f57719f7159cc71cafa8598
---
 torch/quantization/fx/prepare.py  |  1 +
 torch/quantization/quantize_fx.py | 28 +++++++++++++++++++---------
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py
index a6fd660e5e84c..fb526d09279dc 100644
--- a/torch/quantization/fx/prepare.py
+++ b/torch/quantization/fx/prepare.py
@@ -1114,6 +1114,7 @@ def prepare(
         node_name_to_scope: Dict[str, Tuple[str, type]],
         prepare_custom_config_dict: Optional[Dict[str, Any]] = None,
         equalization_qconfig_dict: Optional[Dict[str, Any]] = None,
+        backend_config_dict: Optional[Dict[str, Any]] = None,
         is_standalone_module: bool = False) -> ObservedGraphModule:
     """ standalone_module means it a submodule that is not inlined in
     parent module, and will be quantized separately as one unit.
diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py
index aa8edbba64e49..2dd98ea6ffe4c 100644
--- a/torch/quantization/quantize_fx.py
+++ b/torch/quantization/quantize_fx.py
@@ -140,8 +140,9 @@ def create_node(self, kind : str, target : Target,
         return node
 
 def _prepare_fx(model: torch.nn.Module, qconfig_dict: Any,
-                prepare_custom_config_dict: Dict[str, Any] = None,
-                equalization_qconfig_dict: Dict[str, Any] = None,
+                prepare_custom_config_dict: Optional[Dict[str, Any]] = None,
+                equalization_qconfig_dict: Optional[Dict[str, Any]] = None,
+                backend_config_dict: Optional[Dict[str, Any]] = None,
                 is_standalone_module: bool = False) -> ObservedGraphModule:
     r""" Internal helper function for prepare_fx
     Args:
@@ -203,7 +204,8 @@ def _prepare_fx(model: torch.nn.Module, qconfig_dict: Any,
 def _prepare_standalone_module_fx(
         model: torch.nn.Module,
         qconfig_dict: Any,
-        prepare_custom_config_dict: Dict[str, Any] = None) -> GraphModule:
+        prepare_custom_config_dict: Dict[str, Any] = None,
+        backend_config_dict: Dict[str, Any] = None) -> GraphModule:
     r""" [Internal use only] Prepare a standalone module, so that it can be used when quantizing the
     parent module.
     standalone_module means it a submodule that is not inlined in parent module,
@@ -224,7 +226,7 @@ def _prepare_standalone_module_fx(
                 same as input_quantized_idxs configuration provided
                 for the standalone module
     """
-    return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True)
+    return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, backend_config_dict, is_standalone_module=True)
 
 def fuse_fx(model: torch.nn.Module,
             fuse_custom_config_dict: Dict[str, Any] = None) -> GraphModule:
@@ -265,8 +267,9 @@ def fuse_fx(model: torch.nn.Module,
 
 def prepare_fx(
         model: torch.nn.Module, qconfig_dict: Any,
-        prepare_custom_config_dict: Dict[str, Any] = None,
-        equalization_qconfig_dict: Dict[str, Any] = None) -> ObservedGraphModule:
+        prepare_custom_config_dict: Optional[Dict[str, Any]] = None,
+        equalization_qconfig_dict: Optional[Dict[str, Any]] = None,
+        backend_config_dict: Optional[Dict[str, Any]] = None) -> ObservedGraphModule:
     r""" Prepare a model for post training static quantization
 
     Args:
@@ -392,6 +395,11 @@ def prepare_fx(
       with a similar structure as qconfig_dict except it will contain
       configurations specific to equalization techniques such as input-weight
       equalization.
+      `backend_config_dict`: a dictionary that specifies how operators are quantized
+       in a backend, this includes how the operaetors are observed,
+       supported fusion patterns, how quantize/dequantize ops are
+       inserted, supported dtypes etc. The structure of the dictionary is still WIP
+       and will change in the future, please don't use right now.
 
 
     Return:
@@ -420,16 +428,18 @@ def calibrate(model, data_loader):
     torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_fx")
     assert not model.training, 'prepare_fx only works for models in ' + \
         'eval mode'
-    return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, equalization_qconfig_dict)
+    return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, equalization_qconfig_dict, backend_config_dict)
 
 def prepare_qat_fx(
         model: torch.nn.Module, qconfig_dict: Any,
-        prepare_custom_config_dict: Dict[str, Any] = None) -> ObservedGraphModule:
+        prepare_custom_config_dict: Optional[Dict[str, Any]] = None,
+        backend_config_dict: Optional[Dict[str, Any]] = None) -> ObservedGraphModule:
     r""" Prepare a model for quantization aware training
     Args:
       `model`: torch.nn.Module model, must be in train mode
       `qconfig_dict`: see :func:`~torch.quantization.prepare_fx`
       `prepare_custom_config_dict`: see :func:`~torch.quantization.prepare_fx`
+      `backend_config_dict`: see :func:`~torch.quantization.prepare_fx`
 
     Return:
       A GraphModule with fake quant modules (configured by qconfig_dict), ready for
@@ -457,7 +467,7 @@ def train_loop(model, train_data):
     torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_qat_fx")
     assert model.training, 'prepare_qat_fx only works for models in  ' + \
         'train mode'
-    return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict)
+    return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, backend_config_dict)
 
 def _convert_fx(
         graph_module: GraphModule, is_reference: bool,

From 535526b95cb26be10c0942129911db7d281d5bd9 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Wed, 1 Sep 2021 15:12:05 -0700
Subject: [PATCH 440/530] Restore LayerNorm numerics test (#64385)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64385

It was deleted in https://github.com/pytorch/pytorch/pull/63276.

The numerics test was meant to check LayerNorm behavior on large inputs,
but we deleted it without realizing that.

Test Plan: - wait for tests.

Reviewed By: ngimel

Differential Revision: D30702950

Pulled By: zou3519

fbshipit-source-id: a480e26c45ec38fb628938b70416cdb22d976a46
---
 test/test_nn.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/test/test_nn.py b/test/test_nn.py
index e60ff698ed19b..5008c7256acf7 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -13281,6 +13281,32 @@ def test_LayerNorm_general(self, device):
         if self.device_type == 'cuda':
             self._test_LayerNorm_cuda_half(device)
 
+    @onlyOnCPUAndCUDA
+    def test_LayerNorm_numeric(self, device):
+        def layer_norm_ref(X, gamma, beta, normalized_shape, eps):
+            feature_size = np.prod(normalized_shape)
+            X_view = X.view(-1, feature_size)
+            mean = X_view.mean(dim=-1, keepdim=True)
+            var = X_view.var(dim=-1, unbiased=False, keepdim=True)
+            Y = (X_view - mean) / torch.sqrt(var + eps)
+            Y = Y * gamma.view(-1) + beta.view(-1)
+            return Y.view(*X.size())
+
+        normalized_shape = [256, 256, 144]
+        layer_norm = nn.LayerNorm(normalized_shape).float().to(device)
+        X = torch.rand(2, *normalized_shape, dtype=torch.float32,
+                       device=device)
+
+        Y = layer_norm(X)
+        Y_ref = layer_norm_ref(X, layer_norm.weight.data, layer_norm.bias.data,
+                               normalized_shape, layer_norm.eps)
+        self.assertEqual(Y, Y_ref, rtol=0, atol=1e-5)
+
+        if self.device_type == 'cuda':
+            layer_norm.cpu()
+            Y_cpu = layer_norm(X.cpu())
+            self.assertEqual(Y_cpu, Y, rtol=0, atol=1e-5)
+
     @onlyOnCPUAndCUDA
     def test_GroupNorm_general(self, device):
         self._test_GroupNorm_general(device)

From 69f4401b7b6ea6a51a090ca8c958968a80a529e2 Mon Sep 17 00:00:00 2001
From: Santiago Castro <sacastro@umich.edu>
Date: Wed, 1 Sep 2021 15:18:14 -0700
Subject: [PATCH 441/530] Make datasets in `ConcatDataset` not need to be sized
 (#64114)

Summary:
`datasets` needs to be iterable, but also sized because the length is checked. But immediately after it's converted to a list. By changing the order of these 2 lines, it doesn't need to be sized anymore.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64114

Reviewed By: H-Huang

Differential Revision: D30641480

Pulled By: ejguan

fbshipit-source-id: 7e16548c2123afa65b83845f9929271fa07fe1e8
---
 torch/utils/data/dataset.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py
index 609e1a1eb6e2d..50488d13ae5d3 100644
--- a/torch/utils/data/dataset.py
+++ b/torch/utils/data/dataset.py
@@ -271,9 +271,8 @@ def cumsum(sequence):
 
     def __init__(self, datasets: Iterable[Dataset]) -> None:
         super(ConcatDataset, self).__init__()
-        # Cannot verify that datasets is Sized
-        assert len(datasets) > 0, 'datasets should not be an empty iterable'  # type: ignore[arg-type]
         self.datasets = list(datasets)
+        assert len(self.datasets) > 0, 'datasets should not be an empty iterable'  # type: ignore[arg-type]
         for d in self.datasets:
             assert not isinstance(d, IterableDataset), "ConcatDataset does not support IterableDataset"
         self.cumulative_sizes = self.cumsum(self.datasets)

From ed89937d2cbda8f4c5b67439b8b7b138cff42552 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Wed, 1 Sep 2021 15:48:54 -0700
Subject: [PATCH 442/530] [quant][graphmode][fx] Add fbgemm backend_config_dict
 (#64288)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64288

This is just to setup the file structure and unblock experimentation.
The format for backend_config_dict will change in the future

Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps

Imported from OSS

Reviewed By: zou3519

Differential Revision: D30699457

fbshipit-source-id: 28211a4def05d34757850c045a36e311f54760fe
---
 torch/quantization/fx/backend_config_dict/__init__.py |  4 ++++
 torch/quantization/fx/backend_config_dict/fbgemm.py   | 11 +++++++++++
 torch/quantization/fx/prepare.py                      | 11 +++++++++--
 3 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 torch/quantization/fx/backend_config_dict/__init__.py
 create mode 100644 torch/quantization/fx/backend_config_dict/fbgemm.py

diff --git a/torch/quantization/fx/backend_config_dict/__init__.py b/torch/quantization/fx/backend_config_dict/__init__.py
new file mode 100644
index 0000000000000..edb2b956851b7
--- /dev/null
+++ b/torch/quantization/fx/backend_config_dict/__init__.py
@@ -0,0 +1,4 @@
+from .fbgemm import get_fbgemm_backend_config_dict
+
+def validate_backend_config_dict(backend_config_dict):
+    return "quant_patterns" in backend_config_dict
diff --git a/torch/quantization/fx/backend_config_dict/fbgemm.py b/torch/quantization/fx/backend_config_dict/fbgemm.py
new file mode 100644
index 0000000000000..4f40b100f0b78
--- /dev/null
+++ b/torch/quantization/fx/backend_config_dict/fbgemm.py
@@ -0,0 +1,11 @@
+from ..pattern_utils import get_default_quant_patterns
+
+def get_fbgemm_backend_config_dict():
+    """ Get the backend config dictionary for fbgemm backend
+    NOTE: Current api will change in the future, it's just to unblock experimentation for
+    new backends, please don't use it right now.
+    """
+    # TODO: add output_activation_post_process_map
+    return {
+        "quant_patterns": get_default_quant_patterns()
+    }
diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py
index fb526d09279dc..0b65e339ce0a3 100644
--- a/torch/quantization/fx/prepare.py
+++ b/torch/quantization/fx/prepare.py
@@ -42,7 +42,6 @@
 
 from .pattern_utils import (
     MatchResult,
-    get_default_quant_patterns,
     get_default_output_activation_post_process_map,
 )
 
@@ -84,6 +83,9 @@
     weight_dtype,
 )
 
+from .backend_config_dict import get_fbgemm_backend_config_dict
+from .backend_config_dict import validate_backend_config_dict
+
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 def is_activation_post_process_node(node: Node, modules: Dict[str, torch.nn.Module]) -> bool:
@@ -1140,6 +1142,10 @@ def prepare(
         prepare_custom_config_dict = {}
     if equalization_qconfig_dict is None:
         equalization_qconfig_dict = {}
+    if backend_config_dict is None:
+        backend_config_dict = get_fbgemm_backend_config_dict()
+
+    validate_backend_config_dict(backend_config_dict)
 
     additional_quant_patterns = \
         prepare_custom_config_dict.get("additional_quant_pattern", {})
@@ -1153,8 +1159,9 @@ def prepare(
     #   ((<function relu at 0x7f766a7360d0>, <built-in function add>):
     #     <class 'torch.quantization.fx.quantize.Add'>),
     # }
+    quant_patterns = backend_config_dict["quant_patterns"]
     patterns: Dict[Pattern, QuantizeHandler] = get_combined_dict(
-        get_default_quant_patterns(), additional_quant_patterns)
+        quant_patterns, additional_quant_patterns)
 
     convert_dict_to_ordered_dict(qconfig_dict)
     convert_dict_to_ordered_dict(equalization_qconfig_dict)

From a8f9aab84074ceab684da166199cc598afc62d54 Mon Sep 17 00:00:00 2001
From: Yi Wang <wayi@fb.com>
Date: Wed, 1 Sep 2021 16:09:46 -0700
Subject: [PATCH 443/530] [DDP Comm Hook] Add bf16 gradient compression to
 ddp_comm_hooks.rst (#64346)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64346

as title
ghstack-source-id: 137170288

Test Plan: N/A

Reviewed By: rohan-varma

Differential Revision: D30693513

fbshipit-source-id: 8c64b8404ff3b0322e1bbbd93f6ef051ea91307d
---
 docs/source/ddp_comm_hooks.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/source/ddp_comm_hooks.rst b/docs/source/ddp_comm_hooks.rst
index aed70c0752825..5bd0378e7c7b7 100644
--- a/docs/source/ddp_comm_hooks.rst
+++ b/docs/source/ddp_comm_hooks.rst
@@ -44,11 +44,13 @@ The input ``bucket`` is a :class:`torch.distributed.GradBucket` object.
 .. currentmodule:: torch.distributed.algorithms.ddp_comm_hooks.default_hooks
 .. autofunction:: allreduce_hook
 .. autofunction:: fp16_compress_hook
+.. autofunction:: bf16_compress_hook
 
-Additionally, a communication hook wraper is provided to support :meth:`~fp16_compress_hook` as a wrapper,
+Additionally, a communication hook wraper is provided to support :meth:`~fp16_compress_hook` or :meth:`~bf16_compress_hook` as a wrapper,
 which can be combined with other communication hooks.
 
 .. autofunction:: fp16_compress_wrapper
+.. autofunction:: bf16_compress_wrapper
 
 PowerSGD Communication Hook
 ---------------------------

From 5d80a48cef373e22393af1b1f4f4e3f2ad948a76 Mon Sep 17 00:00:00 2001
From: anjali411 <chourdiaanjali123@gmail.com>
Date: Wed, 1 Sep 2021 16:11:38 -0700
Subject: [PATCH 444/530] Add fast path for addmm when the inputs are conjugate
 (#59380)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/59380

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D28898374

Pulled By: anjali411

fbshipit-source-id: eab0e64d37bb57c18b54cabb8e5c00666338ba04
---
 aten/src/ATen/ConjugateFallback.cpp           |  11 ++
 aten/src/ATen/cuda/CUDABlas.cpp               |   4 +-
 aten/src/ATen/native/CPUBlas.cpp              |   4 +-
 aten/src/ATen/native/CPUBlas.h                |   2 +-
 aten/src/ATen/native/LinearAlgebra.cpp        |  35 ++++--
 aten/src/ATen/native/NegateFallback.cpp       |   1 +
 aten/src/ATen/native/TensorFactories.cpp      |  13 ++-
 aten/src/ATen/native/cuda/Blas.cpp            |  65 ++++++++---
 test/test_linalg.py                           |  32 ++++++
 test/test_torch.py                            |  11 +-
 .../_internal/common_methods_invocations.py   | 104 ++++++++++++++----
 11 files changed, 223 insertions(+), 59 deletions(-)

diff --git a/aten/src/ATen/ConjugateFallback.cpp b/aten/src/ATen/ConjugateFallback.cpp
index a64ef4950940b..2cf9538c9bb32 100644
--- a/aten/src/ATen/ConjugateFallback.cpp
+++ b/aten/src/ATen/ConjugateFallback.cpp
@@ -60,6 +60,17 @@ TORCH_LIBRARY_IMPL(aten, Conjugate, m) {
   m.impl("vdot", torch::CppFunction::makeFallthrough());
   m.impl("dot.out", torch::CppFunction::makeFallthrough());
   m.impl("vdot.out", torch::CppFunction::makeFallthrough());
+  m.impl("alias", torch::CppFunction::makeFallthrough());
+  m.impl("mm", torch::CppFunction::makeFallthrough());
+  m.impl("mm.out", torch::CppFunction::makeFallthrough());
+  m.impl("addmm", torch::CppFunction::makeFallthrough());
+  m.impl("addmm_", torch::CppFunction::makeFallthrough());
+  m.impl("addmm.out", torch::CppFunction::makeFallthrough());
+  m.impl("bmm", torch::CppFunction::makeFallthrough());
+  m.impl("bmm.out", torch::CppFunction::makeFallthrough());
+  m.impl("baddbmm", torch::CppFunction::makeFallthrough());
+  m.impl("baddbmm_", torch::CppFunction::makeFallthrough());
+  m.impl("baddbmm.out", torch::CppFunction::makeFallthrough());
 }
 
 } // namespace at
diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
index 75e59d0ecc100..70c3dda6f3401 100644
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@@ -64,8 +64,8 @@ static void _cublasAdjustLdLevel3(
     int64_t* lda,
     int64_t* ldb,
     int64_t* ldc) {
-  bool transa_ = ((transa == 't') || (transa == 'T'));
-  bool transb_ = ((transb == 't') || (transb == 'T'));
+  bool transa_ = ((transa != 'n') && (transa != 'N'));
+  bool transb_ = ((transb != 'n') && (transb != 'N'));
 
   // Note: leading dimensions generally are checked that they are > 0
   // and at least as big the result requires (even if the value won't
diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp
index 1a1f6737f23f1..f14e4dce68b5a 100644
--- a/aten/src/ATen/native/CPUBlas.cpp
+++ b/aten/src/ATen/native/CPUBlas.cpp
@@ -78,7 +78,7 @@ char to_blas(TransposeType trans) {
   switch (trans) {
   case Transpose: return 't';
   case NoTranspose: return 'n';
-  // case ConjTranspose: return 'c';
+  case ConjTranspose: return 'c';
   }
   TORCH_INTERNAL_ASSERT(false, "Invalid transpose type");
 }
@@ -89,7 +89,7 @@ fbgemm::matrix_op_t to_fbgemm(TransposeType trans) {
   switch (trans) {
   case Transpose: return fbgemm::matrix_op_t::Transpose;
   case NoTranspose: return fbgemm::matrix_op_t::NoTranspose;
-  // case ConjTranspose: return fbgemm::matrix_op_t::Transpose;
+  case ConjTranspose: TORCH_INTERNAL_ASSERT(false, "ConjTranspose type is not supported in fbgemm");
   }
   TORCH_INTERNAL_ASSERT(false, "Invalid transpose type");
 }
diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h
index e61207f7c76b8..3a483e4361bd2 100644
--- a/aten/src/ATen/native/CPUBlas.h
+++ b/aten/src/ATen/native/CPUBlas.h
@@ -12,7 +12,7 @@ namespace cpublas {
 enum TransposeType {
   Transpose,
   NoTranspose,
-  // ConjTranspose, -- Not implemented
+  ConjTranspose,
 };
 
 namespace internal {
diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp
index 10576a0c63a49..2ae6202ce87e2 100644
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@@ -959,7 +959,6 @@ Tensor outer(const Tensor& self, const Tensor& vec2) {
 static void addmm_impl_cpu_(
     Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) {
   TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2);
-
   // Array access is faster than .size(n) and .stride(n)
   const auto self_sizes = self.sizes();
   auto m1_strides = m1.strides();
@@ -992,18 +991,18 @@ static void addmm_impl_cpu_(
   if (result_strides[0] == 1 &&
       (result_sizes[1] == 1 || result_strides[1] >= std::max(int64_t{1}, result_sizes[0]))) {
     transpose_c = false;
-    c = result;
+    c = result.resolve_conj();
   } else if (result_strides[1] == 1 &&
              (result_sizes[0] == 1 || result_strides[0] >= std::max(int64_t{1}, result_sizes[1]))) {
     std::swap(m1, m2);
     std::swap(m1_sizes, m2_sizes);
     std::swap(m1_strides, m2_strides);
     transpose_c = true;
-    c = result;
+    c = result.resolve_conj();
   } else {
     transpose_c = false;
     // make c FORTRAN contiguous
-    c = result.transpose(0, 1).contiguous().transpose_(0, 1);
+    c = result.resolve_conj().transpose(0, 1).contiguous().transpose_(0, 1);
   }
 
   const int64_t m = result_sizes[transpose_c ? 1 : 0];
@@ -1017,7 +1016,7 @@ static void addmm_impl_cpu_(
   if (m1_strides[transpose_c ? 1 : 0] == 1 &&
       m1_strides[transpose_c ? 0 : 1] >= std::max(int64_t{1}, m)) {
     transpose_a = false;
-    a = m1;
+    a = m1.resolve_conj();
   } else if (m1_strides[transpose_c ? 0 : 1] == 1 &&
              m1_strides[transpose_c ? 1 : 0] >= std::max(int64_t{1}, k)) {
     transpose_a = true;
@@ -1034,7 +1033,7 @@ static void addmm_impl_cpu_(
   if (m2_strides[transpose_c ? 1 : 0] == 1 &&
       m2_strides[transpose_c ? 0 : 1] >= std::max(int64_t{1}, k)) {
     transpose_b = false;
-    b = m2;
+    b = m2.resolve_conj();
   } else if (m2_strides[transpose_c ? 0 : 1] == 1 &&
              m2_strides[transpose_c ? 1 : 0] >= std::max(int64_t{1}, n)) {
     transpose_b = true;
@@ -1048,13 +1047,16 @@ static void addmm_impl_cpu_(
   const int64_t ldb = b.strides()[(transpose_b == transpose_c) ? 1 : 0];
   const int64_t ldc = c.strides()[transpose_c ? 0 : 1];
 
+  // Always ensure the conjugation for c is resolved since there's no way to specify c's conjugation in the gemm call
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c.is_conj());
+
   // Apply BLAS routine
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16,
       result.scalar_type(), "addmm_impl_cpu_",
       [&]{
         at::native::cpublas::gemm(
-            transpose_a ? cpublas::Transpose : cpublas::NoTranspose,
-            transpose_b ? cpublas::Transpose : cpublas::NoTranspose,
+            transpose_a ? a.is_conj() ? cpublas::ConjTranspose : cpublas::Transpose : cpublas::NoTranspose,
+            transpose_b ? b.is_conj() ? cpublas::ConjTranspose : cpublas::Transpose : cpublas::NoTranspose,
             m, n, k,
             alpha.to<scalar_t>(),
             a.data_ptr<scalar_t>(), lda,
@@ -1349,8 +1351,18 @@ Tensor& baddbmm_out_cpu(const Tensor& self_, const Tensor& batch1, const Tensor&
   return at::native::baddbmm__cpu(result, batch1, batch2, beta, alpha);
 }
 
+Tensor& conjugate_mutable_input_if_needed(Tensor& self, bool conjugate) {
+  if (conjugate) {
+    self.conj_physical_();
+  }
+  return self;
+}
+
 Tensor& baddbmm__cpu(Tensor& self, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha) {
-  return bmm_out_or_baddbmm_(self, batch1, batch2, beta, alpha, false);
+  bool self_is_conj = self.is_conj();
+  conjugate_mutable_input_if_needed(self, self_is_conj);
+  bmm_out_or_baddbmm_(self, batch1.resolve_conj(), batch2.resolve_conj(), beta, alpha, false);
+  return conjugate_mutable_input_if_needed(self, self_is_conj);
 }
 
 Tensor bmm_cpu(const Tensor& self, const Tensor& mat2) {
@@ -1363,7 +1375,10 @@ Tensor& bmm_out_cpu(const Tensor& batch1, const Tensor& batch2, Tensor &result)
   Scalar alpha(1.0);
   {
   NoNamesGuard guard;
-  bmm_out_or_baddbmm_(result, batch1, batch2, beta, alpha, true);
+  bool result_is_conj = result.is_conj();
+  conjugate_mutable_input_if_needed(result, result_is_conj);
+  bmm_out_or_baddbmm_(result, batch1.resolve_conj(), batch2.resolve_conj(), beta, alpha, true);
+  conjugate_mutable_input_if_needed(result, result_is_conj);
   }
   namedinference::propagate_names_if_nonempty(
       result,
diff --git a/aten/src/ATen/native/NegateFallback.cpp b/aten/src/ATen/native/NegateFallback.cpp
index 86dbe05ff904f..d8381f58d036b 100644
--- a/aten/src/ATen/native/NegateFallback.cpp
+++ b/aten/src/ATen/native/NegateFallback.cpp
@@ -55,6 +55,7 @@ TORCH_LIBRARY_IMPL(aten, Negative, m) {
   m.impl("view", torch::CppFunction::makeFallthrough());
   m.impl("_unsafe_view", torch::CppFunction::makeFallthrough());
   m.impl("reshape", torch::CppFunction::makeFallthrough());
+  m.impl("alias", torch::CppFunction::makeFallthrough());
 }
 
 } // namespace at
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 3ee909be029ff..4712c3d99b6d8 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -1411,17 +1411,18 @@ Tensor from_file(c10::string_view filename, c10::optional<bool> shared, c10::opt
 Tensor clone(const Tensor& src, c10::optional<c10::MemoryFormat> optional_memory_format) {
   auto memory_format =
       optional_memory_format.value_or(MemoryFormat::Preserve);
+  Tensor self;
   if (memory_format == MemoryFormat::Preserve) {
     if (src.is_non_overlapping_and_dense()) {
-      // Copy all strides
-      auto self = at::empty_strided(src.sizes(), src.strides(), src.options());
-      self.copy_(src);
-      return self;
+      // Copy all strides, this is marginally faster than calling empty_like
+      self = at::empty_strided(src.sizes(), src.strides(), src.options());
     } else {
-      memory_format = src.suggest_memory_format();
+      self = at::empty_like(src);
     }
+  } else {
+    self = at::empty_like(src, src.options(), memory_format);
   }
-  auto self = at::empty_like(src, src.options(), memory_format);
+
   self.copy_(src);
   return self;
 }
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index b4479101c59c9..269307d605aec 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -4,24 +4,51 @@
 #include <ATen/native/Resize.h>
 #include <c10/util/MaybeOwned.h>
 
-
 namespace at { namespace native {
 
 namespace {
 
+// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492
+c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) {
+  if (resolve_conj && tensor.is_conj()) {
+    return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
+  } else {
+    return c10::MaybeOwned<Tensor>::borrowed(tensor);
+  }
+}
+
+c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) {
+  if (tensor.is_non_overlapping_and_dense()) { // common case
+      transpose_tensor = tensor.is_contiguous();
+      return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor);
+  }
+  IntArrayRef tensor_strides = tensor.strides();
+  IntArrayRef tensor_sizes = tensor.sizes();
+  if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
+    transpose_tensor = false;
+    return resolve_conj_if_indicated(tensor, !transpose_result);
+  } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
+    transpose_tensor = true;
+    return resolve_conj_if_indicated(tensor, transpose_result);
+  } else {
+    transpose_tensor = true;
+    return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
+  }
+}
+
 c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) {
   if (tensor.is_non_overlapping_and_dense()) { // common case
       transpose_tensor = tensor.is_contiguous();
-      return c10::MaybeOwned<Tensor>::borrowed(tensor);
+      return resolve_conj_if_indicated(tensor, true);
   }
   IntArrayRef tensor_strides = tensor.strides();
   IntArrayRef tensor_sizes = tensor.sizes();
   if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
     transpose_tensor = false;
-    return c10::MaybeOwned<Tensor>::borrowed(tensor);
+    return resolve_conj_if_indicated(tensor, true);
   } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
     transpose_tensor = true;
-    return c10::MaybeOwned<Tensor>::borrowed(tensor);
+    return resolve_conj_if_indicated(tensor, true);
   } else {
     transpose_tensor = true;
     return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
@@ -39,19 +66,19 @@ c10::MaybeOwned<Tensor> prepare_batch_matrix_for_cublas(const Tensor& tensor, bo
   if (tensor_strides[fast_dim] == 1 &&
     (tensor_strides[leading_dim] >= std::max<int64_t>(1, m))) {
     transpose_tensor = false;
-    tensor_ = c10::MaybeOwned<Tensor>::borrowed(tensor);
-    ld_tensor = tensor_strides[leading_dim];
+    tensor_ = resolve_conj_if_indicated(tensor, true);
+    ld_tensor = tensor_->strides()[leading_dim];
   } else if ((tensor_strides[leading_dim] == 1) &&
     (tensor_strides[fast_dim] >= std::max<int64_t>(1, n))) {
     transpose_tensor = true;
-    tensor_ = c10::MaybeOwned<Tensor>::borrowed(tensor);
-    ld_tensor = tensor_strides[fast_dim];
+    tensor_ = resolve_conj_if_indicated(tensor, false);
+    ld_tensor = tensor_->strides()[fast_dim];
   } else {
     transpose_tensor = !transpose_result;
     // gemm call requires leading dimension and stride parameters to be non-zero
     bool is_stride_non_zero = tensor.strides()[1] != 0 && tensor.strides()[2] != 0;
     if (tensor.is_contiguous() && is_stride_non_zero) {
-      tensor_ = c10::MaybeOwned<Tensor>::borrowed(tensor);
+      tensor_ = resolve_conj_if_indicated(tensor, transpose_result);
     } else {
       tensor_ = c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
     }
@@ -104,8 +131,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
   c10::MaybeOwned<Tensor> result_ = prepare_matrix_for_cublas(result, transpose_result);
   bool transpose_mat1;
   bool transpose_mat2;
-  c10::MaybeOwned<Tensor> mat1_ = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_mat1);
-  c10::MaybeOwned<Tensor> mat2_ = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_mat2);
+  auto mat1_ = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_mat1, transpose_result);
+  auto mat2_ = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_mat2, transpose_result);
 
   if (transpose_result) {
     transpose_mat1 = !transpose_mat1;
@@ -141,6 +168,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
             c10::nullopt /* pin_memory */));
   }
 
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result_->is_conj());
+
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] {
     scalar_t alpha_val = alpha.to<scalar_t>();
     scalar_t beta_val = beta.to<scalar_t>();
@@ -148,8 +177,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
     scalar_t* mat2_ptr = mat2_->data_ptr<scalar_t>();
     scalar_t* result_ptr = result_->data_ptr<scalar_t>();
     at::cuda::blas::gemm<scalar_t>(
-      transpose_mat1 ? 't' : 'n',
-      transpose_mat2 ? 't' : 'n',
+      transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n',
+      transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n',
       m, n, k,
       alpha_val,
       mat1_ptr, mat1_ld,
@@ -207,11 +236,11 @@ Tensor& baddbmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor&
 
   if ((result_strides[1] == 1) &&
       ((result_sizes[2] == 1) || (result_strides[2] >= std::max<int64_t>(1, result_sizes[1])))) {
-    result_ = c10::MaybeOwned<Tensor>::borrowed(result);
+    result_ = resolve_conj_if_indicated(result, true);
   } else if ((result_strides[2] == 1) &&
     (result_sizes[1] == 1 || (result_strides[1] >= std::max<int64_t>(1, result_sizes[2])))) {
     transpose_result = true;
-    result_ = c10::MaybeOwned<Tensor>::borrowed(result);
+    result_ = resolve_conj_if_indicated(result, true);
   } else {
     result_ = c10::MaybeOwned<Tensor>::owned(result.transpose(1, 2).clone(at::MemoryFormat::Contiguous).transpose(1, 2));
   }
@@ -230,6 +259,8 @@ Tensor& baddbmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor&
   ldc = result_->strides()[leading_dim];
   int64_t num_batches = result_->sizes()[0];
 
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result_->is_conj());
+
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "baddbmm_cuda", [&] {
     scalar_t alpha_val = alpha.to<scalar_t>();
     scalar_t beta_val = beta.to<scalar_t>();
@@ -237,8 +268,8 @@ Tensor& baddbmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor&
     scalar_t* batch2_ptr = batch2_->data_ptr<scalar_t>();
     scalar_t* result_ptr = result_->data_ptr<scalar_t>();
     at::cuda::blas::bgemm<scalar_t>(
-      transpose_batch1 ? 't' : 'n',
-      transpose_batch2 ? 't' : 'n',
+      transpose_batch1 ? batch1_->is_conj() ? 'c' : 't' : 'n',
+      transpose_batch2 ? batch2_->is_conj() ? 'c' : 't' : 'n',
       m, n, k,
       alpha_val,
       batch1_ptr, lda, batch1_->strides()[0],
diff --git a/test/test_linalg.py b/test/test_linalg.py
index f7ce39272bf86..fbd219b3c5981 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -6165,6 +6165,38 @@ def genf_float(x, y):
 
             _test_mm(n, m, p, dtype, genf)
 
+    @onlyOnCPUAndCUDA
+    def test_mm_bmm_non_memory_dense(self, device):
+        def _slice(tensor, fn):
+            return fn(tensor)[..., ::2]
+        A = torch.randn(3, 6, dtype=torch.cfloat, device=device)
+        B = torch.randn(3, 3, dtype=torch.cfloat, device=device)
+        out = torch.empty(3, 3, device=device, dtype=torch.complex64).t()
+        out1 = torch.empty(3, 3, device=device, dtype=torch.complex64).t()
+        A_conj = _slice(A, torch.conj)
+        A_conj_physical = _slice(A, torch.conj_physical)
+
+        self.assertEqual(torch.mm(A_conj, B, out=out), torch.mm(A_conj_physical, B, out=out))
+        self.assertEqual(torch.mm(A_conj.t(), B, out=out), torch.mm(A_conj_physical.t(), B, out=out))
+
+        Ab = torch.randn(2, 3, 6, dtype=torch.cfloat, device=device)
+        Bb = torch.randn(2, 3, 3, dtype=torch.cfloat, device=device)
+        Bb_ = torch.randn(1, 3, 3, dtype=torch.cfloat, device=device).expand(2, 3, 3)
+        out_b = torch.empty(2, 3, 3, device=device, dtype=torch.complex64).transpose(-1, -2)
+
+        Ab_conj = _slice(Ab, torch.conj)
+        Ab_conj_physical = _slice(Ab, torch.conj_physical)
+
+        def t_b(tensor):
+            return tensor.transpose(-1, -2)
+
+        self.assertEqual(torch.bmm(Ab_conj, Bb, out=out_b), torch.bmm(Ab_conj_physical, Bb, out=out_b))
+        self.assertEqual(torch.bmm(t_b(Ab_conj), Bb, out=out_b), torch.bmm(t_b(Ab_conj_physical), Bb, out=out_b))
+
+        # test broadcasting
+        self.assertEqual(torch.bmm(Ab_conj, Bb_, out=out_b), torch.bmm(Ab_conj_physical, Bb_, out=out_b))
+        self.assertEqual(torch.bmm(t_b(Ab_conj), Bb_, out=out_b), torch.bmm(t_b(Ab_conj_physical), Bb_, out=out_b))
+
     @onlyOnCPUAndCUDA
     @dtypes(torch.float32, torch.float64)
     def test_strided_mm_bmm(self, device, dtype):
diff --git a/test/test_torch.py b/test/test_torch.py
index b267b9cd6b610..a790839bbd50e 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -5328,6 +5328,13 @@ def test_clone_zero_stride_dim(self, device):
         y = x.as_strided([2, 1, 5], [1, 0, 2])
         self.assertEqual(y, y.clone())
 
+    def test_clone_not_memory_dense(self):
+        # github issue: https://github.com/pytorch/pytorch/issues/64176
+        x = torch.randn(10, 8).t()[::2, ::2]
+        y = x.clone()
+        # should retain permutation after densification
+        self.assertTrue(y.stride() == (1, 4))
+
     @dtypesIfCUDA(*set(torch.testing.get_all_math_dtypes('cuda')))
     @dtypes(*set(torch.testing.get_all_math_dtypes('cpu')))
     def test_addcmul(self, device, dtype):
@@ -6013,9 +6020,9 @@ def test_masked_select_discontiguous(self, device):
             out_dc = torch.empty(size * size, device=device)[::2]
             for v, m in product(vals_list, mask_list):
                 if m.is_contiguous():
-                    expected = v[:, ::2].clone().view(-1)
+                    expected = v[:, ::2].clone().reshape((-1, ))
                 else:
-                    expected = v[::2].clone().view(-1)
+                    expected = v[::2].clone().reshape((-1, ))
                 out = torch.masked_select(v, m)
                 self.assertEqual(out, expected, atol=0, rtol=0)
                 torch.masked_select(v, m, out=out_dc)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index fe8e36fbe6758..10aae4146bae8 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -1606,15 +1606,29 @@ def sample_inputs_t(op_info, device, dtype, requires_grad, **kwargs):
 
 
 def sample_inputs_mm(op_info, device, dtype, requires_grad, **kwargs):
-    args_list = (
-        ((S, M), (M, S)),
-    )
-    inputs = tuple(SampleInput(make_tensor(first_shape, device, dtype,
-                                           requires_grad=requires_grad),
-                               args=(make_tensor(second_shape, device, dtype,
-                                     requires_grad=requires_grad),))
-                   for first_shape, second_shape in args_list)
-    return inputs
+    first_shape, second_shape = (S, M), (M, S)
+    sample_inputs = []
+    sample_inputs.append(
+        SampleInput(make_tensor(first_shape, device, dtype,
+                                requires_grad=requires_grad),
+                    args=(make_tensor(second_shape, device, dtype,
+                                      requires_grad=requires_grad),)))
+
+    if dtype.is_complex:
+        sample_inputs.append(
+            SampleInput(make_tensor(first_shape, device, dtype,
+                                    requires_grad=requires_grad),
+                        args=(
+                            make_tensor(second_shape, device, dtype,
+                                        requires_grad=requires_grad).conj(),)))
+
+        sample_inputs.append(
+            SampleInput(make_tensor(first_shape, device, dtype,
+                                    requires_grad=requires_grad).transpose(0, 1),
+                        args=(
+                            make_tensor(second_shape, device, dtype,
+                                        requires_grad=requires_grad).transpose(0, 1).conj(),)))
+    return sample_inputs
 
 def sample_inputs_addmm(op_info, device, dtype, requires_grad, **kwargs):
     alpha_val = kwargs.get('alpha', 2 + 3j if dtype.is_complex else 0.6)
@@ -1627,15 +1641,40 @@ def sample_inputs_addmm(op_info, device, dtype, requires_grad, **kwargs):
         ((), (2, 2), (2, 3), True)
     ]
     test_cases = tests_list + tests_with_lhs_broadcasting  # type: ignore[operator]
-    inputs = tuple(SampleInput(make_tensor(shape_a, device, dtype, requires_grad=requires_grad),
-                               args=(make_tensor(shape_b, device, dtype,
-                                                 requires_grad=requires_grad),
-                                     make_tensor(shape_c, device, dtype,
-                                                 requires_grad=requires_grad)),
-                               kwargs={'alpha': alpha_val, 'beta': beta_val},
-                               broadcasts_input=broadcasts_input)
-                   for shape_a, shape_b, shape_c, broadcasts_input in test_cases)
-    return inputs
+
+    sample_inputs = []
+
+    for shape_a, shape_b, shape_c, broadcasts_input in test_cases:
+        sample_inputs.append(
+            SampleInput(
+                make_tensor(shape_a, device, dtype, requires_grad=requires_grad),
+                args=(
+                    make_tensor(shape_b, device, dtype,
+                                requires_grad=requires_grad),
+                    make_tensor(shape_c, device, dtype,
+                                requires_grad=requires_grad)),
+                kwargs={'alpha': alpha_val, 'beta': beta_val},
+                broadcasts_input=broadcasts_input))
+
+    if dtype.is_complex:
+        shape = (3, 3)
+        sample_inputs.append(
+            SampleInput(make_tensor(shape, device, dtype, requires_grad=requires_grad),
+                        args=(
+                            make_tensor(shape, device, dtype,
+                                        requires_grad=requires_grad).t().conj(),
+                            make_tensor(shape, device, dtype,
+                                        requires_grad=requires_grad)),
+                        kwargs={'alpha': alpha_val, 'beta': beta_val},))
+        sample_inputs.append(
+            SampleInput(make_tensor(shape, device, dtype, requires_grad=requires_grad),
+                        args=(
+                            make_tensor(shape, device, dtype,
+                                        requires_grad=requires_grad),
+                            make_tensor(shape, device, dtype,
+                                        requires_grad=requires_grad).t().conj()),
+                        kwargs={'alpha': alpha_val, 'beta': beta_val},))
+    return sample_inputs
 
 def sample_inputs_mv(self, device, dtype, requires_grad, **kwargs):
     return (
@@ -1767,6 +1806,23 @@ def sample_inputs_baddbmm(op_info, device, dtype, requires_grad, **kwargs):
             sample_inputs.append(SampleInput(args[0], args=(args[1], args[2]),
                                              kwargs=dict(beta=beta * (1 + 2j), alpha=alpha * (2 + 3j)),
                                              broadcasts_input=broadcasts_input))
+
+    if dtype.is_complex:
+        shapes = [(S, S, S), (S, M, S), (S, S, M)]
+        args = (make_tensor(shapes[0], device, dtype,
+                            low=None, high=None,
+                            requires_grad=requires_grad),
+                make_tensor(shapes[1], device, dtype,
+                            low=None, high=None,
+                            requires_grad=requires_grad),
+                make_tensor(shapes[2], device, dtype,
+                            low=None, high=None,
+                            requires_grad=requires_grad))
+        sample_inputs.append(
+            SampleInput(
+                args[0].transpose(-1, 1), args=(args[1].transpose(-1, 1).conj(), args[2].transpose(-1, 1).conj()),
+                kwargs=dict(beta=beta * (1 + 2j), alpha=alpha * (2 + 3j)),))
+
     return tuple(sample_inputs)
 
 def sample_inputs_addr(op_info, device, dtype, requires_grad, **kwargs):
@@ -5847,6 +5903,13 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                                                     *[torch.bfloat16] if SM53OrLater else [],
                                                     torch.complex64, torch.complex128),
            supports_forward_ad=True,
+           decorators=[
+               DecorateInfo(
+                   toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}),
+                   'TestCommon', 'test_variant_consistency_eager', device_type='cuda'),
+               DecorateInfo(
+                   toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}),
+                   'TestMathBits', 'test_conj_view', device_type='cuda')],
            skips=(
                # FIXME: bfloat16 backward support likely depends on CUDA11+
                #   and SM53+
@@ -7045,7 +7108,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            skips=(
                # matmul does not correctly warn when resizing out= inputs
                SkipInfo('TestCommon', 'test_out'),
-               SkipInfo('TestCommon', 'test_conj_view', device_type='cpu'),
            )),
     OpInfo('max',
            op=torch.max,
@@ -7835,6 +7897,10 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            assert_autodiffed=True,
            sample_inputs_func=sample_inputs_matmul,
            supports_out=False,
+           decorators=[
+               DecorateInfo(
+                   toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}),
+                   'TestMathBits', 'test_conj_view')],
            skips=(
                SkipInfo('TestJit', 'test_variant_consistency_jit',),
            )),

From 9b8f9d5a25ca7a9ebd54a07ab88b3a540111e5b3 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 1 Sep 2021 16:21:31 -0700
Subject: [PATCH 445/530] [c10d] Prefer use of torch_check (#63928)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63928

throw std::invalid_argument results in not getting stacktraces with
TORCH_SHOW_CPP_STACKTRACES=1, so instead prefer torch_check here.
ghstack-source-id: 137135328

Test Plan: CI

Reviewed By: mrshenli

Differential Revision: D30533955

fbshipit-source-id: 33e5bf4f449e3043dec68da93f8022f6624d9675
---
 test/distributed/test_c10d_gloo.py            | 114 +++++++++---------
 .../distributed/c10d/ProcessGroupGloo.cpp     |  18 +--
 2 files changed, 66 insertions(+), 66 deletions(-)

diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index 55b2948b93b71..789d76e9d115a 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -259,43 +259,43 @@ def test_broadcast_checks(self):
         t2 = torch.zeros([1], dtype=torch.float64)
         t3 = torch.zeros([2], dtype=torch.float32)
 
-        with self.assertRaisesRegex(ValueError, "invalid root rank"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
             opts = c10d.BroadcastOptions()
             opts.rootRank = -1
             opts.rootTensor = 0
             pg.broadcast([t1], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid root rank"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
             opts = c10d.BroadcastOptions()
             opts.rootRank = self.world_size
             opts.rootTensor = 0
             pg.broadcast([t1], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid root tensor"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root tensor"):
             opts = c10d.BroadcastOptions()
             opts.rootRank = self.rank
             opts.rootTensor = -1
             pg.broadcast([t1], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid root tensor"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root tensor"):
             opts = c10d.BroadcastOptions()
             opts.rootRank = self.rank
             opts.rootTensor = 1
             pg.broadcast([t1], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid root tensor"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root tensor"):
             opts = c10d.BroadcastOptions()
             opts.rootRank = self.rank
             opts.rootTensor = 0
             pg.broadcast([], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor type"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor type"):
             opts = c10d.BroadcastOptions()
             opts.rootRank = self.rank
             opts.rootTensor = 0
             pg.broadcast([t1, t2], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor size"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor size"):
             opts = c10d.BroadcastOptions()
             opts.rootRank = self.rank
             opts.rootTensor = 0
@@ -394,15 +394,15 @@ def test_allreduce_checks(self):
         t2 = torch.zeros([1], dtype=torch.float64)
         t3 = torch.zeros([2], dtype=torch.float32)
 
-        with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"):
+        with self.assertRaisesRegex(RuntimeError, "requires non-empty tensor list"):
             opts = c10d.AllreduceOptions()
             pg.allreduce([], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor type"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor type"):
             opts = c10d.AllreduceOptions()
             pg.allreduce([t1, t2], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor size"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor size"):
             opts = c10d.AllreduceOptions()
             pg.allreduce([t1, t3], opts)
 
@@ -553,19 +553,19 @@ def test_allreduce_coalesced_checks(self):
         t2 = torch.zeros(1, dtype=torch.float64)
         t3 = torch.sparse_coo_tensor([[0]], [1], size=(1,))
 
-        with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"):
+        with self.assertRaisesRegex(RuntimeError, "requires non-empty tensor list"):
             opts = c10d.AllreduceCoalescedOptions()
             pg.allreduce_coalesced([], opts)
 
-        with self.assertRaisesRegex(ValueError, "tensors must all have the same type"):
+        with self.assertRaisesRegex(RuntimeError, "tensors must all have the same type"):
             opts = c10d.AllreduceCoalescedOptions()
             pg.allreduce_coalesced([t1, t2], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor layout at index"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor layout at index"):
             opts = c10d.AllreduceCoalescedOptions()
             pg.allreduce_coalesced([t1, t3], opts)
 
-        with self.assertRaisesRegex(ValueError, "unsupported layout"):
+        with self.assertRaisesRegex(RuntimeError, "unsupported layout"):
             opts = c10d.AllreduceCoalescedOptions()
             pg.allreduce_coalesced([t3, t3.clone()], opts)
 
@@ -579,7 +579,7 @@ def test_allreduce_coalesced_checks_cuda(self):
 
         t1 = torch.zeros(1, dtype=torch.float32)
 
-        with self.assertRaisesRegex(ValueError, "unsupported device type"):
+        with self.assertRaisesRegex(RuntimeError, "unsupported device type"):
             opts = c10d.AllreduceCoalescedOptions()
             pg.allreduce_coalesced([t1.cuda(), t1.cuda()], opts)
 
@@ -647,21 +647,21 @@ def test_sparse_allreduce_checks(self):
         t2 = torch.sparse_coo_tensor([[0]], [1], size=(2,))
         t3 = torch.sparse_coo_tensor([[0]], [1], size=(4,))
 
-        with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"):
+        with self.assertRaisesRegex(RuntimeError, "requires non-empty tensor list"):
             opts = c10d.AllreduceOptions()
             pg.allreduce([], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor layout"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor layout"):
             opts = c10d.AllreduceOptions()
             pg.allreduce([t1, t2], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor size"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor size"):
             opts = c10d.AllreduceOptions()
             pg.allreduce([t2, t3], opts)
 
         # Sparse allreduce only works with c10d.ReduceOp.SUM.
         for op in [c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX]:
-            with self.assertRaisesRegex(ValueError, "unsupported reduction operation"):
+            with self.assertRaisesRegex(RuntimeError, "unsupported reduction operation"):
                 opts = c10d.AllreduceOptions()
                 opts.reduceOp = op
                 pg.allreduce([t3], opts)
@@ -705,36 +705,36 @@ def test_scatter_checks(self):
         t2 = torch.zeros([1], dtype=torch.float64)
         t3 = torch.zeros([2], dtype=torch.float32)
 
-        with self.assertRaisesRegex(ValueError, "invalid root rank"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
             opts = c10d.ScatterOptions()
             opts.rootRank = -1
             pg.scatter([t1], [], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid root rank"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
             opts = c10d.ScatterOptions()
             opts.rootRank = self.world_size
             pg.scatter([t1], [], opts)
 
         with self.assertRaisesRegex(
-            ValueError, "requires a single-element output tensor list"
+            RuntimeError, "requires a single-element output tensor list"
         ):
             opts = c10d.ScatterOptions()
             opts.rootRank = 0
             pg.scatter([], [], opts)
 
         with self.assertRaisesRegex(
-            ValueError, "requires a single-element output tensor list"
+            RuntimeError, "requires a single-element output tensor list"
         ):
             opts = c10d.ScatterOptions()
             opts.rootRank = 0
             pg.scatter([t1, t1], [], opts)
 
-        with self.assertRaisesRegex(ValueError, "requires a single-element input list"):
+        with self.assertRaisesRegex(RuntimeError, "requires a single-element input list"):
             opts = c10d.ScatterOptions()
             opts.rootRank = self.rank
             pg.scatter([t1], [], opts)
 
-        with self.assertRaisesRegex(ValueError, "requires a single-element input list"):
+        with self.assertRaisesRegex(RuntimeError, "requires a single-element input list"):
             opts = c10d.ScatterOptions()
             opts.rootRank = self.rank
             pg.scatter([t1], [[t1] * self.world_size, [t1] * self.world_size], opts)
@@ -743,7 +743,7 @@ def test_scatter_checks(self):
         incorrect_list_size = self.world_size - 1
         err_str = "Incorrect input list size {}. Input list size should be {}"
         with self.assertRaisesRegex(
-            ValueError, err_str.format(incorrect_list_size, desired_list_size)
+            RuntimeError, err_str.format(incorrect_list_size, desired_list_size)
         ):
             opts = c10d.ScatterOptions()
             opts.rootRank = self.rank
@@ -751,23 +751,23 @@ def test_scatter_checks(self):
 
         incorrect_list_size = self.world_size + 1
         with self.assertRaisesRegex(
-            ValueError, err_str.format(incorrect_list_size, desired_list_size)
+            RuntimeError, err_str.format(incorrect_list_size, desired_list_size)
         ):
             opts = c10d.ScatterOptions()
             opts.rootRank = self.rank
             pg.scatter([t1], [[t1] * incorrect_list_size], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor type"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor type"):
             opts = c10d.ScatterOptions()
             opts.rootRank = self.rank
             pg.scatter([t1], [[t2] * self.world_size], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor size"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor size"):
             opts = c10d.ScatterOptions()
             opts.rootRank = self.rank
             pg.scatter([t1], [[t3] * self.world_size], opts)
 
-        with self.assertRaisesRegex(ValueError, "requires empty input on non-root"):
+        with self.assertRaisesRegex(RuntimeError, "requires empty input on non-root"):
             opts = c10d.ScatterOptions()
             opts.rootRank = (self.rank + 1) % self.world_size
             pg.scatter([t1], [[t1] * self.world_size], opts)
@@ -872,39 +872,39 @@ def test_gather_checks(self):
         t2 = torch.zeros([1], dtype=torch.float64)
         t3 = torch.zeros([2], dtype=torch.float32)
 
-        with self.assertRaisesRegex(ValueError, "invalid root rank"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
             opts = c10d.GatherOptions()
             opts.rootRank = -1
             pg.gather([], [t1], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid root rank"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
             opts = c10d.GatherOptions()
             opts.rootRank = self.world_size
             pg.gather([], [t1], opts)
 
         with self.assertRaisesRegex(
-            ValueError, "requires a single-element input tensor list"
+            RuntimeError, "requires a single-element input tensor list"
         ):
             opts = c10d.GatherOptions()
             opts.rootRank = 0
             pg.gather([], [], opts)
 
         with self.assertRaisesRegex(
-            ValueError, "requires a single-element input tensor list"
+            RuntimeError, "requires a single-element input tensor list"
         ):
             opts = c10d.GatherOptions()
             opts.rootRank = 0
             pg.gather([], [t1, t1], opts)
 
         with self.assertRaisesRegex(
-            ValueError, "requires a single-element output list"
+            RuntimeError, "requires a single-element output list"
         ):
             opts = c10d.GatherOptions()
             opts.rootRank = self.rank
             pg.gather([], [t1], opts)
 
         with self.assertRaisesRegex(
-            ValueError, "requires a single-element output list"
+            RuntimeError, "requires a single-element output list"
         ):
             opts = c10d.GatherOptions()
             opts.rootRank = self.rank
@@ -914,7 +914,7 @@ def test_gather_checks(self):
         incorrect_list_size = self.world_size - 1
         err_str = "Incorrect output list size {}. Output list size should be {}"
         with self.assertRaisesRegex(
-            ValueError, err_str.format(incorrect_list_size, desired_list_size)
+            RuntimeError, err_str.format(incorrect_list_size, desired_list_size)
         ):
             opts = c10d.GatherOptions()
             opts.rootRank = self.rank
@@ -922,23 +922,23 @@ def test_gather_checks(self):
 
         incorrect_list_size = self.world_size + 1
         with self.assertRaisesRegex(
-            ValueError, err_str.format(incorrect_list_size, desired_list_size)
+            RuntimeError, err_str.format(incorrect_list_size, desired_list_size)
         ):
             opts = c10d.GatherOptions()
             opts.rootRank = self.rank
             pg.gather([[t1] * incorrect_list_size], [t1], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor type"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor type"):
             opts = c10d.GatherOptions()
             opts.rootRank = self.rank
             pg.gather([[t2] * self.world_size], [t1], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor size"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor size"):
             opts = c10d.GatherOptions()
             opts.rootRank = self.rank
             pg.gather([[t3] * self.world_size], [t1], opts)
 
-        with self.assertRaisesRegex(ValueError, "requires empty output on non-root"):
+        with self.assertRaisesRegex(RuntimeError, "requires empty output on non-root"):
             opts = c10d.GatherOptions()
             opts.rootRank = (self.rank + 1) % self.world_size
             pg.gather([[t1] * self.world_size], [t1], opts)
@@ -1039,39 +1039,39 @@ def test_allgather_checks(self):
         t2 = torch.zeros([1], dtype=torch.float64)
         t3 = torch.zeros([2], dtype=torch.float32)
 
-        with self.assertRaisesRegex(ValueError, "requires non-empty input tensor list"):
+        with self.assertRaisesRegex(RuntimeError, "requires non-empty input tensor list"):
             pg.allgather([], [])
 
         with self.assertRaisesRegex(
-            ValueError, "requires input/output tensor lists to have the same length"
+            RuntimeError, "requires input/output tensor lists to have the same length"
         ):
             pg.allgather([], [t1])
 
         with self.assertRaisesRegex(
-            ValueError, "requires input/output tensor lists to have the same length"
+            RuntimeError, "requires input/output tensor lists to have the same length"
         ):
             pg.allgather([[t1] * self.world_size, [t1] * self.world_size], [t1])
 
-        with self.assertRaisesRegex(ValueError, "invalid output tensor list"):
+        with self.assertRaisesRegex(RuntimeError, "invalid output tensor list"):
             pg.allgather([[t1] * (self.world_size - 1)], [t1])
 
-        with self.assertRaisesRegex(ValueError, "invalid output tensor list"):
+        with self.assertRaisesRegex(RuntimeError, "invalid output tensor list"):
             pg.allgather([[t1] * (self.world_size + 1)], [t1])
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor type"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor type"):
             pg.allgather(
                 [[t1, t1] * (self.world_size), [t1, t1] * (self.world_size)], [t1, t2]
             )
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor size"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor size"):
             pg.allgather(
                 [[t1, t1] * (self.world_size), [t1, t1] * (self.world_size)], [t1, t3]
             )
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor type"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor type"):
             pg.allgather([([t1, t2] * (self.world_size))[: self.world_size]], [t1])
 
-        with self.assertRaisesRegex(ValueError, "invalid tensor size"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor size"):
             pg.allgather([([t1, t3] * (self.world_size))[: self.world_size]], [t1])
 
     def _test_allgather_basics(self, fn):
@@ -1160,13 +1160,13 @@ def test_allgather_coalesced_checks(self):
         # One of output tensors does not match input list.
         dummy_output_lists[0] = [torch.zeros([0], dtype=torch.float32)]
         with self.assertRaisesRegex(
-            ValueError, "invalid size of output tensor at index 0"
+            RuntimeError, "invalid size of output tensor at index 0"
         ):
             c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg)
 
         # One of output tensors does not match input list.
         dummy_output_lists[0] = [torch.zeros([1], dtype=torch.float64)]
-        with self.assertRaisesRegex(ValueError, "invalid tensor type at index 0"):
+        with self.assertRaisesRegex(RuntimeError, "invalid tensor type at index 0"):
             c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg)
 
         # Output lists have too many elements
@@ -1174,7 +1174,7 @@ def test_allgather_coalesced_checks(self):
             [torch.zeros([1], dtype=torch.float32)] for _ in range(self.world_size + 1)
         ]
         with self.assertRaisesRegex(
-            ValueError, "output lists should be equal to world size"
+            RuntimeError, "output lists should be equal to world size"
         ):
             c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg)
 
@@ -1194,26 +1194,26 @@ def test_reduce_checks(self):
 
         t1 = torch.zeros([1], dtype=torch.float32)
 
-        with self.assertRaisesRegex(ValueError, "invalid root rank"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
             opts = c10d.ReduceOptions()
             opts.rootRank = -1
             opts.rootTensor = 0
             pg.reduce([t1], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid root rank"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root rank"):
             opts = c10d.ReduceOptions()
             opts.rootRank = self.world_size
             opts.rootTensor = 0
             pg.reduce([t1], opts)
 
-        with self.assertRaisesRegex(ValueError, "invalid root tensor"):
+        with self.assertRaisesRegex(RuntimeError, "invalid root tensor"):
             opts = c10d.ReduceOptions()
             opts.rootRank = self.rank
             opts.rootTensor = 1
             pg.reduce([t1], opts)
 
         with self.assertRaisesRegex(
-            ValueError, "requires a single-element tensor list"
+            RuntimeError, "requires a single-element tensor list"
         ):
             opts = c10d.ReduceOptions()
             opts.rootRank = self.rank
diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
index ba26409c9b990..b8f5aa3989ce4 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
@@ -923,7 +923,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupGloo::broadcast(
     std::vector<at::Tensor>& inputs,
     const BroadcastOptions& opts) {
   static auto invalidArgument = [](const std::string& msg) {
-    throw std::invalid_argument("ProcessGroupGloo::broadcast: " + msg);
+    TORCH_CHECK(false, "ProcessGroupGloo::broadcast: " + msg);
   };
 
   assertRootRank(invalidArgument, opts.rootRank, size_);
@@ -1414,7 +1414,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupGloo::allreduce(
     std::vector<at::Tensor>& inputs,
     const AllreduceOptions& opts) {
   static auto invalidArgument = [](const std::string& msg) {
-    throw std::invalid_argument("ProcessGroupGloo::allreduce: " + msg);
+    TORCH_CHECK(false, "ProcessGroupGloo::allreduce: " + msg);
   };
 
   assertNonEmpty(invalidArgument, inputs);
@@ -1475,7 +1475,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupGloo::allreduce_coalesced(
     std::vector<at::Tensor>& tensors,
     const AllreduceCoalescedOptions& opts) {
   static auto invalidArgument = [](const std::string& msg) {
-    throw std::invalid_argument(
+    TORCH_CHECK(false,
         "ProcessGroupGloo::allreduce_coalesced: " + msg);
   };
   assertNonEmpty(invalidArgument, tensors);
@@ -1644,7 +1644,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupGloo::reduce(
     std::vector<at::Tensor>& inputs,
     const ReduceOptions& opts) {
   static auto invalidArgument = [](const std::string& msg) {
-    throw std::invalid_argument("ProcessGroupGloo::reduce: " + msg);
+    TORCH_CHECK(false, "ProcessGroupGloo::reduce: " + msg);
   };
 
   assertRootRank(invalidArgument, opts.rootRank, size_);
@@ -1821,7 +1821,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupGloo::allgather(
     std::vector<at::Tensor>& inputs,
     const AllgatherOptions& opts) {
   static auto invalidArgument = [](const std::string& msg) {
-    throw std::invalid_argument("ProcessGroupGloo::allgather: " + msg);
+    TORCH_CHECK(false, "ProcessGroupGloo::allgather: " + msg);
   };
 
   if (inputs.size() == 0) {
@@ -1955,7 +1955,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupGloo::allgather_coalesced(
     std::vector<at::Tensor>& input_list,
     const AllgatherOptions& /* unused */) {
   static auto invalidArgument = [](const std::string& msg) {
-    throw std::invalid_argument(
+    TORCH_CHECK(false,
         "ProcessGroupGloo::allgather_coalesced: " + msg);
   };
 
@@ -2152,7 +2152,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupGloo::gather(
     std::vector<at::Tensor>& inputs,
     const GatherOptions& opts) {
   static auto invalidArgument = [](const std::string& msg) {
-    throw std::invalid_argument("ProcessGroupGloo::gather: " + msg);
+    TORCH_CHECK(false, "ProcessGroupGloo::gather: " + msg);
   };
 
   assertRootRank(invalidArgument, opts.rootRank, size_);
@@ -2336,7 +2336,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupGloo::scatter(
     std::vector<std::vector<at::Tensor>>& inputs,
     const ScatterOptions& opts) {
   static auto invalidArgument = [](const std::string& msg) {
-    throw std::invalid_argument("ProcessGroupGloo::scatter: " + msg);
+    TORCH_CHECK(false, "ProcessGroupGloo::scatter: " + msg);
   };
 
   assertRootRank(invalidArgument, opts.rootRank, size_);
@@ -2530,7 +2530,7 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupGloo::alltoall_base(
     std::vector<int64_t>& inputCounts,
     const AllToAllOptions& /* unused */) {
   static auto invalidArgument = [](const std::string& msg) {
-    throw std::invalid_argument("ProcessGroupGloo::alltoall_base: " + msg);
+    TORCH_CHECK(false, "ProcessGroupGloo::alltoall_base: " + msg);
   };
 
   TORCH_CHECK(

From 59fcbd172b5dfdeb12e2f1b7a78c2ce95c1eb680 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 1 Sep 2021 16:25:00 -0700
Subject: [PATCH 446/530] Fix incorrect DDP test (#64074)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64074

Previous PR https://github.com/pytorch/pytorch/pull/63831 did not actually test the error in https://github.com/pytorch/pytorch/issues/63812. Introduce a test
directly from the repro that simulates it.
ghstack-source-id: 137171460

Test Plan: CI

Reviewed By: SciPioneer

Differential Revision: D30569719

fbshipit-source-id: fd61250ef6d291c093607663d91d6d2cb5574eb7
---
 .../_internal/distributed/distributed_test.py | 31 ++++++++++---------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 333458c5f8308..f17842ee02a22 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -3761,25 +3761,28 @@ def test_DistributedDataParallel_requires_grad(self):
             self._barrier()
 
         @sandcastle_skip_if(
-            BACKEND != "nccl" and BACKEND != "gloo",
-            "Only NCCL and GLOO backend support DistributedDataParallel",
+            BACKEND == "nccl",
+            "Gloo-only test"
         )
-        @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
         def test_ddp_create_graph(self):
-            rank = self.rank
-            torch.cuda.set_device(rank)
-            net = torch.nn.parallel.DistributedDataParallel(
-                torch.nn.Linear(1, 1, bias=False).cuda(rank),
-                device_ids=[rank]
-            )
-            inp = torch.randn((2, 1), device=rank)
+            class Model(nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.p = nn.Parameter(torch.tensor(1.))
+
+                def forward(self):
+                    return self.p.pow(2)
+
+            model = Model()
+            ddp_model = torch.nn.parallel.DistributedDataParallel(model)
             for _ in range(6):
-                loss = net(inp).sum()
-                # Verify DDP works with create_graph=True
-                loss.backward(create_graph=True)
+                # Verify DDP doesn't throw when ran with create_graph=True.
+                # Although we do warn about potential issues, please see
+                # https://github.com/pytorch/pytorch/issues/63929 for details.
+                ddp_model().backward(create_graph=True)
                 # grad tensors should require grad.
                 self.assertTrue(
-                    all([param.requires_grad for param in net.parameters()])
+                    all([param.requires_grad for param in ddp_model.parameters()])
                 )
 
         @sandcastle_skip_if(

From baceea442621346cc42f86c28d9d239531dfa006 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 1 Sep 2021 17:04:37 -0700
Subject: [PATCH 447/530] [DDP] Add more logging iterations (#64071)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64071

Adding more logging iterations to get additional data.
ghstack-source-id: 137119476

Test Plan: CI

Reviewed By: mrshenli

Differential Revision: D30579367

fbshipit-source-id: 57195266ada5e5926f0d8eaf4fb4e01dc98924d7
---
 torch/csrc/distributed/c10d/logger.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp
index 9fa7289c16568..0bb960a639907 100644
--- a/torch/csrc/distributed/c10d/logger.cpp
+++ b/torch/csrc/distributed/c10d/logger.cpp
@@ -8,7 +8,7 @@ namespace c10d {
 
 // When training runs at these iterations, log the runtime
 // stats.
-const int LoggingIterations[] = {10, 20, 100, 1000};
+const int LoggingIterations[] = {10, 20, 100, 1000, 5000, 10000, 20000}; // NOLINT
 
 std::ostream& operator<<(std::ostream& output, const Logger& logger) {
   auto& ddp_logging_data = (*logger.ddp_logging_data_);

From bf9d66586c388c0aa223644b1d224227443ae34b Mon Sep 17 00:00:00 2001
From: Yi Wang <wayi@fb.com>
Date: Wed, 1 Sep 2021 17:32:39 -0700
Subject: [PATCH 448/530] [DDP Comm Hook] Create a noop hook for performance
 debugging (#64344)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64344

As title.

Additionally, avoid using numpy array in test_ddp_hooks.py.
ghstack-source-id: 137170449

Test Plan: buck test mode/dev-nosan caffe2/test/distributed/algorithms/ddp_comm_hooks:test_ddp_hooks -- test_ddp_comm_hook_noop_hook

Reviewed By: rohan-varma

Differential Revision: D30693220

fbshipit-source-id: e17f0d1c6198863cf20a53566f586a6bff602522
---
 .../ddp_comm_hooks/test_ddp_hooks.py          | 34 +++++++++++++++----
 .../algorithms/ddp_comm_hooks/__init__.py     |  4 +++
 .../ddp_comm_hooks/debugging_hooks.py         | 26 ++++++++++++++
 3 files changed, 58 insertions(+), 6 deletions(-)
 create mode 100644 torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py

diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
index 3d00712ca5354..67175b2d22495 100644
--- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
+++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
@@ -2,7 +2,6 @@
 import os
 import sys
 
-import numpy as np
 import torch
 from torch import nn
 import torch.distributed as dist
@@ -105,7 +104,9 @@ def _run_and_get_grads(self, model):
         # Run backward
         output.mean().backward()
 
-        return [p.grad.data.cpu().numpy() for p in model.parameters()]
+        # The only layer
+        param = next(model.parameters())
+        return param.grad
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
@@ -122,7 +123,7 @@ def test_ddp_comm_hook_allreduce_hook(self):
         # Register hook case, get the hook grads.
         hook_grads = self._get_grads(process_group, DDPCommHookType.ALLREDUCE)
 
-        np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0)
+        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
@@ -139,7 +140,7 @@ def test_ddp_comm_hook_fp16compress_hook(self):
         # Register hook case, get the hook grads.
         hook_grads = self._get_grads(process_group, DDPCommHookType.FP16_COMPRESS)
 
-        np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
+        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
@@ -156,7 +157,7 @@ def test_ddp_comm_hook_quantize_per_tensor_hook(self):
         # Register hook case, get the hook grads.
         hook_grads = self._get_grads(process_group, DDPCommHookType.QUANTIZE_PER_TENSOR)
 
-        np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
+        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
@@ -175,7 +176,28 @@ def test_ddp_comm_hook_quantize_per_channel_hook(self):
             process_group, DDPCommHookType.QUANTIZE_PER_CHANNEL
         )
 
-        np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
+        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
+
+
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    def test_ddp_comm_hook_noop_hook(self):
+        """
+        This unit test verifies the ``noop`` hook registered case and a subsequent allreduce
+        gives same result with no hook registered case.
+        """
+        store = dist.FileStore(self.file_name, self.world_size)
+        process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size)
+
+        # No hook registered case, get the reference grads.
+        reference_grads = self._get_grads(process_group, None)
+        # Register hook case, get the hook grads.
+        hook_grads = self._get_grads(process_group, DDPCommHookType.NOOP)
+        # Apply a subsequent allreduce to average grads.
+        hook_grads.div_(self.world_size)
+        dist.all_reduce(hook_grads, group=process_group)
+
+        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
index c3f3b066ee478..ff22a818f925d 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
@@ -5,6 +5,7 @@
 from torch.nn.parallel import DistributedDataParallel
 
 from . import (
+    debugging_hooks as debugging,
     default_hooks as default,
     powerSGD_hook as powerSGD,
     quantization_hooks as quantization,
@@ -78,6 +79,9 @@ class DDPCommHookType(Enum):
         comm_hook=powerSGD.batched_powerSGD_hook,
         matrix_approximation_rank=2,
     )
+    NOOP = partial(
+        _ddp_comm_hook_wrapper, comm_hook=debugging.noop_hook,
+    )
 
 
 def register_ddp_comm_hook(
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py b/torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py
new file mode 100644
index 0000000000000..0c60762caf2ed
--- /dev/null
+++ b/torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py
@@ -0,0 +1,26 @@
+from typing import Any
+
+import torch
+import torch.distributed as dist
+
+
+def noop_hook(_: Any, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
+    """
+    This DDP communication hook returns the a future that wraps the input,
+    so it is a noop that does not incur any communication overheads.
+
+    This hook should **only** be used for headroom analysis of allreduce optimization,
+    instead of the normal gradient synchronization.
+    For example, if only less than 10% speedup of training time can be observed after this hook is registered,
+    it usually implies that allreduce is not a performance bottleneck for this case.
+    Such instrumentation can be particularly useful
+    if GPU traces cannot be easily retrieved or the trace analysis is complicated
+    some factors such as the overlap between allreduce and computation or the desynchronization across ranks.
+
+    Example::
+        >>> ddp_model.register_comm_hook(None, noop_hook)
+    """
+    fut: torch.futures.Future[torch.Tensor] = torch.futures.Future()
+    fut.set_result(bucket.buffer())
+
+    return fut

From 778af565048e6160ce59fb5eedd8455e629f7942 Mon Sep 17 00:00:00 2001
From: Yi Wang <wayi@fb.com>
Date: Wed, 1 Sep 2021 17:32:39 -0700
Subject: [PATCH 449/530] [DDP Comm Hook] Add debugging communication hooks to
 ddp_comm_hooks.rst (#64352)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64352

as title
ghstack-source-id: 137246253

Test Plan: N/A

Reviewed By: rohan-varma

Differential Revision: D30694089

fbshipit-source-id: a78110b11d59bb0718f43c99ede23f2fd8ab21d0
---
 docs/source/ddp_comm_hooks.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docs/source/ddp_comm_hooks.rst b/docs/source/ddp_comm_hooks.rst
index 5bd0378e7c7b7..d0f11fe0b0412 100644
--- a/docs/source/ddp_comm_hooks.rst
+++ b/docs/source/ddp_comm_hooks.rst
@@ -84,6 +84,18 @@ PowerSGD Hooks
 .. autofunction:: powerSGD_hook
 .. autofunction:: batched_powerSGD_hook
 
+Debugging Communication Hooks
+-----------------------------
+
+As the name implies, debugging communication hooks are **only** used for debugging and performance optimization purpose.
+
+.. currentmodule:: torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks
+
+.. warning ::
+    Debugging communication hooks do not necessarily output the correct results.
+
+.. autofunction:: noop_hook
+
 Acknowledgements
 ----------------
 

From 59c6ceb6a8338c5de3f3aee7b7790b1d0daefb0a Mon Sep 17 00:00:00 2001
From: Zeina Migeed <migeedz@fb.com>
Date: Wed, 1 Sep 2021 18:04:19 -0700
Subject: [PATCH 450/530] add documentation to shape inference algorithm
 (#64312)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64312

Test Plan: Imported from OSS

Reviewed By: zou3519

Differential Revision: D30709254

Pulled By: migeed-z

fbshipit-source-id: 3297d26fe6727c5b9ca176625b1683d787f59659
---
 .../experimental/graph_gradual_typechecker.py | 152 +++++++++++++-----
 1 file changed, 114 insertions(+), 38 deletions(-)

diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py
index a54e52151f858..6094952f1695e 100644
--- a/torch/fx/experimental/graph_gradual_typechecker.py
+++ b/torch/fx/experimental/graph_gradual_typechecker.py
@@ -28,7 +28,7 @@ def expand_to_tensor_dim(t, n):
     Expand a type to the desired tensor dimension if possible
     Raise an error otherwise.
     - t is the given type
-    - n is a number to expand to
+    - n is a number of dimensions to expand to
     """
     if t == Dyn:
         dims = [Dyn] * n
@@ -42,6 +42,13 @@ def expand_to_tensor_dim(t, n):
 
 
 def broadcast_types(t1, t2):
+    """
+    Applies broadcasting to both given types such that they
+    become consistent with eachother and returns two new
+    resulting types
+    """
+
+    # if either type is Dyn, do nothing since the types are already consistent
     if t1 == Dyn or t2 == Dyn or isinstance(t1, Var) or isinstance(t2, Var):
         return t1, t2
 
@@ -52,7 +59,8 @@ def broadcast_types(t1, t2):
         new_t1 = list(t1.__args__)
         new_t2 = list(t2.__args__)
 
-        # here, we make our tensors the same length
+        # We make the types the same length which is the first requirement
+        # for consistency
         if s1 > s2:
             for i in range(s1 - s2):
                 new_t2.insert(0, 1)
@@ -61,15 +69,18 @@ def broadcast_types(t1, t2):
             for i in range(s2 - s1):
                 new_t1.insert(0, 1)
 
+        # we replace occurrences of "1" with each tensor with
+        # the corresponding type from the other tensor
         for i, (x, y) in enumerate(zip(new_t1, new_t2)):
             if x == 1:
                 new_t1[i] = y
             elif y == 1:
                 new_t2[i] = x
 
+        # at this point our tensors should be consistent
+        # and we can apply the element-wise operation and find the right dimension
+        # for the output of the operation
         (t1, t2) = TensorType(tuple(new_t1)), TensorType(tuple(new_t2))
-
-
         return (t1, t2)
     else:
         raise TypeError(f'Cannot broadcast types {t1} and {t2}')
@@ -77,7 +88,7 @@ def broadcast_types(t1, t2):
 def register_inference_rule(call_target):
     def register(fn):
         if call_target in _INFERENCE_RULES:
-            raise RuntimeError('Inference rule already registered for {call_target}!')
+            raise RuntimeError(f'Inference rule already registered for {call_target}!')
         _INFERENCE_RULES[call_target] = fn
         return fn
     return register
@@ -85,7 +96,7 @@ def register(fn):
 def register_refinement_rule(call_target):
     def register(fn):
         if call_target in _REFINEMENT_RULES:
-            raise RuntimeError('Refinement rule already registered for {call_target}!')
+            raise RuntimeError(f'Refinement rule already registered for {call_target}!')
         _REFINEMENT_RULES[call_target] = fn
         return fn
     return register
@@ -93,7 +104,7 @@ def register(fn):
 def register_algebraic_expressions_inference_rule(call_target):
     def register(fn):
         if call_target in _RULES:
-            raise RuntimeError('Rule already registered for {call_target}!')
+            raise RuntimeError(f'Rule already registered for {call_target}!')
         _RULES[call_target] = fn
         return fn
     return register
@@ -101,6 +112,17 @@ def register(fn):
 @register_inference_rule(torch.add)
 @register_inference_rule(operator.add)
 def add_inference_rule(n: Node):
+    """
+    Apply the addition inference rule. This includes:
+    - scalar addition
+    - broadcasting semantics
+
+    Note that we always return the least precise type between
+    the operands (after applying broadcasting) to be the final type of the operation
+
+    Note that we do not modify the operand types themselves after applying broadcasting
+    to them. We only use them to calculate the final type
+    """
     assert isinstance(n.args[0], Node)
     assert isinstance(n.args[1], Node)
     t1 = n.args[0].type
@@ -111,10 +133,15 @@ def add_inference_rule(n: Node):
         n.type = t2
         return n.type
 
+    # handle scalar addition
     elif t2 == int and isinstance(t1, TensorType):
         n.type = t1
         return n.type
 
+    # we bring the new types to the point where
+    # we can check for consistency
+    # any inconsistency would not have been caused
+    # by broadcasting at this point
     (new_t1, new_t2) = broadcast_types(t1, t2)
 
     if new_t1 != t1 or new_t2 != t2:
@@ -122,13 +149,13 @@ def add_inference_rule(n: Node):
         n.meta[str(n.args[0])] = new_t1
         n.meta[str(n.args[1])] = new_t2
 
-    # Todo: maybe figure out that broadcasting definitely did not happen?
     else:
         n.meta['broadcast'] = False
 
     new_t1 = t1 if not n.meta['broadcast'] else new_t1
     new_t2 = t2 if not n.meta['broadcast'] else new_t2
 
+    # we check for consistency between the new types
     if is_consistent(new_t1, new_t2):
         # we return the less precise type because
         # broadcasting may have happened
@@ -145,6 +172,12 @@ def add_inference_rule(n: Node):
 
 @register_inference_rule(getattr)
 def get_attr_inference_rule(n: Node, traced):
+    """
+    The current getattr rule only handles the shape attribute
+    Can be extended to other attributes
+    The most representitive type we have is "Dyn" but the system
+    can be extended with more types, such as a type to represent shapes
+    """
     attr_node = n.args[0]
     attr_name = n.args[1]
 
@@ -158,6 +191,10 @@ def get_attr_inference_rule(n: Node, traced):
 
 @register_inference_rule(torch.transpose)
 def transpose_inference_rule(n: Node):
+    """
+    We check that dimentions for the transpose operations
+    are within range of the tensor type of the node
+    """
     if n.target == torch.transpose:
         assert isinstance(n.args[0], Node)
         t = n.args[0].type
@@ -171,12 +208,11 @@ def transpose_inference_rule(n: Node):
             return n.type
 
         elif isinstance(t, TensorType):
-
             if 0 <= dim1 < len(t.__args__) and 0 <= dim2 < len(t.__args__):
                 new_type = list(t.__args__)
                 new_type[dim1], new_type[dim2] = new_type[dim2], new_type[dim1]
                 final = TensorType(new_type)
-                n.type = final
+                n.type = get_greatest_upper_bound(n.type, final)
                 return n.type
             else:
                 raise TypeError(f'Cannot transpose {dim1} and {dim2} in type {t} for node {n}')
@@ -186,6 +222,15 @@ def transpose_inference_rule(n: Node):
 
 @register_inference_rule(torch.reshape)
 def reshape_inference_rule(n: Node):
+    """
+    Without dynamism, the rule checks that the
+    product of the elements of the argument tensor
+    type is equal to the product of the elements
+    of the required shape. We gradualize this rule
+    by adding a case to handle fully dynamic input
+    as well as input where some of the tensor dimensions
+    are unknown. In this case we check for divisibility
+    """
     assert isinstance(n.args[0], Node)
     t1 = n.args[0].type
 
@@ -201,7 +246,7 @@ def reshape_inference_rule(n: Node):
 
     # if any of the dimensions are unknown,
     # we check for divisibility
-    elif isinstance(t1, TensorType) and Dyn in t1.__args__ or -1 in t2:
+    elif isinstance(t1, TensorType):
         assert isinstance(t1, TensorType)
         a = [e if e != Dyn else 1 for e in t1.__args__]
         p1 = reduce(lambda x, y: x * y, a)
@@ -211,17 +256,6 @@ def reshape_inference_rule(n: Node):
             return t2_type
         else:
             raise TypeError(f'Cannot reshape in node {n} from {t1} to {t2_type}')
-
-    # if all dimensions are known we check the products
-    elif isinstance(t1, TensorType):
-        p1 = reduce(lambda x, y: x * y, t1.__args__)
-        p2 = reduce(lambda x, y: x * y, t2)
-        if p1 == p2:
-            n.type = t2_type
-            return t2_type
-        else:
-            raise TypeError(f'Cannot reshape in node {n} from {t1} to {t2_type}')
-
     else:
         raise TypeError(f'Cannot reshape in node {n} from {t1} to {t2_type}')
 
@@ -260,7 +294,7 @@ def bn2d_inference_rule(n: Node, module_instance):
 
 def calculate_out_dimension(d_in, module_instance, index):
     """
-    For calculating h_in and w_out.
+    For calculating h_in and w_out according to the conv2D documentation
     """
     padding = (module_instance.padding, module_instance.padding) \
         if isinstance(module_instance.padding, int) else module_instance.padding
@@ -346,6 +380,10 @@ def relu_inference_rule(n: Node, module_instance):
 
 
 def maxpool2d_check(typ, module_instance):
+    """
+    Applies the maxpool2d shape information to the input
+    this affects the last two dimensions
+    """
     new_type_list = list(typ.__args__)
     if len(new_type_list) == 4 or len(new_type_list) == 3:
         w_in = new_type_list[-1]
@@ -391,7 +429,6 @@ def linear_check(tensor_type, module_instance):
     """
     if len(tensor_type.__args__) >= 2:
         if is_consistent(module_instance.in_features, tensor_type.__args__[-1]):
-            # Todo backwards propagation
             new_type_args = list(tensor_type.__args__)
             new_type_args[-1] = module_instance.out_features
             return TensorType(tuple(new_type_args))
@@ -403,6 +440,10 @@ def linear_check(tensor_type, module_instance):
 
 @register_inference_rule(torch.nn.Linear)
 def linear_inference_rule(n: Node, module_instance):
+    """
+    Applies the shape information to the input then gets the greatest upper bound
+    of the resulting type and the existing type
+    """
     assert isinstance(n.args[0], Node)
     if n.args[0].type == Dyn and isinstance(n.type, TensorType):
         n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__))
@@ -470,6 +511,10 @@ def flatten_check(tensor_type, start_dim, end_dim):
 
 @register_inference_rule(torch.flatten)
 def flatten_inference_rule(n: Node):
+    """
+    Applies the flatten shape information to the input then gets the
+    greatest upper bound of the resulting type and the existing type
+    """
     assert isinstance(n.args[0], Node)
 
     # set the default start and end dims
@@ -568,6 +613,10 @@ def get_node_type(a):
 
 @register_refinement_rule(Conv2d)
 def conv_refinement_rule(n: Node):
+    """
+    The equality constraints are between the first dimension of
+    the input and output
+    """
     res = []
     assert isinstance(n.args[0], Node)
     arg_type = n.args[0].type
@@ -578,6 +627,10 @@ def conv_refinement_rule(n: Node):
 
 @register_refinement_rule(torch.nn.Linear)
 def linear_refinement_rule(n: Node):
+    """
+    The equality constraints are between the first dimension of
+    the input and output
+    """
     res = []
     assert isinstance(n.args[0], Node)
     arg_type = n.args[0].type
@@ -585,10 +638,12 @@ def linear_refinement_rule(n: Node):
         res = [Equality(arg_type.__args__[0], n.type.__args__[0])]
     return res
 
-# todo needs review for addition. Is this constraint correct?
 @register_refinement_rule(BatchNorm2d)
 @register_refinement_rule(torch.nn.ReLU)
 def all_eq(n: Node):
+    """
+    For operations where the input shape is equal to the output shape
+    """
     res = []
     assert isinstance(n.args[0], Node)
     arg_type = n.args[0].type
@@ -600,7 +655,12 @@ def all_eq(n: Node):
 
 
 @register_refinement_rule(torch.nn.AdaptiveAvgPool2d)
-def first_two__eq(n: Node):
+@register_refinement_rule(torch.nn.MaxPool2d)
+def first_two_eq(n: Node):
+    """
+    For operations where the first two dimensions of the input and output shape
+    are equal
+    """
     res = []
     assert isinstance(n.args[0], Node)
     arg_type = n.args[0].type
@@ -610,19 +670,37 @@ def first_two__eq(n: Node):
         res = [Equality(args1[0], args2[0]), Equality(args1[1], args2[1])]
     return res
 
+
 @register_refinement_rule(torch.add)
 @register_refinement_rule(operator.add)
-def add_eq(n: Node):
+def element_wise_eq(n: Node):
+    """
+    For element-wise operations and handles broadcasting.
+    Note that after applying broadcasting to the arguments
+    we are able to determine if certain dimensions have not been broadcast
+    if they are symbolicallu equal.
+
+    in this case, we can establish equality between those dimensions and the
+    corresponding output dimensions.
+
+    Note that it takes two iterations for this result. One iteration to establish
+    equality between certain dimensions of the operands (requiring the whole solver
+    including unification) and another iteration to establish equality between the operands
+    and the resulting type, requiring another round of constraint generation and unificaiton.
+    """
     res = []
     if isinstance(n.args[0], Node) and isinstance(n.args[1], Node):
         arg_type1 = n.args[0].type
         arg_type2 = n.args[1].type
         if isinstance(arg_type1, TensorType) and isinstance(arg_type2, TensorType) and isinstance(n.type, TensorType):
             args1, args2 = broadcast_types(arg_type1, arg_type2)
-            # by this point, we know for sure that args1 and args2 are the same size.
+            # by this point, we know that args1 and args2 are the same size.
             a1 = args1.__args__
             a2 = args2.__args__
             a3 = n.type.__args__
+
+            # we would be here in the second iteration where we establish equality
+            # between operand type dimensions and the resulting type dimensions
             r = []
             for x, y, z in zip(a1, a2, a3):
                 if x == y:
@@ -630,19 +708,13 @@ def add_eq(n: Node):
             res = r
     return res
 
-@register_refinement_rule(torch.nn.MaxPool2d)
-def first_two(n: Node):
-    res = []
-    assert isinstance(n.args[0], Node)
-    arg_type = n.args[0].type
-    if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType):
-        args1 = arg_type.__args__
-        args2 = n.type.__args__
-        res = [Equality(args1[0], args2[0]), Equality(args1[1], args2[1])]
-    return res
 
 @register_refinement_rule(torch.flatten)
 def flatten_refinement_rule(n: Node):
+    """
+    Generates equality constraints between the dimensions of the input and output
+    that will not be involved in the flatten operation
+    """
     assert isinstance(n.args[0], Node)
 
     eq_const = []
@@ -674,6 +746,10 @@ def flatten_refinement_rule(n: Node):
 
 @register_algebraic_expressions_inference_rule(Conv2d)
 def conv_rule(n: Node, module_instance):
+    """
+    Represents the outout in terms of an algrbraic expression w.r.t
+    the input when possible
+    """
     assert isinstance(n.args[0], Node)
     arg_type = n.args[0].type
     if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType):

From 4d6314a16e78027832186f5442df888dbabbc159 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 1 Sep 2021 18:12:02 -0700
Subject: [PATCH 451/530] [DDP] Log num threads (#64072)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64072

Log gloo threads to DDP logging.
ghstack-source-id: 137119480

Test Plan: CI

Reviewed By: mrshenli

Differential Revision: D30596083

fbshipit-source-id: 2b4f6e762cb5d850be6056bcc5922029a1af3c91
---
 torch/csrc/distributed/c10d/ProcessGroupGloo.hpp      |  4 ++++
 torch/csrc/distributed/c10d/logger.cpp                | 11 +++++++++++
 .../testing/_internal/distributed/distributed_test.py |  6 ++++++
 3 files changed, 21 insertions(+)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
index 036ce91b85faf..5c0c76afa2453 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
@@ -318,6 +318,10 @@ class TORCH_API ProcessGroupGloo : public ProcessGroup {
   // may indicate that there is some sort of collective desynchronization.
   uint64_t getSequenceNumberForGroup() override;
 
+  int getNumThreads() {
+    return options_->threads;
+  }
+
  protected:
   std::unique_ptr<::gloo::rendezvous::Store> store_;
   const c10::intrusive_ptr<Options> options_;
diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp
index 0bb960a639907..b1efd0b238378 100644
--- a/torch/csrc/distributed/c10d/logger.cpp
+++ b/torch/csrc/distributed/c10d/logger.cpp
@@ -4,6 +4,10 @@
 #include <fmt/format.h>
 #include <string>
 
+#ifdef USE_C10D_GLOO
+#include <c10d/ProcessGroupGloo.hpp>
+#endif
+
 namespace c10d {
 
 // When training runs at these iterations, log the runtime
@@ -68,6 +72,13 @@ void Logger::set_env_variables() {
         parse_env("GLOO_SOCKET_IFNAME");
     ddp_logging_data_->strs_map["gloo_device_transport"] =
         parse_env("GLOO_DEVICE_TRANSPORT");
+
+    #ifdef USE_C10D_GLOO
+    auto gloo_pg =
+        static_cast<c10d::ProcessGroupGloo*>(reducer_->process_group_.get());
+    auto n_threads = gloo_pg->getNumThreads();
+    ddp_logging_data_->ints_map["gloo_num_threads"] = n_threads;
+    #endif
   }
 }
 
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index f17842ee02a22..613e23ede8f84 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -5074,6 +5074,12 @@ def parse_env(var):
                     ddp_logging_data.get("gloo_device_transport"),
                     parse_env("GLOO_DEVICE_TRANSPORT"),
                 )
+                default_gloo_threads = 2
+                self.assertEqual(
+                    ddp_logging_data.get("gloo_num_threads"),
+                    default_gloo_threads,
+                )
+
             self.assertEqual(ddp_logging_data.get("nccl_socket_ifname"), None)
             self.assertEqual(ddp_logging_data.get("nccl_blocking_wait"), None)
             self.assertEqual(ddp_logging_data.get("nccl_async_error_handling"), None)

From d067f156220f987e73a524ad414cfa082ae39aac Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 1 Sep 2021 21:07:01 -0700
Subject: [PATCH 452/530] [Dist CI] Move rest of distributed tests to their own
 CI job (#64253)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64253

Follow up to D30496178 (https://github.com/pytorch/pytorch/commit/f4aff3a346a0525e37d6071f318f7a4c54d5e1fb) to move the rest of distributed tests to their own jobs for Linux GHA.
ghstack-source-id: 137233785

Test Plan: CI

Reviewed By: walterddr

Differential Revision: D30662999

fbshipit-source-id: f7cfbc0d1223aca52120f17f9da987d70fda8de6
---
 test/run_test.py | 49 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/test/run_test.py b/test/run_test.py
index 5953919b16323..5d3856ba3e144 100644
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -286,7 +286,56 @@ def skip_test_p(name: str) -> bool:
 ]
 
 DISTRIBUTED_TESTS = [
+    "distributed/test_data_parallel",
+    "distributed/test_launcher",
+    "distributed/nn/jit/test_instantiator",
+    "distributed/rpc/test_faulty_agent",
+    "distributed/rpc/test_tensorpipe_agent",
+    "distributed/rpc/cuda/test_tensorpipe_agent",
+    "distributed/test_c10d_common",
+    "distributed/test_c10d_gloo",
+    "distributed/test_c10d_nccl",
+    "distributed/test_jit_c10d",
+    "distributed/test_c10d_spawn_gloo",
+    "distributed/test_c10d_spawn_nccl",
+    "distributed/test_store",
+    "distributed/test_pg_wrapper",
+    "distributed/algorithms/test_join",
     "distributed/test_distributed_spawn",
+    "distributed/pipeline/sync/skip/test_api",
+    "distributed/pipeline/sync/skip/test_gpipe",
+    "distributed/pipeline/sync/skip/test_inspect_skip_layout",
+    "distributed/pipeline/sync/skip/test_leak",
+    "distributed/pipeline/sync/skip/test_portal",
+    "distributed/pipeline/sync/skip/test_stash_pop",
+    "distributed/pipeline/sync/skip/test_tracker",
+    "distributed/pipeline/sync/skip/test_verify_skippables",
+    "distributed/pipeline/sync/test_balance",
+    "distributed/pipeline/sync/test_bugs",
+    "distributed/pipeline/sync/test_checkpoint",
+    "distributed/pipeline/sync/test_copy",
+    "distributed/pipeline/sync/test_deferred_batch_norm",
+    "distributed/pipeline/sync/test_dependency",
+    "distributed/pipeline/sync/test_inplace",
+    "distributed/pipeline/sync/test_microbatch",
+    "distributed/pipeline/sync/test_phony",
+    "distributed/pipeline/sync/test_pipe",
+    "distributed/pipeline/sync/test_pipeline",
+    "distributed/pipeline/sync/test_stream",
+    "distributed/pipeline/sync/test_transparency",
+    "distributed/pipeline/sync/test_worker",
+    "distributed/optim/test_zero_redundancy_optimizer",
+    "distributed/elastic/timer/api_test",
+    "distributed/elastic/timer/local_timer_example",
+    "distributed/elastic/timer/local_timer_test",
+    "distributed/elastic/events/lib_test",
+    "distributed/elastic/metrics/api_test",
+    "distributed/elastic/utils/logging_test",
+    "distributed/elastic/utils/util_test",
+    "distributed/elastic/utils/distributed_test",
+    "distributed/elastic/multiprocessing/api_test",
+    "distributed/_sharding_spec/test_sharding_spec",
+    "distributed/_sharded_tensor/test_sharded_tensor",
 ]
 
 # Dictionary matching test modules (in TESTS) to lists of test cases (within that test_module) that would be run when

From 71e149834b786f9e451788c16096c470191c9f04 Mon Sep 17 00:00:00 2001
From: Edward Yang <ezyang@fb.com>
Date: Wed, 1 Sep 2021 21:48:36 -0700
Subject: [PATCH 453/530] Add a warning about DataLoader num_workers > 0
 "memory leak" (#64337)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64337

See https://github.com/pytorch/pytorch/issues/13246

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Reviewed By: H-Huang

Differential Revision: D30690320

Pulled By: ezyang

fbshipit-source-id: 2751aca05a94e63d25162599f458855988516fad
---
 docs/source/data.rst | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/docs/source/data.rst b/docs/source/data.rst
index 9135c87d09262..b03fcb5858531 100644
--- a/docs/source/data.rst
+++ b/docs/source/data.rst
@@ -264,6 +264,21 @@ Setting the argument :attr:`num_workers` as a positive integer will
 turn on multi-process data loading with the specified number of loader worker
 processes.
 
+.. warning::
+   After several iterations, the loader worker processes will consume
+   the same amount of CPU memory as the parent process for all Python
+   objects in the parent process which are accessed from the worker
+   processes.  This can be problematic if the Dataset contains a lot of
+   data (e.g., you are loading a very large list of filenames at Dataset
+   construction time) and/or you are using a lot of workers (overall
+   memory usage is ``number of workers * size of parent process``).  The
+   simplest workaround is to replace Python objects with non-refcounted
+   representations such as Pandas, Numpy or PyArrow objects.  Check out
+   `issue #13246
+   <https://github.com/pytorch/pytorch/issues/13246#issuecomment-905703662>`_
+   for more details on why this occurs and example code for how to
+   workaround these problems.
+
 In this mode, each time an iterator of a :class:`~torch.utils.data.DataLoader`
 is created (e.g., when you call ``enumerate(dataloader)``), :attr:`num_workers`
 worker processes are created. At this point, the :attr:`dataset`,

From 69e1207084e6b8932b870ee2a315eb539859a67f Mon Sep 17 00:00:00 2001
From: Shirong Wu <shirong@fb.com>
Date: Wed, 1 Sep 2021 22:09:42 -0700
Subject: [PATCH 454/530] Move graph util to fx2trt (#64064)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64064

Move original util in torch2trt to fx2trt dir since torch2trt is gonne be deprecated. This is a follow up diff for D30379124

Test Plan: manual

Reviewed By: yinghai, mikekgfb

Differential Revision: D30591687

fbshipit-source-id: ae0e59dfbc2d2e2aa4f3ccea7cff2291c7deb388
---
 .../experimental/fx2trt/tools/graph_util.py   | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 torch/fx/experimental/fx2trt/tools/graph_util.py

diff --git a/torch/fx/experimental/fx2trt/tools/graph_util.py b/torch/fx/experimental/fx2trt/tools/graph_util.py
new file mode 100644
index 0000000000000..96c8b12915da4
--- /dev/null
+++ b/torch/fx/experimental/fx2trt/tools/graph_util.py
@@ -0,0 +1,64 @@
+import graphviz  # type: ignore[import]
+
+def get_layer_name_type(layer):
+    return "\n".join(f"{i}" for i in [layer.name, layer.type])
+
+def trt_network_to_dot_graph(network):
+    dot = graphviz.Digraph(comment="Network")
+
+    # add nodes (layers)
+    for i in range(network.num_layers):
+        layer = network.get_layer(i)
+        dot.node(get_layer_name_type(layer))
+
+    # add nodes (inputs)
+    for i in range(network.num_inputs):
+        dot.node(network.get_input(i).name)
+
+    # add nodes (outputs)
+    for i in range(network.num_outputs):
+        dot.node(network.get_output(i).name)
+
+    # add layer->layer edges
+    for a in range(network.num_layers):
+        layer_a = network.get_layer(a)
+
+        for b in range(network.num_layers):
+            layer_b = network.get_layer(b)
+
+            for i in range(layer_a.num_outputs):
+                output_i = layer_a.get_output(i)
+
+                for j in range(layer_b.num_inputs):
+                    input_j = layer_b.get_input(j)
+
+                    if output_i == input_j:
+                        dot.edge(get_layer_name_type(layer_a), get_layer_name_type(layer_b), label=str(input_j.shape))
+
+    # add input->layer edges
+    for i in range(network.num_inputs):
+        input_i = network.get_input(i)
+
+        for b in range(network.num_layers):
+            layer_b = network.get_layer(b)
+
+            for j in range(layer_b.num_inputs):
+                input_j = layer_b.get_input(j)
+
+                if input_i == input_j:
+                    dot.edge(input_i.name, get_layer_name_type(layer_b), label=str(input_j.shape))
+
+    # add layer->output edges
+    for i in range(network.num_outputs):
+        input_i = network.get_output(i)
+
+        for b in range(network.num_layers):
+            layer_b = network.get_layer(b)
+
+            for j in range(layer_b.num_outputs):
+                input_j = layer_b.get_output(j)
+
+                if input_i == input_j:
+                    dot.edge(get_layer_name_type(layer_b), input_i.name, label=str(input_j.shape))
+
+    return dot

From 0addd75be9a87d03830790a5c6a9b2e201a09c13 Mon Sep 17 00:00:00 2001
From: Edward Yang <ezyang@fb.com>
Date: Thu, 2 Sep 2021 00:48:03 -0700
Subject: [PATCH 455/530] Remove unnecessary resize_output (#64272)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64272

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Reviewed By: H-Huang, bdhirsh

Differential Revision: D30686941

Pulled By: ezyang

fbshipit-source-id: de60e6f1115648f8cf7daaa1e652594fe8b06742
---
 aten/src/ATen/native/UnaryOps.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp
index d5052a77f5b62..b7e596392c716 100644
--- a/aten/src/ATen/native/UnaryOps.cpp
+++ b/aten/src/ATen/native/UnaryOps.cpp
@@ -191,7 +191,6 @@ TORCH_IMPL_FUNC(polygamma_out)
 }
 
 TORCH_IMPL_FUNC(signbit_out) (const Tensor& self, const Tensor& result) {
-  at::native::resize_output(result, self.sizes());
   if (self.dtype() == at::kBool) {
     result.fill_(false);
   } else {

From 8d5b95019d69d43963b33a1b188ad1fec8079664 Mon Sep 17 00:00:00 2001
From: Chen Lai <chenlai@fb.com>
Date: Thu, 2 Sep 2021 00:50:40 -0700
Subject: [PATCH 456/530] [PyTorch Edge] Support default args with out arg,
 flag off (#63540)

Summary:
1. Allow consuming operators with defaults arguments and out arguments. Flag is off to keep the same behavior as v6, in pr 63651, turn on the flag.
2. Add two unittests to cover this type of operators.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63540

ghstack-source-id: 137211562

Test Plan:
```
caffe2/test/cpp/jit:jit - LiteInterpreterTest.DefaultArgsWithOutArg
caffe2/test/cpp/jit:jit - LiteInterpreterTest.DefaultArgsPinvWithOutArg
```

Reviewed By: raziel, iseeyuan, tugsbayasgalan

Differential Revision: D30414156

fbshipit-source-id: 0f3a219a22aee10ac53184cbd95940726c459d1f
---
 caffe2/serialize/versions.h                   |  2 +-
 test/cpp/jit/test_lite_interpreter.cpp        | 62 +++++++++++++++++++
 torch/csrc/jit/mobile/function.cpp            | 38 ++++++++----
 torch/csrc/jit/runtime/interpreter.cpp        |  2 +
 torch/csrc/jit/runtime/interpreter.h          |  1 +
 .../csrc/jit/runtime/interpreter/code_impl.h  | 37 ++++++-----
 6 files changed, 115 insertions(+), 27 deletions(-)

diff --git a/caffe2/serialize/versions.h b/caffe2/serialize/versions.h
index 61c8c46666e67..ed5795841d1f9 100644
--- a/caffe2/serialize/versions.h
+++ b/caffe2/serialize/versions.h
@@ -85,7 +85,7 @@ static_assert(kProducedBytecodeVersion >= kProducedFileFormatVersion,
 // we should support this model_version. For example, we provide a wrapper to
 // handle an updated operator.
 constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L;
-constexpr uint64_t kMaxSupportedBytecodeVersion = 0x6L;
+constexpr uint64_t kMaxSupportedBytecodeVersion = 0x7L;
 
 } // namespace serialize
 } // namespace caffe2
diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp
index 26100b3b6f508..b362c8a6ddb06 100644
--- a/test/cpp/jit/test_lite_interpreter.cpp
+++ b/test/cpp/jit/test_lite_interpreter.cpp
@@ -1035,6 +1035,68 @@ TEST(LiteInterpreterTest, DefaultArgsPinvSpecifyDefault) {
   testLiteModuleCompareResultTensors(m, inputs);
 }
 
+void testDefaultArgsPinvWithOutArg(int num_args) {
+  Module m("m");
+  if (num_args == 1) {
+    m.define(R"(
+      def forward(self, input):
+        return torch.linalg_pinv(input, out=input)
+    )");
+  } else if (num_args == 2) {
+    m.define(R"(
+      def forward(self, input):
+        return torch.linalg_pinv(input, 1e-5, out=input)
+    )");
+  } else if (num_args == 3) {
+    m.define(R"(
+      def forward(self, input):
+        return torch.linalg_pinv(input, 1e-5, True, out=input)
+    )");
+  }
+
+  const int N = 28;
+  auto input = torch::range(1, N * N, 1);
+  input[0] = 10000; // a more stable matrix
+  input = input.view({N, N});
+  auto ref = m.run_method("forward", input);
+  TORCH_CHECK(!input.equal(torch::range(1, N * N, 1)));
+  TORCH_CHECK(input.equal(ref.toTensor()));
+}
+
+TEST(LiteInterpreterTest, DefaultArgsPinvWithOutArg) {
+  // Test with different number of specified arguments + out arg.
+  // Arguments not specified take default value.
+  for (int num_args = 1; num_args <= 3; ++num_args) {
+    testDefaultArgsPinvWithOutArg(num_args);
+  }
+}
+
+TEST(LiteInterpreterTest, DefaultArgsWithOutArg) {
+  Module m("m");
+  m.define(R"(
+    def forward(self, x, h):
+      torch.add(x, h, out=x)
+  )");
+
+  std::vector<IValue> inputs;
+  auto input_x = 2 * torch::ones({});
+  auto input_h = torch::ones({});
+  auto ref = m.run_method("forward", input_x, input_h);
+
+  std::stringstream ss;
+
+  m._save_for_mobile(ss, {}, true);
+  mobile::Module bc = _load_for_mobile(ss);
+  bc.run_method("forward", input_x, input_h);
+  AT_ASSERT(input_x.equal(4 * torch::ones({})));
+
+  auto ops = _get_model_ops_and_info(ss);
+  auto op = ops.find("aten::add.out");
+  TORCH_CHECK(
+      op != ops.end() && op->second.num_schema_args.has_value() &&
+      op->second.num_schema_args.value() == 4);
+}
+
 TEST(LiteInterpreterTest, TestExceptionStackWithTwoLevelModuleHierarchy) {
   Module a("A");
   a.define(R"(
diff --git a/torch/csrc/jit/mobile/function.cpp b/torch/csrc/jit/mobile/function.cpp
index 127bd5f9418d4..fad8c39bd1f4d 100644
--- a/torch/csrc/jit/mobile/function.cpp
+++ b/torch/csrc/jit/mobile/function.cpp
@@ -99,21 +99,35 @@ bool Function::append_operator(
     // from model. We can use it to handle backward compatibility.
     if (num_specified_args &&
         num_specified_args.value() < static_cast<int64_t>(args.size())) {
-      // Sanity check at load time, to save perf at runtime
-      for (size_t i = num_specified_args.value(); i < args.size(); ++i) {
-        auto default_val = args[i].default_value();
-        TORCH_CHECK(
-            default_val.has_value(),
-            "Error happened at preparing for default values for the argument. The ",
-            i,
-            "th arguement of operator",
-            opname,
-            " does not have a specified value or default value. ");
-      }
       fn = [fn, num_specified_args, args](Stack& stack) {
-        for (size_t i = num_specified_args.value(); i < args.size(); ++i) {
+        std::vector<IValue> out_args;
+        // The following logic pops and temporarily stores all out arguments
+        // from the stack (which can be 0 or more, and always appended to the
+        // schema), in order to push the necessary default values. Finally, the
+        // out arguments are pushed back into the stack.
+        for (size_t i = args.size() - 1; i > 0 && args.at(i).is_out(); i--) {
+          out_args.push_back(stack.back());
+          stack.pop_back();
+        }
+        size_t start_index = num_specified_args.value() - out_args.size();
+        TORCH_CHECK(
+            start_index >= 0,
+            "The number of output arguments is: ",
+            out_args.size(),
+            ", which is more then the number of specified arguments: ",
+            num_specified_args.value());
+        for (size_t i = start_index; i < (args.size() - out_args.size()); ++i) {
+          TORCH_CHECK(
+              args[i].default_value().has_value(),
+              "Error happened at preparing for default values for the argument. The ",
+              i,
+              "th argument ",
+              args[i].name(),
+              " does not have a specified value or default value. ");
+
           stack.push_back(args[i].default_value());
         }
+        stack.insert(stack.end(), out_args.rbegin(), out_args.rend());
         fn(stack);
       };
     }
diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp
index 70c9c6c653326..b34827176b2f3 100644
--- a/torch/csrc/jit/runtime/interpreter.cpp
+++ b/torch/csrc/jit/runtime/interpreter.cpp
@@ -978,11 +978,13 @@ MobileCode::MobileCode(
     const std::shared_ptr<Graph>& graph,
     std::string function_name,
     bool emit_default_input_instructions,
+    bool support_default_args_before_out,
     size_t remaining_bailout_depth)
     : Code(new interpreter::MobileCodeImpl(
           graph,
           std::move(function_name),
           emit_default_input_instructions,
+          support_default_args_before_out,
           remaining_bailout_depth)) {}
 
 MobileCode::~MobileCode() = default;
diff --git a/torch/csrc/jit/runtime/interpreter.h b/torch/csrc/jit/runtime/interpreter.h
index 80720ea2ca42f..3471e558e5a41 100644
--- a/torch/csrc/jit/runtime/interpreter.h
+++ b/torch/csrc/jit/runtime/interpreter.h
@@ -82,6 +82,7 @@ struct TORCH_API MobileCode : Code {
       const std::shared_ptr<Graph>& graph,
       std::string function_name,
       bool emit_default_input_instructions = true,
+      bool support_default_args_before_out = false,
       size_t remaining_bailout_depth = 0);
   ~MobileCode();
 };
diff --git a/torch/csrc/jit/runtime/interpreter/code_impl.h b/torch/csrc/jit/runtime/interpreter/code_impl.h
index 682c695138674..15ba0cec04d33 100644
--- a/torch/csrc/jit/runtime/interpreter/code_impl.h
+++ b/torch/csrc/jit/runtime/interpreter/code_impl.h
@@ -721,9 +721,11 @@ struct MobileCodeImpl : CodeImpl {
       const std::shared_ptr<Graph>& graph,
       std::string function_name,
       bool emit_default_input_instructions,
+      bool support_default_args_before_out,
       size_t remaining_bailout_depth)
       : CodeImpl(graph, function_name, remaining_bailout_depth, false),
-        emit_default_input_instructions_(emit_default_input_instructions) {
+        emit_default_input_instructions_(emit_default_input_instructions),
+        support_default_args_before_out_(support_default_args_before_out) {
     // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
     run();
   }
@@ -746,11 +748,12 @@ struct MobileCodeImpl : CodeImpl {
         // skip if schema has vararg
         if (!op_schema.is_vararg()) {
           auto specifiedArgs = CalculateNecessaryArgs(
-              op_schema.arguments(), node->inputs(), false);
-          // preserving the old behavior
-          auto numInclude = specifiedArgs.first;
-          // TODO uncomment this
-          // auto numInclude = specifiedArgs.first + specifiedArgs.second;
+              op_schema.arguments(),
+              node->inputs(),
+              support_default_args_before_out_);
+
+          size_t numInclude = specifiedArgs.first +
+              (support_default_args_before_out_ ? specifiedArgs.second : 0);
           auto unique_name = op_schema.overload_name() != ""
               ? op_schema.name() + "." + op_schema.overload_name()
               : op_schema.name();
@@ -782,21 +785,27 @@ struct MobileCodeImpl : CodeImpl {
         if (it != op_to_num_specified_args_.end()) {
           num_include = it->second;
         }
-        emitLoadInputs(node->inputs(), num_include);
-        // TODO: uncomment this
-        // auto num_out = op_to_num_out_args_.find(unique_op_name)->second;
-        // auto num_specified_before_out = num_include - num_out;
-        // emitLoadInputs(node->inputs(), 0, num_specified_before_out);
-        // emitLoadInputs(node->inputs(), node->inputs().size() - num_out,
-        // node->inputs().size());
-
+        if (support_default_args_before_out_) {
+          auto num_out = op_to_num_out_args_.find(unique_op_name)->second;
+          auto num_specified_before_out = num_include - num_out;
+          emitLoadInputs(node->inputs(), 0, num_specified_before_out);
+          emitLoadInputs(
+              node->inputs(),
+              node->inputs().size() - num_out,
+              node->inputs().size());
+        } else {
+          emitLoadInputs(node->inputs(), num_include);
+        }
         insertInstruction(OP, operator_table_.size());
       }
       operator_table_.emplace_back(op.getOperation(node));
     }
   }
 
+  // To support forward compatibility for bytecode version bump from v5 to v6
   bool emit_default_input_instructions_;
+  // To support forward compatibility for bytecode version bump from v6 to v7
+  bool support_default_args_before_out_;
 };
 
 } // namespace interpreter

From ee8a6c1d141ae49e23323bdd485fb4b390541f69 Mon Sep 17 00:00:00 2001
From: Pavel Belevich <pbelevich@fb.com>
Date: Thu, 2 Sep 2021 00:57:39 -0700
Subject: [PATCH 457/530] Replace std::unordered_map<c10::Device, c10::Device>
 with DeviceMap (#64393)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64393

cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23

Test Plan: Imported from OSS

Reviewed By: rohan-varma

Differential Revision: D30708384

Pulled By: pbelevich

fbshipit-source-id: 1c565727e4f09cd9e560874dd90aa403470b4a97
---
 .../distributed/autograd/functions/recvrpc_backward.cpp   | 2 +-
 .../distributed/autograd/functions/recvrpc_backward.h     | 4 ++--
 .../autograd/rpc_messages/rpc_with_autograd.cpp           | 8 ++++----
 .../distributed/autograd/rpc_messages/rpc_with_autograd.h | 8 ++++----
 torch/csrc/distributed/autograd/utils.cpp                 | 4 ++--
 torch/csrc/distributed/autograd/utils.h                   | 4 ++--
 torch/csrc/distributed/rpc/request_callback_no_python.cpp | 2 +-
 torch/csrc/distributed/rpc/rpc_agent.h                    | 2 +-
 .../distributed/rpc/testing/faulty_tensorpipe_agent.cpp   | 2 +-
 .../distributed/rpc/testing/faulty_tensorpipe_agent.h     | 2 +-
 torch/csrc/distributed/rpc/utils.cpp                      | 2 +-
 11 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp b/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp
index 0d82c07835f55..a492d9847fb37 100644
--- a/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp
+++ b/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp
@@ -15,7 +15,7 @@ RecvRpcBackward::RecvRpcBackward(
     const AutogradMetadata& autogradMetadata,
     ContextPtr autogradContext,
     rpc::worker_id_t fromWorkerId,
-    std::unordered_map<c10::Device, c10::Device> deviceMap)
+    rpc::DeviceMap deviceMap)
     : autogradMetadata_(autogradMetadata),
       // NOLINTNEXTLINE(performance-move-const-arg)
       autogradContext_(std::move(autogradContext)),
diff --git a/torch/csrc/distributed/autograd/functions/recvrpc_backward.h b/torch/csrc/distributed/autograd/functions/recvrpc_backward.h
index 46bdb297cdf46..6e6678b128985 100644
--- a/torch/csrc/distributed/autograd/functions/recvrpc_backward.h
+++ b/torch/csrc/distributed/autograd/functions/recvrpc_backward.h
@@ -23,7 +23,7 @@ class TORCH_API RecvRpcBackward : public torch::autograd::Node {
       const AutogradMetadata& autogradMetadata,
       std::shared_ptr<DistAutogradContext> autogradContext,
       rpc::worker_id_t fromWorkerId,
-      std::unordered_map<c10::Device, c10::Device> deviceMap);
+      rpc::DeviceMap deviceMap);
 
   torch::autograd::variable_list apply(
       torch::autograd::variable_list&& grads) override;
@@ -41,7 +41,7 @@ class TORCH_API RecvRpcBackward : public torch::autograd::Node {
   rpc::worker_id_t fromWorkerId_;
 
   // Device mapping for tensors sent over RPC.
-  const std::unordered_map<c10::Device, c10::Device> deviceMap_;
+  const rpc::DeviceMap deviceMap_;
 };
 
 } // namespace autograd
diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp
index 4d84e99753961..b8d28f7be7c2d 100644
--- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp
+++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.cpp
@@ -19,7 +19,7 @@ RpcWithAutograd::RpcWithAutograd(
     MessageType messageType,
     const AutogradMetadata& autogradMetadata,
     c10::intrusive_ptr<rpc::Message> wrappedMessage,
-    std::unordered_map<c10::Device, c10::Device> deviceMap)
+    rpc::DeviceMap deviceMap)
     : fromWorkerId_(fromWorkerId),
       messageType_(messageType),
       autogradMetadata_(autogradMetadata),
@@ -39,7 +39,7 @@ RpcWithAutograd::RpcWithAutograd(
     std::unique_ptr<RpcCommandBase> wrappedRpc,
     MessageType wrappedMessageType,
     std::vector<torch::Tensor> tensors,
-    std::unordered_map<c10::Device, c10::Device> deviceMap)
+    rpc::DeviceMap deviceMap)
     : fromWorkerId_(fromWorkerId),
       messageType_(messageType),
       autogradMetadata_(autogradMetadata),
@@ -112,7 +112,7 @@ std::unique_ptr<RpcWithAutograd> RpcWithAutograd::fromMessage(
   auto c10DeviceMap = tupleElements[4].to<c10::Dict<std::string, std::string>>();
 
   // Convert to regular map.
-  std::unordered_map<c10::Device, c10::Device> deviceMap;
+  rpc::DeviceMap deviceMap;
   for (const auto& mapEntry : c10DeviceMap) {
     deviceMap.insert({mapEntry.key(), mapEntry.value()});
   }
@@ -169,7 +169,7 @@ rpc::worker_id_t RpcWithAutograd::fromWorkerId() const {
   return fromWorkerId_;
 }
 
-const std::unordered_map<c10::Device, c10::Device>& RpcWithAutograd::
+const rpc::DeviceMap& RpcWithAutograd::
     deviceMap() {
   return deviceMap_;
 }
diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h
index 1884cc9742939..6d0b6111cc88c 100644
--- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h
+++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_autograd.h
@@ -19,7 +19,7 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase {
       rpc::MessageType messageType,
       const AutogradMetadata& autogradMetadata,
       c10::intrusive_ptr<rpc::Message> wrappedMessage,
-      std::unordered_map<c10::Device, c10::Device> deviceMap = {});
+      rpc::DeviceMap deviceMap = {});
 
   // Used when receiving an RPC over the wire.
   RpcWithAutograd(
@@ -29,7 +29,7 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase {
       std::unique_ptr<rpc::RpcCommandBase> wrappedRpc,
       rpc::MessageType wrappedMessageType,
       std::vector<torch::Tensor> tensors,
-      std::unordered_map<c10::Device, c10::Device> deviceMap = {});
+      rpc::DeviceMap deviceMap = {});
 
   c10::intrusive_ptr<rpc::Message> toMessageImpl() && override;
 
@@ -55,7 +55,7 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase {
   rpc::worker_id_t fromWorkerId() const;
 
   // Retrieve the device map.
-  const std::unordered_map<c10::Device, c10::Device>& deviceMap();
+  const rpc::DeviceMap& deviceMap();
 
  private:
   // WorkerId from which this RPC originated. This is necessary for knowing
@@ -90,7 +90,7 @@ class TORCH_API RpcWithAutograd final : public rpc::RpcCommandBase {
   std::vector<torch::Tensor> tensors_;
 
   // Device mapping for tensors that are sent across an RPC to another node.
-  std::unordered_map<c10::Device, c10::Device> deviceMap_;
+  rpc::DeviceMap deviceMap_;
 };
 
 } // namespace autograd
diff --git a/torch/csrc/distributed/autograd/utils.cpp b/torch/csrc/distributed/autograd/utils.cpp
index 4e29bfcc1ffe9..9db40766c598a 100644
--- a/torch/csrc/distributed/autograd/utils.cpp
+++ b/torch/csrc/distributed/autograd/utils.cpp
@@ -53,7 +53,7 @@ ContextPtr addRecvRpcBackward(
     const AutogradMetadata& autogradMetadata,
     std::vector<torch::Tensor>& tensors,
     rpc::worker_id_t fromWorkerId,
-    const std::unordered_map<c10::Device, c10::Device>& deviceMap) {
+    const rpc::DeviceMap& deviceMap) {
   // Initialize autograd context if necessary.
   auto& autogradContainer = DistAutogradContainer::getInstance();
   auto autogradContext =
@@ -105,7 +105,7 @@ c10::intrusive_ptr<Message> getMessageWithAutograd(
     c10::intrusive_ptr<torch::distributed::rpc::Message> wrappedRpcMsg,
     MessageType msgType,
     bool forceGradRecording,
-    const std::unordered_map<c10::Device, c10::Device>& deviceMap) {
+    const rpc::DeviceMap& deviceMap) {
   auto& autogradContainer = DistAutogradContainer::getInstance();
 
   // If there is no valid context and no tensor requires grads, send original
diff --git a/torch/csrc/distributed/autograd/utils.h b/torch/csrc/distributed/autograd/utils.h
index fae675d3b81c6..94883ce605269 100644
--- a/torch/csrc/distributed/autograd/utils.h
+++ b/torch/csrc/distributed/autograd/utils.h
@@ -31,7 +31,7 @@ TORCH_API ContextPtr addRecvRpcBackward(
     const AutogradMetadata& autogradMetadata,
     std::vector<torch::Tensor>& tensors,
     rpc::worker_id_t fromWorkerId,
-    const std::unordered_map<c10::Device, c10::Device>& deviceMap);
+    const rpc::DeviceMap& deviceMap);
 
 // This method is a wrapper utility used internally to wrap autograd info
 // and attach autograd function for each type of rpc call if it has valid
@@ -44,7 +44,7 @@ TORCH_API c10::intrusive_ptr<rpc::Message> getMessageWithAutograd(
     c10::intrusive_ptr<rpc::Message> wrappedRpcMsg,
     rpc::MessageType msgType,
     bool forceGradRecording = false,
-    const std::unordered_map<c10::Device, c10::Device>& deviceMap =
+    const rpc::DeviceMap& deviceMap =
         {});
 
 // Send message after autograd checking
diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp
index 5eada8d573f2f..9e16061e0ad42 100644
--- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp
+++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp
@@ -290,7 +290,7 @@ c10::intrusive_ptr<JitFuture> RequestCallbackNoPython::
 
   // Need to reverse the device map for the backward pass of distributed
   // autograd.
-  std::unordered_map<c10::Device, c10::Device> reverseDeviceMap;
+  DeviceMap reverseDeviceMap;
   for (const auto& mapEntry : rpcWithAutograd.deviceMap()) {
     reverseDeviceMap.insert({mapEntry.second, mapEntry.first});
   }
diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h
index a83e77bfe56f9..7cd228e57da8e 100644
--- a/torch/csrc/distributed/rpc/rpc_agent.h
+++ b/torch/csrc/distributed/rpc/rpc_agent.h
@@ -164,7 +164,7 @@ class TORCH_API RpcAgent {
       const WorkerInfo& to,
       c10::intrusive_ptr<Message> message,
       const float rpcTimeoutSeconds = kUnsetRpcTimeout,
-      const std::unordered_map<c10::Device, c10::Device>& deviceMap = {}) = 0;
+      const DeviceMap& deviceMap = {}) = 0;
 
   // Retries sending the message up to maxRetries times until an ACK is
   // receieved. The duration between consecutive sends is increased over
diff --git a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.cpp
index 72d4d5dfec82e..a2e052535efac 100644
--- a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.cpp
+++ b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.cpp
@@ -67,7 +67,7 @@ c10::intrusive_ptr<JitFuture> FaultyTensorPipeAgent::send(
     const WorkerInfo& to,
     c10::intrusive_ptr<Message> message,
     const float rpcTimeoutSeconds,
-    const std::unordered_map<c10::Device, c10::Device>& /* unused */) {
+    const DeviceMap& /* unused */) {
   // We only fail control messages that have been specified by the test case.
   // For all other messages, we just send them without any failures.
   if (!shouldFailMessage(message->type())) {
diff --git a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h
index 5d6059747c219..e69a76cddc8ed 100644
--- a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h
+++ b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h
@@ -53,7 +53,7 @@ class TORCH_API FaultyTensorPipeAgent : public TensorPipeAgent {
       const WorkerInfo& to,
       c10::intrusive_ptr<Message> message,
       const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout,
-      const std::unordered_map<c10::Device, c10::Device>& deviceMap = {})
+      const DeviceMap& deviceMap = {})
       override;
 
   // Add delay to writes
diff --git a/torch/csrc/distributed/rpc/utils.cpp b/torch/csrc/distributed/rpc/utils.cpp
index 615abbf300666..820ec31691a0a 100644
--- a/torch/csrc/distributed/rpc/utils.cpp
+++ b/torch/csrc/distributed/rpc/utils.cpp
@@ -177,7 +177,7 @@ std::unique_ptr<RpcCommandBase> deserializeResponse(
 
       // Need to reverse the device map for the backward pass of distributed
       // autograd.
-      std::unordered_map<c10::Device, c10::Device> reverseDeviceMap;
+      DeviceMap reverseDeviceMap;
       for (const auto& mapEntry : rpcWithAutograd.deviceMap()) {
         reverseDeviceMap.insert({mapEntry.second, mapEntry.first});
       }

From 76e187aa08556ce90e84b17e836784ffbb6905e0 Mon Sep 17 00:00:00 2001
From: Kushashwa Ravi Shrimali <kushashwaravishrimali@gmail.com>
Date: Thu, 2 Sep 2021 01:08:53 -0700
Subject: [PATCH 458/530] Port `gather` to structured kernel (#63312)

Summary:
Will add a description once this is ready for review.

cc: ysiraichi ezyang

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63312

Reviewed By: iramazanli

Differential Revision: D30597447

Pulled By: ezyang

fbshipit-source-id: d36e59835c2f4b38e286032dd2a1111a7e16b7e5
---
 aten/src/ATen/native/ScatterGatherChecks.h    | 32 +++-------
 .../ATen/native/TensorAdvancedIndexing.cpp    | 51 ++++++++++-----
 aten/src/ATen/native/TensorAdvancedIndexing.h | 12 ++--
 .../ATen/native/cpu/ScatterGatherKernel.cpp   | 34 +++-------
 .../ATen/native/cuda/ScatterGatherKernel.cu   | 62 +++----------------
 aten/src/ATen/native/native_functions.yaml    |  7 +--
 test/test_torch.py                            |  7 ++-
 7 files changed, 76 insertions(+), 129 deletions(-)

diff --git a/aten/src/ATen/native/ScatterGatherChecks.h b/aten/src/ATen/native/ScatterGatherChecks.h
index ad3b3fca097ca..0fc38d5bd7418 100644
--- a/aten/src/ATen/native/ScatterGatherChecks.h
+++ b/aten/src/ATen/native/ScatterGatherChecks.h
@@ -9,7 +9,7 @@ namespace at { namespace native {
 namespace {
 
 // checks whether index.dtype == int64
-// and self.dtyp == src.dtype if src is a Tensor
+// and self.dtype == src.dtype if src is a Tensor
 static void scatter_gather_dtype_check(
   const std::string& method_name,
   const Tensor& self,
@@ -31,42 +31,31 @@ static void scatter_gather_dtype_check(
 }
 
 // Used for `gather`-like methods
+// Note: self means the input tensor here
 // Test:
-// 1. index.size(d) == self.size(d) for all d != dim
-// 2. index.size(d) <= src.size(d) for all d != dim
-// 3. index.dim() == self.dim() == src.dim()
+// 1. index.size(d) <= self.size(d) for all d != dim
+// 2. index.dim() == self.dim()
 static C10_UNUSED void gather_shape_check(const Tensor& self, int64_t dim,
-  const Tensor& index, const Tensor& src
+  const Tensor& index
 ) {
   auto self_dims = ensure_nonempty_dim(self.dim());
   TORCH_CHECK(self_dims == ensure_nonempty_dim(index.dim()),
-    "Index tensor must have the same number of dimensions as out tensor"
-  );
-
-  auto src_dims = ensure_nonempty_dim(src.dim());
-  TORCH_CHECK(src_dims == ensure_nonempty_dim(index.dim()),
     "Index tensor must have the same number of dimensions as input tensor"
   );
 
   for (int64_t i = 0; i < self_dims; ++i) {
     if (i != dim) {
       TORCH_CHECK(
-        ensure_nonempty_size(index, i) == ensure_nonempty_size(self, i),
-        "Size does not match at dimension ", i,
-        " get ", ensure_nonempty_size(self, i),
-        " vs ", ensure_nonempty_size(index, i)
-      );
-
-      TORCH_CHECK(
-        ensure_nonempty_size(index, i) <= ensure_nonempty_size(src, i),
+        ensure_nonempty_size(index, i) <= ensure_nonempty_size(self, i),
         "Size does not match at dimension ", i,
         " expected index ", index.sizes(),
-        " to be smaller than src ", src.sizes(),
+        " to be smaller than self ", self.sizes(),
         " apart from dimension ", dim
       );
     }
   }
 }
+
 // Used for `scatter` and `scatter_add`
 // Tests:
 //  1. index.size(d) <= self.size(d) for all d != dim
@@ -76,10 +65,7 @@ static C10_UNUSED void scatter_shape_check(
   const Tensor& self, int64_t dim, const Tensor& index,
   const c10::optional<Tensor>& src_opt = c10::nullopt
 ) {
-  if (index.numel() == 0) {
-    return;
-  }
-
+  if (index.numel() == 0) return;
   TORCH_CHECK(
     ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()),
     "Index tensor must have the same number of dimensions as self tensor"
diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
index 43cebba51b9e7..3fb38cc8832ec 100644
--- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@@ -83,6 +83,31 @@ native::SCATTER_GATHER_OP get_operator_enum(const c10::string_view reduce) {
   }
 }
 
+TORCH_META_FUNC(gather)
+(const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad) {
+  const Tensor& result = maybe_get_output(0);
+  int64_t wrapped_dim = at::maybe_wrap_dim(dim, self.dim());
+
+  // Memory overlap checks need to be done after resizing (if required) is done.
+  // But it only makes sense to do these checks when result was defined, hence
+  // the boolean variable `check_result` here.
+  // For more details, see: https://github.com/pytorch/pytorch/pull/63312#discussion_r694794832
+  // and https://github.com/pytorch/pytorch/issues/63837
+  bool check_result = result.defined();
+  set_output(index.sizes(), self.options());
+  if (check_result) {
+    at::assert_no_internal_overlap(result);
+    at::assert_no_overlap(result, self);
+    at::assert_no_partial_overlap(result, index);
+  }
+
+  TORCH_CHECK(
+    index.scalar_type() == at::ScalarType::Long,
+    "gather", "(): Expected dtype int64 for index"
+  );
+  at::native::gather_shape_check(self, wrapped_dim, index);
+}
+
 template <typename Meta>
 void scatter_meta_impl(
     Meta& meta,
@@ -1112,23 +1137,12 @@ Tensor index_fill(const Tensor & self, int64_t dim, const Tensor & index, const
   return self.clone(at::MemoryFormat::Preserve).index_fill_(dim, index, source);
 }
 
-Tensor& gather_out_cpu_cuda(
-    const Tensor& self,
-    int64_t dim,
-    const Tensor& index,
-    bool sparse_grad,
-    Tensor& result) {
-  at::native::resize_output(result, index.sizes());
-  at::assert_no_internal_overlap(result);
-  at::assert_no_overlap(result, self);
-  at::assert_no_partial_overlap(result, index);
+// gather_out_cpu_cuda
+TORCH_IMPL_FUNC(gather_out)
+(const Tensor& self, int64_t dim, const Tensor& index, bool sparse_grad, const Tensor& result) {
+  if (index.numel() == 0) return;
+  dim = at::maybe_wrap_dim(dim, self.dim());
   gather_stub(result.device().type(), result, self, dim, index);
-  return result;
-}
-
-Tensor gather(const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad) {
-  Tensor result = at::empty({0}, self.options());
-  return at::native::gather_out_cpu_cuda(self, dim, index, sparse_grad, result);
 }
 
 Tensor gather_backward(const Tensor& grad, const Tensor& self, int64_t dim, const Tensor& index, bool sparse_grad) {
@@ -1148,6 +1162,8 @@ void scatter_impl(
     ReduceStub& reduce_stub,
     FillStub& fill_stub,
     const c10::optional<c10::string_view> reduce = nullopt) {
+  if (index.numel() == 0) return;
+  dim = at::maybe_wrap_dim(dim, self.dim());
   auto mut_out = const_cast<Tensor&>(out);
 
   if (!self.is_same(mut_out)) {
@@ -1217,11 +1233,14 @@ TORCH_IMPL_FUNC(scatter_add)
  const Tensor& src,
  const Tensor& out) {
   auto mut_out = const_cast<Tensor&>(out);
+  dim = maybe_wrap_dim(dim, self.dim());
 
   if (!self.is_same(mut_out)) {
     mut_out.copy_(self);
   }
 
+  if (index.numel() == 0) return;
+
   if (globalContext().deterministicAlgorithms() && self.device().type() == DeviceType::CUDA && self.dim() == 1) {
     TORCH_CHECK(index.dim() == 1 && src.dim() == 1, "index and src should be 1D tensors when self is a 1D tensor, "
       "but their dims are ", index.dim(), " and ", src.dim(), ", respectively");
diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.h b/aten/src/ATen/native/TensorAdvancedIndexing.h
index cd2835aa8139b..d8271a8355ded 100644
--- a/aten/src/ATen/native/TensorAdvancedIndexing.h
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.h
@@ -24,13 +24,13 @@ using take_fn = void(*)(TensorIterator & iter, const Tensor& input);
 using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride);
 using masked_scatter_fn = void(*)(TensorIterator &, const Tensor &);
 
-using gather_fn = void (*)(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index);
-using scatter_fn = void(*)(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
-using scatter_fill_fn = void(*)(Tensor& self, int64_t dim, const Tensor& index, const Scalar& src);
-using scatter_add_fn = void(*)(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
-using scatter_reduce_fn = void(*)(Tensor& self, const int64_t dim, const Tensor& index,
+using gather_fn = void (*)(const Tensor & result, const Tensor & self, int64_t dim, const Tensor & index);
+using scatter_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
+using scatter_fill_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src);
+using scatter_add_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
+using scatter_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
                                   const Tensor& src, const SCATTER_GATHER_OP& reduce);
-using scatter_scalar_reduce_fn = void(*)(Tensor& self, const int64_t dim, const Tensor& index,
+using scatter_scalar_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
                                          const Scalar& value, const SCATTER_GATHER_OP& reduce);
 
 DECLARE_DISPATCH(index_fn, index_stub);
diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp
index c32efeb276bd7..2ab92fbdb2bb2 100644
--- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp
+++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp
@@ -100,15 +100,9 @@ struct _cpu_scatter_gather_dim_loop {
 template <bool is_scatter_like = true>
 struct cpu_scatter_gather_base_kernel {
   template <typename func_t>
-  void operator()(Tensor& self, int64_t dim,
+  void operator()(const Tensor& self, int64_t dim,
     const Tensor& index, const Scalar& value,
     const std::string& method_name, func_t& kernel_func) {
-    // no-op if index is empty
-    if (index.numel() == 0) {
-      return;
-    }
-
-    dim = maybe_wrap_dim(dim, self.dim());
 
     auto index_sizes = ensure_nonempty_vec(index.sizes().vec());
     auto index_strides = ensure_nonempty_vec(index.strides().vec());
@@ -193,22 +187,10 @@ struct cpu_scatter_gather_base_kernel {
   }
 
   template <typename func_t>
-  void operator()(Tensor& self, int64_t dim,
+  void operator()(const Tensor& self, int64_t dim,
     const Tensor& index, const Tensor& src,
     const std::string& method_name, func_t& kernel_func) {
 
-    // no-op if index is empty
-    if (index.numel() == 0) {
-      return;
-    }
-
-    dim = maybe_wrap_dim(dim, self.dim());
-
-    scatter_gather_dtype_check(method_name, self, index, src);
-    if (!is_scatter_like) {
-      gather_shape_check(self, dim, index, src);
-    }
-
     auto iter = TensorIteratorConfig()
       .check_all_same_dtype(false)
       .resize_outputs(false)
@@ -292,30 +274,30 @@ struct cpu_scatter_gather_base_kernel {
   }
 };
 
-void gather_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) {
+void gather_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) {
   cpu_scatter_gather_base_kernel</*is_scatter_like=*/false>()(
     result, dim, index, self,
     "gather_out_cpu", tensor_assign);
 }
 
-void scatter_cpu_kernel(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
+void scatter_cpu_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
   cpu_scatter_gather_base_kernel<>()(
     self, dim, index, src, "scatter_cpu_", tensor_assign);
 }
 
-void scatter_fill_cpu_kernel(Tensor& self, int64_t dim, const Tensor& index, const Scalar& value) {
+void scatter_fill_cpu_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& value) {
   cpu_scatter_gather_base_kernel<>()(
     self, dim, index, value, "scatter_fill_cpu_", tensor_assign);
 }
 
-void scatter_add_cpu_kernel(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
+void scatter_add_cpu_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
   cpu_scatter_gather_base_kernel<>()(
     self, dim, index, src,
     "scatter_add_", reduce_add);
 
 }
 
-void scatter_reduce_cpu_kernel(Tensor& self, const int64_t dim, const Tensor& index,
+void scatter_reduce_cpu_kernel(const Tensor& self, const int64_t dim, const Tensor& index,
                                const Tensor& src, const SCATTER_GATHER_OP& reduce) {
   switch (reduce) {
   case SCATTER_GATHER_OP::REDUCE_ADD :
@@ -329,7 +311,7 @@ void scatter_reduce_cpu_kernel(Tensor& self, const int64_t dim, const Tensor& in
   }
 }
 
-void scatter_scalar_reduce_cpu_kernel(Tensor& self, const int64_t dim, const Tensor& index,
+void scatter_scalar_reduce_cpu_kernel(const Tensor& self, const int64_t dim, const Tensor& index,
                                       const Scalar& value, const SCATTER_GATHER_OP& reduce) {
   switch (reduce) {
   case SCATTER_GATHER_OP::REDUCE_ADD :
diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu
index b95570109de91..5f03cc450f206 100644
--- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu
+++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu
@@ -89,10 +89,6 @@ struct _cuda_scatter_gather_internal_kernel {
     int64_t index_stride,
     const func_t& f
   ) {
-    if (iter.numel() == 0) {
-      return;
-    }
-
     if (!iter.can_use_32bit_indexing()) {
       for (auto& sub_iter : iter.with_32bit_indexing()) {
         _cuda_scatter_gather_internal_kernel<is_scatter_like, scalar_t>()(
@@ -132,24 +128,13 @@ template <bool is_scatter_like = true, bool cast_to_opaque = true>
 struct cuda_scatter_gather_base_kernel {
   template <typename func_t>
   void operator()(
-    Tensor& self, int64_t dim,
+    const Tensor& self, int64_t dim,
     const Tensor& index, const Tensor& src,
     const std::string& method_name,
     const func_t& f
   ) {
-    // no-op if index is empty
-    if (index.numel() == 0) {
-      return;
-    }
     at::assert_no_internal_overlap(self);
 
-    dim = maybe_wrap_dim(dim, self.dim());
-
-    scatter_gather_dtype_check(method_name, self, index, src);
-    if (!is_scatter_like) {
-      gather_shape_check(self, dim, index, src);
-    }
-
     auto index_sizes = ensure_nonempty_vec(index.sizes().vec());
     auto self_strides = ensure_nonempty_vec(self.strides().vec());
     auto src_strides = ensure_nonempty_vec(src.strides().vec());
@@ -201,24 +186,13 @@ struct cuda_scatter_gather_base_kernel {
   }
 
   void operator()(
-    Tensor& self, int64_t dim,
+    const Tensor& self, int64_t dim,
     const Tensor& index, const Tensor& src,
     const std::string& method_name,
     const ReduceMultiply& f
   ) {
-    // no-op if index is empty
-    if (index.numel() == 0) {
-      return;
-    }
     at::assert_no_internal_overlap(self);
 
-    dim = maybe_wrap_dim(dim, self.dim());
-
-    scatter_gather_dtype_check(method_name, self, index, src);
-    if (!is_scatter_like) {
-      gather_shape_check(self, dim, index, src);
-    }
-
     auto index_sizes = ensure_nonempty_vec(index.sizes().vec());
     auto self_strides = ensure_nonempty_vec(self.strides().vec());
     auto src_strides = ensure_nonempty_vec(src.strides().vec());
@@ -280,10 +254,6 @@ struct _cuda_scatter_fill_internal_kernel {
     int64_t index_stride,
     const func_t& f
   ) {
-    if (iter.numel() == 0) {
-      return;
-    }
-
     if (!iter.can_use_32bit_indexing()) {
       for (auto& sub_iter : iter.with_32bit_indexing()) {
         _cuda_scatter_fill_internal_kernel<scalar_t>()(
@@ -322,19 +292,13 @@ template <bool cast_to_opaque = true>
 struct cuda_scatter_fill_base_kernel {
   template <typename func_t>
   void operator()(
-    Tensor& self, int64_t dim,
+    const Tensor& self, int64_t dim,
     const Tensor& index, Scalar src,
     const std::string& method_name,
     const func_t& f
   ) {
-    // no-op if index is empty
-    if (index.numel() == 0) {
-      return;
-    }
     at::assert_no_internal_overlap(self);
 
-    dim = maybe_wrap_dim(dim, self.dim());
-
     auto index_sizes = ensure_nonempty_vec(index.sizes().vec());
 
     // restride self such that
@@ -371,19 +335,13 @@ struct cuda_scatter_fill_base_kernel {
   }
 
   void operator()(
-    Tensor& self, int64_t dim,
+    const Tensor& self, int64_t dim,
     const Tensor& index, Scalar src,
     const std::string& method_name,
     const ReduceMultiply& f
   ) {
-    // no-op if index is empty
-    if (index.numel() == 0) {
-      return;
-    }
     at::assert_no_internal_overlap(self);
 
-    dim = maybe_wrap_dim(dim, self.dim());
-
     auto index_sizes = ensure_nonempty_vec(index.sizes().vec());
 
     // restride self such that
@@ -420,25 +378,25 @@ struct cuda_scatter_fill_base_kernel {
   }
 }; // struct cuda_scatter_fill_base_kernel
 
-void gather_cuda_kernel(Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) {
+void gather_cuda_kernel(const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) {
   cuda_scatter_gather_base_kernel</*is_scatter_like=*/false>()(
     result, dim, index, self,
     "gather_out_cuda", tensor_assign);
 }
 
-void scatter_cuda_kernel(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
+void scatter_cuda_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
   cuda_scatter_gather_base_kernel<>()(
     self, dim, index, src,
     "scatter_cuda_", tensor_assign);
 }
 
-void scatter_fill_cuda_kernel(Tensor& self, int64_t dim, const Tensor& index, const Scalar& src) {
+void scatter_fill_cuda_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src) {
   cuda_scatter_fill_base_kernel<>()(
     self, dim, index, src,
     "scatter_fill_cuda_", tensor_assign);
 }
 
-void scatter_add_cuda_kernel(Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
+void scatter_add_cuda_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
   // See Note [Writing Nondeterministic Operations]
   // Nondeterministic because of atomicAdd usage
   globalContext().alertNotDeterministic("scatter_add_cuda_kernel");
@@ -447,7 +405,7 @@ void scatter_add_cuda_kernel(Tensor& self, int64_t dim, const Tensor& index, con
     "scatter_add_cuda_", reduce_add);
 }
 
-void scatter_reduce_cuda_kernel(Tensor& self, const int64_t dim, const Tensor& index,
+void scatter_reduce_cuda_kernel(const Tensor& self, const int64_t dim, const Tensor& index,
                                const Tensor& src, const SCATTER_GATHER_OP& reduce) {
   switch (reduce) {
   case SCATTER_GATHER_OP::REDUCE_ADD :
@@ -461,7 +419,7 @@ void scatter_reduce_cuda_kernel(Tensor& self, const int64_t dim, const Tensor& i
   }
 }
 
-void scatter_scalar_reduce_cuda_kernel(Tensor& self, const int64_t dim, const Tensor& index,
+void scatter_scalar_reduce_cuda_kernel(const Tensor& self, const int64_t dim, const Tensor& index,
                                const Scalar& value, const SCATTER_GATHER_OP& reduce) {
   switch (reduce) {
   case SCATTER_GATHER_OP::REDUCE_ADD :
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index fae433cd6aae6..ca13e058411a4 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -6601,14 +6601,13 @@
   variants: method, function
 
 - func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)
+  structured: True
   dispatch:
-    CPU: gather_out_cpu_cuda
-    CUDA: gather_out_cpu_cuda
+    CPU, CUDA: gather_out
 
 - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
   variants: method, function
-  dispatch:
-    CPU, CUDA: gather
+  structured_delegate: gather.out
 
 - func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor
   variants: function
diff --git a/test/test_torch.py b/test/test_torch.py
index a790839bbd50e..2899f2ef4c3b2 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -1057,8 +1057,11 @@ def _test_gather(self, cast, test_bounds=True):
                 torch.gather(src, dim, idx.to(torch.int))
 
             # should throw an error when out.dtype != src.dtype.
-            with self.assertRaisesRegex(RuntimeError, 'Expected self.dtype to be equal to src.dtype'):
-                torch.gather(src, dim, idx, out=expected.to(torch.int))
+            # Note that on Windows, the out tensor's dtype is returned as: struct c10::complex<double> in the error
+            # message, hence the use of .* in regex here
+            with self.assertRaisesRegex(RuntimeError,
+                                        'Expected out tensor to have dtype .*c10::complex<double>, but got int instead'):
+                torch.gather(src.to(torch.complex128), dim, idx, out=expected.to(torch.int))
 
             # checks for the same dimensionality
             with self.assertRaisesRegex(RuntimeError, 'Index tensor must have the same number of dimensions as input tensor'):

From 6db8f7a70920f91418078fe09477eed0b0adefdb Mon Sep 17 00:00:00 2001
From: Kefei Lu <kefeilu@fb.com>
Date: Thu, 2 Sep 2021 01:17:56 -0700
Subject: [PATCH 459/530] Fix TRTModule not adding outputs in order (#64418)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64418

In T99368564, we found that when running TRT lowered module, the output tensors are out-of-order, as compared to the output from the original, non-lowered module. It turns out that in `TRTModule.forward()`, we cannot rely on `ICudaEngine` bindings natural order indices to create the output tensors, but rather, we should explicitly construct the output tensor from the bindings' names, in an ordered that we supply.

Test Plan:
* Arc lint
* Run CI/sandcastle tests
* Run GPU lowering using commands and code changes in D30171741 and ensure we don't observe out-of-order outputs

Reviewed By: yinghai

Differential Revision: D30693545

fbshipit-source-id: 32a894ceeb148fcf4e8d279be3835c7d1f1aa2ba
---
 torch/fx/experimental/fx2trt/fx2trt.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/torch/fx/experimental/fx2trt/fx2trt.py b/torch/fx/experimental/fx2trt/fx2trt.py
index f1d17e701790d..4c0b44c83085f 100644
--- a/torch/fx/experimental/fx2trt/fx2trt.py
+++ b/torch/fx/experimental/fx2trt/fx2trt.py
@@ -1,5 +1,5 @@
 import warnings
-from typing import List, NamedTuple, Iterable, Any, Optional, Tuple
+from typing import List, NamedTuple, Iterable, Any, Optional, Tuple, Sequence
 
 import tensorrt as trt
 import torch
@@ -53,6 +53,12 @@ def __init__(
         # Indicate output is in fp16
         self.fp16_output = fp16_output
 
+        # Indices of outputs into the CUDA engine bindings, in the order as they are
+        # in the fx graph's `output` node.
+        self.output_indices_in_order: Sequence[int] = [
+            self.engine.get_binding_index(name) for name in self.output_names
+        ]
+
     def _on_state_dict(self, state_dict, prefix, local_metadata):
         state_dict[prefix + "engine"] = bytearray(self.engine.serialize())
         state_dict[prefix + "input_names"] = self.input_names
@@ -96,7 +102,7 @@ def forward(self, *inputs):
 
         # create output tensors
         outputs: List[torch.Tensor] = []
-        for idx in range(len(inputs), len(inputs) + len(self.output_names)):
+        for idx in self.output_indices_in_order:
             dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx))
 
             if self.engine.has_implicit_batch_dimension:

From 1c735768ede21a60ad4f6ed9565a21d9b4f5bc92 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 2 Sep 2021 03:45:06 -0700
Subject: [PATCH 460/530] Update hub.load() signature to avoid polluting kwargs
 param (#63755)

Summary:
This PR addresses an old comment about Python2 EOL, directly putting some parameters in the function signature instead of in a `**kargs` dict.

I believe the changes are fully backward compatible.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63755

Reviewed By: zou3519

Differential Revision: D30695634

Pulled By: NicolasHug

fbshipit-source-id: 398f347c5a04bfb58e77e46773a869cb9d0eb225
---
 torch/hub.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/torch/hub.py b/torch/hub.py
index 499640b8bc6ee..7cf752f89166c 100644
--- a/torch/hub.py
+++ b/torch/hub.py
@@ -316,11 +316,8 @@ def help(github, model, force_reload=False, skip_validation=False):
     return entry.__doc__
 
 
-# Ideally this should be `def load(github, model, *args, forece_reload=False, **kwargs):`,
-# but Python2 complains syntax error for it. We have to skip force_reload in function
-# signature here but detect it in kwargs instead.
-# TODO: fix it after Python2 EOL
-def load(repo_or_dir, model, *args, **kwargs):
+def load(repo_or_dir, model, *args, source='github', force_reload=False, verbose=True, skip_validation=False,
+         **kwargs):
     r"""
     Load a model from a github repo or a local directory.
 
@@ -329,7 +326,7 @@ def load(repo_or_dir, model, *args, **kwargs):
 
     If :attr:`source` is ``'github'``, :attr:`repo_or_dir` is expected to be
     of the form ``repo_owner/repo_name[:tag_name]`` with an optional
-    tag/branch.
+    tag/branch. The default branch is ``master`` if not specified.
 
     If :attr:`source` is ``'local'``, :attr:`repo_or_dir` is expected to be a
     path to a local directory.
@@ -367,10 +364,7 @@ def load(repo_or_dir, model, *args, **kwargs):
         >>> path = '/some/local/path/pytorch/vision'
         >>> model = torch.hub.load(path, 'resnet50', pretrained=True)
     """
-    source = kwargs.pop('source', 'github').lower()
-    force_reload = kwargs.pop('force_reload', False)
-    verbose = kwargs.pop('verbose', True)
-    skip_validation = kwargs.pop('skip_validation', False)
+    source = source.lower()
 
     if source not in ('github', 'local'):
         raise ValueError(

From 030154e24119cdd16819ed4459d60379cf44c51f Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 2 Sep 2021 03:46:59 -0700
Subject: [PATCH 461/530] Remove outdated comment in hub.py (#63757)

Summary:
This PR removes an outdated comment about Python2 that was orginally introduced in https://github.com/pytorch/pytorch/pull/25083/files. The code has changed since then, but the comment wasn't removed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63757

Reviewed By: zou3519

Differential Revision: D30695656

Pulled By: NicolasHug

fbshipit-source-id: 431cf414588b9e5a1ad6acdae724ff5af1b16971
---
 torch/hub.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/torch/hub.py b/torch/hub.py
index 7cf752f89166c..4cfbc83421bfe 100644
--- a/torch/hub.py
+++ b/torch/hub.py
@@ -425,8 +425,6 @@ def download_url_to_file(url, dst, hash_prefix=None, progress=True):
 
     """
     file_size = None
-    # We use a different API for python2 since urllib(2) doesn't recognize the CA
-    # certificates in older Python
     req = Request(url, headers={"User-Agent": "torch.hub"})
     u = urlopen(req)
     meta = u.info()

From aedd70fcfe62a8bfb726b609f16edb8448a74299 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 2 Sep 2021 03:48:44 -0700
Subject: [PATCH 462/530] Fix list() and help() torchhub functions for Windows
 (#63773)

Summary:
This PR Fixes the help() and list() torchhub functions which were probably failing for Windows since the `/` OS separator was hardcoded.

Before merging this I need to double check whether the CI actually runs the corresponding tests on Windows or not

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63773

Reviewed By: zou3519

Differential Revision: D30695664

Pulled By: NicolasHug

fbshipit-source-id: fac328163fd05db804a8186ae28f22b3cc3a6404
---
 torch/hub.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torch/hub.py b/torch/hub.py
index 4cfbc83421bfe..bcd53f79d9c77 100644
--- a/torch/hub.py
+++ b/torch/hub.py
@@ -277,7 +277,8 @@ def list(github, force_reload=False, skip_validation=False):
 
     sys.path.insert(0, repo_dir)
 
-    hub_module = import_module(MODULE_HUBCONF, repo_dir + '/' + MODULE_HUBCONF)
+    hubconf_path = os.path.join(repo_dir, MODULE_HUBCONF)
+    hub_module = import_module(MODULE_HUBCONF, hubconf_path)
 
     sys.path.remove(repo_dir)
 
@@ -307,7 +308,8 @@ def help(github, model, force_reload=False, skip_validation=False):
 
     sys.path.insert(0, repo_dir)
 
-    hub_module = import_module(MODULE_HUBCONF, repo_dir + '/' + MODULE_HUBCONF)
+    hubconf_path = os.path.join(repo_dir, MODULE_HUBCONF)
+    hub_module = import_module(MODULE_HUBCONF, hubconf_path)
 
     sys.path.remove(repo_dir)
 

From be5b05c1dc7f8281311842a3953288b8f158a07a Mon Sep 17 00:00:00 2001
From: Michael Dagitses <mikeyd@fb.com>
Date: Thu, 2 Sep 2021 04:04:59 -0700
Subject: [PATCH 463/530] require that `TARGET_DET_LIST` is sorted (and sort it
 here) (#64102)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64102

We sort this list so that we may add comments to indicate the absence
of a file right where that file would need to be put. This makes it
difficult to wrongly add such a file.

The sorting itself was done programmatically to ensure that no entries
were inadvertently removed.

I printed the sorted list with:

```
  for p in sorted(TARGET_DET_LIST):
    print(f'    "{p}",')
```

Then copied it back into the file.

Test Plan: Imported from OSS

Reviewed By: driazati

Differential Revision: D30625076

Pulled By: dagitses

fbshipit-source-id: cf36fcb3e53e274b76d1f4aae83da1f53c03f9ed
---
 test/test_determination.py                 |  7 ++
 tools/testing/modulefinder_determinator.py | 87 +++++++++++-----------
 2 files changed, 51 insertions(+), 43 deletions(-)

diff --git a/test/test_determination.py b/test/test_determination.py
index 277bbd2bc166c..ca00835429c4c 100644
--- a/test/test_determination.py
+++ b/test/test_determination.py
@@ -33,6 +33,13 @@ def determined_tests(cls, changed_files):
             if run_test.should_run_test(run_test.TARGET_DET_LIST, test, changed_files, DummyOptions())
         ]
 
+    def test_target_det_list_is_sorted(self):
+        # We keep TARGET_DET_LIST sorted to minimize merge conflicts
+        # but most importantly to allow us to comment on the absence
+        # of a test. It would be very difficult to add a file right
+        # next to a comment that says to keep it out of the list.
+        self.assertListEqual(run_test.TARGET_DET_LIST, sorted(run_test.TARGET_DET_LIST))
+
     def test_config_change_only(self):
         """CI configs trigger all tests"""
         self.assertEqual(
diff --git a/tools/testing/modulefinder_determinator.py b/tools/testing/modulefinder_determinator.py
index 8acd0ed9cc2f0..b6c94e7a2d48f 100644
--- a/tools/testing/modulefinder_determinator.py
+++ b/tools/testing/modulefinder_determinator.py
@@ -12,50 +12,8 @@
 # run with --determine-from, we use another generated list based on this one and the
 # previous test stats.
 TARGET_DET_LIST = [
-    "distributions/test_distributions",
-    "test_nn",
-    "test_autograd",
-    "test_cpp_extensions_jit",
-    "test_jit_legacy",
-    "test_dataloader",
-    "test_overrides",
-    "test_linalg",
-    "test_jit",
-    "test_jit_profiling",
-    "test_torch",
-    "test_binary_ufuncs",
-    "test_numpy_interop",
-    "test_reductions",
-    "test_shape_ops",
-    "test_sort_and_select",
-    "test_testing",
-    "test_view_ops",
-    "distributed/nn/jit/test_instantiator",
-    "distributed/rpc/test_tensorpipe_agent",
-    "distributed/rpc/cuda/test_tensorpipe_agent",
     "distributed/algorithms/ddp_comm_hooks/test_ddp_hooks",
-    "distributed/test_distributed_spawn",
-    "test_cuda",
-    "test_cuda_primary_ctx",
-    "test_cpp_extensions_aot_ninja",
-    "test_cpp_extensions_aot_no_ninja",
-    "test_serialization",
-    "test_optim",
-    "test_utils",
-    "test_multiprocessing",
-    "test_tensorboard",
-    "distributed/test_c10d_common",
-    "distributed/test_c10d_gloo",
-    "distributed/test_c10d_nccl",
-    "distributed/test_jit_c10d",
-    "distributed/test_c10d_spawn_gloo",
-    "distributed/test_c10d_spawn_nccl",
-    "distributed/test_store",
-    "distributed/test_pg_wrapper",
-    "test_quantization",
-    "test_pruning_op",
-    "test_determination",
-    "test_futures",
+    "distributed/nn/jit/test_instantiator",
     "distributed/pipeline/sync/skip/test_api",
     "distributed/pipeline/sync/skip/test_gpipe",
     "distributed/pipeline/sync/skip/test_inspect_skip_layout",
@@ -78,8 +36,51 @@
     "distributed/pipeline/sync/test_stream",
     "distributed/pipeline/sync/test_transparency",
     "distributed/pipeline/sync/test_worker",
+    "distributed/rpc/cuda/test_tensorpipe_agent",
+    "distributed/rpc/test_tensorpipe_agent",
+    "distributed/test_c10d_common",
+    "distributed/test_c10d_gloo",
+    "distributed/test_c10d_nccl",
+    "distributed/test_c10d_spawn_gloo",
+    "distributed/test_c10d_spawn_nccl",
+    "distributed/test_distributed_spawn",
+    "distributed/test_jit_c10d",
+    "distributed/test_pg_wrapper",
+    "distributed/test_store",
+    "distributions/test_distributions",
+    "test_autograd",
+    "test_binary_ufuncs",
+    "test_cpp_extensions_aot_ninja",
+    "test_cpp_extensions_aot_no_ninja",
+    "test_cpp_extensions_jit",
+    "test_cuda",
+    "test_cuda_primary_ctx",
+    "test_dataloader",
+    "test_determination",
+    "test_futures",
+    "test_jit",
+    "test_jit_legacy",
+    "test_jit_profiling",
+    "test_linalg",
+    "test_multiprocessing",
+    "test_nn",
+    "test_numpy_interop",
+    "test_optim",
+    "test_overrides",
+    "test_pruning_op",
+    "test_quantization",
+    "test_reductions",
+    "test_serialization",
+    "test_shape_ops",
+    "test_sort_and_select",
+    "test_tensorboard",
+    "test_testing",
+    "test_torch",
+    "test_utils",
+    "test_view_ops",
 ]
 
+
 _DEP_MODULES_CACHE: Dict[str, Set[str]] = {}
 
 
From cdb46f4c6e836ffe559781a40846c2f3b50b9e9c Mon Sep 17 00:00:00 2001
From: Michael Dagitses <mikeyd@fb.com>
Date: Thu, 2 Sep 2021 04:04:59 -0700
Subject: [PATCH 464/530] extract TestAutogradComplex into its own test file
 (#63400)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63400

This is the first step to break up test_autograd.py for #63205.

Test Plan: Imported from OSS

Reviewed By: albanD

Differential Revision: D30541499

Pulled By: dagitses

fbshipit-source-id: 8d9d32007938b9eade0e88f95a6a3190e7e2ef01
---
 test/autograd/test_complex.py              | 103 ++++++++++++++++++++
 test/test_autograd.py                      | 106 +--------------------
 tools/testing/modulefinder_determinator.py |   5 +-
 3 files changed, 112 insertions(+), 102 deletions(-)
 create mode 100644 test/autograd/test_complex.py

diff --git a/test/autograd/test_complex.py b/test/autograd/test_complex.py
new file mode 100644
index 0000000000000..74fcfdafbce2a
--- /dev/null
+++ b/test/autograd/test_complex.py
@@ -0,0 +1,103 @@
+import torch
+
+from torch.testing._internal.common_utils import TestCase, run_tests, gradcheck
+
+
+class TestAutogradComplex(TestCase):
+    def test_view_func_for_complex_views(self):
+        # case 1: both parent and child have view_func
+        x = torch.randn(2, 2, 2, dtype=torch.double, requires_grad=True)
+        y = x.detach().requires_grad_(True)
+
+        x0 = x.clone()
+        x1 = torch.view_as_complex(x0)
+        x2 = torch.view_as_real(x1)
+        x2.mul_(2)
+        x2.sum().backward()
+
+        y0 = y.clone()
+        y0.mul_(2)
+        y0.sum().backward()
+
+        self.assertEqual(x.grad, y.grad)
+
+        # case 2: parent has view_func but child does not
+        x = torch.randn(2, 2, 2, dtype=torch.double, requires_grad=True)
+        y = x.detach().requires_grad_(True)
+
+        def fn(a):
+            b = a.clone()
+            b1 = torch.view_as_complex(b)
+            b2 = b1.reshape(b1.numel())
+            return b2
+
+        x0 = fn(x)
+        x0.mul_(2)
+        x0.sum().backward()
+
+        y0 = fn(y)
+        y1 = y0.mul(2)
+        y1.sum().backward()
+
+        self.assertEqual(x.grad, y.grad)
+
+        # case 3: parent does not have a view_func but child does
+        x = torch.randn(10, dtype=torch.cdouble, requires_grad=True)
+        y = x.detach().requires_grad_(True)
+
+        def fn(a, dim0_size=5):
+            b = a.clone()
+            b1 = b.reshape(dim0_size, 2)
+            b2 = torch.view_as_real(b1)
+            return b2
+
+        x0 = fn(x)
+        x0.mul_(2)
+        x0.sum().backward()
+
+        y0 = fn(y)
+        y1 = y0.mul(2)
+        y1.sum().backward()
+
+        self.assertEqual(x.grad, y.grad)
+
+    def test_view_with_multi_output(self):
+        x = torch.randn(2, 2, 2, dtype=torch.double)
+
+        x1 = torch.view_as_complex(x)
+        # Taking an invalid view should always be allowed as long as it is not
+        # modified inplace
+        res = x1.unbind(0)
+
+        with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"):
+            res[0] += torch.rand(2, requires_grad=True)
+
+        x.requires_grad_(True)
+        x1 = torch.view_as_complex(x)
+        # Taking an invalid view should always be allowed as long as it is not
+        # modified inplace
+        res = x1.unbind(0)
+
+        with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"):
+            res[0] += torch.rand(2, requires_grad=True)
+
+    def as_identity(self):
+        # view_as_real and view_as_complex behavior should be like an identity
+        def func(z):
+            z_ = torch.view_as_complex(z)
+            z_select = torch.select(z_, z_.dim() - 1, 0)
+            z_select_real = torch.view_as_real(z_select)
+            return z_select_real.sum()
+
+        z = torch.randn(10, 2, 2, dtype=torch.double, requires_grad=True)
+        gradcheck(func, [z])
+        func(z).backward()
+
+        z1 = z.clone().detach().requires_grad_(True)
+        torch.select(z1, z1.dim() - 2, 0).sum().backward()
+
+        self.assertEqual(z.grad, z1.grad)
+
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_autograd.py b/test/test_autograd.py
index ebe3aa5d29e18..fde64b0b062d3 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -28,7 +28,6 @@
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_utils import (TestCase, run_tests, skipIfNoLapack,
                                                   suppress_warnings, slowTest,
-                                                  load_tests,
                                                   IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck,
                                                   TEST_WITH_ROCM, disable_gc,
                                                   gradcheck, gradgradcheck)
@@ -44,11 +43,6 @@
                                                         deviceCountAtLeast, skipCUDAIfCudnnVersionLessThan,
                                                         skipCUDAIf, skipMeta)
 
-
-# load_tests from common_utils is used to automatically filter tests for
-# sharding on sandcastle. This line silences flake warnings
-load_tests = load_tests
-
 import pickle
 
 PRECISION = 1e-4
@@ -6173,101 +6167,6 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks,
         test_case.assertEqual(self_variable.size(), self_variable.grad.size())
 
 
-class TestAutogradComplex(TestCase):
-    def test_view_func_for_complex_views(self):
-        # case 1: both parent and child have view_func
-        x = torch.randn(2, 2, 2, dtype=torch.double, requires_grad=True)
-        y = x.detach().requires_grad_(True)
-
-        x0 = x.clone()
-        x1 = torch.view_as_complex(x0)
-        x2 = torch.view_as_real(x1)
-        x2.mul_(2)
-        x2.sum().backward()
-
-        y0 = y.clone()
-        y0.mul_(2)
-        y0.sum().backward()
-
-        self.assertEqual(x.grad, y.grad)
-
-        # case 2: parent has view_func but child does not
-        x = torch.randn(2, 2, 2, dtype=torch.double, requires_grad=True)
-        y = x.detach().requires_grad_(True)
-
-        def fn(a):
-            b = a.clone()
-            b1 = torch.view_as_complex(b)
-            b2 = b1.reshape(b1.numel())
-            return b2
-
-        x0 = fn(x)
-        x0.mul_(2)
-        x0.sum().backward()
-
-        y0 = fn(y)
-        y1 = y0.mul(2)
-        y1.sum().backward()
-
-        self.assertEqual(x.grad, y.grad)
-
-        # case 3: parent does not have a view_func but child does
-        x = torch.randn(10, dtype=torch.cdouble, requires_grad=True)
-        y = x.detach().requires_grad_(True)
-
-        def fn(a, dim0_size=5):
-            b = a.clone()
-            b1 = b.reshape(dim0_size, 2)
-            b2 = torch.view_as_real(b1)
-            return b2
-
-        x0 = fn(x)
-        x0.mul_(2)
-        x0.sum().backward()
-
-        y0 = fn(y)
-        y1 = y0.mul(2)
-        y1.sum().backward()
-
-        self.assertEqual(x.grad, y.grad)
-
-    def test_view_with_multi_output(self):
-        x = torch.randn(2, 2, 2, dtype=torch.double)
-
-        x1 = torch.view_as_complex(x)
-        # Taking an invalid view should always be allowed as long as it is not
-        # modified inplace
-        res = x1.unbind(0)
-
-        with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"):
-            res[0] += torch.rand(2, requires_grad=True)
-
-        x.requires_grad_(True)
-        x1 = torch.view_as_complex(x)
-        # Taking an invalid view should always be allowed as long as it is not
-        # modified inplace
-        res = x1.unbind(0)
-
-        with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"):
-            res[0] += torch.rand(2, requires_grad=True)
-
-    def as_identity(self):
-        # view_as_real and view_as_complex behavior should be like an identity
-        def func(z):
-            z_ = torch.view_as_complex(z)
-            z_select = torch.select(z_, z_.dim() - 1, 0)
-            z_select_real = torch.view_as_real(z_select)
-            return z_select_real.sum()
-
-        z = torch.randn(10, 2, 2, dtype=torch.double, requires_grad=True)
-        gradcheck(func, [z])
-        func(z).backward()
-
-        z1 = z.clone().detach().requires_grad_(True)
-        torch.select(z1, z1.dim() - 2, 0).sum().backward()
-
-        self.assertEqual(z.grad, z1.grad)
-
 class TestAutogradFunctional(TestCase):
     def _assert_same_struct(self, res, base):
         # base and res should be Tensors or tuple of Tensors with the same size
@@ -9640,6 +9539,11 @@ def fn(x1, x2):
         torch.autograd.gradcheck(fn, [inp_r, inp_c], check_forward_ad=True)
         torch.autograd.gradcheck(fn, [inp_c, inp_r], check_forward_ad=True)
 
+# Import test cases from below autograd/ here. These are found
+# implicitly by the loader, so Flake8 thinks they are unused, hence
+# the suppressions.
+
+from autograd.test_complex import TestAutogradComplex  # noqa: F401
 
 # e.g., TestAutogradDeviceTypeCPU and TestAutogradDeviceTypeCUDA
 instantiate_device_type_tests(
diff --git a/tools/testing/modulefinder_determinator.py b/tools/testing/modulefinder_determinator.py
index b6c94e7a2d48f..32dc1031b5616 100644
--- a/tools/testing/modulefinder_determinator.py
+++ b/tools/testing/modulefinder_determinator.py
@@ -48,7 +48,10 @@
     "distributed/test_pg_wrapper",
     "distributed/test_store",
     "distributions/test_distributions",
-    "test_autograd",
+    # test_autograd.py is not slow, so it does not belong here. But
+    # note that if you try to add it back it will run into
+    # https://bugs.python.org/issue40350 because it imports files
+    # under test/autograd/.
     "test_binary_ufuncs",
     "test_cpp_extensions_aot_ninja",
     "test_cpp_extensions_aot_no_ninja",

From 99b064fac4b24a5a76808b52107b88425a402c60 Mon Sep 17 00:00:00 2001
From: Ivan Kobzarev <ivankobzarev@fb.com>
Date: Thu, 2 Sep 2021 05:27:59 -0700
Subject: [PATCH 465/530] [jit] shape propagation for prepack (#63585)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63585

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D30428905

Pulled By: IvanKobzarev

fbshipit-source-id: c18f6605a69b2e000bdf14a23e637c5a1c2ec64c
---
 aten/src/ATen/native/xnnpack/Convolution.cpp  |   15 +
 aten/src/ATen/native/xnnpack/Convolution.h    |    3 +
 aten/src/ATen/native/xnnpack/Linear.cpp       |   10 +
 aten/src/ATen/native/xnnpack/Linear.h         |    3 +
 .../native/xnnpack/RegisterOpContextClass.cpp |    3 +
 .../jit/runtime/symbolic_shape_registry.cpp   |   39 +-
 .../csrc/jit/serialization/import_source.cpp  | 1128 ++++++++---------
 torch/csrc/jit/serialization/import_source.h  |   61 +-
 8 files changed, 684 insertions(+), 578 deletions(-)

diff --git a/aten/src/ATen/native/xnnpack/Convolution.cpp b/aten/src/ATen/native/xnnpack/Convolution.cpp
index 8c5d99a242196..f46052d9c5ef6 100644
--- a/aten/src/ATen/native/xnnpack/Convolution.cpp
+++ b/aten/src/ATen/native/xnnpack/Convolution.cpp
@@ -425,6 +425,21 @@ Tensor conv2d_clamp_run(
   return op_context->run(input);
 }
 
+// Op is registered to have Any argument as we plan to reuse it for prepacked conv2d of other backends
+std::tuple<IntArrayRef, c10::optional<IntArrayRef>, IntArrayRef, IntArrayRef, IntArrayRef, int64_t>
+unpack_prepacked_sizes_conv2d(const IValue& ivalue) {
+  auto op_context = ivalue.toCustomClass<xnnpack::Conv2dOpContext>();
+  const auto tuple = op_context->unpack();
+  const auto& bias = std::get<1>(tuple);
+  return std::make_tuple(
+      std::get<0>(tuple).sizes(),
+      (bias && bias->defined()) ? c10::optional<IntArrayRef>(bias->sizes()) : c10::nullopt,
+      std::get<2>(tuple),
+      std::get<3>(tuple),
+      std::get<4>(tuple),
+      std::get<5>(tuple));
+}
+
 Tensor conv2d_transpose_clamp_run(
     const Tensor& input,
     const c10::intrusive_ptr<xnnpack::TransposeConv2dOpContext>& op_context) {
diff --git a/aten/src/ATen/native/xnnpack/Convolution.h b/aten/src/ATen/native/xnnpack/Convolution.h
index 403f26cdec70e..b89059de2c615 100644
--- a/aten/src/ATen/native/xnnpack/Convolution.h
+++ b/aten/src/ATen/native/xnnpack/Convolution.h
@@ -39,6 +39,9 @@ Tensor conv2d_clamp_run(
     const Tensor& input,
     const c10::intrusive_ptr<xnnpack::Conv2dOpContext>& op_context);
 
+std::tuple<IntArrayRef, c10::optional<IntArrayRef>, IntArrayRef, IntArrayRef, IntArrayRef, int64_t>
+unpack_prepacked_sizes_conv2d(const IValue& ivalue);
+
 Tensor conv2d_transpose_clamp_run(
     const Tensor& input,
     const c10::intrusive_ptr<xnnpack::TransposeConv2dOpContext>& op_context);
diff --git a/aten/src/ATen/native/xnnpack/Linear.cpp b/aten/src/ATen/native/xnnpack/Linear.cpp
index 9a459b660d6fb..19c474f34cef9 100644
--- a/aten/src/ATen/native/xnnpack/Linear.cpp
+++ b/aten/src/ATen/native/xnnpack/Linear.cpp
@@ -180,6 +180,16 @@ Tensor linear_clamp_run(
   return op_context->run(input);
 }
 
+std::tuple<IntArrayRef, c10::optional<IntArrayRef>>
+unpack_prepacked_sizes_linear(const IValue& ivalue) {
+  auto op_context = ivalue.toCustomClass<xnnpack::LinearOpContext>();
+  const auto tuple = op_context->unpack();
+  const auto& bias = std::get<1>(tuple);
+  return std::make_tuple(
+      std::get<0>(tuple).sizes(),
+      (bias && bias->defined()) ? c10::optional<IntArrayRef>(bias->sizes()) : c10::nullopt);
+}
+
 } // namespace linear
 } // namespace internal
 
diff --git a/aten/src/ATen/native/xnnpack/Linear.h b/aten/src/ATen/native/xnnpack/Linear.h
index 3e4df0466d261..d25f63bafa739 100644
--- a/aten/src/ATen/native/xnnpack/Linear.h
+++ b/aten/src/ATen/native/xnnpack/Linear.h
@@ -20,6 +20,9 @@ c10::intrusive_ptr<xnnpack::LinearOpContext> createLinearClampPrePackOpContext(
 
 Tensor linear_clamp_run(const Tensor& input, const c10::intrusive_ptr<xnnpack::LinearOpContext>& op_context);
 
+std::tuple<IntArrayRef, c10::optional<IntArrayRef>>
+unpack_prepacked_sizes_linear(const IValue& ivalue);
+
 ContextLinear create(
     const Tensor& weight,
     const c10::optional<Tensor>& bias,
diff --git a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp
index 03ac612aa12d0..f09c2dc22a39c 100644
--- a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp
+++ b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp
@@ -80,7 +80,10 @@ TORCH_LIBRARY(xnnpack, m) {
 
 }
 
+// Registration using the TORCH_LIBRARY def gives dispatching errors when there is no tensor input
 TORCH_LIBRARY(prepacked, m) {
+  m.def(TORCH_SELECTIVE_SCHEMA("prepacked::unpack_prepacked_sizes_conv2d(Any W_prepack) -> (int[], int[]?, int[], int[], int[], int)"), [](const IValue& inp) { return internal::convolution2d::unpack_prepacked_sizes_conv2d(inp);});
+  m.def(TORCH_SELECTIVE_SCHEMA("prepacked::unpack_prepacked_sizes_linear(Any W_prepack) -> (int[], int[]?)"), [](const IValue& inp) { return internal::linear::unpack_prepacked_sizes_linear(inp);});
   m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_prepack(Tensor W, Tensor? B=None, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.LinearOpContext"));
   m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.Conv2dOpContext"));
diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp
index d4471998d11e8..871b65d75f6b7 100644
--- a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp
+++ b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp
@@ -3,6 +3,7 @@
 #include <torch/csrc/jit/passes/inliner.h>
 #include <torch/csrc/jit/runtime/operator.h>
 #include <torch/csrc/jit/runtime/symbolic_shape_registry.h>
+#include <torch/csrc/jit/serialization/import_source.h>
 #include <unordered_map>
 
 namespace torch {
@@ -292,7 +293,21 @@ const std::string shape_compute_functions =
           for i in range(end_dim + 1, len(input)):
             shape.append(input[i])
           return shape
-    )";
+    )"
+#ifdef USE_XNNPACK
+    R"(
+        def prepacked_conv2d_clamp_run(input: List[int], conv2dOpContext: Any):
+          assert isinstance(conv2dOpContext, __torch__.torch.classes.xnnpack.Conv2dOpContext)
+          (weight, bias, stride, padding, dilation, groups) = ops.prepacked.unpack_prepacked_sizes_conv2d(conv2dOpContext)
+          return conv2d(input, weight, bias, stride, padding, dilation, groups)
+
+        def prepacked_linear_clamp_run(input: List[int], linearOpContext: Any):
+          assert isinstance(linearOpContext, __torch__.torch.classes.xnnpack.LinearOpContext)
+          (weight, bias) = ops.prepacked.unpack_prepacked_sizes_linear(linearOpContext)
+          return linear(input, weight, bias)
+    )"
+#endif
+    ;
 
 // mapping function schema to shape compute graphs allows multiple functions to
 // share the same shape compute graph, which is memory efficient and also will
@@ -317,8 +332,11 @@ static const OperatorMap<std::string>& get_schema_to_function_graph() {
       {"aten::div.Scalar(Tensor self, Scalar other) -> Tensor", "unary_one_unused_input"},
       {"aten::gt.Tensor(Tensor self, Tensor other) -> Tensor", "broadcast"},
       {"aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", "broadcast_one_unused_input"},
+      {"aten::add_.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", "broadcast_one_unused_input"},
       {"aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", "unary_two_unused_inputs"},
       {"aten::hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor", "unary_two_unused_inputs"},
+      {"aten::hardswish_(Tensor self) -> Tensor", "unary"},
+      {"aten::hardsigmoid_(Tensor self) -> Tensor", "unary"},
       {"aten::adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor", "adaptive_avg_pool2d"},
       {"aten::mm(Tensor self, Tensor mat2) -> Tensor", "mm"},
       {"aten::dot(Tensor self, Tensor tensor) -> Tensor", "dot"},
@@ -335,6 +353,10 @@ static const OperatorMap<std::string>& get_schema_to_function_graph() {
       {"aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)", "view"},
       {"aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "mean_dim"},
       {"aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor", "addmm"},
+#ifdef USE_XNNPACK
+      {"prepacked::conv2d_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.Conv2dOpContext W_prepack) -> Tensor Y", "prepacked_conv2d_clamp_run"},
+      {"prepacked::linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y", "prepacked_linear_clamp_run"},
+#endif
   };
   // clang-format on
   return schema_to_function_graph;
@@ -344,7 +366,7 @@ std::unordered_map<const FunctionSchema*, std::shared_ptr<Graph>>
     cached_schema_to_graph;
 
 // CompilationUnit that holds all these Functions and keeps them alive.
-CompilationUnit compilation_unit;
+auto compilation_unit = std::make_shared<CompilationUnit>();
 
 void loadModule(const CompilationUnit& module) {
   std::unordered_map<std::string, std::shared_ptr<Graph>> reused_functions;
@@ -371,9 +393,16 @@ void loadModule(const CompilationUnit& module) {
 }
 
 void loadFunctions() {
-  compilation_unit.define(
-      c10::nullopt, shape_compute_functions, nativeResolver(), nullptr);
-  loadModule(compilation_unit);
+  auto src = std::make_shared<Source>(shape_compute_functions);
+  std::vector<at::IValue> constantTable;
+  auto resolver = std::make_shared<SourceImporterImpl>(
+      compilation_unit,
+      &constantTable,
+      [&](const std::string& name) -> std::shared_ptr<Source> { return src; },
+      1);
+  compilation_unit->define(
+      c10::nullopt, shape_compute_functions, resolver, nullptr);
+  loadModule(*compilation_unit);
 }
 } // anonymous namespace
 
diff --git a/torch/csrc/jit/serialization/import_source.cpp b/torch/csrc/jit/serialization/import_source.cpp
index fb1de17a54eea..e7d9da26df41d 100644
--- a/torch/csrc/jit/serialization/import_source.cpp
+++ b/torch/csrc/jit/serialization/import_source.cpp
@@ -91,629 +91,615 @@ struct ConstantTableValue : public SugaredValue {
   const std::vector<at::IValue>* constants_;
 };
 
-struct SourceImporterImpl : public Resolver,
-                            std::enable_shared_from_this<SourceImporterImpl> {
-  SourceImporterImpl(
-      std::shared_ptr<CompilationUnit> cu,
-      const std::vector<at::IValue>* constant_table,
-      SourceLoader source_loader,
-      size_t version)
-      : cu_(std::move(cu)), source_loader_(std::move(source_loader)) {
-    env_ = {
-        {"torch", std::make_shared<BuiltinModule>("aten", version)},
-        {"ops", std::make_shared<OpsValue>(version)},
-        // Constants present in the model. Used to resolve "CONSTANTS.n" to the
-        // actual value
-        {"CONSTANTS", std::make_shared<ConstantTableValue>(constant_table)},
-        {"fork", SpecialFormValue::create(prim::fork)},
-        {"annotate", SpecialFormValue::create(prim::annotate)},
-        {"unchecked_cast", SpecialFormValue::create(prim::unchecked_cast)},
-        {"uninitialized", SpecialFormValue::create(prim::Uninitialized)},
-    };
-  }
-
-  TypePtr findNamedType(const QualifiedName& name) {
-    if (auto custom_class = getCustomClass(name.qualifiedName())) {
-      return custom_class;
-    }
-    parseSourceIfNeeded(name.prefix());
-    auto it = to_be_defined_.find(name);
-    if (it != to_be_defined_.end() && it->second->kind() == TK_CLASS_DEF) {
-      ClassDef cd(it->second);
-      to_be_defined_.erase(it);
-      importNamedType(name.prefix(), cd);
-    }
-    return cu_->get_type(name);
+SourceImporterImpl::SourceImporterImpl(
+    std::shared_ptr<CompilationUnit> cu,
+    const std::vector<at::IValue>* constant_table,
+    SourceLoader source_loader,
+    size_t version)
+    : cu_(std::move(cu)), source_loader_(std::move(source_loader)) {
+  env_ = {
+      {"torch", std::make_shared<BuiltinModule>("aten", version)},
+      {"ops", std::make_shared<OpsValue>(version)},
+      // Constants present in the model. Used to resolve "CONSTANTS.n" to the
+      // actual value
+      {"CONSTANTS", std::make_shared<ConstantTableValue>(constant_table)},
+      {"fork", SpecialFormValue::create(prim::fork)},
+      {"annotate", SpecialFormValue::create(prim::annotate)},
+      {"unchecked_cast", SpecialFormValue::create(prim::unchecked_cast)},
+      {"uninitialized", SpecialFormValue::create(prim::Uninitialized)},
+  };
+}
+
+TypePtr SourceImporterImpl::findNamedType(const QualifiedName& name) {
+  if (auto custom_class = getCustomClass(name.qualifiedName())) {
+    return custom_class;
+  }
+  parseSourceIfNeeded(name.prefix());
+  auto it = to_be_defined_.find(name);
+  if (it != to_be_defined_.end() && it->second->kind() == TK_CLASS_DEF) {
+    ClassDef cd(it->second);
+    to_be_defined_.erase(it);
+    importNamedType(name.prefix(), cd);
   }
+  return cu_->get_type(name);
+}
 
-  Function* findFunction(const QualifiedName& name) {
-    parseSourceIfNeeded(name.prefix());
-    auto it = to_be_defined_.find(name);
-    if (it != to_be_defined_.end() && it->second->kind() == TK_DEF) {
-      Def d(it->second);
-      to_be_defined_.erase(it);
-      importFunction(name.prefix(), d);
-    }
-    return cu_->find_function(name);
+Function* SourceImporterImpl::findFunction(const QualifiedName& name) {
+  parseSourceIfNeeded(name.prefix());
+  auto it = to_be_defined_.find(name);
+  if (it != to_be_defined_.end() && it->second->kind() == TK_DEF) {
+    Def d(it->second);
+    to_be_defined_.erase(it);
+    importFunction(name.prefix(), d);
   }
+  return cu_->find_function(name);
+}
 
-  void parseSourceIfNeeded(const std::string& qualifier) {
-    // qualifier may be blank, for instance checking if __torch__ is a class.
-    if (qualifier == "" || loaded_sources_.count(qualifier)) {
-      return;
-    }
-    loaded_sources_.insert(qualifier);
-    std::shared_ptr<Source> src = source_loader_(qualifier);
-
-    // The importer, when looking for classes/functions doesn't know if 'foo'
-    // contains definitions or if it is a prefix of 'foo.bar', we only figure it
-    // out by testing if `foo.py` exists in the source loader. If it doesn't
-    // then there is nothing to load here
-    if (!src) {
-      return;
-    }
-    Parser p(src);
-    parsePossibleVersionNumber(p.lexer());
-
-    auto& L = p.lexer();
-
-    while (L.cur().kind != TK_EOF) {
-      parseImports(L);
-      auto tk = L.cur();
-      auto kind = tk.kind;
-      switch (kind) {
-        case TK_CLASS_DEF: {
-          auto parsed_treeref = ClassDef(p.parseClass());
-          to_be_defined_[QualifiedName(
-              qualifier, parsed_treeref.name().name())] = parsed_treeref;
-        } break;
-        case TK_DEF: {
-          auto parsed_treeref = Def(p.parseFunction(/*is_method=*/false));
-          to_be_defined_[QualifiedName(
-              qualifier, parsed_treeref.name().name())] = parsed_treeref;
-        } break;
-        default:
-          throw ErrorReport(L.cur().range)
-              << "Unexpected token in code import: " << kindToString(kind);
-      }
+void SourceImporterImpl::parseSourceIfNeeded(const std::string& qualifier) {
+  // qualifier may be blank, for instance checking if __torch__ is a class.
+  if (qualifier == "" || loaded_sources_.count(qualifier)) {
+    return;
+  }
+  loaded_sources_.insert(qualifier);
+  std::shared_ptr<Source> src = source_loader_(qualifier);
+
+  // The importer, when looking for classes/functions doesn't know if 'foo'
+  // contains definitions or if it is a prefix of 'foo.bar', we only figure it
+  // out by testing if `foo.py` exists in the source loader. If it doesn't
+  // then there is nothing to load here
+  if (!src) {
+    return;
+  }
+  Parser p(src);
+  parsePossibleVersionNumber(p.lexer());
+
+  auto& L = p.lexer();
+
+  while (L.cur().kind != TK_EOF) {
+    parseImports(L);
+    auto tk = L.cur();
+    auto kind = tk.kind;
+    switch (kind) {
+      case TK_CLASS_DEF: {
+        auto parsed_treeref = ClassDef(p.parseClass());
+        to_be_defined_[QualifiedName(qualifier, parsed_treeref.name().name())] =
+            parsed_treeref;
+      } break;
+      case TK_DEF: {
+        auto parsed_treeref = Def(p.parseFunction(/*is_method=*/false));
+        to_be_defined_[QualifiedName(qualifier, parsed_treeref.name().name())] =
+            parsed_treeref;
+      } break;
+      default:
+        throw ErrorReport(L.cur().range)
+            << "Unexpected token in code import: " << kindToString(kind);
     }
   }
+}
 
-  void LEGACY_import_methods(
-      const Module& mod,
-      const std::shared_ptr<Source>& src) {
-    auto self = SimpleSelf(mod.type());
-    c10::QualifiedName prefix = *mod.type()->name();
-    Parser p(src);
+void SourceImporterImpl::LEGACY_import_methods(
+    const Module& mod,
+    const std::shared_ptr<Source>& src) {
+  auto self = SimpleSelf(mod.type());
+  c10::QualifiedName prefix = *mod.type()->name();
+  Parser p(src);
 
-    parsePossibleVersionNumber(p.lexer());
+  parsePossibleVersionNumber(p.lexer());
 
-    parseImports(p.lexer());
+  parseImports(p.lexer());
 
-    std::vector<Def> definitions;
-    std::vector<ResolverPtr> resolvers;
-    while (p.lexer().cur().kind != TK_EOF) {
-      auto def = Def(p.parseFunction(/*is_method=*/true));
-      definitions.emplace_back(def);
-      resolvers.emplace_back(shared_from_this());
-    }
-    cu_->define(
-        prefix,
-        /*properties=*/{},
-        /*propResolvers=*/{},
-        definitions,
-        resolvers,
-        &self);
+  std::vector<Def> definitions;
+  std::vector<ResolverPtr> resolvers;
+  while (p.lexer().cur().kind != TK_EOF) {
+    auto def = Def(p.parseFunction(/*is_method=*/true));
+    definitions.emplace_back(def);
+    resolvers.emplace_back(shared_from_this());
   }
+  cu_->define(
+      prefix,
+      /*properties=*/{},
+      /*propResolvers=*/{},
+      definitions,
+      resolvers,
+      &self);
+}
 
-  std::shared_ptr<SugaredValue> resolveValue(
-      const std::string& name,
-      Function& m,
-      const SourceRange& loc) override {
-    auto it = env_.find(name);
-    if (it != env_.end()) {
-      return it->second;
-    }
-    auto graph = m.graph();
-    if (name == "inf") {
-      return std::make_shared<SimpleValue>(
-          graph->insertConstant(std::numeric_limits<double>::infinity(), loc));
-    }
-    if (name == "nan") {
-      return std::make_shared<SimpleValue>(
-          graph->insertConstant(std::numeric_limits<double>::quiet_NaN(), loc));
-    }
-    if (name == "infj") {
-      return std::make_shared<SimpleValue>(graph->insertConstant(
-          c10::complex<double>(0, std::numeric_limits<double>::infinity()),
-          loc));
-    }
-    if (name == "nanj") {
-      return std::make_shared<SimpleValue>(graph->insertConstant(
-          c10::complex<double>(0, std::numeric_limits<double>::quiet_NaN()),
-          loc));
-    }
-    if (name == "__torch__") {
-      return std::make_shared<ClassNamespaceValue>(
-          c10::QualifiedName(name), shared_from_this());
-    }
-    return nullptr;
+std::shared_ptr<SugaredValue> SourceImporterImpl::resolveValue(
+    const std::string& name,
+    Function& m,
+    const SourceRange& loc) {
+  auto it = env_.find(name);
+  if (it != env_.end()) {
+    return it->second;
+  }
+  auto graph = m.graph();
+  if (name == "inf") {
+    return std::make_shared<SimpleValue>(
+        graph->insertConstant(std::numeric_limits<double>::infinity(), loc));
+  }
+  if (name == "nan") {
+    return std::make_shared<SimpleValue>(
+        graph->insertConstant(std::numeric_limits<double>::quiet_NaN(), loc));
   }
+  if (name == "infj") {
+    return std::make_shared<SimpleValue>(graph->insertConstant(
+        c10::complex<double>(0, std::numeric_limits<double>::infinity()), loc));
+  }
+  if (name == "nanj") {
+    return std::make_shared<SimpleValue>(graph->insertConstant(
+        c10::complex<double>(0, std::numeric_limits<double>::quiet_NaN()),
+        loc));
+  }
+  if (name == "__torch__") {
+    return std::make_shared<ClassNamespaceValue>(
+        c10::QualifiedName(name), shared_from_this());
+  }
+  return nullptr;
+}
+
+TypePtr SourceImporterImpl::resolveType(
+    const std::string& name,
+    const SourceRange& loc) {
+  return findNamedType(QualifiedName(name));
+}
 
-  TypePtr resolveType(const std::string& name, const SourceRange& loc)
-      override {
-    return findNamedType(QualifiedName(name));
+void SourceImporterImpl::importFunction(
+    const std::string& qualifier,
+    const Def& def) {
+  std::vector<Def> definitions{def};
+  std::vector<ResolverPtr> resolvers{shared_from_this()};
+  cu_->define(
+      qualifier,
+      /*properties=*/{},
+      /*propResolvers=*/{},
+      definitions,
+      resolvers,
+      nullptr);
+}
+
+void SourceImporterImpl::importNamedType(
+    const std::string& qualifier,
+    const ClassDef& class_def) {
+  const auto qualified_name =
+      QualifiedName(QualifiedName(qualifier), class_def.name().name());
+  if (!class_def.superclass().present()) {
+    return importClass(qualified_name, class_def, /*is_module=*/false);
+  }
+  const auto& superclass_name = Var(class_def.superclass().get()).name().name();
+  if (superclass_name == "Module") {
+    importClass(qualified_name, class_def, /*is_module=*/true);
+  } else if (superclass_name == "NamedTuple") {
+    // NamedTuples have special rules (since they are TupleTypes and not
+    // ClassTypes)
+    return importNamedTuple(qualified_name, class_def);
+  } else if (superclass_name == "Interface") {
+    cu_->define_interface(
+        qualified_name, class_def, shared_from_this(), /*is_module=*/false);
+  } else if (superclass_name == "ModuleInterface") {
+    cu_->define_interface(
+        qualified_name, class_def, shared_from_this(), /*is_module=*/true);
+  } else if (superclass_name == "Enum") {
+    importEnum(qualified_name, class_def);
+  } else {
+    throw ErrorReport(class_def.range())
+        << "Torchscript does not support class inheritance.";
   }
+}
 
- private:
-  void importFunction(const std::string& qualifier, const Def& def) {
-    std::vector<Def> definitions{def};
-    std::vector<ResolverPtr> resolvers{shared_from_this()};
-    cu_->define(
-        qualifier,
-        /*properties=*/{},
-        /*propResolvers=*/{},
-        definitions,
-        resolvers,
-        nullptr);
-  }
-
-  void importNamedType(
-      const std::string& qualifier,
-      const ClassDef& class_def) {
-    const auto qualified_name =
-        QualifiedName(QualifiedName(qualifier), class_def.name().name());
-    if (!class_def.superclass().present()) {
-      return importClass(qualified_name, class_def, /*is_module=*/false);
+c10::optional<Assign> SourceImporterImpl::
+    attributeAssignmentSpecialHandlingHack(
+        const QualifiedName& qualified_classname,
+        const Assign& assign) {
+  struct AttrTypeReplacementDescr {
+    std::string attr_name;
+    std::string expected_type;
+    std::string replacement_type;
+  };
+
+  // module demangled qualname -> ReplacementDescr
+  static std::unordered_map<std::string, AttrTypeReplacementDescr> replacements{
+      {"__torch__.torch.nn.quantized.modules.linear.LinearPackedParams",
+       {"_packed_params",
+        "Tensor",
+        "__torch__.torch.classes.quantized.LinearPackedParamsBase"}},
+      {"__torch__.torch.nn.quantized.modules.linear.Linear",
+       {"_packed_params",
+        "Tensor",
+        "__torch__.torch.classes.quantized.LinearPackedParamsBase"}},
+      {"__torch__.torch.nn.quantized.dynamic.modules.linear.Linear",
+       {"_packed_params",
+        "Tensor",
+        "__torch__.torch.classes.quantized.LinearPackedParamsBase"}},
+      {"__torch__.torch.nn.quantized.modules.conv.Conv2d",
+       {"_packed_params",
+        "Tensor",
+        "__torch__.torch.classes.quantized.Conv2dPackedParamsBase"}},
+      {"__torch__.torch.nn.intrinsic.quantized.modules.conv_relu.ConvReLU2d",
+       {"_packed_params",
+        "Tensor",
+        "__torch__.torch.classes.quantized.Conv2dPackedParamsBase"}},
+      {"__torch__.torch.nn.quantized.modules.conv.Conv3d",
+       {"_packed_params",
+        "Tensor",
+        "__torch__.torch.classes.quantized.Conv3dPackedParamsBase"}},
+      {"__torch__.torch.nn.intrinsic.quantized.modules.conv_relu.ConvReLU3d",
+       {"_packed_params",
+        "Tensor",
+        "__torch__.torch.classes.quantized.Conv3dPackedParamsBase"}}};
+  // @lint-ignore-every CLANGTIDY facebook-hte-StdRegexIsAwful
+  static std::regex mangle_re("\\.___torch_mangle_\\d+");
+  auto demangled_classname =
+      std::regex_replace(qualified_classname.qualifiedName(), mangle_re, "");
+  if (replacements.count(demangled_classname)) {
+    auto lhs = Var(assign.lhs());
+    if (!assign.type().present() || assign.type().get().kind() != TK_VAR) {
+      return c10::nullopt;
     }
-    const auto& superclass_name =
-        Var(class_def.superclass().get()).name().name();
-    if (superclass_name == "Module") {
-      importClass(qualified_name, class_def, /*is_module=*/true);
-    } else if (superclass_name == "NamedTuple") {
-      // NamedTuples have special rules (since they are TupleTypes and not
-      // ClassTypes)
-      return importNamedTuple(qualified_name, class_def);
-    } else if (superclass_name == "Interface") {
-      cu_->define_interface(
-          qualified_name, class_def, shared_from_this(), /*is_module=*/false);
-    } else if (superclass_name == "ModuleInterface") {
-      cu_->define_interface(
-          qualified_name, class_def, shared_from_this(), /*is_module=*/true);
-    } else if (superclass_name == "Enum") {
-      importEnum(qualified_name, class_def);
-    } else {
-      throw ErrorReport(class_def.range())
-          << "Torchscript does not support class inheritance.";
+    auto type = Var(assign.type().get());
+
+    auto& attr_name = replacements.at(demangled_classname).attr_name;
+    auto& expected_type = replacements.at(demangled_classname).expected_type;
+    auto& replacement_type =
+        replacements.at(demangled_classname).replacement_type;
+    if (lhs.name().name() == attr_name && type.name().name() == expected_type) {
+      Parser p(std::make_shared<Source>(replacement_type));
+      auto typename_expr = p.parseExp();
+      auto maybe_typename =
+          Maybe<Expr>::create(typename_expr.range(), typename_expr);
+      return Assign::create(
+          assign.range(), assign.lhs_list(), assign.rhs(), maybe_typename);
     }
   }
+  return c10::nullopt;
+}
 
-  c10::optional<Assign> attributeAssignmentSpecialHandlingHack(
-      const QualifiedName& qualified_classname,
-      const Assign& assign) {
-    struct AttrTypeReplacementDescr {
-      std::string attr_name;
-      std::string expected_type;
-      std::string replacement_type;
-    };
-
-    // module demangled qualname -> ReplacementDescr
-    static std::unordered_map<std::string, AttrTypeReplacementDescr> replacements{
-        {"__torch__.torch.nn.quantized.modules.linear.LinearPackedParams",
-         {"_packed_params",
-          "Tensor",
-          "__torch__.torch.classes.quantized.LinearPackedParamsBase"}},
-        {"__torch__.torch.nn.quantized.modules.linear.Linear",
-         {"_packed_params",
-          "Tensor",
-          "__torch__.torch.classes.quantized.LinearPackedParamsBase"}},
-        {"__torch__.torch.nn.quantized.dynamic.modules.linear.Linear",
-         {"_packed_params",
-          "Tensor",
-          "__torch__.torch.classes.quantized.LinearPackedParamsBase"}},
-        {"__torch__.torch.nn.quantized.modules.conv.Conv2d",
-         {"_packed_params",
-          "Tensor",
-          "__torch__.torch.classes.quantized.Conv2dPackedParamsBase"}},
-        {"__torch__.torch.nn.intrinsic.quantized.modules.conv_relu.ConvReLU2d",
-         {"_packed_params",
-          "Tensor",
-          "__torch__.torch.classes.quantized.Conv2dPackedParamsBase"}},
-        {"__torch__.torch.nn.quantized.modules.conv.Conv3d",
-         {"_packed_params",
-          "Tensor",
-          "__torch__.torch.classes.quantized.Conv3dPackedParamsBase"}},
-        {"__torch__.torch.nn.intrinsic.quantized.modules.conv_relu.ConvReLU3d",
-         {"_packed_params",
-          "Tensor",
-          "__torch__.torch.classes.quantized.Conv3dPackedParamsBase"}}};
-    static std::regex mangle_re("\\.___torch_mangle_\\d+");
-    auto demangled_classname =
-        std::regex_replace(qualified_classname.qualifiedName(), mangle_re, "");
-    if (replacements.count(demangled_classname)) {
-      auto lhs = Var(assign.lhs());
-      if (!assign.type().present() || assign.type().get().kind() != TK_VAR) {
-        return c10::nullopt;
-      }
-      auto type = Var(assign.type().get());
-
-      auto& attr_name = replacements.at(demangled_classname).attr_name;
-      auto& expected_type = replacements.at(demangled_classname).expected_type;
-      auto& replacement_type =
-          replacements.at(demangled_classname).replacement_type;
-      if (lhs.name().name() == attr_name &&
-          type.name().name() == expected_type) {
-        Parser p(std::make_shared<Source>(replacement_type));
-        auto typename_expr = p.parseExp();
-        auto maybe_typename =
-            Maybe<Expr>::create(typename_expr.range(), typename_expr);
-        return Assign::create(
-            assign.range(), assign.lhs_list(), assign.rhs(), maybe_typename);
-      }
-    }
-    return c10::nullopt;
-  }
-
-  void importClass(
-      const QualifiedName& qualified_classname,
-      const ClassDef& class_def,
-      bool is_module) {
-    // BC for TorchBind classes
-    //
-    // Previously we would serialize TorchBind classes as actual
-    // classes with methods that delegate to things in the
-    // torch.ops.* namespace. We've switched away from this and
-    // now just rely on those classes being present in the binary
-    // and emit code for them based on the ClassType in memory.
-    //
-    // TODO: remove this once we no longer have old TorchBind code
-    // in production models
-    {
-      static QualifiedName torch_classes_qualname("__torch__.torch.classes");
-      if (torch_classes_qualname.isPrefixOf(qualified_classname)) {
-        return;
-      }
+void SourceImporterImpl::importClass(
+    const QualifiedName& qualified_classname,
+    const ClassDef& class_def,
+    bool is_module) {
+  // BC for TorchBind classes
+  //
+  // Previously we would serialize TorchBind classes as actual
+  // classes with methods that delegate to things in the
+  // torch.ops.* namespace. We've switched away from this and
+  // now just rely on those classes being present in the binary
+  // and emit code for them based on the ClassType in memory.
+  //
+  // TODO: remove this once we no longer have old TorchBind code
+  // in production models
+  {
+    static QualifiedName torch_classes_qualname("__torch__.torch.classes");
+    if (torch_classes_qualname.isPrefixOf(qualified_classname)) {
+      return;
     }
-    auto class_type = ClassType::create(
-        c10::QualifiedName(qualified_classname), cu_, is_module);
-
-    std::vector<Def> methods;
-    std::vector<ResolverPtr> method_resolvers;
-    std::map<std::string, Def> pre_hook_def_map;
-    std::map<std::string, Def> hook_def_map;
-    std::map<std::string, ResolverPtr> pre_hook_resolver_map;
-    std::map<std::string, ResolverPtr> hook_resolver_map;
-    std::vector<Assign> attributes;
-    std::vector<Assign> constants;
-
-    // Module-specific: which attrs are parameters?
-    std::unordered_set<std::string> parameter_names;
-    std::unordered_set<std::string> buffer_names;
-    std::unordered_set<std::string> pre_hook_names;
-    std::unordered_set<std::string> hook_names;
-    // used to keep track of original ordering of hooks and prehooks
-    // in case any are called more than once
-    std::vector<std::string> pre_hooks_order;
-    std::vector<std::string> hooks_order;
-    // Process statements, splitting things into attribute and method
-    // definitions.
-    for (const auto& statement : class_def.body()) {
-      switch (statement.kind()) {
-        case TK_ASSIGN: {
-          const auto assign = Assign(statement);
-          switch (assign.lhs().kind()) {
-            case TK_VAR: {
-              const auto name = Var(assign.lhs()).name().name();
-              if (name == "__parameters__") {
-                // Populate the module parameter list. This is a field that
-                // looks like:
-                //   __parameters__ = ["foo", "bar", "baz"]
-                // which tells us which attributes are module parameters.
-                TORCH_INTERNAL_ASSERT(
-                    is_module,
-                    "Assignments in class body only "
-                    "supported on modules right now");
-                const auto param_list =
-                    ListLiteral(assign.rhs().get()).inputs();
-                for (const auto& param : param_list) {
-                  parameter_names.insert(StringLiteral(param).text());
-                }
-              } else if (name == "__annotations__") {
-                // This is to initialize the annotations dict, just ignore.
-                continue;
-              } else if (name == "__buffers__") {
-                TORCH_INTERNAL_ASSERT(
-                    is_module, "Buffers only exist on modules at the moment");
-                const auto buffer_list =
-                    ListLiteral(assign.rhs().get()).inputs();
-                for (const auto& buffer : buffer_list) {
-                  buffer_names.insert(StringLiteral(buffer).text());
-                }
-              } else if (name == "__forward_pre_hooks__") {
-                TORCH_INTERNAL_ASSERT(
-                    is_module,
-                    "Forward pre hooks only exist on modules at the moment");
-                const auto pre_hook_list =
-                    ListLiteral(assign.rhs().get()).inputs();
-                for (const auto& pre_hook : pre_hook_list) {
-                  std::string pre_hook_name = StringLiteral(pre_hook).text();
-                  pre_hook_names.insert(pre_hook_name);
-                  pre_hooks_order.emplace_back(pre_hook_name);
-                }
-              } else if (name == "__forward_hooks__") {
-                TORCH_INTERNAL_ASSERT(
-                    is_module,
-                    "Forward hooks only exist on modules at the moment");
-                const auto hook_list = ListLiteral(assign.rhs().get()).inputs();
-                for (const auto& hook : hook_list) {
-                  std::string hook_name = StringLiteral(hook).text();
-                  hook_names.insert(hook_name);
-                  hooks_order.emplace_back(hook_name);
-                }
-              } else {
-                if (auto fixed_up = attributeAssignmentSpecialHandlingHack(
-                        qualified_classname, assign)) {
-                  attributes.push_back(std::move(*fixed_up));
-                } else if (assign.rhs().present()) {
-                  // This is a constant assignment, of the form:
-                  // foo : Final[int] = 3
-                  constants.push_back(assign);
-                } else {
-                  // This is a regular attribute assignment, of the form:
-                  // foo : Tensor
-                  attributes.push_back(assign);
-                }
+  }
+  auto class_type = ClassType::create(
+      c10::QualifiedName(qualified_classname), cu_, is_module);
+
+  std::vector<Def> methods;
+  std::vector<ResolverPtr> method_resolvers;
+  std::map<std::string, Def> pre_hook_def_map;
+  std::map<std::string, Def> hook_def_map;
+  std::map<std::string, ResolverPtr> pre_hook_resolver_map;
+  std::map<std::string, ResolverPtr> hook_resolver_map;
+  std::vector<Assign> attributes;
+  std::vector<Assign> constants;
+
+  // Module-specific: which attrs are parameters?
+  std::unordered_set<std::string> parameter_names;
+  std::unordered_set<std::string> buffer_names;
+  std::unordered_set<std::string> pre_hook_names;
+  std::unordered_set<std::string> hook_names;
+  // used to keep track of original ordering of hooks and prehooks
+  // in case any are called more than once
+  std::vector<std::string> pre_hooks_order;
+  std::vector<std::string> hooks_order;
+  // Process statements, splitting things into attribute and method
+  // definitions.
+  for (const auto& statement : class_def.body()) {
+    switch (statement.kind()) {
+      case TK_ASSIGN: {
+        const auto assign = Assign(statement);
+        switch (assign.lhs().kind()) {
+          case TK_VAR: {
+            const auto name = Var(assign.lhs()).name().name();
+            if (name == "__parameters__") {
+              // Populate the module parameter list. This is a field that
+              // looks like:
+              //   __parameters__ = ["foo", "bar", "baz"]
+              // which tells us which attributes are module parameters.
+              TORCH_INTERNAL_ASSERT(
+                  is_module,
+                  "Assignments in class body only "
+                  "supported on modules right now");
+              const auto param_list = ListLiteral(assign.rhs().get()).inputs();
+              for (const auto& param : param_list) {
+                parameter_names.insert(StringLiteral(param).text());
               }
-            } break;
-            case TK_SUBSCRIPT: {
-              // This is a special attribute assignment where the attribute
-              // is not a valid python, identifier. Looks like:
-              //    __annotations__["0"] = Tensor
-              const auto lhs = Subscript(assign.lhs());
+            } else if (name == "__annotations__") {
+              // This is to initialize the annotations dict, just ignore.
+              continue;
+            } else if (name == "__buffers__") {
               TORCH_INTERNAL_ASSERT(
-                  Var(lhs.value()).name().name() == "__annotations__");
-              TORCH_INTERNAL_ASSERT(lhs.subscript_exprs().size() == 1);
-              attributes.push_back(assign);
-            } break;
-            default: {
+                  is_module, "Buffers only exist on modules at the moment");
+              const auto buffer_list = ListLiteral(assign.rhs().get()).inputs();
+              for (const auto& buffer : buffer_list) {
+                buffer_names.insert(StringLiteral(buffer).text());
+              }
+            } else if (name == "__forward_pre_hooks__") {
               TORCH_INTERNAL_ASSERT(
-                  false,
-                  "Unexpected statement kind in module metadata: ",
-                  kindToString(statement.kind()));
+                  is_module,
+                  "Forward pre hooks only exist on modules at the moment");
+              const auto pre_hook_list =
+                  ListLiteral(assign.rhs().get()).inputs();
+              for (const auto& pre_hook : pre_hook_list) {
+                std::string pre_hook_name = StringLiteral(pre_hook).text();
+                pre_hook_names.insert(pre_hook_name);
+                pre_hooks_order.emplace_back(pre_hook_name);
+              }
+            } else if (name == "__forward_hooks__") {
+              TORCH_INTERNAL_ASSERT(
+                  is_module,
+                  "Forward hooks only exist on modules at the moment");
+              const auto hook_list = ListLiteral(assign.rhs().get()).inputs();
+              for (const auto& hook : hook_list) {
+                std::string hook_name = StringLiteral(hook).text();
+                hook_names.insert(hook_name);
+                hooks_order.emplace_back(hook_name);
+              }
+            } else {
+              if (auto fixed_up = attributeAssignmentSpecialHandlingHack(
+                      qualified_classname, assign)) {
+                attributes.push_back(std::move(*fixed_up));
+              } else if (assign.rhs().present()) {
+                // This is a constant assignment, of the form:
+                // foo : Final[int] = 3
+                constants.push_back(assign);
+              } else {
+                // This is a regular attribute assignment, of the form:
+                // foo : Tensor
+                attributes.push_back(assign);
+              }
             }
+          } break;
+          case TK_SUBSCRIPT: {
+            // This is a special attribute assignment where the attribute
+            // is not a valid python, identifier. Looks like:
+            //    __annotations__["0"] = Tensor
+            const auto lhs = Subscript(assign.lhs());
+            TORCH_INTERNAL_ASSERT(
+                Var(lhs.value()).name().name() == "__annotations__");
+            TORCH_INTERNAL_ASSERT(lhs.subscript_exprs().size() == 1);
+            attributes.push_back(assign);
+          } break;
+          default: {
+            TORCH_INTERNAL_ASSERT(
+                false,
+                "Unexpected statement kind in module metadata: ",
+                kindToString(statement.kind()));
           }
-        } break;
-        case TK_DEF: {
-          Def def = Def(statement);
-          if (pre_hook_names.find(def.name().name()) != pre_hook_names.end()) {
-            pre_hook_def_map.emplace(def.name().name(), def);
-            pre_hook_resolver_map.emplace(
-                def.name().name(), shared_from_this());
-          } else if (hook_names.find(def.name().name()) != hook_names.end()) {
-            hook_def_map.emplace(def.name().name(), def);
-            hook_resolver_map.emplace(def.name().name(), shared_from_this());
-          } else {
-            methods.emplace_back(def);
-            method_resolvers.push_back(shared_from_this());
-          }
-        } break;
-        default: {
-          TORCH_INTERNAL_ASSERT(
-              false,
-              "Unexpected statement kind in class body: ",
-              kindToString(statement.kind()));
         }
-      }
-    }
-
-    // Populate class attributes
-    ScriptTypeParser type_parser(shared_from_this());
-    for (const auto& assign : attributes) {
-      switch (assign.lhs().kind()) {
-        case TK_VAR: {
-          const auto name = Var(assign.lhs()).name().name();
-          TORCH_INTERNAL_ASSERT(name != "__parameters__");
-          const auto type = type_parser.parseTypeFromExpr(assign.type().get());
-          const bool is_parameter = parameter_names.count(name);
-          const bool is_buffer = buffer_names.count(name);
-          class_type->addAttribute(name, type, is_parameter, is_buffer);
-        } break;
-        case TK_SUBSCRIPT: {
-          const auto name =
-              StringLiteral(Subscript(assign.lhs()).subscript_exprs()[0])
-                  .text();
-          const auto type = type_parser.parseTypeFromExpr(assign.rhs().get());
-          const bool is_parameter = parameter_names.count(name);
-          const bool is_buffer = buffer_names.count(name);
-          class_type->addAttribute(name, type, is_parameter, is_buffer);
+      } break;
+      case TK_DEF: {
+        Def def = Def(statement);
+        if (pre_hook_names.find(def.name().name()) != pre_hook_names.end()) {
+          pre_hook_def_map.emplace(def.name().name(), def);
+          pre_hook_resolver_map.emplace(def.name().name(), shared_from_this());
+        } else if (hook_names.find(def.name().name()) != hook_names.end()) {
+          hook_def_map.emplace(def.name().name(), def);
+          hook_resolver_map.emplace(def.name().name(), shared_from_this());
+        } else {
+          methods.emplace_back(def);
+          method_resolvers.push_back(shared_from_this());
         }
+      } break;
+      default: {
+        TORCH_INTERNAL_ASSERT(
+            false,
+            "Unexpected statement kind in class body: ",
+            kindToString(statement.kind()));
       }
     }
+  }
 
-    // Populate class constants
-    for (const auto& assign : constants) {
-      auto const_val = type_parser.parseClassConstant(assign);
-      const auto name = Var(assign.lhs()).name().name();
-      class_type->addConstant(name, const_val);
+  // Populate class attributes
+  ScriptTypeParser type_parser(shared_from_this());
+  for (const auto& assign : attributes) {
+    switch (assign.lhs().kind()) {
+      case TK_VAR: {
+        const auto name = Var(assign.lhs()).name().name();
+        TORCH_INTERNAL_ASSERT(name != "__parameters__");
+        const auto type = type_parser.parseTypeFromExpr(assign.type().get());
+        const bool is_parameter = parameter_names.count(name);
+        const bool is_buffer = buffer_names.count(name);
+        class_type->addAttribute(name, type, is_parameter, is_buffer);
+      } break;
+      case TK_SUBSCRIPT: {
+        const auto name =
+            StringLiteral(Subscript(assign.lhs()).subscript_exprs()[0]).text();
+        const auto type = type_parser.parseTypeFromExpr(assign.rhs().get());
+        const bool is_parameter = parameter_names.count(name);
+        const bool is_buffer = buffer_names.count(name);
+        class_type->addAttribute(name, type, is_parameter, is_buffer);
+      }
     }
+  }
 
-    // build pre hook and hook def/resolver pairs
-    // pairs are dedupped in ir_emitter.cpp's CompilationUnit::define_hooks()
-    // ordering here is call order for hooks
-    std::vector<Def> hooks;
-    std::vector<ResolverPtr> hook_resolvers;
-    for (const std::string& hook_name : hooks_order) {
-      hooks.emplace_back(hook_def_map.find(hook_name)->second);
-      hook_resolvers.push_back(hook_resolver_map.find(hook_name)->second);
-    }
-    std::vector<Def> pre_hooks;
-    std::vector<ResolverPtr> pre_hook_resolvers;
-    for (const std::string& pre_hook_name : pre_hooks_order) {
-      pre_hooks.emplace_back(pre_hook_def_map.find(pre_hook_name)->second);
-      pre_hook_resolvers.push_back(
-          pre_hook_resolver_map.find(pre_hook_name)->second);
-    }
+  // Populate class constants
+  for (const auto& assign : constants) {
+    auto const_val = type_parser.parseClassConstant(assign);
+    const auto name = Var(assign.lhs()).name().name();
+    class_type->addConstant(name, const_val);
+  }
 
-    cu_->register_type(class_type);
-    const auto self = SimpleSelf(class_type);
-    cu_->define(
-        qualified_classname,
-        /*properties=*/{},
-        /*propResolvers=*/{},
-        methods,
-        method_resolvers,
-        &self);
-    cu_->define_hooks(
-        qualified_classname,
-        hooks,
-        hook_resolvers,
-        pre_hooks,
-        pre_hook_resolvers,
-        &self);
-  }
-
-  void importEnum(
-      const QualifiedName& qualified_name,
-      const ClassDef& enum_def) {
-    std::vector<at::EnumNameValue> names_values;
-
-    TypePtr value_type = nullptr;
-    auto set_or_check_type = [&value_type](
-                                 const TypePtr& t, const SourceRange& loc) {
-      if (!value_type) {
-        value_type = t;
-      } else if (value_type != t) {
-        throw ErrorReport(loc)
-            << "Enum class with varying value types are not supported.";
-      }
-    };
+  // build pre hook and hook def/resolver pairs
+  // pairs are dedupped in ir_emitter.cpp's CompilationUnit::define_hooks()
+  // ordering here is call order for hooks
+  std::vector<Def> hooks;
+  std::vector<ResolverPtr> hook_resolvers;
+  for (const std::string& hook_name : hooks_order) {
+    hooks.emplace_back(hook_def_map.find(hook_name)->second);
+    hook_resolvers.push_back(hook_resolver_map.find(hook_name)->second);
+  }
+  std::vector<Def> pre_hooks;
+  std::vector<ResolverPtr> pre_hook_resolvers;
+  for (const std::string& pre_hook_name : pre_hooks_order) {
+    pre_hooks.emplace_back(pre_hook_def_map.find(pre_hook_name)->second);
+    pre_hook_resolvers.push_back(
+        pre_hook_resolver_map.find(pre_hook_name)->second);
+  }
 
-    for (const auto& statement : enum_def.body()) {
-      if (statement.kind() != TK_ASSIGN) {
-        throw ErrorReport(statement.range())
-            << "Unexpected statement in Enum class body: "
-               "only enum attribute definitions are currently supported.";
-      }
+  cu_->register_type(class_type);
+  const auto self = SimpleSelf(class_type);
+  cu_->define(
+      qualified_classname,
+      /*properties=*/{},
+      /*propResolvers=*/{},
+      methods,
+      method_resolvers,
+      &self);
+  cu_->define_hooks(
+      qualified_classname,
+      hooks,
+      hook_resolvers,
+      pre_hooks,
+      pre_hook_resolvers,
+      &self);
+}
 
-      const auto assign = Assign(statement);
-      const auto name = Var(assign.lhs()).name().name();
-
-      IValue ivalue;
-      auto rhs = assign.rhs().get();
-      switch (rhs.kind()) {
-        case TK_STRINGLITERAL:
-          ivalue = IValue(StringLiteral(rhs).text());
-          set_or_check_type(StringType::get(), statement.range());
-          break;
-        case TK_CONST: {
-          auto numeric_const = Const(rhs);
-          if (numeric_const.isFloatingPoint()) {
-            ivalue = IValue(numeric_const.asFloatingPoint());
-            set_or_check_type(FloatType::get(), statement.range());
-          } else if (numeric_const.isIntegral()) {
-            ivalue = IValue(numeric_const.asIntegral());
-            set_or_check_type(IntType::get(), statement.range());
-          }
-          break;
-        }
-        default:
-          throw ErrorReport(rhs.range())
-              << "Unsupported enum value type: " << rhs.kind()
-              << ". Only Integers, Floats and Strings are supported.";
-      }
+void SourceImporterImpl::importEnum(
+    const QualifiedName& qualified_name,
+    const ClassDef& enum_def) {
+  std::vector<at::EnumNameValue> names_values;
 
-      names_values.emplace_back(std::make_pair(name, ivalue));
+  TypePtr value_type = nullptr;
+  auto set_or_check_type = [&value_type](
+                               const TypePtr& t, const SourceRange& loc) {
+    if (!value_type) {
+      value_type = t;
+    } else if (value_type != t) {
+      throw ErrorReport(loc)
+          << "Enum class with varying value types are not supported.";
     }
+  };
 
-    if (!value_type) {
-      throw ErrorReport(enum_def.range())
-          << "No enum values defined for " << qualified_name.qualifiedName();
+  for (const auto& statement : enum_def.body()) {
+    if (statement.kind() != TK_ASSIGN) {
+      throw ErrorReport(statement.range())
+          << "Unexpected statement in Enum class body: "
+             "only enum attribute definitions are currently supported.";
     }
 
-    auto enum_type = EnumType::create(
-        qualified_name, std::move(value_type), std::move(names_values), cu_);
-    cu_->register_type(enum_type);
-  }
-
-  void importNamedTuple(
-      const QualifiedName& qualified_name,
-      const ClassDef& named_tuple_def) {
-    ScriptTypeParser type_parser(shared_from_this());
-    std::vector<std::string> field_names;
-    std::vector<TypePtr> field_types;
-    std::vector<IValue> field_defaults;
-    for (const auto& statement : named_tuple_def.body()) {
-      if (statement.kind() != TK_ASSIGN) {
-        throw ErrorReport(statement.range())
-            << "Unexpected statement in NamedTuple body: "
-               "only attribute annotations are currently supported.";
-      }
-      const auto assign = Assign(statement);
-
-      auto name = Var(Assign(statement).lhs()).name().name();
-      c10::optional<IValue> default_val;
-      if (assign.rhs().present()) {
-        std::vector<IValue> parsed = type_parser.evaluateDefaults(
-            assign.rhs().range(), {assign.rhs().get()}, {assign.type().get()});
-        TORCH_INTERNAL_ASSERT(parsed.size() == 1);
-        default_val = parsed[0];
+    const auto assign = Assign(statement);
+    const auto name = Var(assign.lhs()).name().name();
+
+    IValue ivalue;
+    auto rhs = assign.rhs().get();
+    switch (rhs.kind()) {
+      case TK_STRINGLITERAL:
+        ivalue = IValue(StringLiteral(rhs).text());
+        set_or_check_type(StringType::get(), statement.range());
+        break;
+      case TK_CONST: {
+        auto numeric_const = Const(rhs);
+        if (numeric_const.isFloatingPoint()) {
+          ivalue = IValue(numeric_const.asFloatingPoint());
+          set_or_check_type(FloatType::get(), statement.range());
+        } else if (numeric_const.isIntegral()) {
+          ivalue = IValue(numeric_const.asIntegral());
+          set_or_check_type(IntType::get(), statement.range());
+        }
+        break;
       }
+      default:
+        throw ErrorReport(rhs.range())
+            << "Unsupported enum value type: " << rhs.kind()
+            << ". Only Integers, Floats and Strings are supported.";
+    }
 
-      auto type = type_parser.parseTypeFromExpr(assign.type().get());
+    names_values.emplace_back(std::make_pair(name, ivalue));
+  }
 
-      field_names.emplace_back(std::move(name));
-      field_types.emplace_back(std::move(type));
-      if (default_val) {
-        field_defaults.emplace_back(std::move(*default_val));
-      }
+  if (!value_type) {
+    throw ErrorReport(enum_def.range())
+        << "No enum values defined for " << qualified_name.qualifiedName();
+  }
+
+  auto enum_type = EnumType::create(
+      qualified_name, std::move(value_type), std::move(names_values), cu_);
+  cu_->register_type(enum_type);
+}
+
+void SourceImporterImpl::importNamedTuple(
+    const QualifiedName& qualified_name,
+    const ClassDef& named_tuple_def) {
+  ScriptTypeParser type_parser(shared_from_this());
+  std::vector<std::string> field_names;
+  std::vector<TypePtr> field_types;
+  std::vector<IValue> field_defaults;
+  for (const auto& statement : named_tuple_def.body()) {
+    if (statement.kind() != TK_ASSIGN) {
+      throw ErrorReport(statement.range())
+          << "Unexpected statement in NamedTuple body: "
+             "only attribute annotations are currently supported.";
+    }
+    const auto assign = Assign(statement);
+
+    auto name = Var(Assign(statement).lhs()).name().name();
+    c10::optional<IValue> default_val;
+    if (assign.rhs().present()) {
+      std::vector<IValue> parsed = type_parser.evaluateDefaults(
+          assign.rhs().range(), {assign.rhs().get()}, {assign.type().get()});
+      TORCH_INTERNAL_ASSERT(parsed.size() == 1);
+      default_val = parsed[0];
     }
 
-    auto tt = TupleType::createNamed(
-        qualified_name, field_names, field_types, field_defaults);
-    cu_->register_type(tt);
-  }
+    auto type = type_parser.parseTypeFromExpr(assign.type().get());
 
-  void parsePossibleVersionNumber(Lexer& L) {
-    // Older versions of serialization produced an op_version_set string
-    // per-file We now just use a single version which is handled by
-    // PyTorchStreamReader. We used to check if op_version_set was _newer_ for
-    // forward compatibility reasons but now that it doesn't exist there can't
-    // be a newer one, so we just discard this.
-    if (L.cur().kind == TK_IDENT && L.cur().text() == "op_version_set") {
-      auto range = L.cur().range;
-      L.next();
-      L.expect('=');
-      std::string version_text = L.expect(TK_NUMBER).text();
-      L.expect(TK_NEWLINE);
+    field_names.emplace_back(std::move(name));
+    field_types.emplace_back(std::move(type));
+    if (default_val) {
+      field_defaults.emplace_back(std::move(*default_val));
     }
   }
 
-  // older versions of serialization required import statements,
-  // and defined classes file-at-a-time in import order.
-  // The problem is that in Python
-  // it is possible to construct cyclic dependencies between files even
-  // when there are none between individual classes. New versions of loading
-  // just compile class-at-a-time, so we no longer need to follow the import
-  // order. Future serialization may stop producing the import code.
-  void parseImports(Lexer& L) {
-    while (L.nextIf(TK_IMPORT)) {
-      std::ostringstream s;
-      while (L.cur().kind != TK_NEWLINE) {
-        s << L.cur().text();
-        L.next();
-      }
-      L.expect(TK_NEWLINE);
-    }
+  auto tt = TupleType::createNamed(
+      qualified_name, field_names, field_types, field_defaults);
+  cu_->register_type(tt);
+}
+
+void SourceImporterImpl::parsePossibleVersionNumber(Lexer& L) {
+  // Older versions of serialization produced an op_version_set string
+  // per-file We now just use a single version which is handled by
+  // PyTorchStreamReader. We used to check if op_version_set was _newer_ for
+  // forward compatibility reasons but now that it doesn't exist there can't
+  // be a newer one, so we just discard this.
+  if (L.cur().kind == TK_IDENT && L.cur().text() == "op_version_set") {
+    auto range = L.cur().range;
+    L.next();
+    L.expect('=');
+    std::string version_text = L.expect(TK_NUMBER).text();
+    L.expect(TK_NEWLINE);
   }
+}
 
-  std::shared_ptr<CompilationUnit> cu_;
-  std::unordered_map<std::string, std::shared_ptr<SugaredValue>> env_;
-  SourceLoader source_loader_;
-  std::unordered_set<std::string> loaded_sources_;
-  // named types and functions loaded from a file but not yet defined because
-  // their type has not been requested yet.
-  std::unordered_map<QualifiedName, TreeRef> to_be_defined_;
-};
+// older versions of serialization required import statements,
+// and defined classes file-at-a-time in import order.
+// The problem is that in Python
+// it is possible to construct cyclic dependencies between files even
+// when there are none between individual classes. New versions of loading
+// just compile class-at-a-time, so we no longer need to follow the import
+// order. Future serialization may stop producing the import code.
+void SourceImporterImpl::parseImports(Lexer& L) {
+  while (L.nextIf(TK_IMPORT)) {
+    std::ostringstream s;
+    while (L.cur().kind != TK_NEWLINE) {
+      s << L.cur().text();
+      L.next();
+    }
+    L.expect(TK_NEWLINE);
+  }
+}
 
 std::shared_ptr<SugaredValue> ClassNamespaceValue::attr(
     const SourceRange& loc,
diff --git a/torch/csrc/jit/serialization/import_source.h b/torch/csrc/jit/serialization/import_source.h
index e87ab59271594..f52f38afe6b15 100644
--- a/torch/csrc/jit/serialization/import_source.h
+++ b/torch/csrc/jit/serialization/import_source.h
@@ -1,22 +1,79 @@
 #pragma once
 
+#include <ATen/core/ivalue_inl.h>
+#include <ATen/core/qualified_name.h>
 #include <torch/csrc/jit/api/module.h>
+#include <torch/csrc/jit/frontend/parser.h>
+#include <torch/csrc/jit/frontend/resolver.h>
+#include <torch/csrc/jit/frontend/script_type_parser.h>
 #include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/serialization/export.h>
+#include <torch/custom_class.h>
 #include <functional>
 #include <memory>
+#include <regex>
 #include <string>
 #include <vector>
 
 namespace torch {
 namespace jit {
 
-struct SourceImporterImpl;
+using SourceLoader = std::function<std::shared_ptr<Source>(const std::string&)>;
+
+struct SourceImporterImpl : public Resolver,
+                            std::enable_shared_from_this<SourceImporterImpl> {
+  SourceImporterImpl(
+      std::shared_ptr<CompilationUnit> cu,
+      const std::vector<at::IValue>* constant_table,
+      SourceLoader source_loader,
+      size_t version);
+  TypePtr findNamedType(const QualifiedName& name);
+  Function* findFunction(const QualifiedName& name);
+  void parseSourceIfNeeded(const std::string& qualifier);
+  void LEGACY_import_methods(
+      const Module& mod,
+      const std::shared_ptr<Source>& src);
+
+  std::shared_ptr<SugaredValue> resolveValue(
+      const std::string& name,
+      Function& m,
+      const SourceRange& loc) override;
+  TypePtr resolveType(const std::string& name, const SourceRange& loc) override;
+
+ private:
+  void importFunction(const std::string& qualifier, const Def& def);
+  void importNamedType(const std::string& qualifier, const ClassDef& class_def);
+  c10::optional<Assign> attributeAssignmentSpecialHandlingHack(
+      const QualifiedName& qualified_classname,
+      const Assign& assign);
+  void importClass(
+      const QualifiedName& qualified_classname,
+      const ClassDef& class_def,
+      bool is_module);
+  void importEnum(
+      const QualifiedName& qualified_name,
+      const ClassDef& enum_def);
+  void importNamedTuple(
+      const QualifiedName& qualified_name,
+      const ClassDef& named_tuple_def);
+
+  void parsePossibleVersionNumber(Lexer& L);
+
+  void parseImports(Lexer& L);
+
+  std::shared_ptr<CompilationUnit> cu_;
+  std::unordered_map<std::string, std::shared_ptr<SugaredValue>> env_;
+  SourceLoader source_loader_;
+  std::unordered_set<std::string> loaded_sources_;
+  // named types and functions loaded from a file but not yet defined because
+  // their type has not been requested yet.
+  std::unordered_map<QualifiedName, TreeRef> to_be_defined_;
+};
 
 // Given a directory of serialized TorchScript sources,
 // This class allows the loading of individual named types in source.
 // Resolves the dependencies between source files and parses
 // the source files as necessary.
-using SourceLoader = std::function<std::shared_ptr<Source>(const std::string&)>;
 
 struct TORCH_API SourceImporter {
   SourceImporter(

From b2c7c1dfcf9c366ecef5db635b201954981c609f Mon Sep 17 00:00:00 2001
From: Vasiliy Kuznetsov <vasiliy@fb.com>
Date: Thu, 2 Sep 2021 06:12:07 -0700
Subject: [PATCH 466/530] fix copy.deepcopy on LinearPackedParams (#64367)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64367

This is the same thing as https://github.com/pytorch/pytorch/pull/56154
but for quantized linear. It fixes the behavior of `copy.deepcopy` on
these modules. Before this PR, copied instances of `LinearPackedParams`
were not properly initialized, and inspecting them raised errors of
missing `_modules`. After this PR, inspecting and using the copies
works.

Test Plan:
```
python test/test_quantization.py TestStaticQuantizedModule.test_linear_api
```

Imported from OSS

Reviewed By: jerryzh168

Differential Revision: D30702667

fbshipit-source-id: 38c26d1e72663416eeb989985b77ffc2052c12b9
---
 .../core/test_quantized_module.py             | 29 +++++++++++++++----
 torch/nn/quantized/modules/linear.py          | 10 +++++++
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py
index b0bc78294d9b5..51e62174cc081 100644
--- a/test/quantization/core/test_quantized_module.py
+++ b/test/quantization/core/test_quantized_module.py
@@ -103,8 +103,7 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias,
         zero_point = 3
         qlinear = class_map[use_fused](in_features, out_features)
 
-        qlinear_copy = qlinear  # deepcopy does not work right now
-        # qlinear_copy = copy.deepcopy(qlinear)
+        qlinear_copy = copy.deepcopy(qlinear)
         self.checkScriptable(qlinear_copy, [[X_q]], check_save_load=True)
         # Run module with default-initialized parameters.
         # This tests that the constructor is correct.
@@ -155,15 +154,16 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias,
                          linear_unpack(loaded_qlinear._packed_params._packed_params))
         self.assertEqual(qlinear.scale, loaded_qlinear.scale)
         self.assertEqual(qlinear.zero_point, loaded_qlinear.zero_point)
-        # make sure loaded_qlinear has the same dir as qlinear since
-        # scripting the module will add __overloads__ to __dict__
-        self.checkScriptable(loaded_qlinear, [[X_q]], check_save_load=True)
+        # scripting will add __overloads__ to __dict__, which is why we script a copy
+        # to be able to do the check in the next line
+        self.checkScriptable(copy.deepcopy(loaded_qlinear), [[X_q]], check_save_load=True)
         self.assertTrue(dir(qlinear) == dir(loaded_qlinear))
         self.assertEqual(qlinear._weight_bias(), loaded_qlinear._weight_bias())
         self.assertEqual(qlinear._weight_bias(), torch.ops.quantized.linear_unpack(qlinear._packed_params._packed_params))
         Z_q2 = loaded_qlinear(X_q)
         self.assertEqual(Z_q, Z_q2)
 
+        # Test serialization
         b = io.BytesIO()
         torch.save(qlinear, b)
         b.seek(0)
@@ -172,6 +172,25 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias,
         self.assertEqual(qlinear.scale, loaded.scale)
         self.assertEqual(qlinear.zero_point, loaded.zero_point)
 
+        # Test copy and deepcopy
+        copied_linear = copy.copy(qlinear)
+        self.assertEqual(copied_linear.bias(), qlinear.bias())
+        self.assertEqual(copied_linear.scale, qlinear.scale)
+        self.assertEqual(copied_linear.zero_point,
+                         qlinear.zero_point)
+        Y_copied = copied_linear(X_q)
+        np.testing.assert_array_almost_equal(
+            Z_q.int_repr().numpy(), Y_copied.int_repr().numpy(), decimal=0)
+
+        deepcopied_linear = copy.deepcopy(qlinear)
+        self.assertEqual(deepcopied_linear.bias(), qlinear.bias())
+        self.assertEqual(deepcopied_linear.scale, qlinear.scale)
+        self.assertEqual(deepcopied_linear.zero_point,
+                         qlinear.zero_point)
+        Y_deepcopied = copied_linear(X_q)
+        np.testing.assert_array_almost_equal(
+            Z_q.int_repr().numpy(), Y_deepcopied.int_repr().numpy(), decimal=0)
+
         # Test JIT
         self.checkScriptable(qlinear, [[X_q]], check_save_load=True)
 
diff --git a/torch/nn/quantized/modules/linear.py b/torch/nn/quantized/modules/linear.py
index 4abd2115e4125..4df775105ba82 100644
--- a/torch/nn/quantized/modules/linear.py
+++ b/torch/nn/quantized/modules/linear.py
@@ -94,6 +94,16 @@ def __setstate__(self, state):
         self.set_weight_bias(state[0], state[1])
         self.training = state[2]
 
+    def __deepcopy__(self, memo):
+        new_instance = type(self).__new__(type(self))
+        torch.nn.Module.__init__(new_instance)
+        state = self.__getstate__()
+        new_instance.__setstate__(state)
+        return new_instance
+
+    def __copy__(self):
+        return self.__deepcopy__({})
+
     def __repr__(self):
         return self._weight_bias().__repr__()
 

From b737629ff0d4dd82f246b0efa6aef53f15971e78 Mon Sep 17 00:00:00 2001
From: Michael Dagitses <mikeyd@fb.com>
Date: Thu, 2 Sep 2021 06:49:09 -0700
Subject: [PATCH 467/530] simplify op name determination into a single forward
 pass (#64261)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64261

Note that this does not preserve byte-for-byte compatibility with
existing names.

Test Plan:
* Rely on CI to catch gross errors.
* Merge after release cut to catch subtle issues.

Reviewed By: albanD

Differential Revision: D30700647

Pulled By: dagitses

fbshipit-source-id: 7b02f34b8fae3041240cc78fbc6bcae498c3acd4
---
 ...tAutograd.test_function-x_grad_desc.expect |  2 +-
 ...tAutograd.test_function-y_grad_desc.expect |  2 +-
 test/test_autograd.py                         |  4 +-
 test/test_cuda.py                             |  2 +-
 tools/autograd/load_derivatives.py            | 82 +++++++------------
 torch/csrc/autograd/variable.cpp              |  6 +-
 6 files changed, 38 insertions(+), 60 deletions(-)

diff --git a/test/expect/TestAutograd.test_function-x_grad_desc.expect b/test/expect/TestAutograd.test_function-x_grad_desc.expect
index b6fdb63db272a..68242e2ffae90 100644
--- a/test/expect/TestAutograd.test_function-x_grad_desc.expect
+++ b/test/expect/TestAutograd.test_function-x_grad_desc.expect
@@ -1 +1 @@
-CopyBackwards(None, AddBackward0(ExpandBackward(AccumulateGrad()), MulBackward0(ExpandBackward(AccumulateGrad()), AccumulateGrad())))
\ No newline at end of file
+CopyBackwards(None, AddBackward0(ExpandBackward0(AccumulateGrad()), MulBackward0(ExpandBackward0(AccumulateGrad()), AccumulateGrad())))
\ No newline at end of file
diff --git a/test/expect/TestAutograd.test_function-y_grad_desc.expect b/test/expect/TestAutograd.test_function-y_grad_desc.expect
index e32d5888e1e7a..88db87320a92e 100644
--- a/test/expect/TestAutograd.test_function-y_grad_desc.expect
+++ b/test/expect/TestAutograd.test_function-y_grad_desc.expect
@@ -1 +1 @@
-CopyBackwards(None, AddBackward0(MulBackward0(ExpandBackward(AccumulateGrad()), None), MulBackward0(ExpandBackward(AccumulateGrad()), AccumulateGrad())))
\ No newline at end of file
+CopyBackwards(None, AddBackward0(MulBackward0(ExpandBackward0(AccumulateGrad()), None), MulBackward0(ExpandBackward0(AccumulateGrad()), AccumulateGrad())))
\ No newline at end of file
diff --git a/test/test_autograd.py b/test/test_autograd.py
index fde64b0b062d3..2da74cbd01938 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -3450,7 +3450,7 @@ def test_inplace_on_view_backward(self):
         gradient_penalty.backward()
 
         fn = gradient_penalty.grad_fn.next_functions[0][0].next_functions[1][0]
-        self.assertEqual(fn.name(), "ThresholdBackwardBackward")
+        self.assertEqual(fn.name(), "ThresholdBackwardBackward0")
 
     def test_inplace_on_view_weak_grad_fn(self):
         # Issue 23502: Test that b's grad_fn is preserved.
@@ -4859,7 +4859,7 @@ def maybe_check_raise(fn, should_raise):
         # The 3 elements are for view_as, first output of unbind and second output of unbind
         run_test(grad_mode=True, requires_grad=False, is_view=True,
                  should_raise_tuple=(None, None, None))
-        inp_change_err = "Output {} of UnbindBackward is a view and is being modified inplace."
+        inp_change_err = "Output {} of UnbindBackward0 is a view and is being modified inplace."
         run_test(grad_mode=True, requires_grad=True, is_view=True,
                  should_raise_tuple=(None, inp_change_err.format("0"), inp_change_err.format("1")))
         leaf_grad_err = "A view was created in no_grad mode and is being modified inplace"
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 33dbade7380b8..cddd15a7670e9 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -3049,7 +3049,7 @@ def test_autocast_rnn(self):
                 # Autocast wrapper requires at::_cudnn_rnn is autograd-exposed.  This check can't guarantee
                 # at::_cudnn_rnn is autograd-exposed, but if it fires, it indicates some funny business has
                 # occurred and we should double check that at::_cudnn_rnn remains autograd-exposed.
-                self.assertEqual(out.grad_fn.name(), "CudnnRnnBackward")
+                self.assertEqual(out.grad_fn.name(), "CudnnRnnBackward0")
                 out.sum().backward()
                 grads = [p.grad.clone() for p in rnn.parameters()]
 
diff --git a/tools/autograd/load_derivatives.py b/tools/autograd/load_derivatives.py
index 3ff11f4d18691..8a5904b732918 100644
--- a/tools/autograd/load_derivatives.py
+++ b/tools/autograd/load_derivatives.py
@@ -2,9 +2,9 @@
 #
 # Each autograd function is represented by `DifferentiabilityInfo` containing
 # a list of `Derivative`. See `tools.codegen.api.autograd` for the data models.
-from collections import defaultdict, Counter
+from collections import defaultdict
 import re
-from typing import Sequence, Any, Tuple, List, Set, Dict, Match, Optional
+from typing import Counter, Sequence, Any, Tuple, List, Set, Dict, Match, Optional
 import yaml
 
 from tools.codegen.api.autograd import (Derivative, DifferentiabilityInfo,
@@ -43,32 +43,15 @@ def load_derivatives(derivatives_yaml_path: str, native_yaml_path: str) -> Seque
             assert str(function.func) not in functions_by_schema
             functions_by_schema[str(function.func)] = function
 
+        # Keep track of how many of which ops we've seen so we can
+        # disambiguate them with a numeric suffix.
+        op_counter = Counter[str]()
+
         infos = [
-            create_differentiability_info(defn, functions_by_signature, functions_by_schema)
+            create_differentiability_info(defn, functions_by_signature, functions_by_schema, op_counter)
             for defn in definitions]
 
-        # To keep it byte-for-byte compatible with the old codegen, we assign op names as a separate
-        # step. We only assign op names to those with differentiable args, and only append suffix to
-        # duplicated op names. This can be simplified if the first of the duplicates can be named
-        # 'XyzBackward' instead of 'XyzBackward0' or unconditionally append '0' to singletons.
-        op_names = create_op_names(infos)
-        res = [
-            DifferentiabilityInfo(
-                name=info.name,
-                func=info.func,
-                op=op_name,
-                derivatives=info.derivatives,
-                forward_derivatives=info.forward_derivatives,
-                all_saved_inputs=info.all_saved_inputs,
-                all_saved_outputs=info.all_saved_outputs,
-                args_with_derivatives=info.args_with_derivatives,
-                non_differentiable_arg_names=info.non_differentiable_arg_names,
-                output_differentiability=info.output_differentiability,
-                output_differentiability_conditions=info.output_differentiability_conditions,
-            )
-            for info, op_name in zip(infos, op_names)]
-
-        _GLOBAL_LOAD_DERIVATIVE_CACHE[key] = res
+        _GLOBAL_LOAD_DERIVATIVE_CACHE[key] = infos
 
     return _GLOBAL_LOAD_DERIVATIVE_CACHE[key]
 
@@ -279,6 +262,7 @@ def create_differentiability_info(
     defn: Dict[Any, Any],
     functions_by_signature: Dict[FunctionSchema, List[NativeFunction]],
     functions_by_schema: Dict[str, NativeFunction],
+    op_counter: Counter[str],
 ) -> DifferentiabilityInfo:
     """Processes a single entry `defn` in derivatives.yaml"""
 
@@ -424,10 +408,17 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[
 
     derivatives, forward_derivatives, args_with_derivatives, non_differentiable_arg_names = set_up_derivatives(canonical)
 
+    # only assign an op name if we are actually going to calculate a derivative
+    op = None
+    if args_with_derivatives:
+        op_prefix = _create_op_prefix(defn_name)
+        op = f'{op_prefix}{op_counter[op_prefix]}'
+        op_counter[op_prefix] += 1
+
     return DifferentiabilityInfo(
         name=defn_name,
         func=canonical,
-        op=None,
+        op=op,
         derivatives=derivatives,
         forward_derivatives=forward_derivatives,
         all_saved_inputs=dedup_vars([v for d in derivatives for v in d.saved_inputs]),
@@ -566,35 +557,22 @@ def repl(m: Match[str]) -> str:
 
     return formula, tuple(saved)
 
-def create_op_name(info: DifferentiabilityInfo) -> Optional[str]:
-    # only assign an op name if we are actually going to calculate a derivative
-    if not info.args_with_derivatives:
-        return None
-    name = info.name
+def _create_op_prefix(name: str) -> str:
+    """Takes a native function name converts to a op prefix name.
+
+    Note that the "name" parameter must be the native function name
+    without the optional variant suffix, so "add" instead of
+    "add.out".
+
+    OP names correspond to classes, hence the change to title case.
+
+    Example::
+    >>> _create_op_prefix('add')
+    'AddBackward'
+    """
     camel_case = ''.join([p.title() for p in name.split('_')])
     return (camel_case + 'Backward').replace('ForwardBackward', 'Backward')
 
-def create_op_names(infos: Sequence[DifferentiabilityInfo]) -> Sequence[Optional[str]]:
-    names = list(map(create_op_name, infos))
-    dups = set(item for item, count in Counter(names).items() if count > 1)
-
-    # de-duplicate operation names
-    # you end up with something like:
-    #   AddBackward0
-    #   AddBackward1
-    # one for each overload
-    counter: Dict[str, int] = Counter()
-    dedup: List[Optional[str]] = []
-    for name in names:
-        if name is None:
-            # Keep a placeholder
-            dedup.append(None)
-        elif name in dups:
-            dedup.append(f'{name}{counter[name]}')
-            counter[name] += 1
-        else:
-            dedup.append(name)
-    return dedup
 
 def dedup_vars(vars: Sequence[SavedAttribute]) -> Sequence[SavedAttribute]:
     seen: Set[str] = set()
diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp
index 32af5f97ad4e4..7ae1ac0bdee8d 100644
--- a/torch/csrc/autograd/variable.cpp
+++ b/torch/csrc/autograd/variable.cpp
@@ -551,10 +551,10 @@ const std::shared_ptr<torch::autograd::Node>& VariableHooks::grad_fn(const Tenso
       //   self = view_op_n(view_n-1)
       //   self = inplace_op(self)
       //
-      // For CPU/CUDA backends, we employ one AsStridedBackward Node to represent the chain of
+      // For CPU/CUDA backends, we employ one AsStridedBackward0 Node to represent the chain of
       // view backward ops for effienciency.
       //
-      // However in XLA backend we don't have full support of AsStridedBackward, we instead run a full
+      // However in XLA backend we don't have full support of AsStridedBackward0, we instead run a full
       // forward pass with a tensor that requires gradient to get proper grad_fn setup,
       // then save it to DifferentiableViewMeta for future use.
       // This is fairly cheap for XLA lazy tensor approach (but would be really expensive for CPU/CUDA).
@@ -572,7 +572,7 @@ const std::shared_ptr<torch::autograd::Node>& VariableHooks::grad_fn(const Tenso
         auto diff_view = view_fn(view_info.base_);
         diff_view_meta->grad_fn_ = diff_view.grad_fn();
       } else {
-        auto fn = std::make_shared<torch::autograd::generated::AsStridedBackward>();
+        auto fn = std::make_shared<torch::autograd::generated::AsStridedBackward0>();
         fn->self_geometry = at::TensorGeometry(view_info.base_);
         fn->size = self.sizes().vec();
         fn->stride = self.strides().vec();

From f04e6594ed7d7657a059ef63e82e136aa2bbc0fd Mon Sep 17 00:00:00 2001
From: Seth Elliott <sethelliott@fb.com>
Date: Thu, 2 Sep 2021 07:48:47 -0700
Subject: [PATCH 468/530] Fix broken caffe2 test:
 PlanExecutorTest.BlockingErrorPlan (#64401)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64401

PlanExecutorTest.BlockingErrorPlan uses `ASSERT_DEATH` which internally performs a `fork()`. This can cause problems under certain configurations that use threads. This change updates this test to use the "threadsafe" style for GTest death tests in order to improve its quality in multithreaded environments.

Test Plan:
I confirmed that this change fixes the issue on my devvm with the following command:
```
buck test mode/dev //caffe2/caffe2:caffe2_test_cpu -- PlanExecutorTest.BlockingErrorPlan
```

Reviewed By: praihan

Differential Revision: D30709447

fbshipit-source-id: 12ffd9ad0371e2e5b43a9873c80568e5ab02d246
---
 caffe2/core/plan_executor_test.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/caffe2/core/plan_executor_test.cc b/caffe2/core/plan_executor_test.cc
index 6f0c237a8b086..7a54403805ecb 100644
--- a/caffe2/core/plan_executor_test.cc
+++ b/caffe2/core/plan_executor_test.cc
@@ -290,6 +290,8 @@ TEST(PlanExecutorTest, BlockingErrorPlan) {
 #endif
 #endif
 
+  testing::GTEST_FLAG(death_test_style) = "threadsafe";
+
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
   ASSERT_DEATH(
       [] {

From cd3be4675faddb1ddda5d7d2b7d163574de6f9eb Mon Sep 17 00:00:00 2001
From: CodemodService FBSourceClangFormatLinterBot <>
Date: Thu, 2 Sep 2021 08:10:37 -0700
Subject: [PATCH 469/530] [AutoAccept][Codemod][FBSourceClangFormatLinter]
 Daily `arc lint --take CLANGFORMAT`

Reviewed By: zertosh

Differential Revision: D30710635

fbshipit-source-id: e8dae05a7e3a19d656067a4f102aab4a3c93ac42
---
 torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h
index e69a76cddc8ed..22c732862620a 100644
--- a/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h
+++ b/torch/csrc/distributed/rpc/testing/faulty_tensorpipe_agent.h
@@ -53,8 +53,7 @@ class TORCH_API FaultyTensorPipeAgent : public TensorPipeAgent {
       const WorkerInfo& to,
       c10::intrusive_ptr<Message> message,
       const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout,
-      const DeviceMap& deviceMap = {})
-      override;
+      const DeviceMap& deviceMap = {}) override;
 
   // Add delay to writes
   void pipeWrite(

From 616fd9219da18bcfe69da8b0c3a96dd2c6298066 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Thu, 2 Sep 2021 08:12:48 -0700
Subject: [PATCH 470/530] [Static Runtime] Add sign/abs/lop1p/mul fusion pass
 (#64209)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64209

Add a new fusion pass that turns transforms the following pattern:
```
graph(%input):
    %0 : Tensor = aten::sign(%input)
    %1 : Tensor = aten::abs(%input)
    %2 : Tensor = aten::log1p(%1)
    %res : Tensor = aten::mul(%0, %2)
    return (%res)
```
Into a single op:
```
graph(%input):
    %res : Tensor = static_runtim::signed_log1p(%input)
    return (%res)
```

The intent is to reduce the number of passes over the tensor. However, enabling this pass actually causes a performance regression, probably due to a lack of vectorization in the fused implementation. Because of this issue, this diff **does not** enable this pass.

Followup: navahgar will add an NNC kernel which is faster than the the unfused version and enable this pass. We still need this version as a fallback since the NNC kernel will not support all dtypes.

Test Plan:
`buck test caffe2/benchmarks/static_runtime:static_runtime_cpptest -- SignedLog1p`

Test passed with new graph pass disabled and enabled.

Reviewed By: hlu1

Differential Revision: D30559929

fbshipit-source-id: e4e080cb2e6a705cfdde1fc98bee92b723f8132a
---
 benchmarks/static_runtime/test_scripts.h      | 11 ++++
 .../static_runtime/test_static_runtime.cc     |  8 +++
 torch/csrc/jit/runtime/static/ops.cpp         | 63 +++++++++++++++++++
 torch/csrc/jit/runtime/static/passes.cpp      | 22 +++++++
 torch/csrc/jit/runtime/static/passes.h        |  2 +
 5 files changed, 106 insertions(+)

diff --git a/benchmarks/static_runtime/test_scripts.h b/benchmarks/static_runtime/test_scripts.h
index 99b73db79f3d1..b17ddeda45dff 100644
--- a/benchmarks/static_runtime/test_scripts.h
+++ b/benchmarks/static_runtime/test_scripts.h
@@ -827,3 +827,14 @@ const auto cumsum_script_dtype = R"JIT(
    def forward(self, a: Tensor, dim: int, dtype: int):
       return torch.cumsum(a, dim, dtype=dtype).clone()
 )JIT";
+
+const std::string signed_log1p_script = R"IR(
+  graph(%input):
+      %0 : Tensor = aten::sign(%input)
+      %1 : Tensor = aten::abs(%input)
+      %2 : Tensor = aten::log1p(%1)
+      %3 : Tensor = aten::mul(%0, %2)
+      %none : NoneType = prim::Constant()
+      %res : Tensor = aten::clone(%3, %none)
+      return (%res)
+)IR";
diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 16941dab84760..5eb3dfe28bd84 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -1356,3 +1356,11 @@ TEST(StaticRuntime, IndividualOps_Nonzero) {
   auto b = at::randint(0, 2, {4, 3, 2});
   testStaticRuntime(nonzero_tensor, {a}, {b});
 }
+
+TEST(StaticRuntime, SignedLog1p) {
+  std::vector<IValue> args1 = {at::randn({2, 2})};
+  testStaticRuntime(signed_log1p_script, args1, {}, true);
+
+  std::vector<IValue> args2 = {at::randn({3, 3, 3})};
+  testStaticRuntime(signed_log1p_script, args1, args2, true);
+}
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 7ede15c524296..62f5bb28c1553 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1837,5 +1837,68 @@ REGISTER_OPERATOR_FUNCTOR(
         }
       };
     });
+
+namespace {
+
+// This template and its specialization help us avoid compiler warnings
+// about taking the absolute value of an unsigned type in signed_log1p
+template <class T>
+T abs_if_signed(T val) {
+  return std::abs(val);
+}
+
+template <>
+unsigned char abs_if_signed<unsigned char>(unsigned char val) {
+  return val;
+}
+
+// Computes f(x) = sign(x) * ln(|1 + x|) for each x in the input tensor
+void signed_log1p_out(at::Tensor& out, const at::Tensor& input) {
+  at::native::resize_(out, input.sizes(), c10::nullopt);
+
+  const auto input_contig = input.expect_contiguous();
+  auto output_contig = out.expect_contiguous();
+
+  AT_DISPATCH_ALL_TYPES(input.scalar_type(), "signed_log1p_kernel", [&]() {
+    const auto input_data = input_contig->data_ptr<scalar_t>();
+    auto output_data = output_contig->data_ptr<float>();
+    const auto N = input.numel();
+
+    for (const auto i : c10::irange(N)) {
+      const int sign = input_data[i] < 0 ? -1 : 1;
+      output_data[i] = std::log1p(abs_if_signed(input_data[i])) * sign;
+    }
+  });
+}
+
+at::Tensor signed_log1p(const at::Tensor& input) {
+  auto out = create_empty_from(input);
+  signed_log1p_out(out, input);
+  return out;
+}
+
+} // namespace
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+REGISTER_OPERATOR_FUNCTOR(
+    static_runtime::signed_log1p,
+    static_runtime_signed_log1p,
+    [](Node* n) -> SROperator {
+      if (!n->matches(torch::schema(
+              "static_runtime::signed_log1p(Tensor x) -> Tensor"))) {
+        LogAndDumpSchema(n);
+        return nullptr;
+      }
+      return [](ProcessedNode* p_node) {
+        const auto& input = p_node->Input(0).toTensor();
+        if (p_node->Output(0).isNone()) {
+          p_node->Output(0) = signed_log1p(input);
+        } else {
+          auto& out = p_node->Output(0).toTensor();
+          fastResizeToZero(out);
+          signed_log1p_out(out, input);
+        }
+      };
+    });
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp
index 5099dc1ba6e2b..0eaebfdf0e7aa 100644
--- a/torch/csrc/jit/runtime/static/passes.cpp
+++ b/torch/csrc/jit/runtime/static/passes.cpp
@@ -306,6 +306,28 @@ TORCH_LIBRARY_FRAGMENT(static_runtime, m) {
   m.def(torch::schema(
       "static_runtime::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> (Tensor, Tensor, Tensor)",
       c10::AliasAnalysisKind::PURE_FUNCTION));
+  m.def("static_runtime::signed_log1p(Tensor input) -> Tensor");
+}
+
+void FuseSignLog1P(std::shared_ptr<torch::jit::Graph>& graph) {
+  std::string pattern = R"IR(
+    graph(%input):
+        %0 : Tensor = aten::sign(%input)
+        %1 : Tensor = aten::abs(%input)
+        %2 : Tensor = aten::log1p(%1)
+        %res : Tensor = aten::mul(%0, %2)
+        return (%res)
+  )IR";
+
+  std::string fused_pattern = R"IR(
+    graph(%input):
+        %res : Tensor = static_runtime::signed_log1p(%input)
+        return (%res)
+    )IR";
+
+  SubgraphRewriter fuse;
+  fuse.RegisterRewritePattern(pattern, fused_pattern);
+  fuse.runOnGraph(graph);
 }
 
 bool HasInplaceOp(std::shared_ptr<Graph>& graph, const AliasDb& alias_db) {
diff --git a/torch/csrc/jit/runtime/static/passes.h b/torch/csrc/jit/runtime/static/passes.h
index a42bc97f19618..0904d37fb02c4 100644
--- a/torch/csrc/jit/runtime/static/passes.h
+++ b/torch/csrc/jit/runtime/static/passes.h
@@ -20,5 +20,7 @@ TORCH_API bool HasInplaceOp(
     std::shared_ptr<Graph>& graph,
     const AliasDb& alias_db);
 
+TORCH_API void FuseSignLog1P(std::shared_ptr<Graph>& graph);
+
 } // namespace jit
 } // namespace torch

From 2c258d91cc1dc11c338e97d6970ac77a4f8978ec Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Thu, 2 Sep 2021 08:59:53 -0700
Subject: [PATCH 471/530] Fix torch.istft length mismatch and window runtime
 error (#63469)

Summary:
The PR fixes two issues:
- See https://github.com/pytorch/pytorch/issues/62747 and https://github.com/pytorch/audio/issues/1409. The length mismatch when the given ``length`` parameter is longer than expected. Add padding logic in consistent with librosa.
- See https://github.com/pytorch/pytorch/issues/62323. The current implementations checks if the min value of window_envelop.abs() is greater than zero.  In librosa they normalize the signal on non-zero values by indexing. Like
```
approx_nonzero_indices = ifft_window_sum > util.tiny(ifft_window_sum)
y[approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices]
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63469

Reviewed By: fmassa

Differential Revision: D30695827

Pulled By: nateanl

fbshipit-source-id: d034e53f0d65b3fd1dbd150c9c5acf3faf25a164
---
 aten/src/ATen/native/SpectralOps.cpp | 10 ++++-
 test/test_spectral_ops.py            | 64 +++++++++++++++++++++++++---
 torch/functional.py                  |  3 +-
 3 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp
index cd042073794c3..f9472b1f3dd3d 100644
--- a/aten/src/ATen/native/SpectralOps.cpp
+++ b/aten/src/ATen/native/SpectralOps.cpp
@@ -920,7 +920,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
 
   // We need to trim the front padding away if centered
   const auto start = center ? n_fft / 2 : 0;
-  const auto end = lengthOpt.has_value()? start + lengthOpt.value() : - n_fft / 2;
+  const auto end = lengthOpt.has_value() ? start + lengthOpt.value() : (center ? - n_fft / 2 : -1);
 
   y = y.slice(2, start, end, 1);
   window_envelop = window_envelop.slice(2, start, end, 1);
@@ -935,6 +935,14 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
   if (input_dim == 3) {
     y = y.squeeze(0);
   }
+  // zero padding if the given lengthOpt is longer than expected
+  if(end > expected_output_signal_len) {
+    TORCH_WARN_ONCE(
+      "The length of signal is shorter than the length parameter. Result is being padded with zeros in the tail. "
+      "Please check your center and hop_length settings."
+    );
+    y = at::constant_pad_nd(y, {0, end - expected_output_signal_len}, 0);
+  }
   return y;
 
   #undef REPR
diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py
index fdc8c01417fd1..f632e95d9c704 100644
--- a/test/test_spectral_ops.py
+++ b/test/test_spectral_ops.py
@@ -1126,9 +1126,6 @@ def _test_istft_is_inverse_of_stft(stft_kwargs):
                     original = torch.randn(*sizes, dtype=dtype, device=device)
                     stft = torch.stft(original, return_complex=True, **stft_kwargs)
                     inversed = torch.istft(stft, length=original.size(1), **istft_kwargs)
-
-                    # trim the original for case when constructed signal is shorter than original
-                    original = original[..., :inversed.size(-1)]
                     self.assertEqual(
                         inversed, original, msg='istft comparison against original',
                         atol=7e-6, rtol=0, exact_dtype=True)
@@ -1167,21 +1164,63 @@ def _test_istft_is_inverse_of_stft(stft_kwargs):
                 'normalized': True,
                 'onesided': False,
             },
-            # hamming_window, not centered, not normalized, onesided
+            # hamming_window, centered, not normalized, onesided
             # window same size as n_fft
             {
                 'n_fft': 5,
                 'hop_length': 2,
                 'win_length': 5,
                 'window': torch.hamming_window(5, dtype=dtype, device=device),
-                'center': False,
+                'center': True,
                 'pad_mode': 'constant',
                 'normalized': False,
                 'onesided': True,
             },
+        ]
+        for i, pattern in enumerate(patterns):
+            _test_istft_is_inverse_of_stft(pattern)
+
+    @onlyOnCPUAndCUDA
+    @skipCPUIfNoFFT
+    @dtypes(torch.double)
+    def test_istft_round_trip_with_padding(self, device, dtype):
+        """long hop_length or not centered may cause length mismatch in the inversed signal"""
+        def _test_istft_is_inverse_of_stft_with_padding(stft_kwargs):
+            # generates a random sound signal for each tril and then does the stft/istft
+            # operation to check whether we can reconstruct signal
+            num_trials = 100
+            sizes = stft_kwargs['size']
+            del stft_kwargs['size']
+            istft_kwargs = stft_kwargs.copy()
+            del istft_kwargs['pad_mode']
+            for i in range(num_trials):
+                original = torch.randn(*sizes, dtype=dtype, device=device)
+                stft = torch.stft(original, return_complex=True, **stft_kwargs)
+                with self.assertWarnsOnceRegex(UserWarning, "The length of signal is shorter than the length parameter."):
+                    inversed = torch.istft(stft, length=original.size(-1), **istft_kwargs)
+                n_frames = stft.size(-1)
+                if stft_kwargs["center"] is True:
+                    len_expected = stft_kwargs["n_fft"] // 2 + stft_kwargs["hop_length"] * (n_frames - 1)
+                else:
+                    len_expected = stft_kwargs["n_fft"] + stft_kwargs["hop_length"] * (n_frames - 1)
+                # trim the original for case when constructed signal is shorter than original
+                padding = inversed[..., len_expected:]
+                inversed = inversed[..., :len_expected]
+                original = original[..., :len_expected]
+                # test the padding points of the inversed signal are all zeros
+                zeros = torch.zeros_like(padding, device=padding.device)
+                self.assertEqual(
+                    padding, zeros, msg='istft padding values against zeros',
+                    atol=7e-6, rtol=0, exact_dtype=True)
+                self.assertEqual(
+                    inversed, original, msg='istft comparison against original',
+                    atol=7e-6, rtol=0, exact_dtype=True)
+
+        patterns = [
             # hamming_window, not centered, not normalized, not onesided
             # window same size as n_fft
             {
+                'size': [2, 20],
                 'n_fft': 3,
                 'hop_length': 2,
                 'win_length': 3,
@@ -1191,9 +1230,22 @@ def _test_istft_is_inverse_of_stft(stft_kwargs):
                 'normalized': False,
                 'onesided': False,
             },
+            # hamming_window, centered, not normalized, onesided, long hop_length
+            # window same size as n_fft
+            {
+                'size': [2, 500],
+                'n_fft': 256,
+                'hop_length': 254,
+                'win_length': 256,
+                'window': torch.hamming_window(256, dtype=dtype, device=device),
+                'center': True,
+                'pad_mode': 'constant',
+                'normalized': False,
+                'onesided': True,
+            },
         ]
         for i, pattern in enumerate(patterns):
-            _test_istft_is_inverse_of_stft(pattern)
+            _test_istft_is_inverse_of_stft_with_padding(pattern)
 
     @onlyOnCPUAndCUDA
     def test_istft_throws(self, device):
diff --git a/torch/functional.py b/torch/functional.py
index 81b3de234e1ca..63470cf2d443f 100644
--- a/torch/functional.py
+++ b/torch/functional.py
@@ -569,7 +569,8 @@ def istft(input: Tensor, n_fft: int, hop_length: Optional[int] = None,
 
     Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame,
     ``istft`` may return a shorter signal than the original signal (can occur if :attr:`center` is False
-    since the signal isn't padded).
+    since the signal isn't padded). If `length` is given in the arguments and is longer than expected,
+    ``istft`` will pad zeros to the end of the returned signal.
 
     If :attr:`center` is ``True``, then there will be padding e.g. ``'constant'``, ``'reflect'``, etc.
     Left padding can be trimmed off exactly because they can be calculated but right padding cannot be

From 50067c020a14d183b49861771effa35d472220e9 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Thu, 2 Sep 2021 09:02:35 -0700
Subject: [PATCH 472/530] TST Adds __repr__ and str to module info (#63737)

Summary:
Follow up to https://github.com/pytorch/pytorch/pull/61935

This PR adds `test_repr` to `test_modules`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63737

Reviewed By: gchanan

Differential Revision: D30729642

Pulled By: jbschlosser

fbshipit-source-id: c11a28bc0739abd3ed40727389dd28ed4069edad
---
 test/test_modules.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/test/test_modules.py b/test/test_modules.py
index 52520dad080de..6d6adbc7ac57d 100644
--- a/test/test_modules.py
+++ b/test/test_modules.py
@@ -110,6 +110,20 @@ def test_factory_kwargs(self, device, dtype, module_info):
                             buffer.dtype, dtype,
                             f'Buffer {name} is of dtype {buffer.dtype} instead of the expected dtype {dtype}')
 
+    @modules(module_db)
+    def test_repr(self, device, dtype, module_info):
+        # Test module can be represented with repr and str without errors.
+        module_cls = module_info.module_cls
+        module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
+                                                       requires_grad=False)
+        for module_input in module_inputs:
+            args, kwargs = module_input.constructor_input.args, module_input.constructor_input.kwargs
+            m = module_cls(*args, **kwargs)
+
+            # Check that these methods do not raise errors
+            m.__repr__()
+            str(m)
+
     @modules(module_db)
     def test_pickle(self, device, dtype, module_info):
         # Test that module can be pickled and unpickled.

From 66ddc6ef9e33c4fb3a3694106432c3ba81d1ad90 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 2 Sep 2021 09:27:44 -0700
Subject: [PATCH 473/530] Fixes and details to torchhub docs (#63783)

Summary:
This PR:

- adds a few details regarding the newly added `skip_validation` parameter https://github.com/pytorch/pytorch/pull/62139
- uses double-backticks instead of single-backticks since this is rst, not mardown.
- adds a few minor doc nits here and there

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63783

Reviewed By: zou3519

Differential Revision: D30696658

Pulled By: NicolasHug

fbshipit-source-id: 6f01c7eb3cfcd7e17e4c33c09d193054fa18ad36
---
 torch/hub.py | 61 ++++++++++++++++++++++++++++------------------------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/torch/hub.py b/torch/hub.py
index bcd53f79d9c77..82287d84b14f6 100644
--- a/torch/hub.py
+++ b/torch/hub.py
@@ -257,18 +257,20 @@ def set_dir(d):
 
 def list(github, force_reload=False, skip_validation=False):
     r"""
-    List all entrypoints available in `github` hubconf.
+    List all callable entrypoints available in the repo specified by ``github``.
 
     Args:
         github (string): a string with format "repo_owner/repo_name[:tag_name]" with an optional
-            tag/branch. The default branch is `master` if not specified.
+            tag/branch. The default branch is ``master`` if not specified.
             Example: 'pytorch/vision[:hub]'
         force_reload (bool, optional): whether to discard the existing cache and force a fresh download.
-            Default is `False`.
-        skip_validation (bool, optional): whether to check package validity against github.
-            Default is `False`.
+            Default is ``False``.
+        skip_validation (bool, optional): if ``False``, torchhub will check that the branch or commit
+            specified by the ``github`` argument properly belongs to the repo owner. This will make
+            requests to the GitHub API; you can specify a non-default GitHub token by setting the
+            ``GITHUB_TOKEN`` environment variable. Default is ``False``.
     Returns:
-        entrypoints: a list of available entrypoint names
+        list: The available callables entrypoint
 
     Example:
         >>> entrypoints = torch.hub.list('pytorch/vision', force_reload=True)
@@ -290,17 +292,19 @@ def list(github, force_reload=False, skip_validation=False):
 
 def help(github, model, force_reload=False, skip_validation=False):
     r"""
-    Show the docstring of entrypoint `model`.
+    Show the docstring of entrypoint ``model``.
 
     Args:
         github (string): a string with format <repo_owner/repo_name[:tag_name]> with an optional
-            tag/branch. The default branch is `master` if not specified.
+            tag/branch. The default branch is ``master`` if not specified.
             Example: 'pytorch/vision[:hub]'
-        model (string): a string of entrypoint name defined in repo's hubconf.py
+        model (string): a string of entrypoint name defined in repo's ``hubconf.py``
         force_reload (bool, optional): whether to discard the existing cache and force a fresh download.
-            Default is `False`.
-        skip_validation (bool, optional): whether to check package validity against github.
-            Default is `False`.
+            Default is ``False``.
+        skip_validation (bool, optional): if ``False``, torchhub will check that the branch or commit
+            specified by the ``github`` argument properly belongs to the repo owner. This will make
+            requests to the GitHub API; you can specify a non-default GitHub token by setting the
+            ``GITHUB_TOKEN`` environment variable. Default is ``False``.
     Example:
         >>> print(torch.hub.help('pytorch/vision', 'resnet18', force_reload=True))
     """
@@ -326,11 +330,11 @@ def load(repo_or_dir, model, *args, source='github', force_reload=False, verbose
     Note: Loading a model is the typical use case, but this can also be used to
     for loading other objects such as tokenizers, loss functions, etc.
 
-    If :attr:`source` is ``'github'``, :attr:`repo_or_dir` is expected to be
+    If ``source`` is 'github', ``repo_or_dir`` is expected to be
     of the form ``repo_owner/repo_name[:tag_name]`` with an optional
     tag/branch. The default branch is ``master`` if not specified.
 
-    If :attr:`source` is ``'local'``, :attr:`repo_or_dir` is expected to be a
+    If ``source`` is 'local', ``repo_or_dir`` is expected to be a
     path to a local directory.
 
     Args:
@@ -339,9 +343,9 @@ def load(repo_or_dir, model, *args, source='github', force_reload=False, verbose
             ``source = 'local'``.
         model (string): the name of a callable (entrypoint) defined in the
             repo/dir's ``hubconf.py``.
-        *args (optional): the corresponding args for callable :attr:`model`.
-        source (string, optional): ``'github'`` | ``'local'``. Specifies how
-            ``repo_or_dir`` is to be interpreted. Default is ``'github'``.
+        *args (optional): the corresponding args for callable ``model``.
+        source (string, optional): 'github' or 'local'. Specifies how
+            ``repo_or_dir`` is to be interpreted. Default is 'github'.
         force_reload (bool, optional): whether to force a fresh download of
             the github repo unconditionally. Does not have any effect if
             ``source = 'local'``. Default is ``False``.
@@ -349,13 +353,14 @@ def load(repo_or_dir, model, *args, source='github', force_reload=False, verbose
             local caches. Note that the message about first download cannot be
             muted. Does not have any effect if ``source = 'local'``.
             Default is ``True``.
-        skip_validation (bool, optional): whether to check package validity against github.
-            Default is `False`.
-        **kwargs (optional): the corresponding kwargs for callable
-            :attr:`model`.
+        skip_validation (bool, optional): if ``False``, torchhub will check that the branch or commit
+            specified by the ``github`` argument properly belongs to the repo owner. This will make
+            requests to the GitHub API; you can specify a non-default GitHub token by setting the
+            ``GITHUB_TOKEN`` environment variable. Default is ``False``.
+        **kwargs (optional): the corresponding kwargs for callable ``model``.
 
     Returns:
-        The output of the :attr:`model` callable when called with the given
+        The output of the ``model`` callable when called with the given
         ``*args`` and ``**kwargs``.
 
     Example:
@@ -387,7 +392,7 @@ def _load_local(hubconf_dir, model, *args, **kwargs):
         hubconf_dir (string): path to a local directory that contains a
             ``hubconf.py``.
         model (string): name of an entrypoint defined in the directory's
-            `hubconf.py`.
+            ``hubconf.py``.
         *args (optional): the corresponding args for callable ``model``.
         **kwargs (optional): the corresponding kwargs for callable ``model``.
 
@@ -416,8 +421,8 @@ def download_url_to_file(url, dst, hash_prefix=None, progress=True):
 
     Args:
         url (string): URL of the object to download
-        dst (string): Full path where object will be saved, e.g. `/tmp/temporary_file`
-        hash_prefix (string, optional): If not None, the SHA256 downloaded file should start with `hash_prefix`.
+        dst (string): Full path where object will be saved, e.g. ``/tmp/temporary_file``
+        hash_prefix (string, optional): If not None, the SHA256 downloaded file should start with ``hash_prefix``.
             Default: None
         progress (bool, optional): whether or not to display a progress bar to stderr
             Default: True
@@ -513,8 +518,8 @@ def load_state_dict_from_url(url, model_dir=None, map_location=None, progress=Tr
 
     If the object is already present in `model_dir`, it's deserialized and
     returned.
-    The default value of `model_dir` is ``<hub_dir>/checkpoints`` where
-    `hub_dir` is the directory returned by :func:`~torch.hub.get_dir`.
+    The default value of ``model_dir`` is ``<hub_dir>/checkpoints`` where
+    ``hub_dir`` is the directory returned by :func:`~torch.hub.get_dir`.
 
     Args:
         url (string): URL of the object to download
@@ -527,7 +532,7 @@ def load_state_dict_from_url(url, model_dir=None, map_location=None, progress=Tr
             digits of the SHA256 hash of the contents of the file. The hash is used to
             ensure unique names and to verify the contents of the file.
             Default: False
-        file_name (string, optional): name for the downloaded file. Filename from `url` will be used if not set.
+        file_name (string, optional): name for the downloaded file. Filename from ``url`` will be used if not set.
 
     Example:
         >>> state_dict = torch.hub.load_state_dict_from_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')

From aeafcde087fa76618708b8a2841c450e7e184761 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@fb.com>
Date: Thu, 2 Sep 2021 09:50:56 -0700
Subject: [PATCH 474/530] CI: Enable using labels to control GHA workflows
 (#64314)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/62852

Sets a global environment variable containing a list of PR labels. For this PR, the PR_LABELS variable looks like:
```
[
  "cla signed",
  "ciflow/default"
]
```
confirmed in a run: https://github.com/pytorch/pytorch/runs/3490072161?check_suite_focus=true

This information can be used in other workflow steps to control the logic. For example, if I want to force a build, I can label my PR with "force-build" and do something like the following in my build script:
```
if [[ "${PR_LABELS}" = *force-build* ]]; then
   python setup.py install
else
   #use cached wheel or something
fi
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64314

Reviewed By: driazati

Differential Revision: D30714570

Pulled By: janeyx99

fbshipit-source-id: 80b060ee32643ddd22eb7b8ec548579c7ccf6441
---
 .github/templates/bazel_ci_workflow.yml.j2                    | 2 ++
 .github/templates/linux_ci_workflow.yml.j2                    | 4 ++++
 .github/templates/windows_ci_workflow.yml.j2                  | 1 +
 .../generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml   | 2 ++
 .../generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml   | 2 ++
 .../workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml  | 3 +++
 .../workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml  | 3 +++
 .../workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml  | 3 +++
 .../workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml  | 3 +++
 .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml     | 4 ++++
 .../generated-linux-xenial-py3.6-gcc7-bazel-test.yml          | 3 +++
 ...ted-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 2 ++
 .../generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml   | 3 +++
 .../workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml  | 1 +
 .github/workflows/generated-win-vs2019-cpu-py3.yml            | 1 +
 .github/workflows/generated-win-vs2019-cuda10.1-py3.yml       | 1 +
 .github/workflows/generated-win-vs2019-cuda11.3-py3.yml       | 1 +
 17 files changed, 39 insertions(+)

diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2
index 7f9d5230e0d9f..57b4567876e35 100644
--- a/.github/templates/bazel_ci_workflow.yml.j2
+++ b/.github/templates/bazel_ci_workflow.yml.j2
@@ -62,6 +62,7 @@ on:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e PR_LABELS \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
@@ -110,6 +111,7 @@ on:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e CONTINUE_THROUGH_ERROR \
+            -e PR_LABELS \
             -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 49b6d7dd68c77..01ea7af15305d 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -46,6 +46,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}
@@ -144,6 +145,7 @@ jobs:
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -310,6 +312,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e PR_LABELS \
             -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
@@ -405,6 +408,7 @@ jobs:
             -e CIRCLE_SHA1="$GITHUB_SHA" \
             -e DOCS_VERSION="${target}" \
             -e DOCS_TYPE \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2
index 38c346c1134f8..b927281b84ff8 100644
--- a/.github/templates/windows_ci_workflow.yml.j2
+++ b/.github/templates/windows_ci_workflow.yml.j2
@@ -51,6 +51,7 @@ env:
   IN_CI: 1
   INSTALL_WINDOWS_SDK: 1
   PYTHON_VERSION: "3.8"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
   SCCACHE_BUCKET: "ossci-compiler-cache"
   VC_PRODUCT: "BuildTools"
   VC_VERSION: ""
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
index e621bee2ad666..4a434ac9772a2 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -24,6 +24,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: libtorch-linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
@@ -196,6 +197,7 @@ jobs:
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 9daf916ae2642..34fd21e15dcde 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -24,6 +24,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
@@ -196,6 +197,7 @@ jobs:
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 4821c1e306715..f1c0ea491c415 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -24,6 +24,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: linux-bionic-cuda10.2-py3.9-gcc7-${{ github.event.pull_request.number || github.sha }}
@@ -196,6 +197,7 @@ jobs:
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -414,6 +416,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e PR_LABELS \
             -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 79edf0d741950..e13c6191ac64e 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -24,6 +24,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: linux-bionic-py3.8-gcc9-coverage-${{ github.event.pull_request.number || github.sha }}
@@ -196,6 +197,7 @@ jobs:
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -414,6 +416,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e PR_LABELS \
             -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 316da3604fc91..e44dfbe44759a 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -24,6 +24,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
@@ -196,6 +197,7 @@ jobs:
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -414,6 +416,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e PR_LABELS \
             -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 6c9e67d380c29..618b13148567d 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -24,6 +24,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
@@ -196,6 +197,7 @@ jobs:
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -414,6 +416,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e PR_LABELS \
             -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 2337b4f5bf429..6ceabb04770e6 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -24,6 +24,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}
@@ -196,6 +197,7 @@ jobs:
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -414,6 +416,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e PR_LABELS \
             -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
@@ -574,6 +577,7 @@ jobs:
             -e CIRCLE_SHA1="$GITHUB_SHA" \
             -e DOCS_VERSION="${target}" \
             -e DOCS_TYPE \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 17dc3a6742d73..4bfe1d1211786 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -24,6 +24,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: linux-xenial-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}
@@ -214,6 +215,7 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e PR_LABELS \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
@@ -264,6 +266,7 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e CONTINUE_THROUGH_ERROR \
+            -e PR_LABELS \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 2f5cab7538601..ce4fe5bc0e250 100644
--- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -22,6 +22,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
@@ -194,6 +195,7 @@ jobs:
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 8c81ab1717221..5b60a1bcdf900 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -22,6 +22,7 @@ env:
   # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
   CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
   group: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
@@ -194,6 +195,7 @@ jobs:
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e PR_LABELS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -412,6 +414,7 @@ jobs:
             -e TEST_CONFIG \
             -e NUM_TEST_SHARDS \
             -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e PR_LABELS \
             -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
index 306e93aca7990..de74cdc16f889 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
@@ -18,6 +18,7 @@ env:
   IN_CI: 1
   INSTALL_WINDOWS_SDK: 1
   PYTHON_VERSION: "3.8"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
   SCCACHE_BUCKET: "ossci-compiler-cache"
   VC_PRODUCT: "BuildTools"
   VC_VERSION: ""
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index f79cad7b04c00..4f43d2743d18b 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -20,6 +20,7 @@ env:
   IN_CI: 1
   INSTALL_WINDOWS_SDK: 1
   PYTHON_VERSION: "3.8"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
   SCCACHE_BUCKET: "ossci-compiler-cache"
   VC_PRODUCT: "BuildTools"
   VC_VERSION: ""
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index 35c6cede0eefe..ad4cf37f958e1 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -20,6 +20,7 @@ env:
   IN_CI: 1
   INSTALL_WINDOWS_SDK: 1
   PYTHON_VERSION: "3.8"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
   SCCACHE_BUCKET: "ossci-compiler-cache"
   VC_PRODUCT: "BuildTools"
   VC_VERSION: ""
diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index 4bfc5654186f7..299b48d66a411 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -20,6 +20,7 @@ env:
   IN_CI: 1
   INSTALL_WINDOWS_SDK: 1
   PYTHON_VERSION: "3.8"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
   SCCACHE_BUCKET: "ossci-compiler-cache"
   VC_PRODUCT: "BuildTools"
   VC_VERSION: ""

From 6da7552a8eaae6b85e271bf3edac2fa2ae9f1148 Mon Sep 17 00:00:00 2001
From: Kefei Lu <kefeilu@fb.com>
Date: Thu, 2 Sep 2021 10:38:43 -0700
Subject: [PATCH 475/530] Add fx2trt pass for removing duplicate output args
 (#64433)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64433

Fx2TRT does not support duplicate nodes in the output args tuple.

This pass removes duplicate output args from the target subnets and fixes their uses in the top level module where the subnets are called. This pass must be called after acc split on the top-level net and subsequent calls to the acc trace on the subnets.

This pass will change both the subnets and top level module.

Test Plan:
Run:

```
buck run mode/opt -c python.package_style=inplace //caffe2/torch/fb/fx2trt/tests/passes/:test_remove_duplicate_output_args

```

Reviewed By: 842974287

Differential Revision: D30468409

fbshipit-source-id: b4d91b76ab5d8a5275d68dd48d1327a44c22568e
---
 .../passes/remove_duplicate_output_args.py    | 133 ++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py

diff --git a/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py b/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py
new file mode 100644
index 0000000000000..488ce45c75952
--- /dev/null
+++ b/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+
+import operator
+import typing as t
+import logging
+import torch.fx as fx
+import dataclasses as dc
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def remove_duplicate_output_args(
+    top_level: fx.GraphModule,
+    target_subnets: t.Collection[str]
+) -> t.Mapping[str, "RemoveDuplicateResult"]:
+    """Removes duplicate output args.
+
+    This pass removes duplicate output args from the target subnets and fixes
+    their uses in the top level module where the subnets are called. This pass
+    must be called after acc split on the top-level net and subsequent calls to
+    the acc trace on the subnets.
+
+    This pass will change both the subnets and top level module.
+
+    Returns:
+        a mapping of the target subnet name to its dedupcate result
+    """
+
+    processed_subnets = {}
+    for node in top_level.graph.nodes:
+        node: fx.Node
+        if node.op == "call_module" and node.name in target_subnets:
+            sub_gm = top_level.get_submodule(node.target)
+            assert isinstance(sub_gm, fx.GraphModule)
+
+            replace_res = _remove_duplicate_output_args(sub_gm)
+            processed_subnets[node.name] = replace_res
+            if replace_res.replacement_map is None:
+                continue
+            sub_gm.recompile()
+
+            needs_recompile = False
+            # iterate on the copy since we will be changing elements of node.users
+            for user in list(node.users):
+                idx = _ensure_proper_output_use(user, node)
+                idx_new = replace_res.replacement_map[idx]
+                if idx_new != idx:
+                    user.args = (user.args[0], idx_new)
+                    needs_recompile = True
+
+            if needs_recompile:
+                top_level.recompile()
+    return processed_subnets
+
+
+@dc.dataclass(frozen=True)
+class RemoveDuplicateResult:
+    replacement_map: t.Optional[t.List[int]]
+    module: fx.GraphModule
+
+
+def _ensure_proper_output_use(user: fx.Node, target_node: fx.Node) -> int:
+    """
+    Ensures the node looks in proper form of calling the output of an fx2trt
+    splitter sub-net. Specifically:
+
+    1. op is call function, target: operator.getitem
+    2. args is a 2-element tuple
+    3. args[0] is the name of the subnet's output
+    4. args[1] is the index into the subnet output tuple
+
+    E.g.:
+
+        %getitem_4 : [#users=1] = call_function[target=operator.getitem](args = (%_run_on_acc_1, 4), kwargs = {})
+
+    returns the index into the subnet output tuple
+    """
+    _LOGGER.info(f"Checking user node: {user.format_node()}")
+    assert (
+        user.op == "call_function"
+        and user.target == operator.getitem
+        and len(user.args) == 2
+        and user.args[0].name == target_node.name
+        and isinstance(user.args[1], int)
+    ), f"Node is not a proper user of splitter output: {user.format_node()}"
+
+    return user.args[1]
+
+
+def _remove_duplicate_output_args(gm: fx.GraphModule) -> RemoveDuplicateResult:
+    output_nodes = [n for n in gm.graph.nodes if n.op == "output"]
+    assert len(output_nodes) == 1, \
+           f"Expecting exactly one `output` node, but got {len(output_nodes)}"
+
+    changed = False
+    # arg node name to its index in the new output args tuple
+    name_to_idx: t.Dict[str, int] = {}
+    output_node = output_nodes[0]
+
+    # Output op only uses its `args[0]`, and it does not have `kwargs`.
+    # https://pytorch.org/docs/stable/fx.html#torch.fx.Node
+    args = output_node.args[0]
+
+    # Only concern outselves to the case where the args is an iterable of fx.Node.
+    # Other return cases (e.g., a single value) is possible and we don't handle
+    # that in this pass.
+    if not (isinstance(args, t.Iterable) and all(isinstance(a, fx.Node) for a in args)):
+        return RemoveDuplicateResult(replacement_map=None, module=gm)
+
+    # Map old index of the arg node to the remaining node's idx,
+    # initialized to `i => i`
+    replacement_map: t.List[int] = list(range(len(args)))
+    args_new = []
+    for idx, a in enumerate(args):
+        assert isinstance(a, fx.Node), \
+               f"Expecting fx.Node instance, but got: {type(a)}"
+
+        if a.name not in name_to_idx:
+            args_new.append(a)
+            name_to_idx[a.name] = len(args_new) - 1
+        else:
+            changed = True
+            _LOGGER.warning(
+                f"Replaced duplicate output arg '{a.name}': "
+                f"{idx} -> {name_to_idx[a.name]}"
+            )
+        replacement_map[idx] = name_to_idx[a.name]
+
+    output_node.args = (tuple(args_new),)
+    if changed:
+        gm.recompile()
+    return RemoveDuplicateResult(replacement_map, module=gm)

From 9214450b7fe3113a6078618514199f5af7bf82a0 Mon Sep 17 00:00:00 2001
From: Hui Guo <huiguo@fb.com>
Date: Thu, 2 Sep 2021 10:40:02 -0700
Subject: [PATCH 476/530] [tensorexpr] Wrap error msgs with buildErrorMessages
 for internal asserts (#64409)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64409

Test Plan: Imported from OSS

Reviewed By: navahgar

Differential Revision: D30717786

Pulled By: huiguoo

fbshipit-source-id: a3b147d339ff4927f14efa24407cd3b63d80001d
---
 torch/csrc/jit/tensorexpr/graph_opt.cpp     | 41 +++++++++++++++------
 torch/csrc/jit/tensorexpr/ir_mutator.cpp    | 12 ++++--
 torch/csrc/jit/tensorexpr/ir_simplifier.cpp |  8 +++-
 torch/csrc/jit/tensorexpr/kernel.cpp        | 36 +++++++++++++-----
 4 files changed, 70 insertions(+), 27 deletions(-)

diff --git a/torch/csrc/jit/tensorexpr/graph_opt.cpp b/torch/csrc/jit/tensorexpr/graph_opt.cpp
index 67f9a671bfa20..d55ea0559e5e1 100644
--- a/torch/csrc/jit/tensorexpr/graph_opt.cpp
+++ b/torch/csrc/jit/tensorexpr/graph_opt.cpp
@@ -26,14 +26,21 @@ Node* moveCatAfterUse(Node* cat, Node* user, std::shared_ptr<Graph> subgraph) {
   //   %4 = aten::cat(%3, ...)
   //   return (%4)
 
-  TORCH_INTERNAL_ASSERT(cat->output()->hasUses());
-  TORCH_INTERNAL_ASSERT(cat->output()->uses().size() == 1);
-  TORCH_INTERNAL_ASSERT(cat->input(0)->node()->kind() == prim::ListConstruct);
+  TORCH_INTERNAL_ASSERT(
+      cat->output()->hasUses(),
+      buildErrorMessage("aten::cat output is not used."));
+  TORCH_INTERNAL_ASSERT(
+      cat->output()->uses().size() == 1,
+      buildErrorMessage("aten::cat output is used in multiple places."));
+  TORCH_INTERNAL_ASSERT(
+      cat->input(0)->node()->kind() == prim::ListConstruct,
+      buildErrorMessage("aten::cat inputs are not expected."));
   auto cat_list = cat->input(0)->node();
   auto cat_inputs = cat_list->inputs();
 
   auto user_tensor_type = user->output()->type()->cast<c10::TensorType>();
-  TORCH_INTERNAL_ASSERT(user_tensor_type);
+  TORCH_INTERNAL_ASSERT(
+      user_tensor_type, buildErrorMessage("Unexpected user tensor type"));
   std::unordered_map<Value*, Value*> new_cat_inputs;
   for (auto inp : cat_inputs) {
     auto new_cat_input = subgraph->createClone(
@@ -41,7 +48,8 @@ Node* moveCatAfterUse(Node* cat, Node* user, std::shared_ptr<Graph> subgraph) {
     // Since we are cloning user, its result should be the same scalar type
     // as the user. But the dims should correspond to that of the input.
     auto input_tensor_type = inp->type()->cast<c10::TensorType>();
-    TORCH_INTERNAL_ASSERT(input_tensor_type);
+    TORCH_INTERNAL_ASSERT(
+        input_tensor_type, buildErrorMessage("Unexpected input tensor type"));
     auto new_input_type =
         input_tensor_type->withScalarType(user_tensor_type->scalarType());
     new_cat_input->output()->setType(new_input_type);
@@ -60,7 +68,9 @@ Node* moveCatAfterUse(Node* cat, Node* user, std::shared_ptr<Graph> subgraph) {
   user->output()->replaceAllUsesWith(new_cat->output());
   user->destroy();
 
-  TORCH_INTERNAL_ASSERT(!cat->output()->hasUses());
+  TORCH_INTERNAL_ASSERT(
+      !cat->output()->hasUses(),
+      buildErrorMessage("aten::cat output is not used."));
   cat->destroy();
 
   if (!cat_list->output()->hasUses()) {
@@ -84,10 +94,15 @@ int numTensorInputs(Node* node) {
 // If the inputs to `cat` are of different types, then the implementation
 // of `cat` is expected to promote type.
 bool doesCatPromoteTypes(Node* node) {
-  TORCH_INTERNAL_ASSERT(node->kind() == aten::cat);
-  TORCH_INTERNAL_ASSERT(node->input(0)->node()->kind() == prim::ListConstruct);
+  TORCH_INTERNAL_ASSERT(
+      node->kind() == aten::cat,
+      buildErrorMessage("Graph node is not aten::cat."));
+  TORCH_INTERNAL_ASSERT(
+      node->input(0)->node()->kind() == prim::ListConstruct,
+      buildErrorMessage("aten::cat inputs are not expected."));
   auto inputs = node->input(0)->node()->inputs();
-  TORCH_INTERNAL_ASSERT(!inputs.empty());
+  TORCH_INTERNAL_ASSERT(
+      !inputs.empty(), buildErrorMessage("Empty inputs of ListConstruct"));
   auto scalar_type =
       inputs.front()->type()->cast<c10::TensorType>()->scalarType();
   for (size_t i = 1; i < inputs.size(); ++i) {
@@ -122,14 +137,18 @@ bool doesCatPromoteTypes(Node* node) {
 //        it user needs to reflect the original type. This is currently not
 //        handled. TODO
 void moveCatOpToEnd(Node* cat, std::shared_ptr<Graph> subgraph) {
-  TORCH_INTERNAL_ASSERT(cat->kind() == aten::cat);
+  TORCH_INTERNAL_ASSERT(
+      cat->kind() == aten::cat,
+      buildErrorMessage("Graph node is not aten::cat."));
   if (cat->output()->uses().size() == 1) {
     auto use = cat->output()->uses().front();
     if (use.user->isMemberOf(supported_eltwise_set()) &&
         numTensorInputs(use.user) == 1) {
       if (!doesCatPromoteTypes(cat)) {
         TORCH_INTERNAL_ASSERT(
-            use.user->output()->owningGraph() == subgraph.get());
+            use.user->output()->owningGraph() == subgraph.get(),
+            buildErrorMessage(
+                "aten::cat user graph does not math the given subgraph."));
         auto new_cat = moveCatAfterUse(cat, use.user, subgraph);
         moveCatOpToEnd(new_cat, subgraph);
       }
diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp
index 71a40a134e0b5..e2e9c46e133a5 100644
--- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp
@@ -424,14 +424,16 @@ StmtPtr IRMutator::mutate(SyncThreadsPtr v) {
 StmtPtr IRMutator::mutate(ExternalCallPtr v) {
   BufPtr buf = v->buf();
   BufPtr buf_new = to<Buf>(buf->accept_mutator(this));
-  TORCH_INTERNAL_ASSERT(buf_new);
+  TORCH_INTERNAL_ASSERT(
+      buf_new, buildErrorMessage("IRMutator produced null for Buf."));
 
   bool buf_args_changed = false;
   std::vector<BufPtr> buf_args_new;
   buf_args_new.reserve(v->buf_args().size());
   for (BufPtr buf_arg : v->buf_args()) {
     BufPtr buf_arg_new = to<Buf>(buf_arg->accept_mutator(this));
-    TORCH_INTERNAL_ASSERT(buf_arg_new);
+    TORCH_INTERNAL_ASSERT(
+        buf_arg_new, buildErrorMessage("IRMutator produced null for Buf."));
     buf_args_new.push_back(buf_arg_new);
     buf_args_changed |= buf_arg_new != buf_arg;
   }
@@ -460,7 +462,8 @@ StmtPtr IRMutator::mutate(ExternalCallPtr v) {
 StmtPtr IRMutator::mutate(AllocatePtr v) {
   BufPtr buf = v->buf();
   BufPtr buf_new = to<Buf>(buf->accept_mutator(this));
-  TORCH_INTERNAL_ASSERT(buf_new);
+  TORCH_INTERNAL_ASSERT(
+      buf_new, buildErrorMessage("IRMutator produced null for Buf."));
   if (buf != buf_new) {
     v->set_buf(buf_new);
   }
@@ -470,7 +473,8 @@ StmtPtr IRMutator::mutate(AllocatePtr v) {
 StmtPtr IRMutator::mutate(FreePtr v) {
   BufPtr buf = v->buf();
   BufPtr buf_new = to<Buf>(buf->accept_mutator(this));
-  TORCH_INTERNAL_ASSERT(buf_new);
+  TORCH_INTERNAL_ASSERT(
+      buf_new, buildErrorMessage("IRMutator produced null for Buf."));
   if (buf != buf_new) {
     v->set_buf(buf_new);
   }
diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
index 6820bbb5748a2..3ce194325f08a 100644
--- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
@@ -2351,7 +2351,9 @@ ExprPtr buf_flat_size(BufPtr v) {
 StmtPtr TermExpander::mutate(AllocatePtr v) {
   BufPtr buf = v->buf();
   BufPtr buf_new = to<Buf>(v->buf()->accept_mutator(this));
-  TORCH_INTERNAL_ASSERT(buf_new);
+  TORCH_INTERNAL_ASSERT(
+      buf_new,
+      buildErrorMessage("TermExpander mutation produced null for Buf."));
   ExprPtr flattened = buf_flat_size(buf_new);
 
   if (flattened->isConstant() && immediateEquals(flattened, 0)) {
@@ -2368,7 +2370,9 @@ StmtPtr TermExpander::mutate(AllocatePtr v) {
 StmtPtr TermExpander::mutate(FreePtr v) {
   BufPtr buf = v->buf();
   BufPtr buf_new = to<Buf>(v->buf()->accept_mutator(this));
-  TORCH_INTERNAL_ASSERT(buf_new);
+  TORCH_INTERNAL_ASSERT(
+      buf_new,
+      buildErrorMessage("TermExpander mutation produced null for Buf."));
 
   if (eliminated_allocations_.count(buf_new->base_handle())) {
     eliminated_allocations_.erase(buf_new->base_handle());
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index f9653aea68840..a86cb33a1b8bd 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -549,7 +549,7 @@ std::vector<int64_t> bufferSizes(BufPtr b) {
   std::vector<int64_t> sizes;
   for (size_t i = 0; i < b->ndim(); i++) {
     auto dim = intValue(b->dim(i));
-    TORCH_INTERNAL_ASSERT(dim);
+    TORCH_INTERNAL_ASSERT(dim, buildErrorMessage("Non-constant buf dims"));
     sizes.push_back(*dim);
   }
   return sizes;
@@ -889,7 +889,8 @@ ExprHandle promoteIntegerToDefaultType(const ExprHandle& e) {
 
   // We intend to promote Integers to floating-point types
   TORCH_INTERNAL_ASSERT(
-      !c10::isIntegralType(defaultType, /*includeBool*/ true));
+      !c10::isIntegralType(defaultType, /*includeBool*/ true),
+      buildErrorMessage("Non-integer type"));
 
   return Cast::make(
       Dtype(
@@ -1165,7 +1166,8 @@ std::pair<ScalarType, std::vector<BufHandle>> processCatList(
   std::vector<BufHandle> nonEmptyInputs;
   for (auto buf : bufList) {
     bufInputs.push_back(buf);
-    TORCH_INTERNAL_ASSERT(buf.node()->dims().size() > 0);
+    TORCH_INTERNAL_ASSERT(
+        buf.node()->dims().size() > 0, buildErrorMessage("Invalid buf rank"));
     if (buf.node()->dims().size() == 1 &&
         immediateAs<int>(buf.node()->dim(0)) == 0) {
       continue;
@@ -1378,7 +1380,9 @@ Tensor tensorexpr::computeOperandValue(
       auto add_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) {
         return boolToInteger(lhs) + boolToInteger(rhs);
       };
-      TORCH_INTERNAL_ASSERT(inputs.size() == 2 || inputs.size() == 3);
+      TORCH_INTERNAL_ASSERT(
+          inputs.size() == 2 || inputs.size() == 3,
+          buildErrorMessage("Invalid number of input operands"));
       return (inputs.size() > 2)
           ? computeTwoOperandWithAlpha(
                 "aten_add", inputs, outputShape, outputType, add_lambda)
@@ -1390,7 +1394,9 @@ Tensor tensorexpr::computeOperandValue(
         // NB: sub isn't supported on boolean, no need to promote to integer.
         return lhs - rhs;
       };
-      TORCH_INTERNAL_ASSERT(inputs.size() == 2 || inputs.size() == 3);
+      TORCH_INTERNAL_ASSERT(
+          inputs.size() == 2 || inputs.size() == 3,
+          buildErrorMessage("Invalid number of input operands"));
       return (inputs.size() > 2)
           ? computeTwoOperandWithAlpha(
                 "aten_sub", inputs, outputShape, outputType, sub_lambda)
@@ -2153,7 +2159,8 @@ Tensor tensorexpr::computeOperandValue(
           outputShape,
           outputType,
           [outputType](const ExprHandle& a) {
-            TORCH_INTERNAL_ASSERT(outputType);
+            TORCH_INTERNAL_ASSERT(
+                outputType, buildErrorMessage("Output type is null."));
             return Cast::make(ToDtype(*outputType), a);
           });
     } break;
@@ -2272,7 +2279,9 @@ Tensor tensorexpr::computeOperandValue(
             "aten_transpose",
             c10::fmap<DimArg>(outputShape),
             [&](std::vector<VarHandle> axes) {
-              TORCH_INTERNAL_ASSERT(axes.size() <= 1);
+              TORCH_INTERNAL_ASSERT(
+                  axes.size() <= 1,
+                  buildErrorMessage("Invalid axes size in transpose"));
               return A.load(axes);
             });
       }
@@ -2935,7 +2944,10 @@ bool denseAndNonOverlapping(
 
 Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) {
   const TensorTypePtr& tt = v->type()->expect<TensorType>();
-  TORCH_INTERNAL_ASSERT(bufs_.count(v));
+  TORCH_INTERNAL_ASSERT(
+      bufs_.count(v),
+      buildErrorMessage(
+          "Ouput tensor has no corresponding bufs in the fuser."));
   BufPtr buf = bufs_.at(v);
 
   // No shape info is present in the graph
@@ -2945,13 +2957,17 @@ Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) {
     throw malformed_input(msg);
   }
 
-  TORCH_INTERNAL_ASSERT(tt->sizes().concrete_sizes());
+  TORCH_INTERNAL_ASSERT(
+      tt->sizes().concrete_sizes(),
+      buildErrorMessage("Output shapes are unknown."));
   auto sizes = *tt->sizes().concrete_sizes();
   std::vector<int64_t> default_strides = TensorType::contiguousStridesOf(sizes);
   if (!tt->strides().concrete_sizes()) {
     return Tensor(buf, nullptr);
   }
-  TORCH_INTERNAL_ASSERT(tt->strides().concrete_sizes());
+  TORCH_INTERNAL_ASSERT(
+      tt->strides().concrete_sizes(),
+      buildErrorMessage("Output strides are unknown."));
   const std::vector<int64_t> strides = *tt->strides().concrete_sizes();
   // All Tensors in NNC are layed out in default, contiguous layout.
   // If the output is also default contiguous we don't need to do anything

From c0cdbb1cc53e8b55f26604a84135fc22640dec41 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Thu, 2 Sep 2021 10:56:57 -0700
Subject: [PATCH 477/530] Revert D30468409: Add fx2trt pass for removing
 duplicate output args

Test Plan: revert-hammer

Differential Revision:
D30468409 (https://github.com/pytorch/pytorch/commit/6da7552a8eaae6b85e271bf3edac2fa2ae9f1148)

Original commit changeset: b4d91b76ab5d

fbshipit-source-id: e138dc425fe55ffe3585ea5fac4db476931bafed
---
 .../passes/remove_duplicate_output_args.py    | 133 ------------------
 1 file changed, 133 deletions(-)
 delete mode 100644 torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py

diff --git a/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py b/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py
deleted file mode 100644
index 488ce45c75952..0000000000000
--- a/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env python3
-
-import operator
-import typing as t
-import logging
-import torch.fx as fx
-import dataclasses as dc
-
-
-_LOGGER = logging.getLogger(__name__)
-
-
-def remove_duplicate_output_args(
-    top_level: fx.GraphModule,
-    target_subnets: t.Collection[str]
-) -> t.Mapping[str, "RemoveDuplicateResult"]:
-    """Removes duplicate output args.
-
-    This pass removes duplicate output args from the target subnets and fixes
-    their uses in the top level module where the subnets are called. This pass
-    must be called after acc split on the top-level net and subsequent calls to
-    the acc trace on the subnets.
-
-    This pass will change both the subnets and top level module.
-
-    Returns:
-        a mapping of the target subnet name to its dedupcate result
-    """
-
-    processed_subnets = {}
-    for node in top_level.graph.nodes:
-        node: fx.Node
-        if node.op == "call_module" and node.name in target_subnets:
-            sub_gm = top_level.get_submodule(node.target)
-            assert isinstance(sub_gm, fx.GraphModule)
-
-            replace_res = _remove_duplicate_output_args(sub_gm)
-            processed_subnets[node.name] = replace_res
-            if replace_res.replacement_map is None:
-                continue
-            sub_gm.recompile()
-
-            needs_recompile = False
-            # iterate on the copy since we will be changing elements of node.users
-            for user in list(node.users):
-                idx = _ensure_proper_output_use(user, node)
-                idx_new = replace_res.replacement_map[idx]
-                if idx_new != idx:
-                    user.args = (user.args[0], idx_new)
-                    needs_recompile = True
-
-            if needs_recompile:
-                top_level.recompile()
-    return processed_subnets
-
-
-@dc.dataclass(frozen=True)
-class RemoveDuplicateResult:
-    replacement_map: t.Optional[t.List[int]]
-    module: fx.GraphModule
-
-
-def _ensure_proper_output_use(user: fx.Node, target_node: fx.Node) -> int:
-    """
-    Ensures the node looks in proper form of calling the output of an fx2trt
-    splitter sub-net. Specifically:
-
-    1. op is call function, target: operator.getitem
-    2. args is a 2-element tuple
-    3. args[0] is the name of the subnet's output
-    4. args[1] is the index into the subnet output tuple
-
-    E.g.:
-
-        %getitem_4 : [#users=1] = call_function[target=operator.getitem](args = (%_run_on_acc_1, 4), kwargs = {})
-
-    returns the index into the subnet output tuple
-    """
-    _LOGGER.info(f"Checking user node: {user.format_node()}")
-    assert (
-        user.op == "call_function"
-        and user.target == operator.getitem
-        and len(user.args) == 2
-        and user.args[0].name == target_node.name
-        and isinstance(user.args[1], int)
-    ), f"Node is not a proper user of splitter output: {user.format_node()}"
-
-    return user.args[1]
-
-
-def _remove_duplicate_output_args(gm: fx.GraphModule) -> RemoveDuplicateResult:
-    output_nodes = [n for n in gm.graph.nodes if n.op == "output"]
-    assert len(output_nodes) == 1, \
-           f"Expecting exactly one `output` node, but got {len(output_nodes)}"
-
-    changed = False
-    # arg node name to its index in the new output args tuple
-    name_to_idx: t.Dict[str, int] = {}
-    output_node = output_nodes[0]
-
-    # Output op only uses its `args[0]`, and it does not have `kwargs`.
-    # https://pytorch.org/docs/stable/fx.html#torch.fx.Node
-    args = output_node.args[0]
-
-    # Only concern outselves to the case where the args is an iterable of fx.Node.
-    # Other return cases (e.g., a single value) is possible and we don't handle
-    # that in this pass.
-    if not (isinstance(args, t.Iterable) and all(isinstance(a, fx.Node) for a in args)):
-        return RemoveDuplicateResult(replacement_map=None, module=gm)
-
-    # Map old index of the arg node to the remaining node's idx,
-    # initialized to `i => i`
-    replacement_map: t.List[int] = list(range(len(args)))
-    args_new = []
-    for idx, a in enumerate(args):
-        assert isinstance(a, fx.Node), \
-               f"Expecting fx.Node instance, but got: {type(a)}"
-
-        if a.name not in name_to_idx:
-            args_new.append(a)
-            name_to_idx[a.name] = len(args_new) - 1
-        else:
-            changed = True
-            _LOGGER.warning(
-                f"Replaced duplicate output arg '{a.name}': "
-                f"{idx} -> {name_to_idx[a.name]}"
-            )
-        replacement_map[idx] = name_to_idx[a.name]
-
-    output_node.args = (tuple(args_new),)
-    if changed:
-        gm.recompile()
-    return RemoveDuplicateResult(replacement_map, module=gm)

From 1519b6084f9a215fad407087d7ab2cf55d66b8e0 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Thu, 2 Sep 2021 11:06:34 -0700
Subject: [PATCH 478/530] nn.functional.linear OpInfo (#61971)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61971

Test Plan: - wait for tests

Reviewed By: heitorschueroff

Differential Revision: D30013750

Pulled By: zou3519

fbshipit-source-id: ca41dbd98176c12e50ad1410a658f4b06fe99a1e
---
 .../_internal/common_methods_invocations.py   | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 10aae4146bae8..a45bcf54faba8 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -2647,6 +2647,30 @@ def sample_inputs_hardswish(self, device, dtype, requires_grad):
                requires_grad=requires_grad, low=-5, high=5)) for _ in range(1, N)]
     return tensors
 
+def sample_inputs_linear(self, device, dtype, requires_grad):
+    features_options = [[3, 4], [128, 128]]
+    batch_options: List[List[int]] = [
+        [],  # no batch
+        [0],
+        [64],
+        [5, 7],
+    ]
+    create_tensor = partial(make_tensor, device=device, dtype=dtype,
+                            requires_grad=requires_grad, low=-2, high=2)
+
+    sample_inputs = []
+    for has_bias, (in_feat, out_feat), batch_shape in \
+            itertools.product([True, False], features_options, batch_options):
+        input_tensor = create_tensor(batch_shape + [in_feat])
+        weight = create_tensor([out_feat, in_feat])
+        if not has_bias:
+            sample_inputs.append(SampleInput(input_tensor, args=(weight,)))
+            continue
+
+        bias = create_tensor([out_feat])
+        sample_inputs.append(SampleInput(input_tensor, args=(weight, bias)))
+    return sample_inputs
+
 def sample_inputs_interpolate(mode, self, device, dtype, requires_grad):
     N, C = 2, 3
     D = 4
@@ -7519,6 +7543,17 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            dtypesIfCPU=floating_types_and(torch.int64),
            dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
            sample_inputs_func=sample_inputs_avgpool2d),
+    OpInfo('nn.functional.linear',
+           aten_name='linear',
+           supports_autograd=True,
+           sample_inputs_func=sample_inputs_linear,
+           dtypesIfCPU=all_types_and_complex_and(torch.half, torch.bfloat16),
+           dtypesIfROCM=floating_and_complex_types_and(torch.float16, torch.bfloat16),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []),
+           backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
+                                                                *[torch.bfloat16] if CUDA11OrLater else []),
+           supports_forward_ad=True,
+           supports_out=False),
     UnaryUfuncInfo(
         'nn.functional.logsigmoid',
         aten_name="log_sigmoid",

From c932afe39b28be3b6d232f629ce597efcfd39815 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Thu, 2 Sep 2021 11:23:38 -0700
Subject: [PATCH 479/530] .github: Move upload-artifact-s3 to common var
 (#64435)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64435

Move upload-artifact-s3 to a common variable to be used amongst our
jinja templates, this should make it easier in the future to update
these images

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

cc ezyang seemethere malfet lg20987 pytorch/pytorch-dev-infra

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D30732777

Pulled By: seemethere

fbshipit-source-id: 51cd485f5abae134c3c49dfa878e6303ba8e5f25
---
 .github/templates/common.yml.j2                           | 2 ++
 .github/templates/linux_ci_workflow.yml.j2                | 8 ++++----
 .github/templates/windows_ci_workflow.yml.j2              | 2 +-
 .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml        | 4 ++--
 .../generated-linux-bionic-py3.8-gcc9-coverage.yml        | 4 ++--
 .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml        | 4 ++--
 .../generated-linux-xenial-cuda11.3-py3.6-gcc7.yml        | 4 ++--
 .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml | 4 ++--
 ...enerated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 4 ++--
 .../generated-periodic-win-vs2019-cuda11.1-py3.yml        | 2 +-
 .github/workflows/generated-win-vs2019-cpu-py3.yml        | 2 +-
 .github/workflows/generated-win-vs2019-cuda10.1-py3.yml   | 2 +-
 .github/workflows/generated-win-vs2019-cuda11.3-py3.yml   | 2 +-
 13 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2
index f9296e017a1cf..6757785c3cff4 100644
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@@ -1,3 +1,5 @@
+{%- set upload_artifact_s3_action = "seemethere/upload-artifact-s3@v3" -%}
+
 {%- macro display_ec2_information() -%}
       - name: Display EC2 information
         shell: bash
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 01ea7af15305d..209e9c34a2d83 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -180,7 +180,7 @@ jobs:
       - name: Archive artifacts into zip
         run: |
           zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: !{{ common.upload_artifact_s3_action }}
         name: Store PyTorch Build Artifacts on S3
         with:
           name: ${{ env.BUILD_ENVIRONMENT }}
@@ -356,7 +356,7 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: !{{ common.upload_artifact_s3_action }}
         name: Store PyTorch Test Reports on S3
         if: always()
         with:
@@ -423,7 +423,7 @@ jobs:
         run: |
           # Ensure the working directory gets chowned back to the current user
           docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
-      - uses: seemethere/upload-artifact-s3@v3
+      - uses: !{{ common.upload_artifact_s3_action }}
         name: Upload Python Docs Preview
         if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }}
         with:
@@ -431,7 +431,7 @@ jobs:
           if-no-files-found: error
           path: pytorch.github.io/docs/merge/
           s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}
-      - uses: seemethere/upload-artifact-s3@v3
+      - uses: !{{ common.upload_artifact_s3_action }}
         name: Upload C++ Docs Preview
         if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }}
         with:
diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2
index b927281b84ff8..1268f275cc755 100644
--- a/.github/templates/windows_ci_workflow.yml.j2
+++ b/.github/templates/windows_ci_workflow.yml.j2
@@ -136,7 +136,7 @@ jobs:
           path: C:\${{ github.run_id }}\build-results
       - name: Upload artifacts to s3
         if: always()
-        uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+        uses: !{{ common.upload_artifact_s3_action }}
         with:
           retention-days: 14
           if-no-files-found: error
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index f1c0ea491c415..5dc8d9f2d19de 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -233,7 +233,7 @@ jobs:
       - name: Archive artifacts into zip
         run: |
           zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Build Artifacts on S3
         with:
           name: ${{ env.BUILD_ENVIRONMENT }}
@@ -454,7 +454,7 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Test Reports on S3
         if: always()
         with:
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index e13c6191ac64e..c060b158b6007 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -233,7 +233,7 @@ jobs:
       - name: Archive artifacts into zip
         run: |
           zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Build Artifacts on S3
         with:
           name: ${{ env.BUILD_ENVIRONMENT }}
@@ -458,7 +458,7 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Test Reports on S3
         if: always()
         with:
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index e44dfbe44759a..dfbf9bfd25d36 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -233,7 +233,7 @@ jobs:
       - name: Archive artifacts into zip
         run: |
           zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Build Artifacts on S3
         with:
           name: ${{ env.BUILD_ENVIRONMENT }}
@@ -454,7 +454,7 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Test Reports on S3
         if: always()
         with:
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 618b13148567d..53b09cdf81ffe 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -233,7 +233,7 @@ jobs:
       - name: Archive artifacts into zip
         run: |
           zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Build Artifacts on S3
         with:
           name: ${{ env.BUILD_ENVIRONMENT }}
@@ -454,7 +454,7 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Test Reports on S3
         if: always()
         with:
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 6ceabb04770e6..eb77554cb4a03 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -233,7 +233,7 @@ jobs:
       - name: Archive artifacts into zip
         run: |
           zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Build Artifacts on S3
         with:
           name: ${{ env.BUILD_ENVIRONMENT }}
@@ -454,7 +454,7 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Test Reports on S3
         if: always()
         with:
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 5b60a1bcdf900..694a79ac22c81 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -231,7 +231,7 @@ jobs:
       - name: Archive artifacts into zip
         run: |
           zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Build Artifacts on S3
         with:
           name: ${{ env.BUILD_ENVIRONMENT }}
@@ -452,7 +452,7 @@ jobs:
           if-no-files-found: error
           path:
             test-reports-*.zip
-      - uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+      - uses: seemethere/upload-artifact-s3@v3
         name: Store PyTorch Test Reports on S3
         if: always()
         with:
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
index de74cdc16f889..78e4c7cb42876 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
@@ -107,7 +107,7 @@ jobs:
           path: C:\${{ github.run_id }}\build-results
       - name: Upload artifacts to s3
         if: always()
-        uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+        uses: seemethere/upload-artifact-s3@v3
         with:
           retention-days: 14
           if-no-files-found: error
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index 4f43d2743d18b..bed6da45140e2 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -99,7 +99,7 @@ jobs:
           path: C:\${{ github.run_id }}\build-results
       - name: Upload artifacts to s3
         if: always()
-        uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+        uses: seemethere/upload-artifact-s3@v3
         with:
           retention-days: 14
           if-no-files-found: error
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index ad4cf37f958e1..f5b2e6a96e2b4 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -109,7 +109,7 @@ jobs:
           path: C:\${{ github.run_id }}\build-results
       - name: Upload artifacts to s3
         if: always()
-        uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+        uses: seemethere/upload-artifact-s3@v3
         with:
           retention-days: 14
           if-no-files-found: error
diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index 299b48d66a411..bf1cbe4119483 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -109,7 +109,7 @@ jobs:
           path: C:\${{ github.run_id }}\build-results
       - name: Upload artifacts to s3
         if: always()
-        uses: seemethere/upload-artifact-s3@9d7ceb0ab39c2c88d93ef7792b27425b27d59162
+        uses: seemethere/upload-artifact-s3@v3
         with:
           retention-days: 14
           if-no-files-found: error

From 22f3bcd1643a120d4f3b7c5df59daf668bbbe746 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Thu, 2 Sep 2021 11:23:38 -0700
Subject: [PATCH 480/530] .github: Move squid vars to common vars (#64436)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64436

Moves the squid variables to our common jinja template so that when we
have to update them they're all in the same place.

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

cc ezyang seemethere malfet lg20987 pytorch/pytorch-dev-infra

Test Plan: Imported from OSS

Reviewed By: malfet, zhouzhuojie

Differential Revision: D30732776

Pulled By: seemethere

fbshipit-source-id: 22e3757c4eec775baa8abbaac2ba2a0c69c2b2a9
---
 .github/templates/bazel_ci_workflow.yml.j2         |  4 ++--
 .github/templates/common.yml.j2                    |  5 +++++
 .github/templates/linux_ci_workflow.yml.j2         |  8 ++------
 .github/templates/windows_ci_workflow.yml.j2       | 14 +++++---------
 ...d-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml |  3 +--
 ...d-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml |  3 +--
 .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml |  3 +--
 .../generated-linux-bionic-py3.8-gcc9-coverage.yml |  3 +--
 .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml |  3 +--
 .../generated-linux-xenial-cuda11.3-py3.6-gcc7.yml |  3 +--
 .../generated-linux-xenial-py3.6-gcc5.4.yml        |  3 +--
 ...enerated-linux-xenial-py3.6-gcc7-bazel-test.yml |  1 -
 ...c-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml |  3 +--
 ...d-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml |  3 +--
 .../generated-periodic-win-vs2019-cuda11.1-py3.yml |  1 -
 .github/workflows/generated-win-vs2019-cpu-py3.yml |  1 -
 .../generated-win-vs2019-cuda10.1-py3.yml          |  1 -
 .../generated-win-vs2019-cuda11.3-py3.yml          |  1 -
 18 files changed, 23 insertions(+), 40 deletions(-)

diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2
index 57b4567876e35..9f982cdd5cb61 100644
--- a/.github/templates/bazel_ci_workflow.yml.j2
+++ b/.github/templates/bazel_ci_workflow.yml.j2
@@ -65,7 +65,7 @@ on:
             -e PR_LABELS \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
+            -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -112,7 +112,7 @@ on:
             -e SCCACHE_BUCKET \
             -e CONTINUE_THROUGH_ERROR \
             -e PR_LABELS \
-            -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
+            -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2
index 6757785c3cff4..07ad771346399 100644
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@@ -1,5 +1,10 @@
 {%- set upload_artifact_s3_action = "seemethere/upload-artifact-s3@v3" -%}
 
+{# squid_proxy is an private ELB that only available for GHA custom runners #}
+{%- set squid_proxy    = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%}
+{# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #}
+{%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%}
+
 {%- macro display_ec2_information() -%}
       - name: Display EC2 information
         shell: bash
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 209e9c34a2d83..8aa854782de52 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -1,8 +1,4 @@
 {% import 'common.yml.j2' as common %}
-{# squid_proxy is an private ELB that only available for GHA custom runners #}
-{%- set squid_proxy    = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%}
-{# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #}
-{%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%}
 
 {%- block name -%}
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
@@ -144,8 +140,8 @@ jobs:
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
             -e PR_LABELS \
+            -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
@@ -316,7 +312,7 @@ jobs:
             -e CONTINUE_THROUGH_ERROR \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
-            -e http_proxy="!{{squid_proxy}}" -e https_proxy="!{{squid_proxy}}" -e no_proxy="!{{squid_no_proxy}}" \
+            -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2
index 1268f275cc755..05d739db523e3 100644
--- a/.github/templates/windows_ci_workflow.yml.j2
+++ b/.github/templates/windows_ci_workflow.yml.j2
@@ -1,8 +1,4 @@
 {% import 'common.yml.j2' as common %}
-{# squid_proxy is an private ELB that only available for GHA custom runners #}
-{%- set squid_proxy    = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%}
-{# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #}
-{%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%}
 
 {%- macro wait_and_kill_ssh() -%}
       - name: Wait until all sessions have drained
@@ -58,7 +54,7 @@ env:
   VS_VERSION: "16.8.6"
   VC_YEAR: "2019"
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
-  no_proxy: !{{ squid_no_proxy }}
+  no_proxy: !{{ common.squid_no_proxy }}
 {%- if cuda_version != "cpu" %}
   TORCH_CUDA_ARCH_LIST: "7.0"
   USE_CUDA: 1
@@ -88,8 +84,8 @@ jobs:
     {%- endif %}
     env:
       JOB_BASE_NAME: !{{ build_environment }}-build
-      http_proxy: "!{{ squid_proxy }}"
-      https_proxy: "!{{ squid_proxy }}"
+      http_proxy: "!{{ common. squid_proxy }}"
+      https_proxy: "!{{ common.squid_proxy }}"
     steps:
       - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
         uses: seemethere/add-github-ssh-key@v1
@@ -192,8 +188,8 @@ jobs:
       SHARD_NUMBER: ${{ matrix.shard }}
       NUM_TEST_SHARDS: ${{ matrix.num_shards }}
       TEST_CONFIG: ${{ matrix.config }}
-      http_proxy: "!{{ squid_proxy }}"
-      https_proxy: "!{{ squid_proxy }}"
+      http_proxy: "!{{ common.squid_proxy }}"
+      https_proxy: "!{{ common.squid_proxy }}"
       RUN_SMOKE_TESTS_ONLY_ON_PR: !{{ only_run_smoke_tests_on_pull_request }}
       PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
       CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 4a434ac9772a2..396284cf72f84 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: libtorch-linux-xenial-cuda10.2-py3.6-gcc7
@@ -196,8 +195,8 @@ jobs:
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 34fd21e15dcde..cb0a98591956b 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: libtorch-linux-xenial-cuda11.3-py3.6-gcc7
@@ -196,8 +195,8 @@ jobs:
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 5dc8d9f2d19de..ddb38b7c84d5a 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-bionic-cuda10.2-py3.9-gcc7
@@ -196,8 +195,8 @@ jobs:
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index c060b158b6007..c4cdd2cd636d5 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-bionic-py3.8-gcc9-coverage
@@ -196,8 +195,8 @@ jobs:
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index dfbf9bfd25d36..2d0d916237676 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-xenial-cuda10.2-py3.6-gcc7
@@ -196,8 +195,8 @@ jobs:
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 53b09cdf81ffe..0e1c7ba6ca6c5 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-xenial-cuda11.3-py3.6-gcc7
@@ -196,8 +195,8 @@ jobs:
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index eb77554cb4a03..6f4e5c2958904 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-xenial-py3.6-gcc5.4
@@ -196,8 +195,8 @@ jobs:
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 4bfe1d1211786..2d72fab281f6e 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/bazel_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: linux-xenial-py3.6-gcc7-bazel-test
diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
index ce4fe5bc0e250..b009d77b3a8d8 100644
--- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7
@@ -194,8 +193,8 @@ jobs:
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 694a79ac22c81..738e6bb146cdd 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/linux_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: periodic-linux-xenial-cuda11.1-py3.6-gcc7
@@ -194,8 +193,8 @@ jobs:
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
-            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
index 78e4c7cb42876..61c63f42cadf6 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: periodic-win-vs2019-cuda11.1-py3
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index bed6da45140e2..6ef8e85ac931b 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: win-vs2019-cpu-py3
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index f5b2e6a96e2b4..c158f08731d99 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: win-vs2019-cuda10.1-py3
diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index bf1cbe4119483..bd945c3255a0a 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -1,5 +1,4 @@
 # @generated DO NOT EDIT MANUALLY
-
 # Template is at:    .github/templates/windows_ci_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: win-vs2019-cuda11.3-py3

From d0cb26ba575e489c3835c7741a2f94ea3f365c3f Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Thu, 2 Sep 2021 11:37:54 -0700
Subject: [PATCH 481/530] [DDP] Fix logging iterations (#64411)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64411

These are not actually the training iterations, but are offset by how
frequently DDP stats collection actually runs (default being
kDDPRuntimeLoggingSampleRate = 100). So with this change, they are actually
logged to scuba every:
10, 10 * 100, 40 * 100, etc iterations.

Test Plan: CI

Reviewed By: zhaojuanmao

Differential Revision: D30718274

fbshipit-source-id: 146bd2428753c93363bee37e487f40104fce3c18
---
 torch/csrc/distributed/c10d/logger.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp
index b1efd0b238378..92e16614a6612 100644
--- a/torch/csrc/distributed/c10d/logger.cpp
+++ b/torch/csrc/distributed/c10d/logger.cpp
@@ -10,9 +10,12 @@
 
 namespace c10d {
 
-// When training runs at these iterations, log the runtime
-// stats.
-const int LoggingIterations[] = {10, 20, 100, 1000, 5000, 10000, 20000}; // NOLINT
+// Logs runtime stats to configured destination. Note that since data collection
+// only runs every ddp_runtime_logging_sample_rate iterations, the actual
+// training iterations recorded will be like 10,
+// (20-10) * ddp_runtime_logging_sample_rate,
+// (50-10) * ddp_runtime_logging_sample_rate and so on.
+const int LoggingIterations[] = {10, 20, 50, 100, 500, 800, 1000}; // NOLINT
 
 std::ostream& operator<<(std::ostream& output, const Logger& logger) {
   auto& ddp_logging_data = (*logger.ddp_logging_data_);

From 7d010539c9b6356cbaae8f7accc5b5cf8cc0d0cc Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" <thomasjpfan@gmail.com>
Date: Thu, 2 Sep 2021 12:15:03 -0700
Subject: [PATCH 482/530] ENH Adds test and docs for modules that already
 support no batch dims (#62729)

Summary:
Towards https://github.com/pytorch/pytorch/issues/60585

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62729

Reviewed By: H-Huang

Differential Revision: D30669546

Pulled By: jbschlosser

fbshipit-source-id: c771c98c1fd9d28fa984b72893585c738c736505
---
 test/cpp_api_parity/parity-tracker.md        |  1 +
 torch/csrc/api/src/nn/modules/activation.cpp |  4 +-
 torch/nn/modules/activation.py               |  8 ++--
 torch/testing/_internal/common_nn.py         | 50 +++++++++++++++++++-
 4 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/test/cpp_api_parity/parity-tracker.md b/test/cpp_api_parity/parity-tracker.md
index 869ef300f6c85..88e1848f7da78 100644
--- a/test/cpp_api_parity/parity-tracker.md
+++ b/test/cpp_api_parity/parity-tracker.md
@@ -99,6 +99,7 @@ torch::nn::Identity|Yes|No
 torch::nn::Linear|Yes|No
 torch::nn::Bilinear|Yes|No
 torch::nn::Flatten|Yes|No
+torch::nn::Unflatten|Yes|No
 torch::nn::Dropout|Yes|No
 torch::nn::Dropout2d|Yes|No
 torch::nn::Dropout3d|Yes|No
diff --git a/torch/csrc/api/src/nn/modules/activation.cpp b/torch/csrc/api/src/nn/modules/activation.cpp
index 3c4d2b8c98f50..e724a75c58ec9 100644
--- a/torch/csrc/api/src/nn/modules/activation.cpp
+++ b/torch/csrc/api/src/nn/modules/activation.cpp
@@ -170,8 +170,8 @@ void Softmax2dImpl::pretty_print(std::ostream& stream) const {
 }
 
 Tensor Softmax2dImpl::forward(const Tensor& input) {
-  TORCH_CHECK(input.dim() == 4, "Softmax2d requires a 4D tensor as input");
-  return F::detail::softmax(input, /*dim=*/1, c10::nullopt);
+  TORCH_CHECK(input.dim() == 4 || input.dim() == 3, "Softmax2d requires a 3D or 4D tensor as input");
+  return F::detail::softmax(input, /*dim=*/-3, c10::nullopt);
 }
 
 // ============================================================================
diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py
index 91427c8aea2cd..90b901d9b690a 100644
--- a/torch/nn/modules/activation.py
+++ b/torch/nn/modules/activation.py
@@ -1236,8 +1236,8 @@ class Softmax2d(Module):
     apply `Softmax` to each location :math:`(Channels, h_i, w_j)`
 
     Shape:
-        - Input: :math:`(N, C, H, W)`
-        - Output: :math:`(N, C, H, W)` (same shape as input)
+        - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`.
+        - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input)
 
     Returns:
         a Tensor of the same dimension and shape as the input with
@@ -1252,8 +1252,8 @@ class Softmax2d(Module):
     """
 
     def forward(self, input: Tensor) -> Tensor:
-        assert input.dim() == 4, 'Softmax2d requires a 4D tensor as input'
-        return F.softmax(input, 1, _stacklevel=5)
+        assert input.dim() == 4 or input.dim() == 3, 'Softmax2d requires a 3D or 4D tensor as input'
+        return F.softmax(input, -3, _stacklevel=5)
 
 
 class LogSoftmax(Module):
diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py
index 73233df8cc5bb..b22b6ab1d2ec5 100644
--- a/torch/testing/_internal/common_nn.py
+++ b/torch/testing/_internal/common_nn.py
@@ -3659,6 +3659,28 @@ def single_batch_reference_fn(input, parameters, module):
         fullname='log_softmax_scalar',
         pickle=False,
     ),
+    dict(
+        module_name='Softmax2d',
+        input_size=(3, 4, 5),
+        reference_fn=single_batch_reference_fn,
+        desc='no_batch_dim',
+    ),
+    dict(
+        module_name='Softmax',
+        constructor_args=(-1,),
+        cpp_constructor_args='torch::nn::SoftmaxOptions(-1)',
+        input_size=(4, 5),
+        reference_fn=single_batch_reference_fn,
+        desc='no_batch_dim',
+    ),
+    dict(
+        module_name='LogSoftmax',
+        constructor_args=(-1,),
+        cpp_constructor_args='torch::nn::LogSoftmaxOptions(1)',
+        input_size=(4, 5),
+        reference_fn=single_batch_reference_fn,
+        desc='no_batch_dim',
+    ),
 
 
     dict(
@@ -3819,6 +3841,14 @@ def single_batch_reference_fn(input, parameters, module):
         input_size=(),
         desc='scalar',
     ),
+    dict(
+        module_name='Softmin',
+        constructor_args=(-1,),
+        cpp_constructor_args='torch::nn::SoftminOptions(-1)',
+        input_size=(3, 4, 10),
+        reference_fn=single_batch_reference_fn,
+        desc='no_batch_dim',
+    ),
     dict(
         module_name='Tanhshrink',
         input_size=(),
@@ -3985,6 +4015,22 @@ def single_batch_reference_fn(input, parameters, module):
         with_tf32=True,
         tf32_precision=0.005,
     ),
+    dict(
+        module_name='Flatten',
+        cpp_constructor_args='torch::nn::FlattenOptions().start_dim(-3).end_dim(-1)',
+        constructor_args=(-3, -1),
+        input_size=(3, 4, 5),
+        reference_fn=single_batch_reference_fn,
+        desc="no_batch_dim",
+    ),
+    dict(
+        module_name='Unflatten',
+        cpp_constructor_args='torch::nn::UnflattenOptions(-2, {2, 2})',
+        constructor_args=(-2, torch.Size([2, 2])),
+        input_size=(3, 4, 5),
+        reference_fn=single_batch_reference_fn,
+        desc="no_batch_dim",
+    ),
 ]
 
 # add conv padding mode tests:
@@ -4027,7 +4073,7 @@ def single_batch_reference_fn(input, parameters, module):
 # Check that non linear activations work with no batch dimensions
 non_linear_activations_no_batch = [
     'ELU', 'Hardshrink', 'Hardsigmoid', 'Hardtanh', 'Hardswish', 'LeakyReLU',
-    'LogSigmoid', 'PReLU', 'ReLU', 'ReLU6', 'RReLU', 'SELU', 'CELU', 'GELU',
+    'LogSigmoid', 'PReLU', 'ReLU', 'ReLU6', 'RReLU', 'SELU', 'CELU', 'GELU', 'GLU',
     'Sigmoid', 'SiLU', 'Mish', 'Softplus', 'Softshrink', 'Softsign', 'Tanh',
     'Tanhshrink', 'Threshold'
 ]
@@ -4043,7 +4089,7 @@ def single_batch_reference_fn(input, parameters, module):
 for non_linear_activation in non_linear_activations_no_batch:
     activation_test_info = dict(
         module_name=non_linear_activation,
-        input_size=(3,),
+        input_size=(4,),
         reference_fn=single_batch_reference_fn,
         desc='no_batch_dim',
         test_cpp_api_parity=False,

From 3cd0a4ac153ea8404f0e960ddacf00098689e600 Mon Sep 17 00:00:00 2001
From: Erjia Guan <erjia@fb.com>
Date: Thu, 2 Sep 2021 12:25:15 -0700
Subject: [PATCH 483/530] Fix test_ind_worker_queue by setting max_num_worker
 based on system resource (#63779)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63779

Fixes #63657

Test Plan: Imported from OSS

Reviewed By: gchanan

Differential Revision: D30494185

Pulled By: ejguan

fbshipit-source-id: d1bd24299b25d589889604aaf18ad347bdff4df4
---
 test/test_dataloader.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index c768246ff477c..5050feca3a373 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -2320,9 +2320,24 @@ def _run_ind_worker_queue_test(self, batch_size, num_workers):
                 current_worker_idx = 0
 
     def test_ind_worker_queue(self):
+        max_num_workers = None
+        if hasattr(os, 'sched_getaffinity'):
+            try:
+                max_num_workers = len(os.sched_getaffinity(0))
+            except Exception:
+                pass
+        if max_num_workers is None:
+            cpu_count = os.cpu_count()
+            if cpu_count is not None:
+                # Use half number of CPUs
+                max_num_workers = cpu_count // 2
+
+        if max_num_workers is None:
+            max_num_workers = 1
+
         for batch_size in (8, 16, 32, 64):
-            for num_workers in range(1, 6):
-                self._run_ind_worker_queue_test(batch_size=batch_size, num_workers=num_workers)
+            for num_workers in range(0, min(6, max_num_workers)):
+                self._run_ind_worker_queue_test(batch_size=batch_size, num_workers=num_workers + 1)
 
 
 class SetAffinityDataset(IterableDataset):

From 4f434801866a60823124f3c2bd672d2005220c9c Mon Sep 17 00:00:00 2001
From: Kevin Tse <ktse@fb.com>
Date: Thu, 2 Sep 2021 13:06:18 -0700
Subject: [PATCH 484/530] [DataPipe] adding/removing __len__ for different
 DataPipe (#64398)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64398

cc VitalyFedyunin ejguan

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D30710437

Pulled By: NivekT

fbshipit-source-id: 524eda43a2faa0db0c1a662bf9bb4283f0ade83c
---
 test/test_datapipe.py                         | 21 +++++++++++++++++++
 torch/utils/data/datapipes/iter/grouping.py   |  6 ++++++
 torch/utils/data/datapipes/iter/httpreader.py |  7 ++++++-
 torch/utils/data/datapipes/iter/selecting.py  |  1 -
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index 24d0ce20d63dd..f09583b722379 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -1626,6 +1626,27 @@ def test_simple_sharding(self):
 
         self.assertEqual(sorted(all_items), sorted(items))
 
+    def test_sharding_length(self):
+        numbers_dp = IDP(range(13))
+        sharded_dp0 = numbers_dp.sharding_filter()
+        torch.utils.data.sharding.apply_sharding(sharded_dp0, 3, 0)
+        sharded_dp1 = numbers_dp.sharding_filter()
+        torch.utils.data.sharding.apply_sharding(sharded_dp1, 3, 1)
+        sharded_dp2 = numbers_dp.sharding_filter()
+        torch.utils.data.sharding.apply_sharding(sharded_dp2, 3, 2)
+        self.assertEqual(13, len(numbers_dp))
+        self.assertEqual(5, len(sharded_dp0))
+        self.assertEqual(4, len(sharded_dp1))
+        self.assertEqual(4, len(sharded_dp2))
+
+        numbers_dp = IDP(range(1))
+        sharded_dp0 = numbers_dp.sharding_filter()
+        torch.utils.data.sharding.apply_sharding(sharded_dp0, 2, 0)
+        sharded_dp1 = numbers_dp.sharding_filter()
+        torch.utils.data.sharding.apply_sharding(sharded_dp1, 2, 1)
+        self.assertEqual(1, len(sharded_dp0))
+        self.assertEqual(0, len(sharded_dp1))
+
     @skipIfNoDill
     def test_old_dataloader(self):
         dp = self._get_pipeline()
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index aece256d10650..d90ad08814ecf 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -28,6 +28,12 @@ def __iter__(self):
             if i % self.num_of_instances == self.instance_id:
                 yield item
 
+    def __len__(self):
+        if isinstance(self.source_datapipe, Sized):
+            return len(self.source_datapipe) // self.num_of_instances +\
+                (1 if (self.instance_id < len(self.source_datapipe) % self.num_of_instances) else 0)
+        raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))
+
 
 @functional_datapipe('batch')
 class BatcherIterDataPipe(IterDataPipe[DataChunk]):
diff --git a/torch/utils/data/datapipes/iter/httpreader.py b/torch/utils/data/datapipes/iter/httpreader.py
index 747b5d567e4cd..0c8e2fc818e9f 100644
--- a/torch/utils/data/datapipes/iter/httpreader.py
+++ b/torch/utils/data/datapipes/iter/httpreader.py
@@ -1,5 +1,5 @@
 from io import IOBase
-from typing import Tuple
+from typing import Sized, Tuple
 from urllib.error import HTTPError, URLError
 import urllib.request as urllib
 from torch.utils.data import IterDataPipe
@@ -39,3 +39,8 @@ def __iter__(self):
                                 .format(reason=e.reason, url=furl))
             except Exception:
                 raise
+
+    def __len__(self) -> int:
+        if isinstance(self.datapipe, Sized):
+            return len(self.datapipe)
+        raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))
diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py
index a89bfdfb39e5c..4e8703c8d3973 100644
--- a/torch/utils/data/datapipes/iter/selecting.py
+++ b/torch/utils/data/datapipes/iter/selecting.py
@@ -77,6 +77,5 @@ def _isNonEmpty(self, data):
             not (isinstance(data, list) and len(data) == 0 and self.drop_empty_batches)
         return r
 
-
     def __len__(self):
         raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))

From 4ce9c530d681fd4c860cf78f4497a17df5bdc018 Mon Sep 17 00:00:00 2001
From: Kevin Tse <ktse@fb.com>
Date: Thu, 2 Sep 2021 13:06:18 -0700
Subject: [PATCH 485/530] [DataPipe] removing filter's inheritance from map
 (#64404)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64404

This PR remove `filter`'s inheritance from `map`. This allows `filter` to not have a `__len__` function and that behavior is what we would like.

cc VitalyFedyunin ejguan

Test Plan: Imported from OSS

Reviewed By: gchanan

Differential Revision: D30713120

Pulled By: NivekT

fbshipit-source-id: 4d5d07555297ee2bd4b49842c0d26cdc00638f6c
---
 test/test_datapipe.py                        |  2 +-
 torch/utils/data/datapipes/iter/selecting.py | 54 ++++++++++++++++----
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index f09583b722379..15cb05986b518 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -1033,7 +1033,7 @@ def _filter_fn(data, val, clip=False):
         for data, exp in zip(filter_dp, range(5, 10)):
             self.assertEqual(data, exp)
 
-        with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"):
+        with self.assertRaisesRegex(TypeError, r"has no len"):
             len(filter_dp)
 
         def _non_bool_fn(data):
diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py
index 4e8703c8d3973..f1889e5d7a8e4 100644
--- a/torch/utils/data/datapipes/iter/selecting.py
+++ b/torch/utils/data/datapipes/iter/selecting.py
@@ -1,13 +1,24 @@
+import warnings
 from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk
 from typing import Callable, TypeVar, Iterator, Optional, Tuple, Dict
 
-from .callable import MapperIterDataPipe
-
 T_co = TypeVar('T_co', covariant=True)
 
+try:
+    import dill
+
+    # XXX: By default, dill writes the Pickler dispatch table to inject its
+    # own logic there. This globally affects the behavior of the standard library
+    # pickler for any user who transitively depends on this module!
+    # Undo this extension to avoid altering the behavior of the pickler globally.
+    dill.extend(use_dill=False)
+    DILL_AVAILABLE = True
+except ImportError:
+    DILL_AVAILABLE = False
+
 
 @functional_datapipe('filter')
-class FilterIterDataPipe(MapperIterDataPipe):
+class FilterIterDataPipe(IterDataPipe[T_co]):
     r""" :class:`FilterIterDataPipe`.
 
     Iterable DataPipe to filter elements from datapipe according to filter_fn.
@@ -22,18 +33,31 @@ class FilterIterDataPipe(MapperIterDataPipe):
             This also accepts -1 as input to apply filtering to the lowest nesting level.
             It currently doesn't support argument < -1.
     """
+    datapipe: IterDataPipe
+    filter_fn: Callable
     drop_empty_batches: bool
 
     def __init__(self,
-                 datapipe: IterDataPipe[T_co],
-                 filter_fn: Callable[..., bool],
+                 datapipe: IterDataPipe,
+                 filter_fn: Callable,
                  fn_args: Optional[Tuple] = None,
                  fn_kwargs: Optional[Dict] = None,
                  drop_empty_batches: bool = True,
                  nesting_level: int = 0,
                  ) -> None:
+        super().__init__()
+        self.datapipe = datapipe
+        # Partial object has no attribute '__name__', but can be pickled
+        if hasattr(filter_fn, '__name__') and filter_fn.__name__ == '<lambda>' and not DILL_AVAILABLE:
+            warnings.warn("Lambda function is not supported for pickle, please use "
+                          "regular python function or functools.partial instead.")
+        self.filter_fn = filter_fn  # type: ignore[assignment]
+        self.args = () if fn_args is None else fn_args
+        self.kwargs = {} if fn_kwargs is None else fn_kwargs
+        if nesting_level < -1:
+            raise ValueError("nesting_level must be -1 or >= 0")
+        self.nesting_level = nesting_level
         self.drop_empty_batches = drop_empty_batches
-        super().__init__(datapipe, fn=filter_fn, fn_args=fn_args, fn_kwargs=fn_kwargs, nesting_level=nesting_level)
 
     def __iter__(self) -> Iterator[T_co]:
         res: bool
@@ -66,7 +90,7 @@ def _applyFilter(self, data, nesting_level):
                 return self._returnIfTrue(data)
 
     def _returnIfTrue(self, data):
-        condition = self.fn(data, *self.args, **self.kwargs)
+        condition = self.filter_fn(data, *self.args, **self.kwargs)
         if not isinstance(condition, bool):
             raise ValueError("Boolean output is required for `filter_fn` of FilterIterDataPipe")
         if condition:
@@ -77,5 +101,17 @@ def _isNonEmpty(self, data):
             not (isinstance(data, list) and len(data) == 0 and self.drop_empty_batches)
         return r
 
-    def __len__(self):
-        raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))
+    def __getstate__(self):
+        if DILL_AVAILABLE:
+            dill_function = dill.dumps(self.filter_fn)
+        else:
+            dill_function = self.filter_fn
+        state = (self.datapipe, dill_function, self.args, self.kwargs, self.drop_empty_batches, self.nesting_level)
+        return state
+
+    def __setstate__(self, state):
+        (self.datapipe, dill_function, self.args, self.kwargs, self.drop_empty_batches, self.nesting_level) = state
+        if DILL_AVAILABLE:
+            self.filter_fn = dill.loads(dill_function)  # type: ignore[assignment]
+        else:
+            self.filter_fn = dill_function  # type: ignore[assignment]

From f555348aaa7abdbcaaa7bfdb06b33c2edf93b172 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Thu, 2 Sep 2021 13:30:51 -0700
Subject: [PATCH 486/530] Disable CircleCI ROCm build (#64434)

Summary:
Per jithunnair-amd suggestion

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64434

Reviewed By: seemethere, janeyx99

Differential Revision: D30732289

Pulled By: malfet

fbshipit-source-id: 1932d0a7d1e648006f8030c8237b187d0709f688
---
 .circleci/cimodel/data/pytorch_build_data.py | 15 ++++++++-------
 .circleci/config.yml                         |  8 --------
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py
index df0cfa0027554..7d43a73f622a0 100644
--- a/.circleci/cimodel/data/pytorch_build_data.py
+++ b/.circleci/cimodel/data/pytorch_build_data.py
@@ -63,13 +63,14 @@
                 ]),
             ]),
         ]),
-        ("rocm", [
-            ("3.9", [
-                ("3.6", [
-                    ('build_only', [XImportant(True)]),
-                ]),
-            ]),
-        ]),
+        # @jithunnair-amd believes Jenkins builds are sufficient
+        # ("rocm", [
+        #     ("3.9", [
+        #         ("3.6", [
+        #             ('build_only', [XImportant(True)]),
+        #         ]),
+        #     ]),
+        # ]),
     ]),
 ]
 
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9989f1a289b7d..3e175764cad1e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7446,14 +7446,6 @@ workflows:
           build_environment: "pytorch-vulkan-linux-bionic-py3.6-clang9-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9"
           resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_linux_bionic_rocm3_9_py3_6_build
-          requires:
-            - "docker-pytorch-linux-bionic-rocm3.9-py3.6"
-          build_environment: "pytorch-linux-bionic-rocm3.9-py3.6-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm3.9-py3.6"
-          resource_class: xlarge
-          build_only: "1"
       - pytorch_macos_10_15_py3_build:
           name: pytorch_macos_10_15_py3_build
       - pytorch_macos_10_13_py3_build:

From 257623da39c9ecc63025e90a418852ef3200b57f Mon Sep 17 00:00:00 2001
From: Erjia Guan <erjia@fb.com>
Date: Thu, 2 Sep 2021 13:35:05 -0700
Subject: [PATCH 487/530] Switch Shuffler to use iter-local buffer (#64195)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64195

Test Plan: Imported from OSS

Reviewed By: H-Huang

Differential Revision: D30642947

Pulled By: ejguan

fbshipit-source-id: d4b52479b4ae37ad693388b9cdb8eed83a136474
---
 .../data/datapipes/iter/combinatorics.py      | 27 +++++++++----------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py
index 4d6fac749729d..5e17a3ef56c33 100644
--- a/torch/utils/data/datapipes/iter/combinatorics.py
+++ b/torch/utils/data/datapipes/iter/combinatorics.py
@@ -1,7 +1,7 @@
 import random
 
 from torch.utils.data import IterDataPipe, Sampler, SequentialSampler, functional_datapipe
-from typing import TypeVar, Type, Iterator, Sized, Optional, Tuple, Dict, List
+from typing import Dict, Iterator, List, Optional, Sized, Tuple, Type, TypeVar
 
 T_co = TypeVar('T_co', covariant=True)
 
@@ -72,7 +72,6 @@ class ShufflerIterDataPipe(IterDataPipe[T_co]):
     """
     datapipe: IterDataPipe[T_co]
     buffer_size: int
-    _buffer: List[T_co]
 
     def __init__(self,
                  datapipe: IterDataPipe[T_co],
@@ -87,24 +86,24 @@ def __init__(self,
         else:
             self.datapipe = datapipe.unbatch(unbatch_level=unbatch_level)
         self.buffer_size = buffer_size
-        self._buffer = []
 
-    def buffer_replace(self, x):
-        idx = random.randint(0, self.buffer_size - 1)
-        val = self._buffer[idx]
-        self._buffer[idx] = x
+    @staticmethod
+    def buffer_replace(buffer, x):
+        idx = random.randint(0, len(buffer) - 1)
+        val = buffer[idx]
+        buffer[idx] = x
         return val
 
     def __iter__(self) -> Iterator[T_co]:
-        # TODO: Buffer is global, should be per __iter__ !!!
+        buffer: List[T_co] = []
         for x in self.datapipe:
-            if len(self._buffer) == self.buffer_size:
-                yield self.buffer_replace(x)
+            if len(buffer) == self.buffer_size:
+                yield ShufflerIterDataPipe.buffer_replace(buffer, x)
             else:
-                self._buffer.append(x)
-        random.shuffle(self._buffer)
-        while self._buffer:
-            yield self._buffer.pop()
+                buffer.append(x)
+        random.shuffle(buffer)
+        while buffer:
+            yield buffer.pop()
 
     def __len__(self) -> int:
         if isinstance(self.datapipe, Sized):

From 3c79e0b314c56c01e119b22e834922923a63ad9e Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Thu, 2 Sep 2021 14:49:47 -0700
Subject: [PATCH 488/530] .github: Migrate pytorch_linux_bionic_py_3_6_clang9
 to GHA (#64218)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64218

Relies on https://github.com/fairinternal/pytorch-gha-infra/pull/11

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

cc ezyang seemethere malfet walterddr lg20987 pytorch/pytorch-dev-infra bdhirsh

Test Plan: Imported from OSS

Reviewed By: malfet, H-Huang, janeyx99

Differential Revision: D30651516

Pulled By: seemethere

fbshipit-source-id: e5843dfe84f096f2872d88f2e53e9408ad2fe399
---
 .circleci/cimodel/data/pytorch_build_data.py  |   5 -
 .circleci/config.yml                          |  13 -
 .github/generated-ciflow-ruleset.json         |  10 +
 .github/scripts/generate_ci_workflows.py      |  36 +-
 .../scripts/generate_pytorch_test_matrix.py   |   4 +
 .github/templates/linux_ci_workflow.yml.j2    |   5 +
 ...torch-linux-xenial-cuda10.2-py3.6-gcc7.yml |   2 +
 ...torch-linux-xenial-cuda11.3-py3.6-gcc7.yml |   2 +
 ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml |   5 +
 .../generated-linux-bionic-py3.6-clang9.yml   | 506 ++++++++++++++++++
 ...rated-linux-bionic-py3.8-gcc9-coverage.yml |   5 +
 ...rated-linux-xenial-cuda10.2-py3.6-gcc7.yml |   5 +
 ...rated-linux-xenial-cuda11.3-py3.6-gcc7.yml |   5 +
 .../generated-linux-xenial-py3.6-gcc5.4.yml   |   5 +
 ...ted-linux-xenial-py3.6-gcc7-bazel-test.yml |   1 +
 ...torch-linux-xenial-cuda11.1-py3.6-gcc7.yml |   2 +
 ...iodic-linux-xenial-cuda11.1-py3.6-gcc7.yml |   5 +
 17 files changed, 580 insertions(+), 36 deletions(-)
 create mode 100644 .github/workflows/generated-linux-bionic-py3.6-clang9.yml

diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py
index 7d43a73f622a0..dbe17bf4f15f5 100644
--- a/.circleci/cimodel/data/pytorch_build_data.py
+++ b/.circleci/cimodel/data/pytorch_build_data.py
@@ -51,11 +51,6 @@
     ]),
     ("bionic", [
         ("clang", [
-            ("9", [
-                ("3.6", [
-                    ("noarch", [XImportant(True)]),
-                ]),
-            ]),
             ("9", [
                 ("3.6", [
                     ("xla", [XImportant(True)]),
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 3e175764cad1e..ffc67a14ec5ad 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7407,19 +7407,6 @@ workflows:
           build_environment: "pytorch-linux-pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_distributed-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
           resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_linux_bionic_py3_6_clang9_noarch_build
-          requires:
-            - "docker-pytorch-linux-bionic-py3.6-clang9"
-          build_environment: "pytorch-linux-bionic-py3.6-clang9-noarch-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9"
-      - pytorch_linux_test:
-          name: pytorch_linux_bionic_py3_6_clang9_noarch_test
-          requires:
-            - pytorch_linux_bionic_py3_6_clang9_noarch_build
-          build_environment: "pytorch-linux-bionic-py3.6-clang9-noarch-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9"
-          resource_class: large
       - pytorch_linux_build:
           name: pytorch_xla_linux_bionic_py3_6_clang9_build
           requires:
diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json
index d13561190d01f..d3ebad35a5303 100644
--- a/.github/generated-ciflow-ruleset.json
+++ b/.github/generated-ciflow-ruleset.json
@@ -5,6 +5,7 @@
       "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
       "libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-bionic-cuda10.2-py3.9-gcc7",
+      "linux-bionic-py3.6-clang9",
       "linux-bionic-py3.8-gcc9-coverage",
       "linux-xenial-cuda10.2-py3.6-gcc7",
       "linux-xenial-cuda11.3-py3.6-gcc7",
@@ -24,6 +25,7 @@
       "linux-bionic-py3.8-gcc9-coverage"
     ],
     "ciflow/cpu": [
+      "linux-bionic-py3.6-clang9",
       "linux-bionic-py3.8-gcc9-coverage",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
@@ -42,6 +44,7 @@
       "win-vs2019-cuda11.3-py3"
     ],
     "ciflow/default": [
+      "linux-bionic-py3.6-clang9",
       "linux-bionic-py3.8-gcc9-coverage",
       "linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
@@ -58,6 +61,7 @@
       "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
       "libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-bionic-cuda10.2-py3.9-gcc7",
+      "linux-bionic-py3.6-clang9",
       "linux-bionic-py3.8-gcc9-coverage",
       "linux-xenial-cuda10.2-py3.6-gcc7",
       "linux-xenial-cuda11.3-py3.6-gcc7",
@@ -66,6 +70,9 @@
       "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
       "periodic-linux-xenial-cuda11.1-py3.6-gcc7"
     ],
+    "ciflow/noarch": [
+      "linux-bionic-py3.6-clang9"
+    ],
     "ciflow/scheduled": [
       "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
       "periodic-linux-xenial-cuda11.1-py3.6-gcc7",
@@ -80,6 +87,9 @@
       "win-vs2019-cpu-py3",
       "win-vs2019-cuda10.1-py3",
       "win-vs2019-cuda11.3-py3"
+    ],
+    "ciflow/xla": [
+      "linux-bionic-py3.6-clang9"
     ]
   },
   "version": "v1"
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index 467d13d0dc45d..16100f72a527c 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -49,6 +49,8 @@
 LABEL_CIFLOW_SCHEDULED = "ciflow/scheduled"
 LABEL_CIFLOW_SLOW = "ciflow/slow"
 LABEL_CIFLOW_WIN = "ciflow/win"
+LABEL_CIFLOW_XLA = "ciflow/xla"
+LABEL_CIFLOW_NOARCH = "ciflow/noarch"
 
 
 @dataclass
@@ -150,6 +152,8 @@ class CIWorkflow:
     enable_slow_test: YamlShellBool = "''"
     enable_docs_test: YamlShellBool = "''"
     enable_backwards_compat_test: YamlShellBool = "''"
+    enable_xla_test: YamlShellBool = "''"
+    enable_noarch_test: YamlShellBool = "''"
 
     def __post_init__(self) -> None:
         if self.is_libtorch:
@@ -409,24 +413,6 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
             labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_CUDA},
         ),
     ),
-    # CIWorkflow(
-    #     arch="linux",
-    #     build_environment="linux-bionic-py3.6-clang9-noarch",
-    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
-    #     test_runner_type=LINUX_CPU_TEST_RUNNER,
-    # ),
-    # CIWorkflow(
-    #     arch="linux",
-    #     build_environment="xla-linux-bionic-py3.6-clang9",
-    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
-    #     test_runner_type=LINUX_CPU_TEST_RUNNER,
-    # ),
-    # CIWorkflow(
-    #     arch="linux",
-    #     build_environment="vulkan-linux-bionic-py3.6-clang9",
-    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
-    #     test_runner_type=LINUX_CPU_TEST_RUNNER,
-    # ),
     CIWorkflow(
         arch="linux",
         build_environment="linux-bionic-py3.8-gcc9-coverage",
@@ -440,6 +426,20 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
             labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_COVERAGE, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU},
         ),
     ),
+    CIWorkflow(
+        arch="linux",
+        build_environment="linux-bionic-py3.6-clang9",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
+        test_runner_type=LINUX_CPU_TEST_RUNNER,
+        on_pull_request=True,
+        num_test_shards=2,
+        distributed_test=False,
+        enable_noarch_test=1,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_XLA, LABEL_CIFLOW_NOARCH},
+        ),
+    ),
     # CIWorkflow(
     #     arch="linux",
     #     build_environment="linux-bionic-rocm3.9-py3.6",
diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py
index cb71f588ece5e..beb1b9d90e62f 100755
--- a/.github/scripts/generate_pytorch_test_matrix.py
+++ b/.github/scripts/generate_pytorch_test_matrix.py
@@ -59,6 +59,10 @@ def main() -> None:
         configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
     if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'):
         configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
+    if os.getenv('ENABLE_XLA_TEST'):
+        configs['xla'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
+    if os.getenv('ENABLE_NOARCH_TEST'):
+        configs['noarch'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
     matrix = {
         'include': [
             {
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 8aa854782de52..314122b699c86 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -35,6 +35,7 @@ env:
   BUILD_ENVIRONMENT: !{{ build_environment }}
   DOCKER_IMAGE_BASE: !{{ docker_image_base }}
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
@@ -137,6 +138,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
@@ -214,6 +216,8 @@ jobs:
       ENABLE_SLOW_TEST: !{{ enable_slow_test }}
       ENABLE_DOCS_TEST: !{{ enable_docs_test }}
       ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }}
+      ENABLE_XLA_TEST: !{{ enable_xla_test }}
+      ENABLE_NOARCH_TEST: !{{ enable_noarch_test }}
       NUM_TEST_SHARDS: !{{ num_test_shards }}
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -313,6 +317,7 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 396284cf72f84..d2111896319df 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -16,6 +16,7 @@ env:
   BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda10.2-py3.6-gcc7
   DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
@@ -192,6 +193,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
index cb0a98591956b..51811ae9eca58 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -16,6 +16,7 @@ env:
   BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda11.3-py3.6-gcc7
   DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
@@ -192,6 +193,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index ddb38b7c84d5a..f410f1fc0be20 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -16,6 +16,7 @@ env:
   BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7
   DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
@@ -192,6 +193,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
@@ -282,6 +284,8 @@ jobs:
       ENABLE_SLOW_TEST: ''
       ENABLE_DOCS_TEST: ''
       ENABLE_BACKWARDS_COMPAT_TEST: ''
+      ENABLE_XLA_TEST: ''
+      ENABLE_NOARCH_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -420,6 +424,7 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
new file mode 100644
index 0000000000000..e54555d12cf62
--- /dev/null
+++ b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
@@ -0,0 +1,506 @@
+# @generated DO NOT EDIT MANUALLY
+# Template is at:    .github/templates/linux_ci_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: linux-bionic-py3.6-clang9
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, unassigned]
+  push:
+    branches:
+      - master
+      - release/*
+  workflow_dispatch:
+
+env:
+  BUILD_ENVIRONMENT: linux-bionic-py3.6-clang9
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9
+  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
+  TORCH_CUDA_ARCH_LIST: 5.2
+  IN_CI: 1
+  # This is used for the phase of adding wheel tests only, will be removed once completed
+  IN_WHEEL_TEST: 1
+  # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
+  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
+
+concurrency:
+  group: linux-bionic-py3.6-clang9-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla')) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
+  calculate-docker-image:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: linux.2xlarge
+    needs: [ciflow_should_run]
+    env:
+      DOCKER_BUILDKIT: 1
+    timeout-minutes: 90
+    outputs:
+      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: false
+      - name: Calculate docker image tag
+        id: calculate-tag
+        run: |
+          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+          echo "::set-output name=docker_tag::${DOCKER_TAG}"
+          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
+      - name: Check if image should be built
+        id: check
+        env:
+          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
+          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
+        run: |
+          set -x
+          # Check if image already exists, if it does then skip building it
+          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
+            exit 0
+          fi
+          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
+            # if we're on the base branch then use the parent commit
+            MERGE_BASE=$(git rev-parse HEAD~)
+          else
+            # otherwise we're on a PR, so use the most recent base commit
+            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
+          fi
+          # Covers the case where a previous tag doesn't exist for the tree
+          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
+          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
+            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
+            exit 1
+          fi
+          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
+          # If no image exists but the hash is the same as the previous hash then we should error out here
+          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
+            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
+            echo "       contact the PyTorch team to restore the original images"
+            exit 1
+          fi
+          echo ::set-output name=rebuild::yes
+      - name: Build and push docker image
+        if: ${{ steps.check.outputs.rebuild }}
+        env:
+          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
+          DOCKER_SKIP_S3_UPLOAD: 1
+        run: |
+          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
+          cd .circleci/docker && ./build_docker.sh
+
+  build:
+    runs-on: linux.2xlarge
+    needs: [calculate-docker-image, ciflow_should_run]
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      JOB_BASE_NAME: linux-bionic-py3.6-clang9-build
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Build PyTorch
+        run: |
+          docker run \
+            -e BUILD_ENVIRONMENT \
+            -e JOB_BASE_NAME \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e SKIP_SCCACHE_INITIALIZATION=1 \
+            -e TORCH_CUDA_ARCH_LIST \
+            -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload binary build size statistics (Click Me)
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
+          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
+          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
+          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        run: |
+          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
+          export COMMIT_TIME
+          pip3 install requests
+          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Archive artifacts into zip
+        run: |
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store PyTorch Build Artifacts on S3
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            artifacts.zip
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Prune all of the docker images
+          docker system prune -af
+
+  generate-test-matrix:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: ubuntu-18.04
+    needs: [ciflow_should_run]
+    env:
+      TEST_RUNNER_TYPE: linux.2xlarge
+      ENABLE_DISTRIBUTED_TEST: ''
+      ENABLE_JIT_LEGACY_TEST: ''
+      ENABLE_MULTIGPU_TEST: ''
+      ENABLE_NOGPU_NO_AVX_TEST: ''
+      ENABLE_NOGPU_NO_AVX2_TEST: ''
+      ENABLE_SLOW_TEST: ''
+      ENABLE_DOCS_TEST: ''
+      ENABLE_BACKWARDS_COMPAT_TEST: ''
+      ENABLE_XLA_TEST: ''
+      ENABLE_NOARCH_TEST: 1
+      NUM_TEST_SHARDS: 2
+      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
+      NOGPU_RUNNER_TYPE: linux.2xlarge
+      PR_BODY: ${{ github.event.pull_request.body }}
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
+      ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
+    container:
+      image: python:3.9
+    steps:
+      - name: Install dependencies
+        run: pip install typing-extensions
+      - name: Clone pytorch/pytorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+      - name: Generating test matrix
+        id: set-matrix
+        run: .github/scripts/generate_pytorch_test_matrix.py
+
+  test:
+    needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run]
+    strategy:
+      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      JOB_BASE_NAME: linux-bionic-py3.6-clang9-test
+      TEST_CONFIG: ${{ matrix.config }}
+      SHARD_NUMBER: ${{ matrix.shard }}
+      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+      PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
+        run: |
+          bash .github/scripts/install_nvidia_utils_linux.sh
+          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          sudo df -H
+      - name: Test PyTorch
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        run: |
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          if [[ $NUM_TEST_SHARDS -ne 2 ]]; then
+            export SHARD_NUMBER=0
+          fi
+          # TODO: Stop building test binaries as part of the build phase
+          # Used for GPU_FLAG since that doesn't play nice
+          # shellcheck disable=SC2086
+          docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e PR_LABELS \
+            -e CONTINUE_THROUGH_ERROR \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml'
+      - uses: actions/upload-artifact@v2
+        name: Store PyTorch Test Reports
+        if: always()
+        with:
+          name: test-reports-${{ matrix.config }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store PyTorch Test Reports on S3
+        if: always()
+        with:
+          name: test-reports-${{ matrix.config }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload test statistics (Click Me)
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: linux-bionic-py3.6-clang9-test
+          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
+          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
+          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index c4cdd2cd636d5..2bcb4261816de 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -16,6 +16,7 @@ env:
   BUILD_ENVIRONMENT: linux-bionic-py3.8-gcc9-coverage
   DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.8-gcc9
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
@@ -192,6 +193,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
@@ -282,6 +284,8 @@ jobs:
       ENABLE_SLOW_TEST: ''
       ENABLE_DOCS_TEST: ''
       ENABLE_BACKWARDS_COMPAT_TEST: ''
+      ENABLE_XLA_TEST: ''
+      ENABLE_NOARCH_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -420,6 +424,7 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 2d0d916237676..656b56548b26b 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -16,6 +16,7 @@ env:
   BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7
   DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
@@ -192,6 +193,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
@@ -282,6 +284,8 @@ jobs:
       ENABLE_SLOW_TEST: 1
       ENABLE_DOCS_TEST: ''
       ENABLE_BACKWARDS_COMPAT_TEST: ''
+      ENABLE_XLA_TEST: ''
+      ENABLE_NOARCH_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -420,6 +424,7 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 0e1c7ba6ca6c5..68b42858c98e3 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -16,6 +16,7 @@ env:
   BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7
   DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
@@ -192,6 +193,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
@@ -282,6 +284,8 @@ jobs:
       ENABLE_SLOW_TEST: ''
       ENABLE_DOCS_TEST: ''
       ENABLE_BACKWARDS_COMPAT_TEST: ''
+      ENABLE_XLA_TEST: ''
+      ENABLE_NOARCH_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -420,6 +424,7 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 6f4e5c2958904..b347b52be9509 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -16,6 +16,7 @@ env:
   BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc5.4
   DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
@@ -192,6 +193,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
@@ -282,6 +284,8 @@ jobs:
       ENABLE_SLOW_TEST: ''
       ENABLE_DOCS_TEST: 1
       ENABLE_BACKWARDS_COMPAT_TEST: 1
+      ENABLE_XLA_TEST: ''
+      ENABLE_NOARCH_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -420,6 +424,7 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index 2d72fab281f6e..e86cc563a77f8 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -16,6 +16,7 @@ env:
   BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7-bazel-test
   DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
index b009d77b3a8d8..b591519e3219c 100644
--- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -14,6 +14,7 @@ env:
   BUILD_ENVIRONMENT: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7
   DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
@@ -190,6 +191,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 738e6bb146cdd..07593ad07a941 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -14,6 +14,7 @@ env:
   BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7
   DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7
   SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
   TORCH_CUDA_ARCH_LIST: 5.2
   IN_CI: 1
   # This is used for the phase of adding wheel tests only, will be removed once completed
@@ -190,6 +191,7 @@ jobs:
             -e JOB_BASE_NAME \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
             -e SKIP_SCCACHE_INITIALIZATION=1 \
             -e TORCH_CUDA_ARCH_LIST \
@@ -280,6 +282,8 @@ jobs:
       ENABLE_SLOW_TEST: ''
       ENABLE_DOCS_TEST: ''
       ENABLE_BACKWARDS_COMPAT_TEST: ''
+      ENABLE_XLA_TEST: ''
+      ENABLE_NOARCH_TEST: ''
       NUM_TEST_SHARDS: 2
       MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
       NOGPU_RUNNER_TYPE: linux.2xlarge
@@ -418,6 +422,7 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --security-opt seccomp=unconfined \
             --cap-add=SYS_PTRACE \

From 795387477fe90e03cb598f3077a32222896e65dd Mon Sep 17 00:00:00 2001
From: James Reed <jamesreed@fb.com>
Date: Thu, 2 Sep 2021 15:15:24 -0700
Subject: [PATCH 489/530] [FX] Prototype for guarding against mutable
 operations in tracing (#64295)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64295

Test Plan: Imported from OSS

Reviewed By: zou3519

Differential Revision: D30675780

Pulled By: jamesr66a

fbshipit-source-id: b2116b51dcc87357f0c84192c4c336680875e27a
---
 ..._compat-fx_backcompat_class_members.expect |  2 +-
 test/test_fx.py                               | 67 +++++++++++++++++--
 torch/csrc/jit/python/init.cpp                | 14 ++--
 torch/fx/operator_schemas.py                  | 44 ++++++++++--
 torch/fx/proxy.py                             |  7 ++
 5 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
index 88e4654b568df..5c3630a3169f7 100644
--- a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
+++ b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
@@ -15,5 +15,5 @@ torch.fx.proxy.Attribute ['node']
 torch.fx.proxy.GraphAppendingTracer []
 torch.fx.proxy.Proxy ['keys']
 torch.fx.proxy.TraceError []
-torch.fx.proxy.TracerBase ['create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool']
+torch.fx.proxy.TracerBase ['check_mutable_operations', 'create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool']
 torch.fx.subgraph_rewriter.Match ['anchor', 'nodes_map']
\ No newline at end of file
diff --git a/test/test_fx.py b/test/test_fx.py
index 5220f67ebf309..57a2960a409c3 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -132,10 +132,17 @@ def __init__(self, a, b):
 
 class TestFX(JitTestCase):
     def setUp(self):
-        if TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS:
-            return
-        lib_file_path = find_library_location('libtorchbind_test.so')
-        torch.ops.load_library(str(lib_file_path))
+        # Checking for mutable operations whil tracing is feature flagged
+        # Enable it in testing but not by default
+        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
+        torch.fx.proxy.TracerBase.check_mutable_operations = True
+
+        if not (TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS):
+            lib_file_path = find_library_location('libtorchbind_test.so')
+            torch.ops.load_library(str(lib_file_path))
+
+    def tearDown(self):
+        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
 
     def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None):
         """Check that an nn.Module's results match the GraphModule version
@@ -2367,6 +2374,19 @@ def forward(self, x: torch.Tensor, y: int, z: int):
 
         traced.graph.lint()
 
+    def test_throw_out_variant(self):
+        def foo(x):
+            y = torch.rand_like(x)
+            torch.sigmoid(x, out=y)
+            return y
+
+        class MyTracer(torch.fx.Tracer):
+            check_mutable_operations = True
+
+        tracer = MyTracer()
+        with self.assertRaisesRegex(RuntimeError, 'mutable operation aten::sigmoid.out'):
+            traced_graph = tracer.trace(foo)
+
     def test_ast_rewriter_reassigns_submodules(self):
         class M(torch.nn.Module):
             def __init__(self):
@@ -3021,6 +3041,15 @@ def run_getitem_target():
 
 
 class TestOperatorSignatures(JitTestCase):
+    def setUp(self):
+        # Checking for mutable operations whil tracing is feature flagged
+        # Enable it in testing but not by default
+        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
+        torch.fx.proxy.TracerBase.check_mutable_operations = True
+
+    def tearDown(self):
+        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
+
     @onlyCPU
     @ops(op_db, allowed_dtypes=(torch.float,))
     def test_get_torch_func_signature_exhaustive(self, device, dtype, op):
@@ -3090,6 +3119,15 @@ class TestFXAPIBackwardCompatibility(JitTestCase):
     def setUp(self):
         self.maxDiff = None
 
+        # Checking for mutable operations whil tracing is feature flagged
+        # Enable it in testing but not by default
+        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
+        torch.fx.proxy.TracerBase.check_mutable_operations = True
+
+    def tearDown(self):
+        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
+
+
     def _fn_to_stable_annotation_str(self, obj):
         """
         Unfortunately we have to serialize function signatures manually since
@@ -3326,6 +3364,15 @@ def check_symbols_have_bc_designation(m, prefix):
                                  f"BC guarantees.")
 
 class TestFunctionalTracing(JitTestCase):
+    def setUp(self):
+        # Checking for mutable operations whil tracing is feature flagged
+        # Enable it in testing but not by default
+        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
+        torch.fx.proxy.TracerBase.check_mutable_operations = True
+
+    def tearDown(self):
+        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
+
     IGNORE_FUNCS = ("has_torch_function", "has_torch_function_unary",
                     "has_torch_function_variadic", "handle_torch_function",
                     "boolean_dispatch")
@@ -3340,6 +3387,7 @@ class TestFunctionalTracing(JitTestCase):
     ARG_TYPE_MISMATCH = (TypeError, r", not Proxy$")
     CONTROL_FLOW = (TraceError, r"symbolically traced variables cannot be used as inputs to control flow")
     INTERPOLATE_ARGS_CONFLICT = (ValueError, r"only one of size or scale_factor should be defined")
+    MUTABLE = (RuntimeError, r"Tried to trace mutable operation")
 
     UNTRACEABLE_FUNCTIONALS = {
         "adaptive_avg_pool1d": BUILT_IN_FUNC,
@@ -3459,6 +3507,8 @@ class TestFunctionalTracing(JitTestCase):
 
         "upsample_bilinear": INTERPOLATE_ARGS_CONFLICT,
         "upsample_nearest": INTERPOLATE_ARGS_CONFLICT,
+
+        "normalize" : MUTABLE,
     }
 
     # List of nn.functionals with Tensor inputs but not with type annotation
@@ -3573,6 +3623,15 @@ def tearDownClass(cls):
 
 @skipIfNoTorchVision
 class TestVisionTracing(JitTestCase):
+    def setUp(self):
+        # Checking for mutable operations whil tracing is feature flagged
+        # Enable it in testing but not by default
+        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
+        torch.fx.proxy.TracerBase.check_mutable_operations = True
+
+    def tearDown(self):
+        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
+
     PROXY_ITERATED = (TraceError, r"Proxy object cannot be iterated")
     INCONSISTENT_TYPE = (
         RuntimeError,
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 7e43e511c786f..35197e4ea1423 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -1280,11 +1280,15 @@ void initJITBindings(PyObject* module) {
           [](const FunctionSchema& self, const FunctionSchema& other) {
             return self == other;
           })
-      .def("__str__", [](FunctionSchema& self) {
-        std::stringstream ss;
-        ss << self;
-        return ss.str();
-      });
+      .def(
+          "__str__",
+          [](FunctionSchema& self) {
+            std::stringstream ss;
+            ss << self;
+            return ss.str();
+          })
+      .def_property_readonly(
+          "is_mutable", [](FunctionSchema& self) { return self.is_mutable(); });
   py::class_<Argument>(m, "Argument")
       .def_property_readonly("name", [](Argument& self) { return self.name(); })
       .def_property_readonly("type", [](Argument& self) { return self.type(); })
diff --git a/torch/fx/operator_schemas.py b/torch/fx/operator_schemas.py
index ac559b19530c7..5e024e8624cca 100644
--- a/torch/fx/operator_schemas.py
+++ b/torch/fx/operator_schemas.py
@@ -79,7 +79,43 @@ def _torchscript_schema_to_signature(ts_schema : torch._C.FunctionSchema) -> ins
     return inspect.Signature(parameters, return_annotation=return_type)
 
 @compatibility(is_backward_compatible=False)
-def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature]]:
+def check_for_mutable_operation(target : Callable, args : Tuple['Argument', ...], kwargs : Dict[str, 'Argument']):
+    signatures, schemas = get_signature_for_torch_op(target, return_schemas=True)
+
+    if signatures and schemas:
+        matched_schemas = []
+
+        # Iterate through all of the schema until we find one that matches
+        # If one matches, populate `new_args_and_kwargs` with the new args/kwargs
+        # values. If none matches, `new_args_and_kwargs` will be None
+        for candidate_signature, schema in zip(signatures, schemas):
+            try:
+                candidate_signature.bind(*args, **kwargs)
+                matched_schemas.append((candidate_signature, schema))
+            except TypeError as e:
+                continue
+
+        def throw_if_mutable(schema):
+            if schema.is_mutable:
+                raise RuntimeError(f'Tried to trace mutable operation {schema}. FX only supports functional '
+                                   f'code, so operations that mutate operands in-place (e.g. via `out` arguments) '
+                                   f'are not supported')
+
+        if len(matched_schemas) == 0:
+            # Did not match any schema. Cannot check for mutation
+            pass
+        elif len(matched_schemas) == 1:
+            # Matched exactly one schema, unambiguous
+            _, schema_to_check = matched_schemas[0]
+            throw_if_mutable(schema_to_check)
+            pass
+        else:
+            # Ambiguous schema match. Since mutability checking is best effort,
+            # do nothing.
+            pass
+
+@compatibility(is_backward_compatible=False)
+def get_signature_for_torch_op(op : Callable, return_schemas : bool = False) -> Optional[List[inspect.Signature]]:
     """
     Given an operator on the `torch` namespace, return a list of `inspect.Signature`
     objects corresponding to the overloads of that op.. May return `None` if a signature
@@ -94,17 +130,17 @@ def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature
     """
     override = _manual_overrides.get(op)
     if override:
-        return override
+        return (override, None) if return_schemas else None
 
     aten_fn = torch.jit._builtins._find_builtin(op)
 
     if aten_fn is None:
-        return None
+        return (None, None) if return_schemas else None
 
     schemas = torch._C._jit_get_schemas_for_operator(aten_fn)
     signatures = [_torchscript_schema_to_signature(schema) for schema in schemas]
 
-    return signatures
+    return (signatures, schemas) if return_schemas else signatures
 
 @compatibility(is_backward_compatible=False)
 def create_type_hint(x):
diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py
index 61b039f8b7219..b25e45d206a51 100644
--- a/torch/fx/proxy.py
+++ b/torch/fx/proxy.py
@@ -8,11 +8,15 @@
 from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable
 from .node import Target, Node, Argument, base_types, map_aggregate
 from ._compatibility import compatibility
+from .operator_schemas import check_for_mutable_operation
 
 @compatibility(is_backward_compatible=True)
 class TracerBase:
     graph: Graph
     record_stack_traces : bool = False
+    # Feature flag for mutable schema checking
+    # Enableby default in 1.12
+    check_mutable_operations : bool = False
 
     @compatibility(is_backward_compatible=True)
     def create_node(self, kind : str, target : Target,
@@ -25,6 +29,9 @@ def create_node(self, kind : str, target : Target,
         modification of values used in node creation. For example, one might
         want to disallow in-place operations from being recorded.
         """
+        if kind == 'call_function' and self.check_mutable_operations:
+            check_for_mutable_operation(target, args, kwargs)
+
         return self.graph.create_node(kind, target, args, kwargs, name, type_expr)
 
     @compatibility(is_backward_compatible=True)

From 116142143cc2d66c7e582d9f96e00862456fd736 Mon Sep 17 00:00:00 2001
From: Zafar Takhirov <zaf@fb.com>
Date: Thu, 2 Sep 2021 15:56:54 -0700
Subject: [PATCH 490/530] [quant] Enable jit tracing on quantizable LSTM
 (#64438)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64438

The quantizable LSTM didn't support jit tracing because it had several non taceable paths. We sacrifice some of the user experience to enable the tracing.

The main UX feature removed is a user-friendly message when trying to access the backwards path in a bidirectional LSTM: When the bidirectional flag is `False`, we used to throw a nice error message when the user tried accessing backwards weights. Now the message is default (removed properties).

Test Plan: `buck test mode/dev //caffe2/test:quantization -- test_custom_module_lstm`

Reviewed By: mtl67

Differential Revision: D30732630

fbshipit-source-id: 443e351ebb0e2b636c86dea9691b9bf42ffe618f
---
 test/quantization/core/test_quantized_op.py |  7 +++
 torch/nn/quantizable/modules/rnn.py         | 59 ++++-----------------
 2 files changed, 17 insertions(+), 49 deletions(-)

diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 49b7c96847612..6275174d8e43a 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -2476,6 +2476,13 @@ def test_custom_module_lstm(self):
                         msg=(f"Error is too high: SNR(dB): {power}, "
                              f"Signal: {signal}, MSE: {mse}"))
 
+                # Trace
+                jit_qmodule = torch.jit.trace(lstm_quantized, qx)
+
+                # Script
+                # TODO: Fix the scripting in the torch/nn/quantizable/modules/rnn.py
+                # jit_qmodule = torch.jit.script(lstm_quantized)
+
     @override_qengines
     def test_custom_module_multi_head_attention(self):
         class MultiheadAttentionModel(torch.nn.Module):
diff --git a/torch/nn/quantizable/modules/rnn.py b/torch/nn/quantizable/modules/rnn.py
index bdfd7788533b5..cd0d094d086a7 100644
--- a/torch/nn/quantizable/modules/rnn.py
+++ b/torch/nn/quantizable/modules/rnn.py
@@ -48,7 +48,7 @@ def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True,
         self.ogate_cy = torch.nn.quantized.FloatFunctional()
 
     def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]:
-        if hidden is None or hidden == (None, None):
+        if hidden is None or hidden[0] is None or hidden[1] is None:
             hidden = self.initialize_hidden(x.shape[0], x.is_quantized)
         hx, cx = hidden
 
@@ -175,10 +175,13 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None):
                 cx_bw = cx_fw[1]
                 cx_fw = cx_fw[0]
             hidden_bw = hx_bw, cx_bw
-        hidden_fw = hx_fw, cx_fw
+        if hx_fw is None and cx_fw is None:
+            hidden_fw = None
+        else:
+            hidden_fw = torch.jit._unwrap_optional(hx_fw), torch.jit._unwrap_optional(cx_fw)
         result_fw, hidden_fw = self.layer_fw(x, hidden_fw)
 
-        if self.bidirectional:
+        if hasattr(self, 'layer_bw') and self.bidirectional:
             x_reversed = x.flip(0)
             result_bw, hidden_bw = self.layer_bw(x_reversed, hidden_bw)
             result_bw = result_bw.flip(0)
@@ -188,7 +191,7 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None):
             c = torch.stack([hidden_fw[1], hidden_bw[1]], 0)  # type: ignore[list-item]
         else:
             result = result_fw
-            h, c = hidden_fw  # type: ignore[assignment]
+            h, c = torch.jit._unwrap_optional(hidden_fw)  # type: ignore[assignment]
 
         if self.batch_first:
             result.transpose_(0, 1)
@@ -227,46 +230,6 @@ def from_float(cls, other, layer_idx=0, qconfig=None, **kwargs):
             layer.layer_bw = _LSTMSingleLayer.from_params(wi, wh, bi, bh)
         return layer
 
-    # Getters for the weights and biases
-    # Note that jit currently doesn't support the `porperty`, so if you need to
-    # access the weights/biases you would need to navigate manually to the
-    # `layer_fw.cell.igates.*`: https://github.com/pytorch/pytorch/issues/37883
-    @property
-    def weight_ih(self):
-        return self.layer_fw.cell.igates.weight
-
-    @property
-    def weight_hh(self):
-        return self.layer_fw.cell.hgates.weight
-
-    @property
-    def bias_ih(self):
-        return self.layer_fw.cell.igates.bias
-
-    @property
-    def bias_hh(self):
-        return self.layer_fw.cell.hgates.bias
-
-    @property
-    def weight_ih_reverse(self):
-        assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer'
-        return self.layer_bw.cell.igates.weight
-
-    @property
-    def weight_hh_reverse(self):
-        assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer'
-        return self.layer_bw.cell.hgates.weight
-
-    @property
-    def bias_ih_reverse(self):
-        assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer'
-        return self.layer_bw.cell.igates.bias
-
-    @property
-    def bias_hh_reverse(self):
-        assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer'
-        return self.layer_bw.cell.hgates.bias
-
 
 class LSTM(torch.nn.Module):
     r"""A quantizable long short-term memory (LSTM).
@@ -362,14 +325,12 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None):
                 cx = hidden_non_opt[1].reshape(self.num_layers, num_directions,
                                                max_batch_size,
                                                self.hidden_size).unbind(0)
-                hxcx = []
-                for idx in range(self.num_layers):
-                    hxcx.append((hx[idx].squeeze_(0), cx[idx].squeeze_(0)))
+                hxcx = [(hx[idx].squeeze_(0), cx[idx].squeeze_(0)) for idx in range(self.num_layers)]
             else:
                 hxcx = hidden_non_opt
 
-        for idx in range(self.num_layers):
-            x, hxcx[idx] = self.layers[idx](x, hxcx[idx])
+        for idx, layer in enumerate(self.layers):
+            x, hxcx[idx] = layer(x, hxcx[idx])
 
         hx_list = []
         cx_list = []

From 32a93c2424c7c165a3f52a6dc8cee83aab4d7b63 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Thu, 2 Sep 2021 16:06:17 -0700
Subject: [PATCH 491/530] Revert D30675780: [FX] Prototype for guarding against
 mutable operations in tracing

Test Plan: revert-hammer

Differential Revision:
D30675780 (https://github.com/pytorch/pytorch/commit/795387477fe90e03cb598f3077a32222896e65dd)

Original commit changeset: b2116b51dcc8

fbshipit-source-id: d4f1173f4989556ea54974f4c2739ef85a705fae
---
 ..._compat-fx_backcompat_class_members.expect |  2 +-
 test/test_fx.py                               | 67 ++-----------------
 torch/csrc/jit/python/init.cpp                | 14 ++--
 torch/fx/operator_schemas.py                  | 44 ++----------
 torch/fx/proxy.py                             |  7 --
 5 files changed, 14 insertions(+), 120 deletions(-)

diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
index 5c3630a3169f7..88e4654b568df 100644
--- a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
+++ b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
@@ -15,5 +15,5 @@ torch.fx.proxy.Attribute ['node']
 torch.fx.proxy.GraphAppendingTracer []
 torch.fx.proxy.Proxy ['keys']
 torch.fx.proxy.TraceError []
-torch.fx.proxy.TracerBase ['check_mutable_operations', 'create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool']
+torch.fx.proxy.TracerBase ['create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool']
 torch.fx.subgraph_rewriter.Match ['anchor', 'nodes_map']
\ No newline at end of file
diff --git a/test/test_fx.py b/test/test_fx.py
index 57a2960a409c3..5220f67ebf309 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -132,17 +132,10 @@ def __init__(self, a, b):
 
 class TestFX(JitTestCase):
     def setUp(self):
-        # Checking for mutable operations whil tracing is feature flagged
-        # Enable it in testing but not by default
-        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
-        torch.fx.proxy.TracerBase.check_mutable_operations = True
-
-        if not (TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS):
-            lib_file_path = find_library_location('libtorchbind_test.so')
-            torch.ops.load_library(str(lib_file_path))
-
-    def tearDown(self):
-        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
+        if TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS:
+            return
+        lib_file_path = find_library_location('libtorchbind_test.so')
+        torch.ops.load_library(str(lib_file_path))
 
     def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None):
         """Check that an nn.Module's results match the GraphModule version
@@ -2374,19 +2367,6 @@ def forward(self, x: torch.Tensor, y: int, z: int):
 
         traced.graph.lint()
 
-    def test_throw_out_variant(self):
-        def foo(x):
-            y = torch.rand_like(x)
-            torch.sigmoid(x, out=y)
-            return y
-
-        class MyTracer(torch.fx.Tracer):
-            check_mutable_operations = True
-
-        tracer = MyTracer()
-        with self.assertRaisesRegex(RuntimeError, 'mutable operation aten::sigmoid.out'):
-            traced_graph = tracer.trace(foo)
-
     def test_ast_rewriter_reassigns_submodules(self):
         class M(torch.nn.Module):
             def __init__(self):
@@ -3041,15 +3021,6 @@ def run_getitem_target():
 
 
 class TestOperatorSignatures(JitTestCase):
-    def setUp(self):
-        # Checking for mutable operations whil tracing is feature flagged
-        # Enable it in testing but not by default
-        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
-        torch.fx.proxy.TracerBase.check_mutable_operations = True
-
-    def tearDown(self):
-        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
-
     @onlyCPU
     @ops(op_db, allowed_dtypes=(torch.float,))
     def test_get_torch_func_signature_exhaustive(self, device, dtype, op):
@@ -3119,15 +3090,6 @@ class TestFXAPIBackwardCompatibility(JitTestCase):
     def setUp(self):
         self.maxDiff = None
 
-        # Checking for mutable operations whil tracing is feature flagged
-        # Enable it in testing but not by default
-        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
-        torch.fx.proxy.TracerBase.check_mutable_operations = True
-
-    def tearDown(self):
-        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
-
-
     def _fn_to_stable_annotation_str(self, obj):
         """
         Unfortunately we have to serialize function signatures manually since
@@ -3364,15 +3326,6 @@ def check_symbols_have_bc_designation(m, prefix):
                                  f"BC guarantees.")
 
 class TestFunctionalTracing(JitTestCase):
-    def setUp(self):
-        # Checking for mutable operations whil tracing is feature flagged
-        # Enable it in testing but not by default
-        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
-        torch.fx.proxy.TracerBase.check_mutable_operations = True
-
-    def tearDown(self):
-        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
-
     IGNORE_FUNCS = ("has_torch_function", "has_torch_function_unary",
                     "has_torch_function_variadic", "handle_torch_function",
                     "boolean_dispatch")
@@ -3387,7 +3340,6 @@ def tearDown(self):
     ARG_TYPE_MISMATCH = (TypeError, r", not Proxy$")
     CONTROL_FLOW = (TraceError, r"symbolically traced variables cannot be used as inputs to control flow")
     INTERPOLATE_ARGS_CONFLICT = (ValueError, r"only one of size or scale_factor should be defined")
-    MUTABLE = (RuntimeError, r"Tried to trace mutable operation")
 
     UNTRACEABLE_FUNCTIONALS = {
         "adaptive_avg_pool1d": BUILT_IN_FUNC,
@@ -3507,8 +3459,6 @@ def tearDown(self):
 
         "upsample_bilinear": INTERPOLATE_ARGS_CONFLICT,
         "upsample_nearest": INTERPOLATE_ARGS_CONFLICT,
-
-        "normalize" : MUTABLE,
     }
 
     # List of nn.functionals with Tensor inputs but not with type annotation
@@ -3623,15 +3573,6 @@ def tearDownClass(cls):
 
 @skipIfNoTorchVision
 class TestVisionTracing(JitTestCase):
-    def setUp(self):
-        # Checking for mutable operations whil tracing is feature flagged
-        # Enable it in testing but not by default
-        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
-        torch.fx.proxy.TracerBase.check_mutable_operations = True
-
-    def tearDown(self):
-        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
-
     PROXY_ITERATED = (TraceError, r"Proxy object cannot be iterated")
     INCONSISTENT_TYPE = (
         RuntimeError,
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 35197e4ea1423..7e43e511c786f 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -1280,15 +1280,11 @@ void initJITBindings(PyObject* module) {
           [](const FunctionSchema& self, const FunctionSchema& other) {
             return self == other;
           })
-      .def(
-          "__str__",
-          [](FunctionSchema& self) {
-            std::stringstream ss;
-            ss << self;
-            return ss.str();
-          })
-      .def_property_readonly(
-          "is_mutable", [](FunctionSchema& self) { return self.is_mutable(); });
+      .def("__str__", [](FunctionSchema& self) {
+        std::stringstream ss;
+        ss << self;
+        return ss.str();
+      });
   py::class_<Argument>(m, "Argument")
       .def_property_readonly("name", [](Argument& self) { return self.name(); })
       .def_property_readonly("type", [](Argument& self) { return self.type(); })
diff --git a/torch/fx/operator_schemas.py b/torch/fx/operator_schemas.py
index 5e024e8624cca..ac559b19530c7 100644
--- a/torch/fx/operator_schemas.py
+++ b/torch/fx/operator_schemas.py
@@ -79,43 +79,7 @@ def _torchscript_schema_to_signature(ts_schema : torch._C.FunctionSchema) -> ins
     return inspect.Signature(parameters, return_annotation=return_type)
 
 @compatibility(is_backward_compatible=False)
-def check_for_mutable_operation(target : Callable, args : Tuple['Argument', ...], kwargs : Dict[str, 'Argument']):
-    signatures, schemas = get_signature_for_torch_op(target, return_schemas=True)
-
-    if signatures and schemas:
-        matched_schemas = []
-
-        # Iterate through all of the schema until we find one that matches
-        # If one matches, populate `new_args_and_kwargs` with the new args/kwargs
-        # values. If none matches, `new_args_and_kwargs` will be None
-        for candidate_signature, schema in zip(signatures, schemas):
-            try:
-                candidate_signature.bind(*args, **kwargs)
-                matched_schemas.append((candidate_signature, schema))
-            except TypeError as e:
-                continue
-
-        def throw_if_mutable(schema):
-            if schema.is_mutable:
-                raise RuntimeError(f'Tried to trace mutable operation {schema}. FX only supports functional '
-                                   f'code, so operations that mutate operands in-place (e.g. via `out` arguments) '
-                                   f'are not supported')
-
-        if len(matched_schemas) == 0:
-            # Did not match any schema. Cannot check for mutation
-            pass
-        elif len(matched_schemas) == 1:
-            # Matched exactly one schema, unambiguous
-            _, schema_to_check = matched_schemas[0]
-            throw_if_mutable(schema_to_check)
-            pass
-        else:
-            # Ambiguous schema match. Since mutability checking is best effort,
-            # do nothing.
-            pass
-
-@compatibility(is_backward_compatible=False)
-def get_signature_for_torch_op(op : Callable, return_schemas : bool = False) -> Optional[List[inspect.Signature]]:
+def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature]]:
     """
     Given an operator on the `torch` namespace, return a list of `inspect.Signature`
     objects corresponding to the overloads of that op.. May return `None` if a signature
@@ -130,17 +94,17 @@ def get_signature_for_torch_op(op : Callable, return_schemas : bool = False) ->
     """
     override = _manual_overrides.get(op)
     if override:
-        return (override, None) if return_schemas else None
+        return override
 
     aten_fn = torch.jit._builtins._find_builtin(op)
 
     if aten_fn is None:
-        return (None, None) if return_schemas else None
+        return None
 
     schemas = torch._C._jit_get_schemas_for_operator(aten_fn)
     signatures = [_torchscript_schema_to_signature(schema) for schema in schemas]
 
-    return (signatures, schemas) if return_schemas else signatures
+    return signatures
 
 @compatibility(is_backward_compatible=False)
 def create_type_hint(x):
diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py
index b25e45d206a51..61b039f8b7219 100644
--- a/torch/fx/proxy.py
+++ b/torch/fx/proxy.py
@@ -8,15 +8,11 @@
 from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable
 from .node import Target, Node, Argument, base_types, map_aggregate
 from ._compatibility import compatibility
-from .operator_schemas import check_for_mutable_operation
 
 @compatibility(is_backward_compatible=True)
 class TracerBase:
     graph: Graph
     record_stack_traces : bool = False
-    # Feature flag for mutable schema checking
-    # Enableby default in 1.12
-    check_mutable_operations : bool = False
 
     @compatibility(is_backward_compatible=True)
     def create_node(self, kind : str, target : Target,
@@ -29,9 +25,6 @@ def create_node(self, kind : str, target : Target,
         modification of values used in node creation. For example, one might
         want to disallow in-place operations from being recorded.
         """
-        if kind == 'call_function' and self.check_mutable_operations:
-            check_for_mutable_operation(target, args, kwargs)
-
         return self.graph.create_node(kind, target, args, kwargs, name, type_expr)
 
     @compatibility(is_backward_compatible=True)

From b12f34e8c2ae0a183abc48e65815480bf4c44fbe Mon Sep 17 00:00:00 2001
From: Garrett Cramer <gcramer@fb.com>
Date: Thu, 2 Sep 2021 16:11:10 -0700
Subject: [PATCH 492/530] update rpc tensorpipe logic for sparse tensors
 (#62960)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/62960

A bug was filed a few years ago for sending sparse tensor over rpc #30807.

This pr updates rpc/tensorpipe logic for CUDA sparse tensors. During the serialization process, the pickler.cpp implementation breaks down the sparse tensor into two tensors and metadata. torch/csrc/distributed/rpc/tensorpipe_agent.cpp needs to be updated because it does not have logic sparse tensors. It pushes a single device for a sparse tensor. This is wrong because after the sparse tensor has been serialized, there will be two tensors. The second tensor will not have a device. This will cause the second tensor to have the wrong target device. tensorpipe_utils.cpp needs to be updated because deserialization happens after the data is received on the target pipe. This takes the two tensors and metadata sent and rebuilds the sparse tensor. There will be two tpDescriptors but only one tensor after deserialization. The logic is updated to verify the sparse tensor is on the correct device using the first tpDescriptor.

This pr also updates ivalue.cpp and ivalue.h to support more paths for Sparse COO tensors.

I tested these changes by adding sparse tests to rpc_test.py and dist_autograd_test.py.

Test Plan: Imported from OSS

Reviewed By: gchanan

Differential Revision: D30717285

Pulled By: gcramer23

fbshipit-source-id: daee9a56764550f56b131f9dd8e74e23113d6714
---
 aten/src/ATen/core/ivalue.cpp                 |  41 ++-
 aten/src/ATen/core/ivalue.h                   |   9 +-
 .../csrc/distributed/rpc/tensorpipe_agent.cpp |   9 +-
 .../csrc/distributed/rpc/tensorpipe_utils.cpp |  10 +-
 .../distributed/rpc/dist_autograd_test.py     | 312 ++++++++++--------
 .../_internal/distributed/rpc/rpc_test.py     |  98 ++++--
 6 files changed, 273 insertions(+), 206 deletions(-)

diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp
index 1404e01fa2434..b81c50f063b19 100644
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@@ -946,36 +946,25 @@ getClassConverter() {
 }
 
 // Needs to be in this .cpp file to access the full definition of PyObjectHolder
-std::vector<c10::weak_intrusive_ptr<c10::StorageImpl>> ivalue::Future::extractStorages(
-    const at::IValue& value) {
+std::vector<c10::weak_intrusive_ptr<c10::StorageImpl>> ivalue::Future::
+    extractStorages(const at::IValue& value) {
   std::vector<c10::weak_intrusive_ptr<c10::StorageImpl>> weakStorageImpls;
   // getSubValues works poorly on Python objects: it only works if they can be
   // converted to a "regular" IValue type hence, for example, it doesn't support
   // custom subclasses. Thus, instead, we extract the tensors through pickling.
+  // Sparse tensors do not have storage. Instead, a sparse tensor
+  // contains two tensors indices and values, and both contain storage.
   if (value.isPyObject()) {
     std::vector<at::Tensor> tensors =
         value.toPyObjectHolder()->extractTensors();
-    size_t num_storages = 0;
-    for (const at::Tensor& tensor : tensors) {
+    weakStorageImpls.reserve(2 * tensors.size());
+    for (const auto& tensor : tensors) {
       if (tensor.is_sparse()) {
-        // Sparse tensor is indices and values. Both are tensors
-        // and contain storage. Therefore num_storages needs to be
-        // incremented by 2.
-        num_storages += 2;
+        weakStorageImpls.push_back(
+            tensor._indices().storage().getWeakStorageImpl());
+        weakStorageImpls.push_back(
+            tensor._values().storage().getWeakStorageImpl());
       } else {
-        // A dense/strided tensor contains 1 storage.
-        num_storages += 1;
-      }
-    }
-    weakStorageImpls.reserve(num_storages);
-    for (const at::Tensor& tensor : tensors) {
-      if (tensor.is_sparse()) {
-        // Sparse tensor is indices and values. Both are tensors
-        // and contain storage.
-        weakStorageImpls.push_back(tensor.indices().storage().getWeakStorageImpl());
-        weakStorageImpls.push_back(tensor.values().storage().getWeakStorageImpl());
-      } else {
-        // A dense/strided tensor contains 1 storage
         weakStorageImpls.push_back(tensor.storage().getWeakStorageImpl());
       }
     }
@@ -986,7 +975,15 @@ std::vector<c10::weak_intrusive_ptr<c10::StorageImpl>> ivalue::Future::extractSt
     value.getSubValues(sub_values);
     for (const at::IValue& sub_value : sub_values) {
       if (sub_value.isTensor()) {
-        weakStorageImpls.push_back(sub_value.toTensor().storage().getWeakStorageImpl());
+        auto& tensor = sub_value.toTensor();
+        if (tensor.is_sparse()) {
+          weakStorageImpls.push_back(
+              tensor._indices().storage().getWeakStorageImpl());
+          weakStorageImpls.push_back(
+              tensor._values().storage().getWeakStorageImpl());
+        } else {
+          weakStorageImpls.push_back(tensor.storage().getWeakStorageImpl());
+        }
       }
     }
   }
diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h
index 188a619307185..6574187d06f8b 100644
--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@@ -872,14 +872,17 @@ struct TORCH_API IValue final {
   struct HashAliasedIValue {
     size_t operator()(const IValue& val) const {
       if (val.isTensor()) {
-        if (val.toTensor().is_mkldnn()) {
+        auto& tensor = val.toTensor();
+        if (tensor.is_mkldnn() || tensor.is_sparse()) {
           // MKLDNN tensors dont have storage and dont create views
           // or aliasing so we can just use Tensor pointer, TODO: find way
           // to use mkldnn storage
-          return reinterpret_cast<size_t>(val.toTensor().unsafeGetTensorImpl());
+          // Sparse tensors don't have storage use unsafeGetTensorImpl
+          // instead of using the storage of indices or values.
+          return reinterpret_cast<size_t>(tensor.unsafeGetTensorImpl());
         } else {
           return reinterpret_cast<size_t>(
-              val.toTensor().storage().unsafeGetStorageImpl());
+              tensor.storage().unsafeGetStorageImpl());
         }
       }
       // If it is not a Tensor, then two mutable IValues alias each other only
diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
index 8e7ad18c575f8..3769db054ab45 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -48,7 +48,7 @@ std::vector<c10::Device> getDevicesForTensors(
       "Request device mapping is not available for destination ",
       remoteName);
   std::vector<c10::Device> devices;
-  devices.reserve(tensors.size());
+  devices.reserve(2 * tensors.size());
   bool hasMappedDevice = false;
   for (const auto& t : tensors) {
     if (t.device().is_cpu()) {
@@ -67,7 +67,12 @@ std::vector<c10::Device> getDevicesForTensors(
           " for device ",
           t.device(),
           " but received a tensor on that device.");
-      devices.push_back(deviceIter->second);
+      if (t.is_sparse()) {
+        devices.push_back(deviceIter->second);
+        devices.push_back(deviceIter->second);
+      } else {
+        devices.push_back(deviceIter->second);
+      }
       hasMappedDevice = true;
     }
   }
diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp
index ee66f3108e522..aa21fdf65c0f9 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp
@@ -311,8 +311,9 @@ c10::intrusive_ptr<Message> tensorpipeDeserialize(
     tensors.emplace_back(std::move(t));
   }
 
-  for (const auto i : c10::irange(tpDescriptor.tensors.size())) {
-    auto& tensor = tpDescriptor.tensors[i];
+  size_t tpDescriptorIndex = 0;
+  for (size_t i = 0; i < tensors.size(); i++) {
+    auto& tensor = tpDescriptor.tensors[tpDescriptorIndex];
     if (tensor.targetDevice.has_value() &&
         tensor.targetDevice->type == tensorpipe::kCudaDeviceType) {
       TORCH_INTERNAL_ASSERT(
@@ -326,6 +327,11 @@ c10::intrusive_ptr<Message> tensorpipeDeserialize(
           ", but got it on ",
           tensors[i].device());
     }
+    if (tensors[i].is_sparse()) {
+      tpDescriptorIndex += 2;
+    } else {
+      tpDescriptorIndex += 1;
+    }
   }
 
   return c10::make_intrusive<Message>(
diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
index fba50303068e7..2ba25a591ae0f 100644
--- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
+++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
@@ -78,14 +78,20 @@ def create_tensor():
     return torch.ones((3, 3), requires_grad=True)
 
 
-def build_sparse_tensor(coalesce=False, requires_grad=True, dtype=torch.float32):
+def build_sparse_tensor(coalesce=False, requires_grad=True, dtype=torch.float32, device=None):
     i = [[0, 1, 1], [2, 0, 2]]
     v = [3.2, 4.1, 5.3]
-    tensor = torch.sparse_coo_tensor(i, v, (3, 3), requires_grad=requires_grad, dtype=dtype)
+    tensor = torch.sparse_coo_tensor(i, v, (3, 3), requires_grad=requires_grad, dtype=dtype, device=device)
     if coalesce:
         tensor = tensor.coalesce()
     return tensor
 
+def build_sparse_one_gradient(dtype=torch.float32):
+    i = [[0, 1, 1], [2, 0, 2]]
+    v = [1, 1, 1]
+    tensor = torch.sparse_coo_tensor(i, v, (3, 3), dtype=dtype)
+    return tensor
+
 
 @torch.jit.script
 def create_torchscript_tensor() -> torch.Tensor:
@@ -104,6 +110,9 @@ def my_rref_add(rref_t1, t2):
     ret = torch.add(rref_t1.local_value(), t2)
     return ret
 
+def my_sum(t):
+    return torch.sparse.sum(t) if t.is_sparse else t.sum()
+
 
 @torch.jit.script
 def my_script_add(t1, t2):
@@ -159,13 +168,10 @@ def _all_contexts_cleaned_up(timeout_seconds=10):
 
 # This function creates a dis atugorad context, run rpc_sync on the given ps,
 # and then blocks until the ps has verified the grads are correctly accumulated.
-def _run_trainer(rref_t1, t2, ps, rank_diff, sparse):
+def _run_trainer(rref_t1, t2, ps, rank_diff):
     with dist_autograd.context() as context_id:
         ret = rpc.rpc_sync(ps, my_rref_add, args=(rref_t1, t2))
-        if sparse:
-            loss = torch.sparse.sum(ret)
-        else:
-            loss = ret.sum()
+        loss = my_sum(ret)
         dist_autograd.backward(context_id, [loss])
         # prevent deleting dist autograd context
         rpc.rpc_sync(ps, _set_rpc_done, args=(context_id, rank_diff))
@@ -173,13 +179,10 @@ def _run_trainer(rref_t1, t2, ps, rank_diff, sparse):
 
 # This function is the same as _run_trainer, except rpc calls torchscript
 # function "my_script_ref_add" instead of python funciton "my_rref_add"
-def _run_trainer_torchscript(rref_t1, t2, ps, rank_diff, sparse):
+def _run_trainer_torchscript(rref_t1, t2, ps, rank_diff):
     with dist_autograd.context() as context_id:
         ret = rpc.rpc_sync(ps, my_script_ref_add, args=(rref_t1, t2))
-        if sparse:
-            loss = torch.sparse.sum(ret)
-        else:
-            loss = ret.sum()
+        loss = my_sum(ret)
         dist_autograd.backward(context_id, [loss])
         # prevent deleting dist autograd context
         rpc.rpc_sync(ps, _set_rpc_done, args=(context_id, rank_diff))
@@ -990,25 +993,19 @@ def test_error_in_context(self):
 
     def _backward_no_grad_on_tensor(self, t1, t2, sparse):
         with dist_autograd.context() as context_id:
-            loss = rpc.rpc_sync(
+            ret = rpc.rpc_sync(
                 worker_name(self._next_rank()),
                 torch.add,
                 args=(t1, t2))
-            if sparse:
-                loss = torch.sparse.sum(loss)
-            else:
-                loss = loss.sum()
+            loss = my_sum(ret)
             dist_autograd.backward(context_id, [loss], retain_graph=True)
             self.assertIsNone(t1.grad)
             self.assertIsNone(t2.grad)
 
             # Now populate .grad with local autograd engine and
             # verify dist autograd doesn't mess with it.
-            loss_local = torch.add(t1, t2)
-            if sparse:
-                loss_local = torch.sparse.sum(loss_local)
-            else:
-                loss_local = loss_local.sum()
+            ret = torch.add(t1, t2)
+            loss_local = my_sum(ret)
             loss_local.backward()
             self.assertIsNotNone(t1.grad)
             self.assertIsNotNone(t2.grad)
@@ -1043,10 +1040,7 @@ def _backward_simple(self, dst, t1, t2, local_grads, sparse):
                 ret = self._exec_func_with_dst(
                     dst, exec_mode, torch.add, t1, t2
                 )
-                if sparse:
-                    loss = torch.sparse.sum(ret)
-                else:
-                    loss = ret.sum()
+                loss = my_sum(ret)
                 ret = self._verify_backwards(
                     exec_mode, [loss], context_id, local_grads, t1, t2
                 )
@@ -1099,10 +1093,7 @@ def test_backward_simple_self_sparse(self):
     # tensor lives on the rref owner.
     def _backward_rref(self, callee, rref_owner, t1, t2, local_grads, sparse):
         local_ret = torch.add(t1, t2)
-        if sparse:
-            local_ret = torch.sparse.sum(local_ret)
-        else:
-            local_ret = local_ret.sum()
+        local_ret = my_sum(local_ret)
         local_ret.backward()
         with dist_autograd.context() as context_id:
             if sparse:
@@ -1120,10 +1111,7 @@ def _backward_rref(self, callee, rref_owner, t1, t2, local_grads, sparse):
                     callee, my_nested_rref_add, args=(rref_owner, rref_t1, t2)
                 )
             ret = rref.to_here()
-            if sparse:
-                ret = torch.sparse.sum(ret)
-            else:
-                ret = ret.sum()
+            ret = my_sum(ret)
             dist_autograd.backward(context_id, [ret])
 
             # verify grads on caller
@@ -1238,10 +1226,7 @@ def _test_trainer_ps(self, create_ref_fn, trainer_fn, sparse):
             t2 = torch.zeros((3, 3), requires_grad=True)
 
         local_ret = torch.add(t1, t2)
-        if sparse:
-            torch.sparse.sum(local_ret).backward()
-        else:
-            local_ret.sum().backward()
+        my_sum(local_ret).backward()
 
         # create rref on self
         rref_t1 = rpc.remote(
@@ -1257,7 +1242,7 @@ def _test_trainer_ps(self, create_ref_fn, trainer_fn, sparse):
                 rpc.rpc_async(
                     worker_name((self.rank + rank_diff) % self.world_size),
                     trainer_fn,
-                    args=(rref_t1, t2, worker_name(self.rank), rank_diff, sparse),
+                    args=(rref_t1, t2, worker_name(self.rank), rank_diff),
                 )
             )
 
@@ -1309,7 +1294,7 @@ def test_trainer_ps_torchscript_functions(self):
 
         self._test_trainer_ps(create_torchscript_tensor, _run_trainer_torchscript, False)
 
-    def _backward_multiple_round_trips(self, t1, t2, t3, t4, t5, local_grads, sparse):
+    def _backward_multiple_round_trips(self, t1, t2, t3, t4, t5, local_grads):
         for exec_mode in [ExecMode.LOCAL, ExecMode.RPC_SYNC, ExecMode.REMOTE]:
             with dist_autograd.context() as context_id:
                 # Multiple RPCs between different nodes.
@@ -1317,7 +1302,7 @@ def _backward_multiple_round_trips(self, t1, t2, t3, t4, t5, local_grads, sparse
                 val = self._exec_func(exec_mode, torch.mul, t3, val)
                 s1 = self._exec_func(exec_mode, torch.stack, (t4, val))
                 s2 = self._exec_func(exec_mode, torch.stack, (t5, val))
-                if sparse:
+                if s1.is_sparse:
                     val = self._exec_func(exec_mode, torch.mul, s1, s2)
                     val = self._exec_func(exec_mode, torch.mul, val, val)
                     loss = torch.sparse.sum(val)
@@ -1339,8 +1324,7 @@ def test_backward_multiple_round_trips(self):
             torch.rand((3, 3), requires_grad=True),
             torch.rand((3, 3)),
             torch.rand((3, 3), requires_grad=True),
-            None,
-            False
+            None
         )
 
     @dist_init
@@ -1351,8 +1335,7 @@ def test_backward_multiple_round_trips_sparse(self):
             build_sparse_tensor(requires_grad=True),
             build_sparse_tensor(requires_grad=False),
             build_sparse_tensor(requires_grad=True),
-            None,
-            True
+            None
         )
 
     @dist_init
@@ -1589,15 +1572,12 @@ def test_backward_multiple_roots(self):
                     exec_mode, [r1, r2, r3, r4], context_id, local_grads, t1, t2
                 )
 
-    def _backward_different_dtypes(self, t1, t2, sparse):
+    def _backward_different_dtypes(self, t1, t2):
         local_grads = None
         for exec_mode in [ExecMode.LOCAL, ExecMode.REMOTE]:
             with dist_autograd.context() as context_id:
                 loss = self._exec_func(exec_mode, torch.add, t1, t2)
-                if sparse:
-                    loss = torch.sparse.sum(loss)
-                else:
-                    loss = loss.sum()
+                loss = my_sum(loss)
                 local_grads = self._verify_backwards(
                     exec_mode, [loss], context_id, local_grads, t1, t2
                 )
@@ -1606,29 +1586,24 @@ def _backward_different_dtypes(self, t1, t2, sparse):
     def test_backward_different_dtypes(self):
         self._backward_different_dtypes(
             torch.rand((3, 3), requires_grad=True, dtype=torch.float32),
-            torch.rand((3, 3), requires_grad=True, dtype=torch.float64),
-            False
+            torch.rand((3, 3), requires_grad=True, dtype=torch.float64)
         )
 
     @dist_init
     def test_backward_different_dtypes_sparse(self):
         self._backward_different_dtypes(
             build_sparse_tensor(requires_grad=True, dtype=torch.float32),
-            build_sparse_tensor(requires_grad=True, dtype=torch.float64),
-            True
+            build_sparse_tensor(requires_grad=True, dtype=torch.float64)
         )
 
     # Run the same code locally and with dist autograd and verify gradients
     # are same.
-    def _backward_simple_python_udf(self, t1, t2, sparse):
+    def _backward_simple_python_udf(self, t1, t2):
         local_grads = None
         for exec_mode in [ExecMode.LOCAL, ExecMode.REMOTE]:
             with dist_autograd.context() as context_id:
                 ret = self._exec_func(exec_mode, my_py_add, t1, t2)
-                if sparse:
-                    loss = torch.sparse.sum(ret)
-                else:
-                    loss = ret.sum()
+                loss = my_sum(ret)
                 local_grads = self._verify_backwards(
                     exec_mode, [loss], context_id, local_grads, t1, t2
                 )
@@ -1637,21 +1612,19 @@ def _backward_simple_python_udf(self, t1, t2, sparse):
     def test_backward_simple_python_udf(self):
         self._backward_simple_python_udf(
             torch.rand(3, 3, requires_grad=True),
-            torch.rand(3, 3, requires_grad=True),
-            False
+            torch.rand(3, 3, requires_grad=True)
         )
 
     @dist_init
     def test_backward_simple_python_udf_sparse(self):
         self._backward_simple_python_udf(
             build_sparse_tensor(requires_grad=True),
-            build_sparse_tensor(requires_grad=True),
-            True
+            build_sparse_tensor(requires_grad=True)
         )
 
     # Run the same code locally and with dist autograd and verify gradients
     # are same.
-    def _backward_simple_script_call(self, t1, t2, sparse):
+    def _backward_simple_script_call(self, t1, t2):
         local_grads = None
         for exec_mode in [
             ExecMode.LOCAL,
@@ -1661,10 +1634,7 @@ def _backward_simple_script_call(self, t1, t2, sparse):
         ]:
             with dist_autograd.context() as context_id:
                 forward_ret = self._exec_func(exec_mode, my_script_add, t1, t2)
-                if sparse:
-                    loss = torch.sparse.sum(forward_ret)
-                else:
-                    loss = forward_ret.sum()
+                loss = my_sum(forward_ret)
                 ret = self._verify_backwards(
                     exec_mode, [loss], context_id, local_grads, t1, t2
                 )
@@ -1674,16 +1644,14 @@ def _backward_simple_script_call(self, t1, t2, sparse):
     def test_backward_simple_script_call(self):
         self._backward_simple_script_call(
             torch.rand(3, 3, requires_grad=True),
-            torch.rand(3, 3, requires_grad=True),
-            False
+            torch.rand(3, 3, requires_grad=True)
         )
 
     @dist_init
     def test_backward_simple_script_call_sparse(self):
         self._backward_simple_script_call(
             build_sparse_tensor(requires_grad=True),
-            build_sparse_tensor(requires_grad=True),
-            True
+            build_sparse_tensor(requires_grad=True)
         )
 
     @staticmethod
@@ -1796,28 +1764,22 @@ def _nested_python_udf(t1, t2, dst):
         res = rpc.rpc_sync(worker_name(dst), my_py_add, args=(t3, t4))
         return t1 * t2 * t3 * t4 * res
 
-    def _backwards_nested_python_udf(self, t1, t2, sparse):
+    def _backwards_nested_python_udf(self, t1, t2):
         t3 = t1 * t2
         t4 = t1 + t2
         res = t3 + t4
-        loss = t1 * t2 * t3 * t4 * res
-        if sparse:
-            loss = torch.sparse.sum(loss)
-        else:
-            loss = loss.sum()
+        ret = t1 * t2 * t3 * t4 * res
+        loss = my_sum(ret)
         torch.autograd.backward([loss])
 
         # Now run distributed autograd.
         with dist_autograd.context() as context_id:
-            loss = rpc.rpc_sync(
+            ret = rpc.rpc_sync(
                 worker_name(self._next_rank()),
                 DistAutogradTest._nested_python_udf,
                 args=(t1, t2, self._next_rank()),
             )
-            if sparse:
-                loss = torch.sparse.sum(loss)
-            else:
-                loss = loss.sum()
+            loss = my_sum(ret)
             dist_autograd.backward(context_id, [loss])
             grads = dist_autograd.get_gradients(context_id)
             self.assertEqual(t1.grad, grads[t1])
@@ -1828,8 +1790,7 @@ def test_backwards_nested_python_udf(self):
         # Run equivalent of _nested_python_udf locally.
         self._backwards_nested_python_udf(
             torch.rand(3, 3, requires_grad=True),
-            torch.rand(3, 3, requires_grad=True),
-            False
+            torch.rand(3, 3, requires_grad=True)
         )
 
     @dist_init
@@ -1837,8 +1798,7 @@ def test_backwards_nested_python_udf_sparse(self):
         # Run equivalent of _nested_python_udf locally.
         self._backwards_nested_python_udf(
             build_sparse_tensor(requires_grad=True),
-            build_sparse_tensor(requires_grad=True),
-            True
+            build_sparse_tensor(requires_grad=True)
         )
 
     _test_clean_context_backward_context_id = None
@@ -1986,17 +1946,14 @@ def _mixed_requires_grad_operaton(cls, t1, t2):
         else:
             return t1 * t2
 
-    def _mixed_requires_grad(self, t1, t2, sparse):
+    def _mixed_requires_grad(self, t1, t2):
         for exec_mode in [ExecMode.RPC_SYNC, ExecMode.REMOTE]:
             with dist_autograd.context() as context_id:
                 ret = self._exec_func(
                     exec_mode, DistAutogradTest._mixed_requires_grad_operaton, t1, t2
                 )
                 self.assertEqual(t1 * t2, ret)
-                if sparse:
-                    loss = torch.sparse.sum(ret)
-                else:
-                    loss = ret.sum()
+                loss = my_sum(ret)
                 dist_autograd.backward(context_id, [loss])
                 self.assertTrue(t1.requires_grad)
                 self.assertFalse(t2.requires_grad)
@@ -2009,16 +1966,14 @@ def _mixed_requires_grad(self, t1, t2, sparse):
     def test_mixed_requires_grad(self):
         self._mixed_requires_grad(
             torch.rand(3, 3, requires_grad=True),
-            torch.rand(3, 3, requires_grad=False),
-            False
+            torch.rand(3, 3, requires_grad=False)
         )
 
     @dist_init
     def test_mixed_requires_grad_sparse(self):
         self._mixed_requires_grad(
             build_sparse_tensor(requires_grad=True),
-            build_sparse_tensor(requires_grad=False),
-            True
+            build_sparse_tensor(requires_grad=False)
         )
 
     class TestDebugInfoFunc(Function):
@@ -2160,17 +2115,14 @@ def test_backward_accumulate_grads(self):
     def _test_nested_backward_accumulate_grads(t1, t2, dst_rank):
         return rpc.rpc_sync(worker_name(dst_rank), torch.add, args=(t1, t2))
 
-    def _nested_backward_accumulate_grads(self, t1, t2, sparse):
+    def _nested_backward_accumulate_grads(self, t1, t2):
         with dist_autograd.context() as context_id:
             ret = rpc.rpc_sync(
                 worker_name(self._next_rank()),
                 DistAutogradTest._test_nested_backward_accumulate_grads,
                 args=(t1, t2, self._next_rank()),
             )
-            if sparse:
-                loss = torch.sparse.sum(ret)
-            else:
-                loss = ret.sum()
+            loss = my_sum(ret)
             # Run backward twice.
             dist_autograd.backward(context_id, [loss], retain_graph=True)
             dist_autograd.backward(context_id, [loss])
@@ -2179,28 +2131,23 @@ def _nested_backward_accumulate_grads(self, t1, t2, sparse):
     def test_nested_backward_accumulate_grads(self):
         self._nested_backward_accumulate_grads(
             torch.rand(3, 3, requires_grad=True),
-            torch.rand(3, 3, requires_grad=True),
-            False
+            torch.rand(3, 3, requires_grad=True)
         )
 
     @dist_init
     def test_nested_backward_accumulate_grads_sparse(self):
         self._nested_backward_accumulate_grads(
             build_sparse_tensor(requires_grad=True),
-            build_sparse_tensor(requires_grad=True),
-            True
+            build_sparse_tensor(requires_grad=True)
         )
 
-    def _multiple_backward(self, t1, t2, sparse):
+    def _multiple_backward(self, t1, t2):
         with dist_autograd.context() as context_id:
-            loss = rpc.rpc_sync(
+            ret = rpc.rpc_sync(
                 worker_name(self._next_rank()),
                 torch.add,
                 args=(t1, t2))
-            if sparse:
-                loss = torch.sparse.sum(loss)
-            else:
-                loss = loss.sum()
+            loss = my_sum(ret)
             # Run backward in a loop multiple times.
             for i in range(1000):
                 dist_autograd.backward(context_id, [loss], retain_graph=True)
@@ -2209,16 +2156,14 @@ def _multiple_backward(self, t1, t2, sparse):
     def test_multiple_backward(self):
         self._multiple_backward(
             torch.rand(3, 3, requires_grad=True),
-            torch.rand(3, 3, requires_grad=True),
-            False
+            torch.rand(3, 3, requires_grad=True)
         )
 
     @dist_init
     def test_multiple_backward_sparse(self):
         self._multiple_backward(
             build_sparse_tensor(requires_grad=True),
-            build_sparse_tensor(requires_grad=True),
-            True
+            build_sparse_tensor(requires_grad=True)
         )
 
     @dist_init(clean_shutdown=False)
@@ -2524,15 +2469,13 @@ def test_thread_local_context_id(self):
 
 
 class CudaDistAutogradTest(CommonDistAutogradTest):
-    @skip_if_lt_x_gpu(1)
-    @dist_init
-    def test_gpu_simple(self):
-        t1 = torch.rand(3, 3, requires_grad=True, device="cuda:0")
-        t2 = torch.rand(3, 3, requires_grad=True, device="cuda:0")
-        (t1 + t2).sum().backward()
+
+    def _gpu_simple(self, t1, t2):
+        my_sum(t1 + t2).backward()
         with dist_autograd.context() as context_id:
             t3 = t1 + t2
-            dist_autograd.backward(context_id, [t3.sum()])
+            loss = my_sum(t3)
+            dist_autograd.backward(context_id, [loss])
             grads = dist_autograd.get_gradients(context_id)
             self.assertEqual(2, len(grads))
             self.assertEqual(t1.grad, grads[t1])
@@ -2540,9 +2483,22 @@ def test_gpu_simple(self):
 
     @skip_if_lt_x_gpu(1)
     @dist_init
-    def test_gpu_to_cpu_continuation(self):
-        t1 = torch.rand(3, 3, requires_grad=True, device="cuda:0")
-        t2 = torch.rand(3, 3, requires_grad=True)
+    def test_gpu_simple(self):
+        self._gpu_simple(
+            torch.rand(3, 3, requires_grad=True, device="cuda:0"),
+            torch.rand(3, 3, requires_grad=True, device="cuda:0")
+        )
+
+    @skip_if_lt_x_gpu(1)
+    @dist_init
+    def test_gpu_simple_sparse(self):
+        self._gpu_simple(
+            build_sparse_tensor(requires_grad=True, device="cuda:0"),
+            build_sparse_tensor(requires_grad=True, device="cuda:0")
+        )
+
+
+    def _gpu_to_cpu_continuation(self, t1, t2):
         # Run a few iterations.
         for i in range(3):
             t1.grad = None
@@ -2557,16 +2513,29 @@ def test_gpu_to_cpu_continuation(self):
                     t6 = t5.cuda(0) + t4
                     t7 = self._exec_func(exec_mode, torch.add, t6.cpu(), t5)
                     # Autograd graph consists of CPU -> GPU -> CPU execution.
+                    loss = my_sum(t7)
                     ret = self._verify_backwards(
-                        exec_mode, [t7.sum()], context_id, local_grads, t1, t2
+                        exec_mode, [loss], context_id, local_grads, t1, t2
                     )
                     local_grads = ret if ret else local_grads
 
     @skip_if_lt_x_gpu(1)
     @dist_init
-    def test_gpu_to_cpu_continuation_gpu_root(self):
-        t1 = torch.rand(3, 3, requires_grad=True, device="cuda:0")
-        t2 = torch.rand(3, 3, requires_grad=True)
+    def test_gpu_to_cpu_continuation(self):
+        self._gpu_to_cpu_continuation(
+            torch.rand(3, 3, requires_grad=True, device="cuda:0"),
+            torch.rand(3, 3, requires_grad=True)
+        )
+
+    @skip_if_lt_x_gpu(1)
+    @dist_init
+    def test_gpu_to_cpu_continuation_sparse(self):
+        self._gpu_to_cpu_continuation(
+            build_sparse_tensor(requires_grad=True, device="cuda:0"),
+            build_sparse_tensor(requires_grad=True)
+        )
+
+    def _gpu_to_cpu_continuation_gpu_root(self, t1, t2):
         # Run a few iterations.
         for i in range(3):
             t1.grad = None
@@ -2580,11 +2549,28 @@ def test_gpu_to_cpu_continuation_gpu_root(self):
                     t5 = self._exec_func(exec_mode, torch.add, t4.cpu(), t2)
                     t6 = t5.cuda(0) + t4
                     # Autograd graph consists of CPU -> GPU -> CPU execution.
+                    loss = my_sum(t6)
                     ret = self._verify_backwards(
-                        exec_mode, [t6.sum()], context_id, local_grads, t1, t2
+                        exec_mode, [loss], context_id, local_grads, t1, t2
                     )
                     local_grads = ret if ret else local_grads
 
+    @skip_if_lt_x_gpu(1)
+    @dist_init
+    def test_gpu_to_cpu_continuation_gpu_root(self):
+        self._gpu_to_cpu_continuation_gpu_root(
+            torch.rand(3, 3, requires_grad=True, device="cuda:0"),
+            torch.rand(3, 3, requires_grad=True)
+        )
+
+    @skip_if_lt_x_gpu(1)
+    @dist_init
+    def test_gpu_to_cpu_continuation_gpu_root_sparse(self):
+        self._gpu_to_cpu_continuation_gpu_root(
+            build_sparse_tensor(requires_grad=True, device="cuda:0"),
+            build_sparse_tensor(requires_grad=True)
+        )
+
 
 class FaultyAgentDistAutogradTest(RpcAgentTestFixture):
     # Reusing a simplified helper function from DistAutogradTest to ensure
@@ -2646,8 +2632,7 @@ def gradients(self, ctx_id):
 
 class TensorPipeCudaDistAutogradTest(RpcAgentTestFixture):
 
-    @skip_if_lt_x_gpu(4)
-    def test_device_maps_backward_pass(self):
+    def _device_maps_backward_pass(self, t1, t2):
         options = self.rpc_backend_options
         dst = worker_name((self.rank + 1) % self.world_size)
 
@@ -2662,19 +2647,36 @@ def test_device_maps_backward_pass(self):
             rpc_backend_options=options,
         )
 
-        t1 = torch.rand(10, device=self.rank, requires_grad=True)
-        t2 = torch.rand(10, device=self.rank, requires_grad=True)
         with dist_autograd.context() as context_id:
             res = rpc.rpc_sync(dst, torch.add, args=(t1, t2))
-            dist_autograd.backward(context_id, [res.sum()])
+            loss = my_sum(res)
+            dist_autograd.backward(context_id, [loss])
             grads = dist_autograd.get_gradients(context_id)
-            self.assertEqual(torch.ones(10), grads[t1])
-            self.assertEqual(torch.ones(10), grads[t2])
+            if t1.is_sparse:
+                self.assertEqual(build_sparse_one_gradient(), grads[t1])
+                self.assertEqual(build_sparse_one_gradient(), grads[t2])
+            else:
+                self.assertEqual(torch.ones(10), grads[t1])
+                self.assertEqual(torch.ones(10), grads[t2])
             self.assertEqual(t1.device, grads[t1].device)
             self.assertEqual(t2.device, grads[t2].device)
 
         rpc.shutdown()
 
+    @skip_if_lt_x_gpu(4)
+    def test_device_maps_backward_pass(self):
+        self._device_maps_backward_pass(
+            torch.rand(10, requires_grad=True, device=self.rank),
+            torch.ones(10, requires_grad=True, device=self.rank)
+        )
+
+    @skip_if_lt_x_gpu(4)
+    def test_device_maps_backward_pass_sparse(self):
+        self._device_maps_backward_pass(
+            build_sparse_tensor(requires_grad=True, device=self.rank),
+            build_sparse_tensor(requires_grad=True, device=self.rank)
+        )
+
     class MyRemoteCompute(torch.nn.Module):
         def __init__(self):
             super().__init__()
@@ -2691,9 +2693,7 @@ def __init__(self, next_stage):
         def forward(self, input):
             return self.next_stage.rpc_sync().forward(input)
 
-    @skip_if_lt_x_gpu(4)
-    def test_dist_autograd_sync_streams(self):
-
+    def _dist_autograd_sync_streams(self, sparse):
         options = self.rpc_backend_options
         dst = worker_name((self.rank + 1) % self.world_size)
 
@@ -2711,17 +2711,20 @@ def test_dist_autograd_sync_streams(self):
         remote_compute = rpc.remote(dst, TensorPipeCudaDistAutogradTest.MyRemoteCompute)
         local_compute = TensorPipeCudaDistAutogradTest.MyLocalCompute(remote_compute)
         for _ in range(10):
-            input = torch.rand([1000, 10000], device=self.rank, requires_grad=True)
+            if sparse:
+                input = build_sparse_tensor(requires_grad=True, device=self.rank)
+            else:
+                input = torch.rand([1000, 10000], device=self.rank, requires_grad=True)
             # Run local autograd
             result = input * 2.0
             r = random.random()
-            loss = result.sum() * r
+            loss = my_sum(result) * r
             loss.backward()
 
             # Run distributed autograd
             with dist_autograd.context() as context_id:
                 result = local_compute(input)
-                loss = result.sum() * r
+                loss = my_sum(result) * r
                 dist_autograd.backward(context_id, [loss])
 
                 # Compare grads.
@@ -2731,7 +2734,14 @@ def test_dist_autograd_sync_streams(self):
         rpc.shutdown()
 
     @skip_if_lt_x_gpu(4)
-    def test_gradients_synchronizations(self):
+    def test_dist_autograd_sync_streams(self):
+        self._dist_autograd_sync_streams(False)
+
+    @skip_if_lt_x_gpu(4)
+    def test_dist_autograd_sync_streams_sparse(self):
+        self._dist_autograd_sync_streams(True)
+
+    def _gradients_synchronizations(self, x):
         options = self.rpc_backend_options
         for peer_rank in range(self.world_size):
             options.set_device_map(worker_name(peer_rank), {self.rank: peer_rank})
@@ -2755,8 +2765,8 @@ def test_gradients_synchronizations(self):
                     WrapperModule,
                     args=(layers[rank - 1], rank)
                 ))
+            x = x.to(0)
 
-            x = torch.randn(5000, 2000).to(0)
             # local iteration
             local_model = nn.Sequential(*local_layers)
             local_model(x).sum().backward()
@@ -2778,3 +2788,15 @@ def test_gradients_synchronizations(self):
                         self.assertEqual(g1, g2)
 
         rpc.shutdown()
+
+    @skip_if_lt_x_gpu(4)
+    def test_gradients_synchronizations(self):
+        self._gradients_synchronizations(
+            torch.randn(5000, 2000)
+        )
+
+    @skip_if_lt_x_gpu(4)
+    def test_gradients_synchronizations_sparse(self):
+        self._gradients_synchronizations(
+            torch.randn(5000, 2000).to_sparse()
+        )
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index e0ef915ee8937..23759f1e292ad 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -194,6 +194,14 @@ def my_slow_method(self, my_tensor_arg):
         return torch.add(self.a, my_tensor_arg)
 
 
+def _run_func_in_mode(to, fn, mode, args=None, kwargs=None):
+    if mode == RPCExecMode.SYNC:
+        return rpc.rpc_sync(to, fn, args=args, kwargs=kwargs)
+    elif mode == RPCExecMode.ASYNC:
+        return rpc.rpc_async(to, fn, args=args, kwargs=kwargs).wait()
+    elif mode == RPCExecMode.REMOTE:
+        return rpc.remote(to, fn, args=args, kwargs=kwargs).to_here()
+
 def _call_method_on_rref(method, rref, *args, **kwargs):
     return method(rref.local_value(), *args, **kwargs)
 
@@ -736,7 +744,7 @@ def test_send_to_rank(self):
 
         # Test dense tensor
         for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
-            ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
+            ret = _run_func_in_mode(dst_rank, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
             self.assertEqual(ret, torch.ones(2, 2) + 1)
 
         # Test sparse tensor
@@ -744,32 +752,32 @@ def test_send_to_rank(self):
             x = build_sparse_tensor()
             y = build_sparse_tensor()
             expected_tensor = (x + y)
-            ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y))
+            ret = _run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y))
             self.assertEqual(expected_tensor, ret)
 
         for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
             x = build_sparse_tensor(coalesce=True)
             y = build_sparse_tensor(coalesce=True)
             expected_tensor = (x + y)
-            ret = self._run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y))
+            ret = _run_func_in_mode(dst_rank, torch.add, exec_mode, args=(x, y))
             self.assertEqual(expected_tensor, ret)
 
         # Test invalid ranks
         for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
             with self.assertRaises(RuntimeError):
-                self._run_func_in_mode(self.world_size + 1, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
+                _run_func_in_mode(self.world_size + 1, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
 
         for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
             with self.assertRaises(RuntimeError):
-                self._run_func_in_mode(-1, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
+                _run_func_in_mode(-1, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
 
         for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
             with self.assertRaises(ValueError):
-                self._run_func_in_mode(dst_rank + 0.5, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
+                _run_func_in_mode(dst_rank + 0.5, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
 
         for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
             with self.assertRaises(ValueError):
-                self._run_func_in_mode(dst_rank - 0.5, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
+                _run_func_in_mode(dst_rank - 0.5, torch.add, exec_mode, args=(torch.ones(2, 2), 1))
 
     def _self_py_udf_remote(self, worker_info, x, y, z):
         rref = rpc.remote(worker_info, my_function, args=(x, y, z))
@@ -4025,17 +4033,9 @@ def test_future_in_rpc(self):
     def test_future_nested_callback(self):
         self._test_future_cb(add_use_future_nested_cb)
 
-    def _run_func_in_mode(self, to, fn, mode, args=None, kwargs=None):
-        if mode == RPCExecMode.SYNC:
-            return rpc.rpc_sync(to, fn, args=args, kwargs=kwargs)
-        elif mode == RPCExecMode.ASYNC:
-            return rpc.rpc_async(to, fn, args=args, kwargs=kwargs).wait()
-        elif mode == RPCExecMode.REMOTE:
-            return rpc.remote(to, fn, args=args, kwargs=kwargs).to_here()
-
     def _test_async_function_raise(self, mode):
         with self.assertRaisesRegex(RuntimeError, "Expected error"):
-            self._run_func_in_mode(
+            _run_func_in_mode(
                 worker_name((self.rank + 1) % self.world_size),
                 async_raise_func,
                 mode
@@ -4059,7 +4059,7 @@ def _test_async_function_wrong_return_type(self, mode):
             "torch\\.futures\\.Future object,"
         )
         with self.assertRaisesRegex(RuntimeError, errMsg):
-            self._run_func_in_mode(
+            _run_func_in_mode(
                 worker_name((self.rank + 1) % self.world_size),
                 async_wrong_type,
                 mode
@@ -4090,7 +4090,7 @@ def _test_async_function(self, fn, mode=RPCExecMode.SYNC):
         dst2 = worker_name((self.rank + 2) % self.world_size)
 
         args = (dst2, torch.ones(2, 2), 1, 2)
-        ret = self._run_func_in_mode(dst1, fn, mode, args=args)
+        ret = _run_func_in_mode(dst1, fn, mode, args=args)
         self.assertEqual(ret, torch.ones(2, 2) + 3)
 
     @dist_init
@@ -4183,7 +4183,7 @@ def _test_async_function_multi(self, fn, mode=RPCExecMode.SYNC):
         num = 20
         step = 3
         args = (dst2, torch.ones(2, 2), num, step)
-        ret = self._run_func_in_mode(dst1, fn, mode, args=args)
+        ret = _run_func_in_mode(dst1, fn, mode, args=args)
         self.assertEqual(ret, torch.ones(2, 2) + num * step)
 
     @dist_init
@@ -4227,7 +4227,7 @@ def _test_return_future(self, mode):
             RuntimeError,
             "Can not pickle torch.futures.Future"
         ):
-            self._run_func_in_mode(
+            _run_func_in_mode(
                 worker_name((self.rank + 1) % self.world_size),
                 return_future,
                 mode
@@ -5217,13 +5217,33 @@ def test_device_maps_gpu(self):
             rpc_backend_options=options,
         )
 
-        ret = rpc.rpc_sync(
-            dst,
-            TensorPipeAgentCudaRpcTest._gpu_add,
-            args=(torch.zeros(2).to(0), torch.ones(2).to(0))
-        )
-        self.assertEqual(ret.device, torch.device(1))
-        self.assertEqual(ret, (torch.zeros(2) + torch.ones(2)).to(1))
+        # Test dense tensor
+        for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
+            x = torch.ones(2, 2)
+            y = torch.ones(2, 2)
+            expected_tensor = (x + y)
+            ret = _run_func_in_mode(dst, TensorPipeAgentCudaRpcTest._gpu_add, exec_mode, args=(x.to(0), y.to(0)))
+            self.assertEqual(ret.device, torch.device(1))
+            self.assertEqual(ret, expected_tensor.to(1))
+
+        # Test sparse tensor uncoalesced
+        for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
+            x = build_sparse_tensor()
+            y = build_sparse_tensor()
+            expected_tensor = (x + y)
+            ret = _run_func_in_mode(dst, TensorPipeAgentCudaRpcTest._gpu_add, exec_mode, args=(x.to(0), y.to(0)))
+            self.assertEqual(ret.device, torch.device(1))
+            self.assertEqual(ret, expected_tensor.to(1))
+
+        # Test sparse tensor coalesced
+        for exec_mode in [RPCExecMode.SYNC, RPCExecMode.ASYNC, RPCExecMode.REMOTE]:
+            x = build_sparse_tensor().coalesce()
+            y = build_sparse_tensor().coalesce()
+            expected_tensor = (x + y)
+            ret = _run_func_in_mode(dst, TensorPipeAgentCudaRpcTest._gpu_add, exec_mode, args=(x.to(0), y.to(0)))
+            self.assertEqual(ret.device, torch.device(1))
+            self.assertEqual(ret, expected_tensor.to(1))
+
         rpc.shutdown()
 
     @staticmethod
@@ -5722,8 +5742,7 @@ def test_device_maps_missing_config_remote(self):
     def test_device_maps_missing_config_remote_response(self):
         self._test_device_maps_missing_config_response(RPCExecMode.REMOTE)
 
-    @skip_if_lt_x_gpu(2)
-    def test_device_maps_remote(self):
+    def _device_maps_remote(self, x, y, expected):
         options = self.rpc_backend_options
         dst = worker_name((self.rank + 1) % self.world_size)
         options.set_device_map(dst, {1: 0})
@@ -5739,14 +5758,29 @@ def test_device_maps_remote(self):
         rref = rpc.remote(
             dst,
             TensorPipeAgentCudaRpcTest._add_to_gpu,
-            args=(torch.zeros(2), 1)
+            args=(x, y)
         )
-
         self.assertEqual(rref.to_here().device.index, 1)
-        self.assertEqual(rref.to_here(), torch.ones(2).to(1))
 
+        self.assertEqual(rref.to_here(), expected.to(1))
         rpc.shutdown()
 
+    @skip_if_lt_x_gpu(2)
+    def test_device_maps_remote(self):
+        self._device_maps_remote(
+            torch.ones(3, 3),
+            torch.ones(3, 3),
+            torch.ones(3, 3) + torch.ones(3, 3)
+        )
+
+    @skip_if_lt_x_gpu(2)
+    def test_device_maps_remote_sparse(self):
+        self._device_maps_remote(
+            build_sparse_tensor(),
+            build_sparse_tensor(),
+            build_sparse_tensor() + build_sparse_tensor()
+        )
+
     @staticmethod
     def _slow_add_on_user_stream(x, y):
         s0 = torch.cuda.current_stream(x.device)

From 4968d0b34ffd624445de0137924288bccf28f51a Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@fb.com>
Date: Thu, 2 Sep 2021 16:21:52 -0700
Subject: [PATCH 493/530] [POC] .github: Add event name to concurrency (#64402)

Summary:
This would ensure that manually/API triggered workflows would not cancel other triggered workflows. For example, the manually triggered periodic 11.1 linux job cancelled the scheduled one here, which we may not want:
![image](https://user-images.githubusercontent.com/31798555/131752175-1c99d56e-d344-46e1-b8ac-9c12bba0569a.png).

This would be helpful later as we use more dispatched workflows (e.g., for bisect functionality)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64402

Reviewed By: malfet

Differential Revision: D30734860

Pulled By: janeyx99

fbshipit-source-id: 220016716094666e9af836fcd716dd529cf23d8a
---
 .github/scripts/ensure_actions_will_cancel.py               | 3 ++-
 .github/templates/linux_ci_workflow.yml.j2                  | 6 +++++-
 .github/templates/windows_ci_workflow.yml.j2                | 2 +-
 .github/workflows/add_annotations.yml                       | 2 +-
 .github/workflows/auto_label.yml                            | 2 +-
 .github/workflows/build_linux_conda.yml                     | 2 +-
 .github/workflows/build_linux_libtorch.yml                  | 2 +-
 .github/workflows/build_linux_wheels.yml                    | 2 +-
 .github/workflows/create_release.yml                        | 2 +-
 .../generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml | 6 +++++-
 .../generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml | 6 +++++-
 .../generated-linux-bionic-cuda10.2-py3.9-gcc7.yml          | 6 +++++-
 .../generated-linux-bionic-py3.8-gcc9-coverage.yml          | 6 +++++-
 .../generated-linux-xenial-cuda10.2-py3.6-gcc7.yml          | 6 +++++-
 .../generated-linux-xenial-cuda11.3-py3.6-gcc7.yml          | 6 +++++-
 .github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml   | 6 +++++-
 .../generated-linux-xenial-py3.6-gcc7-bazel-test.yml        | 6 +++++-
 ...d-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml | 6 +++++-
 .../generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml | 6 +++++-
 .../generated-periodic-win-vs2019-cuda11.1-py3.yml          | 2 +-
 .github/workflows/generated-win-vs2019-cpu-py3.yml          | 2 +-
 .github/workflows/generated-win-vs2019-cuda10.1-py3.yml     | 2 +-
 .github/workflows/generated-win-vs2019-cuda11.3-py3.yml     | 2 +-
 .github/workflows/lint.yml                                  | 2 +-
 .github/workflows/run_torchbench.yml                        | 2 +-
 .github/workflows/test_tools.yml                            | 2 +-
 26 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/.github/scripts/ensure_actions_will_cancel.py b/.github/scripts/ensure_actions_will_cancel.py
index 18e61d264f514..8af3e80702e56 100755
--- a/.github/scripts/ensure_actions_will_cancel.py
+++ b/.github/scripts/ensure_actions_will_cancel.py
@@ -15,7 +15,8 @@ def concurrency_key(filename: Path) -> str:
     workflow_name = filename.with_suffix("").name.replace("_", "-")
     if workflow_name.startswith("generated-"):
         workflow_name = workflow_name[len("generated-"):]
-    return f"{workflow_name}-${{{{ github.event.pull_request.number || github.sha }}}}"
+    return f"{workflow_name}-${{{{ github.event.pull_request.number || github.sha }}}}" \
+        "-${{ github.event_name == 'workflow_dispatch' }}"
 
 
 def should_check(filename: Path) -> bool:
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 314122b699c86..2d856704c3137 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -46,7 +46,7 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}
+  group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
@@ -54,9 +54,13 @@ jobs:
   !{{ ciflow_config.root_job_name }}:
     runs-on: ubuntu-18.04
     if: ${{ !{{ ciflow_config.root_job_condition }} }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running !{{ ciflow_config.root_job_name }}
+      - name: print labels
+        run: echo "${LABELS}"
 {%- endif %}
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2
index 05d739db523e3..4f486dd75f3b7 100644
--- a/.github/templates/windows_ci_workflow.yml.j2
+++ b/.github/templates/windows_ci_workflow.yml.j2
@@ -61,7 +61,7 @@ env:
 {%- endif %}
 
 concurrency:
-  group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}
+  group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
diff --git a/.github/workflows/add_annotations.yml b/.github/workflows/add_annotations.yml
index 9bb3c1b46e7b4..76f7307e3fb77 100644
--- a/.github/workflows/add_annotations.yml
+++ b/.github/workflows/add_annotations.yml
@@ -9,7 +9,7 @@ on:
 
 
 concurrency:
-  group: add-annotations-${{ github.event.pull_request.number || github.sha }}
+  group: add-annotations-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 
diff --git a/.github/workflows/auto_label.yml b/.github/workflows/auto_label.yml
index 1616ea9c90b8a..6dcb29a70f57a 100644
--- a/.github/workflows/auto_label.yml
+++ b/.github/workflows/auto_label.yml
@@ -8,7 +8,7 @@ on:
 
 
 concurrency:
-  group: auto-label-${{ github.event.pull_request.number || github.sha }}
+  group: auto-label-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 
diff --git a/.github/workflows/build_linux_conda.yml b/.github/workflows/build_linux_conda.yml
index 536a18771831e..b43c2013327ba 100644
--- a/.github/workflows/build_linux_conda.yml
+++ b/.github/workflows/build_linux_conda.yml
@@ -111,5 +111,5 @@ jobs:
           python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
 
 concurrency:
-  group: build-linux-conda-${{ github.event.pull_request.number || github.sha }}
+  group: build-linux-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
diff --git a/.github/workflows/build_linux_libtorch.yml b/.github/workflows/build_linux_libtorch.yml
index 9321c6ac8bf88..0a1c653375f9c 100644
--- a/.github/workflows/build_linux_libtorch.yml
+++ b/.github/workflows/build_linux_libtorch.yml
@@ -110,5 +110,5 @@ jobs:
           python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
 
 concurrency:
-  group: build-linux-libtorch-${{ github.event.pull_request.number || github.sha }}
+  group: build-linux-libtorch-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
diff --git a/.github/workflows/build_linux_wheels.yml b/.github/workflows/build_linux_wheels.yml
index 15a38f6cee0fe..1f8e5f02e2220 100644
--- a/.github/workflows/build_linux_wheels.yml
+++ b/.github/workflows/build_linux_wheels.yml
@@ -109,5 +109,5 @@ jobs:
           python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
 
 concurrency:
-  group: build-linux-wheels-${{ github.event.pull_request.number || github.sha }}
+  group: build-linux-wheels-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
index 4cd0568be5aad..eea423c00505c 100644
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@@ -48,5 +48,5 @@ jobs:
           files: ${{env.PT_RELEASE_FILE}}
 
 concurrency:
-  group: create-release-${{ github.event.pull_request.number || github.sha }}
+  group: create-release-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
index d2111896319df..477fe1bac6fe2 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -27,16 +27,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: libtorch-linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: libtorch-linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 51811ae9eca58..9fd6d7ff8d140 100644
--- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -27,16 +27,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: libtorch-linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index f410f1fc0be20..ee0ca4cf76ce0 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -27,16 +27,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: linux-bionic-cuda10.2-py3.9-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: linux-bionic-cuda10.2-py3.9-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
index 2bcb4261816de..2103f2b66bdbf 100644
--- a/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
+++ b/.github/workflows/generated-linux-bionic-py3.8-gcc9-coverage.yml
@@ -27,16 +27,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: linux-bionic-py3.8-gcc9-coverage-${{ github.event.pull_request.number || github.sha }}
+  group: linux-bionic-py3.8-gcc9-coverage-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/coverage') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 656b56548b26b..187f9c1ccfdfb 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -27,16 +27,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 68b42858c98e3..9fff700c56e84 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -27,16 +27,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: linux-xenial-cuda11.3-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index b347b52be9509..d1187de624f17 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -27,16 +27,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}
+  group: linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
index e86cc563a77f8..49d2cd2f2267c 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc7-bazel-test.yml
@@ -27,16 +27,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: linux-xenial-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}
+  group: linux-xenial-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
index b591519e3219c..620e4c3d2d318 100644
--- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -25,16 +25,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/libtorch') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index 07593ad07a941..e318e665c9156 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -25,16 +25,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}
+  group: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/scheduled')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge
diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
index 61c63f42cadf6..360fdc38c86ad 100644
--- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
+++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml
@@ -29,7 +29,7 @@ env:
   USE_CUDA: 1
 
 concurrency:
-  group: periodic-win-vs2019-cuda11.1-py3-${{ github.event.pull_request.number || github.sha }}
+  group: periodic-win-vs2019-cuda11.1-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml
index 6ef8e85ac931b..1277a69f1d13d 100644
--- a/.github/workflows/generated-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml
@@ -29,7 +29,7 @@ env:
   no_proxy: localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock
 
 concurrency:
-  group: win-vs2019-cpu-py3-${{ github.event.pull_request.number || github.sha }}
+  group: win-vs2019-cpu-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
diff --git a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
index c158f08731d99..185cb5903e189 100644
--- a/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda10.1-py3.yml
@@ -31,7 +31,7 @@ env:
   USE_CUDA: 1
 
 concurrency:
-  group: win-vs2019-cuda10.1-py3-${{ github.event.pull_request.number || github.sha }}
+  group: win-vs2019-cuda10.1-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
index bd945c3255a0a..b339e79926f53 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -31,7 +31,7 @@ env:
   USE_CUDA: 1
 
 concurrency:
-  group: win-vs2019-cuda11.3-py3-${{ github.event.pull_request.number || github.sha }}
+  group: win-vs2019-cuda11.3-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index f69d2b01ab35a..a1b6182aedaf4 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -467,5 +467,5 @@ jobs:
           fi
 
 concurrency:
-  group: lint-${{ github.event.pull_request.number || github.sha }}
+  group: lint-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml
index 786d25f4e3b0f..cee27e1866282 100644
--- a/.github/workflows/run_torchbench.yml
+++ b/.github/workflows/run_torchbench.yml
@@ -66,5 +66,5 @@ jobs:
           path: ~/.torchbench/bisection/pr${{ github.event.number }}
 
 concurrency:
-  group: run-torchbench-${{ github.event.pull_request.number || github.sha }}
+  group: run-torchbench-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
diff --git a/.github/workflows/test_tools.yml b/.github/workflows/test_tools.yml
index afc790bb10e2b..02ae0dd34e4fd 100644
--- a/.github/workflows/test_tools.yml
+++ b/.github/workflows/test_tools.yml
@@ -31,5 +31,5 @@ jobs:
         run: python -m unittest discover -vs tools/test -p 'test_*.py'
 
 concurrency:
-  group: test-tools-${{ github.event.pull_request.number || github.sha }}
+  group: test-tools-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true

From 046ed57a4d0f8296eb6863a835407cbd9711d832 Mon Sep 17 00:00:00 2001
From: Zafar Takhirov <zaf@fb.com>
Date: Thu, 2 Sep 2021 16:58:36 -0700
Subject: [PATCH 494/530] Revert D30055886: [quant] AO migration of the
 `quantize.py`

Test Plan: revert-hammer

Differential Revision:
D30055886 (https://github.com/pytorch/pytorch/commit/44e3ed88c9a1bd9ee6b0168ba5271a2c6b006cc8)

Original commit changeset: 8ef7470f9fa6

fbshipit-source-id: c5bd3ead43a2d44b9e56872ec5bd7a195bdac725
---
 test/quantization/ao_migration/__init__.py    |   0
 .../ao_migration/test_quantize_py.py          |  63 --
 test/test_quantization.py                     |   2 -
 torch/ao/quantization/__init__.py             |   0
 torch/ao/quantization/quantize.py             | 580 -----------------
 torch/quantization/fx/convert.py              |   2 +-
 torch/quantization/fx/prepare.py              |   2 +-
 .../quantization/fx/quantization_patterns.py  |   2 +-
 torch/quantization/fx/utils.py                |   2 +-
 torch/quantization/quantize.py                | 604 +++++++++++++++++-
 10 files changed, 581 insertions(+), 676 deletions(-)
 delete mode 100644 test/quantization/ao_migration/__init__.py
 delete mode 100644 test/quantization/ao_migration/test_quantize_py.py
 delete mode 100644 torch/ao/quantization/__init__.py
 delete mode 100644 torch/ao/quantization/quantize.py

diff --git a/test/quantization/ao_migration/__init__.py b/test/quantization/ao_migration/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/test/quantization/ao_migration/test_quantize_py.py b/test/quantization/ao_migration/test_quantize_py.py
deleted file mode 100644
index 086364bef9d56..0000000000000
--- a/test/quantization/ao_migration/test_quantize_py.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from torch.testing._internal.common_utils import TestCase
-
-import importlib
-from typing import List
-
-
-class AOMigrationTestCase(TestCase):
-    def _test_package_import(self, package_name: str):
-        r"""Tests the module import by making sure that all the internals match
-        (except the dunder methods)."""
-        old_module = importlib.import_module(f'torch.quantization.{package_name}')
-        new_module = importlib.import_module(f'torch.ao.quantization.{package_name}')
-        old_module_dir = set(dir(old_module))
-        new_module_dir = set(dir(new_module))
-        # Remove magic modules from checking in subsets
-        for el in list(old_module_dir):
-            if el[:2] == '__' and el[-2:] == '__':
-                old_module_dir.remove(el)
-        assert (old_module_dir <= new_module_dir), \
-            f"Importing {old_module} vs. {new_module} does not match: " \
-            f"{old_module_dir - new_module_dir}"
-
-    def _test_function_import(self, package_name: str, function_list: List[str]):
-        r"""Tests individual function list import by comparing the functions
-        and their hashes."""
-        old_location = importlib.import_module(f'torch.quantization.{package_name}')
-        new_location = importlib.import_module(f'torch.ao.quantization.{package_name}')
-        for fn_name in function_list:
-            old_function = getattr(old_location, fn_name)
-            new_function = getattr(new_location, fn_name)
-            assert old_function == new_function, f"Functions don't match: {fn_name}"
-            assert hash(old_function) == hash(new_function), \
-                f"Hashes don't match: {old_function}({hash(old_function)}) vs. " \
-                f"{new_function}({hash(new_function)})"
-
-
-class TestAOMigrationQuantizePy(AOMigrationTestCase):
-    def test_package_import(self):
-        self._test_package_import('quantize')
-
-    def test_function_import(self):
-        function_list = [
-            '_convert',
-            '_observer_forward_hook',
-            '_propagate_qconfig_helper',
-            '_remove_activation_post_process',
-            '_remove_qconfig',
-            'add_observer_',
-            'add_quant_dequant',
-            'convert',
-            'get_observer_dict',
-            'get_unique_devices_',
-            'is_activation_post_process',
-            'prepare',
-            'prepare_qat',
-            'propagate_qconfig_',
-            'quantize',
-            'quantize_dynamic',
-            'quantize_qat',
-            'register_activation_post_process_hook',
-            'swap_module',
-        ]
-        self._test_function_import('quantize', function_list)
diff --git a/test/test_quantization.py b/test/test_quantization.py
index ffc242ed77e33..867151373a5b6 100644
--- a/test/test_quantization.py
+++ b/test/test_quantization.py
@@ -100,8 +100,6 @@
 from quantization.jit.test_fusion_passes import TestFusionPasses  # noqa: F401
 from quantization.jit.test_deprecated_jit_quant import TestDeprecatedJitQuantized  # noqa: F401
 
-# AO Migration tests
-from quantization.ao_migration.test_quantize_py import TestAOMigrationQuantizePy  # noqa: F401
 
 if __name__ == '__main__':
     run_tests()
diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py
deleted file mode 100644
index 92a794ed7b631..0000000000000
--- a/torch/ao/quantization/quantize.py
+++ /dev/null
@@ -1,580 +0,0 @@
-import copy
-import itertools
-import warnings
-
-import torch
-import torch.nn as nn
-import torch.nn.quantized as nnq
-from torch.nn.intrinsic import _FusedModule
-
-# Import the duplicated API
-from torch.quantization.quantization_mappings import (
-    get_default_dynamic_quant_module_mappings,
-    get_default_static_quant_module_mappings,
-    get_default_qat_module_mappings,
-    get_default_qconfig_propagation_list,
-    no_observer_set,
-    _has_special_act_post_process,
-    _get_special_act_post_process,
-)
-from torch.quantization.stubs import DeQuantStub, QuantWrapper
-from torch.quantization.qconfig import (
-    add_module_to_qconfig_obs_ctr,
-    default_dynamic_qconfig,
-    float16_dynamic_qconfig,
-    float_qparams_weight_only_qconfig)
-
-def is_activation_post_process(module):
-    return (isinstance(module, torch.quantization.ObserverBase) or
-            isinstance(module, torch.quantization.FakeQuantizeBase))
-
-def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None,
-                              qconfig_parent=None, prefix=''):
-    r"""This is a helper function for `propagate_qconfig_`
-
-    Args:
-        module: input module
-        qconfig_dict: dictionary that maps from name of submodule to quantization
-                     configuration
-        allow_list: list of quantizable modules
-        qconfig_parent: quantization config of parent module, we will fallback to
-                       this config when there is no specified config for current
-                       module
-        prefix: corresponding prefix of the current module, used as key in
-                qconfig_dict
-
-    Return:
-        None, module is modified inplace with qconfig attached
-    """
-    # TODO: Add test
-    if allow_list is None:
-        allow_list = get_default_qconfig_propagation_list()
-
-    module_qconfig = qconfig_dict.get(type(module), qconfig_parent)
-    module_qconfig = qconfig_dict.get(prefix, module_qconfig)
-    module_qconfig = getattr(module, 'qconfig', module_qconfig)
-
-    torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module)
-
-    qconfig_with_device_check = add_module_to_qconfig_obs_ctr(module_qconfig, module)
-    module.qconfig = qconfig_with_device_check
-
-    for name, child in module.named_children():
-        module_prefix = prefix + '.' + name if prefix else name
-        _propagate_qconfig_helper(child, qconfig_dict, allow_list,
-                                  qconfig_with_device_check, module_prefix)
-
-# TODO(jerryzh): expose allow_list
-def propagate_qconfig_(module, qconfig_dict=None, allow_list=None):
-    r"""Propagate qconfig through the module hierarchy and assign `qconfig`
-    attribute on each leaf module
-
-    Args:
-        module: input module
-        qconfig_dict: dictionary that maps from name or type of submodule to
-            quantization configuration, qconfig applies to all submodules of a
-            given module unless qconfig for the submodules are specified (when
-            the submodule already has qconfig attribute)
-
-    Return:
-        None, module is modified inplace with qconfig attached
-    """
-    if qconfig_dict is None:
-        qconfig_dict = {}
-    _propagate_qconfig_helper(module, qconfig_dict, allow_list)
-
-def _observer_forward_hook(self, input, output):
-    r"""Forward hook that calls observer on the output
-    """
-    return self.activation_post_process(output)
-
-def register_activation_post_process_hook(module):
-    assert hasattr(module, 'activation_post_process'), \
-        'Expect activation_post_process attribut already attached to the module'
-    return module.register_forward_hook(_observer_forward_hook)
-
-def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=None, device=None, custom_module_class_mapping=None):
-    r"""Add observer for the leaf child of the module.
-
-    This function insert observer module to all leaf child module that
-    has a valid qconfig attribute.
-
-    Args:
-        module: input module with qconfig attributes for all the leaf modules that we want to quantize
-        device: parent device, if any
-        non_leaf_module_list: list of non-leaf modules we want to add observer
-
-    Return:
-        None, module is modified inplace with added observer modules and forward_hooks
-    """
-    if qconfig_propagation_list is None:
-        qconfig_propagation_list = get_default_qconfig_propagation_list()
-
-    if custom_module_class_mapping is None:
-        custom_module_class_mapping = {}
-
-    # respect device affinity when adding observers
-    if device is None:
-        devices = get_unique_devices_(module)
-        assert len(devices) <= 1, (
-            "add_observer_ only works with cpu or single-device CUDA modules, "
-            "but got devices {}".format(devices)
-        )
-        device = next(iter(devices)) if len(devices) > 0 else None
-
-    def get_activation_post_process(qconfig, device, special_act_post_process=None):
-        activation = qconfig.activation() if special_act_post_process is None else special_act_post_process()
-        if device is not None:
-            activation.to(device)
-        return activation
-
-    def needs_observation(m):
-        return hasattr(m, 'qconfig') and m.qconfig is not None
-
-    def insert_activation_post_process(m, special_act_post_process=None):
-        """ Adds an activation post process module and register
-        a post hook that calls the module
-        """
-        # We don't insert observer/fake_quantize for DeQuantStub
-        if needs_observation(m) and not isinstance(m, DeQuantStub):
-            # observer and hook will be gone after we swap the module
-            m.add_module('activation_post_process', get_activation_post_process(
-                m.qconfig, device, special_act_post_process))
-            # Register observer as the first entry in the hook list
-            # All post forward hooks are preserved and will be executed after the observer before convert
-            handle = register_activation_post_process_hook(m)
-            m._forward_hooks.move_to_end(handle.id, last=False)
-
-    for name, child in module.named_children():
-        if type(child) in [nnq.FloatFunctional, nnq.QFunctional]:
-            if needs_observation(child):
-                child.activation_post_process = get_activation_post_process(child.qconfig, device)
-        elif isinstance(child, _FusedModule):
-            # activation_post_process are now added directly to nn.Sequentail/_FusedModule
-            if needs_observation(child):
-                insert_activation_post_process(child)
-        elif _has_special_act_post_process(child):
-            special_act_post_process = _get_special_act_post_process(child)
-            insert_activation_post_process(child, special_act_post_process)
-        elif non_leaf_module_list is not None and type(child) in non_leaf_module_list:
-            if needs_observation(child):
-                insert_activation_post_process(child)
-        elif needs_observation(child) and type(child) in custom_module_class_mapping:
-            observed_child = custom_module_class_mapping[type(child)].from_float(child)
-            setattr(module, name, observed_child)
-            # TODO: These are the modules that cannot be observed
-            #       Once there are more, we should move them to a separate list
-            if custom_module_class_mapping[type(child)] not in no_observer_set():
-                insert_activation_post_process(observed_child)
-        else:
-            add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping)
-
-    # Insert observers only for leaf nodes, note that this observer is for
-    # the output of the module, for input QuantStub will observe them
-    if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \
-       and type(module) in qconfig_propagation_list:
-        insert_activation_post_process(module)
-
-def get_unique_devices_(module):
-    return {p.device for p in module.parameters()} | \
-        {p.device for p in module.buffers()}
-
-def add_quant_dequant(module):
-    r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig
-    Note that this function will modify the children of module inplace and it
-    can return a new module which wraps the input module as well.
-
-    Args:
-        module: input module with qconfig attributes for all the leaf modules
-        that we want to quantize
-
-    Return:
-        Either the inplace modified module with submodules wrapped in
-        `QuantWrapper` based on qconfig or a new `QuantWrapper` module which
-        wraps the input module, the latter case only happens when the input
-        module is a leaf module and we want to quantize it.
-    """
-    if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig:
-        return QuantWrapper(module)
-
-    for name, child in module.named_children():
-        module._modules[name] = add_quant_dequant(child)
-    return module
-
-def prepare(model, inplace=False, allow_list=None,
-            observer_non_leaf_module_list=None,
-            prepare_custom_config_dict=None):
-    r"""Prepares a copy of the model for quantization calibration or quantization-aware training.
-
-    Quantization configuration should be assigned preemptively
-    to individual submodules in `.qconfig` attribute.
-
-    The model will be attached with observer or fake quant modules, and qconfig
-    will be propagated.
-
-    Args:
-        `model`: input model to be modified in-place
-        `inplace`: carry out model transformations in-place, the original module is mutated
-        `allow_list`: list of quantizable modules
-        `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer
-        `prepare_custom_config_dict`: customization configuration dictionary for prepare function
-
-    .. code-block:: python
-
-       # Example of prepare_custom_config_dict:
-       prepare_custom_config_dict = {
-           # user will manually define the corresponding observed
-           # module class which has a from_float class method that converts
-           # float custom module to observed custom module
-           "float_to_observed_custom_module_class": {
-               CustomModule: ObservedCustomModule
-           }
-        }
-
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.prepare")
-    if prepare_custom_config_dict is None:
-        prepare_custom_config_dict = {}
-    custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", {})
-
-    if not inplace:
-        model = copy.deepcopy(model)
-
-    # TODO: remove allow_list
-    qconfig_propagation_list = allow_list
-    if qconfig_propagation_list is None:
-        qconfig_propagation_list = get_default_qconfig_propagation_list()
-    propagate_qconfig_(model, qconfig_dict=None)
-
-    # sanity check common API misusage
-    if not any(hasattr(m, 'qconfig') and m.qconfig for m in model.modules()):
-        warnings.warn("None of the submodule got qconfig applied. Make sure you "
-                      "passed correct configuration through `qconfig_dict` or "
-                      "by assigning the `.qconfig` attribute directly on submodules")
-
-    add_observer_(
-        model, qconfig_propagation_list, observer_non_leaf_module_list,
-        custom_module_class_mapping=custom_module_class_mapping)
-    return model
-
-def _remove_activation_post_process(module):
-    # TODO: maybe we should change activation_post_process to _activation_post_process
-    # to prevent it from being used by user
-    if hasattr(module, 'activation_post_process') and \
-       is_activation_post_process(module.activation_post_process):
-        delattr(module, 'activation_post_process')
-
-    # remove activation_post_proceess hook
-    handle_ids_to_remove = set()
-    for handle_id, hook_fn in module._forward_hooks.items():
-        if hook_fn is _observer_forward_hook:
-            handle_ids_to_remove.add(handle_id)
-    for handle_id in handle_ids_to_remove:
-        module._forward_hooks.pop(handle_id)
-
-# TODO: rename to something more general
-def _remove_qconfig(module):
-    r"""Clean up the qconfig left in the module so that new qconfig can be
-    propagated.
-
-    Args:
-        module: module to be cleaned up
-    """
-    for child in module.children():
-        _remove_qconfig(child)
-
-    if hasattr(module, "qconfig"):
-        del module.qconfig
-
-    _remove_activation_post_process(module)
-
-def quantize(model, run_fn, run_args, mapping=None, inplace=False):
-    r"""Quantize the input float model with post training static quantization.
-
-    First it will prepare the model for calibration, then it calls
-    `run_fn` which will run the calibration step, after that we will
-    convert the model to a quantized model.
-
-    Args:
-        model: input float model
-        run_fn: a calibration function for calibrating the prepared model
-        run_args: positional arguments for `run_fn`
-        inplace: carry out model transformations in-place, the original module is mutated
-        mapping: correspondence between original module types and quantized counterparts
-
-    Return:
-        Quantized model.
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.quantize")
-    if mapping is None:
-        mapping = get_default_static_quant_module_mappings()
-    if not inplace:
-        model = copy.deepcopy(model)
-    model.eval()
-    prepare(model, inplace=True)
-    run_fn(model, *run_args)
-    convert(model, mapping, inplace=True)
-    return model
-
-def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8,
-                     mapping=None, inplace=False):
-    r"""Converts a float model to dynamic (i.e. weights-only) quantized model.
-
-    Replaces specified modules with dynamic weight-only quantized versions and output the quantized model.
-
-    For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization
-    by default is performed for layers with large weights size - i.e. Linear and RNN variants.
-
-    Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`.
-    If `qconfig` is provided, the `dtype` argument is ignored.
-
-    Args:
-        model: input model
-        qconfig_spec: Either:
-
-            - A dictionary that maps from name or type of submodule to quantization
-              configuration, qconfig applies to all submodules of a given
-              module unless qconfig for the submodules are specified (when the
-              submodule already has qconfig attribute). Entries in the dictionary
-              need to be QConfigDynamic instances.
-
-            - A set of types and/or submodule names to apply dynamic quantization to,
-              in which case the `dtype` argument is used to specify the bit-width
-
-        inplace: carry out model transformations in-place, the original module is mutated
-        mapping: maps type of a submodule to a type of corresponding dynamically quantized version
-            with which the submodule needs to be replaced
-
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic")
-    if qconfig_spec is None:
-        if dtype == torch.qint8:
-            qconfig_spec = {
-                nn.Linear : default_dynamic_qconfig,
-                nn.LSTM : default_dynamic_qconfig,
-                nn.GRU : default_dynamic_qconfig,
-                nn.LSTMCell : default_dynamic_qconfig,
-                nn.RNNCell : default_dynamic_qconfig,
-                nn.GRUCell : default_dynamic_qconfig,
-            }
-        elif dtype == torch.float16:
-            qconfig_spec = {
-                nn.Linear : float16_dynamic_qconfig,
-                nn.LSTM : float16_dynamic_qconfig,
-                nn.GRU : float16_dynamic_qconfig,
-                nn.LSTMCell : float16_dynamic_qconfig,
-                nn.RNNCell : float16_dynamic_qconfig,
-                nn.GRUCell : float16_dynamic_qconfig,
-            }
-        elif dtype == torch.quint8:
-            qconfig_spec = {
-                nn.EmbeddingBag : float_qparams_weight_only_qconfig,
-            }
-        else:
-            raise ValueError(
-                "Don't know how to quantize with default settings for {}. Provide full qconfig please".format(dtype))
-    elif isinstance(qconfig_spec, set):
-        if dtype is torch.qint8:
-            default_qconfig = default_dynamic_qconfig
-        elif dtype is torch.float16:
-            default_qconfig = float16_dynamic_qconfig
-        elif dtype is torch.quint8:
-            default_qconfig = float_qparams_weight_only_qconfig
-        else:
-            raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype))
-        qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig)))
-
-    if mapping is None:
-        mapping = get_default_dynamic_quant_module_mappings()
-
-    if not inplace:
-        model = copy.deepcopy(model)
-    model.eval()
-    propagate_qconfig_(model, qconfig_spec)
-    convert(model, mapping, inplace=True)
-    return model
-
-def prepare_qat(model, mapping=None, inplace=False):
-    r"""
-    Prepares a copy of the model for quantization calibration or
-    quantization-aware training and converts it to quantized version.
-
-    Quantization configuration should be assigned preemptively
-    to individual submodules in `.qconfig` attribute.
-
-    Args:
-        model: input model to be modified in-place
-        mapping: dictionary that maps float modules to quantized modules to be
-                 replaced.
-        inplace: carry out model transformations in-place, the original module
-                 is mutated
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat")
-    if mapping is None:
-        mapping = get_default_qat_module_mappings()
-
-    if not inplace:
-        model = copy.deepcopy(model)
-
-    propagate_qconfig_(model, qconfig_dict=None)
-    convert(model, mapping=mapping, inplace=True, remove_qconfig=False)
-    prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True)
-    return model
-
-def quantize_qat(model, run_fn, run_args, inplace=False):
-    r"""Do quantization aware training and output a quantized model
-
-    Args:
-        model: input model
-        run_fn: a function for evaluating the prepared model, can be a
-                function that simply runs the prepared model or a training
-                loop
-        run_args: positional arguments for `run_fn`
-
-    Return:
-        Quantized model.
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat")
-    if not inplace:
-        model = copy.deepcopy(model)
-    model.train()
-    prepare_qat(model, inplace=True)
-    run_fn(model, *run_args)
-    convert(model, inplace=True)
-    return model
-
-def convert(
-        module, mapping=None, inplace=False, remove_qconfig=True,
-        convert_custom_config_dict=None):
-    r"""Converts submodules in input module to a different module according to `mapping`
-    by calling `from_float` method on the target module class. And remove qconfig at the
-    end if remove_qconfig is set to True.
-
-    Args:
-        `module`: prepared and calibrated module
-        `mapping`: a dictionary that maps from source module type to target
-                   module type, can be overwritten to allow swapping user defined
-                   Modules
-        `inplace`: carry out model transformations in-place, the original module
-                   is mutated
-        `convert_custom_config_dict`: custom configuration dictionary for convert function
-
-    .. code-block:: python
-
-       # Example of convert_custom_config_dict:
-       convert_custom_config_dict = {
-           # user will manually define the corresponding quantized
-           # module class which has a from_observed class method that converts
-           # observed custom module to quantized custom module
-           "observed_to_quantized_custom_module_class": {
-               ObservedCustomModule: QuantizedCustomModule
-           }
-       }
-
-    """
-    torch._C._log_api_usage_once("quantization_api.quantize.convert")
-    if not inplace:
-        module = copy.deepcopy(module)
-    _convert(
-        module, mapping, inplace=True,
-        convert_custom_config_dict=convert_custom_config_dict)
-    if remove_qconfig:
-        _remove_qconfig(module)
-    return module
-
-def _convert(
-        module, mapping=None, inplace=False,
-        convert_custom_config_dict=None):
-    r"""Converts submodules in input module to a different module according to `mapping`
-    by calling `from_float` method on the target module class
-
-    Args:
-        module: input module
-        mapping: a dictionary that maps from source module type to target
-                 module type, can be overwritten to allow swapping user defined
-                 Modules
-        inplace: carry out model transformations in-place, the original module
-                 is mutated
-
-    """
-    if mapping is None:
-        mapping = get_default_static_quant_module_mappings()
-    if convert_custom_config_dict is None:
-        convert_custom_config_dict = {}
-    custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {})
-
-    if not inplace:
-        module = copy.deepcopy(module)
-    reassign = {}
-    for name, mod in module.named_children():
-        # both fused modules and observed custom modules are
-        # swapped as one unit
-        if not isinstance(mod, _FusedModule) and \
-           type(mod) not in custom_module_class_mapping:
-            _convert(mod, mapping, True,  # inplace
-                     convert_custom_config_dict)
-        reassign[name] = swap_module(mod, mapping, custom_module_class_mapping)
-
-    for key, value in reassign.items():
-        module._modules[key] = value
-
-    return module
-
-def swap_module(mod, mapping, custom_module_class_mapping):
-    r"""Swaps the module if it has a quantized counterpart and it has an
-    `observer` attached.
-
-    Args:
-        mod: input module
-        mapping: a dictionary that maps from nn module to nnq module
-
-    Return:
-        The corresponding quantized module of `mod`
-    """
-    new_mod = mod
-    if hasattr(mod, 'qconfig') and mod.qconfig is not None:
-        swapped = False
-        if type(mod) in custom_module_class_mapping:
-            new_mod = custom_module_class_mapping[type(mod)].from_observed(mod)
-            swapped = True
-        elif type(mod) in mapping:
-            new_mod = mapping[type(mod)].from_float(mod)
-            swapped = True
-
-        if swapped:
-            # Preserve module's pre forward hooks. They'll be called on quantized input
-            for pre_hook_fn in mod._forward_pre_hooks.values():
-                new_mod.register_forward_pre_hook(pre_hook_fn)
-            # Preserve module's post forward hooks except _observer_forward_hook
-            # After convert they'll work with quantized output
-            for hook_fn in mod._forward_hooks.values():
-                if hook_fn is not _observer_forward_hook:
-                    new_mod.register_forward_hook(hook_fn)
-
-            # respect device affinity when swapping modules
-            devices = get_unique_devices_(mod)
-            assert len(devices) <= 1, (
-                "swap_module only works with cpu or single-device CUDA modules, "
-                "but got devices {}".format(devices)
-            )
-            device = next(iter(devices)) if len(devices) > 0 else None
-            if device:
-                new_mod.to(device)
-    return new_mod
-
-def get_observer_dict(mod, target_dict, prefix=""):
-    r"""Traverse the modules and save all observers into dict.
-    This is mainly used for quantization accuracy debug
-    Args:
-        mod: the top module we want to save all observers
-        prefix: the prefix for the current module
-        target_dict: the dictionary used to save all the observers
-    """
-    def get_prefix(prefix):
-        return prefix if prefix == "" else prefix + '.'
-
-    if hasattr(mod, 'activation_post_process'):
-        target_dict[get_prefix(prefix) + 'activation_post_process'] = mod.activation_post_process
-    for name, child in mod.named_children():
-        module_prefix = get_prefix(prefix) + name if prefix else name
-        get_observer_dict(child, target_dict, module_prefix)
diff --git a/torch/quantization/fx/convert.py b/torch/quantization/fx/convert.py
index e00e4aaad1b68..867b0b24cf7ad 100644
--- a/torch/quantization/fx/convert.py
+++ b/torch/quantization/fx/convert.py
@@ -36,7 +36,7 @@
     WEIGHT_INDEX_DICT,
 )
 
-from torch.ao.quantization.quantize import (
+from ..quantize import (
     _remove_qconfig,
     is_activation_post_process,
 )
diff --git a/torch/quantization/fx/prepare.py b/torch/quantization/fx/prepare.py
index 0b65e339ce0a3..d2bb96ab7a5c0 100644
--- a/torch/quantization/fx/prepare.py
+++ b/torch/quantization/fx/prepare.py
@@ -67,7 +67,7 @@
     get_default_qat_module_mappings,
 )
 
-from torch.ao.quantization.quantize import (
+from ..quantize import (
     is_activation_post_process,
     convert
 )
diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index 418cae1511c35..3f54a6a1e2743 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -24,7 +24,7 @@
     get_qparam_dict,
 )
 
-from torch.ao.quantization.quantize import (
+from ..quantize import (
     is_activation_post_process,
 )
 
diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py
index 3c9adc2bc311b..10f8b06b6dfed 100644
--- a/torch/quantization/fx/utils.py
+++ b/torch/quantization/fx/utils.py
@@ -2,7 +2,7 @@
 import torch
 import torch.nn as nn
 from ..utils import is_per_tensor, is_per_channel
-from torch.ao.quantization.quantize import is_activation_post_process
+from ..quantize import is_activation_post_process
 
 from torch.fx import GraphModule, map_arg
 
diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py
index 5b0f4ed8779ab..674ed59ac86ed 100644
--- a/torch/quantization/quantize.py
+++ b/torch/quantization/quantize.py
@@ -1,30 +1,580 @@
-# flake8: noqa: F401
-r"""
-This file is in the process of migration to `torch/ao/quantization`, and
-is kept here for compatibility while the migration process is ongoing.
-If you are adding a new entry/functionality, please, add it to the
-`torch/ao/quantization/quantize.py`, while adding an import statement
-here.
-"""
+import copy
+import itertools
+import warnings
 
 import torch
+import torch.nn as nn
+import torch.nn.quantized as nnq
+from torch.nn.intrinsic import _FusedModule
 
-from torch.ao.quantization.quantize import _convert
-from torch.ao.quantization.quantize import _observer_forward_hook
-from torch.ao.quantization.quantize import _propagate_qconfig_helper
-from torch.ao.quantization.quantize import _remove_activation_post_process
-from torch.ao.quantization.quantize import _remove_qconfig
-from torch.ao.quantization.quantize import add_observer_
-from torch.ao.quantization.quantize import add_quant_dequant
-from torch.ao.quantization.quantize import convert
-from torch.ao.quantization.quantize import get_observer_dict
-from torch.ao.quantization.quantize import get_unique_devices_
-from torch.ao.quantization.quantize import is_activation_post_process
-from torch.ao.quantization.quantize import prepare
-from torch.ao.quantization.quantize import prepare_qat
-from torch.ao.quantization.quantize import propagate_qconfig_
-from torch.ao.quantization.quantize import quantize
-from torch.ao.quantization.quantize import quantize_dynamic
-from torch.ao.quantization.quantize import quantize_qat
-from torch.ao.quantization.quantize import register_activation_post_process_hook
-from torch.ao.quantization.quantize import swap_module
+from .quantization_mappings import (
+    get_default_dynamic_quant_module_mappings,
+    get_default_static_quant_module_mappings,
+    get_default_qat_module_mappings,
+    get_default_qconfig_propagation_list,
+    no_observer_set,
+    _has_special_act_post_process,
+    _get_special_act_post_process,
+)
+
+from .stubs import DeQuantStub, QuantWrapper
+from .qconfig import (
+    add_module_to_qconfig_obs_ctr,
+    default_dynamic_qconfig,
+    float16_dynamic_qconfig,
+    float_qparams_weight_only_qconfig)
+
+def is_activation_post_process(module):
+    return (isinstance(module, torch.quantization.ObserverBase) or
+            isinstance(module, torch.quantization.FakeQuantizeBase))
+
+def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None,
+                              qconfig_parent=None, prefix=''):
+    r"""This is a helper function for `propagate_qconfig_`
+
+    Args:
+        module: input module
+        qconfig_dict: dictionary that maps from name of submodule to quantization
+                     configuration
+        allow_list: list of quantizable modules
+        qconfig_parent: quantization config of parent module, we will fallback to
+                       this config when there is no specified config for current
+                       module
+        prefix: corresponding prefix of the current module, used as key in
+                qconfig_dict
+
+    Return:
+        None, module is modified inplace with qconfig attached
+    """
+    # TODO: Add test
+    if allow_list is None:
+        allow_list = get_default_qconfig_propagation_list()
+
+    module_qconfig = qconfig_dict.get(type(module), qconfig_parent)
+    module_qconfig = qconfig_dict.get(prefix, module_qconfig)
+    module_qconfig = getattr(module, 'qconfig', module_qconfig)
+
+    torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module)
+
+    qconfig_with_device_check = add_module_to_qconfig_obs_ctr(module_qconfig, module)
+    module.qconfig = qconfig_with_device_check
+
+    for name, child in module.named_children():
+        module_prefix = prefix + '.' + name if prefix else name
+        _propagate_qconfig_helper(child, qconfig_dict, allow_list,
+                                  qconfig_with_device_check, module_prefix)
+
+# TODO(jerryzh): expose allow_list
+def propagate_qconfig_(module, qconfig_dict=None, allow_list=None):
+    r"""Propagate qconfig through the module hierarchy and assign `qconfig`
+    attribute on each leaf module
+
+    Args:
+        module: input module
+        qconfig_dict: dictionary that maps from name or type of submodule to
+            quantization configuration, qconfig applies to all submodules of a
+            given module unless qconfig for the submodules are specified (when
+            the submodule already has qconfig attribute)
+
+    Return:
+        None, module is modified inplace with qconfig attached
+    """
+    if qconfig_dict is None:
+        qconfig_dict = {}
+    _propagate_qconfig_helper(module, qconfig_dict, allow_list)
+
+def _observer_forward_hook(self, input, output):
+    r"""Forward hook that calls observer on the output
+    """
+    return self.activation_post_process(output)
+
+def register_activation_post_process_hook(module):
+    assert hasattr(module, 'activation_post_process'), \
+        'Expect activation_post_process attribut already attached to the module'
+    return module.register_forward_hook(_observer_forward_hook)
+
+def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=None, device=None, custom_module_class_mapping=None):
+    r"""Add observer for the leaf child of the module.
+
+    This function insert observer module to all leaf child module that
+    has a valid qconfig attribute.
+
+    Args:
+        module: input module with qconfig attributes for all the leaf modules that we want to quantize
+        device: parent device, if any
+        non_leaf_module_list: list of non-leaf modules we want to add observer
+
+    Return:
+        None, module is modified inplace with added observer modules and forward_hooks
+    """
+    if qconfig_propagation_list is None:
+        qconfig_propagation_list = get_default_qconfig_propagation_list()
+
+    if custom_module_class_mapping is None:
+        custom_module_class_mapping = {}
+
+    # respect device affinity when adding observers
+    if device is None:
+        devices = get_unique_devices_(module)
+        assert len(devices) <= 1, (
+            "add_observer_ only works with cpu or single-device CUDA modules, "
+            "but got devices {}".format(devices)
+        )
+        device = next(iter(devices)) if len(devices) > 0 else None
+
+    def get_activation_post_process(qconfig, device, special_act_post_process=None):
+        activation = qconfig.activation() if special_act_post_process is None else special_act_post_process()
+        if device is not None:
+            activation.to(device)
+        return activation
+
+    def needs_observation(m):
+        return hasattr(m, 'qconfig') and m.qconfig is not None
+
+    def insert_activation_post_process(m, special_act_post_process=None):
+        """ Adds an activation post process module and register
+        a post hook that calls the module
+        """
+        # We don't insert observer/fake_quantize for DeQuantStub
+        if needs_observation(m) and not isinstance(m, DeQuantStub):
+            # observer and hook will be gone after we swap the module
+            m.add_module('activation_post_process', get_activation_post_process(
+                m.qconfig, device, special_act_post_process))
+            # Register observer as the first entry in the hook list
+            # All post forward hooks are preserved and will be executed after the observer before convert
+            handle = register_activation_post_process_hook(m)
+            m._forward_hooks.move_to_end(handle.id, last=False)
+
+    for name, child in module.named_children():
+        if type(child) in [nnq.FloatFunctional, nnq.QFunctional]:
+            if needs_observation(child):
+                child.activation_post_process = get_activation_post_process(child.qconfig, device)
+        elif isinstance(child, _FusedModule):
+            # activation_post_process are now added directly to nn.Sequentail/_FusedModule
+            if needs_observation(child):
+                insert_activation_post_process(child)
+        elif _has_special_act_post_process(child):
+            special_act_post_process = _get_special_act_post_process(child)
+            insert_activation_post_process(child, special_act_post_process)
+        elif non_leaf_module_list is not None and type(child) in non_leaf_module_list:
+            if needs_observation(child):
+                insert_activation_post_process(child)
+        elif needs_observation(child) and type(child) in custom_module_class_mapping:
+            observed_child = custom_module_class_mapping[type(child)].from_float(child)
+            setattr(module, name, observed_child)
+            # TODO: These are the modules that cannot be observed
+            #       Once there are more, we should move them to a separate list
+            if custom_module_class_mapping[type(child)] not in no_observer_set():
+                insert_activation_post_process(observed_child)
+        else:
+            add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping)
+
+    # Insert observers only for leaf nodes, note that this observer is for
+    # the output of the module, for input QuantStub will observe them
+    if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \
+       and type(module) in qconfig_propagation_list:
+        insert_activation_post_process(module)
+
+def get_unique_devices_(module):
+    return {p.device for p in module.parameters()} | \
+        {p.device for p in module.buffers()}
+
+def add_quant_dequant(module):
+    r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig
+    Note that this function will modify the children of module inplace and it
+    can return a new module which wraps the input module as well.
+
+    Args:
+        module: input module with qconfig attributes for all the leaf modules
+        that we want to quantize
+
+    Return:
+        Either the inplace modified module with submodules wrapped in
+        `QuantWrapper` based on qconfig or a new `QuantWrapper` module which
+        wraps the input module, the latter case only happens when the input
+        module is a leaf module and we want to quantize it.
+    """
+    if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig:
+        return QuantWrapper(module)
+
+    for name, child in module.named_children():
+        module._modules[name] = add_quant_dequant(child)
+    return module
+
+def prepare(model, inplace=False, allow_list=None,
+            observer_non_leaf_module_list=None,
+            prepare_custom_config_dict=None):
+    r"""Prepares a copy of the model for quantization calibration or quantization-aware training.
+
+    Quantization configuration should be assigned preemptively
+    to individual submodules in `.qconfig` attribute.
+
+    The model will be attached with observer or fake quant modules, and qconfig
+    will be propagated.
+
+    Args:
+        `model`: input model to be modified in-place
+        `inplace`: carry out model transformations in-place, the original module is mutated
+        `allow_list`: list of quantizable modules
+        `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer
+        `prepare_custom_config_dict`: customization configuration dictionary for prepare function
+
+    .. code-block:: python
+
+       # Example of prepare_custom_config_dict:
+       prepare_custom_config_dict = {
+           # user will manually define the corresponding observed
+           # module class which has a from_float class method that converts
+           # float custom module to observed custom module
+           "float_to_observed_custom_module_class": {
+               CustomModule: ObservedCustomModule
+           }
+        }
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.prepare")
+    if prepare_custom_config_dict is None:
+        prepare_custom_config_dict = {}
+    custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", {})
+
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    # TODO: remove allow_list
+    qconfig_propagation_list = allow_list
+    if qconfig_propagation_list is None:
+        qconfig_propagation_list = get_default_qconfig_propagation_list()
+    propagate_qconfig_(model, qconfig_dict=None)
+
+    # sanity check common API misusage
+    if not any(hasattr(m, 'qconfig') and m.qconfig for m in model.modules()):
+        warnings.warn("None of the submodule got qconfig applied. Make sure you "
+                      "passed correct configuration through `qconfig_dict` or "
+                      "by assigning the `.qconfig` attribute directly on submodules")
+
+    add_observer_(
+        model, qconfig_propagation_list, observer_non_leaf_module_list,
+        custom_module_class_mapping=custom_module_class_mapping)
+    return model
+
+def _remove_activation_post_process(module):
+    # TODO: maybe we should change activation_post_process to _activation_post_process
+    # to prevent it from being used by user
+    if hasattr(module, 'activation_post_process') and \
+       is_activation_post_process(module.activation_post_process):
+        delattr(module, 'activation_post_process')
+
+    # remove activation_post_proceess hook
+    handle_ids_to_remove = set()
+    for handle_id, hook_fn in module._forward_hooks.items():
+        if hook_fn is _observer_forward_hook:
+            handle_ids_to_remove.add(handle_id)
+    for handle_id in handle_ids_to_remove:
+        module._forward_hooks.pop(handle_id)
+
+# TODO: rename to something more general
+def _remove_qconfig(module):
+    r"""Clean up the qconfig left in the module so that new qconfig can be
+    propagated.
+
+    Args:
+        module: module to be cleaned up
+    """
+    for child in module.children():
+        _remove_qconfig(child)
+
+    if hasattr(module, "qconfig"):
+        del module.qconfig
+
+    _remove_activation_post_process(module)
+
+def quantize(model, run_fn, run_args, mapping=None, inplace=False):
+    r"""Quantize the input float model with post training static quantization.
+
+    First it will prepare the model for calibration, then it calls
+    `run_fn` which will run the calibration step, after that we will
+    convert the model to a quantized model.
+
+    Args:
+        model: input float model
+        run_fn: a calibration function for calibrating the prepared model
+        run_args: positional arguments for `run_fn`
+        inplace: carry out model transformations in-place, the original module is mutated
+        mapping: correspondence between original module types and quantized counterparts
+
+    Return:
+        Quantized model.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize")
+    if mapping is None:
+        mapping = get_default_static_quant_module_mappings()
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.eval()
+    prepare(model, inplace=True)
+    run_fn(model, *run_args)
+    convert(model, mapping, inplace=True)
+    return model
+
+def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8,
+                     mapping=None, inplace=False):
+    r"""Converts a float model to dynamic (i.e. weights-only) quantized model.
+
+    Replaces specified modules with dynamic weight-only quantized versions and output the quantized model.
+
+    For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization
+    by default is performed for layers with large weights size - i.e. Linear and RNN variants.
+
+    Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`.
+    If `qconfig` is provided, the `dtype` argument is ignored.
+
+    Args:
+        model: input model
+        qconfig_spec: Either:
+
+            - A dictionary that maps from name or type of submodule to quantization
+              configuration, qconfig applies to all submodules of a given
+              module unless qconfig for the submodules are specified (when the
+              submodule already has qconfig attribute). Entries in the dictionary
+              need to be QConfigDynamic instances.
+
+            - A set of types and/or submodule names to apply dynamic quantization to,
+              in which case the `dtype` argument is used to specify the bit-width
+
+        inplace: carry out model transformations in-place, the original module is mutated
+        mapping: maps type of a submodule to a type of corresponding dynamically quantized version
+            with which the submodule needs to be replaced
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic")
+    if qconfig_spec is None:
+        if dtype == torch.qint8:
+            qconfig_spec = {
+                nn.Linear : default_dynamic_qconfig,
+                nn.LSTM : default_dynamic_qconfig,
+                nn.GRU : default_dynamic_qconfig,
+                nn.LSTMCell : default_dynamic_qconfig,
+                nn.RNNCell : default_dynamic_qconfig,
+                nn.GRUCell : default_dynamic_qconfig,
+            }
+        elif dtype == torch.float16:
+            qconfig_spec = {
+                nn.Linear : float16_dynamic_qconfig,
+                nn.LSTM : float16_dynamic_qconfig,
+                nn.GRU : float16_dynamic_qconfig,
+                nn.LSTMCell : float16_dynamic_qconfig,
+                nn.RNNCell : float16_dynamic_qconfig,
+                nn.GRUCell : float16_dynamic_qconfig,
+            }
+        elif dtype == torch.quint8:
+            qconfig_spec = {
+                nn.EmbeddingBag : float_qparams_weight_only_qconfig,
+            }
+        else:
+            raise ValueError(
+                "Don't know how to quantize with default settings for {}. Provide full qconfig please".format(dtype))
+    elif isinstance(qconfig_spec, set):
+        if dtype is torch.qint8:
+            default_qconfig = default_dynamic_qconfig
+        elif dtype is torch.float16:
+            default_qconfig = float16_dynamic_qconfig
+        elif dtype is torch.quint8:
+            default_qconfig = float_qparams_weight_only_qconfig
+        else:
+            raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype))
+        qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig)))
+
+    if mapping is None:
+        mapping = get_default_dynamic_quant_module_mappings()
+
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.eval()
+    propagate_qconfig_(model, qconfig_spec)
+    convert(model, mapping, inplace=True)
+    return model
+
+def prepare_qat(model, mapping=None, inplace=False):
+    r"""
+    Prepares a copy of the model for quantization calibration or
+    quantization-aware training and converts it to quantized version.
+
+    Quantization configuration should be assigned preemptively
+    to individual submodules in `.qconfig` attribute.
+
+    Args:
+        model: input model to be modified in-place
+        mapping: dictionary that maps float modules to quantized modules to be
+                 replaced.
+        inplace: carry out model transformations in-place, the original module
+                 is mutated
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat")
+    if mapping is None:
+        mapping = get_default_qat_module_mappings()
+
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    propagate_qconfig_(model, qconfig_dict=None)
+    convert(model, mapping=mapping, inplace=True, remove_qconfig=False)
+    prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True)
+    return model
+
+def quantize_qat(model, run_fn, run_args, inplace=False):
+    r"""Do quantization aware training and output a quantized model
+
+    Args:
+        model: input model
+        run_fn: a function for evaluating the prepared model, can be a
+                function that simply runs the prepared model or a training
+                loop
+        run_args: positional arguments for `run_fn`
+
+    Return:
+        Quantized model.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat")
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.train()
+    prepare_qat(model, inplace=True)
+    run_fn(model, *run_args)
+    convert(model, inplace=True)
+    return model
+
+def convert(
+        module, mapping=None, inplace=False, remove_qconfig=True,
+        convert_custom_config_dict=None):
+    r"""Converts submodules in input module to a different module according to `mapping`
+    by calling `from_float` method on the target module class. And remove qconfig at the
+    end if remove_qconfig is set to True.
+
+    Args:
+        `module`: prepared and calibrated module
+        `mapping`: a dictionary that maps from source module type to target
+                   module type, can be overwritten to allow swapping user defined
+                   Modules
+        `inplace`: carry out model transformations in-place, the original module
+                   is mutated
+        `convert_custom_config_dict`: custom configuration dictionary for convert function
+
+    .. code-block:: python
+
+       # Example of convert_custom_config_dict:
+       convert_custom_config_dict = {
+           # user will manually define the corresponding quantized
+           # module class which has a from_observed class method that converts
+           # observed custom module to quantized custom module
+           "observed_to_quantized_custom_module_class": {
+               ObservedCustomModule: QuantizedCustomModule
+           }
+       }
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.convert")
+    if not inplace:
+        module = copy.deepcopy(module)
+    _convert(
+        module, mapping, inplace=True,
+        convert_custom_config_dict=convert_custom_config_dict)
+    if remove_qconfig:
+        _remove_qconfig(module)
+    return module
+
+def _convert(
+        module, mapping=None, inplace=False,
+        convert_custom_config_dict=None):
+    r"""Converts submodules in input module to a different module according to `mapping`
+    by calling `from_float` method on the target module class
+
+    Args:
+        module: input module
+        mapping: a dictionary that maps from source module type to target
+                 module type, can be overwritten to allow swapping user defined
+                 Modules
+        inplace: carry out model transformations in-place, the original module
+                 is mutated
+
+    """
+    if mapping is None:
+        mapping = get_default_static_quant_module_mappings()
+    if convert_custom_config_dict is None:
+        convert_custom_config_dict = {}
+    custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {})
+
+    if not inplace:
+        module = copy.deepcopy(module)
+    reassign = {}
+    for name, mod in module.named_children():
+        # both fused modules and observed custom modules are
+        # swapped as one unit
+        if not isinstance(mod, _FusedModule) and \
+           type(mod) not in custom_module_class_mapping:
+            _convert(mod, mapping, True,  # inplace
+                     convert_custom_config_dict)
+        reassign[name] = swap_module(mod, mapping, custom_module_class_mapping)
+
+    for key, value in reassign.items():
+        module._modules[key] = value
+
+    return module
+
+def swap_module(mod, mapping, custom_module_class_mapping):
+    r"""Swaps the module if it has a quantized counterpart and it has an
+    `observer` attached.
+
+    Args:
+        mod: input module
+        mapping: a dictionary that maps from nn module to nnq module
+
+    Return:
+        The corresponding quantized module of `mod`
+    """
+    new_mod = mod
+    if hasattr(mod, 'qconfig') and mod.qconfig is not None:
+        swapped = False
+        if type(mod) in custom_module_class_mapping:
+            new_mod = custom_module_class_mapping[type(mod)].from_observed(mod)
+            swapped = True
+        elif type(mod) in mapping:
+            new_mod = mapping[type(mod)].from_float(mod)
+            swapped = True
+
+        if swapped:
+            # Preserve module's pre forward hooks. They'll be called on quantized input
+            for pre_hook_fn in mod._forward_pre_hooks.values():
+                new_mod.register_forward_pre_hook(pre_hook_fn)
+            # Preserve module's post forward hooks except _observer_forward_hook
+            # After convert they'll work with quantized output
+            for hook_fn in mod._forward_hooks.values():
+                if hook_fn is not _observer_forward_hook:
+                    new_mod.register_forward_hook(hook_fn)
+
+            # respect device affinity when swapping modules
+            devices = get_unique_devices_(mod)
+            assert len(devices) <= 1, (
+                "swap_module only works with cpu or single-device CUDA modules, "
+                "but got devices {}".format(devices)
+            )
+            device = next(iter(devices)) if len(devices) > 0 else None
+            if device:
+                new_mod.to(device)
+    return new_mod
+
+def get_observer_dict(mod, target_dict, prefix=""):
+    r"""Traverse the modules and save all observers into dict.
+    This is mainly used for quantization accuracy debug
+    Args:
+        mod: the top module we want to save all observers
+        prefix: the prefix for the current module
+        target_dict: the dictionary used to save all the observers
+    """
+    def get_prefix(prefix):
+        return prefix if prefix == "" else prefix + '.'
+
+    if hasattr(mod, 'activation_post_process'):
+        target_dict[get_prefix(prefix) + 'activation_post_process'] = mod.activation_post_process
+    for name, child in mod.named_children():
+        module_prefix = get_prefix(prefix) + name if prefix else name
+        get_observer_dict(child, target_dict, module_prefix)

From e161872aab00f3ca347ea32b972aab53660fc382 Mon Sep 17 00:00:00 2001
From: David Riazati <davidriazati@fb.com>
Date: Thu, 2 Sep 2021 16:58:59 -0700
Subject: [PATCH 495/530] Revert D30732630: [quant] Enable jit tracing on
 quantizable LSTM

Test Plan: revert-hammer

Differential Revision:
D30732630 (https://github.com/pytorch/pytorch/commit/116142143cc2d66c7e582d9f96e00862456fd736)

Original commit changeset: 443e351ebb0e

fbshipit-source-id: 49001392f01366f3b1ccc31139f824c80b86cd40
---
 test/quantization/core/test_quantized_op.py |  7 ---
 torch/nn/quantizable/modules/rnn.py         | 59 +++++++++++++++++----
 2 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 6275174d8e43a..49b7c96847612 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -2476,13 +2476,6 @@ def test_custom_module_lstm(self):
                         msg=(f"Error is too high: SNR(dB): {power}, "
                              f"Signal: {signal}, MSE: {mse}"))
 
-                # Trace
-                jit_qmodule = torch.jit.trace(lstm_quantized, qx)
-
-                # Script
-                # TODO: Fix the scripting in the torch/nn/quantizable/modules/rnn.py
-                # jit_qmodule = torch.jit.script(lstm_quantized)
-
     @override_qengines
     def test_custom_module_multi_head_attention(self):
         class MultiheadAttentionModel(torch.nn.Module):
diff --git a/torch/nn/quantizable/modules/rnn.py b/torch/nn/quantizable/modules/rnn.py
index cd0d094d086a7..bdfd7788533b5 100644
--- a/torch/nn/quantizable/modules/rnn.py
+++ b/torch/nn/quantizable/modules/rnn.py
@@ -48,7 +48,7 @@ def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True,
         self.ogate_cy = torch.nn.quantized.FloatFunctional()
 
     def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]:
-        if hidden is None or hidden[0] is None or hidden[1] is None:
+        if hidden is None or hidden == (None, None):
             hidden = self.initialize_hidden(x.shape[0], x.is_quantized)
         hx, cx = hidden
 
@@ -175,13 +175,10 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None):
                 cx_bw = cx_fw[1]
                 cx_fw = cx_fw[0]
             hidden_bw = hx_bw, cx_bw
-        if hx_fw is None and cx_fw is None:
-            hidden_fw = None
-        else:
-            hidden_fw = torch.jit._unwrap_optional(hx_fw), torch.jit._unwrap_optional(cx_fw)
+        hidden_fw = hx_fw, cx_fw
         result_fw, hidden_fw = self.layer_fw(x, hidden_fw)
 
-        if hasattr(self, 'layer_bw') and self.bidirectional:
+        if self.bidirectional:
             x_reversed = x.flip(0)
             result_bw, hidden_bw = self.layer_bw(x_reversed, hidden_bw)
             result_bw = result_bw.flip(0)
@@ -191,7 +188,7 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None):
             c = torch.stack([hidden_fw[1], hidden_bw[1]], 0)  # type: ignore[list-item]
         else:
             result = result_fw
-            h, c = torch.jit._unwrap_optional(hidden_fw)  # type: ignore[assignment]
+            h, c = hidden_fw  # type: ignore[assignment]
 
         if self.batch_first:
             result.transpose_(0, 1)
@@ -230,6 +227,46 @@ def from_float(cls, other, layer_idx=0, qconfig=None, **kwargs):
             layer.layer_bw = _LSTMSingleLayer.from_params(wi, wh, bi, bh)
         return layer
 
+    # Getters for the weights and biases
+    # Note that jit currently doesn't support the `porperty`, so if you need to
+    # access the weights/biases you would need to navigate manually to the
+    # `layer_fw.cell.igates.*`: https://github.com/pytorch/pytorch/issues/37883
+    @property
+    def weight_ih(self):
+        return self.layer_fw.cell.igates.weight
+
+    @property
+    def weight_hh(self):
+        return self.layer_fw.cell.hgates.weight
+
+    @property
+    def bias_ih(self):
+        return self.layer_fw.cell.igates.bias
+
+    @property
+    def bias_hh(self):
+        return self.layer_fw.cell.hgates.bias
+
+    @property
+    def weight_ih_reverse(self):
+        assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer'
+        return self.layer_bw.cell.igates.weight
+
+    @property
+    def weight_hh_reverse(self):
+        assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer'
+        return self.layer_bw.cell.hgates.weight
+
+    @property
+    def bias_ih_reverse(self):
+        assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer'
+        return self.layer_bw.cell.igates.bias
+
+    @property
+    def bias_hh_reverse(self):
+        assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer'
+        return self.layer_bw.cell.hgates.bias
+
 
 class LSTM(torch.nn.Module):
     r"""A quantizable long short-term memory (LSTM).
@@ -325,12 +362,14 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None):
                 cx = hidden_non_opt[1].reshape(self.num_layers, num_directions,
                                                max_batch_size,
                                                self.hidden_size).unbind(0)
-                hxcx = [(hx[idx].squeeze_(0), cx[idx].squeeze_(0)) for idx in range(self.num_layers)]
+                hxcx = []
+                for idx in range(self.num_layers):
+                    hxcx.append((hx[idx].squeeze_(0), cx[idx].squeeze_(0)))
             else:
                 hxcx = hidden_non_opt
 
-        for idx, layer in enumerate(self.layers):
-            x, hxcx[idx] = layer(x, hxcx[idx])
+        for idx in range(self.num_layers):
+            x, hxcx[idx] = self.layers[idx](x, hxcx[idx])
 
         hx_list = []
         cx_list = []

From db692ec0b3448ce07fd52e1400a0c733cad85821 Mon Sep 17 00:00:00 2001
From: driazati <driazati@users.noreply.github.com>
Date: Thu, 2 Sep 2021 17:09:48 -0700
Subject: [PATCH 496/530] Regenerate generated github workflows (#64465)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64465

These were out of date and causing master failures

Test Plan: Imported from OSS

Reviewed By: zhouzhuojie

Differential Revision: D30744594

Pulled By: driazati

fbshipit-source-id: 09a21c3c5d9bc83b368d66cabbafd1ba83302dd3
---
 .github/workflows/generated-linux-bionic-py3.6-clang9.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
index e54555d12cf62..3aedb76b3e665 100644
--- a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
+++ b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
@@ -27,16 +27,20 @@ env:
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
 concurrency:
-  group: linux-bionic-py3.6-clang9-${{ github.event.pull_request.number || github.sha }}
+  group: linux-bionic-py3.6-clang9-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
   ciflow_should_run:
     runs-on: ubuntu-18.04
     if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/noarch') || contains(github.event.pull_request.labels.*.name, 'ciflow/xla')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
     steps:
       - name: noop
         run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
   calculate-docker-image:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: linux.2xlarge

From c19bd05e8480cb597e4bb574400cad8e18a4dc25 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Thu, 2 Sep 2021 17:43:59 -0700
Subject: [PATCH 497/530] THC: Cleanup dead code (#64441)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64441

Test Plan: Imported from OSS

Reviewed By: gchanan

Differential Revision: D30735342

Pulled By: ngimel

fbshipit-source-id: 84ab36f7aec6b8cd7f1f34c19a58a382c06ad68d
---
 BUILD.bazel                                   |  2 -
 aten/src/THC/CMakeLists.txt                   |  8 ---
 aten/src/THC/THC.h                            |  1 -
 aten/src/THC/THCTensorMath.cu                 | 39 -----------
 aten/src/THC/THCTensorMath.h                  | 22 ------
 aten/src/THC/THCTensorMathPairwise.cu         | 24 -------
 aten/src/THC/THCTensorMathReduce.cu           |  2 -
 aten/src/THC/generic/THCTensorMath.cu         | 70 -------------------
 aten/src/THC/generic/THCTensorMath.h          | 10 ---
 aten/src/THC/generic/THCTensorMathPairwise.cu | 29 --------
 aten/src/THC/generic/THCTensorMathPairwise.h  | 15 ----
 11 files changed, 222 deletions(-)
 delete mode 100644 aten/src/THC/THCTensorMath.cu
 delete mode 100644 aten/src/THC/THCTensorMath.h
 delete mode 100644 aten/src/THC/THCTensorMathPairwise.cu
 delete mode 100644 aten/src/THC/THCTensorMathReduce.cu
 delete mode 100644 aten/src/THC/generic/THCTensorMath.cu
 delete mode 100644 aten/src/THC/generic/THCTensorMath.h
 delete mode 100644 aten/src/THC/generic/THCTensorMathPairwise.cu
 delete mode 100644 aten/src/THC/generic/THCTensorMathPairwise.h

diff --git a/BUILD.bazel b/BUILD.bazel
index a5f20c2020181..36b29379a5c2f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -394,8 +394,6 @@ filegroup(
         "aten/src/THC/THCStorageCopy.cu.cc",
         "aten/src/THC/THCTensor.cu.cc",
         "aten/src/THC/THCTensorCopy.cu.cc",
-        "aten/src/THC/THCTensorMath.cu.cc",
-        "aten/src/THC/THCTensorMathPairwise.cu.cc",
         "aten/src/THC/THCTensorMathScan.cu.cc",
         "aten/src/THC/THCTensorScatterGather.cu.cc",
         "aten/src/THC/THCTensorSort.cu.cc",
diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt
index f34b040e484ce..82828a7ac1c89 100644
--- a/aten/src/THC/CMakeLists.txt
+++ b/aten/src/THC/CMakeLists.txt
@@ -17,10 +17,7 @@ set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS}
   ${CMAKE_CURRENT_SOURCE_DIR}/THCStorageCopy.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/THCTensor.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorCopy.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMath.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathMagma.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathPairwise.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathReduce.cu
   PARENT_SCOPE)
 
 install(FILES
@@ -33,7 +30,6 @@ install(FILES
           THCTensor.h
           THCTensorCopy.h
           THCTensorCopy.hpp
-          THCTensorMath.h
           THCApply.cuh
           THCReduceApplyUtils.cuh
           THCTensorMathReduce.cuh
@@ -85,8 +81,4 @@ install(FILES
           generic/THCStorageCopy.h
           generic/THCTensorCopy.cu
           generic/THCTensorCopy.h
-          generic/THCTensorMath.h
-          generic/THCTensorMath.cu
-          generic/THCTensorMathPairwise.h
-          generic/THCTensorMathPairwise.cu
           DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THC/generic")
diff --git a/aten/src/THC/THC.h b/aten/src/THC/THC.h
index 717442db9eaa1..59e2f5de69fe0 100644
--- a/aten/src/THC/THC.h
+++ b/aten/src/THC/THC.h
@@ -11,6 +11,5 @@
 
 #include <THC/THCTensor.h>
 #include <THC/THCTensorCopy.h>
-#include <THC/THCTensorMath.h>
 
 #endif
diff --git a/aten/src/THC/THCTensorMath.cu b/aten/src/THC/THCTensorMath.cu
deleted file mode 100644
index 418bfa9e14919..0000000000000
--- a/aten/src/THC/THCTensorMath.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <THC/THCTensorMath.h>
-#include <THC/THCGeneral.h>
-#include <THC/THCTensorCopy.h>
-#include <THC/THCApply.cuh>
-#include <THC/THCNumerics.cuh>
-#include <THC/THCThrustAllocator.cuh>
-#include <THC/THCTensor.hpp>
-
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
-#include <thrust/sequence.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/transform.h>
-#if CUDA_VERSION >= 7000 || defined __HIP_PLATFORM_HCC__
-#include <thrust/system/cuda/execution_policy.h>
-#endif
-#include <cfloat>
-
-template <typename T>
-struct TensorFillOp {
-  TensorFillOp(T v) : val(v) {}
-  __device__ __forceinline__ void operator()(T* v) { *v = val; }
-
-  const T val;
-};
-
-#include <THC/generic/THCTensorMath.cu>
-#include <THC/THCGenerateAllTypes.h>
-
-#include <THC/generic/THCTensorMath.cu>
-#include <THC/THCGenerateBoolType.h>
-
-#include <THC/generic/THCTensorMath.cu>
-#include <THC/THCGenerateBFloat16Type.h>
diff --git a/aten/src/THC/THCTensorMath.h b/aten/src/THC/THCTensorMath.h
deleted file mode 100644
index b70d4d14b02d9..0000000000000
--- a/aten/src/THC/THCTensorMath.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef TH_CUDA_TENSOR_MATH_INC
-#define TH_CUDA_TENSOR_MATH_INC
-
-#include <THC/THCTensor.h>
-#include <THC/THCGeneral.h>
-
-#include <THC/generic/THCTensorMath.h>
-#include <THC/THCGenerateAllTypes.h>
-
-#include <THC/generic/THCTensorMath.h>
-#include <THC/THCGenerateBoolType.h>
-
-#include <THC/generic/THCTensorMath.h>
-#include <THC/THCGenerateBFloat16Type.h>
-
-#include <THC/generic/THCTensorMathPairwise.h>
-#include <THC/THCGenerateAllTypes.h>
-
-#include <THC/generic/THCTensorMathPairwise.h>
-#include <THC/THCGenerateBoolType.h>
-
-#endif
diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu
deleted file mode 100644
index 6fd026aa8966d..0000000000000
--- a/aten/src/THC/THCTensorMathPairwise.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-#include <THC/THCTensorMath.h>
-#include <THC/THCGeneral.h>
-#include <TH/THHalf.h>
-#include <THC/THCTensorCopy.h>
-#include <THC/THCApply.cuh>
-#include <THC/THCNumerics.cuh>
-#include <THC/THCTensor.hpp>
-
-template <typename T>
-struct TensorMulConstantOp {
-  TensorMulConstantOp(T v) : val(v) {}
-  __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *in * val;
-  }
-
-  __device__ __forceinline__ void operator()(T* v) {
-    *v *= val;
-  }
-
-  const T val;
-};
-
-#include <THC/generic/THCTensorMathPairwise.cu>
-#include <THC/THCGenerateBoolType.h>
diff --git a/aten/src/THC/THCTensorMathReduce.cu b/aten/src/THC/THCTensorMathReduce.cu
deleted file mode 100644
index 1a2c626537156..0000000000000
--- a/aten/src/THC/THCTensorMathReduce.cu
+++ /dev/null
@@ -1,2 +0,0 @@
-#include <THC/THCTensorMathReduce.cuh>
-#include <THC/THCTensor.hpp>
diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu
deleted file mode 100644
index d07a3e3a62cdc..0000000000000
--- a/aten/src/THC/generic/THCTensorMath.cu
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "THC/generic/THCTensorMath.cu"
-#else
-
-#include <algorithm>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/MemoryOverlap.h>
-
-void THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value)
-{
-  THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
-
-  if (!THC_pointwiseApply1<scalar_t>(
-        state, self_, TensorFillOp<scalar_t>(value))) {
-    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
-  }
-
-  THCudaCheck(cudaGetLastError());
-}
-
-void THCTensor_(zero)(THCState *state, THCTensor *self_)
-{
-  THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
-  if (THCTensor_(isContiguous)(state, self_)) {
-    THCudaCheck(cudaMemsetAsync(THCTensor_(data)(state, self_),
-                                0,
-                                sizeof(scalar_t) * THCTensor_(nElement)(state, self_),
-                                c10::cuda::getCurrentCUDAStream()));
-  } else {
-    if (!THC_pointwiseApply1<scalar_t>(
-          state, self_,
-          TensorFillOp<scalar_t>(ScalarConvert<int, scalar_t>::to(0)))) {
-      THArgCheck(false, 1, CUTORCH_DIM_WARNING);
-    }
-  }
-
-  THCudaCheck(cudaGetLastError());
-}
-
-ptrdiff_t
-THCTensor_(numel)(THCState *state, THCTensor *t)
-{
-  return THCTensor_(nElement)(state, t);
-}
-
-void THCTensor_(check_shape_except_dim)(THCState *state,
-    THCTensor *first, THCTensor *second, int dimension, int index);
-inline void THCTensor_(check_shape_except_dim)(THCState *state,
-    THCTensor *first, THCTensor *second, int dimension, int index)
-{
-  int first_dims = first->dim();
-  int second_dims = second->dim();
-  THArgCheck(first_dims == second_dims, 0,
-      "Tensors must have same number of dimensions: got %d and %d",
-      first_dims, second_dims);
-  for (int dim = 0; dim < first_dims; dim++) {
-    if (dim == dimension) {
-      continue;
-    }
-    int64_t first_dim_size = THCTensor_(size)(state, first, dim);
-    int64_t second_dim_size = THCTensor_(size)(state, second, dim);
-    THArgCheck(first_dim_size == second_dim_size, 0,
-        "Sizes of tensors must match except in dimension %d. Got %lld and %lld in dimension %d (The offending index is %d)",
-        dimension, (long long)first_dim_size, (long long)second_dim_size, dim, index);
-  }
-}
-
-
-#endif
diff --git a/aten/src/THC/generic/THCTensorMath.h b/aten/src/THC/generic/THCTensorMath.h
deleted file mode 100644
index 58ec1567aed9b..0000000000000
--- a/aten/src/THC/generic/THCTensorMath.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "THC/generic/THCTensorMath.h"
-#else
-
-TORCH_CUDA_CU_API void THCTensor_(
-    fill)(THCState* state, THCTensor* self, scalar_t value);
-TORCH_CUDA_CU_API void THCTensor_(zero)(THCState* state, THCTensor* self);
-TORCH_CUDA_CU_API ptrdiff_t THCTensor_(numel)(THCState* state, THCTensor* t);
-
-#endif
diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu
deleted file mode 100644
index aba731c725423..0000000000000
--- a/aten/src/THC/generic/THCTensorMathPairwise.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "THC/generic/THCTensorMathPairwise.cu"
-#else
-
-#include <ATen/NamedTensorUtils.h>
-
-#if !defined(THC_REAL_IS_BOOL)
-
-void THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value)
-{
-  THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_));
-  if (self_ == src_) {
-    if (!THC_pointwiseApply1<scalar_t>(state, self_, TensorMulConstantOp<scalar_t>(value))) {
-      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
-    }
-  } else {
-    THCTensor_(resizeAs)(state, self_, src_);
-
-    if (!THC_pointwiseApply2<scalar_t, scalar_t>(state, self_, src_, TensorMulConstantOp<scalar_t>(value))) {
-      THArgCheck(false, 2, CUTORCH_DIM_WARNING);
-    }
-  }
-
-  THCudaCheck(cudaGetLastError());
-}
-
-#endif
-
-#endif
diff --git a/aten/src/THC/generic/THCTensorMathPairwise.h b/aten/src/THC/generic/THCTensorMathPairwise.h
deleted file mode 100644
index deeafb1291fbd..0000000000000
--- a/aten/src/THC/generic/THCTensorMathPairwise.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "THC/generic/THCTensorMathPairwise.h"
-#else
-
-TORCH_CUDA_CU_API int THCTensor_(
-    equal)(THCState* state, THCTensor* self, THCTensor* src);
-
-#if !defined(THC_REAL_IS_BOOL)
-
-TORCH_CUDA_CU_API void THCTensor_(
-    mul)(THCState* state, THCTensor* self, THCTensor* src, scalar_t value);
-
-#endif
-
-#endif

From cd82bc1af901f86c562663eb1c09413c7d4a19b2 Mon Sep 17 00:00:00 2001
From: Mike Ruberry <mruberry@fb.com>
Date: Thu, 2 Sep 2021 20:51:38 -0700
Subject: [PATCH 498/530] Skips layer norm OpInfo on tbb platform (#64469)

Summary:
The OpInfo tests appear to be discovering a layer norm x tbb issue that requires investigation. Skipping tests on that platform for now to restore CI signal.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64469

Reviewed By: ngimel

Differential Revision: D30745746

Pulled By: mruberry

fbshipit-source-id: 282484cc00b867fac85b7df61430d64277da6421
---
 torch/testing/_internal/common_methods_invocations.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index a45bcf54faba8..221cb29ec5eb6 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -7,6 +7,7 @@
 import random
 import numbers
 import unittest
+import os
 
 import torch
 import numpy as np
@@ -7384,6 +7385,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                    toleranceOverride({torch.float32: tol(atol=1e-05, rtol=1e-03)}),
                    'TestCommon', 'test_reference_testing'
                ),
+               unittest.skipIf("tbb" in os.getenv("BUILD_ENVIRONMENT", ""), "This test makes TBB Sad"),
            ],
            sample_inputs_func=sample_inputs_layer_norm,),
     OpInfo('nn.functional.pad',

From e1c3e5f8308e6a47b76c8a22f7bc8cbc29aa4ae3 Mon Sep 17 00:00:00 2001
From: James Reed <jamesreed@fb.com>
Date: Thu, 2 Sep 2021 21:11:57 -0700
Subject: [PATCH 499/530] [resubmit][FX] Prototype for guarding against mutable
 operations in tracing (#64467)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64467

Test Plan: Imported from OSS

Reviewed By: driazati

Differential Revision: D30744870

Pulled By: jamesr66a

fbshipit-source-id: fc652f8b17748f90dbeb83fabf3bd5bb57d6ff1a
---
 ..._compat-fx_backcompat_class_members.expect |  2 +-
 test/test_fx.py                               | 67 +++++++++++++++++--
 torch/csrc/jit/python/init.cpp                | 14 ++--
 torch/fx/operator_schemas.py                  | 53 +++++++++++++--
 torch/fx/proxy.py                             |  7 ++
 5 files changed, 127 insertions(+), 16 deletions(-)

diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
index 88e4654b568df..5c3630a3169f7 100644
--- a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
+++ b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect
@@ -15,5 +15,5 @@ torch.fx.proxy.Attribute ['node']
 torch.fx.proxy.GraphAppendingTracer []
 torch.fx.proxy.Proxy ['keys']
 torch.fx.proxy.TraceError []
-torch.fx.proxy.TracerBase ['create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool']
+torch.fx.proxy.TracerBase ['check_mutable_operations', 'create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool']
 torch.fx.subgraph_rewriter.Match ['anchor', 'nodes_map']
\ No newline at end of file
diff --git a/test/test_fx.py b/test/test_fx.py
index 5220f67ebf309..57a2960a409c3 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -132,10 +132,17 @@ def __init__(self, a, b):
 
 class TestFX(JitTestCase):
     def setUp(self):
-        if TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS:
-            return
-        lib_file_path = find_library_location('libtorchbind_test.so')
-        torch.ops.load_library(str(lib_file_path))
+        # Checking for mutable operations whil tracing is feature flagged
+        # Enable it in testing but not by default
+        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
+        torch.fx.proxy.TracerBase.check_mutable_operations = True
+
+        if not (TEST_WITH_ROCM or IS_FBCODE or IS_WINDOWS or IS_MACOS):
+            lib_file_path = find_library_location('libtorchbind_test.so')
+            torch.ops.load_library(str(lib_file_path))
+
+    def tearDown(self):
+        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
 
     def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None):
         """Check that an nn.Module's results match the GraphModule version
@@ -2367,6 +2374,19 @@ def forward(self, x: torch.Tensor, y: int, z: int):
 
         traced.graph.lint()
 
+    def test_throw_out_variant(self):
+        def foo(x):
+            y = torch.rand_like(x)
+            torch.sigmoid(x, out=y)
+            return y
+
+        class MyTracer(torch.fx.Tracer):
+            check_mutable_operations = True
+
+        tracer = MyTracer()
+        with self.assertRaisesRegex(RuntimeError, 'mutable operation aten::sigmoid.out'):
+            traced_graph = tracer.trace(foo)
+
     def test_ast_rewriter_reassigns_submodules(self):
         class M(torch.nn.Module):
             def __init__(self):
@@ -3021,6 +3041,15 @@ def run_getitem_target():
 
 
 class TestOperatorSignatures(JitTestCase):
+    def setUp(self):
+        # Checking for mutable operations whil tracing is feature flagged
+        # Enable it in testing but not by default
+        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
+        torch.fx.proxy.TracerBase.check_mutable_operations = True
+
+    def tearDown(self):
+        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
+
     @onlyCPU
     @ops(op_db, allowed_dtypes=(torch.float,))
     def test_get_torch_func_signature_exhaustive(self, device, dtype, op):
@@ -3090,6 +3119,15 @@ class TestFXAPIBackwardCompatibility(JitTestCase):
     def setUp(self):
         self.maxDiff = None
 
+        # Checking for mutable operations whil tracing is feature flagged
+        # Enable it in testing but not by default
+        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
+        torch.fx.proxy.TracerBase.check_mutable_operations = True
+
+    def tearDown(self):
+        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
+
+
     def _fn_to_stable_annotation_str(self, obj):
         """
         Unfortunately we have to serialize function signatures manually since
@@ -3326,6 +3364,15 @@ def check_symbols_have_bc_designation(m, prefix):
                                  f"BC guarantees.")
 
 class TestFunctionalTracing(JitTestCase):
+    def setUp(self):
+        # Checking for mutable operations whil tracing is feature flagged
+        # Enable it in testing but not by default
+        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
+        torch.fx.proxy.TracerBase.check_mutable_operations = True
+
+    def tearDown(self):
+        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
+
     IGNORE_FUNCS = ("has_torch_function", "has_torch_function_unary",
                     "has_torch_function_variadic", "handle_torch_function",
                     "boolean_dispatch")
@@ -3340,6 +3387,7 @@ class TestFunctionalTracing(JitTestCase):
     ARG_TYPE_MISMATCH = (TypeError, r", not Proxy$")
     CONTROL_FLOW = (TraceError, r"symbolically traced variables cannot be used as inputs to control flow")
     INTERPOLATE_ARGS_CONFLICT = (ValueError, r"only one of size or scale_factor should be defined")
+    MUTABLE = (RuntimeError, r"Tried to trace mutable operation")
 
     UNTRACEABLE_FUNCTIONALS = {
         "adaptive_avg_pool1d": BUILT_IN_FUNC,
@@ -3459,6 +3507,8 @@ class TestFunctionalTracing(JitTestCase):
 
         "upsample_bilinear": INTERPOLATE_ARGS_CONFLICT,
         "upsample_nearest": INTERPOLATE_ARGS_CONFLICT,
+
+        "normalize" : MUTABLE,
     }
 
     # List of nn.functionals with Tensor inputs but not with type annotation
@@ -3573,6 +3623,15 @@ def tearDownClass(cls):
 
 @skipIfNoTorchVision
 class TestVisionTracing(JitTestCase):
+    def setUp(self):
+        # Checking for mutable operations whil tracing is feature flagged
+        # Enable it in testing but not by default
+        self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations
+        torch.fx.proxy.TracerBase.check_mutable_operations = True
+
+    def tearDown(self):
+        torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag
+
     PROXY_ITERATED = (TraceError, r"Proxy object cannot be iterated")
     INCONSISTENT_TYPE = (
         RuntimeError,
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 7e43e511c786f..35197e4ea1423 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -1280,11 +1280,15 @@ void initJITBindings(PyObject* module) {
           [](const FunctionSchema& self, const FunctionSchema& other) {
             return self == other;
           })
-      .def("__str__", [](FunctionSchema& self) {
-        std::stringstream ss;
-        ss << self;
-        return ss.str();
-      });
+      .def(
+          "__str__",
+          [](FunctionSchema& self) {
+            std::stringstream ss;
+            ss << self;
+            return ss.str();
+          })
+      .def_property_readonly(
+          "is_mutable", [](FunctionSchema& self) { return self.is_mutable(); });
   py::class_<Argument>(m, "Argument")
       .def_property_readonly("name", [](Argument& self) { return self.name(); })
       .def_property_readonly("type", [](Argument& self) { return self.type(); })
diff --git a/torch/fx/operator_schemas.py b/torch/fx/operator_schemas.py
index ac559b19530c7..d7ddc3e0360c7 100644
--- a/torch/fx/operator_schemas.py
+++ b/torch/fx/operator_schemas.py
@@ -4,10 +4,13 @@
 import typing
 import enum
 import warnings
-from typing import Any, Callable, Dict, List, Optional, Tuple, NamedTuple, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, NamedTuple, cast, TYPE_CHECKING
 from torch._jit_internal import boolean_dispatched
 from ._compatibility import compatibility
 
+if TYPE_CHECKING:
+    from .node import Argument
+
 @compatibility(is_backward_compatible=False)
 class ArgsKwargsPair(NamedTuple):
     """
@@ -79,7 +82,43 @@ def _torchscript_schema_to_signature(ts_schema : torch._C.FunctionSchema) -> ins
     return inspect.Signature(parameters, return_annotation=return_type)
 
 @compatibility(is_backward_compatible=False)
-def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature]]:
+def check_for_mutable_operation(target : Callable, args : Tuple['Argument', ...], kwargs : Dict[str, 'Argument']):
+    signatures, schemas = get_signature_for_torch_op(target, return_schemas=True)
+
+    if signatures and schemas:
+        matched_schemas = []
+
+        # Iterate through all of the schema until we find one that matches
+        # If one matches, populate `new_args_and_kwargs` with the new args/kwargs
+        # values. If none matches, `new_args_and_kwargs` will be None
+        for candidate_signature, schema in zip(signatures, schemas):
+            try:
+                candidate_signature.bind(*args, **kwargs)
+                matched_schemas.append((candidate_signature, schema))
+            except TypeError as e:
+                continue
+
+        def throw_if_mutable(schema):
+            if schema.is_mutable:
+                raise RuntimeError(f'Tried to trace mutable operation {schema}. FX only supports functional '
+                                   f'code, so operations that mutate operands in-place (e.g. via `out` arguments) '
+                                   f'are not supported')
+
+        if len(matched_schemas) == 0:
+            # Did not match any schema. Cannot check for mutation
+            pass
+        elif len(matched_schemas) == 1:
+            # Matched exactly one schema, unambiguous
+            _, schema_to_check = matched_schemas[0]
+            throw_if_mutable(schema_to_check)
+            pass
+        else:
+            # Ambiguous schema match. Since mutability checking is best effort,
+            # do nothing.
+            pass
+
+@compatibility(is_backward_compatible=False)
+def get_signature_for_torch_op(op : Callable, return_schemas : bool = False):
     """
     Given an operator on the `torch` namespace, return a list of `inspect.Signature`
     objects corresponding to the overloads of that op.. May return `None` if a signature
@@ -90,21 +129,23 @@ def get_signature_for_torch_op(op : Callable) -> Optional[List[inspect.Signature
 
     Returns:
         Optional[List[inspect.Signature]]: A list of signatures for the overloads of this
-            operator, or None if the operator signatures could not be retrieved.
+            operator, or None if the operator signatures could not be retrieved. If
+            return_schemas=True, returns a tuple containing the optional Python signatures
+            and the optional TorchScript Function signature
     """
     override = _manual_overrides.get(op)
     if override:
-        return override
+        return (override, None) if return_schemas else None
 
     aten_fn = torch.jit._builtins._find_builtin(op)
 
     if aten_fn is None:
-        return None
+        return (None, None) if return_schemas else None
 
     schemas = torch._C._jit_get_schemas_for_operator(aten_fn)
     signatures = [_torchscript_schema_to_signature(schema) for schema in schemas]
 
-    return signatures
+    return (signatures, schemas) if return_schemas else signatures
 
 @compatibility(is_backward_compatible=False)
 def create_type_hint(x):
diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py
index 61b039f8b7219..b25e45d206a51 100644
--- a/torch/fx/proxy.py
+++ b/torch/fx/proxy.py
@@ -8,11 +8,15 @@
 from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable
 from .node import Target, Node, Argument, base_types, map_aggregate
 from ._compatibility import compatibility
+from .operator_schemas import check_for_mutable_operation
 
 @compatibility(is_backward_compatible=True)
 class TracerBase:
     graph: Graph
     record_stack_traces : bool = False
+    # Feature flag for mutable schema checking
+    # Enableby default in 1.12
+    check_mutable_operations : bool = False
 
     @compatibility(is_backward_compatible=True)
     def create_node(self, kind : str, target : Target,
@@ -25,6 +29,9 @@ def create_node(self, kind : str, target : Target,
         modification of values used in node creation. For example, one might
         want to disallow in-place operations from being recorded.
         """
+        if kind == 'call_function' and self.check_mutable_operations:
+            check_for_mutable_operation(target, args, kwargs)
+
         return self.graph.create_node(kind, target, args, kwargs, name, type_expr)
 
     @compatibility(is_backward_compatible=True)

From 7031fbdc63b0334543e257c064aad3f85b9a102c Mon Sep 17 00:00:00 2001
From: Elias Ellison <eellison@devfair044.h1.fair>
Date: Thu, 2 Sep 2021 22:16:22 -0700
Subject: [PATCH 500/530] update optimize_for_inference docs (#64428)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64428

Test Plan: Imported from OSS

Reviewed By: pbelevich

Differential Revision: D30740898

Pulled By: eellison

fbshipit-source-id: b94d2c3deb661a6ba048f19e8c1d5e1799667eeb
---
 docs/source/jit.rst  |  1 +
 torch/jit/_freeze.py | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/docs/source/jit.rst b/docs/source/jit.rst
index f791c1c687153..8a80b6471e1a7 100644
--- a/docs/source/jit.rst
+++ b/docs/source/jit.rst
@@ -60,6 +60,7 @@ Creating TorchScript Code
     ScriptModule
     ScriptFunction
     freeze
+    optimize_for_inference
     save
     load
     ignore
diff --git a/torch/jit/_freeze.py b/torch/jit/_freeze.py
index cab6d3c8f71ef..582baf7422343 100644
--- a/torch/jit/_freeze.py
+++ b/torch/jit/_freeze.py
@@ -179,6 +179,18 @@ def optimize_for_inference(mod: ScriptModule) -> ScriptModule:
     This is still in prototype, and may have the potential to slow down your model.
     Primary use cases that have been targeted so far have been vision models on cpu
     and gpu to a lesser extent.
+
+    Example (optimizing a module with Conv->Batchnorm)::
+
+        import torch
+        in_channels, out_channels = 3, 32
+        conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=True)
+        bn = torch.nn.BatchNorm2d(out_channels, eps=.001)
+        mod = torch.nn.Sequential(conv, bn)
+        frozen_mod = torch.jit.optimize_for_inference(torch.jit.script(mod.eval()))
+        assert "batch_norm" not in str(frozen_mod.graph)
+        # if built with MKLDNN, convolution will be run with MKLDNN weights
+        assert "MKLDNN" in frozen_mod.graph
     """
     if not isinstance(mod, ScriptModule):
         raise RuntimeError(

From 39aeb3bf63f61664bc6c4a929a80a660365c2a5e Mon Sep 17 00:00:00 2001
From: Elias Ellison <eellison@devfair044.h1.fair>
Date: Thu, 2 Sep 2021 22:16:22 -0700
Subject: [PATCH 501/530] Add fusion enabled apis (#64429)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64429

Test Plan: Imported from OSS

Reviewed By: pbelevich

Differential Revision: D30740897

Pulled By: eellison

fbshipit-source-id: 446aa63b5d763f1cfffea62547db7294368e3438
---
 docs/source/jit.rst   |  2 ++
 torch/jit/__init__.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/docs/source/jit.rst b/docs/source/jit.rst
index 8a80b6471e1a7..97a0615812830 100644
--- a/docs/source/jit.rst
+++ b/docs/source/jit.rst
@@ -61,6 +61,8 @@ Creating TorchScript Code
     ScriptFunction
     freeze
     optimize_for_inference
+    enable_fusion
+    fusion_enabled
     save
     load
     ignore
diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py
index f7fa58bd36434..f804d3c72ce8e 100644
--- a/torch/jit/__init__.py
+++ b/torch/jit/__init__.py
@@ -4,6 +4,7 @@
 from typing import Iterator
 
 from torch.utils import set_module
+from typing import Optional
 
 # These are imported so users can access them from the `torch.jit` module
 from torch._jit_internal import (
@@ -197,6 +198,34 @@ def _hide_source_ranges() -> Iterator[None]:
     finally:
         torch._C.Graph.set_global_print_source_ranges(old_enable_source_ranges)  # type: ignore[attr-defined]
 
+def enable_fusion(enabled: bool, device: Optional[str] = None):
+    """
+    Enables or disables JIT fusion based on the parameter `enabled`.
+
+    If `device` is None, both CPU and GPU fusion will be turned on or off.
+    Otherwise, device must be equal to "cpu" or "cuda", and will turn on or off
+    CPU and GPU fusion respectively.
+    """
+
+    if device is None:
+        torch._C._jit_override_can_fuse_on_cpu(enabled)
+        torch._C._jit_override_can_fuse_on_gpu(enabled)
+    else:
+        assert device in ["cpu", "cuda"], "Device-specific fusion must be equal to 'cpu' or 'cuda' if not None"
+        if device == "cuda":
+            torch._C._jit_override_can_fuse_on_gpu(enabled)
+        else:
+            torch._C._jit_override_can_fuse_on_cpu(enabled)
+
+def fusion_enabled(device: str):
+    """
+    Returns whether JIT fusion is enabled for "cpu" or "cuda"
+    """
+    assert device == "cpu" or device == "cuda"
+    if device == "cpu":
+        return torch._C._jit_can_fuse_on_cpu()
+    else:
+        return torch._C._jit_can_fuse_on_gpu()
 
 if not torch._C._jit_init():
     raise RuntimeError("JIT initialization failed")

From 91b926fab3f7fec9055a0425f55443dee25afbad Mon Sep 17 00:00:00 2001
From: Kefei Lu <kefeilu@fb.com>
Date: Thu, 2 Sep 2021 23:03:02 -0700
Subject: [PATCH 502/530] Add fx2trt pass for removing duplicate output args
 (#64461)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64461

Fx2TRT does not support duplicate nodes in the output args tuple.

This pass removes duplicate output args from the target subnets and fixes their uses in the top level module where the subnets are called. This pass must be called after acc split on the top-level net and subsequent calls to the acc trace on the subnets.

This pass will change both the subnets and top level module.

Test Plan:
Run:

```
buck run mode/opt -c python.package_style=inplace //caffe2/torch/fb/fx2trt/tests/passes/:test_remove_duplicate_output_args

```

Reviewed By: yinghai

Differential Revision: D30740499

fbshipit-source-id: 98459f7677980b21c7bffda918158001285572db
---
 .../passes/remove_duplicate_output_args.py    | 134 ++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py

diff --git a/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py b/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py
new file mode 100644
index 0000000000000..bfddab57c0935
--- /dev/null
+++ b/torch/fx/experimental/fx2trt/passes/remove_duplicate_output_args.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+
+import operator
+import typing as t
+import logging
+import torch.fx as fx
+import dataclasses as dc
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def remove_duplicate_output_args(
+    top_level: fx.GraphModule,
+    target_subnets: t.Collection[str]
+) -> t.Mapping[str, "RemoveDuplicateResult"]:
+    """Removes duplicate output args.
+
+    This pass removes duplicate output args from the target subnets and fixes
+    their uses in the top level module where the subnets are called. This pass
+    must be called after acc split on the top-level net and subsequent calls to
+    the acc trace on the subnets.
+
+    This pass will change both the subnets and top level module.
+
+    Returns:
+        a mapping of the target subnet name to its dedupcate result
+    """
+
+    processed_subnets = {}
+    for node in top_level.graph.nodes:  # type: fx.Node
+        if node.op == "call_module" and node.name in target_subnets:
+            assert isinstance(node.target, str)
+            sub_gm = top_level.get_submodule(node.target)
+            assert isinstance(sub_gm, fx.GraphModule)
+
+            replace_res = _remove_duplicate_output_args(sub_gm)
+            processed_subnets[node.name] = replace_res
+            if replace_res.replacement_map is None:
+                continue
+            sub_gm.recompile()
+
+            needs_recompile = False
+            # iterate on the copy since we will be changing elements of node.users
+            for user in list(node.users):
+                idx = _ensure_proper_output_use(user, node)
+                idx_new = replace_res.replacement_map[idx]
+                if idx_new != idx:
+                    user.args = (user.args[0], idx_new)
+                    needs_recompile = True
+
+            if needs_recompile:
+                top_level.recompile()
+    return processed_subnets
+
+
+@dc.dataclass(frozen=True)
+class RemoveDuplicateResult:
+    replacement_map: t.Optional[t.List[int]]
+    module: fx.GraphModule
+
+
+def _ensure_proper_output_use(user: fx.Node, target_node: fx.Node) -> int:
+    """
+    Ensures the node looks in proper form of calling the output of an fx2trt
+    splitter sub-net. Specifically:
+
+    1. op is call function, target: operator.getitem
+    2. args is a 2-element tuple
+    3. args[0] is the name of the subnet's output
+    4. args[1] is the index into the subnet output tuple
+
+    E.g.:
+
+        %getitem_4 : [#users=1] = call_function[target=operator.getitem](args = (%_run_on_acc_1, 4), kwargs = {})
+
+    returns the index into the subnet output tuple
+    """
+    _LOGGER.info(f"Checking user node: {user.format_node()}")
+    assert (
+        user.op == "call_function"
+        and user.target == operator.getitem
+        and len(user.args) == 2
+        and isinstance(user.args[0], fx.Node)
+        and user.args[0].name == target_node.name
+        and isinstance(user.args[1], int)
+    ), f"Node is not a proper user of splitter output: {user.format_node()}"
+
+    return user.args[1]
+
+
+def _remove_duplicate_output_args(gm: fx.GraphModule) -> RemoveDuplicateResult:
+    output_nodes = [n for n in gm.graph.nodes if n.op == "output"]
+    assert len(output_nodes) == 1, \
+           f"Expecting exactly one `output` node, but got {len(output_nodes)}"
+
+    changed = False
+    # arg node name to its index in the new output args tuple
+    name_to_idx: t.Dict[str, int] = {}
+    output_node = output_nodes[0]
+
+    # Output op only uses its `args[0]`, and it does not have `kwargs`.
+    # https://pytorch.org/docs/stable/fx.html#torch.fx.Node
+    args: t.Sequence[t.Any] = output_node.args[0]
+
+    # Only concern outselves to the case where the args is an iterable of fx.Node.
+    # Other return cases (e.g., a single value) is possible and we don't handle
+    # that in this pass.
+    if not (isinstance(args, t.Iterable) and all(isinstance(a, fx.Node) for a in args)):
+        return RemoveDuplicateResult(replacement_map=None, module=gm)
+
+    # Map old index of the arg node to the remaining node's idx,
+    # initialized to `i => i`
+    replacement_map: t.List[int] = list(range(len(args)))
+    args_new = []
+    for idx, a in enumerate(args):
+        assert isinstance(a, fx.Node), \
+               f"Expecting fx.Node instance, but got: {type(a)}"
+
+        if a.name not in name_to_idx:
+            args_new.append(a)
+            name_to_idx[a.name] = len(args_new) - 1
+        else:
+            changed = True
+            _LOGGER.warning(
+                f"Replaced duplicate output arg '{a.name}': "
+                f"{idx} -> {name_to_idx[a.name]}"
+            )
+        replacement_map[idx] = name_to_idx[a.name]
+
+    output_node.args = (tuple(args_new),)
+    if changed:
+        gm.recompile()
+    return RemoveDuplicateResult(replacement_map, module=gm)

From 6831d8e379392da1340a28fdb3e7e1382176d1d4 Mon Sep 17 00:00:00 2001
From: Ansley Ussery <ansley@fb.com>
Date: Fri, 3 Sep 2021 06:10:37 -0700
Subject: [PATCH 503/530] Support Union in TorchScript (#64234)

Summary:
This PR is created to replace https://github.com/pytorch/pytorch/pull/53180 PR stack, which has all the review discussions. Reason for needing a replacement is due to a messy Sandcastle issue.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64234

Reviewed By: gmagogsfm

Differential Revision: D30656444

Pulled By: ansley

fbshipit-source-id: 77536c8bcc88162e2c72636026ca3c16891d669a
---
 CONTRIBUTING.md                               |   4 +-
 aten/src/ATen/core/jit_type.h                 | 172 +++--
 aten/src/ATen/core/jit_type_base.h            |   7 +-
 aten/src/ATen/core/type.cpp                   | 488 ++++++++++++-
 aten/src/ATen/native/README.md                |   4 +-
 c10/mobile/CPUProfilingAllocator.h            |   2 +-
 docs/source/rpc/distributed_autograd.rst      |   4 +-
 test/cpp/jit/CMakeLists.txt                   |   1 +
 test/cpp/jit/test_alias_analysis.cpp          |  25 +
 test/cpp/jit/test_union.cpp                   | 149 ++++
 .../TestScript.test_annot_ast_mypy_fn.expect  |   2 +-
 ...stScript.test_annot_ast_mypy_method.expect |   2 +-
 .../TestScript.test_annot_ast_py3_fn.expect   |   2 +-
 ...estScript.test_annot_ast_py3_method.expect |   2 +-
 ...estScript.test_annot_string_mypy_fn.expect |   2 +-
 ...cript.test_annot_string_mypy_method.expect |   2 +-
 ...TestScript.test_annot_string_py3_fn.expect |   2 +-
 ...Script.test_annot_string_py3_method.expect |   2 +-
 test/jit/test_list_dict.py                    |   6 +-
 test/jit/test_typing.py                       |  16 +-
 test/jit/test_union.py                        | 657 ++++++++++++++++++
 test/test_jit.py                              |  57 +-
 test/test_ops.py                              |   2 +
 test/test_public_bindings.py                  |   1 +
 torch/_C/__init__.pyi.in                      |   3 +
 torch/_jit_internal.py                        |  54 +-
 torch/csrc/jit/OVERVIEW.md                    |   9 +-
 torch/csrc/jit/frontend/convert_to_ssa.cpp    |   6 +-
 torch/csrc/jit/frontend/exit_transforms.cpp   |   6 +-
 torch/csrc/jit/frontend/ir_emitter.cpp        | 307 +++++---
 torch/csrc/jit/frontend/schema_matching.h     |   7 +-
 .../csrc/jit/frontend/schema_type_parser.cpp  |  13 +
 .../csrc/jit/frontend/script_type_parser.cpp  |  33 +-
 torch/csrc/jit/ir/alias_analysis.cpp          | 292 +++++---
 torch/csrc/jit/ir/alias_analysis.h            |  30 +-
 torch/csrc/jit/ir/ir.cpp                      |  17 +-
 torch/csrc/jit/ir/ir.h                        |  29 +-
 torch/csrc/jit/mobile/type_parser.cpp         |  11 +
 torch/csrc/jit/passes/shape_analysis.cpp      |  18 +
 torch/csrc/jit/passes/utils/memory_dag.cpp    |  11 +-
 torch/csrc/jit/passes/utils/memory_dag.h      |  26 +-
 torch/csrc/jit/python/pybind_utils.cpp        |  13 +
 torch/csrc/jit/python/python_ir.cpp           |   6 +
 torch/csrc/jit/serialization/import.cpp       |   3 +-
 .../csrc/jit/serialization/import_source.cpp  |  13 +-
 torch/csrc/jit/serialization/python_print.cpp |  28 +-
 torch/csrc/jit/serialization/unpickler.cpp    |  17 +-
 torch/jit/_script.py                          |   2 +-
 torch/jit/annotations.py                      |  28 +-
 torch/jit/frontend.py                         |   1 +
 50 files changed, 2132 insertions(+), 462 deletions(-)
 create mode 100644 test/cpp/jit/test_union.cpp
 create mode 100644 test/jit/test_union.py

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 93de9b022ee6f..e102de7be6334 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -435,12 +435,12 @@ is `./build/bin/FILENAME --gtest_filter=TESTSUITE.TESTNAME`, where
 `TESTNAME` is the name of the test you'd like to run and `TESTSUITE` is
 the suite that test is defined in.
 
-For example, if you wanted to run the test ` MayContainAlias`, which
+For example, if you wanted to run the test `MayContainAlias`, which
 is part of the test suite `ContainerAliasingTest` in the file
 `test/cpp/jit/test_alias_analysis.cpp`, the command would be:
 
 ```bash
-./build/bin/test_jit --gtest_filter=ContainerAliasingTest.UnionAliasing
+./build/bin/test_jit --gtest_filter=ContainerAliasingTest.MayContainAlias
 ```
 
 
diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h
index eee5acaccd655..4284e296229cc 100644
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@@ -30,6 +30,9 @@ struct FunctionSchema;
 struct NamedType;
 using OptNameList = c10::optional<std::vector<std::string>>;
 
+void standardizeVectorForUnion(std::vector<TypePtr>& reference, std::vector<TypePtr>* to_fill);
+void standardizeVectorForUnion(std::vector<TypePtr>* to_flatten);
+
 struct AnyType;
 using AnyTypePtr = std::shared_ptr<AnyType>;
 // Any is the top of the type hierarchy, all other types are subtypes
@@ -94,25 +97,84 @@ struct SingleElementType : public Type {
   TypePtr elem;
 };
 
+struct UnionType;
+using UnionTypePtr = std::shared_ptr<UnionType>;
+struct TORCH_API UnionType : public Type {
+  friend struct Type;
+
+  static const TypeKind Kind = TypeKind::UnionType;
+
+  bool isSubtypeOfExt(const TypePtr& rhs_, std::ostream* why_not) const override;
+
+  std::string str() const override;
+
+  static UnionTypePtr create(std::vector<TypePtr> reference);
+
+  bool operator==(const Type& rhs) const override;
+
+  at::ArrayRef<TypePtr> containedTypes() const override {
+    return types_;
+  }
+
+  // For testing purposes only
+  at::ArrayRef<TypePtr> getTypes() const {
+    return types_;
+  }
+
+  TypePtr createWithContained(std::vector<TypePtr> contained_types) const override {
+    return create(contained_types);
+  }
+
+  bool canHoldType(TypePtr type) const;
+
+  bool hasFreeVariables() const override {
+    return has_free_variables_;
+  }
+
+  c10::optional<TypePtr> toOptional() const;
+
+  c10::optional<TypePtr> subtractTypeSet(std::vector<TypePtr>& to_subtract) const;
+
+ protected:
+    explicit UnionType(std::vector<TypePtr> types, TypeKind kind=TypeKind::UnionType);
+    std::string annotation_str_impl(TypePrinter printer = nullptr) const override;
+    std::string unionStr(TypePrinter printer = nullptr, bool is_annotation_str = false) const;
+    // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
+    bool has_free_variables_;
+    // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
+    std::vector<TypePtr> types_;
+    // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
+    bool can_hold_none_;
+
+};
+
 struct OptionalType;
 using OptionalTypePtr = std::shared_ptr<OptionalType>;
-// This type represents an optional type, for each element type.
-// Optional[T] can accept both T and None(nullopt in C++)
+// This type represents an optional type. There is one `Optional` for
+// each element type. `Optional[T]` can accept both `T` and
+// `None`(`c10::nullopt` in C++)
 // Subtype hierarchy for Optional:
-// 1. Optional[T] <: Optional[R] iff T <: R
-// 2. T <: Optional[R] if T <: R
-// 3. None <: Optional[T] for all T
-struct TORCH_API OptionalType
-    : public SingleElementType<TypeKind::OptionalType, OptionalType> {
-  static OptionalTypePtr create(TypePtr element) {
-    TORCH_INTERNAL_ASSERT(element, "OptionalType requires valid TypePtr");
-    // Optional is a union of [None, T], so Optional[[Optional[T]]] ->
-    // Optional[T]
-    if (auto opt_ptr = element->cast<OptionalType>()) {
-      return opt_ptr;
-    }
-    return OptionalTypePtr(
-        new OptionalType(std::move(element))); // NOLINT(modernize-make-shared)
+//     - Optional[T] <: Optional[R] iff T <: R
+//     - T <: Optional[R] if T <: R
+//     - None <: Optional[T] for all T
+//     - Optional[T] == Union[T, None] for all T
+struct TORCH_API OptionalType : public UnionType {
+  static OptionalTypePtr create(TypePtr contained) {
+    return OptionalTypePtr(new OptionalType(std::move(contained)));
+  }
+
+  static const TypeKind Kind = TypeKind::OptionalType;
+
+  friend struct Type;
+
+  bool operator==(const Type& rhs) const override;
+
+  TypePtr getElementType() const {
+    return contained_;
+  }
+
+  at::ArrayRef<TypePtr> containedTypes() const override {
+    return contained_;
   }
 
   std::string str() const override {
@@ -127,20 +189,15 @@ struct TORCH_API OptionalType
     return create(contained_types[0]);
   }
 
-  bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override {
-    if (Type::isSubtypeOfExt(rhs, why_not)) {
-      return true;
-    }
-    if (auto rhs_ = rhs->cast<OptionalType>()) {
-      return getElementType()->isSubtypeOfExt(rhs_->getElementType(), why_not);
-    }
-    return false;
-  }
+  bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override;
+
   // common cast Optional[Tensor] for undefined tensor type
   static OptionalTypePtr ofTensor();
 
  private:
-  OptionalType(TypePtr elem) : SingleElementType(elem) {}
+  explicit OptionalType(TypePtr contained);
+
+  TypePtr contained_;
 
   std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
     std::stringstream ss;
@@ -908,7 +965,6 @@ struct TORCH_API RRefType
   }
 };
 
-
 struct NamedType;
 using NamedTypePtr = std::shared_ptr<NamedType>;
 using ConstNamedTypePtr = std::shared_ptr<const NamedType>;
@@ -1112,7 +1168,6 @@ struct TORCH_API EnumType : public NamedType {
   std::weak_ptr<::torch::jit::CompilationUnit> cu_;
 };
 
-
 // the common supertype of all Enums, only used in operator registraion.
 // EnumType <: AnyEnumType for all Enums
 struct AnyEnumType;
@@ -1132,7 +1187,6 @@ struct TORCH_API AnyEnumType : public Type {
   : Type(TypeKind::AnyEnumType) {}
 };
 
-
 struct NumberType;
 using NumberTypePtr = std::shared_ptr<NumberType>;
 // This type represents a Python number
@@ -1141,9 +1195,10 @@ using NumberTypePtr = std::shared_ptr<NumberType>;
 // FloatType <: NumberType
 // ComplexType <:NumberType
 struct TORCH_API NumberType : public Type {
-  bool operator==(const Type& rhs) const override {
-    return rhs.kind() == kind();
-  }
+  bool operator==(const Type& rhs) const override;
+
+  bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override;
+
   std::string str() const override {
     return "Scalar"; // match what PythonArgParser says for clarity
   }
@@ -1172,7 +1227,8 @@ struct TORCH_API FloatType : public NumberType {
     return "float";
   }
   bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override {
-    return rhs->kind() == TypeKind::NumberType || NumberType::isSubtypeOfExt(rhs, why_not);
+    // NOLINTNEXTLINE(bugprone-parent-virtual-call)
+    return rhs->kind() == TypeKind::NumberType || Type::isSubtypeOfExt(rhs, why_not);
   }
   static const TypeKind Kind = TypeKind::FloatType;
   // global singleton
@@ -1196,7 +1252,8 @@ struct TORCH_API ComplexType : public NumberType {
     return "complex";
   }
   bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override {
-    return rhs->kind() == TypeKind::NumberType || NumberType::isSubtypeOfExt(rhs, why_not);
+    // NOLINTNEXTLINE(bugprone-parent-virtual-call)
+    return rhs->kind() == TypeKind::NumberType || Type::isSubtypeOfExt(rhs, why_not);
   }
   static const TypeKind Kind = TypeKind::ComplexType;
   // global singleton
@@ -1220,7 +1277,8 @@ struct TORCH_API IntType : public NumberType {
     return "int";
   }
   bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const override {
-    return rhs->kind() == TypeKind::NumberType || NumberType::isSubtypeOfExt(rhs, why_not);
+    // NOLINTNEXTLINE(bugprone-parent-virtual-call)
+    return rhs->kind() == TypeKind::NumberType || Type::isSubtypeOfExt(rhs, why_not);
   }
   static const TypeKind Kind = TypeKind::IntType;
   // global singleton
@@ -1334,12 +1392,8 @@ struct TORCH_API NoneType : public Type {
   std::string str() const override {
     return "NoneType";
   }
-  bool isSubtypeOfExt(const TypePtr& rhs, std::ostream *why_not) const override {
-    if (rhs->kind() == OptionalType::Kind) {
-      return true;
-    }
-    return Type::isSubtypeOfExt(rhs, why_not);
-  }
+  bool isSubtypeOfExt(const TypePtr& rhs, std::ostream *why_not) const override;
+
   static const TypeKind Kind = TypeKind::NoneType;
   // global singleton
   static NoneTypePtr get();
@@ -1524,8 +1578,15 @@ TORCH_API std::ostream& operator<<(std::ostream& os, const Stride& s);
 // what is the type, ignoring extra size/shape information?
 // e.g. Tensor(2x3) -> Dynamic, and Tuple(Tensor(2x3),...) -> Tuple(Dynamic,...)
 
-// xxx: be careful with calls because this can be very slow. If calling this on a graph
-// use `EraseShapeInformation` in shape_analysis.h
+// `unshapedType` is used to remove Tensor subtypes. We treat all Tensor
+// subtypes as simply "Tensor"; we also create a new version of any
+// container types in which internal Tensors have undergone the same
+// operation. This is used for type comparisons between two Tensor types
+// (`unshapedType` means that we don't falsely return `false` for e.g.
+// Tensors of different dimensions). It's also used in the alias
+// analysis pass.
+// Be careful with calls because this can be very slow. If calling this
+// on a graph, use `EraseShapeInformation` in shape_analysis.h
 inline TypePtr unshapedType(const TypePtr& type) {
   if (type->isSubtypeOf(TensorType::get())) {
     return TensorType::get();
@@ -1569,27 +1630,32 @@ inline at::ScalarType scalarTypeFromJitType(const c10::TypePtr& type) {
   return *result;
 }
 
-// Attempt to find the correct supertype of t1 and t2. If none is found then
-// nullopt will be returned if default_to_any is false, and Any will be returned
-// if it is true. If t1 == t2, or t1 is a type refinement of t2,
-// then t2 will be returned (and vice versa).
+// Attempt to find the correct supertype of the two types `t1` and `t2`.
+// If no supertype is found, then nullopt will be returned if
+// `default_to_union` is false, and `Union[t1, t2]` will be returned
+// if it is true. If `t1 == t2`, or `t1` is a type refinement of `t2`,
+// then `t2` will be returned (and vice versa).
+//
 // Two different tensortypes will return dynamic.
-// Currently we chose not to support returning a NumberType for a float & int
-// input because of a lack of operator support for NumberType.
+//
+// Currently we chose not to support returning a NumberType for
+// two types from the set of {FloatType, IntType, ComplexType}, because
+// there is a lack of operator support for NumberType.
+//
 // If `type_hint` is an `InterfaceType`, then we can use that as a
 // potential supertype for `ClassType`s in the list. Otherwise, we have
 // no way to find and use some common interface type
 TORCH_API c10::optional<TypePtr> unifyTypes(
     const TypePtr& t1,
     const TypePtr& t2,
-    bool default_to_any = false,
-    TypePtr type_hint=nullptr);
+    bool default_to_union = false,
+    TypePtr type_hint = nullptr);
 
 TORCH_API c10::optional<TypePtr> unifyTypeList(
     at::ArrayRef<TypePtr> elements,
     std::ostream& why_not,
-    bool default_to_any=false,
-    TypePtr type_hint=nullptr);
+    bool default_to_union = false,
+    TypePtr type_hint = nullptr);
 
 namespace detail {
 template <typename T>
diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h
index dbb4a62f73088..a9be1e8d68658 100644
--- a/aten/src/ATen/core/jit_type_base.h
+++ b/aten/src/ATen/core/jit_type_base.h
@@ -21,7 +21,7 @@ namespace c10 {
   _(DictType)               \
   _(NumberType)             \
   _(FloatType)              \
-  _(ComplexType)      \
+  _(ComplexType)            \
   _(FutureType)             \
   _(RRefType)               \
   _(IntType)                \
@@ -44,7 +44,8 @@ namespace c10 {
   _(ScalarTypeType)         \
   _(AnyListType)            \
   _(AnyTupleType)           \
-  _(AnyClassType)
+  _(AnyClassType)           \
+  _(UnionType)
 
 enum class TypeKind {
 #define DEFINE_TYPE(T) T,
@@ -203,7 +204,7 @@ struct TORCH_API Type : std::enable_shared_from_this<Type> {
   // contained_types
   TypePtr withContained(std::vector<TypePtr> contained_types) {
     auto current_contained = containedTypes();
-    AT_ASSERT(current_contained.size() == contained_types.size());
+    TORCH_INTERNAL_ASSERT(current_contained.size() == contained_types.size());
     if (current_contained.equals(contained_types)) {
       return shared_from_this();
     }
diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp
index 6bfba7b6d181a..fec0cb086ee51 100644
--- a/aten/src/ATen/core/type.cpp
+++ b/aten/src/ATen/core/type.cpp
@@ -265,7 +265,7 @@ AnyEnumTypePtr AnyEnumType::get() {
   return value;
 }
 
-c10::optional<TypePtr> unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_any=false, TypePtr type_hint=nullptr) {
+c10::optional<TypePtr> unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_union=false, TypePtr type_hint=nullptr) {
   // check direct subtyping relation
   if (t1->isSubtypeOf(t2)) {
     return t2;
@@ -308,7 +308,7 @@ c10::optional<TypePtr> unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool
     }
     std::vector<TypePtr> elements;
     for (size_t i = 0; i < tuple1->elements().size(); i++) {
-      if (auto elem = unifyTypes(tuple1->elements().at(i), tuple2->elements().at(i), default_to_any)) {
+      if (auto elem = unifyTypes(tuple1->elements().at(i), tuple2->elements().at(i), default_to_union)) {
         elements.push_back(*elem);
       } else {
         return c10::nullopt;
@@ -347,11 +347,11 @@ c10::optional<TypePtr> unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool
   return c10::nullopt;
 }
 
-c10::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_any, TypePtr type_hint) {
-  auto unified = unifyTypesImpl(t1, t2, default_to_any, type_hint);
+c10::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2, bool default_to_union, TypePtr type_hint) {
+  auto unified = unifyTypesImpl(t1, t2, default_to_union, type_hint);
 
-  if (default_to_any && !unified) {
-    return AnyType::get();
+  if (default_to_union && !unified) {
+    return UnionType::create({t1, t2});
   }
 
   return unified;
@@ -360,7 +360,7 @@ c10::optional<TypePtr> unifyTypes(const TypePtr& t1, const TypePtr& t2, bool def
 c10::optional<TypePtr> unifyTypeList(
     at::ArrayRef<TypePtr> elements,
     std::ostream& why_not,
-    bool default_to_any,
+    bool default_to_union,
     TypePtr type_hint) {
   if (elements.size() == 0) {
     why_not << "Cannot get unified type from empty list";
@@ -369,7 +369,7 @@ c10::optional<TypePtr> unifyTypeList(
 
   TypePtr ret_type = elements.at(0);
   for (size_t i = 1; i < elements.size() && ret_type; ++i) {
-    c10::optional<TypePtr> maybe_unified = unifyTypes(ret_type, elements.at(i), default_to_any, type_hint);
+    c10::optional<TypePtr> maybe_unified = unifyTypes(ret_type, elements.at(i), default_to_union, type_hint);
     if (!maybe_unified) {
       why_not << "Could not unify type list since element " << i << " of type "
               << elements.at(i)->repr_str()
@@ -547,8 +547,9 @@ TORCH_API TypePtr tryEvalTypeVariables(TypePtr type, std::unordered_map<std::str
 }
 
 TORCH_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type) {
-  if (elem_type->kind() == OptionalType::Kind ||
-      elem_type->kind() == NumberType::Kind) {
+  if (elem_type->kind() == UnionType::Kind
+      || elem_type->kind() == OptionalType::Kind
+      || elem_type->kind() == NumberType::Kind) {
     // Builtin Union types
     return false;
   }
@@ -577,8 +578,16 @@ bool Type::isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const {
   if (rhs->kind() == TypeKind::AnyType || *this == *rhs) {
     return true;
   }
-  if(auto rhs_ = rhs->cast<OptionalType>()) {
-    return this->isSubtypeOfExt(rhs_->getElementType(), why_not);
+  if (auto opt_rhs = rhs->cast<OptionalType>()) {
+    return this->isSubtypeOfExt(opt_rhs->getElementType(), why_not);
+  }
+  if (auto union_rhs = rhs->cast<UnionType>()) {
+    // Check if `this` is a subtype of any of the types within the Union
+    return std::any_of(union_rhs->containedTypes().begin(),
+                       union_rhs->containedTypes().end(),
+                       [&](TypePtr inner) {
+                         return this->isSubtypeOfExt(inner, why_not);
+                       });
   }
   return false;
 }
@@ -808,6 +817,453 @@ TupleTypePtr TupleType::createNamed(const c10::optional<c10::QualifiedName>& qua
       field_types, qualName, schema)); // NOLINT(modernize-make-shared)
 }
 
+bool NoneType::isSubtypeOfExt(const TypePtr& rhs, std::ostream *why_not) const {
+  if (rhs->kind() == OptionalType::Kind) {
+    return true;
+  }
+  return Type::isSubtypeOfExt(rhs, why_not);
+}
+
+// Remove nested Optionals/Unions during the instantiation of a Union or
+// an Optional. This populates `types` with all the types found during
+// flattening. At the end of `flattenUnion`, `types` may have
+// duplicates, but it will not have nested Optionals/Unions
+void flattenUnion(TypePtr& type, std::vector<TypePtr>* to_fill) {
+  if (auto union_type = type->cast<UnionType>()) {
+    for (auto inner : union_type->containedTypes()) {
+      flattenUnion(inner, to_fill);
+    }
+  } else if (auto opt_type = type->cast<OptionalType>()) {
+    auto inner = opt_type->getElementType();
+    flattenUnion(inner, to_fill);
+    to_fill->emplace_back(NoneType::get());
+  } else if (type->kind() == NumberType::Kind) {
+    to_fill->emplace_back(IntType::get());
+    to_fill->emplace_back(FloatType::get());
+    to_fill->emplace_back(ComplexType::get());
+  } else {
+    to_fill->emplace_back(type);
+  }
+}
+
+// Helper function for `standardizeUnion`
+//
+// NB: If we have types `T1`, `T2`, `T3`, and `PARENT_T` such that `T1`,
+// `T2`, and `T2` are children of `PARENT_T`, then `unifyTypes(T1, T2)`
+// will return `PARENT_T`. This could be a problem if we didn't want our
+// Union to also be able to take `T3 `. In our current type hierarchy,
+// this isn't an issue--most types SHOULD be unified even if the parent
+// type wasn't in the original vector. However, later additions to the
+// type system might necessitate reworking `get_supertype`
+void filterDuplicateSubtypes(std::vector<TypePtr>* types) {
+  if (types->empty()) {
+    return;
+  }
+  auto get_supertype = [](const TypePtr t1, const TypePtr t2) -> c10::optional<TypePtr> {
+    // We don't want nested Optionals. Also, prematurely unifying to
+    // `Optional` could prevent us from coalescing other types
+    if ((t1->isSubtypeOf(NoneType::get()) && !t2->isSubtypeOf(NoneType::get()))
+        || (!t1->isSubtypeOf(NoneType::get()) && t2->isSubtypeOf(NoneType::get()))) {
+          return c10::nullopt;
+    } else {
+      return unifyTypes(t1, t2, /*default_to_union=*/false);
+    }
+  };
+
+  // Coalesce types and delete all duplicates. Moving from right to left
+  // through the vector, we try to unify the current element (`i`) with
+  // each element (`j`) before the "new" end of the vector (`end`).
+  // If we're able to unify the types at `types[i]` and `types[j]`, we
+  // decrement `end`, swap `types[j]` with the unified type, and
+  // break. Otherwise, we keep `end` where it is to signify that the
+  // new end of the vector hasn't shifted
+  size_t end_idx = types->size()-1;
+  for (size_t i = types->size()-1; i > 0; --i) {
+    for (size_t j = std::min(i-1, end_idx); ; --j) {
+      c10::optional<TypePtr> unified;
+      unified = get_supertype((*types)[i], (*types)[j]);
+      if (unified) {
+        (*types)[j] = *unified;
+        (*types)[i] = (*types)[end_idx];
+        --end_idx;
+        break;
+      }
+      // Break condition here so we don't get `j = 0; j = j-1` and end
+      // up with MAX_INT
+      if (j == 0) {
+        break;
+      }
+    }
+  }
+  // Cut off the vector's tail so that `end` is the real last element
+  types->erase(types->begin() + end_idx + 1, types->end());
+
+}
+
+void sortUnion(std::vector<TypePtr>* types) {
+  // We want the elements to be sorted so we can easily compare two
+  // UnionType objects for equality in the future. Note that this order
+  // is guaranteed to be stable since we've already coalesced any
+  // possible types
+  std::sort(types->begin(), types->end(),
+          [](const TypePtr a, const TypePtr b) -> bool {
+            if (a->kind() != b->kind()) {
+              return a->kind() < b->kind();
+            }
+            return a->str() < b->str();
+          });
+}
+
+void standardizeVectorForUnion(std::vector<TypePtr>& reference, std::vector<TypePtr>* to_fill) {
+  for (auto type : reference) {
+    flattenUnion(type, to_fill);
+  }
+  filterDuplicateSubtypes(to_fill);
+  sortUnion(to_fill);
+}
+
+void standardizeVectorForUnion(std::vector<TypePtr>* to_flatten) {
+  TORCH_INTERNAL_ASSERT(to_flatten, "`standardizeVectorForUnion` was ",
+                        "passed a `nullptr`");
+  std::vector<TypePtr> to_fill;
+  standardizeVectorForUnion(*to_flatten, &to_fill);
+  *to_flatten = to_fill;
+}
+
+UnionType::UnionType(std::vector<TypePtr> reference, TypeKind kind) : Type(kind) {
+  TORCH_INTERNAL_ASSERT(!reference.empty(), "Cannot create an empty Union");
+
+  standardizeVectorForUnion(reference, &types_);
+
+  // Gate the assert in a regular conditional so that we don't create
+  // this long error message unnecessarily
+  if (types_.size() == 1) {
+    std::stringstream msg;
+    msg << "After type unification was performed, the Union with the "
+        << "original types {";
+    for (auto i = 0; i < reference.size(); ++i) {
+      msg << reference[i]->repr_str();
+      if (i > 0) {
+        msg << ",";
+      }
+      msg << " ";
+    }
+    msg << "} has the single type " << types_[0]->repr_str()
+         << ". Use the common supertype instead of creating a Union"
+         << "type";
+    TORCH_INTERNAL_ASSERT(false, msg.str());
+  }
+
+  can_hold_none_ = false;
+  has_free_variables_ = false;
+
+  for (const TypePtr& type : types_) {
+    if (type->kind() == NoneType::Kind) {
+      can_hold_none_ = true;
+    }
+    if (type->hasFreeVariables()) {
+      has_free_variables_ = true;
+    }
+  }
+
+}
+
+UnionTypePtr UnionType::create(std::vector<TypePtr> reference) {
+  auto union_type = new UnionType(std::move(reference));
+
+  // Some very special-cased logic for `Optional`. This will be deleted
+  // in a later PR
+  bool int_found = false;
+  bool float_found = false;
+  bool complex_found = false;
+  bool nonetype_found = false;
+
+  auto update_is_opt_flags = [&](TypePtr t) {
+    if (t == IntType::get()) {
+      int_found = true;
+    } else if (t == FloatType::get()) {
+      float_found  = true;
+    } else if (t == ComplexType::get()) {
+      complex_found = true;
+    } else if (t == NoneType::get()) {
+      nonetype_found = true;
+    }
+  };
+
+  for (const auto& t : union_type->containedTypes()) {
+    update_is_opt_flags(t);
+  }
+
+  bool numbertype_found = int_found && float_found && complex_found;
+
+  if (nonetype_found) {
+    if (union_type->containedTypes().size() == 4 && numbertype_found) {
+      return OptionalType::create(NumberType::get());
+    }
+    if (union_type->containedTypes().size() == 2) {
+      auto not_none = union_type->containedTypes()[0] != NoneType::get()
+                      ? union_type->containedTypes()[0]
+                      : union_type->containedTypes()[1];
+      return OptionalType::create(not_none);
+    }
+  }
+
+  return UnionTypePtr(union_type);
+}
+
+bool UnionType::operator==(const Type& rhs) const {
+  if (auto union_rhs = rhs.cast<UnionType>()) {
+    // We can't compare the type vectors for equality using `operator=`,
+    // because the vectors hold `TypePtr`s and we want to compare `Type`
+    // equality
+    if (union_rhs->containedTypes().size() != this->containedTypes().size()) {
+      return false;
+    }
+    // Check that all the types in `this->types_` are also in
+    // `union_rhs->types_`
+    return std::all_of(this->containedTypes().begin(), this->containedTypes().end(),
+                       [&](TypePtr lhs_type) {
+                         return std::any_of(union_rhs->containedTypes().begin(),
+                                            union_rhs->containedTypes().end(),
+                                            [&](TypePtr rhs_type) {
+                                              return *lhs_type == *rhs_type;
+                                            });
+                       });
+  } else if (auto optional_rhs = rhs.cast<OptionalType>()) {
+    if (optional_rhs->getElementType() == NumberType::get()) {
+      return this->containedTypes().size() == 4
+             && this->can_hold_none_
+             && this->canHoldType(NumberType::get());
+    }
+    auto optional_lhs = this->toOptional();
+    return optional_lhs && *optional_rhs == *((optional_lhs.value())->expect<OptionalType>());
+  } else if (rhs.kind() == NumberType::Kind) {
+    return this->containedTypes().size() == 3 && canHoldType(NumberType::get());
+  } else {
+    return false;
+  }
+}
+
+bool UnionType::isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const {
+  std::vector<TypePtr> rhs_types;
+  if (const auto union_rhs = rhs->cast<UnionType>()) {
+    // Fast path
+    if (this->containedTypes() == rhs->containedTypes()) {
+      return true;
+    }
+    rhs_types = rhs->containedTypes().vec();
+  } else if (const auto optional_rhs = rhs->cast<OptionalType>()) {
+    rhs_types.push_back(NoneType::get());
+    if (optional_rhs->getElementType() == NumberType::get()) {
+      std::vector<TypePtr> number_types{IntType::get(), FloatType::get(), ComplexType::get()};
+      rhs_types.insert(rhs_types.end(), number_types.begin(), number_types.end());
+    } else {
+      rhs_types.push_back(optional_rhs->getElementType());
+    }
+  } else if (const auto number_rhs = rhs->cast<NumberType>()) {
+    std::vector<TypePtr> number_types{IntType::get(), FloatType::get(), ComplexType::get()};
+    rhs_types.insert(rhs_types.end(), number_types.begin(), number_types.end());
+  } else {
+    rhs_types.push_back(rhs);
+  }
+  return std::all_of(this->containedTypes().begin(), this->containedTypes().end(),
+                     [&](TypePtr lhs_type) -> bool {
+                      return std::any_of(rhs_types.begin(),
+                                         rhs_types.end(),
+                                         [&](TypePtr rhs_type) -> bool {
+                                           return lhs_type->isSubtypeOfExt(rhs_type, why_not);
+                                         });
+  });
+}
+
+
+std::string UnionType::unionStr(TypePrinter printer, bool is_annotation_str) const {
+  std::stringstream ss;
+
+  bool can_hold_numbertype = this->canHoldType(NumberType::get());
+
+  std::vector<TypePtr> number_types{IntType::get(), FloatType::get(), ComplexType::get()};
+
+  auto is_numbertype = [&](TypePtr lhs) {
+    for (const auto& rhs : number_types) {
+      if (*lhs == *rhs) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  ss << "Union[";
+  bool printed = false;
+  for (size_t i = 0; i < types_.size(); ++i) {
+    if (!can_hold_numbertype || !is_numbertype(types_[i])) {
+      if (i > 0) {
+        ss << ", ";
+        printed = true;
+      }
+      if (is_annotation_str) {
+        ss << this->containedTypes()[i]->annotation_str(printer);
+      } else {
+        ss << this->containedTypes()[i]->str();
+      }
+    }
+  }
+  if (can_hold_numbertype) {
+    if (printed) {
+      ss << ", ";
+    }
+    if (is_annotation_str) {
+      ss << NumberType::get()->annotation_str(printer);
+    } else {
+      ss << NumberType::get()->str();
+    }
+  }
+  ss << "]";
+  return ss.str();
+}
+
+std::string UnionType::str() const {
+  return this->unionStr(nullptr, /*is_annotation_str=*/false);
+}
+
+std::string UnionType::annotation_str_impl(TypePrinter printer) const {
+  return this->unionStr(printer, /*is_annotation_str=*/true);
+}
+
+bool UnionType::canHoldType(TypePtr type) const {
+  if (type == NumberType::get()) {
+    return canHoldType(IntType::get())
+           && canHoldType(FloatType::get())
+           && canHoldType(ComplexType::get());
+  } else {
+    return std::any_of(this->containedTypes().begin(), this->containedTypes().end(),
+                    [&](TypePtr inner) {
+                      return type->isSubtypeOf(inner);
+                    });
+  }
+}
+
+c10::optional<TypePtr> UnionType::toOptional() const {
+  if (!canHoldType(NoneType::get())) {
+      return c10::nullopt;
+  }
+
+  std::vector<TypePtr> copied_types = this->containedTypes().vec();
+
+  auto maybe_opt = UnionType::create(std::move(copied_types));
+
+  if (maybe_opt->kind() == UnionType::Kind) {
+    return c10::nullopt;
+  } else {
+    return maybe_opt;
+  }
+}
+
+c10::optional<TypePtr> UnionType::subtractTypeSet(std::vector<TypePtr>& to_subtract) const {
+  std::vector<TypePtr> types;
+
+  // Given a TypePtr `lhs`, this function says whether or not `lhs` (or
+  // one of its parent types) is in the `to_subtract` vector
+  auto should_subtract = [&](TypePtr lhs) -> bool {
+    return std::any_of(to_subtract.begin(), to_subtract.end(),
+                        [&](TypePtr rhs) {
+                          return lhs->isSubtypeOf(rhs);
+                        });
+  };
+
+  // Copy all the elements that should NOT be subtracted to the `types`
+  // vector
+  std::copy_if(this->containedTypes().begin(), this->containedTypes().end(),
+              std::back_inserter(types),
+              [&](const TypePtr t) {
+                return !should_subtract(t);
+              });
+
+  if (types.size() == 0) {
+    return c10::nullopt;
+  } else if (types.size() == 1) {
+    return types[0];
+  } else {
+    return UnionType::create(std::move(types));
+  }
+}
+
+OptionalType::OptionalType(TypePtr contained)
+                           : UnionType({contained, NoneType::get()}, TypeKind::OptionalType) {
+  bool is_numbertype = false;
+  if (auto as_union = contained->cast<UnionType>()) {
+    is_numbertype = as_union->containedTypes().size() == 3 &&
+                    as_union->canHoldType(NumberType::get());
+  }
+  if (UnionType::containedTypes().size() == 2) {
+    contained_ = UnionType::containedTypes()[0]->kind()!= NoneType::Kind
+                 ? UnionType::containedTypes()[0]
+                 : UnionType::containedTypes()[1];
+  } else if (contained == NumberType::get() || is_numbertype) {
+    contained_ = NumberType::get();
+    types_.clear();
+    types_.push_back(NumberType::get());
+    types_.push_back(NoneType::get());
+  } else {
+    std::vector<TypePtr> to_subtract{NoneType::get()};
+    auto without_none = this->subtractTypeSet(to_subtract);
+    contained_ = UnionType::create({*without_none});
+  }
+  has_free_variables_ = contained_->hasFreeVariables();
+}
+
+bool OptionalType::operator==(const Type& rhs) const {
+  if (auto union_rhs = rhs.cast<UnionType>()) {
+    auto optional_rhs = union_rhs->toOptional();
+    // `**optional_rhs` = `*` to get value of `c10::optional<TypePtr>`,
+    // then `*` to dereference the pointer
+    return optional_rhs && *this == **optional_rhs;
+  } else if (auto optional_rhs = rhs.cast<OptionalType>()) {
+    return *this->getElementType() == *optional_rhs->getElementType();
+  } else {
+    return false;
+  }
+}
+
+bool OptionalType::isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const {
+  if (OptionalTypePtr optional_rhs = rhs->cast<OptionalType>()) {
+    return getElementType()->isSubtypeOfExt(optional_rhs->getElementType(), why_not);
+  } else if (UnionTypePtr union_rhs = rhs->cast<UnionType>()) {
+    if (!union_rhs->canHoldType(NoneType::get())) {
+      if (why_not) {
+        *why_not << rhs->repr_str() << " cannot hold None";
+      }
+      return false;
+    } else if (!union_rhs->canHoldType(this->getElementType())) {
+      if (why_not) {
+        *why_not << rhs->repr_str() << " cannot hold " << this->getElementType();
+      }
+      return false;
+    } else {
+      return true;
+    }
+  } else {
+    // NOLINTNEXTLINE(bugprone-argument-comment)
+    return Type::isSubtypeOfExt(rhs, why_not);
+  }
+}
+
+bool NumberType::operator==(const Type& rhs) const {
+  if (auto union_type = rhs.cast<UnionType>()) {
+    return union_type->containedTypes().size() == 3 && union_type->canHoldType(NumberType::get());
+  } else {
+    return rhs.kind() == this->kind();
+  }
+}
+
+bool NumberType::isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const {
+  if (auto union_type = rhs->cast<UnionType>()) {
+    return union_type->canHoldType(NumberType::get());
+  } else {
+    return Type::isSubtypeOfExt(rhs, why_not);
+  }
+}
+
 TupleType::TupleType(
     std::vector<TypePtr> elements,
     c10::optional<c10::QualifiedName> name,
@@ -1732,8 +2188,10 @@ size_t ClassType::addAttribute(
     TORCH_CHECK(
         (type->kind() == TensorType::Kind) ||
             (type->kind() == OptionalType::Kind &&
-            type->expectRef<OptionalType>().getElementType()->kind() ==
+            type->expect<OptionalType>()->getElementType()->kind() ==
                 TensorType::Kind) ||
+            (type->kind() == UnionType::Kind &&
+            TensorType::get()->isSubtypeOf(type->expect<UnionType>())) ||
             (type->kind() == NoneType::Kind),
         "Expecting parameter or buffer to have either None, Tensor or Optional[Tensor] type, but got: ",
         toString(type));
@@ -1880,7 +2338,9 @@ void SymbolicShape::dump() const {
 
 bool EnumType::isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const {
   return rhs->kind() == TypeKind::AnyType ||
-      rhs->kind() == TypeKind::AnyEnumType || *this == *rhs;
+      rhs->kind() == TypeKind::AnyEnumType ||
+      *this == *rhs ||
+      Type::isSubtypeOfExt(rhs, why_not);
 }
 
 } // namespace c10
diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md
index 8d2f2de7367d7..2c77fb348228b 100644
--- a/aten/src/ATen/native/README.md
+++ b/aten/src/ATen/native/README.md
@@ -7,8 +7,8 @@ Like all ATen methods/functions, native functions are made available
 from both ATen's C++ and Python APIs.  In C++, they are made available
 either as methods on `Tensor` (`t.mymeth()`) and functions in the ATen
 namespace (`at::myfunc()`).  In PyTorch, they are made available as
-methods on `Variable` or as functions on `torch._C._FunctionBase`
-(it is the user's responsibility to re-exporting these functions in
+methods on `Variable` or as functions on `torch._C._FunctionBase`.
+(It is the user's responsibility to re-export these functions in
 a more user-facing module.)
 
 The rest of this document describes how to implement an ATen function.
diff --git a/c10/mobile/CPUProfilingAllocator.h b/c10/mobile/CPUProfilingAllocator.h
index 5112691a62d6f..bb080d9df97c3 100644
--- a/c10/mobile/CPUProfilingAllocator.h
+++ b/c10/mobile/CPUProfilingAllocator.h
@@ -50,7 +50,7 @@ class C10_API AllocationPlanner {
  private:
   AllocationPlan* allocation_plan_{nullptr};
   // Maps allocated ptr to its allocation id.
-  // This is used when freeing the memory to lookup the allocation id
+  // This is used when freeing the memory to look up the allocation id
   // in order to establish the lifetime of a particular allocation.
   ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_;
   uint64_t allocation_id_{0};
diff --git a/docs/source/rpc/distributed_autograd.rst b/docs/source/rpc/distributed_autograd.rst
index 61af22b9486f5..71cf1f2fd3178 100644
--- a/docs/source/rpc/distributed_autograd.rst
+++ b/docs/source/rpc/distributed_autograd.rst
@@ -65,7 +65,7 @@ an RPC.
   input tensors. The output gradients of this function are sent to the source
   node to the appropriate ``send`` function during the backward pass.
 - Each ``send-recv`` pair is assigned a globally unique ``autograd_message_id``
-  to uniquely identify the pair. This is useful to lookup the corresponding
+  to uniquely identify the pair. This is useful to look up the corresponding
   function on a remote node during the backward pass.
 - For :ref:`rref`, whenever we call :meth:`torch.distributed.rpc.RRef.to_here`
   we attach an appropriate ``send-recv`` pair for the tensors involved.
@@ -98,7 +98,7 @@ This context serves the following purpose:
 2. During the forward pass we store the ``send`` and ``recv`` functions for
    each autograd pass in this context. This ensures we hold references to the
    appropriate nodes in the autograd graph to keep it alive. In addition to
-   this, it is easy to lookup the appropriate ``send`` and ``recv`` functions
+   this, it is easy to look up the appropriate ``send`` and ``recv`` functions
    during the backward pass.
 3. In general we also use this context to store some metadata for each
    distributed autograd pass.
diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt
index e766f33a250b2..8bd37a1fb8a59 100644
--- a/test/cpp/jit/CMakeLists.txt
+++ b/test/cpp/jit/CMakeLists.txt
@@ -66,6 +66,7 @@ set(JIT_TEST_SRCS
   ${JIT_TEST_ROOT}/test_subgraph_matcher.cpp
   ${JIT_TEST_ROOT}/test_subgraph_rewriter.cpp
   ${JIT_TEST_ROOT}/test_subgraph_utils.cpp
+  ${JIT_TEST_ROOT}/test_union.cpp
   ${JIT_TEST_ROOT}/test_utils.cpp
   ${JIT_TEST_ROOT}/test_script_profile.cpp
   ${JIT_TEST_ROOT}/test_jit_logging_levels.cpp
diff --git a/test/cpp/jit/test_alias_analysis.cpp b/test/cpp/jit/test_alias_analysis.cpp
index eef529d8d5d33..c92cb4da46dde 100644
--- a/test/cpp/jit/test_alias_analysis.cpp
+++ b/test/cpp/jit/test_alias_analysis.cpp
@@ -660,6 +660,31 @@ TEST(ContainerAliasingTest, PrimitveValuesDontAliasContainers) {
   }
 }
 
+TEST(ContainerAliasingTest, UnionAliasing) {
+  auto graph = std::make_shared<Graph>();
+  parseIR(
+      R"IR(
+  graph(%a : Dict(str, Tensor),
+        %b : Tensor[],
+        %c : Union(Dict(str, Tensor), Tensor[])):
+    return (%a, %b, %c)
+    )IR",
+      &*graph);
+
+  AliasDb aliasDb(graph);
+  auto a = graph->outputs().at(0);
+  auto b = graph->outputs().at(1);
+  auto c = graph->outputs().at(2);
+
+  EXPECT_TRUE(aliasDb.mayAlias(a, c));
+  EXPECT_TRUE(aliasDb.mayAlias(b, c));
+  EXPECT_TRUE(aliasDb.mayAlias(c, c));
+  EXPECT_FALSE(aliasDb.mayAlias(a, b));
+  EXPECT_TRUE(aliasDb.mayContainAlias(a, b));
+  EXPECT_TRUE(aliasDb.mayContainAlias(a, c));
+  EXPECT_TRUE(aliasDb.mayContainAlias(b, c));
+}
+
 TEST(ContainerAliasingTest, InputsCanAliasOutputs) {
   // Test input aliasing
   auto graph = std::make_shared<Graph>();
diff --git a/test/cpp/jit/test_union.cpp b/test/cpp/jit/test_union.cpp
new file mode 100644
index 0000000000000..f35acd35d1ed6
--- /dev/null
+++ b/test/cpp/jit/test_union.cpp
@@ -0,0 +1,149 @@
+#include <gtest/gtest.h>
+
+#include <ATen/core/jit_type.h>
+#include <test/cpp/jit/test_utils.h>
+#include <torch/csrc/jit/ir/ir.h>
+
+namespace torch {
+namespace jit {
+
+class UnionTypeTest : public ::testing::Test {
+ public:
+  // None
+  const TypePtr none = NoneType::get();
+
+  // List[str]
+  const TypePtr l1 = ListType::ofStrings();
+
+  // Optional[int]
+  const TypePtr opt1 = OptionalType::create(IntType::get());
+
+  // Optional[float]
+  const TypePtr opt2 = OptionalType::create(FloatType::get());
+
+  // Optional[List[str]]
+  const TypePtr opt3 = OptionalType::create(ListType::ofStrings());
+
+  // Tuple[Optional[int], int]
+  const TypePtr tup1 =
+      TupleType::create({OptionalType::create(IntType::get()), IntType::get()});
+
+  // Tuple[int, int]
+  const TypePtr tup2 = TupleType::create({IntType::get(), IntType::get()});
+
+  bool hasType(UnionTypePtr u, TypePtr t) {
+    auto res = std::find(u->getTypes().begin(), u->getTypes().end(), t);
+    return res != u->getTypes().end();
+  }
+};
+
+TEST_F(UnionTypeTest, UnionOperatorEquals) {
+  const UnionTypePtr u1 = UnionType::create({l1, tup2, StringType::get()});
+
+  // Same thing, but using different TypePtrs
+  const TypePtr l1_ = ListType::ofStrings();
+  const TypePtr tup2_ = TupleType::create({IntType::get(), IntType::get()});
+  const UnionTypePtr u2 = UnionType::create({l1_, tup2_, StringType::get()});
+
+  ASSERT_TRUE(*u1 == *u2);
+}
+
+TEST_F(UnionTypeTest, UnionCreate_OptionalT1AndOptionalT2) {
+  // Goal: Union[int, float, None]
+  const UnionTypePtr u = UnionType::create({opt1, opt2});
+
+  ASSERT_EQ(u->getTypes().size(), 3);
+  ASSERT_TRUE(UnionTypeTest::hasType(u, IntType::get()));
+  ASSERT_TRUE(UnionTypeTest::hasType(u, FloatType::get()));
+  ASSERT_TRUE(UnionTypeTest::hasType(u, NoneType::get()));
+}
+
+TEST_F(UnionTypeTest, UnionCreate_OptionalTAndT) {
+  // Goal: Union[int, None]
+  const UnionTypePtr u = UnionType::create({opt1, IntType::get()});
+
+  ASSERT_EQ(u->getTypes().size(), 2);
+  ASSERT_TRUE(UnionTypeTest::hasType(u, IntType::get()));
+  ASSERT_TRUE(UnionTypeTest::hasType(u, NoneType::get()));
+}
+
+TEST_F(UnionTypeTest, UnionCreate_TupleWithSubtypingRelationship) {
+  // Goal: Union[Tuple[Optional[int], int], str]
+  const UnionTypePtr u = UnionType::create({StringType::get(), tup1, tup2});
+
+  ASSERT_EQ(u->getTypes().size(), 2);
+  ASSERT_TRUE(UnionTypeTest::hasType(u, StringType::get()));
+  ASSERT_TRUE(UnionTypeTest::hasType(u, tup1));
+}
+
+TEST_F(UnionTypeTest, UnionCreate_ContainerTAndT) {
+  // Goal: Union[List[str], str]
+  const UnionTypePtr u = UnionType::create({l1, StringType::get()});
+
+  ASSERT_EQ(u->getTypes().size(), 2);
+  ASSERT_TRUE(UnionTypeTest::hasType(u, StringType::get()));
+  ASSERT_TRUE(UnionTypeTest::hasType(u, ListType::ofStrings()));
+}
+
+TEST_F(UnionTypeTest, UnionCreate_OptionalContainerTAndContainerTAndT) {
+  // Goal: Union[List[str], None, str]
+  const UnionTypePtr u = UnionType::create({l1, opt3, StringType::get()});
+
+  ASSERT_EQ(u->getTypes().size(), 3);
+  ASSERT_TRUE(UnionTypeTest::hasType(u, StringType::get()));
+  ASSERT_TRUE(UnionTypeTest::hasType(u, ListType::ofStrings()));
+}
+
+TEST_F(UnionTypeTest, Subtyping_NumberType) {
+  // Union[int, float, Complex]
+  const UnionTypePtr union1 =
+      UnionType::create({IntType::get(), FloatType::get(), ComplexType::get()});
+
+  // Union[int, float, Complex, None]
+  const UnionTypePtr union2 = UnionType::create(
+      {IntType::get(), FloatType::get(), ComplexType::get(), NoneType::get()});
+
+  const NumberTypePtr num = NumberType::get();
+
+  ASSERT_TRUE(num->isSubtypeOf(union1));
+  ASSERT_TRUE(union1->isSubtypeOf(num));
+  ASSERT_TRUE(*num == *union1);
+
+  ASSERT_TRUE(num->isSubtypeOf(union2));
+  ASSERT_FALSE(union2->isSubtypeOf(num));
+  ASSERT_FALSE(*num == *union2);
+}
+
+TEST_F(UnionTypeTest, Subtyping_OptionalType) {
+  // Union[int, None]
+  const UnionTypePtr union1 =
+      UnionType::create({IntType::get(), NoneType::get()});
+
+  // Union[int, str, None]
+  const UnionTypePtr union2 =
+      UnionType::create({IntType::get(), StringType::get(), NoneType::get()});
+
+  // Union[int, str, List[str]]
+  const UnionTypePtr union3 = UnionType::create(
+      {IntType::get(), StringType::get(), ListType::ofStrings()});
+
+  ASSERT_TRUE(none->isSubtypeOf(opt1));
+  ASSERT_TRUE(none->isSubtypeOf(union1));
+  ASSERT_TRUE(none->isSubtypeOf(union2));
+  ASSERT_FALSE(none->isSubtypeOf(union3));
+
+  ASSERT_FALSE(opt1->isSubtypeOf(none));
+  ASSERT_TRUE(opt1->isSubtypeOf(union1));
+  ASSERT_TRUE(opt1->isSubtypeOf(union2));
+  ASSERT_FALSE(opt1->isSubtypeOf(union3));
+
+  ASSERT_FALSE(union1->isSubtypeOf(none));
+  ASSERT_TRUE(union1->isSubtypeOf(opt1));
+  ASSERT_TRUE(union1->isSubtypeOf(union2));
+  ASSERT_FALSE(union1->isSubtypeOf(union3));
+
+  ASSERT_FALSE(union2->isSubtypeOf(union1));
+}
+
+} // namespace jit
+} // namespace torch
diff --git a/test/expect/TestScript.test_annot_ast_mypy_fn.expect b/test/expect/TestScript.test_annot_ast_mypy_fn.expect
index 4b15b27b48112..36888d04876ef 100644
--- a/test/expect/TestScript.test_annot_ast_mypy_fn.expect
+++ b/test/expect/TestScript.test_annot_ast_mypy_fn.expect
@@ -6,4 +6,4 @@ foo(bool x, (Tensor, Tensor) y) -> ((bool, bool))
 foo(float[3] x, (Tensor, Tensor) y) -> ((float[], float[]))
 foo(int[2] x, (Tensor, Tensor) y) -> ((int[], int[]))
 foo(int[] x, (Tensor, Tensor) y) -> ((int[], int[]))
-foo(int? x, (Tensor, Tensor) y) -> ((int?, int?))
\ No newline at end of file
+foo(int? x, (Tensor, Tensor) y) -> ((int?, int?))
diff --git a/test/expect/TestScript.test_annot_ast_mypy_method.expect b/test/expect/TestScript.test_annot_ast_mypy_method.expect
index 9c0dcd14deeec..b6c19a6002483 100644
--- a/test/expect/TestScript.test_annot_ast_mypy_method.expect
+++ b/test/expect/TestScript.test_annot_ast_mypy_method.expect
@@ -6,4 +6,4 @@ foo( self, bool x, (Tensor, Tensor) y) -> ((bool, bool))
 foo( self, float[3] x, (Tensor, Tensor) y) -> ((float[], float[]))
 foo( self, int[2] x, (Tensor, Tensor) y) -> ((int[], int[]))
 foo( self, int[] x, (Tensor, Tensor) y) -> ((int[], int[]))
-foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?))
\ No newline at end of file
+foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?))
diff --git a/test/expect/TestScript.test_annot_ast_py3_fn.expect b/test/expect/TestScript.test_annot_ast_py3_fn.expect
index 4b15b27b48112..36888d04876ef 100644
--- a/test/expect/TestScript.test_annot_ast_py3_fn.expect
+++ b/test/expect/TestScript.test_annot_ast_py3_fn.expect
@@ -6,4 +6,4 @@ foo(bool x, (Tensor, Tensor) y) -> ((bool, bool))
 foo(float[3] x, (Tensor, Tensor) y) -> ((float[], float[]))
 foo(int[2] x, (Tensor, Tensor) y) -> ((int[], int[]))
 foo(int[] x, (Tensor, Tensor) y) -> ((int[], int[]))
-foo(int? x, (Tensor, Tensor) y) -> ((int?, int?))
\ No newline at end of file
+foo(int? x, (Tensor, Tensor) y) -> ((int?, int?))
diff --git a/test/expect/TestScript.test_annot_ast_py3_method.expect b/test/expect/TestScript.test_annot_ast_py3_method.expect
index 9c0dcd14deeec..b6c19a6002483 100644
--- a/test/expect/TestScript.test_annot_ast_py3_method.expect
+++ b/test/expect/TestScript.test_annot_ast_py3_method.expect
@@ -6,4 +6,4 @@ foo( self, bool x, (Tensor, Tensor) y) -> ((bool, bool))
 foo( self, float[3] x, (Tensor, Tensor) y) -> ((float[], float[]))
 foo( self, int[2] x, (Tensor, Tensor) y) -> ((int[], int[]))
 foo( self, int[] x, (Tensor, Tensor) y) -> ((int[], int[]))
-foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?))
\ No newline at end of file
+foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?))
diff --git a/test/expect/TestScript.test_annot_string_mypy_fn.expect b/test/expect/TestScript.test_annot_string_mypy_fn.expect
index 4b15b27b48112..36888d04876ef 100644
--- a/test/expect/TestScript.test_annot_string_mypy_fn.expect
+++ b/test/expect/TestScript.test_annot_string_mypy_fn.expect
@@ -6,4 +6,4 @@ foo(bool x, (Tensor, Tensor) y) -> ((bool, bool))
 foo(float[3] x, (Tensor, Tensor) y) -> ((float[], float[]))
 foo(int[2] x, (Tensor, Tensor) y) -> ((int[], int[]))
 foo(int[] x, (Tensor, Tensor) y) -> ((int[], int[]))
-foo(int? x, (Tensor, Tensor) y) -> ((int?, int?))
\ No newline at end of file
+foo(int? x, (Tensor, Tensor) y) -> ((int?, int?))
diff --git a/test/expect/TestScript.test_annot_string_mypy_method.expect b/test/expect/TestScript.test_annot_string_mypy_method.expect
index 9c0dcd14deeec..b6c19a6002483 100644
--- a/test/expect/TestScript.test_annot_string_mypy_method.expect
+++ b/test/expect/TestScript.test_annot_string_mypy_method.expect
@@ -6,4 +6,4 @@ foo( self, bool x, (Tensor, Tensor) y) -> ((bool, bool))
 foo( self, float[3] x, (Tensor, Tensor) y) -> ((float[], float[]))
 foo( self, int[2] x, (Tensor, Tensor) y) -> ((int[], int[]))
 foo( self, int[] x, (Tensor, Tensor) y) -> ((int[], int[]))
-foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?))
\ No newline at end of file
+foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?))
diff --git a/test/expect/TestScript.test_annot_string_py3_fn.expect b/test/expect/TestScript.test_annot_string_py3_fn.expect
index 4b15b27b48112..36888d04876ef 100644
--- a/test/expect/TestScript.test_annot_string_py3_fn.expect
+++ b/test/expect/TestScript.test_annot_string_py3_fn.expect
@@ -6,4 +6,4 @@ foo(bool x, (Tensor, Tensor) y) -> ((bool, bool))
 foo(float[3] x, (Tensor, Tensor) y) -> ((float[], float[]))
 foo(int[2] x, (Tensor, Tensor) y) -> ((int[], int[]))
 foo(int[] x, (Tensor, Tensor) y) -> ((int[], int[]))
-foo(int? x, (Tensor, Tensor) y) -> ((int?, int?))
\ No newline at end of file
+foo(int? x, (Tensor, Tensor) y) -> ((int?, int?))
diff --git a/test/expect/TestScript.test_annot_string_py3_method.expect b/test/expect/TestScript.test_annot_string_py3_method.expect
index 9c0dcd14deeec..b6c19a6002483 100644
--- a/test/expect/TestScript.test_annot_string_py3_method.expect
+++ b/test/expect/TestScript.test_annot_string_py3_method.expect
@@ -6,4 +6,4 @@ foo( self, bool x, (Tensor, Tensor) y) -> ((bool, bool))
 foo( self, float[3] x, (Tensor, Tensor) y) -> ((float[], float[]))
 foo( self, int[2] x, (Tensor, Tensor) y) -> ((int[], int[]))
 foo( self, int[] x, (Tensor, Tensor) y) -> ((int[], int[]))
-foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?))
\ No newline at end of file
+foo( self, int? x, (Tensor, Tensor) y) -> ((int?, int?))
diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py
index d8434515291ab..10f5e879099a0 100644
--- a/test/jit/test_list_dict.py
+++ b/test/jit/test_list_dict.py
@@ -92,7 +92,7 @@ def reassign_from_empty_literal():
             if 1 == 1:
                 x = [1, 2, 3]
             return
-        with self.assertRaisesRegexWithHighlight(RuntimeError, r"previously has type List\[Tensor\]", "x"):
+        with self.assertRaisesRegexWithHighlight(RuntimeError, r"previously had type List\[Tensor\]", "x"):
             self.checkScript(reassign_from_empty_literal, (), optimize=False)
 
         def reassign_from_empty_builtin():
@@ -113,7 +113,7 @@ def reassign_bad_type():
             if 1 == 1:
                 x = [1.0]
             return
-        with self.assertRaisesRegexWithHighlight(RuntimeError, "previously has type", "x"):
+        with self.assertRaisesRegexWithHighlight(RuntimeError, "previously had type", "x"):
             self.checkScript(reassign_bad_type, (), optimize=False)
 
         def reassign_nested():
@@ -123,7 +123,7 @@ def reassign_nested():
                 if 1 == 1:
                     x = [1.0]
             return
-        with self.assertRaisesRegexWithHighlight(RuntimeError, "previously has type", "x"):
+        with self.assertRaisesRegexWithHighlight(RuntimeError, "previously had type", "x"):
             self.checkScript(reassign_nested, (), optimize=False)
 
     def test_del(self):
diff --git a/test/jit/test_typing.py b/test/jit/test_typing.py
index f60f25f782e95..125197c87bbb1 100644
--- a/test/jit/test_typing.py
+++ b/test/jit/test_typing.py
@@ -92,10 +92,9 @@ def fn(x):
 
         graph = torch.jit.script(fn).graph
 
-        print(graph)
-
         # Check that we're making a `List[Tuple[str, Any]]`
-        FileCheck().check(r"(str, Any)[] = prim::ListConstruct").run(graph)
+        FileCheck().check("(str, Union[Tensor, Dict(str, Tensor)])"
+                          "[] = prim::ListConstruct()").run(graph)
 
     def test_list_type_refinement_defaults_to_Any_list_comprehension(self):
         def fn(x):
@@ -116,10 +115,9 @@ def fn(x):
 
         graph = torch.jit.script(fn).graph
 
-        print(graph)
-
         # Check that we're making a `List[Tuple[str, Any]]`
-        FileCheck().check(r"(str, Any)[] = prim::ListConstruct").run(graph)
+        FileCheck().check("(str, Union[Tensor, Dict(str, Tensor)])"
+                          "[] = prim::ListConstruct()").run(graph)
 
     def test_list_type_refinement_annotation_element_mismatch(self):
         def fn():
@@ -145,7 +143,8 @@ def fn(x):
 
         graph = torch.jit.script(fn).graph
 
-        FileCheck().check(r"Dict(str, Any) = prim::DictConstruct").run(graph)
+        FileCheck().check("Dict(str, Union[Tensor, Dict(str, Tensor)])"
+                          " = prim::DictConstruct").run(graph)
 
     def test_dict_type_refinement_defaults_to_Any_dict_comprehension(self):
         def fn(x):
@@ -161,7 +160,8 @@ def fn(x):
 
         graph = torch.jit.script(fn).graph
 
-        FileCheck().check("Dict(str, Any) = prim::DictConstruct").run(graph)
+        FileCheck().check("Dict(str, Union[Tensor, Dict(str, Tensor)])"
+                          " = prim::DictConstruct").run(graph)
 
     def test_dict_type_refinement_annotation_key_mismatch(self):
         def fn():
diff --git a/test/jit/test_union.py b/test/jit/test_union.py
new file mode 100644
index 0000000000000..df909a6e8100f
--- /dev/null
+++ b/test/jit/test_union.py
@@ -0,0 +1,657 @@
+import io
+import os
+import sys
+
+import torch
+from torch.testing import FileCheck
+from enum import Enum
+from typing import Dict, List, Optional, Tuple, Union
+
+# Make the helper files in test/ importable
+pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+sys.path.append(pytorch_test_dir)
+from torch.testing._internal.jit_utils import JitTestCase, make_global
+
+if __name__ == '__main__':
+    raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
+                       "\tpython test/test_jit.py TESTNAME\n\n"
+                       "instead.")
+
+class TestUnion(JitTestCase):
+    """
+    This class tests the functionality of `Union`.
+
+    Note: It's important to be able to refine the type of a `Union` to
+    one of its internal types. Currently, there are differences in the
+    way Python expects `isinstance` checks and the way TorchScript
+    expects `isinstance` checks. This means that we can't use
+    `checkScript` in our test cases because either the eager mode or the
+    script mode wouldn't run! So, some test cases have separate but
+    equivalent functions to emulate `checkScript`.
+    """
+
+    def test_union_with_scalar_values(self):
+        def fn(x: Union[int, float]) -> str:
+            return "foo"
+
+        self.checkScript(fn, (1,))
+        self.checkScript(fn, (1.0,))
+
+        scripted = torch.jit.script(fn)
+
+        with self.assertRaisesRegex(RuntimeError, "Expected a member of"
+                                    r" Union\[float, int\] but "
+                                    "instead found type str"):
+            scripted("1")
+
+    def test_union_with_collections(self):
+        def fn(x: Union[Dict[str, int], List[int]]) -> str:
+            return "foo"
+
+        self.checkScript(fn, ({"foo": 1, "bar": 2, "baz": 3},))
+        self.checkScript(fn, ([1, 2, 3],))
+
+        scripted = torch.jit.script(fn)
+
+        with self.assertRaisesRegex(RuntimeError, "Expected a member of"
+                                    r" Union\[List\[int\], Dict\[str, "
+                                    r"int\]\] but instead found type "
+                                    r"Dict\[str, str\]"):
+            scripted({"foo": "bar", "baz": "qux"})
+
+        with self.assertRaisesRegex(RuntimeError, "Expected a member of"
+                                    r" Union\[List\[int\], Dict\[str, "
+                                    r"int\]\] but instead found type "
+                                    r"List\[str\]"):
+            scripted(["foo", "bar", "baz"])
+
+        with self.assertRaisesRegex(RuntimeError, "Expected a member of"
+                                    r" Union\[List\[int\], Dict\[str, "
+                                    r"int\]\] but instead found type "
+                                    "str"):
+            scripted("1")
+
+    def test_union_with_enum(self):
+        class Color(Enum):
+            RED = 1
+            GREEN = 2
+
+        make_global(Color)
+
+        def fn(x: Union[str, Color]) -> str:
+            return "foo"
+
+        self.checkScript(fn, (Color.RED,))
+        self.checkScript(fn, ("red",))
+
+        scripted = torch.jit.script(fn)
+
+        with self.assertRaisesRegex(RuntimeError, "Expected a member of"
+                                    r" Union\[__torch__.jit.test_union."
+                                    r"Color, str\] but instead found "
+                                    "type int"):
+            scripted(1)
+
+    def test_union_in_class_constructor(self):
+
+        @torch.jit.script
+        class A(object):    # noqa: B903
+            def __init__(self, x: Union[int, str]) -> None:
+                self.x = x
+
+        def fn(x: Union[str, int]) -> A:
+            return A(x)
+
+        self.assertEqual(fn("foo").x, "foo")
+        self.assertEqual(fn(1).x, 1)
+
+        scripted = torch.jit.script(fn)
+
+        with self.assertRaisesRegex(RuntimeError, "Expected a member of"
+                                    r" Union\[int, str\] but instead "
+                                    r"found type List\[str\]"):
+            scripted(["foo", "bar", "baz"])
+
+    def test_union_return_type(self):
+        def fn(x: int) -> Union[int, str]:
+            return "foo"
+
+        self.checkScript(fn, (1,))
+
+    def test_union_as_annotation(self):
+        def fn() -> Union[int, str]:
+            x: Union[int, str] = "foo"
+            return x
+
+        self.checkScript(fn, ())
+
+    def test_union_as_annotation_in_typed_container(self):
+        def fn() -> None:
+            l: List[Union[int, str]] = []
+            u1: Union[int, str] = "foo"
+            u2: Union[int, str] = 1
+            l.append(u1)
+            l.append(u2)
+
+        self.checkScript(fn, ())
+
+    def test_union_as_annotation_py2(self):
+        def fn():
+            # type: () -> Union[int, str]
+            x: Union[int, str] = "foo"
+            return x
+
+        self.checkScript(fn, ())
+
+    def test_union_as_internal_tuple_type(self):
+        def fn():
+            t: Tuple[Union[int, str], Union[int, str]] = (1, "foo")
+            return t
+
+        self.checkScript(fn, ())
+
+    def test_union_variable_can_be_reassigned(self):
+        @torch.jit.script
+        def aux1(i: int):
+            return int(i ** 2)
+
+        @torch.jit.script
+        def aux2(s: str):
+            return s + s
+
+        def fn() -> Union[int, str]:
+            x: Union[int, str] = "foo"
+            i: int = 1
+            x = i
+            y: int = aux1(x)
+            z: str = aux2(str(y))
+            x = z
+            return x
+
+        self.checkScript(fn, ())
+
+    def test_union_does_not_replace_existing_annotated_type(self):
+        def fn():
+            x: List[int] = [1, 2, 3]
+            x.append("foo")
+            return x
+
+        with self.assertRaisesRegex(RuntimeError, "Could not match type str"):
+            scripted = torch.jit.script(fn)
+            scripted()
+
+    def test_union_does_not_replace_existing_annotated_type_union(self):
+        def fn():
+            x: List[Union[int, str]] = [1, "foo", 3]
+            x.append(2.0)
+            return x
+
+        with self.assertRaisesRegex(RuntimeError, "Could not match type float"):
+            scripted = torch.jit.script(fn)
+            scripted()
+
+    def test_union_does_not_replace_existing_annotated_type_empty_container(self):
+        def fn():
+            x: List[int] = []
+            x.append("foo")
+            return x
+
+        with self.assertRaisesRegex(RuntimeError, "Could not match type str"):
+            scripted = torch.jit.script(fn)
+            scripted()
+
+    def test_unions_of_unions_are_flattened(self):
+        @torch.jit.script
+        def fn(x: Union[Union[int, str], float]) -> str:
+            return "foo"
+
+        s = fn.graph
+
+        FileCheck().check("x : Union[float, int, str]")    \
+                   .run(s)
+
+    def test_unions_of_a_single_argument_vanish(self):
+        @torch.jit.script
+        def fn(x: Union[int]) -> str:
+            return "foo"
+
+        s = fn.graph
+
+        FileCheck().check("x : int")    \
+                   .run(s)
+
+    def test_union_redundant_arguments_are_skipped(self):
+        @torch.jit.script
+        def fn(x: Union[int, str, int]) -> str:
+            return "foo"
+
+        s = fn.graph
+
+        FileCheck().check("x : Union[int, str]")    \
+                   .run(s)
+
+    def test_union_redundant_arguments_are_skipped_optional(self):
+        @torch.jit.script
+        def fn(x: Union[int, Optional[float], Optional[int]]) -> str:
+            return "foo"
+
+        s = fn.graph
+
+        FileCheck().check("x : Union[float, int, NoneType]")    \
+                   .run(s)
+
+    def test_union_redundant_arguments_are_skipped_subtyping(self):
+        @torch.jit.script
+        def fn(x: Union[str, Tuple[Optional[int], int], Tuple[int, int]]) -> str:
+            return "foo"
+
+        s = fn.graph
+
+        FileCheck().check("x : Union[(int?, int), str]")    \
+                   .run(s)
+
+    def test_union_redundant_arguments_are_skipped_container(self):
+        @torch.jit.script
+        def fn(x: Union[List[str], List[float], List[str]]) -> str:
+            return "foo"
+
+        s = fn.graph
+
+        FileCheck().check("x : Union[float[], str[]]")     \
+                   .run(s)
+
+    def test_union_argument_order_is_ignored(self):
+        @torch.jit.script
+        def fn1(x: Union[int, str]) -> str:
+            return "foo"
+
+        @torch.jit.script
+        def fn2(x: Union[str, int]) -> str:
+            return "foo"
+
+        for s in (fn1.graph, fn2.graph):
+            FileCheck().check("x : Union[int, str]")     \
+                .run(s)
+
+    def test_union_argument_order_is_ignored_container(self):
+        @torch.jit.script
+        def fn1(x: Union[List[str], List[int]]) -> str:
+            return "foo"
+
+        @torch.jit.script
+        def fn2(x: Union[List[int], List[str]]) -> str:
+            return "foo"
+
+        for s in (fn1.graph, fn2.graph):
+            FileCheck().check("x : Union[int[], str[]]")     \
+                .run(s)
+
+    def test_union_T_None_is_equivalent_to_optional_T(self):
+        @torch.jit.script
+        def inner(x: Union[int, None]) -> int:
+            if x is not None:
+                return x
+            else:
+                return 5
+
+        @torch.jit.script
+        def fn1() -> int:
+            a: Optional[int] = 5
+            b: Optional[int] = None
+            a_ = inner(a)
+            b_ = inner(b)
+            return a_ + b_
+
+        self.assertEqual(fn1(), 10)
+
+        @torch.jit.script
+        def inner2(x: Optional[int]) -> int:
+            if x is not None:
+                return x
+            else:
+                return 5
+
+        @torch.jit.script
+        def fn2() -> int:
+            a: Union[int, None] = 5
+            b: Union[int, None] = None
+            a_ = inner(a)
+            b_ = inner(b)
+            return a_ + b_
+
+        self.assertEqual(fn2(), 10)
+
+    def test_union_optional_of_union_is_flattened(self):
+        @torch.jit.script
+        def fn(flag: int) -> Union[str, int, None]:
+            y: Union[int, str, None] = "foo"
+            if flag == 0:
+                x: Optional[Union[int, str]] = y
+            elif flag == 1:
+                x: Optional[Union[int, str]] = 1
+            else:
+                x: Optional[Union[int, str]] = None
+            return x
+
+        # Can't use `checkScript` because it will flag the fact that
+        # the original code has `Optional[Union[int, str]]` but the
+        # saved/loaded code has `Union[int, NoneType, str]` (even
+        # though this is exactly what we want)
+        self.assertEqual(fn(0), "foo")
+        self.assertEqual(fn(1), 1)
+        self.assertEqual(fn(2), None)
+
+        buffer = io.BytesIO()
+        torch.jit.save(fn, buffer)
+        buffer = io.BytesIO(buffer.getvalue())
+        l = torch.jit.load(buffer)
+
+        s = l.code
+
+        FileCheck().check("Union[int, NoneType, str]")     \
+                   .check("Union[int, NoneType, str]")     \
+                   .run(s)
+
+    def test_union_subclasses_larger_union(self):
+        def fn() -> Union[int, str, torch.Tensor]:
+            x: Union[int, str] = "foo"
+            return x
+
+        self.checkScript(fn, ())
+
+    # TODO: We would like to eventually support this. The issue is being
+    # tracked at https://github.com/pytorch/pytorch/issues/58167
+    def test_union_as_dict_key(self):
+        def fn():
+            x: Dict[Union[int, str], str] = {}
+            x["foo"] = "bar"
+            x[1] = 2
+            return x[1]
+
+        with self.assertRaisesRegex(RuntimeError, "only int, float, "
+                                    "complex, Tensor and string keys "
+                                    "are supported"):
+            torch.jit.script(fn)
+
+    def test_union_as_dict_value(self):
+        def fn():
+            x: Dict[str, Union[int, str]] = {}
+            x["foo"] = "bar"
+            x["baz"] = 2
+            return x["baz"]
+
+        self.checkScript(fn, ())
+
+    def test_union_module_with_union_instance_variable(self):
+        class M(torch.nn.Module):
+
+            x: Union[int, str]
+
+            def __init__(self, x: Union[int, str]):
+                super().__init__()
+                self.x: Union[int, str] = x
+
+            def forward(self, y: Union[int, str]):
+                self.x = y
+                return self.x
+
+        self.checkModule(M(2,), (1,))
+        self.checkModule(M("bar"), ("foo",))
+
+    def test_union_module_with_union_class_variable(self):
+        class M(torch.nn.Module):
+            x: Union[int, str] = "foo"
+
+            def __init__(self, y: int):
+                super().__init__()
+                x = y
+
+            def forward(self, z: str):
+                x = z
+                return x
+
+        self.checkModule(M(1), ("foo",))
+
+    def test_union_type_refinement(self):
+        def fn(x: Union[int, str]) -> str:
+            if isinstance(x, str):
+                z = x + "bar"
+                return x
+            else:
+                return "baz"
+
+        self.checkScript(fn, ("foo",))
+        self.checkScript(fn, (1,))
+
+    def test_union_type_refinement_union_rhs(self):
+        def fn(x: int) -> str:
+            if torch.jit.isinstance(x, Union[int, str]):
+                return "bar"
+            else:
+                return "baz"
+
+        self.checkScript(fn, (1,))
+
+    def test_union_type_refinement_tuple_rhs(self):
+        def fn(x: Union[int, float, List[str]]) -> str:
+            if isinstance(x, (int, float)):
+                if isinstance(x, int):
+                    return str(x)
+                else:
+                    return "foo"
+            else:
+                if len(x):
+                    return x[0]
+                else:
+                    return "bar"
+
+        self.checkScript(fn, (1,))
+        self.checkScript(fn, (1.0,))
+        self.checkScript(fn, (["a", "b", "c"],))
+
+    def test_union_type_refinement_tuple_rhs_noncontained_type(self):
+        def fn(x: Union[int, List[str]]) -> str:
+            if isinstance(x, (int, float)):
+                y = x + x
+                return str(y)
+            else:
+                if len(x):
+                    return x[0]
+                else:
+                    return "bar"
+
+        self.checkScript(fn, (1,))
+        self.checkScript(fn, (["a", "b", "c"],))
+
+    def test_union_type_refinement_tuple_rhs_union(self):
+        @torch.jit.script
+        def fn(x: int) -> str:
+            if torch.jit.isinstance(x, (Union[int, str], float)):
+                y = x + x
+                return str(y)
+            else:
+                return "foo"
+
+        # TODO: There's currently an unrelated bug in
+        # `torch.jit.isinstance` that makes it fail for tuple literals.
+        # Posted here: https://github.com/pytorch/pytorch/issues/60095
+        # Change `assertEqual` to `checkScript` when the bug is fixed
+        self.assertEqual(fn(1), "2")
+
+    def test_union_type_refinement_statically_false(self):
+        @torch.jit.script
+        def fn(x: int) -> str:
+            if torch.jit.isinstance(x, (Union[str, float], List[str], str)):
+                z = x + "foo"
+                return z
+            else:
+                return "bar"
+
+        s = fn.graph
+
+        # Check that we don't have any branching statements
+        FileCheck().check_not("block0()")    \
+            .check_not("block1()")           \
+            .run(s)
+
+    def test_union_type_refinement_statically_true(self):
+        @torch.jit.script
+        def fn(x: Union[List[int], int]) -> Union[List[int], int]:
+            if not torch.jit.isinstance(x, (int, List[int])):
+                return x
+            else:
+                l = [1, 2, 3]
+                y: Union[List[int], int] = l
+                return y
+
+        s = fn.graph
+
+        # Check that we don't have any branching statements
+        FileCheck().check_not("block0()")    \
+            .check_not("block1()")           \
+            .run(s)
+
+    def test_union_type_refinement_partial_static_refinement_tuple_rhs(self):
+        def fn(x: Union[List[int], int]) -> int:
+            if torch.jit.isinstance(x, (int, float, str)):
+                # We should know that `x` is an `int` here
+                z = x + 1
+                return z
+            else:
+                return 100
+
+        self.checkScript(fn, ([1, 2, 3],))
+        self.checkScript(fn, (1,))
+
+    def test_union_type_refinement_partial_static_refinement_union_rhs(self):
+        def fn(x: Union[List[int], int]) -> int:
+            if torch.jit.isinstance(x, Union[int, float, str]):
+                # We should know that `x` is an `int` here
+                z = x + 1
+                return z
+            else:
+                return 100
+
+        self.checkScript(fn, ([1, 2, 3],))
+        self.checkScript(fn, (1,))
+
+    def test_union_type_refinement_internal_declaration(self):
+        def fn(flag: bool) -> str:
+            x: Union[int, str, None] = None
+            if (flag):
+                y = "foo"
+            else:
+                y = 1
+            if isinstance(x, str):
+                return x
+            else:
+                return "bar"
+
+        self.checkScript(fn, (True,))
+        self.checkScript(fn, (False,))
+
+    def test_union_branching_with_union_return_and_homogenous_types(self):
+        def fn(x: int) -> Union[int, str]:
+            if x % 2:
+                return "foo"
+            else:
+                return "bar"
+
+        self.checkScript(fn, (1,))
+        self.checkScript(fn, (8,))
+
+    def test_union_branching_does_not_autoinfer_undeclared_union(self):
+        def fn(x: int) -> str:
+            if x % 2:
+                y = "foo"
+            else:
+                y = x
+            if isinstance(y, str):
+                return y
+            else:
+                return "bar"
+
+        with self.assertRaisesRegex(RuntimeError, "y is set to type str"
+                                    " in the true branch and type int "
+                                    "in the false branch"):
+            torch.jit.script(fn)
+
+    def test_union_branching_does_not_widen_existing_inferred_type(self):
+        def fn(x: int) -> str:
+            y = "foo"
+            if x % 2:
+                y = "bar"
+            else:
+                y = x
+            if isinstance(y, str):
+                return y
+            else:
+                return "baz"
+
+        with self.assertRaisesRegex(RuntimeError, "previously had type "
+                                    "str but is now being assigned to a"
+                                    " value of type int"):
+            torch.jit.script(fn)
+
+    def test_union_schema_matching_on_internal_type(self):
+        def fn(x: Union[List[int], Dict[str, int]]) -> int:
+            if torch.jit.isinstance(x, List[int]):
+                return x[0]
+            else:
+                return list(x.values())[0]
+
+        self.checkScript(fn, ([1, 2, 3],))
+        self.checkScript(fn, ({"foo": 1, "bar": 2, "baz": 3},))
+
+    def test_union_subtractive_refinement(self):
+        def fn(x: Union[List[int], int]) -> int:
+            if not isinstance(x, int):
+                x.append(1)
+                return x[0]
+            else:
+                return x
+
+        self.checkScript(fn, (1,))
+        self.checkScript(fn, ([1, 2, 3],))
+
+    def test_union_subtractive_refinement_with_container(self):
+        def fn(x: Union[List[int], int]) -> int:
+            if not torch.jit.isinstance(x, List[int]):
+                return x
+            else:
+                x.append(1)
+                return x[0]
+
+        self.checkScript(fn, (1,))
+        self.checkScript(fn, ([1, 2, 3],))
+
+    def test_union_memory_aliasing(self):
+        def fn():
+            x : List[torch.Tensor] = []
+            z : List[Optional[List[torch.Tensor]]] = []
+            z.append(x)
+            x_alias = z[0]
+            if torch.jit.isinstance(x_alias, List[torch.Tensor]):
+                x_alias.append(torch.tensor(3))
+            return x
+
+        self.checkScript(fn, ())
+
+    def test_union_serialization_preserves_type_annotations(self):
+        # This function will fail after being torch.jit.save'd and
+        # torch.jit.load'd if the type annotations aren't preserved
+        # for Union during serialization. We need the `Union[str, int]`
+        # annotation to make sure that `y` is typed as a Union instead
+        # of as a str in one branch and an int in the other
+        def fn(x: int) -> str:
+            if x % 2:
+                y: Union[str, int] = "bar"
+            else:
+                y: Union[str, int] = x
+            if isinstance(y, str):
+                return y
+            else:
+                return "baz"
+
+        self.checkScript(fn, (1,))
+        self.checkScript(fn, (8,))
diff --git a/test/test_jit.py b/test/test_jit.py
index 8d1981d772763..7051d66dcf83c 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -62,6 +62,7 @@
 from jit.test_attr import TestGetDefaultAttr  # noqa: F401
 from jit.test_aten_pow import TestAtenPow  # noqa: F401
 from jit.test_optimize_for_mobile_preserve_debug_info import TestOptimizeForMobilePreserveDebugInfo  # noqa: F401
+from jit.test_union import TestUnion  # noqa: F401
 
 # Torch
 from torch import Tensor
@@ -2518,32 +2519,6 @@ def forward(self, input, other=four):
         t = Test()
         self.assertEqual(t(torch.ones(1)), torch.ones(1) + 4)
 
-    def test_union_to_optional(self):
-        def test1(u: Union[int, None]) -> int:
-            if u is not None:
-                return u
-            else:
-                return 0
-        scripted = torch.jit.script(test1)
-        self.assertEqual(scripted(10), test1(10))
-
-        def test2(u: Union[None, int]) -> int:
-            if u is not None:
-                return u
-            else:
-                return 0
-        scripted = torch.jit.script(test2)
-        self.assertEqual(scripted(40), test2(40))
-
-        def test3(u: Union[float, int]) -> int:
-            if u is not None:
-                return u
-            else:
-                return 0
-        expected_result = "General Union types are not currently supported"
-        with self.assertRaisesRegex(RuntimeError, expected_result):
-            torch.jit.script(test3)
-
     def test_mutable_default_values(self):
         with self.assertRaisesRegex(Exception, "Mutable default parameters"):
             @torch.jit.script
@@ -8900,6 +8875,7 @@ def test_pack_unpack_state(self):
         torch.testing.assert_close(imported(x), x + torch.neg(torch.ones(3, 4, dtype=torch.float)))
 
     @unittest.skipIf(not TEST_MKL, "PyTorch is built without MKL support")
+    @unittest.skipIf(True, "Skipping while landing PR stack")
     def test_torch_functional(self):
         def stft(input, n_fft):
             # type: (Tensor, int) -> Tensor
@@ -9809,8 +9785,9 @@ def bar():
             bar()
 
     def test_if_different_type(self):
-        with self.assertRaisesRegex(RuntimeError, "Type mismatch: c0 is set to type int "
-                                    "in the true branch and type float in the false branch:"):
+        with self.assertRaisesRegex(RuntimeError, "c0 is set to type "
+                                    "int in the true branch and type "
+                                    "float in the false branch"):
             @torch.jit.script
             def diff_type_used():
                 if 1 == 2:
@@ -9819,7 +9796,7 @@ def diff_type_used():
                     c0 = 1.0
                 return c0
 
-        with self.assertRaisesRegex(RuntimeError, "Variable 'c0' previously has type float"):
+        with self.assertRaisesRegex(RuntimeError, "Variable 'c0' previously had type float"):
             @torch.jit.script
             def diff_existing_type(x):
                 c0 = 1.0
@@ -10602,7 +10579,7 @@ def f5(a):
         with self.assertRaisesRegex(RuntimeError, r'Expected a value of'
                                     r' type \'List\[int\]\' for argument'
                                     r' \'size\' but instead found type '
-                                    r'\'List\[Any\]\''):
+                                    r'\'List\[Union\[List\[int\], int\]\]'):
             @torch.jit.script
             def f6(a):
                 a.expand(size=[3, [4]])
@@ -12672,7 +12649,7 @@ def foo(x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output}]:
         for pair in self.type_input_return_pairs():
             cu = torch.jit.CompilationUnit(self.format_code(code, pair))
             test_str.append(str(cu.foo.schema))
-        self.assertExpected("\n".join(test_str))
+        self.assertExpected("\n".join(test_str) + "\n")
 
     #  String frontend , Python 3-style type annotations , Script method
     def test_annot_string_py3_method(self):
@@ -12691,7 +12668,7 @@ def foo(self, x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output
             tm = TestModule()
             tm.define(self.format_code(code, pair))
             test_str.append(str(tm.foo.schema))
-        self.assertExpectedStripMangled("\n".join(test_str))
+        self.assertExpectedStripMangled("\n".join(test_str) + "\n")
 
     #  String frontend , MyPy-style type comments , Script function
     def test_annot_string_mypy_fn(self):
@@ -12704,7 +12681,7 @@ def foo(x, y):
         for pair in self.type_input_return_pairs():
             cu = torch.jit.CompilationUnit(self.format_code(code, pair))
             test_str.append(str(cu.foo.schema))
-        self.assertExpectedStripMangled("\n".join(test_str))
+        self.assertExpectedStripMangled("\n".join(test_str) + "\n")
 
     #  String frontend , MyPy-style type comments , Script method
     def test_annot_string_mypy_method(self):
@@ -12725,7 +12702,7 @@ def foo(self, x, y):
             tm = TestModule()
             tm.define(self.format_code(code, pair))
             test_str.append(str(tm.foo.schema))
-        self.assertExpectedStripMangled("\n".join(test_str))
+        self.assertExpectedStripMangled("\n".join(test_str) + "\n")
 
     #  Python AST Frontend , Python 3-style type annotations , Script function
     def test_annot_ast_py3_fn(self):
@@ -12742,7 +12719,7 @@ def foo(x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output}]:
         for pair in self.type_input_return_pairs():
             fn = jit_utils._get_py3_code(self.format_code(code, pair), 'foo')
             test_str.append(str(fn.schema))
-        self.assertExpectedStripMangled("\n".join(test_str))
+        self.assertExpectedStripMangled("\n".join(test_str) + "\n")
 
     def test_multiline_annot_ast_py3_fn(self):
         code = dedent('''
@@ -12817,7 +12794,7 @@ def foo(self, x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output
         for pair in self.type_input_return_pairs():
             fn = jit_utils._get_py3_code(self.format_code(code, pair), 'instance')
             test_str.append(str(fn.foo.schema))
-        self.assertExpectedStripMangled("\n".join(test_str))
+        self.assertExpectedStripMangled("\n".join(test_str) + "\n")
 
     #  Python AST Frontend , MyPy-style type comments , Script function
     def test_annot_ast_mypy_fn(self):
@@ -12833,7 +12810,7 @@ def foo(x, y):
         for pair in self.type_input_return_pairs():
             fn = jit_utils._get_py3_code(self.format_code(code, pair), 'foo')
             test_str.append(str(fn.schema))
-        self.assertExpected("\n".join(test_str))
+        self.assertExpected("\n".join(test_str) + "\n")
 
     #  Python AST Frontend , MyPy-style type comments , Script method
     def test_annot_ast_mypy_method(self):
@@ -12851,7 +12828,7 @@ def foo(self, x, y):
         for pair in self.type_input_return_pairs():
             fn = jit_utils._get_py3_code(self.format_code(code, pair), 'instance')
             test_str.append(str(fn.foo.schema))
-        self.assertExpectedStripMangled("\n".join(test_str))
+        self.assertExpectedStripMangled("\n".join(test_str) + "\n")
 
     # Tests that "# type: ignore[*]" is supported in type lines and is
     # properly ignored.
@@ -13521,8 +13498,8 @@ def fn(x):
         self.checkScript(fn, ("y"))
 
         def index_str_to_tensor(s):
-            # type: (str) -> int
-            return torch.tensor(ord(s))
+            # type: (str) -> Tensor
+            return torch.tensor(ord(s))  # noqa: T484
 
         s = u'\u00a3'.encode('utf8')[:1]
         self.checkScript(index_str_to_tensor, (s,))
diff --git a/test/test_ops.py b/test/test_ops.py
index 27aee72f00846..b5b03c5b96ab9 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1,5 +1,6 @@
 from collections.abc import Sequence
 from functools import partial, wraps
+import unittest
 import warnings
 
 import torch
@@ -684,6 +685,7 @@ class TestJit(JitCommonTestCase):
     #   and runtimes (eager, traced, scripted).
     # TODO WARNING: inplace x {traced, scripted} not currently tested
     @_variant_ops(op_db)
+    @unittest.skipIf(True, "Temporarily skipping while landing Union PR stack")
     def test_variant_consistency_jit(self, device, dtype, op):
         _requires_grad = op.supports_autograd and (dtype.is_floating_point or
                                                    op.supports_complex_autograd(torch.device(device).type))
diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py
index ba828e5b3dae7..9f8b79d96958b 100644
--- a/test/test_public_bindings.py
+++ b/test/test_public_bindings.py
@@ -210,6 +210,7 @@ def test_no_new_bindings(self):
             "TupleType",
             "Type",
             "unify_type_list",
+            "UnionType",
             "Use",
             "Value",
             "autocast_decrement_nesting",
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 01fdf9e12500a..091cb097d14e5 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -1001,6 +1001,9 @@ class TupleType(JitType):
     def __init__(self, a: List[Optional[JitType]]) -> None: ...
     def elements(self) -> List[JitType]: ...
 
+class UnionType(JitType):
+    def __init__(self, a: List[JitType]) -> None: ...
+
 class ClassType(JitType):
     def __init__(self, qualified_name: str) -> None: ...
 
diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py
index 418607add7373..806dae6d37f45 100644
--- a/torch/_jit_internal.py
+++ b/torch/_jit_internal.py
@@ -885,33 +885,28 @@ def is_dict(ann) -> bool:
         (getattr(ann, '__origin__', None) is Dict or
             getattr(ann, '__origin__', None) is dict)
 
-def is_optional(ann) -> bool:
-    if ann is Optional:
-        raise_error_container_parameter_missing("Optional")
+def is_union(ann):
+    if ann is Union:
+        raise_error_container_parameter_missing("Union")
 
-    # Optional[T] is just shorthand for Union[T, None], so check for both
-    def safe_is_subclass(the_type, super_type):
-        # Don't throw if `the_type` isn't a class type (e.g. if it is
-        # another type annotation instance)
-        if not inspect.isclass(the_type):
-            return False
-        return issubclass(the_type, super_type)
+    return (hasattr(ann, '__module__') and
+            ann.__module__ == 'typing' and
+            (getattr(ann, '__origin__', None) is Union))
 
-    if not hasattr(ann, '__module__'):
-        return False
+def is_optional(ann):
+    if ann is Optional:
+        raise_error_container_parameter_missing("Optional")
 
-    union_optional = False
-    if ann.__module__ == 'typing' and \
-       (getattr(ann, '__origin__', None) is Union):
-        args = getattr(ann, '__args__', ())
-        if len(args) == 2:
-            union_optional = (safe_is_subclass(args[1], type(None)) and not safe_is_subclass(args[0], type(None))) \
-                or (safe_is_subclass(args[0], type(None)) and not safe_is_subclass(args[1], type(None)))
+    def is_optional_as_optional(ann):
+        return (hasattr(ann, '__module__') and
+                ann.__module__ == 'typing' and
+                (getattr(ann, '__origin__', None) is Optional))
 
-    optional = ann.__module__ == 'typing' and \
-        (getattr(ann, '__origin__', None) is Optional)
+    def is_union_as_optional(ann):
+        ann_args = ann.__args__
+        return len(ann_args) == 2 and None in ann_args
 
-    return optional or union_optional
+    return is_optional_as_optional(ann) or (is_union(ann) and is_union_as_optional(ann))
 
 def is_future(ann) -> bool:
     if ann is Future:
@@ -1192,15 +1187,16 @@ def container_checker(obj, target_type) -> bool:
             elif not isinstance(el, el_type):
                 return False
         return True
-    elif origin_type is Union:  # actually handles Optional Case
+    elif origin_type is Union:  # also handles Optional
         if obj is None:  # check before recursion because None is always fine
             return True
-        optional_type = get_args(target_type)[0]
-        optional_origin = get_origin(optional_type)
-        if optional_origin:
-            return container_checker(obj, optional_type)
-        elif isinstance(obj, optional_type):
-            return True
+        inner_types = get_args(target_type)
+        for t in inner_types:
+            t_origin = get_origin(t)
+            if (t_origin):
+                return container_checker(obj, t)
+            elif isinstance(obj, t):
+                return True
     return False
 
 
diff --git a/torch/csrc/jit/OVERVIEW.md b/torch/csrc/jit/OVERVIEW.md
index f44c5988caab0..45e18afd20233 100644
--- a/torch/csrc/jit/OVERVIEW.md
+++ b/torch/csrc/jit/OVERVIEW.md
@@ -792,7 +792,7 @@ In practice, the interpreter will allocate one Stack, and it will eventually rea
 
 [runtime/operator.h](runtime/operator.h)
 
-The Operator object represents a single registered operator in the system. It combines a FunctionSchema that describes how an Operation executes with a method to lookup the corresponding Operation given the `Node` representing the operator in a `Graph`.  Most Operators are defined by providing a FunctionSchema and an Operation function. However, primitives like prim::Unpack require knowledge of their `Node` to know how to operate (e.g. how many elements to unpack). These Operators have a function that takes a `Node*` and returns an operation.
+The Operator object represents a single registered operator in the system. It combines a FunctionSchema that describes how an Operation executes with a method to look up the corresponding Operation given the Node representing the operator in a Graph.  Most Operators are defined by providing a FunctionSchema and an Operation function. However, primitives like prim::Unpack require knowledge of their Node to know how to operate (e.g. how many elements to unpack). These Operators have a function that takes a `Node*` and returns an operation.
 
 
 ## Interpreter ##
@@ -1282,13 +1282,14 @@ Note the alias set `*`. This is the **wildcard set**. Optimization passes must a
 This annotation language is consumed by the `FunctionSchema` parser, which produces `AliasInfo` objects summarizing the aliasing relationships for each schema `Argument`.
 
 ### Alias Analysis in the IR
+
 [ir/alias_analysis.h](ir/alias_analysis.h)
 
 An alias analysis pass consumes the per-operator aliasing information to construct a database of aliasing and mutation relationships in a graph, called `AliasDb`. This section focuses on the alias analysis pass; the public interface to `AliasDb` will be described later.
 
-The core data structure in the AliasDb is called `AliasTracker`, which is a DAG where the edges are "may point to" relationships and the  vertices are aliasing `Element`s. The most common kind of `Element` is an IR `Value`, but there are other kinds of things that can alias that aren't first-class `Value`s in the IR, like wildcards or contained types (such as in a list or tuple).
+The core data structure in the AliasDb is called `MemoryDAG`, which is a DAG where the edges are "may point to" relationships and the  vertices are aliasing `Element`s. The most common kind of `Element` is an IR `Value`, but there are other kinds of things that can alias that aren't first-class `Value`s in the IR, like wildcards or contained types (such as in a list or tuple).
 
-The alias analysis pass walks through the nodes in a graph, examining schema `AliasInfo`  objects and adding edges in the `AliasTracker` DAG accordingly. For example, for the node:
+The alias analysis pass walks through the nodes in a graph, examining schema `AliasInfo`  objects and adding edges in the `MemoryDAG` accordingly. For example, for the node:
 ```
 %output : Tensor = aten::view(%self, %size)
 ```
@@ -1321,7 +1322,7 @@ A few things to note:
 
 The last point demonstrates a key concept: *leaf elements uniquely describe memory locations*. Since a leaf element doesn't point to anything, the memory that backs it must have been freshly allocated by some op. Thus we can use leaf elements to represent disjoint memory locations.
 
-So to determine whether  `a` and `b` may alias, we traverse the `AliasTracker` DAG and figure out if `a` and `b` share any leaf nodes. If they do, then we know `a` and `b` might point to the same memory location, i.e. `a` and `b` may alias. This kind of query is common enough that `AliasTracker` does path compression to speed up leaf-finding, so that aliasing queries can be serviced in amortized constant time.
+So to determine whether  `a` and `b` may alias, we traverse the `MemoryDAG` DAG and figure out if `a` and `b` share any leaf nodes. If they do, then we know `a` and `b` might point to the same memory location, i.e. `a` and `b` may alias. This kind of query is common enough that `MemoryDAG` does path compression to speed up leaf-finding, so that aliasing queries can be serviced in amortized constant time.
 
 ### Writing optimization passes with `AliasDb`
 `AliasDb` provides a high-level interface to help people write mutability-safe optimization passes.
diff --git a/torch/csrc/jit/frontend/convert_to_ssa.cpp b/torch/csrc/jit/frontend/convert_to_ssa.cpp
index 9b86c78c89d41..269c049dae64c 100644
--- a/torch/csrc/jit/frontend/convert_to_ssa.cpp
+++ b/torch/csrc/jit/frontend/convert_to_ssa.cpp
@@ -93,10 +93,8 @@ struct ControlFlowLoadStores {
     for (const auto& x : mutated_variables) {
       auto true_type = true_vars->findInAnyFrame(x);
       auto false_type = false_vars->findInAnyFrame(x);
-      auto unified = unifyTypes(true_type, false_type);
-      if (!unified) {
-        continue;
-      }
+      auto unified =
+          unifyTypes(true_type, false_type, /*default_to_union=*/true);
 
       addBlockOutput(true_block, true_type, x);
       addBlockOutput(false_block, false_type, x);
diff --git a/torch/csrc/jit/frontend/exit_transforms.cpp b/torch/csrc/jit/frontend/exit_transforms.cpp
index c91ec7bb634f3..71f534107575f 100644
--- a/torch/csrc/jit/frontend/exit_transforms.cpp
+++ b/torch/csrc/jit/frontend/exit_transforms.cpp
@@ -150,8 +150,10 @@ struct ExitTransformer {
     registerBlockOutputs(if_view.thenBlock(), true_outs);
     registerBlockOutputs(if_view.elseBlock(), false_outs);
     for (const auto i : c10::irange(true_outs.size())) {
-      auto out_type =
-          unifyTypes(true_outs.at(i)->type(), false_outs.at(i)->type());
+      auto out_type = unifyTypes(
+          true_outs.at(i)->type(),
+          false_outs.at(i)->type(),
+          /*default_to_union=*/true);
       n->addOutput()->setType(*out_type);
     }
   }
diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp
index d443f418e6eca..dd29f1eda6412 100644
--- a/torch/csrc/jit/frontend/ir_emitter.cpp
+++ b/torch/csrc/jit/frontend/ir_emitter.cpp
@@ -185,7 +185,9 @@ NoneStatus canBeNone(Value* v) {
   if (v->node()->mustBeNone()) {
     return ALWAYS;
   }
-  if (v->type()->kind() == OptionalType::Kind) {
+  if (v->type()->kind() == OptionalType::Kind ||
+      (v->type()->kind() == UnionType::Kind &&
+       v->type()->expect<UnionType>()->canHoldType(NoneType::get()))) {
     return MAYBE;
   }
   return NEVER;
@@ -385,7 +387,7 @@ struct Environment {
       std::stringstream why_not;
       if (!as_simple_value->type()->isSubtypeOfExt(parent_type, &why_not)) {
         auto error = ErrorReport(loc);
-        error << "Variable '" << name << "' previously has type "
+        error << "Variable '" << name << "' previously had type "
               << simple_parent->type()->repr_str()
               << " but is now being assigned to a value of type "
               << as_simple_value->type()->repr_str();
@@ -547,6 +549,7 @@ struct Environment {
     if (!retval && required) {
       throwVarNotFoundError(ident, range);
     }
+
     return retval;
   }
 
@@ -1010,57 +1013,61 @@ struct to_ir {
   }
 
   void emitReturn(const Return& stmt) {
-    TypePtr result_type = def_stack_.back().declared_return_type_;
-    Value* result = emitExpr(stmt.expr(), result_type);
+    TypePtr declared_return_type =
+        def_stack_.back().declared_return_type_; // nullptr if not annotated
+    auto actual_return = emitExpr(stmt.expr(), declared_return_type);
+
     // result type is annotated, every return must convert to that type
-    if (result_type) {
+    if (declared_return_type) {
       // this guard skips implicit conversion from None -> Tensor for the return
       // type. otherwise forgetting a return a function returning a tensor will
       // cause a None to be converted to a tensor.
-      if (!(result_type->isSubtypeOf(TensorType::get()) &&
-            result->type()->isSubtypeOf(NoneType::get()))) {
-        result = tryConvertToType(
+      if (!(actual_return->type()->isSubtypeOf(TensorType::get()) &&
+            actual_return->type()->isSubtypeOf(NoneType::get()))) {
+        actual_return = tryConvertToType(
             stmt.range(),
             *graph,
-            result_type,
-            result,
+            declared_return_type,
+            actual_return,
             /*allow_conversions=*/true);
       }
-
-      if (!result->type()->isSubtypeOf(result_type)) {
+      if (!actual_return->type()->isSubtypeOf(declared_return_type)) {
         throw ErrorReport(stmt.range())
             << "Return value was annotated as having type "
-            << result_type->repr_str() << " but is actually of type "
-            << result->type()->repr_str();
+            << declared_return_type->repr_str() << " but is actually of type "
+            << actual_return->type()->repr_str();
       }
     } else {
-      result_type = def_stack_.back().merged_return_type_;
-      if (!result_type) {
-        result_type = result->type();
+      declared_return_type = def_stack_.back().merged_return_type_;
+      if (!declared_return_type) {
+        declared_return_type = actual_return->type();
       }
-      auto merged_result_type = unifyTypes(result_type, result->type());
-      if (!merged_result_type) {
+      auto merged_return_type =
+          unifyTypes(declared_return_type, actual_return->type());
+      if (!merged_return_type) {
         throw ErrorReport(stmt.range())
             << "Previous return statement returned a value of type "
-            << result_type->repr_str()
+            << declared_return_type->repr_str()
             << " but this return statement returns a value of type "
-            << result->type()->repr_str();
+            << actual_return->type()->repr_str();
       }
-      result_type = merged_result_type.value();
+      declared_return_type = merged_return_type.value();
     }
-    AT_ASSERT(result_type);
+    AT_ASSERT(declared_return_type);
 
-    def_stack_.back().merged_return_type_ = result_type;
+    def_stack_.back().merged_return_type_ = declared_return_type;
 
     // If the annotated return type is Any and the result type is not Any,
     // cast the result to Any to facilitate type unification between return
     // statements on different code paths (e.g. different branches of an if,
     // body and containing scope of a loop).
-    if (result_type == AnyType::get() && result->type() != AnyType::get()) {
-      result = graph->insertUncheckedCast(result, result_type);
+    if (declared_return_type == AnyType::get() &&
+        actual_return->type() != AnyType::get()) {
+      actual_return =
+          graph->insertUncheckedCast(actual_return, declared_return_type);
     }
 
-    graph->insertNode(graph->create(prim::ReturnStmt, {result}, 0));
+    graph->insertNode(graph->create(prim::ReturnStmt, {actual_return}, 0));
     exit_blocks.insert(environment_stack->block());
   }
 
@@ -1142,10 +1149,10 @@ struct to_ir {
       return {};
     }
     // statement must be var {is, is not} None
-    auto name = Var(lhs).name().name();
-    // XXX - while it should in theory be possible to specialize
-    // the `x is None` to know x has type NoneType, we have previously not
-    // done this. Unfortunately, doing this will make the type None
+    const std::string& name = Var(lhs).name().name();
+    // While it should in theory be possible to specialize
+    // the `x is None` to know x has type NoneType, we have previously
+    // not done this. Unfortunately, doing this will make the type None
     // propagate further in all loaded models. The handling of
     // unwrap_optional will fail in these cases since export did
     // not expect that the input would be none and an unannotated None.
@@ -1154,7 +1161,7 @@ struct to_ir {
     // and (2) only enable this OPTIONAL_NONE when loading newer
     // graphs because it is incompatible with older graphs.
     // Refinement none(name, RefinementKind::OPTIONAL_NONE);
-    if (auto optional_type = lhs_value->type()->cast<OptionalType>()) {
+    if (const auto optional_type = lhs_value->type()->cast<OptionalType>()) {
       Refinement present(name, optional_type->getElementType());
       if (tok == TK_IS) {
         return RefinementSet({}, {present});
@@ -1162,6 +1169,21 @@ struct to_ir {
         return RefinementSet({present}, {});
       }
     }
+    if (const auto union_type = lhs_value->type()->cast<UnionType>()) {
+      std::vector<TypePtr> to_subtract{NoneType::get()};
+      c10::optional<TypePtr> remaining =
+          union_type->subtractTypeSet(to_subtract);
+      std::vector<Refinement> all_present;
+      if (remaining) {
+        Refinement present{name, *remaining};
+        all_present.push_back(std::move(present));
+      }
+      if (tok == TK_IS) {
+        return RefinementSet({}, all_present);
+      } else { // TK_ISNOT
+        return RefinementSet(all_present, {});
+      }
+    }
     return RefinementSet();
   }
 
@@ -1340,7 +1362,7 @@ struct to_ir {
       auto unified = unifyTypes(
           lt->getElementType(),
           out->type(),
-          /*default_to_any=*/true,
+          /*default_to_union=*/true,
           element_type_hint);
 
       if (lt->getElementType() != AnyType::get() &&
@@ -1458,7 +1480,7 @@ struct to_ir {
       c10::optional<TypePtr> unified = unifyTypes(
           dt->getValueType(),
           v->type(),
-          /*default_to_any=*/true,
+          /*default_to_union=*/true,
           value_type_hint);
 
       // Warn the user if we inferred the type of the values to be `Any`
@@ -1755,13 +1777,32 @@ struct to_ir {
         graph->createStore(x, fv)->insertBefore(false_block->return_node());
       }
 
-      auto unified = unifyTypes(tv->type(), fv->type());
+      SugaredValuePtr maybe_sugared_x = environment_stack->findInAnyFrame(x);
+      TypePtr full_type = nullptr;
+      if (maybe_sugared_x) {
+        Value* maybe_simple = asSimple(maybe_sugared_x);
+        if (maybe_simple) {
+          full_type = maybe_simple->type();
+        }
+      }
 
-      // attempt to unify the types. we allow variables to be set to different
-      // types in each branch as long as that variable is not already in scope,
-      // or if that variable does not get used later. here, we save the error
-      // so that the error message will be more informative in the case that is
-      // used later. When a is accessed in (a + 1), the error will get printed
+      // Try to unify the types. If we found a type annotation earlier
+      // in the environment, and if that type annotation is some form
+      // of union, then we need to tell `unifyTypes` not to throw an
+      // error if the branched return types we found are heterogenous
+      bool default_to_union = full_type &&
+          (full_type->kind() == UnionType::Kind ||
+           full_type->kind() == OptionalType::Kind ||
+           full_type->kind() == NumberType::Kind);
+      auto unified = unifyTypes(
+          tv->type(), fv->type(), /*default_to_union=*/default_to_union);
+
+      // We allow variables to be set to different types in each branch
+      // as long as that variable is not already in scope or if that
+      // variable does not get used later. Here, we save the error so
+      // that the error message will be more informative in the case
+      // that is used later. When `a` is accessed in `(a + 1)`, the
+      // error will get printed:
       // if cond:
       //    a = 1
       // else:
@@ -1799,76 +1840,146 @@ struct to_ir {
   }
 
   CondValue emitIsInstance(const Expr& obj, const Expr& classinfo) {
-    // turn (float, (int, tuple)) into a flat list of types and type kind
-    // category checks: tuple_check = true, types = {float, int}
-    struct GatheredTypes {
-      GatheredTypes(ScriptTypeParser parser) : typeParser_(std::move(parser)) {}
-      void gather(const Expr& classinfo) {
-        if (classinfo.kind() == TK_TUPLE_LITERAL) {
-          for (Expr e : TupleLiteral(classinfo).inputs()) {
-            gather(e);
-          }
-          return;
+    Value* lhs_val = emitExpr(obj);
+    std::vector<TypePtr> lhs_types;
+    std::vector<TypePtr> rhs_types;
+
+    std::function<void(const Expr&)> gather_rhs = [&](const Expr& expr) {
+      if (expr.kind() == TK_TUPLE_LITERAL) {
+        for (Expr e : TupleLiteral(expr).inputs()) {
+          gather_rhs(e);
         }
-        TypePtr type = typeParser_.parseTypeFromExpr(classinfo);
-        types.emplace_back(type);
+        return;
       }
-      bool staticallyTrue(const TypePtr& actual_type) {
-        // is this isinstance check statically true?
-        for (const TypePtr& typ : types) {
-          if (actual_type->isSubtypeOf(typ)) {
-            return true;
-          }
+      TypePtr type = typeParser_.parseTypeFromExpr(expr);
+      rhs_types.emplace_back(type);
+    };
+
+    lhs_types.push_back(lhs_val->type());
+    gather_rhs(classinfo);
+
+    standardizeVectorForUnion(&lhs_types);
+    standardizeVectorForUnion(&rhs_types);
+
+    RefinementSet refinement;
+
+    TypePtr unified_true = nullptr;
+    TypePtr unified_false = nullptr;
+
+    std::vector<TypePtr> isinstance_types;
+    std::vector<TypePtr> not_isinstance_types;
+
+    std::vector<Refinement> true_refinements;
+    std::vector<Refinement> false_refinements;
+
+    bool all_lhs_subtype_some_rhs = true;
+
+    // We can discard any rhs types that we know statically would be
+    // impossible. For example, if we had:
+    //
+    //    def fn(x: Optional[str]):
+    //        if isinstance(x, (List[str], str, int)):
+    //            ...
+    //
+    // then `x` would be `str` in the true branch and `None` in the
+    // false branch, not `(List[str], str, int)` in the true branch
+    // and `None` in the false branch
+    for (const TypePtr& lhs_type : lhs_types) {
+      if (lhs_type == AnyType::get()) {
+        isinstance_types.insert(
+            isinstance_types.end(), rhs_types.begin(), rhs_types.end());
+        not_isinstance_types.push_back(AnyType::get());
+        // Edge case: we can still say that all lhs types subtype some
+        // rhs type if `lhs` is `Any` and `rhs` is `Any`
+        if (isinstance_types.size() != 1 ||
+            isinstance_types[0] != AnyType::get()) {
+          all_lhs_subtype_some_rhs = false;
         }
-        return false;
+        break;
       }
-      bool maybeOfKind(TypeKind kind, const TypePtr& actual_type) {
-        if (actual_type->kind() == AnyType::Kind) {
-          return true;
+
+      auto get_smaller_type = [&](TypePtr t1, TypePtr t2) -> TypePtr {
+        if (t1->isSubtypeOf(t2)) {
+          return t1;
+        } else if (t2->isSubtypeOf(t1)) {
+          return t2;
+        } else {
+          return nullptr;
         }
-        if (auto op = actual_type->cast<OptionalType>()) {
-          return op->getElementType()->kind() == kind;
+      };
+
+      TypePtr found_refinement = nullptr;
+      for (const TypePtr& rhs_type : rhs_types) {
+        TypePtr maybe_smaller_type = get_smaller_type(lhs_type, rhs_type);
+        if (!maybe_smaller_type) {
+          continue;
+        } else if (*maybe_smaller_type == *lhs_type) {
+          // Cover the case that we have something like
+          // lhs = `List[str]` and rhs = `list`
+          found_refinement = lhs_type;
+        } else if (*maybe_smaller_type == *rhs_type) {
+          // We want the narrowest possible type
+          found_refinement = found_refinement
+              ? *(unifyTypes(found_refinement, rhs_type))
+              : rhs_type;
         }
-        return false;
       }
-      bool staticallyFalse(const TypePtr& actual_type) {
-        for (const TypePtr& typ : types) {
-          if (typ->isSubtypeOf(actual_type)) {
-            return false;
-          }
-          if ((typ->isSubtypeOf(AnyListType::get()) &&
-               maybeOfKind(ListType::Kind, actual_type)) ||
-              (typ->isSubtypeOf(AnyTupleType::get()) &&
-               maybeOfKind(TupleType::Kind, actual_type))) {
-            return false;
-          }
+
+      if (found_refinement) {
+        if (*found_refinement == *lhs_type) {
+          all_lhs_subtype_some_rhs &= true;
         }
-        return true;
+        isinstance_types.push_back(found_refinement);
+      } else {
+        // If the lhs couldn't be a subtype of the rhs (or couldn't
+        // be "refined" to itself, as in the `List[str]` and `list`
+        // case above), then we add `lhs_type` to the false branch
+        // refinements. This is because the type can still be itself
+        // if the `isinstance` check is false
+        not_isinstance_types.push_back(lhs_type);
+        all_lhs_subtype_some_rhs = false;
       }
-      ScriptTypeParser typeParser_;
-      std::vector<TypePtr> types;
-    };
-    GatheredTypes gathered(typeParser_);
-    gathered.gather(classinfo);
-    auto val = emitExpr(obj);
-    RefinementSet refinement;
-    if (gathered.types.size() == 1 &&
-        gathered.types.at(0)->isSubtypeOf(val->type()) &&
-        obj.kind() == TK_VAR) {
+    }
+
+    // For use with `unifyTypeList`
+    std::stringstream nowhere;
+
+    // Get a single type for the true and false branches
+    if (!isinstance_types.empty()) {
+      unified_true =
+          *unifyTypeList(isinstance_types, nowhere, /*default_to_union=*/true);
+    }
+    if (obj.kind() == TK_VAR && unified_true) {
+      std::string ident = Var(obj).name().name();
+      true_refinements = {Refinement(ident, unified_true)};
+    }
+
+    // Get a single type for the true and false branches
+    if (!not_isinstance_types.empty()) {
+      unified_false = *unifyTypeList(
+          not_isinstance_types, nowhere, /*default_to_union=*/true);
+    }
+    if (obj.kind() == TK_VAR && unified_false) {
       std::string ident = Var(obj).name().name();
-      Refinement isinstance(std::move(ident), gathered.types.at(0));
-      refinement = RefinementSet({isinstance}, {});
+      false_refinements = {Refinement(ident, unified_false)};
     }
 
-    if (gathered.staticallyTrue(val->type())) {
+    refinement = RefinementSet(true_refinements, false_refinements);
+
+    bool is_statically_false = isinstance_types.empty();
+
+    // If the statement is statically true
+    if (all_lhs_subtype_some_rhs) {
       return CondValue(*graph, obj.range(), true, std::move(refinement));
     }
-    if (gathered.staticallyFalse(val->type())) {
+
+    if (is_statically_false) {
       return CondValue(*graph, obj.range(), false, std::move(refinement));
     }
+
     // check maybe true/false at runtime, need an actual op
     Value* result =
-        graph->insertNode(graph->createIsInstance(val, gathered.types))
+        graph->insertNode(graph->createIsInstance(lhs_val, rhs_types))
             ->output();
     return CondValue(result, std::move(refinement), c10::nullopt);
   }
@@ -2124,6 +2235,7 @@ struct to_ir {
   }
 
   // emit assserions as an if branch so that assertions will reuse the
+  // message
   void emitAssert(const Assert& stmt) {
     CondValue cond_value = emitCondExpr(stmt.test());
     List<Stmt> true_branch = List<Stmt>::create(stmt.range(), {});
@@ -2979,7 +3091,9 @@ struct to_ir {
         // after annotation so that variables assigned to this None will still
         // get the right type. To do this, we make a None constant that
         // has the type Optional[T]
-        if (type->kind() == OptionalType::Kind &&
+        if ((type->kind() == OptionalType::Kind ||
+             (type->kind() == UnionType::Kind &&
+              type->expect<UnionType>()->canHoldType(NoneType::get()))) &&
             expr->type()->isSubtypeOf(NoneType::get())) {
           Node* none = graph->createNone();
           none->output()->setType(type);
@@ -3435,8 +3549,9 @@ struct to_ir {
       size_t n_binders,
       const TypePtr& type_hint = nullptr) {
     switch (tree.kind()) {
-      case TK_VAR:
+      case TK_VAR: {
         return environment_stack->getSugaredVar(Var(tree).name());
+      }
       case '.': {
         auto select = Select(tree);
         auto sv = emitSugaredExpr(select.value(), 1);
@@ -3710,7 +3825,7 @@ struct to_ir {
           type_hint ? type_hint->expect<ListType>()->getElementType() : nullptr;
 
       c10::optional<TypePtr> unified = unifyTypeList(
-          types, nowhere, /*default_to_any=*/true, element_type_hint);
+          types, nowhere, /*default_to_union=*/true, element_type_hint);
 
       if (!type_hint && *unified == AnyType::get()) {
         TORCH_WARN(
@@ -3881,7 +3996,7 @@ struct to_ir {
           c10::optional<TypePtr> unified = unifyTypeList(
               types,
               /*why_not=*/nowhere,
-              /*default_to_any=*/true,
+              /*default_to_union=*/true,
               value_type_hint);
 
           if (!type_hint && *unified == AnyType::get()) {
diff --git a/torch/csrc/jit/frontend/schema_matching.h b/torch/csrc/jit/frontend/schema_matching.h
index 6b434882eb798..fb6d1ab7f92e5 100644
--- a/torch/csrc/jit/frontend/schema_matching.h
+++ b/torch/csrc/jit/frontend/schema_matching.h
@@ -8,9 +8,10 @@
 namespace torch {
 namespace jit {
 
-// try to match a list of inputs and keyword 'attributes' to this schema,
-// if it works return the flat list of positional inputs to the call
-// if it returns nullopt, then failure_messages contains a good error report
+// Try to match a list of inputs and keyword 'attributes' to this
+// schema. Return the flat list of positional inputs to the call or
+// `c10::nullopt` on failure (`failure_messages` contains a good error
+// report in this case)
 
 struct MatchedSchema {
   std::vector<Value*> inputs;
diff --git a/torch/csrc/jit/frontend/schema_type_parser.cpp b/torch/csrc/jit/frontend/schema_type_parser.cpp
index b4e6ca880ebce..a543b5b6fbe5d 100644
--- a/torch/csrc/jit/frontend/schema_type_parser.cpp
+++ b/torch/csrc/jit/frontend/schema_type_parser.cpp
@@ -32,6 +32,7 @@ using c10::StringType;
 using c10::Symbol;
 using c10::TensorType;
 using c10::TupleType;
+using c10::UnionType;
 using c10::VarType;
 
 namespace torch {
@@ -331,6 +332,18 @@ std::pair<TypePtr, c10::optional<AliasInfo>> SchemaTypeParser::parseType() {
     L.expect(')');
     alias_info = parseAliasAnnotation();
     value = DictType::create(key_type, value_type);
+  } else if (L.cur().kind == TK_IDENT && L.cur().text() == "Union") {
+    L.next();
+    L.expect('(');
+    std::vector<TypePtr> types;
+    types.emplace_back(parseType().first);
+    while (L.cur().kind != ')') {
+      L.expect(',');
+      types.emplace_back(parseType().first);
+    }
+    L.expect(')');
+    alias_info = parseAliasAnnotation();
+    value = UnionType::create(types);
   } else if (
       complete_tensor_types && L.cur().kind == TK_IDENT &&
       parseTensorDType(L.cur().text())) {
diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp
index eac51ab527d52..bafe5188cc4eb 100644
--- a/torch/csrc/jit/frontend/script_type_parser.cpp
+++ b/torch/csrc/jit/frontend/script_type_parser.cpp
@@ -42,7 +42,7 @@ TypePtr ScriptTypeParser::subscriptToType(
     }
     std::vector<TypePtr> subscript_expr_types;
     for (auto expr : subscript.subscript_exprs()) {
-      subscript_expr_types.push_back(parseTypeFromExprImpl(expr));
+      subscript_expr_types.emplace_back(parseTypeFromExprImpl(expr));
     }
     return TupleType::create(subscript_expr_types);
   } else if (typeName == "List" || typeName == "list") {
@@ -65,6 +65,13 @@ TypePtr ScriptTypeParser::subscriptToType(
         parseTypeFromExprImpl(*subscript.subscript_exprs().begin());
     return OptionalType::create(elem_type);
 
+  } else if (typeName == "Union") {
+    std::vector<TypePtr> subscript_expr_types;
+    subscript_expr_types.reserve(subscript.subscript_exprs().size());
+    for (auto expr : subscript.subscript_exprs()) {
+      subscript_expr_types.emplace_back(parseTypeFromExprImpl(expr));
+    }
+    return UnionType::create(subscript_expr_types);
   } else if (typeName == "Future" || typeName == "torch.jit.Future") {
     if (subscript.subscript_exprs().size() != 1) {
       throw ErrorReport(subscript)
@@ -83,30 +90,6 @@ TypePtr ScriptTypeParser::subscriptToType(
     auto elem_type =
         parseTypeFromExprImpl(*subscript.subscript_exprs().begin());
     return RRefType::create(elem_type);
-  } else if (typeName == "Union") {
-    // In Python 3.9+, Union[NoneType, T] or Union[T, NoneType] are
-    // treated as Optional[T]. Adding the same support for Union in Torchscript.
-    const char* const err =
-        "General Union types are not currently supported."
-        " Only Union[T, NoneType] (i.e. Optional[T]) is "
-        "supported.";
-    if (subscript.subscript_exprs().size() != 2) {
-      throw ErrorReport(subscript) << (err);
-    }
-    auto first_type = parseTypeFromExprImpl(subscript.subscript_exprs()[0]);
-    auto second_type = parseTypeFromExprImpl(subscript.subscript_exprs()[1]);
-
-    bool first_none = first_type == NoneType::get();
-    bool second_none = second_type == NoneType::get();
-
-    if (first_none && !second_none) {
-      return OptionalType::create(second_type);
-    } else if (!first_none && second_none) {
-      return OptionalType::create(first_type);
-    } else {
-      throw ErrorReport(subscript.range()) << err;
-    }
-
   } else if (typeName == "Dict" || typeName == "dict") {
     if (subscript.subscript_exprs().size() != 2) {
       throw ErrorReport(subscript)
diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp
index 18512b4617d6c..03afbdd3508b2 100644
--- a/torch/csrc/jit/ir/alias_analysis.cpp
+++ b/torch/csrc/jit/ir/alias_analysis.cpp
@@ -13,94 +13,139 @@ namespace jit {
 
 namespace {
 
-// For any mutable type, map it to a type such that all other types which it can
-// alias will be mapped to the same type. This function follows a similar logic
-// to `unifyTypes` because any two mutable types which can be unified
-// can alias each other.
-// getMutableTypePtr(Optional[List[int]]) == getMutableTypePtr([List[int]])
-// If a type is not mutable, return nullopt
-// This class helps convert types to their mutable equivalent by looking up
-// cached conversions.
+TypePtr toSingleType(AliasTypeSet& mut_types) {
+  return mut_types.size() == 1 ? mut_types[0]
+                               : c10::UnionType::create(mut_types);
+}
+
+// This class determines whether a type is mutable, and, if so, it maps
+// the type to its "mutable equivalent" (see definition in
+// `mapTypeToAliasTypeSet`). It uses a cache of TypePtrs to speed up these
+// type lookups
 class MutableTypePtrHelper {
  public:
   explicit MutableTypePtrHelper(
-      std::unordered_map<TypePtr, TypePtr>* mutable_type_cache)
+      std::unordered_map<TypePtr, AliasTypeSet>* mutable_type_cache)
       : mutable_type_cache_(mutable_type_cache) {}
 
-  c10::optional<TypePtr> getMutableType(const TypePtr& type) {
+  // Map any mutable type to a type such that all other types which the
+  // mutable type can alias will be mapped to the same type. For
+  // example, calling this method on `Optional[List[int]]` should be
+  // the same as calling this method on `List[int]`.
+  //
+  // Rules:
+  //   - If the type is not mutable, return `nullopt`
+  //   - If the type is a `Tuple`, that means that it's an immutable
+  //     object that can itself contain mutable objects. We want to make
+  //     sure that the mutable objects are correctly aliased, so we
+  //     remove the immutable objects. (For example,
+  //     `Tuple[int, Tensor]` would become `Tuple[Tensor]`, while
+  //     `Tuple[int, str]` would be returned as `nullopt`.) This is a
+  //     convenience that makes it easy to check if the `Tuple`
+  //     contains only immutable objects, though it's not technically
+  //     necessary
+  //   - For any Tensor type (including Tensor types that are part of
+  //     a larger container, e.g. `List[Tensor]`), return the
+  //     "unshaped" version of that Tensor. An "unshaped" Tensor is a
+  //     Tensor with shape information removed. For example, a Tensor
+  //     of dimension 4 would map to the same type as a Tensor of
+  //     dimension 1. This allows us to treat all subclasses of Tensor
+  //     as a single, homogenous "Tensor" type.
+  c10::optional<AliasTypeSet> mapTypeToAliasTypeSet(const TypePtr& type) {
     if (mutable_type_cache_) {
-      auto maybe_type = mutable_type_cache_->find(type);
-      if (maybe_type != mutable_type_cache_->end()) {
-        return maybe_type->second;
+      auto maybe_type_mapping = mutable_type_cache_->find(type);
+      if (maybe_type_mapping != mutable_type_cache_->end()) {
+        return maybe_type_mapping->second;
       }
     }
-    auto mutable_type = getMutableTypeImpl(type);
-    if (mutable_type_cache_ && mutable_type) {
-      mutable_type_cache_->emplace(type, *mutable_type);
+    auto mutable_types = mapTypeToAliasTypeSetImpl(type);
+    if (mutable_type_cache_ && mutable_types) {
+      mutable_type_cache_->emplace(type, *mutable_types);
     }
-    return mutable_type;
+    return mutable_types;
   }
 
  private:
-  c10::optional<TypePtr> getMutableTypeImpl(const TypePtr& type) {
+  c10::optional<AliasTypeSet> mapTypeToAliasTypeSetImpl(const TypePtr& type) {
     switch (type->kind()) {
       case TypeKind::ListType:
       case TypeKind::DictType:
       case TypeKind::ClassType:
       case TypeKind::TensorType:
-        // TODO: lookup cached contained types. this is kind of tricky
-        // because a List[Optional[T]] should still be
-        // List[Optional[Unshaped(T)]], however the getMutableType(Optional[T])
-        // == T
-        return unshapedType(type);
-      case TypeKind::OptionalType:
-        return getMutableType(type->castRaw<OptionalType>()->getElementType());
+        // TODO: Look up cached contained types. this is kind of tricky
+        // because a `List[Optional[T]]` should still be
+        // `List[Optional[Unshaped(T)]]`, but
+        // `mapTypeToAliasTypeSet(Optional[T])` should be `T`
+        return AliasTypeSet{unshapedType(type)};
+      case TypeKind::UnionType: {
+        AliasTypeSet mutable_types;
+        for (const TypePtr& inner :
+             type->expect<UnionType>()->containedTypes()) {
+          if (auto maybe_inner_types = mapTypeToAliasTypeSet(inner)) {
+            mutable_types.insert(
+                mutable_types.end(),
+                (*maybe_inner_types).begin(),
+                (*maybe_inner_types).end());
+          }
+        }
+        if (mutable_types.size() == 0) {
+          return c10::nullopt;
+        }
+        return mutable_types;
+      }
+      case TypeKind::OptionalType: {
+        auto inner = type->castRaw<OptionalType>()->getElementType();
+        return mapTypeToAliasTypeSet(inner);
+      }
       case TypeKind::AnyType:
-        return type;
+        return {AliasTypeSet{type}};
       case TypeKind::FutureType: {
-        if (auto elem =
-                getMutableType(type->castRaw<FutureType>()->getElementType())) {
-          return FutureType::create(*elem);
+        if (auto maybe_mut_types = mapTypeToAliasTypeSet(
+                type->castRaw<FutureType>()->getElementType())) {
+          auto mut_type = toSingleType(*maybe_mut_types);
+          return {AliasTypeSet{FutureType::create(mut_type)}};
         }
         return c10::nullopt;
       }
       case TypeKind::TupleType: {
         std::vector<TypePtr> mutable_types;
-        for (const auto& elem : type->expectRef<TupleType>().elements()) {
-          if (auto mut_elem = getMutableType(elem)) {
-            mutable_types.push_back(*mut_elem);
+        for (const TypePtr& inner : type->expectRef<TupleType>().elements()) {
+          if (auto maybe_inner_types = mapTypeToAliasTypeSet(inner)) {
+            mutable_types.insert(
+                mutable_types.end(),
+                (*maybe_inner_types).begin(),
+                (*maybe_inner_types).end());
           }
         }
         if (mutable_types.size() == 0) {
           return c10::nullopt;
-        } else {
-          return TupleType::create(mutable_types);
         }
+        return {AliasTypeSet{TupleType::create(mutable_types)}};
       }
       default:
         return c10::nullopt;
     }
   }
-  std::unordered_map<TypePtr, TypePtr>* mutable_type_cache_;
+  std::unordered_map<TypePtr, AliasTypeSet>* mutable_type_cache_;
 };
 
 bool isMutableTypeImpl(
     const TypePtr& type,
-    std::unordered_map<TypePtr, TypePtr>* mutable_type_cache) {
-  // check common cases to avoid recursively constructing type in
-  // getMutableTypePtrImpl
+    std::unordered_map<TypePtr, AliasTypeSet>* mutable_type_cache) {
+  // Check common cases to avoid recursively constructing type in
+  // `mapTypeToAliasTypeSetPtrImpl`
   auto kind = type->kind();
   if (kind == TypeKind::TensorType || kind == TypeKind::ListType ||
       kind == TypeKind::ClassType || kind == TypeKind::DictType) {
     return true;
   }
   MutableTypePtrHelper helper(mutable_type_cache);
-  return helper.getMutableType(type) != c10::nullopt;
+  return helper.mapTypeToAliasTypeSet(type) != c10::nullopt;
 }
 
 } // namespace
 
-// static isMutableType does not use cache of type -> mutable type equivalent
+// Static `isMutableType` does not use cache of type -> mutable type equivalent
 bool AliasDb::isMutableType(const TypePtr& type) {
   return isMutableTypeImpl(type, nullptr);
 }
@@ -109,7 +154,7 @@ bool AliasDb::isMutableType(const Value* v) {
   return isMutableType(v->type());
 }
 
-// makes use of type -> mutable cache
+// Make use of type -> mutable cache
 bool AliasDb::isMutableTypeInternal(const TypePtr& type) const {
   return isMutableTypeImpl(type, &mapped_mutable_types_);
 }
@@ -118,21 +163,17 @@ bool AliasDb::isMutableTypeInternal(const Value* v) const {
   return isMutableTypeInternal(v->type());
 }
 
-c10::optional<TypePtr> AliasDb::getMutableTypePtr(const TypePtr& type) const {
+c10::optional<AliasTypeSet> AliasDb::mapTypeToAliasTypeSetPtr(
+    const TypePtr& type) const {
   MutableTypePtrHelper helper(&mapped_mutable_types_);
-  return helper.getMutableType(type);
-}
-
-bool AliasDb::isContainerType(const TypePtr& type) const {
-  auto mut_type = getMutableTypePtr(type);
-  return mut_type && (*mut_type)->containedTypes().size() > 0;
+  return helper.mapTypeToAliasTypeSet(type);
 }
 
 AliasDb::~AliasDb() = default;
 
-// Structure used during analysis to keeps track of all writes at a high level.
-// When analysis is completed this will be used to construct a more efficient
-// WriteIndex.
+// Structure used during analysis to keep track of all writes at a high
+// level. When the analysis is completed, this will be used to construct
+// a more efficient WriteIndex
 struct AliasDb::WriteRegistry {
   void registerWrite(const Value* v, Node* n) {
     writes_[n].emplace_back(v);
@@ -170,7 +211,7 @@ AliasDb::AliasDb(std::shared_ptr<Graph> graph, bool isFrozen)
   writeIndex_ = TWriteIndex();
   auto& writeIndex = *writeIndex_; // to make operator[] less ugly
 
-  // build the write index
+  // Build the write index
   for (const auto& write : writeRegistry_->writes_) {
     Node* node = write.first;
     const std::vector<const Value*> writtenValues = write.second;
@@ -207,7 +248,7 @@ AliasDb::AliasDb(std::shared_ptr<Graph> graph, bool isFrozen)
   // out of sync (since we have no way of registering new writes)
   writeRegistry_ = nullptr;
 
-  // initialize the write cache
+  // Initialize the write cache
   buildWrittenToLocationsIndex();
   GRAPH_DEBUG(toString());
 }
@@ -324,10 +365,10 @@ MemoryLocations AliasDb::getReads(Node* n) const {
 
 std::string AliasDb::getElementName(const Element* e) const {
   if (e->values.empty()) {
-    // not the most efficient way, but given the fact there are
+    // Not the most efficient way, but given the fact there are
     // not too many types and even fewer of them will end up in
-    // wildcardIndex_, we should be fine with a linear search
-    // each time we hit a wildcard leaf
+    // `wildcardIndex_`, we should be fine with a linear search
+    // each time we hit a Wildcard leaf
     for (const auto& ent : wildcardIndex_) {
       if (ent.second == e) {
         return std::string("WILDCARD for type ") + ent.first->str();
@@ -362,17 +403,27 @@ std::string AliasDb::toString() const {
   ss << "\n===2. ALIAS DB===\n";
   for (const auto& ptrPair : elementMap_) {
     const auto element = ptrPair.second;
+    int ct = 0;
     if (!element->pointsTo.empty()) {
       ss << getElementName(element) << " points to: ";
       for (const auto pointedTo : element->pointsTo) {
-        ss << getElementName(memoryDAG_->fromIndex(pointedTo)) << ", ";
+        if (ct > 0) {
+          ss << ", ";
+        }
+        ++ct;
+        ss << getElementName(memoryDAG_->fromIndex(pointedTo));
       }
       ss << "\n";
     }
+    ct = 0;
     if (!element->containedElements.empty()) {
       ss << getElementName(element) << " contains: ";
       for (const auto contained : element->containedElements) {
-        ss << getElementName(memoryDAG_->fromIndex(contained)) << ", ";
+        ss << getElementName(memoryDAG_->fromIndex(contained));
+        if (ct > 0) {
+          ss << ", ";
+        }
+        ++ct;
       }
       ss << "\n";
     }
@@ -839,8 +890,7 @@ void AliasDb::analyzeLoop(Node* node) {
   TORCH_INTERNAL_ASSERT(blockOutputs.size() == node->outputs().size());
 
   // Run alias analysis on the loop body, iterating until the block output
-  // alias info converges.
-  // Copy node input aliases to block input
+  // alias info converges. Copy node input aliases to block input
   mapAliases(blockInputs, loopCarriedInputs);
 
   // Populate block output alias info by analyzing the body
@@ -996,7 +1046,7 @@ bool AliasDb::functionalNonEscapingListUse(const Use& use) const {
   return false;
 }
 
-// List or dict or tuple: construct: create an aliasing element for the actual
+// List or dict or tuple construct: create an aliasing element for the actual
 // container, then mark all inputs as wildcards, since they've gone inside the
 // container. Then, add the wildcard sets of appropriate type to the contained
 // elements of the container.
@@ -1073,52 +1123,50 @@ void AliasDb::makePointerTo(const Value* from, const Value* to) {
     return;
   }
 
-  // the contained types of immutable type containers (optional, tuple, future)
-  // are unified, so these types can be mutable or immutable
-  // and point to a type which is mutable or immutable.
-  // Any is mutable but can point to an immutable type through refinement
+  // The contained types of immutable type containers (`Optional`,
+  // `Tuple`, `Future`, and `Union`) are unified, so these types can be
+  // mutable or immutable and point to a type which is mutable or
+  // immutable. `Any` is mutable but can point to an immutable type
+  // through refinement
   if (isMutableTypeInternal(from) != isMutableTypeInternal(to)) {
     bool expected_kind = false;
     for (auto kind : {from->type()->kind(), to->type()->kind()}) {
       expected_kind = expected_kind ||
           (kind == TypeKind::OptionalType || kind == TypeKind::FutureType ||
-           kind == TypeKind::TupleType) // immutable type containers
+           kind == TypeKind::TupleType ||
+           kind == TypeKind::UnionType) // immutable type containers
           || kind == TypeKind::AnyType;
     }
     TORCH_INTERNAL_ASSERT(
         expected_kind, from->type()->str(), to->type()->str());
     return;
   }
-
   // both immutable
   if (!isMutableTypeInternal(from)) {
     return;
   }
-
   if (from == to) {
     return;
   }
 
-  // At this point, we are dealing with two mutable types.
-  auto fromEl = getOrCreateElement(from);
-  auto toEl = getOrCreateElement(to);
+  // At this point, we are dealing with two mutable types
+  auto from_el = getOrCreateElement(from);
+  auto to_el = getOrCreateElement(to);
 
-  memoryDAGBuilder_->makePointerTo(fromEl, toEl);
+  memoryDAGBuilder_->makePointerTo(from_el, to_el);
 }
 
 void AliasDb::addToContainedElements(
-    const Value* elem,
+    const Value* inner,
     const Value* container) {
-  if (!isMutableTypeInternal(elem)) {
+  if (!isMutableTypeInternal(inner)) {
     return;
   }
 
-  TORCH_INTERNAL_ASSERT(isContainerType(container->type()));
-
-  auto elemEl = getOrCreateElement(elem);
-  auto contEl = getOrCreateElement(container);
+  auto inner_el = getOrCreateElement(inner);
+  auto cont_el = getOrCreateElement(container);
 
-  memoryDAGBuilder_->addToContainedElements(elemEl, contEl);
+  memoryDAGBuilder_->addToContainedElements(inner_el, cont_el);
 }
 
 bool AliasDb::mayAlias(const Value* a, const Value* b) const {
@@ -1203,8 +1251,8 @@ void AliasDb::createValue(const Value* value) {
 void AliasDb::giveFreshAlias(
     const Value* value,
     bool add_wildcard_to_contained_elems) {
-  auto maybe_mut_type = getMutableTypePtr(value->type());
-  if (!maybe_mut_type) {
+  auto maybe_mut_types = mapTypeToAliasTypeSetPtr(value->type());
+  if (!maybe_mut_types) {
     return;
   }
 
@@ -1217,7 +1265,11 @@ void AliasDb::giveFreshAlias(
   auto new_elem = memoryDAGBuilder_->makeFreshValue(value);
   elementMap_[value] = new_elem;
   if (add_wildcard_to_contained_elems) {
-    addContainedTypesToFreshElement(new_elem, *maybe_mut_type);
+    if ((*maybe_mut_types).size() > 1) {
+      pointUnionTypeElementToAllContainedTypes(new_elem, *maybe_mut_types);
+    } else {
+      addContainedTypesToFreshElement(new_elem, *maybe_mut_types);
+    }
   }
 }
 
@@ -1639,29 +1691,47 @@ bool AliasDb::mayAliasWildcard(const at::ArrayRef<Value*> vs) const {
 }
 
 c10::optional<Element*> AliasDb::tryGetOrCreateWildcard(const TypePtr& type) {
-  auto updated_type = getMutableTypePtr(type);
-  if (!updated_type) {
+  auto maybe_mut_types = mapTypeToAliasTypeSetPtr(type);
+  if (!maybe_mut_types) {
     return c10::nullopt;
   }
-  auto mapped_type = *updated_type;
-  auto existing_wildcard = wildcardIndex_.find(mapped_type);
+  auto mut_type = toSingleType(*maybe_mut_types);
+  auto existing_wildcard = wildcardIndex_.find(mut_type);
   if (existing_wildcard != wildcardIndex_.end()) {
     return existing_wildcard->second;
   }
 
   auto wildcard_elem = memoryDAGBuilder_->makeFreshValue(nullptr);
-  wildcardIndex_.emplace(mapped_type, wildcard_elem);
-  addContainedTypesToFreshElement(wildcard_elem, mapped_type);
+  wildcardIndex_.emplace(mut_type, wildcard_elem);
+  if ((*maybe_mut_types).size() > 1) {
+    pointUnionTypeElementToAllContainedTypes(wildcard_elem, *maybe_mut_types);
+  } else {
+    addContainedTypesToFreshElement(wildcard_elem, *maybe_mut_types);
+  }
   return wildcard_elem;
 }
 
-void AliasDb::addContainedTypesToFreshElement(
+void AliasDb::pointUnionTypeElementToAllContainedTypes(
     Element* container_elem,
-    const TypePtr& mut_type) {
-  for (const auto& contained : mut_type->containedTypes()) {
-    auto maybe_elem = tryGetOrCreateWildcard(contained);
+    const AliasTypeSet& mut_types) {
+  for (const auto& mut_type : mut_types) {
+    auto maybe_elem = tryGetOrCreateWildcard(mut_type);
     if (maybe_elem) {
-      memoryDAGBuilder_->addToContainedElements(*maybe_elem, container_elem);
+      TORCH_INTERNAL_ASSERT(*maybe_elem != container_elem);
+      memoryDAGBuilder_->makePointerTo(container_elem, *maybe_elem);
+    }
+  }
+}
+
+void AliasDb::addContainedTypesToFreshElement(
+    Element* container_elem,
+    const AliasTypeSet& mut_types) {
+  for (const auto& mut_type : mut_types) {
+    for (const auto& contained : mut_type->containedTypes()) {
+      auto maybe_elem = tryGetOrCreateWildcard(contained);
+      if (maybe_elem) {
+        memoryDAGBuilder_->addToContainedElements(*maybe_elem, container_elem);
+      }
     }
   }
 }
@@ -1669,26 +1739,38 @@ void AliasDb::addContainedTypesToFreshElement(
 // Search the wildcard index for an element that corresponds to the given type.
 // Const version returns nullptr
 Element* AliasDb::getWildcard(const TypePtr& type) const {
-  auto maybe_mut_type = getMutableTypePtr(type);
-  if (!maybe_mut_type) {
-    return nullptr;
-  }
-  TypePtr mut_type = *maybe_mut_type;
-  auto wildcard = wildcardIndex_.find(mut_type);
-  if (wildcard != wildcardIndex_.end()) {
-    return wildcard->second;
+  auto maybe_mut_types = mapTypeToAliasTypeSetPtr(type);
+  if (!maybe_mut_types) {
+    return {};
+  }
+  if ((*maybe_mut_types).size() > 1) {
+    auto union_type = UnionType::create(*maybe_mut_types);
+    // Get a <TypePtr, Element*> pair where the TypePtr is this Union
+    // type and the Element is the corresponding Wildcard
+    auto maybe_union_pair = wildcardIndex_.find(union_type);
+    if (maybe_union_pair != wildcardIndex_.end()) {
+      return (*maybe_union_pair).second;
+    }
+  } else {
+    // Get a <TypePtr, Element*> pair where the TypePtr is the given
+    // type and the Element is the corresponding Wildcard
+    auto type_pair = wildcardIndex_.find((*maybe_mut_types)[0]);
+    if (type_pair != wildcardIndex_.end()) {
+      return type_pair->second;
+    }
   }
-  return nullptr;
+  return {};
 }
 
 // Register `v` as a wildcard value.
 c10::optional<Element*> AliasDb::setWildcard(const Value* v) {
-  auto maybe_wildcardElement = tryGetOrCreateWildcard(v->type());
+  c10::optional<Element*> maybe_wildcardElement =
+      tryGetOrCreateWildcard(v->type());
   if (!maybe_wildcardElement) {
     return c10::nullopt;
   }
-  // Ensure that we create a corresponding element for `v` still, as it is an
-  // invariant that all mutable values have an element.
+  // Ensure that we create a corresponding Element for `v` still, as it is an
+  // invariant that all mutable values have an Element
   getOrCreateElement(v);
   wildcards_.insert(v);
   return *maybe_wildcardElement;
diff --git a/torch/csrc/jit/ir/alias_analysis.h b/torch/csrc/jit/ir/alias_analysis.h
index cd888ade69291..7feb2b9938d8b 100644
--- a/torch/csrc/jit/ir/alias_analysis.h
+++ b/torch/csrc/jit/ir/alias_analysis.h
@@ -34,6 +34,12 @@ namespace jit {
  * Values that contain other mutable types, such as List[Tensor], are
  * initialized as containing the Wildcard set for all contained mutable types.
  *
+ * The AliasDb API references the idea of "mutable" vs "immutable"
+ * types. "Mutable" means that the object's value can change, while
+ * "immutable" means that the value is fixed. (For example, `List` is
+ * mutable, so you can add and delete elements from it. On the other
+ * hand, you can't modify a Tuple once you create it, making `Tuple` an
+ * immutable container.)
  */
 class AliasDb {
  public:
@@ -95,7 +101,7 @@ class AliasDb {
       const at::ArrayRef<Value*>& a,
       const at::ArrayRef<Value*>& b) const;
 
-  // Move 'n' (already in the graph) after 'movePoint' in the topological order.
+  // Move `n` (already in the graph) after `movePoint` in the topological order.
   //
   // Tries to preserve value dependencies, so other nodes might be moved. We
   // make two guarantees about the postcondition of the node list:
@@ -125,6 +131,10 @@ class AliasDb {
   TORCH_API bool dumpToGraphvizFile(const char* filename) const;
   TORCH_API std::string toGraphviz() const;
 
+  // Returns `true` if the given element is mutable or if it is a
+  // container type with an internal mutable element (e.g.
+  // `Tuple[int, Tensor]` has an internal mutable type `Tensor`, so
+  // it would be considered a "mutable type" in AliasDb)
   static bool isMutableType(const Value* v);
   static bool isMutableType(const TypePtr& type);
 
@@ -181,7 +191,7 @@ class AliasDb {
   // Register `v` as a wildcard value.
   c10::optional<Element*> setWildcard(const Value* v);
 
-  // Is this a value which will not alias
+  // Is this a value which will not alias?
   bool nonAliasingValue(const Value* elem) const;
 
   /**
@@ -221,11 +231,10 @@ class AliasDb {
       bool add_wildcard_to_contained_elems = true);
   Element* getOrCreateElement(const Value* value);
 
-  c10::optional<TypePtr> getMutableTypePtr(const TypePtr& type) const;
+  c10::optional<AliasTypeSet> mapTypeToAliasTypeSetPtr(
+      const TypePtr& type) const;
   bool functionalNonEscapingListUse(const Use& use) const;
 
-  bool isContainerType(const TypePtr& type) const;
-
   std::shared_ptr<Graph> graph_;
 
   // If the Module is frozen then consider attributes as freshly created
@@ -239,21 +248,24 @@ class AliasDb {
 
   // Mapping of values to MemoryDAG elements
   ska::flat_hash_map<const Value*, Element*> elementMap_;
-  // All wildcard elements (one for each unique mutable type).
+  // All wildcard Elements (one for each unique mutable type)
   std::unordered_map<TypePtr, Element*, HashType, EqualType> wildcardIndex_;
   Element* getWildcard(const TypePtr& type) const;
   c10::optional<Element*> tryGetOrCreateWildcard(const TypePtr& type);
   void addContainedTypesToFreshElement(
       Element* container_elem,
-      const TypePtr& mut_type);
+      const AliasTypeSet& mut_types);
+  void pointUnionTypeElementToAllContainedTypes(
+      Element* container_elem,
+      const AliasTypeSet& mut_types);
 
   std::vector<Element*> getElements(at::ArrayRef<Value*> vs) const;
   bool mayAliasWildcard(const Value* v) const;
   bool mayAliasWildcard(const at::ArrayRef<Value*> vs) const;
   bool hasWriters(const at::ArrayRef<Value*>& values) const;
 
-  // cached mapping of type ptrs to their mutable types
-  mutable std::unordered_map<TypePtr, TypePtr> mapped_mutable_types_;
+  // Cached mapping of type ptrs to their mutable types
+  mutable std::unordered_map<TypePtr, AliasTypeSet> mapped_mutable_types_;
 
   /**
    * State for tracking write info.
diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp
index 05ce8d40ea7c5..e62ef93b57379 100644
--- a/torch/csrc/jit/ir/ir.cpp
+++ b/torch/csrc/jit/ir/ir.cpp
@@ -511,7 +511,7 @@ void Graph::lint() const {
   // - Params and return do NOT occur in nodes
   // - next_unique_ is greater than all uniques in graph
   // - uniques in all_nodes are unique
-  // - every use will occur later in the topsort
+  // - every use will occur later in the toposort
 
   struct LintScope {
     LintScope() = default;
@@ -787,7 +787,9 @@ bool Value::mustBeNone() const {
 }
 bool Value::mustNotBeNone() const {
   return node_->kind() != prim::AutogradAdd && type() != NoneType::get() &&
-      !type()->cast<OptionalType>();
+      !type()->cast<OptionalType>() &&
+      !(type()->cast<UnionType>() &&
+        type()->expect<UnionType>()->canHoldType(NoneType::get()));
 }
 
 std::string Value::debugNameBase() const {
@@ -1765,20 +1767,23 @@ Node* Graph::createEnumValue(Value* e) {
   return n;
 }
 
-Node* Graph::createList(const TypePtr& elem_type, at::ArrayRef<Value*> values) {
+Node* Graph::createList(
+    const TypePtr& contained_type,
+    at::ArrayRef<Value*> values) {
   auto n = create(prim::ListConstruct, values);
   for (const auto& v : values) {
     TORCH_CHECK(
-        v->type()->isSubtypeOf(elem_type),
+        v->type()->isSubtypeOf(contained_type),
         "Expected a list element that subtypes '",
-        elem_type->repr_str(),
+        contained_type->repr_str(),
         "' but got an element of type '",
         v->type()->repr_str(),
         "'");
   }
-  n->output()->setType(ListType::create(elem_type));
+  n->output()->setType(ListType::create(contained_type));
   return n;
 }
+
 Node* Graph::createListUnpack(Value* v, size_t size) {
   ListTypePtr list_type = v->type()->expect<ListType>();
   TypePtr elem_type = list_type->getElementType();
diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h
index dee222bd480df..99f6a6ce5c57b 100644
--- a/torch/csrc/jit/ir/ir.h
+++ b/torch/csrc/jit/ir/ir.h
@@ -84,7 +84,7 @@ using namespace ::c10::cuda;
 struct Function;
 struct MatchedSchema;
 
-// Graph represents one "function" of computation.
+// A Graph represents one "function" of computation.
 // It uses a simple ownership model where the graph owns all the nodes inside
 // it. All references inside the graph are raw pointers. Destroying the Graph
 // will invalidate any pointers to nodes in the graph.
@@ -104,9 +104,9 @@ TORCH_API std::ostream& operator<<(std::ostream& out, const Node& n);
 // A list of nodes, with inputs and outputs
 struct Block;
 
-// Each use is represented by this type, see Node::uses()
-// 'user' is the consumer of the value, offset is the index into
-// 'user's input this where the produces will be found.
+// Each use is represented by this type, see 'Node::uses()'
+// 'user' is the consumer of the value, 'offset' is the index into
+// 'user's input this where the producers will be found.
 struct Use {
   Use(Node* user, size_t offset) : user(user), offset(offset) {}
   Node* user;
@@ -338,14 +338,16 @@ struct TORCH_API Node {
  protected:
   Node(Graph* graph_, NodeKind kind_); // defined after graph
  public:
-  // each node but Return/Param
-  // is associated with exactly one place in the node list...
-  // of the graph_
-  // this circular is a doubly-linked list, the Return node is used as the
-  // sentinel for the beginning and end of the list such that the list never has
-  // null pointers next_in_graph[0] is next pointer next_in_graph[1] is prev
-  // pointer using an array to allow the same iterator class for forward and
-  // reverse node lists This list represents a topological sort
+  // Each Node but Return/Param Nodes are associated with exactly one
+  // place in the Node list of the Graph. The Graph itself is a circular
+  // doubly-linked list. The Return Node is used as the sentinel for the
+  // "beginning"/"end" of the list. This means that you can tell when
+  // you've traversed the entire list without means worrying about null
+  // pointers. `next_in_graph[0]` is the pointer to the next Node, while
+  // `next_in_graph[1]` is the pointer to the previous Node. The
+  // linked list is implemented as an array to allow the same iterator
+  // class for forward and reversed Node lists. Taken together, this
+  // list also represents a topological sort of the Nodes in the Graph.
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-non-private-member-variables-in-classes,modernize-avoid-c-arrays)
   Node* next_in_graph[2] = {nullptr, nullptr};
 
@@ -980,7 +982,6 @@ struct TORCH_API Node {
   // subclasses should extend if they have additional information to copy.
   // 'this' will be allocated with s->allocNewInstance(g) so it should have
   // the same concrete type as 's'
-  //
   virtual void cloneFrom(Node* s);
 };
 
@@ -1247,7 +1248,7 @@ struct Graph {
   TORCH_API Node* createEnumName(Value* e);
   TORCH_API Node* createEnumValue(Value* e);
   TORCH_API Node* createList(
-      const TypePtr& elem_type,
+      const TypePtr& contained_type,
       at::ArrayRef<Value*> values);
   TORCH_API Node* createListUnpack(Value* v, size_t size);
   TORCH_API Node* createDict(
diff --git a/torch/csrc/jit/mobile/type_parser.cpp b/torch/csrc/jit/mobile/type_parser.cpp
index 42814e5fe5aad..6b955ab6454a7 100644
--- a/torch/csrc/jit/mobile/type_parser.cpp
+++ b/torch/csrc/jit/mobile/type_parser.cpp
@@ -42,6 +42,17 @@ class TypeParser {
       return simpleTypeIt->second;
     } else if (token == "List") {
       return CreateSingleElementType<ListType>();
+    } else if (token == "Union") {
+      std::vector<TypePtr> types;
+      expect("[");
+      while (cur() != "]") {
+        types.emplace_back(parse());
+        if (cur() != "]") {
+          expect(",");
+        }
+      }
+      expect("]");
+      return UnionType::create(types);
     } else if (token == "Optional") {
       return CreateSingleElementType<OptionalType>();
     } else if (token == "Future") {
diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp
index 5e13829a8ce6d..c74c6ee40221a 100644
--- a/torch/csrc/jit/passes/shape_analysis.cpp
+++ b/torch/csrc/jit/passes/shape_analysis.cpp
@@ -288,6 +288,24 @@ class ShapePropagator {
     return zerodim;
   }
 
+  bool mergeTypes(
+      ArrayRef<Value*> lhs,
+      ArrayRef<Value*> rhs,
+      ArrayRef<Value*> outputs) {
+    AT_ASSERT(lhs.size() == rhs.size() && rhs.size() == outputs.size());
+    bool changed = false;
+    for (size_t i = 0; i < lhs.size(); ++i) {
+      auto old_output_type = outputs[i]->type();
+      auto new_type =
+          unifyTypes(lhs[i]->type(), rhs[i]->type(), /*default_to_union=*/true);
+      AT_ASSERT(new_type);
+      outputs[i]->setType(*new_type);
+      if (*old_output_type != *outputs[i]->type())
+        changed = true;
+    }
+    return changed;
+  }
+
   void broadcastBinary(
       Node* node,
       std::vector<TensorTypePtr>& types,
diff --git a/torch/csrc/jit/passes/utils/memory_dag.cpp b/torch/csrc/jit/passes/utils/memory_dag.cpp
index 6a880c86e4102..3f6cc8079b6f9 100644
--- a/torch/csrc/jit/passes/utils/memory_dag.cpp
+++ b/torch/csrc/jit/passes/utils/memory_dag.cpp
@@ -8,6 +8,7 @@
 namespace torch {
 namespace jit {
 namespace {
+
 void makePointerToImpl(Element* from, Element* to) {
   from->pointsTo.set(to->index);
   to->pointedFrom.set(from->index);
@@ -131,11 +132,13 @@ Element* MemoryDAGBuilder::makeFreshValue(const Value* v) {
   return makeFreshValueImpl(v, indexToElementMap_);
 }
 
+// This function builds up a bitset representing the "alias set" for
+// `e` (`MemoryLocations` is just a typedef'd c10::SparseBitVector).
 const MemoryLocations& MemoryDAG::getMemoryLocations(const Element* e) const {
   // Note on cache invalidation: all mutation should occur through
-  // MemoryDAGBuilder. Thus, once we consume the builder to create an immutable
-  // MemoryDAG, we can cache here without worrying that we might potentially get
-  // invalidated.
+  // MemoryDAGBuilder. Thus, once we consume the builder to create an
+  // immutable MemoryDAG, we can cache here without worrying that we
+  // might potentially get invalidated.
   if (e->cachedMemoryLocations_) {
     return *e->cachedMemoryLocations_;
   }
@@ -174,7 +177,6 @@ void MemoryDAG::setWildcards(
         makePointerToImpl(from, wildcardElement);
       }
     }
-
     // Track which memory locations we edited with a new pointer to the wildcard
     // element.
     cacheUpdates[wildcardElement] |= pointeeSet;
@@ -189,7 +191,6 @@ void MemoryDAG::setWildcards(
   for (const std::unique_ptr<Element>& e : this->indexToElementMap_) {
     if (e->values.empty()) {
       // This element is a wildcard element, we can skip it.
-      TORCH_INTERNAL_ASSERT(e->pointsTo.empty());
       continue;
     }
 
diff --git a/torch/csrc/jit/passes/utils/memory_dag.h b/torch/csrc/jit/passes/utils/memory_dag.h
index 38432ff69c9c1..3e3a19c31729c 100644
--- a/torch/csrc/jit/passes/utils/memory_dag.h
+++ b/torch/csrc/jit/passes/utils/memory_dag.h
@@ -1,9 +1,12 @@
 #pragma once
 
+#include <ATen/core/jit_type.h>
 #include <c10/util/ArrayRef.h>
 #include <c10/util/Optional.h>
 #include <c10/util/flat_hash_map.h>
 #include <c10/util/sparse_bitset.h>
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/ir/type_hashing.h>
 #include <memory>
 #include <unordered_map>
 #include <unordered_set>
@@ -20,6 +23,9 @@ struct Element;
 struct Value;
 class MemoryDAG;
 
+using TypePtr = std::shared_ptr<c10::Type>;
+using AliasTypeSet = std::vector<TypePtr>;
+
 /**
  * Helper to build up the points-to graph.
  *
@@ -38,13 +44,15 @@ class TORCH_API MemoryDAGBuilder {
 
   void addToContainedElements(Element* contained, Element* container);
 
-  // Make a fresh element (i.e. an element that doesn't point to anything) and
+  // Make a fresh Element (i.e. an Element that doesn't point to anything) and
   // return it.
   Element* makeFreshValue(const Value* v);
 
   friend MemoryDAG;
 
  private:
+  // `MemoryDAGBuilder` builds up `indexToElementMap_`, then uses
+  // the map to construct the `MemoryDAG`
   std::vector<std::unique_ptr<Element>> indexToElementMap_;
 };
 
@@ -54,8 +62,8 @@ class TORCH_API MemoryDAGBuilder {
 // AliasDb to provide a higher-level API.
 //
 // We maintain a DAG where:
-//   - Vertices (called "elements") represent values and
-//     other aliasing entities (e.g. like the stuff inside a list)
+//   - Vertices (called "Elements") represent Values and
+//     other aliasing entities (e.g. the stuff inside a list)
 //   - Edges represent a "points-to" relationship.
 //
 // Leaves in this DAG are entities that don't point to anything, and thus
@@ -80,7 +88,7 @@ class TORCH_API MemoryDAG {
   bool mayAlias(const Element* a, const Element* b) const;
   bool mayAlias(Element* a, Element* b) const;
 
-  // Does a hold reference to any memory that is stored in elem, or vice versa?
+  // Does `a` hold reference to any memory that is stored in `b`, or vice versa?
   bool mayContainAlias(const Element* a, const Element* b) const;
   bool mayContainAlias(Element* a, Element* b) const;
 
@@ -96,12 +104,13 @@ class TORCH_API MemoryDAG {
       MemoryLocations& cont) const;
 
   /**
-   * The following methods are special cases where we need to reach mutate the
+   * The following methods are special cases where we need to mutate the
    * internals of MemoryDAG for efficiency reasons. Don't call them unless you
    * know what you're doing! In particular, don't add new mutating methods
    * without ensuring that you are maintaining cache consistency for memory
    * locations.
    */
+
   // Adding wildcards can trigger extremely expensive cache invalidations. This
   // method adds them in a more efficient cache-aware way.
   void setWildcards(
@@ -117,9 +126,10 @@ class TORCH_API MemoryDAG {
   std::vector<std::unique_ptr<Element>> indexToElementMap_;
 };
 
-// `Element` represents the vertex in the points-to graph. It represents
-// anything that could have an aliasing relationship, mostly IR `Value`s, but
-// also the "inside of a list", or wildcards.
+// `Element` represents a vertex in the points-to graph. It represents
+// anything that could have an aliasing relationship--mostly IR
+// `Value`s, but also wildcards or the type inside a container (e.g. `T`
+// in `List[T]`)
 struct Element {
   Element(const Value* value_, unsigned index_);
   // wildcard constructor
diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp
index f81632bc0fb0a..f8fae19ed8f50 100644
--- a/torch/csrc/jit/python/pybind_utils.cpp
+++ b/torch/csrc/jit/python/pybind_utils.cpp
@@ -89,6 +89,19 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional<int32_t> N) {
           ? c10::ivalue::Tuple::createNamed(std::move(values), tuple_type)
           : c10::ivalue::Tuple::create(std::move(values));
     }
+    case TypeKind::UnionType: {
+      auto actual_type = toTypeInferredIValue(obj);
+      auto actual_type_ptr = actual_type.type();
+      auto union_type = type->expect<UnionType>();
+      if (!actual_type_ptr->isSubtypeOf(union_type)) {
+        throw py::cast_error(c10::str(
+            "Expected a member of ",
+            union_type->annotation_str(),
+            " but instead found type ",
+            actual_type.type()->annotation_str()));
+      }
+      return actual_type;
+    }
     case TypeKind::StringType:
       return ConstantString::create(py::cast<std::string>(obj));
     case TypeKind::DeviceObjType: {
diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp
index e0951c3ebbfbc..2c8246daec92b 100644
--- a/torch/csrc/jit/python/python_ir.cpp
+++ b/torch/csrc/jit/python/python_ir.cpp
@@ -869,6 +869,12 @@ void initPythonIRBindings(PyObject* module_) {
         }
         return types;
       });
+  py::class_<UnionType, Type, std::shared_ptr<UnionType>>(m, "UnionType")
+      .def(py::init(
+          [](const std::vector<TypePtr>& a) { return UnionType::create(a); }))
+      .def("containedTypes", [](UnionType& self) {
+        return self.containedTypes().vec();
+      });
   py::class_<ListType, Type, std::shared_ptr<ListType>>(m, "ListType")
       .def(py::init([](TypePtr a) { return ListType::create(a); }))
       .def_static("ofInts", &ListType::ofInts)
diff --git a/torch/csrc/jit/serialization/import.cpp b/torch/csrc/jit/serialization/import.cpp
index 1f70c3cad8a5e..86aa6e3909e14 100644
--- a/torch/csrc/jit/serialization/import.cpp
+++ b/torch/csrc/jit/serialization/import.cpp
@@ -47,7 +47,8 @@ void postSetStateValidate(const IValue& v) {
     // const auto attrType = objType->getAttribute(i);
     // Verify that all the non-optional attributes have been initialized
     // TODO: Issue #20497
-    if (attrType->kind() != TypeKind::OptionalType &&
+    if (attrType->kind() != TypeKind::UnionType &&
+        attrType->kind() != TypeKind::OptionalType &&
         attrType->kind() != TypeKind::NoneType) {
       TORCH_CHECK(
           !slot.isNone(),
diff --git a/torch/csrc/jit/serialization/import_source.cpp b/torch/csrc/jit/serialization/import_source.cpp
index e7d9da26df41d..918b0d4338c73 100644
--- a/torch/csrc/jit/serialization/import_source.cpp
+++ b/torch/csrc/jit/serialization/import_source.cpp
@@ -482,12 +482,13 @@ void SourceImporterImpl::importClass(
       } break;
       case TK_DEF: {
         Def def = Def(statement);
-        if (pre_hook_names.find(def.name().name()) != pre_hook_names.end()) {
-          pre_hook_def_map.emplace(def.name().name(), def);
-          pre_hook_resolver_map.emplace(def.name().name(), shared_from_this());
-        } else if (hook_names.find(def.name().name()) != hook_names.end()) {
-          hook_def_map.emplace(def.name().name(), def);
-          hook_resolver_map.emplace(def.name().name(), shared_from_this());
+        const auto def_name = def.name().name();
+        if (pre_hook_names.find(def_name) != pre_hook_names.end()) {
+          pre_hook_def_map.emplace(def_name, def);
+          pre_hook_resolver_map.emplace(def_name, shared_from_this());
+        } else if (hook_names.find(def_name) != hook_names.end()) {
+          hook_def_map.emplace(def_name, def);
+          hook_resolver_map.emplace(def_name, shared_from_this());
         } else {
           methods.emplace_back(def);
           method_resolvers.push_back(shared_from_this());
diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp
index 80123c625ea65..6b1bf15304624 100644
--- a/torch/csrc/jit/serialization/python_print.cpp
+++ b/torch/csrc/jit/serialization/python_print.cpp
@@ -511,13 +511,31 @@ struct PythonPrintImpl {
     }
     indent();
     printValueList(body_, lhs);
+    // We need to preserve Union/Optional type annotations, but only if
+    // we're not assigning values as part of a tuple unpacking statement
+    // (Python doesn't allow type annotations in multiple assignment)
+    if (lhs.size() == 1) {
+      Value* v = lhs.at(0);
+      if (!annotated_unions_.count(v) && !expr_table_.count(v) &&
+          (v->type()->kind() == UnionType::Kind ||
+           v->type()->kind() == OptionalType::Kind)) {
+        body_ << " : " << v->type()->annotation_str();
+        annotated_unions_.insert(v);
+      }
+    }
     body_ << " = ";
+    // or if value is being assigned to something of a union type
     printValueList(body_, rhs);
     body_ << "\n";
   }
 
   bool requiresAnnotation(Value* lhs, Value* rhs) {
-    return *lhs->type() != *rhs->type();
+    if (lhs->type()->kind() == UnionType::Kind ||
+        lhs->type()->kind() == OptionalType::Kind) {
+      return annotated_unions_.insert(lhs).second;
+    } else {
+      return *lhs->type() != *rhs->type();
+    }
   }
 
   void printAnnotatedAssignment(
@@ -1302,10 +1320,12 @@ struct PythonPrintImpl {
         body_ << arg_name;
         if (print_first_argument_type) {
           body_ << ": " << arg.type()->annotation_str(type_printer_);
+          annotated_unions_.insert(*param_it);
         }
       } else {
         body_ << ",\n    " << arg_name << ": "
               << arg.type()->annotation_str(type_printer_);
+        annotated_unions_.insert(*param_it);
       }
       if (arg.default_value()) {
         printDefaultValue(arg, body_, *arg.default_value());
@@ -1559,6 +1579,12 @@ struct PythonPrintImpl {
   // table.
   PrintDepsTable& deps_table_;
 
+  // We need to preserve Union/Optional type annotations, but we should
+  // only print the annotation on variable declaration (not on any
+  // following uses). This set tracks the Value*s that we've already
+  // printed with annotations
+  std::unordered_set<Value*> annotated_unions_;
+
   // A function that, given a named type, returns us the correct string to print
   // for it.
   c10::TypePrinter type_printer_;
diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp
index b521dc88a12ba..e0e556ecbbde3 100644
--- a/torch/csrc/jit/serialization/unpickler.cpp
+++ b/torch/csrc/jit/serialization/unpickler.cpp
@@ -23,8 +23,8 @@ static void restoreAccurateTypeTagsIfPossible(const IValue& root) {
 
 // Pickled objects are stored in a form compatible with Python pickling.
 // In torchscript List[T]/Dict[K, V] are statically typed and contain
-// dynamic type tags allow T, K, and V to be recovered. But this info
-// is not stored in the Python pickling information. However, we
+// dynamic type tags that allow T, K, and V to be recovered. But this
+// info is not stored in the Python pickling information. However, we
 // can recover this information from the static type of the top-level
 // object being unpickled, because we have a record of the type of the
 // objects it contains as attributes.
@@ -108,6 +108,19 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) {
           to_process.emplace_back(std::move(elem));
         }
       } break;
+      case UnionType::Kind: {
+        auto t = w.static_type->expect<UnionType>();
+        if (t->containedTypes().size() == 2 &&
+            t->canHoldType(NoneType::get())) {
+          if (!w.value.isNone()) {
+            auto inner = t->containedTypes()[0] != NoneType::get()
+                ? t->containedTypes()[0]
+                : t->containedTypes()[1];
+            Work elem = {inner, w.value};
+            to_process.emplace_back(std::move(elem));
+          }
+        }
+      } break;
       case ListType::Kind: {
         // specialized lists do not need their type refined, so we can exit
         // early here
diff --git a/torch/jit/_script.py b/torch/jit/_script.py
index de32e1ab8de37..acc9e7c44f51f 100644
--- a/torch/jit/_script.py
+++ b/torch/jit/_script.py
@@ -449,7 +449,7 @@ def method_template(self, *args, **kwargs):
         setattr(RecursiveScriptClass, method_name, method_template)
 
     # this is a Python 'non-data descriptor' that causes the first access
-    # to ScriptModule's forward to lookup the forward method and stash
+    # to ScriptModule's forward to look up the forward method and stash
     # it in the objects dict. Due to the standard rules for attribute lookup,
     # subsequent lookups will just directly return the previously looked up method.
     # This is necessary because nn.Module defines forward as a method. If we
diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py
index f2cf78949b47d..b189f36c4107f 100644
--- a/torch/jit/annotations.py
+++ b/torch/jit/annotations.py
@@ -6,13 +6,13 @@
 import torch
 import warnings
 from .._jit_internal import List, Tuple, is_tuple, is_list, Dict, is_dict, Optional, \
-    is_optional, _qualified_name, Any, Future, is_future, is_ignored_fn
+    is_optional, _qualified_name, Any, Future, is_future, is_ignored_fn, Union, is_union
 from .._jit_internal import BroadcastingList1, BroadcastingList2, BroadcastingList3  # type: ignore[attr-defined]
 from ._state import _get_script_class
 
 from torch._C import TensorType, TupleType, FloatType, IntType, ComplexType, \
-    ListType, StringType, DictType, BoolType, OptionalType, InterfaceType, AnyType, NoneType, \
-    DeviceObjType, StreamObjType, FutureType, EnumType
+    ListType, StringType, DictType, BoolType, OptionalType, InterfaceType, AnyType, \
+    NoneType, DeviceObjType, StreamObjType, FutureType, EnumType, UnionType
 
 
 from textwrap import dedent
@@ -45,7 +45,8 @@ class EvalEnv(object):
         'List': List,
         'Dict': Dict,
         'Optional': Optional,
-        'Future': Future,
+        'Union': Union,
+        'Future': Future
     }
 
     def __init__(self, rcb):
@@ -245,6 +246,9 @@ def split_type_line(type_line):
 def try_real_annotations(fn, loc):
     """Tries to use the Py3.5+ annotation syntax to get the type."""
     try:
+        # Note: anything annotated as `Optional[T]` will automatically
+        # be returned as `Union[T, None]` per
+        # https://github.com/python/typing/blob/master/src/typing.py#L850
         sig = inspect.signature(fn)
     except ValueError:
         return None
@@ -276,7 +280,6 @@ def get_enum_value_type(e: Type[enum.Enum], loc):
     return torch._C.unify_type_list(ir_types)
 
 def is_tensor(ann):
-
     if issubclass(ann, torch.Tensor):
         return True
 
@@ -326,6 +329,19 @@ def try_ann_to_type(ann, loc):
         msg = "Unsupported annotation {} could not be resolved because {} could not be resolved."
         assert valid_type, msg.format(repr(ann), repr(contained))
         return OptionalType(valid_type)
+    if is_union(ann):
+        inner: List = []
+        # We need these extra checks because both `None` and invalid
+        # values will return `None`
+        # TODO: Determine if the other cases need to be fixed as well
+        for a in ann.__args__:
+            if a is None:
+                inner.append(NoneType.get())
+            maybe_type = try_ann_to_type(a, loc)
+            msg = "Unsupported annotation {} could not be resolved because {} could not be resolved."
+            assert maybe_type, msg.format(repr(ann), repr(maybe_type))
+            inner.append(maybe_type)
+        return UnionType(inner)    # type: ignore[arg-type]
     if torch.distributed.rpc.is_available() and is_rref(ann):
         return RRefType(try_ann_to_type(ann.__args__[0], loc))
     if is_future(ann):
@@ -390,6 +406,8 @@ def ann_to_type(ann, loc):
     'is_list',
     'Dict',
     'is_dict',
+    'is_optional',
+    'is_union',
     'TensorType',
     'TupleType',
     'FloatType',
diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py
index 0928106f3ba49..6053ee7ee7f63 100644
--- a/torch/jit/frontend.py
+++ b/torch/jit/frontend.py
@@ -452,6 +452,7 @@ def get_default_args(fn):
         return {}
 
     signature = inspect.signature(fn)
+
     return {
         k: v.default
         for k, v in signature.parameters.items()

From 0e3b45eaefbef29c36f0198195022a1e4088b3e0 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Fri, 3 Sep 2021 10:21:01 -0700
Subject: [PATCH 504/530] Fix logical typo in _compare_trilu_indices (#64468)

Summary:
I'm pretty sure that repeating the same call twice is pretty meaningless and intend was to call `tril`/`tril_indices` in first case and `triu`/`triu_indices` in another

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64468

Reviewed By: mruberry

Differential Revision: D30744978

Pulled By: malfet

fbshipit-source-id: 7cd36789a7ebf1cc263fb2d875e479c05e7588a4
---
 torch/testing/_internal/common_methods_invocations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 221cb29ec5eb6..f678f2258574f 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -9464,8 +9464,8 @@ def _compare_trilu_indices(
         # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
         self.assertEqualIgnoreType(
             torch.ones(row, col, device='cpu')
-                 .tril(offset).nonzero().to(dtype).transpose(0, 1),
-            torch.tril_indices(row, col, offset, dtype=dtype, device=device))
+                 .triu(offset).nonzero().to(dtype).transpose(0, 1),
+            torch.triu_indices(row, col, offset, dtype=dtype, device=device))
 
 
 def _compare_large_trilu_indices(

From e4ff14ad5955f7c4d052aa44069c77654e8b5f2e Mon Sep 17 00:00:00 2001
From: Michael Carilli <mcarilli@gmail.com>
Date: Fri, 3 Sep 2021 13:21:23 -0700
Subject: [PATCH 505/530] [CUDA graphs] Error if attempting to capture
 uncapturable nccl (#64440)

Summary:
NCCL < 2.9.6 is not capturable. Attempting to capture it can cause nasty behavior (for example, ive seen capture succeed, but replay silently hang). Pytorch should preempt this with a friendlier error.

cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse agolynski SciPioneer H-Huang mrzzd cbalioglu gcramer23

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64440

Reviewed By: mruberry

Differential Revision: D30733884

Pulled By: ngimel

fbshipit-source-id: 5f2df3cf5cc0e5e68f49bf22a80d9f58064dc7ec
---
 torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 911963b76cd7c..9773b350e2cd7 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -11,6 +11,7 @@
 #include <THC/THC.h>
 
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/irange.h>
 #include <c10/util/Logging.h>
@@ -189,6 +190,17 @@ std::string getExceptionMsgFromExceptionPtr(
   }
 }
 
+inline void errorIfCapturingNonCapturableNCCL() {
+  auto status = c10::cuda::currentStreamCaptureStatusMayInitCtx();
+  // parentheses avoid some compiler warnings
+  static const uint64_t min_version = (((uint64_t)2) << 32) + (((uint64_t)9) << 16) + ((uint64_t)6);
+  static const uint64_t cur_version = torch::cuda::nccl::version();
+  if (cur_version < min_version) {
+    TORCH_CHECK(status == c10::cuda::CaptureStatus::None,
+                "Capturing NCCL collectives is only allowed with NCCL >= 2.9.6");
+  }
+}
+
 } // namespace
 
 const int64_t ProcessGroupNCCL::kWatchdogThreadSleepMillis = 10000;
@@ -1079,6 +1091,8 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupNCCL::collective(
     OpType opType,
     const char* profilingTitle) {
 
+  errorIfCapturingNonCapturableNCCL();
+
   // Bump collective counter
   if (sequenceNum_) {
     sequenceNum_->increment();

From a91a278d60dcb7c65e2be5c5bd63429bf5df064e Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Fri, 3 Sep 2021 18:48:41 -0700
Subject: [PATCH 506/530] Fix `copy_transpose_valid` condition for
 `copy_same_type_transpose_` (#64425)

Summary:
Thanks to ngimel for the hint where the problem might be (https://github.com/pytorch/pytorch/issues/64358#issuecomment-910868849)!

I added a test that fails on master to verify the fix. The shape `(60, 60)` was chosen because of `MIN_SZ = 60 * 60` in `copy_transpose_valid`.

Fixes https://github.com/pytorch/pytorch/issues/64358

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64425

Reviewed By: mruberry

Differential Revision: D30752725

Pulled By: ngimel

fbshipit-source-id: f40370ea8365c94e30f8e8a3dcab5f3b3462464a
---
 aten/src/ATen/native/Copy.cpp | 4 ++++
 test/test_torch.py            | 8 ++++++++
 2 files changed, 12 insertions(+)

diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp
index 7fa952d020ef9..6dc1fc7af5e5c 100644
--- a/aten/src/ATen/native/Copy.cpp
+++ b/aten/src/ATen/native/Copy.cpp
@@ -28,6 +28,7 @@ bool copy_transpose_valid(const Tensor& self, const Tensor& src) {
   return self.is_contiguous() && src.numel() != 0 && src.dim() == 2 &&
       src.stride(0) == 1 && src.stride(1) == src.size(0) &&
       self.scalar_type() == src.scalar_type() &&
+      self.sizes().equals(src.sizes()) &&
       self.numel() >= MIN_SZ;
 }
 
@@ -45,6 +46,9 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) {
   }
   Tensor buf = empty({BLOCK_SZ, BLOCK_SZ}, self.options());
 
+  // The code below is implemented with the assumption that sizes are equal
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.sizes().equals(src.sizes()));
+
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "copy_", [&] {
     scalar_t* sp = src.data_ptr<scalar_t>();
     scalar_t* rp = self.data_ptr<scalar_t>();
diff --git a/test/test_torch.py b/test/test_torch.py
index 2899f2ef4c3b2..ae75ee8d66044 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -324,6 +324,14 @@ def test_copy_transpose(self):
             self.assertEqual(y[:, 0], range(100))
             self.assertEqual(y[:, 40], range(4000, 4100))
 
+        # Verifies the bugfix for https://github.com/pytorch/pytorch/issues/64358
+        def test_copy_transpose_2d_broadcast(self):
+            # The shape (60, 60) is chosen because of
+            # `MIN_SZ = 60 * 60` in `copy_transpose_valid` from aten/src/ATen/native/Copy.cpp
+            A = torch.randn(60, 60)
+            A.copy_(torch.tensor([[1.]]))
+            self.assertEqual(A, torch.ones(60, 60))
+
         def test_device(self):
             cpu = torch.device('cpu')
             self.assertEqual('cpu', str(cpu))

From 604e885925af78106e12a3ffd77687da5891761d Mon Sep 17 00:00:00 2001
From: Facebook Community Bot <facebook-github-bot@users.noreply.github.com>
Date: Sat, 4 Sep 2021 00:43:25 -0700
Subject: [PATCH 507/530] Automated submodule update: FBGEMM (#64338)

Summary:
This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).

New submodule commit: https://github.com/pytorch/FBGEMM/commit/9ccb2714a93e8324119676f6b3dc1c26eef0a703

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64338

Test Plan: Ensure that CI jobs succeed on GitHub before landing.

Reviewed By: jspark1105

Differential Revision: D30690319

fbshipit-source-id: 884d1f950cd1f7d2a77b79affb9215f285d5d0da
---
 third_party/fbgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/fbgemm b/third_party/fbgemm
index e6f80ee6570bb..7b49986d74a66 160000
--- a/third_party/fbgemm
+++ b/third_party/fbgemm
@@ -1 +1 @@
-Subproject commit e6f80ee6570bb8a7ed15a5ad0d496fdfb8927470
+Subproject commit 7b49986d74a6666fa6913bd9b461ebebb2cad476

From 6cac7ca98054feb299c2d68994809b547f3a3c2e Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Sat, 4 Sep 2021 12:37:09 -0700
Subject: [PATCH 508/530] Ensure num_threads is initialized in get_num_threads
 (#64486)

Summary:
Possible source of the recent layernorm CI failures. `lazy_init_num_threads` appears at the top of `parallel_for` and can change the number of threads set. So, we need to ensure `num_threads` is initialized during `get_num_threads` calls as well. It's already done this way for OpenMP, but is missing from other parallel backends.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64486

Reviewed By: mruberry

Differential Revision: D30752615

Pulled By: ngimel

fbshipit-source-id: 085873ce312edbee1254c0aaae30dec7fcfe2c57
---
 aten/src/ATen/ParallelNative.cpp           | 1 +
 aten/src/ATen/ParallelNativeTBB.cpp        | 1 +
 aten/src/ATen/ParallelThreadPoolNative.cpp | 1 +
 3 files changed, 3 insertions(+)

diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp
index 17b4b20aa9bd0..565c979e35e16 100644
--- a/aten/src/ATen/ParallelNative.cpp
+++ b/aten/src/ATen/ParallelNative.cpp
@@ -222,6 +222,7 @@ void set_num_threads(int nthreads) {
 }
 
 int get_num_threads() {
+  at::internal::lazy_init_num_threads();
 #ifndef C10_MOBILE
   // not initializing pool unnecessarily,
   // because pool cannot be resized after initialization
diff --git a/aten/src/ATen/ParallelNativeTBB.cpp b/aten/src/ATen/ParallelNativeTBB.cpp
index 15040498edc5c..c38dcb64f81bd 100644
--- a/aten/src/ATen/ParallelNativeTBB.cpp
+++ b/aten/src/ATen/ParallelNativeTBB.cpp
@@ -66,6 +66,7 @@ void set_num_threads(int nthreads) {
 }
 
 int get_num_threads() {
+  at::internal::lazy_init_num_threads();
   return tbb::global_control::active_value(
       tbb::global_control::max_allowed_parallelism);
 }
diff --git a/aten/src/ATen/ParallelThreadPoolNative.cpp b/aten/src/ATen/ParallelThreadPoolNative.cpp
index 2670c7bd08d1b..cc5821d494a25 100644
--- a/aten/src/ATen/ParallelThreadPoolNative.cpp
+++ b/aten/src/ATen/ParallelThreadPoolNative.cpp
@@ -57,6 +57,7 @@ void set_num_interop_threads(int nthreads) {
 }
 
 int get_num_interop_threads() {
+  at::internal::lazy_init_num_threads();
   int nthreads = num_interop_threads.load();
   if (nthreads > 0) {
     return nthreads;

From 18b2751ea143374adbb690889427e06a9334da05 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Sat, 4 Sep 2021 20:29:44 -0700
Subject: [PATCH 509/530] [nnc] Make our exceptions c10::Errors, get C++
 stacktraces (#64332)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64332

With this diff, if a compiler bug occurs (unlikely, I know!) we'll be able to get a c++ stacktrace leading to the exception, rather than just a terse message.  E.g.,
```
RuntimeError: UNSUPPORTED DTYPE
Exception raised from compilation_error at ../torch/csrc/jit/tensorexpr/exceptions.h:32 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x6b (0x7f966659b2eb in /fsx/users/bertrand/c\
onda/envs/pytorch/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x376f099 (0x7f966a195099 in /fsx/users/bertrand/conda/envs/pytorch/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0x3763bf5 (0x7f966a189bf5 in /fsx/users/bertrand/conda/envs/pytorch/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #3: torch::jit::tensorexpr::CudaCodeGen::Initialize() + 0xdd8 (0x7f966a193368 in /fsx/users/bertrand/conda/envs/pytorch/lib/python3.8/site-packages/torch/lib/libtorch_cuda\
.so)
```

Test Plan: Imported from OSS

Reviewed By: huiguoo

Differential Revision: D30745610

Pulled By: bertmaher

fbshipit-source-id: a1cfaa7364ef4120de834e9cbe57ced1d082ab4e
---
 torch/csrc/jit/tensorexpr/exceptions.h | 64 +++++++++++++++-----------
 torch/csrc/jit/tensorexpr/loopnest.cpp |  4 +-
 2 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/torch/csrc/jit/tensorexpr/exceptions.h b/torch/csrc/jit/tensorexpr/exceptions.h
index 7194dfe166aa8..35ba8a34e88e2 100644
--- a/torch/csrc/jit/tensorexpr/exceptions.h
+++ b/torch/csrc/jit/tensorexpr/exceptions.h
@@ -26,66 +26,78 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-class unsupported_dtype : public std::runtime_error {
+TORCH_API std::string buildErrorMessage(const std::string& s);
+
+class compilation_error : public c10::Error {
  public:
-  explicit unsupported_dtype() : std::runtime_error("UNSUPPORTED DTYPE") {}
+  explicit compilation_error(const std::string& err)
+      : c10::Error(
+            {
+                __func__,
+                __FILE__,
+                static_cast<uint32_t>(__LINE__),
+            },
+            buildErrorMessage(err)) {}
+};
+
+class unsupported_dtype : public compilation_error {
+ public:
+  explicit unsupported_dtype() : compilation_error("UNSUPPORTED DTYPE") {}
   explicit unsupported_dtype(const std::string& err)
-      : std::runtime_error("UNSUPPORTED DTYPE: " + err) {}
+      : compilation_error("UNSUPPORTED DTYPE: " + err) {}
 };
 
-class out_of_range_index : public std::runtime_error {
+class out_of_range_index : public compilation_error {
  public:
-  explicit out_of_range_index() : std::runtime_error("OUT OF RANGE INDEX") {}
+  explicit out_of_range_index() : compilation_error("OUT OF RANGE INDEX") {}
   explicit out_of_range_index(const std::string& err)
-      : std::runtime_error("OUT OF RANGE INDEX: " + err) {}
+      : compilation_error("OUT OF RANGE INDEX: " + err) {}
 };
 
-class unimplemented_lowering : public std::runtime_error {
+class unimplemented_lowering : public compilation_error {
  public:
   explicit unimplemented_lowering()
-      : std::runtime_error("UNIMPLEMENTED LOWERING") {}
+      : compilation_error("UNIMPLEMENTED LOWERING") {}
   explicit unimplemented_lowering(ExprPtr expr)
-      : std::runtime_error("UNIMPLEMENTED LOWERING: " + std::to_string(expr)) {}
+      : compilation_error("UNIMPLEMENTED LOWERING: " + std::to_string(expr)) {}
   explicit unimplemented_lowering(StmtPtr stmt)
-      : std::runtime_error("UNIMPLEMENTED LOWERING: " + std::to_string(stmt)) {}
+      : compilation_error("UNIMPLEMENTED LOWERING: " + std::to_string(stmt)) {}
 };
 
-class malformed_input : public std::runtime_error {
+class malformed_input : public compilation_error {
  public:
-  explicit malformed_input() : std::runtime_error("MALFORMED INPUT") {}
+  explicit malformed_input() : compilation_error("MALFORMED INPUT") {}
   explicit malformed_input(const std::string& err)
-      : std::runtime_error("MALFORMED INPUT: " + err) {}
+      : compilation_error("MALFORMED INPUT: " + err) {}
   explicit malformed_input(ExprPtr expr)
-      : std::runtime_error("MALFORMED INPUT: " + std::to_string(expr)) {}
+      : compilation_error("MALFORMED INPUT: " + std::to_string(expr)) {}
   explicit malformed_input(const std::string& err, ExprPtr expr)
-      : std::runtime_error(
+      : compilation_error(
             "MALFORMED INPUT: " + err + " - " + std::to_string(expr)) {}
   explicit malformed_input(StmtPtr stmt)
-      : std::runtime_error("MALFORMED INPUT: " + std::to_string(stmt)) {}
+      : compilation_error("MALFORMED INPUT: " + std::to_string(stmt)) {}
   explicit malformed_input(const std::string& err, StmtPtr stmt)
-      : std::runtime_error(
+      : compilation_error(
             "MALFORMED INPUT: " + err + " - " + std::to_string(stmt)) {}
 };
 
-class malformed_ir : public std::runtime_error {
+class malformed_ir : public compilation_error {
  public:
-  explicit malformed_ir() : std::runtime_error("MALFORMED IR") {}
+  explicit malformed_ir() : compilation_error("MALFORMED IR") {}
   explicit malformed_ir(const std::string& err)
-      : std::runtime_error("MALFORMED IR: " + err) {}
+      : compilation_error("MALFORMED IR: " + err) {}
   explicit malformed_ir(ExprPtr expr)
-      : std::runtime_error("MALFORMED IR: " + std::to_string(expr)) {}
+      : compilation_error("MALFORMED IR: " + std::to_string(expr)) {}
   explicit malformed_ir(const std::string& err, ExprPtr expr)
-      : std::runtime_error(
+      : compilation_error(
             "MALFORMED IR: " + err + " - " + std::to_string(expr)) {}
   explicit malformed_ir(StmtPtr stmt)
-      : std::runtime_error("MALFORMED IR: " + std::to_string(stmt)) {}
+      : compilation_error("MALFORMED IR: " + std::to_string(stmt)) {}
   explicit malformed_ir(const std::string& err, StmtPtr stmt)
-      : std::runtime_error(
+      : compilation_error(
             "MALFORMED IR: " + err + " - " + std::to_string(stmt)) {}
 };
 
-TORCH_API std::string buildErrorMessage(const std::string& s);
-
 } // namespace tensorexpr
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index e67d094065d1a..570fe338093b9 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -476,11 +476,13 @@ bool LoopNest::vectorize(ForPtr f) {
     normalize(to<For>(new_f));
     new_f = FlattenIndexes(new_f);
     new_f = v.vectorize(to<For>(new_f));
-  } catch (std::runtime_error& e) {
+  } catch (compilation_error& e) {
     // We clone f before vectorizing. So, any partial vectorization will
     // have modified the clone. In case of an exception, we can continue
     // using f.
     new_f = f;
+  } catch (std::runtime_error& e) {
+    new_f = f;
   }
 
   if (new_f != f) {

From 008bf6689b7d298e3a788dc5576c9b691c5f25a7 Mon Sep 17 00:00:00 2001
From: Chris Cai <chriscai@fb.com>
Date: Sat, 4 Sep 2021 20:54:29 -0700
Subject: [PATCH 510/530] Back out "D30740897 Add fusion enabled apis" (#64500)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64500

D30740897 (https://github.com/pytorch/pytorch/commit/39aeb3bf63f61664bc6c4a929a80a660365c2a5e) broke caffe2/torch/fb/module_factory/optimizers/tests:test_full_sync_optimizer_needed_coverage (https://fburl.com/test/mb46jxon) and blocked training_platform_unit_tests

{F660271297}

multsect results confirms

```
multisect --config FBCODE_TEST bisect 844424966128796 --workers 16 revisions --begin 09629edc --end fc86b434
D30740897 (https://github.com/pytorch/pytorch/commit/39aeb3bf63f61664bc6c4a929a80a660365c2a5e)

````

{F660271232}

Test Plan:
```
buck test mode/opt //caffe2/torch/fb/module_factory/optimizers/tests:test_full_sync_optimizer_needed_coverage

Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/4785074671474181
    ✓ Pass: caffe2/torch/fb/module_factory/optimizers/tests:test_full_sync_optimizer_needed_coverage - main (3.729)
Summary
  Pass: 1

```

Differential Revision: D30753916

fbshipit-source-id: 302fd4113ef1f3069846be03edc2300d82b66719
---
 docs/source/jit.rst   |  2 --
 torch/jit/__init__.py | 29 -----------------------------
 2 files changed, 31 deletions(-)

diff --git a/docs/source/jit.rst b/docs/source/jit.rst
index 97a0615812830..8a80b6471e1a7 100644
--- a/docs/source/jit.rst
+++ b/docs/source/jit.rst
@@ -61,8 +61,6 @@ Creating TorchScript Code
     ScriptFunction
     freeze
     optimize_for_inference
-    enable_fusion
-    fusion_enabled
     save
     load
     ignore
diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py
index f804d3c72ce8e..f7fa58bd36434 100644
--- a/torch/jit/__init__.py
+++ b/torch/jit/__init__.py
@@ -4,7 +4,6 @@
 from typing import Iterator
 
 from torch.utils import set_module
-from typing import Optional
 
 # These are imported so users can access them from the `torch.jit` module
 from torch._jit_internal import (
@@ -198,34 +197,6 @@ def _hide_source_ranges() -> Iterator[None]:
     finally:
         torch._C.Graph.set_global_print_source_ranges(old_enable_source_ranges)  # type: ignore[attr-defined]
 
-def enable_fusion(enabled: bool, device: Optional[str] = None):
-    """
-    Enables or disables JIT fusion based on the parameter `enabled`.
-
-    If `device` is None, both CPU and GPU fusion will be turned on or off.
-    Otherwise, device must be equal to "cpu" or "cuda", and will turn on or off
-    CPU and GPU fusion respectively.
-    """
-
-    if device is None:
-        torch._C._jit_override_can_fuse_on_cpu(enabled)
-        torch._C._jit_override_can_fuse_on_gpu(enabled)
-    else:
-        assert device in ["cpu", "cuda"], "Device-specific fusion must be equal to 'cpu' or 'cuda' if not None"
-        if device == "cuda":
-            torch._C._jit_override_can_fuse_on_gpu(enabled)
-        else:
-            torch._C._jit_override_can_fuse_on_cpu(enabled)
-
-def fusion_enabled(device: str):
-    """
-    Returns whether JIT fusion is enabled for "cpu" or "cuda"
-    """
-    assert device == "cpu" or device == "cuda"
-    if device == "cpu":
-        return torch._C._jit_can_fuse_on_cpu()
-    else:
-        return torch._C._jit_can_fuse_on_gpu()
 
 if not torch._C._jit_init():
     raise RuntimeError("JIT initialization failed")

From 1901c675e1cf9626a25d96bcc64b4952c64d56a2 Mon Sep 17 00:00:00 2001
From: Mike Ruberry <mruberry@fb.com>
Date: Sun, 5 Sep 2021 02:23:31 -0700
Subject: [PATCH 511/530] Back out "nn.functional.linear OpInfo" (#64517)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64517

Original commit changeset: ca41dbd98176

Test Plan: PyTorch CI

Reviewed By: ngimel

Differential Revision: D30758201

fbshipit-source-id: 2d3274293d340373b8af86083336607818019619
---
 .../_internal/common_methods_invocations.py   | 35 -------------------
 1 file changed, 35 deletions(-)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index f678f2258574f..5e009ee7f487e 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -2648,30 +2648,6 @@ def sample_inputs_hardswish(self, device, dtype, requires_grad):
                requires_grad=requires_grad, low=-5, high=5)) for _ in range(1, N)]
     return tensors
 
-def sample_inputs_linear(self, device, dtype, requires_grad):
-    features_options = [[3, 4], [128, 128]]
-    batch_options: List[List[int]] = [
-        [],  # no batch
-        [0],
-        [64],
-        [5, 7],
-    ]
-    create_tensor = partial(make_tensor, device=device, dtype=dtype,
-                            requires_grad=requires_grad, low=-2, high=2)
-
-    sample_inputs = []
-    for has_bias, (in_feat, out_feat), batch_shape in \
-            itertools.product([True, False], features_options, batch_options):
-        input_tensor = create_tensor(batch_shape + [in_feat])
-        weight = create_tensor([out_feat, in_feat])
-        if not has_bias:
-            sample_inputs.append(SampleInput(input_tensor, args=(weight,)))
-            continue
-
-        bias = create_tensor([out_feat])
-        sample_inputs.append(SampleInput(input_tensor, args=(weight, bias)))
-    return sample_inputs
-
 def sample_inputs_interpolate(mode, self, device, dtype, requires_grad):
     N, C = 2, 3
     D = 4
@@ -7545,17 +7521,6 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            dtypesIfCPU=floating_types_and(torch.int64),
            dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
            sample_inputs_func=sample_inputs_avgpool2d),
-    OpInfo('nn.functional.linear',
-           aten_name='linear',
-           supports_autograd=True,
-           sample_inputs_func=sample_inputs_linear,
-           dtypesIfCPU=all_types_and_complex_and(torch.half, torch.bfloat16),
-           dtypesIfROCM=floating_and_complex_types_and(torch.float16, torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []),
-           backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
-                                                                *[torch.bfloat16] if CUDA11OrLater else []),
-           supports_forward_ad=True,
-           supports_out=False),
     UnaryUfuncInfo(
         'nn.functional.logsigmoid',
         aten_name="log_sigmoid",

From 49fe829caea178eee7dcb75a923ff29291117827 Mon Sep 17 00:00:00 2001
From: Sangbaek Park <sangbaek@fb.com>
Date: Sun, 5 Sep 2021 12:52:46 -0700
Subject: [PATCH 512/530] [Vulkan] Code Quality: Remove duplicate code for
 hardshrink and leaky_relu functions (#64405)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64405

Code quality improvement: removed duplicate code for hardshrink and leaky_relu functions.
ghstack-source-id: 137319378

Test Plan:
```buck build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 //xplat/caffe2:pt_vulkan_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_api_test
adb shell "/data/local/tmp/vulkan_api_test"```

Reviewed By: SS-JIA

Differential Revision: D30690251

fbshipit-source-id: 5729d1f32946e42f41df77756a8313f297dd822f
---
 aten/src/ATen/native/vulkan/ops/Clamp.cpp | 149 +++++-----------------
 1 file changed, 30 insertions(+), 119 deletions(-)

diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp
index 7982b0eda0d7a..a6e65607fb07c 100644
--- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp
@@ -207,7 +207,7 @@ Tensor& activation_(
 
   TORCH_CHECK(
       self.is_vulkan(),
-      "Vulkan: In-place clamp is only supported on Vulkan tensors.");
+      "Vulkan: In-place operator is only supported on Vulkan tensors.");
 
   vTensor& v_self = convert(self);
 
@@ -289,9 +289,10 @@ Tensor& hardsigmoid_(Tensor& self) {
   return ops::activation_(self, VK_KERNEL(hardsigmoid_));
 }
 
-Tensor hardshrink(
+Tensor activation_scalar(
     const Tensor& self_arg,
-    const Scalar& lambd) {
+    const Scalar& scalar_arg,
+    const api::Shader::Descriptor& shader_descriptor) {
   api::Context* const context = api::context();
 
   const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan();
@@ -310,11 +311,11 @@ Tensor hardshrink(
       const struct Block final {
         uvec3 extents;
         uint32_t _;
-        float lambd;
+        float scalar_value;
       } block {
         v_output.extents(),
         0u,
-        lambd.to<float>(),
+        scalar_arg.to<float>(),
       };
 
       context->dispatch(
@@ -324,7 +325,7 @@ Tensor hardshrink(
             VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
             VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
           },
-          VK_KERNEL(hardshrink),
+          shader_descriptor,
           v_output.extents(),
           context->gpu().adapter->local_work_group_size(),
           // Write-only access bypasses synchronization but inserts appropriate
@@ -351,14 +352,15 @@ Tensor hardshrink(
   return convert(v_output);
 }
 
-Tensor& hardshrink_(
+Tensor& activation_scalar_(
     Tensor& self,
-    const Scalar& lambd) {
+    const Scalar& scalar_arg,
+    const api::Shader::Descriptor& shader_descriptor) {
   api::Context* const context = api::context();
 
   TORCH_CHECK(
       self.is_vulkan(),
-      "Vulkan: In-place hardshrink is only supported on Vulkan tensors.");
+      "Vulkan: In-place operator is only supported on Vulkan tensors.");
 
   vTensor& v_self = convert(self);
 
@@ -369,11 +371,11 @@ Tensor& hardshrink_(
       const struct Block final {
         uvec3 extents;
         uint32_t _;
-        float lambd;
+        float scalar_value;
       } block {
         v_self.extents(),
         0u,
-        lambd.to<float>(),
+        scalar_arg.to<float>(),
       };
 
       context->dispatch(
@@ -382,7 +384,7 @@ Tensor& hardshrink_(
             VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
             VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
           },
-          VK_KERNEL(hardshrink_),
+          shader_descriptor,
           v_self.extents(),
           context->gpu().adapter->local_work_group_size(),
           // Read-Write access triggers an async synchronization if necessory
@@ -404,119 +406,28 @@ Tensor& hardshrink_(
   return self;
 }
 
-Tensor leaky_relu(
+Tensor hardshrink(
     const Tensor& self_arg,
-    const Scalar& negative_slope) {
-  api::Context* const context = api::context();
-
-  const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan();
-  const vTensor& v_self = convert(self);
-
-  vTensor v_output{
-    context,
-    v_self.sizes(),
-    v_self.options(),
-  };
-
-  api::Command::Pool& command_pool = context->command().pool;
-  api::Command::Buffer& command_buffer = command_pool.stream();
-  {
-    if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
-      const struct Block final {
-        uvec3 extents;
-        uint32_t _;
-        float negative_slope;
-      } block {
-        v_output.extents(),
-        0u,
-        negative_slope.to<float>(),
-      };
+    const Scalar& lambd) {
+  return ops::activation_scalar(self_arg, lambd, VK_KERNEL(hardshrink));
+}
 
-      context->dispatch(
-          command_buffer,
-          {
-            VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-          },
-          VK_KERNEL(leaky_relu),
-          v_output.extents(),
-          context->gpu().adapter->local_work_group_size(),
-          // Write-only access bypasses synchronization but inserts appropriate
-          // barriers if necessary.
-          v_output.image(
-              command_buffer,
-              vTensor::Stage::Compute,
-              vTensor::Access::Write),
-          // Read-only access is implied on const tensors and triggers an async
-          // synchronization if necessary.
-          v_self.image(
-              command_buffer,
-              vTensor::Stage::Compute),
-          // Object lifetime is managed by the resource pool.
-          // It is OK not to keep track of the handle.
-          context->resource().pool.uniform(block).object);
-    }
-    else {
-      TORCH_CHECK(false, "Not implemented!");
-    }
-  }
-  command_pool.submit(context->gpu().queue, command_buffer);
+Tensor& hardshrink_(
+    Tensor& self,
+    const Scalar& lambd) {
+  return ops::activation_scalar_(self, lambd, VK_KERNEL(hardshrink_));
+}
 
-  return convert(v_output);
+Tensor leaky_relu(
+    const Tensor& self_arg,
+    const Scalar& negative_slope) {
+  return ops::activation_scalar(self_arg, negative_slope, VK_KERNEL(leaky_relu));
 }
 
 Tensor& leaky_relu_(
     Tensor& self,
     const Scalar& negative_slope) {
-  api::Context* const context = api::context();
-
-  TORCH_CHECK(
-      self.is_vulkan(),
-      "Vulkan: In-place leaky relu is only supported on Vulkan tensors.");
-
-  vTensor& v_self = convert(self);
-
-  api::Command::Pool& command_pool = context->command().pool;
-  api::Command::Buffer& command_buffer = command_pool.stream();
-  {
-    if C10_LIKELY(v_self.has_image()) {
-      const struct Block final {
-        uvec3 extents;
-        uint32_t _;
-        float negative_slope;
-      } block {
-        v_self.extents(),
-        0u,
-        negative_slope.to<float>(),
-      };
-
-      context->dispatch(
-          command_buffer,
-          {
-            VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-          },
-          VK_KERNEL(leaky_relu_),
-          v_self.extents(),
-          context->gpu().adapter->local_work_group_size(),
-          // Read-Write access triggers an async synchronization if necessory
-          // and inserts appropriate barriers if hazards are detected.
-          v_self.image(
-              command_buffer,
-              vTensor::Stage::Compute,
-              vTensor::Access::Read | vTensor::Access::Write),
-          // Object lifetime is managed by the resource pool.
-          // It is OK not to keep track of the handle.
-          context->resource().pool.uniform(block).object);
-    }
-    else {
-      TORCH_CHECK(false, "Not implemented!");
-    }
-  }
-  command_pool.submit(context->gpu().queue, command_buffer);
-
-  return self;
+  return ops::activation_scalar_(self, negative_slope, VK_KERNEL(leaky_relu_));
 }
 
 Tensor sigmoid(const Tensor& self) {
@@ -542,8 +453,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
   m.impl(TORCH_SELECTIVE_NAME("aten::clamp_"), TORCH_FN(clamp_));
   m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid"), hardsigmoid);
   m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), hardsigmoid_);
-  m.impl(TORCH_SELECTIVE_NAME("aten::hardshrink"), TORCH_FN(hardshrink));
-  m.impl(TORCH_SELECTIVE_NAME("aten::hardshrink_"), TORCH_FN(hardshrink_));
+  m.impl(TORCH_SELECTIVE_NAME("aten::hardshrink"), hardshrink);
+  m.impl(TORCH_SELECTIVE_NAME("aten::hardshrink_"), hardshrink_);
   m.impl(TORCH_SELECTIVE_NAME("aten::hardswish"), hardswish);
   m.impl(TORCH_SELECTIVE_NAME("aten::hardswish_"), hardswish_);
   m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh"), hardtanh);

From bcc7e82371082a284ad9ee423cad192bb024e1a1 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Sun, 5 Sep 2021 16:06:09 -0700
Subject: [PATCH 513/530] Revert D30745610: [nnc] Make our exceptions
 c10::Errors, get C++ stacktraces

Test Plan: revert-hammer

Differential Revision:
D30745610 (https://github.com/pytorch/pytorch/commit/18b2751ea143374adbb690889427e06a9334da05)

Original commit changeset: a1cfaa7364ef

fbshipit-source-id: 9b716053b96a65745240ddef1c456c44d5d09671
---
 torch/csrc/jit/tensorexpr/exceptions.h | 64 +++++++++++---------------
 torch/csrc/jit/tensorexpr/loopnest.cpp |  4 +-
 2 files changed, 27 insertions(+), 41 deletions(-)

diff --git a/torch/csrc/jit/tensorexpr/exceptions.h b/torch/csrc/jit/tensorexpr/exceptions.h
index 35ba8a34e88e2..7194dfe166aa8 100644
--- a/torch/csrc/jit/tensorexpr/exceptions.h
+++ b/torch/csrc/jit/tensorexpr/exceptions.h
@@ -26,78 +26,66 @@ namespace torch {
 namespace jit {
 namespace tensorexpr {
 
-TORCH_API std::string buildErrorMessage(const std::string& s);
-
-class compilation_error : public c10::Error {
+class unsupported_dtype : public std::runtime_error {
  public:
-  explicit compilation_error(const std::string& err)
-      : c10::Error(
-            {
-                __func__,
-                __FILE__,
-                static_cast<uint32_t>(__LINE__),
-            },
-            buildErrorMessage(err)) {}
-};
-
-class unsupported_dtype : public compilation_error {
- public:
-  explicit unsupported_dtype() : compilation_error("UNSUPPORTED DTYPE") {}
+  explicit unsupported_dtype() : std::runtime_error("UNSUPPORTED DTYPE") {}
   explicit unsupported_dtype(const std::string& err)
-      : compilation_error("UNSUPPORTED DTYPE: " + err) {}
+      : std::runtime_error("UNSUPPORTED DTYPE: " + err) {}
 };
 
-class out_of_range_index : public compilation_error {
+class out_of_range_index : public std::runtime_error {
  public:
-  explicit out_of_range_index() : compilation_error("OUT OF RANGE INDEX") {}
+  explicit out_of_range_index() : std::runtime_error("OUT OF RANGE INDEX") {}
   explicit out_of_range_index(const std::string& err)
-      : compilation_error("OUT OF RANGE INDEX: " + err) {}
+      : std::runtime_error("OUT OF RANGE INDEX: " + err) {}
 };
 
-class unimplemented_lowering : public compilation_error {
+class unimplemented_lowering : public std::runtime_error {
  public:
   explicit unimplemented_lowering()
-      : compilation_error("UNIMPLEMENTED LOWERING") {}
+      : std::runtime_error("UNIMPLEMENTED LOWERING") {}
   explicit unimplemented_lowering(ExprPtr expr)
-      : compilation_error("UNIMPLEMENTED LOWERING: " + std::to_string(expr)) {}
+      : std::runtime_error("UNIMPLEMENTED LOWERING: " + std::to_string(expr)) {}
   explicit unimplemented_lowering(StmtPtr stmt)
-      : compilation_error("UNIMPLEMENTED LOWERING: " + std::to_string(stmt)) {}
+      : std::runtime_error("UNIMPLEMENTED LOWERING: " + std::to_string(stmt)) {}
 };
 
-class malformed_input : public compilation_error {
+class malformed_input : public std::runtime_error {
  public:
-  explicit malformed_input() : compilation_error("MALFORMED INPUT") {}
+  explicit malformed_input() : std::runtime_error("MALFORMED INPUT") {}
   explicit malformed_input(const std::string& err)
-      : compilation_error("MALFORMED INPUT: " + err) {}
+      : std::runtime_error("MALFORMED INPUT: " + err) {}
   explicit malformed_input(ExprPtr expr)
-      : compilation_error("MALFORMED INPUT: " + std::to_string(expr)) {}
+      : std::runtime_error("MALFORMED INPUT: " + std::to_string(expr)) {}
   explicit malformed_input(const std::string& err, ExprPtr expr)
-      : compilation_error(
+      : std::runtime_error(
             "MALFORMED INPUT: " + err + " - " + std::to_string(expr)) {}
   explicit malformed_input(StmtPtr stmt)
-      : compilation_error("MALFORMED INPUT: " + std::to_string(stmt)) {}
+      : std::runtime_error("MALFORMED INPUT: " + std::to_string(stmt)) {}
   explicit malformed_input(const std::string& err, StmtPtr stmt)
-      : compilation_error(
+      : std::runtime_error(
             "MALFORMED INPUT: " + err + " - " + std::to_string(stmt)) {}
 };
 
-class malformed_ir : public compilation_error {
+class malformed_ir : public std::runtime_error {
  public:
-  explicit malformed_ir() : compilation_error("MALFORMED IR") {}
+  explicit malformed_ir() : std::runtime_error("MALFORMED IR") {}
   explicit malformed_ir(const std::string& err)
-      : compilation_error("MALFORMED IR: " + err) {}
+      : std::runtime_error("MALFORMED IR: " + err) {}
   explicit malformed_ir(ExprPtr expr)
-      : compilation_error("MALFORMED IR: " + std::to_string(expr)) {}
+      : std::runtime_error("MALFORMED IR: " + std::to_string(expr)) {}
   explicit malformed_ir(const std::string& err, ExprPtr expr)
-      : compilation_error(
+      : std::runtime_error(
             "MALFORMED IR: " + err + " - " + std::to_string(expr)) {}
   explicit malformed_ir(StmtPtr stmt)
-      : compilation_error("MALFORMED IR: " + std::to_string(stmt)) {}
+      : std::runtime_error("MALFORMED IR: " + std::to_string(stmt)) {}
   explicit malformed_ir(const std::string& err, StmtPtr stmt)
-      : compilation_error(
+      : std::runtime_error(
             "MALFORMED IR: " + err + " - " + std::to_string(stmt)) {}
 };
 
+TORCH_API std::string buildErrorMessage(const std::string& s);
+
 } // namespace tensorexpr
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index 570fe338093b9..e67d094065d1a 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -476,13 +476,11 @@ bool LoopNest::vectorize(ForPtr f) {
     normalize(to<For>(new_f));
     new_f = FlattenIndexes(new_f);
     new_f = v.vectorize(to<For>(new_f));
-  } catch (compilation_error& e) {
+  } catch (std::runtime_error& e) {
     // We clone f before vectorizing. So, any partial vectorization will
     // have modified the clone. In case of an exception, we can continue
     // using f.
     new_f = f;
-  } catch (std::runtime_error& e) {
-    new_f = f;
   }
 
   if (new_f != f) {

From 544c8e6a5d26efdf1cf679b313893fe119825930 Mon Sep 17 00:00:00 2001
From: Shen Xu <shenchenxu@fb.com>
Date: Sun, 5 Sep 2021 16:44:13 -0700
Subject: [PATCH 514/530] Mark functions in backend header as inline to
 suppress warning (#64098)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/64098

Reviewed By: kimishpatel, iseeyuan

Differential Revision: D30593104

fbshipit-source-id: 328196b9bc4a89a28ad89bede7e337107976c303
---
 torch/csrc/jit/backends/backend.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/jit/backends/backend.h b/torch/csrc/jit/backends/backend.h
index 941f27bfe2b11..5aae642fa5517 100644
--- a/torch/csrc/jit/backends/backend.h
+++ b/torch/csrc/jit/backends/backend.h
@@ -9,7 +9,7 @@ namespace torch {
 namespace jit {
 namespace {
 // NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration)
-c10::FunctionSchema getIsAvailableSchema() {
+inline c10::FunctionSchema getIsAvailableSchema() {
   c10::Argument self("self", c10::AnyType::get());
   c10::Argument available("available", c10::BoolType::get());
   c10::FunctionSchema preprocessor_schema(
@@ -23,7 +23,7 @@ c10::FunctionSchema getIsAvailableSchema() {
 constexpr static auto kBackendsNamespace = "__backends__";
 
 // NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration)
-c10::FunctionSchema getCompileSchema() {
+inline c10::FunctionSchema getCompileSchema() {
   c10::Argument self("self", c10::AnyType::get());
   c10::Argument mod("processed", c10::AnyType::get());
   auto any_dict_ty =
@@ -40,7 +40,7 @@ c10::FunctionSchema getCompileSchema() {
 }
 
 // NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration)
-c10::FunctionSchema getExecuteSchema() {
+inline c10::FunctionSchema getExecuteSchema() {
   auto any_list_ty = c10::ListType::create(c10::AnyType::get());
   c10::Argument self("self", c10::AnyType::get());
   c10::Argument handle("handle", c10::AnyType::get());

From 571a2becf337ae84275fa96300043762387058cf Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Mon, 6 Sep 2021 11:37:39 -0700
Subject: [PATCH 515/530] Move ParallelNative and PureTorch to GHA (#64452)

Summary:
Separate ParallelTBB move to https://github.com/pytorch/pytorch/pull/64193 as it requires some further investiagation

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64452

Reviewed By: seemethere, janeyx99

Differential Revision: D30738337

Pulled By: malfet

fbshipit-source-id: 81c46423e903058bd1a3e8553e8a10ce978eeefd
---
 .circleci/cimodel/data/pytorch_build_data.py  |   2 -
 .circleci/config.yml                          |  63 ---
 .github/generated-ciflow-ruleset.json         |   8 +-
 .github/scripts/generate_ci_workflows.py      |  46 +-
 ...rallelnative-linux-xenial-py3.6-gcc5.4.yml | 510 ++++++++++++++++++
 ...ed-puretorch-linux-xenial-py3.6-gcc5.4.yml | 275 ++++++++++
 .jenkins/pytorch/build.sh                     |   2 +-
 7 files changed, 827 insertions(+), 79 deletions(-)
 create mode 100644 .github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
 create mode 100644 .github/workflows/generated-puretorch-linux-xenial-py3.6-gcc5.4.yml

diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py
index dbe17bf4f15f5..46527c1168891 100644
--- a/.circleci/cimodel/data/pytorch_build_data.py
+++ b/.circleci/cimodel/data/pytorch_build_data.py
@@ -8,8 +8,6 @@
                 ("3.6", [
                     ("important", [X(True)]),
                     ("parallel_tbb", [X(True)]),
-                    ("parallel_native", [X(True)]),
-                    ("pure_torch", [X(True)]),
                 ]),
             ]),
             # TODO: bring back libtorch test
diff --git a/.circleci/config.yml b/.circleci/config.yml
index ffc67a14ec5ad..c57eb26c032e1 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7190,50 +7190,6 @@ workflows:
           build_environment: "pytorch-linux-pytorch_paralleltbb_linux_xenial_py3_6_gcc5_4_distributed-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
           resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
-          requires:
-            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
-          filters:
-            branches:
-              only:
-                - master
-                - /ci-all\/.*/
-                - /release\/.*/
-          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-      - pytorch_linux_test:
-          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test
-          requires:
-            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
-          filters:
-            branches:
-              only:
-                - master
-                - /ci-all\/.*/
-                - /release\/.*/
-          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-          resource_class: large
-      - pytorch_linux_test:
-          name: pytorch_linux_pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed_test
-          requires:
-            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
-          build_environment: "pytorch-linux-pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_distributed-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-          resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build
-          requires:
-            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
-          filters:
-            branches:
-              only:
-                - master
-                - /ci-all\/.*/
-                - /release\/.*/
-          build_environment: "pytorch-pure_torch-linux-xenial-py3.6-gcc5.4-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
       - pytorch_linux_build:
           name: pytorch_linux_xenial_py3_6_gcc7_build
           requires:
@@ -9324,25 +9280,6 @@ workflows:
           build_environment: "pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4-test"
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
           resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
-          requires:
-            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
-          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-      - pytorch_linux_test:
-          name: pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_test
-          requires:
-            - pytorch_parallelnative_linux_xenial_py3_6_gcc5_4_build
-          build_environment: "pytorch-parallelnative-linux-xenial-py3.6-gcc5.4-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
-          resource_class: large
-      - pytorch_linux_build:
-          name: pytorch_pure_torch_linux_xenial_py3_6_gcc5_4_build
-          requires:
-            - "docker-pytorch-linux-xenial-py3.6-gcc5.4"
-          build_environment: "pytorch-pure_torch-linux-xenial-py3.6-gcc5.4-build"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
       - pytorch_linux_build:
           name: pytorch_linux_xenial_py3_6_gcc7_build
           requires:
diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json
index d3ebad35a5303..8e05b532f86fc 100644
--- a/.github/generated-ciflow-ruleset.json
+++ b/.github/generated-ciflow-ruleset.json
@@ -11,9 +11,11 @@
       "linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
+      "parallelnative-linux-xenial-py3.6-gcc5.4",
       "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
       "periodic-linux-xenial-cuda11.1-py3.6-gcc7",
       "periodic-win-vs2019-cuda11.1-py3",
+      "puretorch-linux-xenial-py3.6-gcc5.4",
       "win-vs2019-cpu-py3",
       "win-vs2019-cuda10.1-py3",
       "win-vs2019-cuda11.3-py3"
@@ -29,6 +31,8 @@
       "linux-bionic-py3.8-gcc9-coverage",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
+      "parallelnative-linux-xenial-py3.6-gcc5.4",
+      "puretorch-linux-xenial-py3.6-gcc5.4",
       "win-vs2019-cpu-py3"
     ],
     "ciflow/cuda": [
@@ -67,8 +71,10 @@
       "linux-xenial-cuda11.3-py3.6-gcc7",
       "linux-xenial-py3.6-gcc5.4",
       "linux-xenial-py3.6-gcc7-bazel-test",
+      "parallelnative-linux-xenial-py3.6-gcc5.4",
       "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
-      "periodic-linux-xenial-cuda11.1-py3.6-gcc7"
+      "periodic-linux-xenial-cuda11.1-py3.6-gcc7",
+      "puretorch-linux-xenial-py3.6-gcc5.4"
     ],
     "ciflow/noarch": [
       "linux-bionic-py3.6-clang9"
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index 16100f72a527c..b5146114054a6 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -285,19 +285,41 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
     #     build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4",
     #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
     #     test_runner_type=LINUX_CPU_TEST_RUNNER,
+    #     on_pull_request=True,
+    #     ciflow_config=CIFlowConfig(
+    #         enabled=True,
+    #         trigger_action_only=True,
+    #         labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU},
+    #     ),
     # ),
-    # CIWorkflow(
-    #     arch="linux",
-    #     build_environment="parallelnative-linux-xenial-py3.6-gcc5.4",
-    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
-    #     test_runner_type=LINUX_CPU_TEST_RUNNER,
-    # ),
-    # CIWorkflow(
-    #     arch="linux",
-    #     build_environment="pure_torch-linux-xenial-py3.6-gcc5.4",
-    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
-    #     test_runner_type=LINUX_CPU_TEST_RUNNER,
-    # ),
+    CIWorkflow(
+        arch="linux",
+        build_environment="parallelnative-linux-xenial-py3.6-gcc5.4",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
+        test_runner_type=LINUX_CPU_TEST_RUNNER,
+        # This is a master only job despite on_pull_request is set to True
+        on_pull_request=True,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            trigger_action_only=True,
+            labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU},
+        ),
+    ),
+    # Build PyTorch with BUILD_CAFFE2=OFF
+    CIWorkflow(
+        arch="linux",
+        build_environment="puretorch-linux-xenial-py3.6-gcc5.4",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
+        test_runner_type=LINUX_CPU_TEST_RUNNER,
+        exclude_test=True,
+        # This is a master only job despite on_pull_request is set to True
+        on_pull_request=True,
+        ciflow_config=CIFlowConfig(
+            enabled=True,
+            trigger_action_only=True,
+            labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU},
+        ),
+    ),
     # CIWorkflow(
     #     arch="linux",
     #     build_environment="linux-xenial-py3.6-gcc7",
diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
new file mode 100644
index 0000000000000..1b352f6b8cd80
--- /dev/null
+++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
@@ -0,0 +1,510 @@
+# @generated DO NOT EDIT MANUALLY
+# Template is at:    .github/templates/linux_ci_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: parallelnative-linux-xenial-py3.6-gcc5.4
+
+on:
+  pull_request:
+    types: [unassigned]
+  push:
+    branches:
+      - master
+      - release/*
+  workflow_dispatch:
+
+env:
+  BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.6-gcc5.4
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
+  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
+  TORCH_CUDA_ARCH_LIST: 5.2
+  IN_CI: 1
+  # This is used for the phase of adding wheel tests only, will be removed once completed
+  IN_WHEEL_TEST: 1
+  # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
+  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
+
+concurrency:
+  group: parallelnative-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
+  calculate-docker-image:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: linux.2xlarge
+    needs: [ciflow_should_run]
+    env:
+      DOCKER_BUILDKIT: 1
+    timeout-minutes: 90
+    outputs:
+      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: false
+      - name: Calculate docker image tag
+        id: calculate-tag
+        run: |
+          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+          echo "::set-output name=docker_tag::${DOCKER_TAG}"
+          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
+      - name: Check if image should be built
+        id: check
+        env:
+          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
+          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
+        run: |
+          set -x
+          # Check if image already exists, if it does then skip building it
+          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
+            exit 0
+          fi
+          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
+            # if we're on the base branch then use the parent commit
+            MERGE_BASE=$(git rev-parse HEAD~)
+          else
+            # otherwise we're on a PR, so use the most recent base commit
+            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
+          fi
+          # Covers the case where a previous tag doesn't exist for the tree
+          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
+          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
+            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
+            exit 1
+          fi
+          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
+          # If no image exists but the hash is the same as the previous hash then we should error out here
+          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
+            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
+            echo "       contact the PyTorch team to restore the original images"
+            exit 1
+          fi
+          echo ::set-output name=rebuild::yes
+      - name: Build and push docker image
+        if: ${{ steps.check.outputs.rebuild }}
+        env:
+          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
+          DOCKER_SKIP_S3_UPLOAD: 1
+        run: |
+          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
+          cd .circleci/docker && ./build_docker.sh
+
+  build:
+    runs-on: linux.2xlarge
+    needs: [calculate-docker-image, ciflow_should_run]
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-build
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Build PyTorch
+        run: |
+          docker run \
+            -e BUILD_ENVIRONMENT \
+            -e JOB_BASE_NAME \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e SKIP_SCCACHE_INITIALIZATION=1 \
+            -e TORCH_CUDA_ARCH_LIST \
+            -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload binary build size statistics (Click Me)
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
+          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
+          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
+          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        run: |
+          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
+          export COMMIT_TIME
+          pip3 install requests
+          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Archive artifacts into zip
+        run: |
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store PyTorch Build Artifacts on S3
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            artifacts.zip
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Prune all of the docker images
+          docker system prune -af
+
+  generate-test-matrix:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: ubuntu-18.04
+    needs: [ciflow_should_run]
+    env:
+      TEST_RUNNER_TYPE: linux.2xlarge
+      ENABLE_DISTRIBUTED_TEST: 1
+      ENABLE_JIT_LEGACY_TEST: ''
+      ENABLE_MULTIGPU_TEST: ''
+      ENABLE_NOGPU_NO_AVX_TEST: ''
+      ENABLE_NOGPU_NO_AVX2_TEST: ''
+      ENABLE_SLOW_TEST: ''
+      ENABLE_DOCS_TEST: ''
+      ENABLE_BACKWARDS_COMPAT_TEST: ''
+      ENABLE_XLA_TEST: ''
+      ENABLE_NOARCH_TEST: ''
+      NUM_TEST_SHARDS: 1
+      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
+      NOGPU_RUNNER_TYPE: linux.2xlarge
+      PR_BODY: ${{ github.event.pull_request.body }}
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
+      ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
+    container:
+      image: python:3.9
+    steps:
+      - name: Install dependencies
+        run: pip install typing-extensions
+      - name: Clone pytorch/pytorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+      - name: Generating test matrix
+        id: set-matrix
+        run: .github/scripts/generate_pytorch_test_matrix.py
+
+  test:
+    needs: [calculate-docker-image, build, generate-test-matrix, ciflow_should_run]
+    strategy:
+      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test
+      TEST_CONFIG: ${{ matrix.config }}
+      SHARD_NUMBER: ${{ matrix.shard }}
+      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+      PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
+      CONTINUE_THROUGH_ERROR: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'push' || github.event_name == 'schedule') }}
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
+        run: |
+          bash .github/scripts/install_nvidia_utils_linux.sh
+          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -o artifacts.zip
+      - name: Output disk space left
+        run: |
+          sudo df -H
+      - name: Test PyTorch
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        run: |
+          if [[ $TEST_CONFIG == 'multigpu' ]]; then
+            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+          else
+            TEST_COMMAND=.jenkins/pytorch/test.sh
+          fi
+          if [[ $NUM_TEST_SHARDS -ne 2 ]]; then
+            export SHARD_NUMBER=0
+          fi
+          # TODO: Stop building test binaries as part of the build phase
+          # Used for GPU_FLAG since that doesn't play nice
+          # shellcheck disable=SC2086
+          docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e PR_NUMBER \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e GITHUB_ACTIONS \
+            -e IN_CI \
+            -e IN_WHEEL_TEST \
+            -e SHARD_NUMBER \
+            -e JOB_BASE_NAME \
+            -e TEST_CONFIG \
+            -e NUM_TEST_SHARDS \
+            -e PYTORCH_IGNORE_DISABLED_ISSUES \
+            -e PR_LABELS \
+            -e CONTINUE_THROUGH_ERROR \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && '$TEST_COMMAND
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Zip test reports for upload
+        if: always()
+        env:
+          COMMIT_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        run: |
+          # Remove any previous test reports if they exist
+          rm -f test-reports-*.zip
+          zip -r "test-reports-${COMMIT_SHA1}-${WORKFLOW_ID}.zip" test -i '*.xml'
+      - uses: actions/upload-artifact@v2
+        name: Store PyTorch Test Reports
+        if: always()
+        with:
+          name: test-reports-${{ matrix.config }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store PyTorch Test Reports on S3
+        if: always()
+        with:
+          name: test-reports-${{ matrix.config }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            test-reports-*.zip
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload test statistics (Click Me)
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          JOB_BASE_NAME: parallelnative-linux-xenial-py3.6-gcc5.4-test
+          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
+          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
+          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        shell: bash
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install boto3==1.16.34
+          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
diff --git a/.github/workflows/generated-puretorch-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-puretorch-linux-xenial-py3.6-gcc5.4.yml
new file mode 100644
index 0000000000000..af1228903b1f5
--- /dev/null
+++ b/.github/workflows/generated-puretorch-linux-xenial-py3.6-gcc5.4.yml
@@ -0,0 +1,275 @@
+# @generated DO NOT EDIT MANUALLY
+# Template is at:    .github/templates/linux_ci_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: puretorch-linux-xenial-py3.6-gcc5.4
+
+on:
+  pull_request:
+    types: [unassigned]
+  push:
+    branches:
+      - master
+      - release/*
+  workflow_dispatch:
+
+env:
+  BUILD_ENVIRONMENT: puretorch-linux-xenial-py3.6-gcc5.4
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
+  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
+  TORCH_CUDA_ARCH_LIST: 5.2
+  IN_CI: 1
+  # This is used for the phase of adding wheel tests only, will be removed once completed
+  IN_WHEEL_TEST: 1
+  # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
+  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
+
+concurrency:
+  group: puretorch-linux-xenial-py3.6-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  ciflow_should_run:
+    runs-on: ubuntu-18.04
+    if: ${{ (github.event_name != 'pull_request') || (github.event.action !='unassigned') || (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) }}
+    env:
+      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
+    steps:
+      - name: noop
+        run: echo running ciflow_should_run
+      - name: print labels
+        run: echo "${LABELS}"
+  calculate-docker-image:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: linux.2xlarge
+    needs: [ciflow_should_run]
+    env:
+      DOCKER_BUILDKIT: 1
+    timeout-minutes: 90
+    outputs:
+      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: false
+      - name: Calculate docker image tag
+        id: calculate-tag
+        run: |
+          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+          echo "::set-output name=docker_tag::${DOCKER_TAG}"
+          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
+      - name: Check if image should be built
+        id: check
+        env:
+          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
+          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
+        run: |
+          set -x
+          # Check if image already exists, if it does then skip building it
+          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
+            exit 0
+          fi
+          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
+            # if we're on the base branch then use the parent commit
+            MERGE_BASE=$(git rev-parse HEAD~)
+          else
+            # otherwise we're on a PR, so use the most recent base commit
+            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
+          fi
+          # Covers the case where a previous tag doesn't exist for the tree
+          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
+          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
+            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
+            exit 1
+          fi
+          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
+          # If no image exists but the hash is the same as the previous hash then we should error out here
+          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
+            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
+            echo "       contact the PyTorch team to restore the original images"
+            exit 1
+          fi
+          echo ::set-output name=rebuild::yes
+      - name: Build and push docker image
+        if: ${{ steps.check.outputs.rebuild }}
+        env:
+          DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker_tag }}
+          DOCKER_SKIP_S3_UPLOAD: 1
+        run: |
+          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
+          cd .circleci/docker && ./build_docker.sh
+
+  build:
+    runs-on: linux.2xlarge
+    needs: [calculate-docker-image, ciflow_should_run]
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+      JOB_BASE_NAME: puretorch-linux-xenial-py3.6-gcc5.4-build
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE:?}/*"
+          rm -f ~/.ssh/authorized_keys
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Preserve github env variables for use in docker
+        run: |
+          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          # deep clone, to allow use of git merge-base
+          fetch-depth: 0
+          submodules: recursive
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Build PyTorch
+        run: |
+          docker run \
+            -e BUILD_ENVIRONMENT \
+            -e JOB_BASE_NAME \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e SCCACHE_BUCKET \
+            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
+            -e SKIP_SCCACHE_INITIALIZATION=1 \
+            -e TORCH_CUDA_ARCH_LIST \
+            -e PR_LABELS \
+            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
+            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
+      - name: Parse ref
+        id: parse-ref
+        run: .github/scripts/parse_ref.py
+      - name: Display and upload binary build size statistics (Click Me)
+        # temporary hack: set CIRCLE_* vars, until we update
+        # tools/stats/print_test_stats.py to natively support GitHub Actions
+        env:
+          AWS_DEFAULT_REGION: us-east-1
+          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
+          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
+          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
+          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
+          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
+        run: |
+          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
+          export COMMIT_TIME
+          pip3 install requests
+          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Archive artifacts into zip
+        run: |
+          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
+      - uses: seemethere/upload-artifact-s3@v3
+        name: Store PyTorch Build Artifacts on S3
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            artifacts.zip
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+      - name: Chown workspace
+        env:
+          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index 085cf5152e6f1..226b8521ee049 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -59,7 +59,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
   export BUILD_SPLIT_CUDA=ON
 fi
 
-if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then
+if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* || ${BUILD_ENVIRONMENT} == *"puretorch"* ]]; then
   export BUILD_CAFFE2=OFF
 fi
 

From 0a1aaff0dea5953928cefc506b4f4d39e0cb8a4d Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Mon, 6 Sep 2021 21:24:38 -0700
Subject: [PATCH 516/530] Remove dead code from THC (THCApply.cuh) (#64559)

Summary:
cc peterbell10

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64559

Reviewed By: mruberry

Differential Revision: D30769526

Pulled By: ngimel

fbshipit-source-id: 034a5c778a2b902cffa57b76511fa0dcdea26825
---
 .../ATen/native/cuda/DistributionBernoulli.cu |   1 -
 .../native/cuda/DistributionCauchyKernel.cu   |   1 -
 .../cuda/DistributionExponentialKernel.cu     |   1 -
 .../cuda/DistributionGeometricKernel.cu       |   1 -
 .../cuda/DistributionLogNormalKernel.cu       |   1 -
 .../ATen/native/cuda/DistributionNormal.cu    |   1 -
 .../native/cuda/DistributionRandomKernel.cu   |   1 -
 aten/src/ATen/native/cuda/Distributions.cu    |   1 -
 aten/src/THC/CMakeLists.txt                   |   1 -
 aten/src/THC/THCApply.cuh                     | 760 ------------------
 aten/src/THC/THCTensorCopy.cu                 |  33 +-
 aten/src/THC/generic/THCTensorCopy.cu         |  18 -
 12 files changed, 2 insertions(+), 818 deletions(-)
 delete mode 100644 aten/src/THC/THCApply.cuh

diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu
index 0baaf2e049b04..e113d82c0f5c7 100644
--- a/aten/src/ATen/native/cuda/DistributionBernoulli.cu
+++ b/aten/src/ATen/native/cuda/DistributionBernoulli.cu
@@ -18,7 +18,6 @@
 #include <ATen/native/TensorIterator.h>
 
 #include <THC/THCGeneral.h>
-#include <THC/THCApply.cuh>
 #include <THC/THCDeviceUtils.cuh>
 
 #include <cstdint>
diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu
index 6f43ee664cb2c..b33ee792ea4cc 100644
--- a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu
@@ -18,7 +18,6 @@
 #include <ATen/native/TensorIterator.h>
 
 #include <THC/THCGeneral.h>
-#include <THC/THCApply.cuh>
 #include <THC/THCDeviceUtils.cuh>
 
 #include <cstdint>
diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu
index 6e1823032a789..f28a910e9980b 100644
--- a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu
@@ -18,7 +18,6 @@
 #include <ATen/native/TensorIterator.h>
 
 #include <THC/THCGeneral.h>
-#include <THC/THCApply.cuh>
 #include <THC/THCDeviceUtils.cuh>
 
 #include <cstdint>
diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu
index 9086e2a35c8d3..6cafba0dcbe78 100644
--- a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu
@@ -18,7 +18,6 @@
 #include <ATen/native/TensorIterator.h>
 
 #include <THC/THCGeneral.h>
-#include <THC/THCApply.cuh>
 #include <THC/THCDeviceUtils.cuh>
 
 #include <cstdint>
diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu
index 9497cf83cc405..c5da3bdf92d2a 100644
--- a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu
@@ -18,7 +18,6 @@
 #include <ATen/native/TensorIterator.h>
 
 #include <THC/THCGeneral.h>
-#include <THC/THCApply.cuh>
 #include <THC/THCDeviceUtils.cuh>
 
 #include <cstdint>
diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu
index 32d223c5d0a93..1b2dd19eec0d1 100644
--- a/aten/src/ATen/native/cuda/DistributionNormal.cu
+++ b/aten/src/ATen/native/cuda/DistributionNormal.cu
@@ -18,7 +18,6 @@
 #include <ATen/native/TensorIterator.h>
 
 #include <THC/THCGeneral.h>
-#include <THC/THCApply.cuh>
 #include <THC/THCDeviceUtils.cuh>
 
 #include <cstdint>
diff --git a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu
index 57d0701329d91..ea2aaad9445b2 100644
--- a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu
@@ -18,7 +18,6 @@
 #include <ATen/native/TensorIterator.h>
 
 #include <THC/THCGeneral.h>
-#include <THC/THCApply.cuh>
 #include <THC/THCDeviceUtils.cuh>
 
 #include <cstdint>
diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu
index a48a3778305ab..81f8fe8fa227f 100644
--- a/aten/src/ATen/native/cuda/Distributions.cu
+++ b/aten/src/ATen/native/cuda/Distributions.cu
@@ -18,7 +18,6 @@
 #include <ATen/native/TensorIterator.h>
 
 #include <THC/THCGeneral.h>
-#include <THC/THCApply.cuh>
 #include <THC/THCDeviceUtils.cuh>
 
 #include <cstdint>
diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt
index 82828a7ac1c89..ab7f72b2f41d4 100644
--- a/aten/src/THC/CMakeLists.txt
+++ b/aten/src/THC/CMakeLists.txt
@@ -30,7 +30,6 @@ install(FILES
           THCTensor.h
           THCTensorCopy.h
           THCTensorCopy.hpp
-          THCApply.cuh
           THCReduceApplyUtils.cuh
           THCTensorMathReduce.cuh
           THCAsmUtils.cuh
diff --git a/aten/src/THC/THCApply.cuh b/aten/src/THC/THCApply.cuh
deleted file mode 100644
index e424b2406ee3c..0000000000000
--- a/aten/src/THC/THCApply.cuh
+++ /dev/null
@@ -1,760 +0,0 @@
-#ifndef THC_APPLY_INC
-#define THC_APPLY_INC
-
-#include <THC/THCTensorCopy.h>
-#include <THC/THCReduceApplyUtils.cuh>
-#include <THC/THCTensorTypeUtils.cuh>
-#include <THC/THCTensorCopy.hpp>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAException.h>
-
-//
-// This file contains pointwise operation functions and kernels that
-// work on both contiguous and non-contiguous tensor arguments of
-// arbitrary (up to MAX_CUTORCH_DIMS) dimensioned arguments without
-// copying or temporary storage.
-//
-
-// Rearrange dimensions for pointwise operations so that strides are in
-// decreasing order as much as possible, so that kernels have better memory
-// access patterns.
-//
-// For example, consider a binary operation on two "transposed" 2-dim tensors:
-//    sizes:          256 512
-//    aInfo->strides:   1 256
-//    bInfo->strides:   1 256
-//
-// Given this, each concurrent memory access inside kernelPointwiseApply2() is
-// exactly 256 elements apart, resulting in poor performance.
-//
-// This function exchanges dimensions so that memory access is contiguous:
-//    sizes:          512 256
-//    aInfo->strides: 256   1
-//    bInfo->strides: 256   1
-//
-// (Actually, it becomes even better because now collapseDims() can turn each
-// input into one contiguous array.)
-//
-// In general, given M (<=3) TensorInfo's with N dimensions, we can view each
-// strides[i] (0 <= i < N) as an M-tuple.  Given each pair i < j, we exchange
-// strides[i] and [j] if
-//    (1) strides[i][k] < strides[j][k] for some k (0 <= k < M)
-//        (exchanging them will benefit input #k), and
-//    (2) strides[i][k] <= strieds[j][k] for all k
-//        (exchanging them will not make any input worse).
-template <typename T1, typename IndexType,
-          typename T2 = void, typename T3 = void>
-void rearrangeDims(TensorInfo<T1, IndexType>* aInfo,
-                   TensorInfo<T2, IndexType>* bInfo = nullptr,
-                   TensorInfo<T3, IndexType>* cInfo = nullptr) {
-  int numInfos = 1;
-  int dims = aInfo->dims;
-  IndexType *sizes[3] = { aInfo->sizes, };
-  IndexType *strides[3] = { aInfo->strides, };
-
-  if (bInfo != nullptr) {
-    ++numInfos;
-    if (bInfo->dims != dims) return;
-    sizes[1] = bInfo->sizes;
-    strides[1] = bInfo->strides;
-  }
-
-  if (cInfo != nullptr) {
-    ++numInfos;
-    if (cInfo->dims != dims) return;
-    sizes[2] = cInfo->sizes;
-    strides[2] = cInfo->strides;
-  }
-
-  // Bail out if sizes do not match: we are using "deprecated pointwise
-  // behavior" among tensors of different shapes but same number of elements.
-  for (int i = 1; i < numInfos; ++i) {
-    for (int j = 0; j < dims; ++j) {
-      if (sizes[i][j] != sizes[0][j]) return;
-    }
-  }
-
-  for (int i = 0; i < dims - 1; ++i) {
-    // No need to consider dimensions of size 1.
-    if (sizes[0][i] == 1) continue;
-
-    for (int j = i + 1; j < dims; ++j) {
-      if (sizes[0][j] == 1) continue;
-
-      // Compare the relative sizes of strides between dim #i and dim #j.
-      bool hasIncreasingStrides = false;
-      bool hasDecreasingStrides = false;
-
-      for (int k = 0; k < numInfos; k++) {
-        IndexType stride_i = strides[k][i];
-        IndexType stride_j = strides[k][j];
-        if (stride_i < stride_j) {
-          hasIncreasingStrides = true;
-        } else if (stride_i > stride_j) {
-          hasDecreasingStrides = true;
-        }
-      }
-
-      if (hasIncreasingStrides && !hasDecreasingStrides) {
-        for (int k = 0; k < numInfos; k++) {
-          IndexType size = sizes[k][i];
-          sizes[k][i] = sizes[k][j];
-          sizes[k][j] = size;
-
-          IndexType stride = strides[k][i];
-          strides[k][i] = strides[k][j];
-          strides[k][j] = stride;
-        }
-      }
-    }
-  }
-}
-
-// Threads per block for our apply kernel
-// FIXME: use occupancy calculator instead
-#define THC_APPLY_THREADS_PER_BLOCK (32 * 16)
-#define THC_APPLY_BLOCKS_PER_SM 4
-template <typename Op,
-          typename Ta,
-          typename IndexType,
-          int ADims>
-#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS_2(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM)
-#endif
-__global__ void
-kernelPointwiseApply1(const OffsetInfo<Ta, IndexType, ADims> a,
-                      IndexType totalElements,
-                      Op op) {
-  // NOTE: The two typecasts below are essential when IndexType is 64-bit;
-  //       without them, results are silently truncated to 32 bits!
-  for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x;
-       linearIndex < totalElements;
-       linearIndex += (IndexType) gridDim.x * blockDim.x) {
-    op(a.get(linearIndex));
-  }
-}
-
-template <typename Op,
-          typename Ta, typename Tb,
-          typename IndexType,
-          int ADims, int BDims>
-#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS_2(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM)
-#endif
-__global__ void
-kernelPointwiseApply2(const OffsetInfo<Ta, IndexType, ADims> a,
-                      const OffsetInfo<Tb, IndexType, BDims> b,
-                      IndexType totalElements,
-                      Op op) {
-  for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x;
-       linearIndex < totalElements;
-       linearIndex += (IndexType) gridDim.x * blockDim.x) {
-    op(a.get(linearIndex), b.get(linearIndex));
-  }
-}
-
-template <typename Op,
-          typename Ta, typename Tb, typename Tc,
-          typename IndexType,
-          int ADims, int BDims, int CDims>
-#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS_2(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM)
-#endif
-__global__ void
-kernelPointwiseApply3(const OffsetInfo<Ta, IndexType, ADims> a,
-                      const OffsetInfo<Tb, IndexType, BDims> b,
-                      const OffsetInfo<Tc, IndexType, CDims> c,
-                      IndexType totalElements,
-                      Op op) {
-  for (IndexType linearIndex = (IndexType) blockIdx.x * blockDim.x + threadIdx.x;
-       linearIndex < totalElements;
-       linearIndex += (IndexType) gridDim.x * blockDim.x) {
-    op(a.get(linearIndex), b.get(linearIndex), c.get(linearIndex));
-  }
-}
-
-inline dim3 getApplyBlock() {
-  return dim3(THC_APPLY_THREADS_PER_BLOCK);
-}
-
-inline bool getApplyGrid(THCState* state, uint64_t totalElements, dim3& grid, int curDevice) {
-  if (curDevice == -1) return false;
-
-  uint64_t numBlocks = THCCeilDiv(totalElements, static_cast<uint64_t>(THC_APPLY_THREADS_PER_BLOCK));
-  uint64_t maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0];
-  if (numBlocks > maxGridX)
-      numBlocks = maxGridX;
-
-  // For 32-bit indices, make sure that gridDim.x * blockDim.x fits in 32 bits.
-  if (totalElements <= INT32_MAX &&
-      numBlocks > INT32_MAX / THC_APPLY_THREADS_PER_BLOCK)
-    numBlocks = INT32_MAX / THC_APPLY_THREADS_PER_BLOCK;
-
-  grid = dim3(numBlocks);
-  return true;
-}
-
-template <typename ScalarTypeA,
-          typename TensorTypeA,
-          typename Op>
-bool THC_pointwiseApply1(THCState* state,
-                         TensorTypeA* a,
-                         const Op& op,
-                         TensorArgType aType = ReadWrite) {
-  if (THCTensor_nDimensionLegacyAll(state, a) > MAX_CUTORCH_DIMS) {
-    return false;
-  }
-
-  if (THCTensor_nDimensionLegacyAll(state, a) == 0) {
-    // Zero-dim tensor; do nothing
-    return true;
-  }
-
-  const dim3 block = getApplyBlock();
-
-  dim3 grid;
-  ptrdiff_t totalElements = THCTensor_nElement(state, a);
-
-  int curDevice = -1;
-  cudaGetDevice(&curDevice);
-  if (!getApplyGrid(state, totalElements, grid, curDevice)) {
-    return false;
-  }
-
-  /*
-  Expands readable/writable tensors whose indices may be "overlapped."
-  This ensures that each element of the tensor is operated on once and only
-  once.
-  */
-  TensorTypeA* oldA = NULL;
-
-  if (aType == ReadWrite &&
-      THCTensor_maybeOverlappingIndices(state, a)) {
-    // Must perform in contiguous space
-    oldA = a;
-    a = (TensorTypeA*)THCTensor_newContiguous<ScalarTypeA>(state, a);
-  }
-
-  // It is possible that the tensor dimensions are able to be collapsed,
-  // and thus we can reduce the actual code complexity of the copy by
-  // exploiting this knowledge statically, since the div/mod is the
-  // most expensive part of the operation, more so than memory accesses.
-  // For instance, when copying a non-contiguous to a contiguous tensor
-  // (or vice versa), the contiguous tensor can be collapsed to one
-  // dimension, and the loop to translate the linear index to the array
-  // index can be similarly collapsed. That is what this unrolling is for.
-#define HANDLE_CASE(TYPE, A)                                              \
-  kernelPointwiseApply1<Op, ScalarTypeA, TYPE, A>                         \
-    <<<grid, block, 0, c10::cuda::getCurrentCUDAStream(curDevice)>>>(     \
-      OffsetInfo<ScalarTypeA, TYPE, A>(aInfo), (TYPE) totalElements, op); \
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
-
-#define HANDLE_A_CASE(TYPE, A) {            \
-  switch (A) {                              \
-    case 1:                                 \
-      HANDLE_CASE(TYPE, 1);                 \
-      break;                                \
-    case 2:                                 \
-      HANDLE_CASE(TYPE, 2);                 \
-      break;                                \
-    default:                                \
-      HANDLE_CASE(TYPE, -1);                \
-      break;                                \
-  }                                         \
-}
-
-  // Can we use 32-bit integer math in the kernel (the linear ID for the copy
-  // and the resulting non-linear offset is all computable using 32-bit math?)
-  // We also use unsigned index math in the kernel, as signed div/mod has
-  // additional overhead.
-  if (THCTensor_canUse32BitIndexMath(state, a)) {
-    TensorInfo<ScalarTypeA, unsigned int> aInfo =
-      getTensorInfo<ScalarTypeA, TensorTypeA, unsigned int>(state, a);
-    rearrangeDims(&aInfo);
-    aInfo.collapseDims();
-#if CUDA_VERSION < 9000
-    if (!aInfo.isContiguous()) {
-        grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x);
-    }
-#endif
-    HANDLE_A_CASE(unsigned int, aInfo.dims);
-  } else {
-    TensorInfo<ScalarTypeA, uint64_t> aInfo =
-      getTensorInfo<ScalarTypeA, TensorTypeA, uint64_t>(state, a);
-    rearrangeDims(&aInfo);
-    aInfo.collapseDims();
-
-    /*
-    Only instantiates the all 1D special case and the fallback all nD case for
-    large (64-bit indexed) tensors to reduce compilation time.
-    */
-    if (aInfo.dims == 1) {
-      OffsetInfo<ScalarTypeA, uint64_t, 1>
-        aOffset(aInfo);
-      kernelPointwiseApply1<Op,
-                            ScalarTypeA,
-                            uint64_t, 1>
-        <<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(
-          aOffset, (uint64_t) totalElements, op);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-    } else {
-
-#if CUDA_VERSION < 9000
-        grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x);
-#endif
-      OffsetInfo<ScalarTypeA, uint64_t, -1>
-        aOffset(aInfo);
-      kernelPointwiseApply1<Op,
-                            ScalarTypeA,
-                            uint64_t, -1>
-        <<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(
-          aOffset, (uint64_t) totalElements, op);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-    }
-  }
-#undef HANDLE_CASE
-#undef HANDLE_A_CASE
-
-  if (oldA) {
-    // Ignore overlaps when copying back; if we use THCTensor_copy
-    // instead, it will recursively try and invoke ourselves to make
-    // oldA contiguous.
-    THCTensor_copyIgnoringOverlaps<ScalarTypeA>(state, oldA, a);
-    THCTensor_free(state, a);
-    a = oldA;
-  }
-
-  return true;
-}
-
-template <typename ScalarTypeA,
-          typename ScalarTypeB,
-          typename TensorTypeA,
-          typename TensorTypeB,
-          typename Op>
-bool THC_pointwiseApply2(THCState* state,
-                         TensorTypeA* a,
-                         TensorTypeB* b,
-                         const Op& op,
-                         TensorArgType aType = ReadWrite,
-                         TensorArgType bType = ReadOnly) {
-  ptrdiff_t totalElements = THCTensor_nElement(state, a);
-  if (totalElements != THCTensor_nElement(state, b)) {
-    return false;
-  }
-
-  if (THCTensor_nDimensionLegacyAll(state, a) > MAX_CUTORCH_DIMS ||
-      THCTensor_nDimensionLegacyAll(state, b) > MAX_CUTORCH_DIMS) {
-    return false;
-  }
-
-  if (THCTensor_nDimensionLegacyAll(state, a) == 0) {
-    // Zero-dim tensor; do nothing
-    return true;
-  }
-
-  const dim3 block = getApplyBlock();
-
-  dim3 grid;
-  int curDevice = -1;
-  cudaGetDevice(&curDevice);
-  if (!getApplyGrid(state, totalElements, grid, curDevice)) {
-    return false;
-  }
-
-  /*
-  Expands readable/writable tensors whose indices may be "overlapped."
-  This ensures that each element of the tensor is operated on once and only
-  once.
-  */
-  TensorTypeA* oldA = NULL;
-  TensorTypeB* oldB = NULL;
-
-  if (aType == ReadWrite &&
-      THCTensor_maybeOverlappingIndices(state, a)) {
-    // Must perform in contiguous space
-    oldA = a;
-    a = (TensorTypeA*)THCTensor_newContiguous<ScalarTypeA>(state, a);
-  }
-  if (bType == ReadWrite &&
-      THCTensor_maybeOverlappingIndices(state, b)) {
-    // Must perform in contiguous space
-    oldB = b;
-    b = (TensorTypeB*)THCTensor_newContiguous<ScalarTypeB>(state, b);
-  }
-
-  // It is possible that the tensor dimensions are able to be collapsed,
-  // and thus we can reduce the actual code complexity of the copy by
-  // exploiting this knowledge statically, since the div/mod is the
-  // most expensive part of the operation, more so than memory accesses.
-  // For instance, when copying a non-contiguous to a contiguous tensor
-  // (or vice versa), the contiguous tensor can be collapsed to one
-  // dimension, and the loop to translate the linear index to the array
-  // index can be similarly collapsed. That is what this unrolling is for.
-#define HANDLE_CASE(TYPE, A, B)                                         \
-  kernelPointwiseApply2<Op, ScalarTypeA, ScalarTypeB, TYPE, A, B>       \
-    <<<grid, block, 0, c10::cuda::getCurrentCUDAStream(curDevice)>>>(   \
-      OffsetInfo<ScalarTypeA, TYPE, A>(aInfo),                          \
-      OffsetInfo<ScalarTypeB, TYPE, B>(bInfo),                          \
-      (TYPE) totalElements, op);                                        \
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-
-
-#define HANDLE_B_CASE(TYPE, A, B) {         \
-  switch (B) {                              \
-    case 1:                                 \
-      HANDLE_CASE(TYPE, A, 1);              \
-      break;                                \
-    case 2:                                 \
-      HANDLE_CASE(TYPE, A, 2);              \
-      break;                                \
-    default:                                \
-      HANDLE_CASE(TYPE, A, -1);             \
-      break;                                \
-  }                                         \
-}
-
-#define HANDLE_A_CASE(TYPE, A, B) {         \
-  switch (A) {                              \
-    case 1:                                 \
-      HANDLE_B_CASE(TYPE, 1, B);            \
-      break;                                \
-    case 2:                                 \
-      HANDLE_B_CASE(TYPE, 2, B);            \
-      break;                                \
-    default:                                \
-      HANDLE_B_CASE(TYPE, -1, B);           \
-      break;                                \
-  }                                         \
-}
-
-  if (THCTensor_canUse32BitIndexMath(state, a) &&
-      THCTensor_canUse32BitIndexMath(state, b)) {
-    TensorInfo<ScalarTypeA, unsigned int> aInfo =
-      getTensorInfo<ScalarTypeA, TensorTypeA, unsigned int>(state, a);
-
-    TensorInfo<ScalarTypeB, unsigned int> bInfo =
-      getTensorInfo<ScalarTypeB, TensorTypeB, unsigned int>(state, b);
-
-    rearrangeDims(&aInfo, &bInfo);
-    aInfo.collapseDims();
-    bInfo.collapseDims();
-#if CUDA_VERSION < 9000
-    if (!(aInfo.isContiguous() && bInfo.isContiguous()))
-        grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x);
-#endif
-
-    HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims);
-  } else {
-    TensorInfo<ScalarTypeA, uint64_t> aInfo =
-      getTensorInfo<ScalarTypeA, TensorTypeA, uint64_t>(state, a);
-
-    TensorInfo<ScalarTypeB, uint64_t> bInfo =
-      getTensorInfo<ScalarTypeB, TensorTypeB, uint64_t>(state, b);
-
-    rearrangeDims(&aInfo, &bInfo);
-    aInfo.collapseDims();
-    bInfo.collapseDims();
-
-    /*
-    Only instantiates the all 1D special case and the fallback all nD case for
-    large (64-bit indexed) tensors to reduce compilation time.
-    */
-    if (aInfo.dims == 1 && bInfo.dims == 1) {
-      OffsetInfo<ScalarTypeA, uint64_t, 1>
-        aOffset(aInfo);
-      OffsetInfo<ScalarTypeB, uint64_t, 1>
-        bOffset(bInfo);
-      kernelPointwiseApply2<Op,
-                            ScalarTypeA,
-                            ScalarTypeB,
-                            uint64_t, 1, 1>
-        <<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(
-          aOffset, bOffset, (uint64_t) totalElements, op);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-    } else {
-#if CUDA_VERSION < 9000
-      grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x);
-#endif
-      OffsetInfo<ScalarTypeA, uint64_t, -1>
-        aOffset(aInfo);
-      OffsetInfo<ScalarTypeB, uint64_t, -1>
-        bOffset(bInfo);
-      kernelPointwiseApply2<Op,
-                            ScalarTypeA,
-                            ScalarTypeB,
-                            uint64_t, -1, -1>
-        <<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(
-          aOffset, bOffset, (uint64_t) totalElements, op);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-    }
-  }
-#undef HANDLE_CASE
-#undef HANDLE_B_CASE
-#undef HANDLE_A_CASE
-
-  if (oldA) {
-    // Ignore overlaps when copying back; if we use THCTensor_copy
-    // instead, it will recursively try and invoke ourselves to make
-    // oldA contiguous.
-    THCTensor_copyIgnoringOverlaps<ScalarTypeA>(state, oldA, a);
-    THCTensor_free(state, a);
-    a = oldA;
-  }
-
-  if (oldB) {
-    // Ignore overlaps when copying back; if we use THCTensor_copy
-    // instead, it will recursively try and invoke ourselves to make
-    // oldB contiguous.
-    THCTensor_copyIgnoringOverlaps<ScalarTypeB>(state, oldB, b);
-    THCTensor_free(state, b);
-    b = oldB;
-  }
-
-  return true;
-}
-
-template <typename ScalarTypeA,
-          typename ScalarTypeB,
-          typename ScalarTypeC,
-          typename TensorTypeA,
-          typename TensorTypeB,
-          typename TensorTypeC,
-          typename Op>
-bool THC_pointwiseApply3(THCState* state,
-                         TensorTypeA* a,
-                         TensorTypeB* b,
-                         TensorTypeC* c,
-                         const Op& op,
-                         TensorArgType aType = ReadWrite,
-                         TensorArgType bType = ReadOnly,
-                         TensorArgType cType = ReadOnly) {
-  ptrdiff_t totalElements = THCTensor_nElement(state, a);
-
-  if (totalElements != THCTensor_nElement(state, b) ||
-      totalElements != THCTensor_nElement(state, c)) {
-    return false;
-  }
-
-  if (THCTensor_nDimensionLegacyAll(state, a) > MAX_CUTORCH_DIMS ||
-      THCTensor_nDimensionLegacyAll(state, b) > MAX_CUTORCH_DIMS ||
-      THCTensor_nDimensionLegacyAll(state, c) > MAX_CUTORCH_DIMS) {
-    return false;
-  }
-
-  if (THCTensor_nDimensionLegacyAll(state, a) == 0) {
-    // Zero-dim tensor; do nothing
-    return true;
-  }
-
-  const dim3 block = getApplyBlock();
-
-  dim3 grid;
-  int curDevice = -1;
-  cudaGetDevice(&curDevice);
-  if (!getApplyGrid(state, totalElements, grid, curDevice)) {
-    return false;
-  }
-
-  /*
-  Expands readable/writable tensors whose indices may be "overlapped."
-  This ensures that each element of the tensor is operated on once and only
-  once.
-  */
-  TensorTypeA* oldA = NULL;
-  TensorTypeB* oldB = NULL;
-  TensorTypeC* oldC = NULL;
-
-  if (aType == ReadWrite &&
-      THCTensor_maybeOverlappingIndices(state, a)) {
-    // Must perform in contiguous space
-    oldA = a;
-    a = (TensorTypeA*)THCTensor_newContiguous<ScalarTypeA>(state, a);
-  }
-  if (bType == ReadWrite &&
-      THCTensor_maybeOverlappingIndices(state, b)) {
-    // Must perform in contiguous space
-    oldB = b;
-    b = (TensorTypeB*)THCTensor_newContiguous<ScalarTypeB>(state, b);
-  }
-  if (cType == ReadWrite &&
-      THCTensor_maybeOverlappingIndices(state, c)) {
-    // Must perform in contiguous space
-    oldC = c;
-    c = (TensorTypeC*)THCTensor_newContiguous<ScalarTypeC>(state, c);
-  }
-
-#define HANDLE_CASE(TYPE, A, B, C)                                      \
-  kernelPointwiseApply3<Op,                                             \
-                        ScalarTypeA,                                    \
-                        ScalarTypeB,                                    \
-                        ScalarTypeC,                                    \
-                        TYPE, A, B, C>                                  \
-    <<<grid, block, 0, c10::cuda::getCurrentCUDAStream(curDevice)>>>(   \
-      OffsetInfo<ScalarTypeA, TYPE, A>                                  \
-          (aInfo),                                                      \
-      OffsetInfo<ScalarTypeB, TYPE, B>                                  \
-          (bInfo),                                                      \
-      OffsetInfo<ScalarTypeC, TYPE, C>                                  \
-          (cInfo),                                                      \
-      (TYPE) totalElements, op);                                        \
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-
-#define HANDLE_C_CASE(TYPE, A, B, C) {      \
-  switch (C) {                              \
-    case 1:                                 \
-      HANDLE_CASE(TYPE, A, B, 1);           \
-      break;                                \
-    case 2:                                 \
-      HANDLE_CASE(TYPE, A, B, 2);           \
-      break;                                \
-    default:                                \
-      HANDLE_CASE(TYPE, A, B, -1);          \
-      break;                                \
-  }                                         \
-}
-
-#define HANDLE_B_CASE(TYPE, A, B, C) {      \
-  switch (B) {                              \
-    case 1:                                 \
-      HANDLE_C_CASE(TYPE, A, 1, C);         \
-      break;                                \
-    case 2:                                 \
-      HANDLE_C_CASE(TYPE, A, 2, C);         \
-      break;                                \
-    default:                                \
-      HANDLE_C_CASE(TYPE, A, -1, C);        \
-      break;                                \
-  }                                         \
-}
-
-#define HANDLE_A_CASE(TYPE, A, B, C) {      \
-  switch (A) {                              \
-    case 1:                                 \
-      HANDLE_B_CASE(TYPE, 1, B, C);         \
-      break;                                \
-    case 2:                                 \
-      HANDLE_B_CASE(TYPE, 2, B, C);         \
-      break;                                \
-    default:                                \
-      HANDLE_B_CASE(TYPE, -1, B, C);        \
-      break;                                \
-  }                                         \
-}
-
-  if (THCTensor_canUse32BitIndexMath(state, a) &&
-      THCTensor_canUse32BitIndexMath(state, b) &&
-      THCTensor_canUse32BitIndexMath(state, c)) {
-    TensorInfo<ScalarTypeA, unsigned int> aInfo =
-      getTensorInfo<ScalarTypeA, TensorTypeA, unsigned int>(state, a);
-
-    TensorInfo<ScalarTypeB, unsigned int> bInfo =
-      getTensorInfo<ScalarTypeB, TensorTypeB, unsigned int>(state, b);
-
-    TensorInfo<ScalarTypeC, unsigned int> cInfo =
-      getTensorInfo<ScalarTypeC, TensorTypeC, unsigned int>(state, c);
-
-    rearrangeDims(&aInfo, &bInfo, &cInfo);
-    aInfo.collapseDims();
-    bInfo.collapseDims();
-    cInfo.collapseDims();
-
-#if CUDA_VERSION < 9000
-      if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous()))
-          grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x);
-#endif
-    HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims);
-  } else {
-    TensorInfo<ScalarTypeA, uint64_t> aInfo =
-      getTensorInfo<ScalarTypeA, TensorTypeA, uint64_t>(state, a);
-
-    TensorInfo<ScalarTypeB, uint64_t> bInfo =
-      getTensorInfo<ScalarTypeB, TensorTypeB, uint64_t>(state, b);
-
-    TensorInfo<ScalarTypeC, uint64_t> cInfo =
-      getTensorInfo<ScalarTypeC, TensorTypeC, uint64_t>(state, c);
-
-    rearrangeDims(&aInfo, &bInfo, &cInfo);
-    aInfo.collapseDims();
-    bInfo.collapseDims();
-    cInfo.collapseDims();
-
-    /*
-    Only instantiates the all 1D special case and the fallback all nD case for
-    large (64-bit indexed) tensors to reduce compilation time.
-    */
-    if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1) {
-      OffsetInfo<ScalarTypeA, uint64_t, 1>
-        aOffset(aInfo);
-      OffsetInfo<ScalarTypeB, uint64_t, 1>
-        bOffset(bInfo);
-      OffsetInfo<ScalarTypeC, uint64_t, 1>
-        cOffset(cInfo);
-      kernelPointwiseApply3<Op,
-                            ScalarTypeA,
-                            ScalarTypeB,
-                            ScalarTypeC,
-                            uint64_t, 1, 1, 1>
-        <<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(
-          aOffset, bOffset, cOffset, (uint64_t) totalElements, op);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-    } else {
-#if CUDA_VERSION < 9000
-      grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x);
-#endif
-
-      OffsetInfo<ScalarTypeA, uint64_t, -1>
-        aOffset(aInfo);
-      OffsetInfo<ScalarTypeB, uint64_t, -1>
-        bOffset(bInfo);
-      OffsetInfo<ScalarTypeC, uint64_t, -1>
-        cOffset(cInfo);
-      kernelPointwiseApply3<Op,
-                            ScalarTypeA,
-                            ScalarTypeB,
-                            ScalarTypeC,
-                            uint64_t, -1, -1, -1>
-        <<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(
-          aOffset, bOffset, cOffset, (uint64_t) totalElements, op);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-    }
-  }
-#undef HANDLE_CASE
-#undef HANDLE_C_CASE
-#undef HANDLE_B_CASE
-#undef HANDLE_A_CASE
-
-  if (oldA) {
-    // Ignore overlaps when copying back; if we use THCTensor_copy
-    // instead, it will recursively try and invoke ourselves to make
-    // oldA contiguous.
-    THCTensor_copyIgnoringOverlaps<ScalarTypeA>(state, oldA, a);
-    THCTensor_free(state, a);
-    a = oldA;
-  }
-
-  if (oldB) {
-    // Ignore overlaps when copying back; if we use THCTensor_copy
-    // instead, it will recursively try and invoke ourselves to make
-    // oldB contiguous.
-    THCTensor_copyIgnoringOverlaps<ScalarTypeB>(state, oldB, b);
-    THCTensor_free(state, b);
-    b = oldB;
-  }
-
-  if (oldC) {
-    // Ignore overlaps when copying back; if we use THCTensor_copy
-    // instead, it will recursively try and invoke ourselves to make
-    // oldC contiguous.
-    THCTensor_copyIgnoringOverlaps<ScalarTypeC>(state, oldC, c);
-    THCTensor_free(state, c);
-    c = oldC;
-  }
-
-  return true;
-}
-
-#undef THC_APPLY_THREADS_PER_BLOCK
-#undef THC_APPLY_BLOCKS_PER_SM
-
-#endif // THC_APPLY_INC
diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu
index f4db80dfeb86a..fa1df622aff7c 100644
--- a/aten/src/THC/THCTensorCopy.cu
+++ b/aten/src/THC/THCTensorCopy.cu
@@ -1,35 +1,6 @@
-#include <THC/THCApply.cuh>
-#include <TH/THHalf.h>
-#include <THC/THCNumerics.cuh>
+#include <THC/THCTensorTypeUtils.cuh>
+#include <THC/THCTensorCopy.h>
 #include <THC/THCTensorCopy.hpp>
-#include <type_traits>
-#include <c10/util/BFloat16.h>
-
-// Copy operator for the pointwise apply kernel
-template <typename T>
-struct CopyOp {
-  __device__ __forceinline__ void operator()(T* dst, T* src) {
-#if __CUDA_ARCH__ >= 350
-    *dst = c10::static_cast_with_inter_type<T, T>::apply(*src);
-#else
-    *dst = c10::static_cast_with_inter_type<T, T>::apply(*src);
-#endif
-  }
-};
-
-template <>
-struct CopyOp <bool> {
-  __device__ __forceinline__ void operator()(bool* dst, bool* src) {
-      *dst = ScalarConvert<bool, bool>::to(*src);
-  }
-};
-
-template <>
-struct CopyOp <at::BFloat16> {
-  __device__ __forceinline__ void operator()(at::BFloat16* dst, at::BFloat16* src) {
-      *dst = ScalarConvert<at::BFloat16, at::BFloat16>::to(*src);
-  }
-};
 
 #include <THC/generic/THCTensorCopy.cu>
 #include <THC/THCGenerateAllTypes.h>
diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu
index 3941ef9599206..4301bccc0539b 100644
--- a/aten/src/THC/generic/THCTensorCopy.cu
+++ b/aten/src/THC/generic/THCTensorCopy.cu
@@ -44,22 +44,4 @@ void THCTensor_freeCopyTo<scalar_t>(THCState *state, THCTensor *self, THCTensor
   THCTensor_free(state, self);
 }
 
-template <>
-void THCTensor_copyIgnoringOverlaps<scalar_t>(THCState* state, THCTensor* dst, THCTensor* src) {
-  // Called when we are copying into an overlapping index `dst`, but
-  // we don't care which writer wins. Hacky but it works.
-  // This is itself invoked by pointwiseApply2 / THCTensor_copy in
-  // case that there are write overlaps.
-  // FIXME: really, overlapping writes should be illegal/an error in Torch
-  THC_pointwiseApply2<scalar_t, scalar_t>(
-    state, dst, src,
-    CopyOp<scalar_t>(),
-    ReadOnly, /* ignore overwrites */
-    ReadOnly);
-}
-
-void THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) {
-  THCTensor_copyIgnoringOverlaps<scalar_t>(state, dst, src);
-}
-
 #endif

From 1a1fb31cfa3135e56c533da037b5d8dc6981b7fa Mon Sep 17 00:00:00 2001
From: Anirudh Dagar <anirudhdagar6@gmail.com>
Date: Mon, 6 Sep 2021 23:55:53 -0700
Subject: [PATCH 517/530] Support `torch.concat` alias, add `cat` OpInfo &
 remove OpInfo test_out skips {cat, stack, hstack, vtack, dstack} (#62560)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/61767

## Changes

- [x] Add `torch.concat` alias to `torch.cat`
- [x] Add OpInfo for `cat`/`concat`
- [x] Fix `test_out` skips (Use `at::native::resize_output` or `at::native::resize_output_check`)
  - [x] `cat`/`concat`
  - [x] `stack`
  - [x] `hstack`
  - [x] `dstack`
  - [x] `vstack`/`row_stack`
- [x] Remove redundant tests for `cat`/`stack`

~I've not added `cat`/`concat` to OpInfo `op_db` yet, since cat is a little more tricky than other OpInfos (should have a lot of tests) and currently there are no OpInfos for that. I can try to add that in a subsequent PR or maybe here itself, whatever is suggested.~
**Edit**: cat/concat OpInfo has been added.

**Note**: I've added the named tensor support for `concat` alias as well, maybe that's out of spec in `array-api` but it is still useful for consistency in PyTorch.

Thanks to krshrimali for guidance on my first PR :))

cc mruberry rgommers pmeier asmeurer leofang AnirudhDagar asi1024 emcastillo kmaehashi heitorschueroff krshrimali

Pull Request resolved: https://github.com/pytorch/pytorch/pull/62560

Reviewed By: saketh-are

Differential Revision: D30762069

Pulled By: mruberry

fbshipit-source-id: 6985159d1d9756238890488a0ab3ae7699d94337
---
 aten/src/ATen/core/aten_interned_strings.h    |  2 -
 aten/src/ATen/core/interned_strings.h         |  3 +
 aten/src/ATen/native/Resize.cpp               |  2 +-
 aten/src/ATen/native/Resize.h                 |  8 ++-
 aten/src/ATen/native/TensorShape.cpp          | 28 ++++++--
 aten/src/ATen/native/cuda/Shape.cu            |  6 +-
 aten/src/ATen/native/native_functions.yaml    |  9 +++
 docs/source/torch.rst                         |  1 +
 test/test_autograd.py                         | 38 ----------
 test/test_fx_experimental.py                  |  2 +-
 test/test_tensor_creation_ops.py              | 69 ++++++++++++-------
 torch/_torch_docs.py                          |  7 ++
 torch/csrc/jit/passes/normalize_ops.cpp       |  1 +
 torch/overrides.py                            |  1 +
 .../_internal/common_methods_invocations.py   | 50 ++++++++++----
 15 files changed, 137 insertions(+), 90 deletions(-)

diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h
index abdf397544468..6da99dfc6a4d9 100644
--- a/aten/src/ATen/core/aten_interned_strings.h
+++ b/aten/src/ATen/core/aten_interned_strings.h
@@ -36,7 +36,6 @@ _(aten, _cast_Half) \
 _(aten, _cast_Int) \
 _(aten, _cast_Long) \
 _(aten, _cast_Short) \
-_(aten, _cat) \
 _(aten, _ceil) \
 _(aten, _clamp_max) \
 _(aten, _clamp_min) \
@@ -224,7 +223,6 @@ _(aten, bmm) \
 _(aten, broadcast_tensors) \
 _(aten, broadcast_to) \
 _(aten, cartesian_prod) \
-_(aten, cat) \
 _(aten, cauchy) \
 _(aten, ceil) \
 _(aten, celu) \
diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h
index 69e5f97f7127a..8d49d82c5c8f4 100644
--- a/aten/src/ATen/core/interned_strings.h
+++ b/aten/src/ATen/core/interned_strings.h
@@ -306,6 +306,9 @@ namespace c10 {
   _(aten, bin)                       \
   _(aten, pop)                       \
   _(aten, insert)                    \
+  _(aten, _cat)                      \
+  _(aten, cat)                       \
+  _(aten, concat)                    \
   _(aten, vstack)                    \
   _(aten, row_stack)                 \
   _(prim, unchecked_unwrap_optional) \
diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp
index f4bff473d2333..1937a8b3d545a 100644
--- a/aten/src/ATen/native/Resize.cpp
+++ b/aten/src/ATen/native/Resize.cpp
@@ -8,7 +8,7 @@ namespace at { namespace native {
 
 // Returns true if resize is necessary
 bool resize_output_check(const Tensor& output, IntArrayRef shape) {
-  // Tests for resizing of tensors with one more elements
+  // Tests for resizing of tensors with one or more elements
   if (output.sizes().equals(shape)) {
     return false;
   }
diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h
index 5e391a0ce7571..6fb52bc0803ac 100644
--- a/aten/src/ATen/native/Resize.h
+++ b/aten/src/ATen/native/Resize.h
@@ -10,7 +10,10 @@
 namespace at { namespace native {
 
 // TODO: make all operations that resize given outputs use this function
-//   for consistency and maintainability
+//   for consistency and maintainability.
+//   Some operations like `cat` might not be able to make the use of
+//   resize_output directly. For more details to understand how it works in `cat`,
+//   see https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362
 // Resizes outputs
 // Functions accepting output tensors, like with the "out" kwarg, should
 //   call this function to handle resizing their output tensor.
@@ -20,6 +23,9 @@ namespace at { namespace native {
 // Returns a bool saying whether or not the resize actually happened or not
 TORCH_API bool resize_output(const Tensor& output, IntArrayRef shape);
 
+// Utility for resize_output
+//  Returns a bool saying resize should happen or not and
+//  raises a warning if resizing for one or more elements
 TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape);
 
 TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes);
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index edbfa2329a02d..8f397862687ba 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -6,7 +6,6 @@
 #include <ATen/NamedTensorUtils.h>
 #include <ATen/core/DimVector.h>
 #include <ATen/native/Copy.h>
-#include <ATen/native/cpu/CatKernel.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/TypeProperties.h>
@@ -193,7 +192,10 @@ Tensor & _cat_out_cpu(TensorList tensors, int64_t dim, Tensor& result) {
   result_size[dim] = cat_dim_size;
 
   // skip resizing if size of result is same as expected
-  if (result.sizes() != result_size) {
+  // raise a warning while resizing if output has one or more elements
+  // See https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362
+  // for understanding why at::native::resize_output is not called directly.
+  if (at::native::resize_output_check(result, result_size)) {
     result.resize_(result_size, first_tensor_mem_format);
   }
 
@@ -301,6 +303,23 @@ Tensor cat(TensorList tensors, Dimname dim) {
   return at::cat(tensors, dimname_to_position(tensors[0], dim));
 }
 
+// torch.concat, alias for torch.cat
+Tensor& concat_out(TensorList tensors, Dimname dim, Tensor& result) {
+  return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim));
+}
+
+Tensor concat(TensorList tensors, Dimname dim) {
+  return at::cat(tensors, dimname_to_position(tensors[0], dim));
+}
+
+Tensor & concat_out(TensorList tensors, int64_t dim, Tensor & result) {
+  return at::cat_out(result, tensors, dim);
+}
+
+Tensor concat(TensorList tensors, int64_t dim) {
+  return at::cat(tensors, dim);
+}
+
 static bool sizes_match_except(IntArrayRef s1, IntArrayRef s2, int64_t dim_except /* should already be wrapped */) {
   if (s1.size() != s2.size()) {
     return false;
@@ -1497,9 +1516,8 @@ bool inline maybe_native_stack(Tensor& result, TensorList tensors, int64_t dim)
     result_sizes.insert(result_sizes.begin() + dim, tensors.size());
 
     // skip resizing if size of result is same as expected
-    if (result.sizes() != result_sizes) {
-      result.resize_(result_sizes);
-    }
+    // raise a warning while resizing if output has one or more elements
+    at::native::resize_output(result, result_sizes);
     stack_serial_stub(kCPU, result, tensors, dim);
     return true;
   }
diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu
index dec985447944e..05fa4c6e165c4 100644
--- a/aten/src/ATen/native/cuda/Shape.cu
+++ b/aten/src/ATen/native/cuda/Shape.cu
@@ -2,6 +2,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/MemoryOverlap.h>
 #include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/native/Resize.h>
 #include <ATen/native/TypeProperties.h>
 #include <ATen/Dispatch.h>
 #include <c10/core/MemoryFormat.h>
@@ -528,7 +529,10 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) {
   size[dimension] = cat_dim_size;
 
   // skip resizing if size of result is same as expected
-  if (out.sizes() != size) {
+  // raise a warning while resizing if output has one or more elements
+  // See https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362
+  // for understanding why at::native::resize_output is not called directly.
+  if (at::native::resize_output_check(out, size)) {
     out.resize_(size, memory_format);
   }
 
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index ca13e058411a4..3a1f75c588a83 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -974,6 +974,15 @@
 
 - func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
 
+# alias for torch.cat
+- func: concat(Tensor[] tensors, int dim=0) -> Tensor
+
+- func: concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: concat.names(Tensor[] tensors, Dimname dim) -> Tensor
+
+- func: concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+
 - func: block_diag(Tensor[] tensors) -> Tensor
   variants: function
 
diff --git a/docs/source/torch.rst b/docs/source/torch.rst
index 88cbc6986bf31..5aa5dbc9387b4 100644
--- a/docs/source/torch.rst
+++ b/docs/source/torch.rst
@@ -88,6 +88,7 @@ Indexing, Slicing, Joining, Mutating Ops
     :nosignatures:
 
     cat
+    concat
     conj
     chunk
     dsplit
diff --git a/test/test_autograd.py b/test/test_autograd.py
index 2da74cbd01938..61a46b439f213 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -2735,36 +2735,6 @@ def test_block_diag(self):
                               lambda a, b, c: torch.block_diag(a, b, c),
                               True, f_args_variable, f_args_tensor)
 
-    def test_cat(self):
-        f_args_variable = (torch.randn(1, S, S, dtype=torch.double, requires_grad=True),
-                           torch.randn(2, S, S, dtype=torch.double, requires_grad=True),
-                           torch.randn(3, S, S, dtype=torch.double, requires_grad=True),
-                           0)
-        f_args_tensor = deepcopy(unpack_variables(f_args_variable))
-        run_functional_checks(self, "test_cat", "cat",
-                              lambda a, b, c, dim: torch.cat((a, b, c), dim),
-                              True, f_args_variable, f_args_tensor, check_forward_ad=True)
-
-    def test_cat_negdim_1(self):
-        f_args_variable = (torch.randn(S, S, 1, dtype=torch.double, requires_grad=True),
-                           torch.randn(S, S, 2, dtype=torch.double, requires_grad=True),
-                           torch.randn(S, S, 3, dtype=torch.double, requires_grad=True),
-                           -1)
-        f_args_tensor = deepcopy(unpack_variables(f_args_variable))
-        run_functional_checks(self, "test_cat_negdim_1", "cat",
-                              lambda a, b, c, dim: torch.cat((a, b, c), dim),
-                              True, f_args_variable, f_args_tensor, check_forward_ad=True)
-
-    def test_cat_negdim_2(self):
-        f_args_variable = (torch.randn(S, 1, S, dtype=torch.double, requires_grad=True),
-                           torch.randn(S, 2, S, dtype=torch.double, requires_grad=True),
-                           torch.randn(S, 3, S, dtype=torch.double, requires_grad=True),
-                           -2)
-        f_args_tensor = deepcopy(unpack_variables(f_args_variable))
-        run_functional_checks(self, "test_cat_negdim_2", "cat",
-                              lambda a, b, c, dim: torch.cat((a, b, c), dim),
-                              True, f_args_variable, f_args_tensor, check_forward_ad=True)
-
     def test_cat_empty_legacy(self):
         f_args_variable = (torch.randn(0, dtype=torch.double, requires_grad=True),
                            torch.randn(S, S, dtype=torch.double, requires_grad=True))
@@ -2776,14 +2746,6 @@ def test_cat_empty_legacy(self):
                               False, f_args_variable, f_args_tensor, check_forward_ad=True)
         self.assertTrue(gradcheck(lambda a, b: torch.cat((a, b)), f_args_variable, eps=1e-6, atol=PRECISION))
 
-    def test_cat_empty(self):
-        f_args_variable = (torch.randn(0, S, dtype=torch.double, requires_grad=True),
-                           torch.randn(S, S, dtype=torch.double, requires_grad=True))
-        f_args_tensor = deepcopy(unpack_variables(f_args_variable))
-        run_functional_checks(self, "test_cat_empty", "cat",
-                              lambda a, b: torch.cat((a, b)),
-                              True, f_args_variable, f_args_tensor, check_forward_ad=True)
-
     def test_var_mean_differentiable(self):
         dim = [2, 4]
         keepdim = False
diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py
index e723ee4622991..fc90f494e3917 100644
--- a/test/test_fx_experimental.py
+++ b/test/test_fx_experimental.py
@@ -1497,7 +1497,7 @@ def test_normalize_operator_exhaustive(self, device, dtype, op):
             return
 
         # These ops currently don't trace in FX for various reasons (i.e. they take a list of tensors)
-        fx_fail = {"stack", "hstack", "vstack", "dstack", "linalg.multi_dot"}
+        fx_fail = {"cat", "stack", "hstack", "vstack", "dstack", "linalg.multi_dot"}
         sample_inputs_itr = op.sample_inputs(device, dtype, requires_grad=False)
         for sample_input in sample_inputs_itr:
             unsupported_arg_type = False
diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
index dcb49386c9ff8..a7496919eaeb8 100644
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@@ -695,6 +695,47 @@ def test_cat_preserve_channels_last(self, device):
         self.assertEqual(res1, res2)
         self.assertTrue(res1.is_contiguous(memory_format=torch.channels_last))
 
+    @onlyCUDA
+    def test_cat_out_memory_format(self, device):
+        inp_size = (4, 4, 4, 4)
+        expected_size = (8, 4, 4, 4)
+        a_cuda = torch.randn(inp_size, device=device).contiguous(memory_format=torch.channels_last)
+        a_cpu = torch.randn(inp_size, device='cpu').contiguous(memory_format=torch.channels_last)
+        b_cuda = torch.randn(inp_size, device=device).contiguous(memory_format=torch.contiguous_format)
+        b_cpu = torch.randn(inp_size, device='cpu').contiguous(memory_format=torch.contiguous_format)
+        c_cuda = torch.randn(inp_size, device=device).contiguous(memory_format=torch.channels_last)
+
+        # Case 1: if out= is the correct shape then the memory format of out= is respected
+
+        out_cuda = torch.empty(expected_size, device=device).contiguous(memory_format=torch.contiguous_format)
+        res1_cuda = torch.cat((a_cuda, b_cuda), out=out_cuda)
+
+        out_cpu = torch.empty(expected_size, device='cpu').contiguous(memory_format=torch.contiguous_format)
+        res1_cpu = torch.cat((a_cpu, b_cpu), out=out_cpu)
+
+        self.assertTrue(res1_cuda.is_contiguous(memory_format=torch.contiguous_format))
+        self.assertTrue(res1_cpu.is_contiguous(memory_format=torch.contiguous_format))
+
+        # Case 2: if out= is not the correct shape then the output it is resized internally
+        # - For the CPU variant the memory format is that of the first tensor
+        # - For the CUDA variant it only propagates memory format if all the tensors have
+        #   the same memory format, otherwise it just uses contiguous_format as a default
+
+        out_cuda = torch.empty((0), device=device).contiguous(memory_format=torch.contiguous_format)
+        # a_cuda and b_cuda have different memory_format
+        res2_cuda = torch.cat((a_cuda, b_cuda), out=out_cuda)
+
+        out_cpu = torch.empty((0), device='cpu').contiguous(memory_format=torch.contiguous_format)
+        res2_cpu = torch.cat((a_cpu, b_cpu), out=out_cpu)
+
+        self.assertTrue(res2_cuda.is_contiguous(memory_format=torch.contiguous_format))
+        self.assertTrue(res2_cpu.is_contiguous(memory_format=torch.channels_last))
+
+        out_cuda = torch.empty((0), device=device).contiguous(memory_format=torch.contiguous_format)
+        # a_cuda and c_cuda have same memory_format
+        res3_cuda = torch.cat((a_cuda, c_cuda), out=out_cuda)
+
+        self.assertTrue(res3_cuda.is_contiguous(memory_format=torch.channels_last))
 
     @onlyCUDA
     @deviceCountAtLeast(2)
@@ -713,8 +754,8 @@ def test_cat_different_devices(self, devices):
     def test_cat_stack_cross_devices(self, device):
         cuda = torch.randn((3, 3), device=device)
         cpu = torch.randn((3, 3), device='cpu')
-        out_cpu = cpu.clone()
-        out_cuda = cuda.clone()
+
+        # cat
         with self.assertRaisesRegex(RuntimeError,
                                     "Expected all tensors to be on the same device"):
             torch.cat((cuda, cpu))
@@ -722,18 +763,6 @@ def test_cat_stack_cross_devices(self, device):
                                     "Expected all tensors to be on the same device"):
             torch.cat((cpu, cuda))
 
-        with self.assertRaisesRegex(RuntimeError,
-                                    "Expected all tensors to be on the same device"):
-            torch.cat((cpu, cuda), out=out_cuda)
-
-        with self.assertRaisesRegex(RuntimeError,
-                                    "Expected all tensors to be on the same device"):
-            torch.cat((cpu, cpu), out=out_cuda)
-
-        with self.assertRaisesRegex(RuntimeError,
-                                    "Expected all tensors to be on the same device"):
-            torch.cat((cuda, cuda), out=out_cpu)
-
         # Stack
         with self.assertRaisesRegex(RuntimeError,
                                     "Expected all tensors to be on the same device"):
@@ -742,18 +771,6 @@ def test_cat_stack_cross_devices(self, device):
                                     "Expected all tensors to be on the same device"):
             torch.stack((cpu, cuda))
 
-        with self.assertRaisesRegex(RuntimeError,
-                                    "Expected all tensors to be on the same device"):
-            torch.stack((cpu, cuda), out=out_cuda)
-
-        with self.assertRaisesRegex(RuntimeError,
-                                    "Expected all tensors to be on the same device"):
-            torch.stack((cpu, cpu), out=out_cuda)
-
-        with self.assertRaisesRegex(RuntimeError,
-                                    "Expected all tensors to be on the same device"):
-            torch.stack((cuda, cuda), out=out_cpu)
-
     # TODO: reconcile with other cat tests
     # TODO: Compare with a NumPy reference instead of CPU
     @onlyCUDA
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index bbb8d981ab8a2..7dca8a7bdedbd 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -1856,6 +1856,13 @@ def merge_dicts(*dicts):
              -0.5790,  0.1497]])
 """.format(**common_args))
 
+add_docstr(torch.concat,
+           r"""
+concat(tensors, dim=0, *, out=None) -> Tensor
+
+Alias of :func:`torch.cat`.
+""")
+
 add_docstr(torch.ceil,
            r"""
 ceil(input, *, out=None) -> Tensor
diff --git a/torch/csrc/jit/passes/normalize_ops.cpp b/torch/csrc/jit/passes/normalize_ops.cpp
index cc6444e8a9dfd..5ac36e1f1b76f 100644
--- a/torch/csrc/jit/passes/normalize_ops.cpp
+++ b/torch/csrc/jit/passes/normalize_ops.cpp
@@ -104,6 +104,7 @@ const std::unordered_map<Symbol, Symbol>& getOperatorAliasMap() {
       {aten::multiply_, aten::mul_},
       {aten::true_divide, aten::div},
       {aten::true_divide_, aten::div_},
+      {aten::concat, aten::cat},
       {aten::row_stack, aten::vstack},
       {aten::swapdims, aten::transpose},
       {aten::swapdims_, aten::transpose_},
diff --git a/torch/overrides.py b/torch/overrides.py
index 64b18b89eb401..aca14a6d4552b 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -360,6 +360,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.bucketize: lambda input, boundaries, out_int32=False, right=False, out=None: -1,
         torch.cartesian_prod: lambda *tensors: -1,
         torch.cat: lambda tensors, dim=0, out=None: -1,
+        torch.concat: lambda tensors, dim=0, out=None: -1,  # alias for torch.cat
         torch.cdist: lambda x1, x2, p=2.0, compute_mode='use_mm_for_euclid_dist_if_necessary': -1,
         torch.ceil: lambda input, out=None: -1,
         torch.celu: lambda input, alhpa=1., inplace=False: -1,
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 5e009ee7f487e..ace4fa1c63c20 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -2180,6 +2180,25 @@ def sample_inputs_stack(op_info, device, dtype, requires_grad, **kwargs):
 
     return (SampleInput(tensors, args=(0,)),)
 
+def sample_inputs_cat_concat(op_info, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    cases: Tuple[tuple, tuple, dict] = (  # type: ignore[assignment]
+        ((S, S), (S, S), {'dim': -1}),
+        ((S, S), (S, S), {'dim': 1}),
+        ((M, S), (S, S), {'dim': 0}),  # different shapes
+        ((1, 2, 3), (1, 2, 3), {'dim': -2}),
+        ((0,), (0,), {'dim': 0}),  # empty tensor
+        ((0, S), (S, S), {'dim': 0}),
+        ((1,), (1,), {})  # dim not passed, fallback to default
+    )
+
+    def generator():
+        for input_shape1, input_shape2, kwargs in cases:
+            yield SampleInput([make_arg(input_shape1), make_arg(input_shape2)], kwargs=kwargs)
+
+    return list(generator())
+
 def sample_inputs_hstack_dstack_vstack(op_info, device, dtype, requires_grad, **kwargs):
     tensors = [
         make_tensor((S, S), device, dtype, requires_grad=requires_grad),
@@ -8582,17 +8601,11 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
     OpInfo('stack',
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
            sample_inputs_func=sample_inputs_stack,
-           assert_autodiffed=True,
-           skips=(
-               # stack does not correctly warn when resizing out= inputs
-               SkipInfo('TestCommon', 'test_out'),),),
+           assert_autodiffed=True),
     OpInfo('hstack',
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
            sample_inputs_func=sample_inputs_hstack_dstack_vstack,
-           supports_forward_ad=True,
-           skips=(
-               # hstack does not correctly warn when resizing out= inputs
-               SkipInfo('TestCommon', 'test_out'),),),
+           supports_forward_ad=True),
     OpInfo('hypot',
            dtypes=floating_types(),
            dtypesIfCPU=floating_types_and(torch.bfloat16),
@@ -8609,24 +8622,31 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
                # JIT tests don't work with Tensor keyword arguments
                # https://github.com/pytorch/pytorch/issues/58507
                SkipInfo('TestJit', 'test_variant_consistency_jit'),),),
+    OpInfo('cat',
+           ref=lambda input_seq, dim=0, **kwargs: np.concatenate(input_seq, axis=dim, **kwargs),
+           aliases=('concat',),
+           dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+           sample_inputs_func=sample_inputs_cat_concat,
+           supports_forward_ad=True,
+           assert_autodiffed=True,
+           skips=(
+               # RuntimeError: Arguments for call not valid.
+               #               Expected a value of type 'List[Tensor]' for argument
+               #               'tensors' but instead found type 'Tensor (inferred)'.
+               SkipInfo('TestJit', 'test_jit_alias_remapping'),)),
     OpInfo('vstack',
            aliases=('row_stack',),
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
            sample_inputs_func=sample_inputs_hstack_dstack_vstack,
            supports_forward_ad=True,
            skips=(
-               # vstack does not correctly warn when resizing out= inputs
-               SkipInfo('TestCommon', 'test_out'),
                # RuntimeError: _fn() Expected a value of type
                #   'Tensor (inferred)' for argument 't0' but instead found type 'tuple'.
-               SkipInfo('TestJit', 'test_jit_alias_remapping'))),
+               SkipInfo('TestJit', 'test_jit_alias_remapping'),)),
     OpInfo('dstack',
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
            sample_inputs_func=sample_inputs_hstack_dstack_vstack,
-           supports_forward_ad=True,
-           skips=(
-               # dstack does not correctly warn when resizing out= inputs
-               SkipInfo('TestCommon', 'test_out'),)),
+           supports_forward_ad=True),
     OpInfo('unfold',
            op=lambda x, *args: x.unfold(*args),
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),

From 32fbeb170d57ab6a5af9ca6de23a54a6a910a433 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Tue, 7 Sep 2021 00:04:14 -0700
Subject: [PATCH 518/530] Update error messages that use LAPACK error codes
 (#63864)

Summary:
This PR updates the` batchCheckErrors` and `singleCheckErrors` functions so that the error messages are defined only once.
`batchCheckErrors` function reuses `singleCheckErrors` now.

Fixes https://github.com/pytorch/pytorch/issues/63220, fixes https://github.com/pytorch/pytorch/issues/59779

cc jianyuh nikitaved pearu mruberry heitorschueroff walterddr IvanYashchuk xwang233 Lezcano

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63864

Reviewed By: ngimel

Differential Revision: D30672933

Pulled By: mruberry

fbshipit-source-id: 0ba37ff98ef278efdb12c3890aa07d687047da7a
---
 aten/src/ATen/native/BatchLinearAlgebra.cpp | 10 +--
 aten/src/ATen/native/LinearAlgebraUtils.h   | 84 +++++++++++----------
 test/test_linalg.py                         | 40 ++++++----
 3 files changed, 71 insertions(+), 63 deletions(-)

diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp
index d80f9184567b1..498b51b38187c 100644
--- a/aten/src/ATen/native/BatchLinearAlgebra.cpp
+++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp
@@ -1549,6 +1549,8 @@ Tensor cholesky_inverse(const Tensor &input, bool upper) {
 
 DEFINE_DISPATCH(lu_stub);
 
+// TODO: remove check_errors argument
+// https://github.com/pytorch/pytorch/issues/64014
 std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool compute_pivots, bool check_errors) {
   TORCH_CHECK(self.dim() >= 2,
            "expected tensor with 2 or more dimensions, got size: ", self.sizes(),
@@ -1566,14 +1568,6 @@ std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool comput
   // 'lu' tensor is modified in-place and must be a copy of 'self'
   Tensor lu = cloneBatchedColumnMajor(self);
   lu_stub(self.device().type(), lu, pivots_tensor, infos_tensor, compute_pivots);
-
-  if (check_errors) {
-    if (self.dim() > 2) {
-      batchCheckErrors(infos_tensor, "lu", /*allow_singular=*/true);
-    } else {
-      singleCheckErrors(infos_tensor.item<int64_t>(), "lu", /*allow_singular=*/true);
-    }
-  }
   return std::make_tuple(lu, pivots_tensor, infos_tensor);
 }
 
diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h
index 19e41d7a8e815..abbf82ceb148c 100644
--- a/aten/src/ATen/native/LinearAlgebraUtils.h
+++ b/aten/src/ATen/native/LinearAlgebraUtils.h
@@ -213,62 +213,66 @@ static inline void squareCheckInputs(const Tensor& self) {
               "but they are ", self.size(-1), " by ", self.size(-2), " matrices");
 }
 
+/*
+ * Given a info int, obtained after a single operation, this function check if the computation
+ * has been successful (info = 0) or not, and report in case of the latter.
+ */
+static inline void singleCheckErrors(int64_t info, const char* name, int64_t batch_id=-1) {
+  std::string batch_string{""};
+  if (batch_id >= 0) {
+    batch_string = ": (Batch element " + std::to_string(batch_id) + ")";
+  }
+  if (info < 0) {
+    TORCH_INTERNAL_ASSERT(false, name, batch_string,
+        ": Argument ", -info, " has illegal value. Most certainly there is a bug in the implementation calling the backend library.");
+  } else if (info > 0) {
+    if (strstr(name, "inv")) {
+      // inv, inverse, cholesky_inverse, etc.
+      TORCH_CHECK(false, name, batch_string,
+          ": The diagonal element ", info, " is zero, the inversion could not be completed because the input matrix is singular.");
+    } else if (strstr(name, "solve")) {
+      // solve, linalg_solve, cholesky_solve, etc.
+      TORCH_CHECK(false, name, batch_string,
+          ": The diagonal element ", info, " is zero, the solve could not be completed because the input matrix is singular.");
+    } else if (strstr(name, "cholesky")) {
+      TORCH_CHECK(false, name, batch_string,
+          ": The factorization could not be completed because the input is not positive-definite (the leading minor of order ", info, " is not positive-definite).");
+    } else if (strstr(name, "svd")) {
+      TORCH_CHECK(false, name, batch_string,
+          ": The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated singular values (error code: ", info, ").");
+    } else if (strstr(name, "eig") || strstr(name, "syevd")) {
+      TORCH_CHECK(false, name, batch_string,
+          ": The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: ", info, ").");
+    } else if (strstr(name, "lstsq")) {
+      TORCH_CHECK(false, name, batch_string,
+          ": The least squares solution could not be computed because the input matrix does not have full rank (error code: ", info, ").");
+    } else {
+      TORCH_INTERNAL_ASSERT(false, name, ": Unknown error code: ", info, ".");
+    }
+  }
+}
+
 /*
  * Given a vector of int64_t infos, obtained after a batch operations,
  * this function checks if the computation over all these batches has been
  * successful (info = 0) or not, and report in case of the latter.
  */
-static inline void batchCheckErrors(std::vector<int64_t>& infos, const char* name, bool allow_singular=false) {
+static inline void batchCheckErrors(const std::vector<int64_t>& infos, const char* name) {
   for (size_t i = 0; i < infos.size(); i++) {
     auto info = infos[i];
-    if (info < 0) {
-      AT_ERROR(name, ": For batch ", i, ": Argument ", -info, " has illegal value");
-    } else if (info > 0) {
-      if (strstr(name, "svd")) {
-        AT_ERROR(name, ": the updating process of SBDSDC did not converge (error: ", info, ")");
-      } else if (strstr(name, "symeig") || strstr(name, "syevd")) {
-        AT_ERROR(name, ": For batch ", i, ": the algorithm failed to converge; ", info,
-                 " off-diagonal elements of an intermediate tridiagonal form did not converge to zero.");
-      } else if (!allow_singular) {
-        AT_ERROR(name, ": For batch ", i, ": U(", info, ",", info, ") is zero, singular U.");
-      }
-    }
+    singleCheckErrors(info, name, i);
   }
 }
 
 /*
  * This is an overloaded case of the previous function for a tensor of infos.
  */
-static inline void batchCheckErrors(const Tensor& infos, const char* name, bool allow_singular=false, int info_per_batch=1) {
-  auto batch_size = infos.numel();
+static inline void batchCheckErrors(const Tensor& infos, const char* name) {
   auto infos_cpu = infos.to(at::kCPU);
   auto infos_data = infos_cpu.data_ptr<int>();
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (int64_t i = 0; i < infos.numel(); i++) {
     auto info = infos_data[i];
-    if (info < 0) {
-      AT_ERROR(name, ": For batch ", i/info_per_batch, ": Argument ", -info, " has illegal value");
-    } else if (!allow_singular && info > 0) {
-      AT_ERROR(name, ": For batch ", i/info_per_batch, ": U(", info, ",", info, ") is zero, singular U.");
-    }
-  }
-}
-
-/*
- * Given a info int, obtained after a single operation, this function check if the computation
- * has been successful (info = 0) or not, and report in case of the latter.
- */
-static inline void singleCheckErrors(int64_t info, const char* name, bool allow_singular=false) {
-  if (info < 0) {
-    AT_ERROR(name, ": Argument ", -info, " has illegal value");
-  } else if (info > 0) {
-    if (strstr(name, "svd")) {
-      AT_ERROR(name, ": the updating process of SBDSDC did not converge (error: ", info, ")");
-    } else if (strstr(name, "eig")) { // this catches both "eig" and "symeig"
-      AT_ERROR(name, ": the algorithm failed to converge; ", info,
-               " off-diagonal elements of an intermediate tridiagonal form did not converge to zero.");
-    } else if (!allow_singular) {
-      AT_ERROR(name, ": U(", info, ",", info, ") is zero, singular U.");
-    }
+    singleCheckErrors(info, name, i);
   }
 }
 
diff --git a/test/test_linalg.py b/test/test_linalg.py
index fbd219b3c5981..96da8d559ff31 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -483,10 +483,10 @@ def test_cholesky_errors_and_warnings(self, device, dtype):
                                     r'1-dimensional array given\. Array must be at least two-dimensional'):
             np.linalg.cholesky(A.cpu().numpy())
 
-        # if the input matrix is singular, an error should be raised
+        # if the input matrix is not positive definite, an error should be raised
         A = torch.eye(3, 3, dtype=dtype, device=device)
-        A[-1, -1] = 0  # Now A is singular
-        with self.assertRaisesRegex(RuntimeError, r'U\(3,3\) is zero, singular U\.'):
+        A[-1, -1] = 0  # Now A is not positive definite
+        with self.assertRaisesRegex(RuntimeError, r'minor of order 3 is not positive-definite'):
             torch.linalg.cholesky(A)
         with self.assertRaisesRegex(np.linalg.LinAlgError, r'Matrix is not positive definite'):
             np.linalg.cholesky(A.cpu().numpy())
@@ -495,8 +495,8 @@ def test_cholesky_errors_and_warnings(self, device, dtype):
         A = torch.eye(3, 3, dtype=dtype, device=device)
         A = A.reshape((1, 3, 3))
         A = A.repeat(5, 1, 1)
-        A[4, -1, -1] = 0  # Now A[4] is singular
-        with self.assertRaisesRegex(RuntimeError, r'For batch 4: U\(3,3\) is zero, singular U\.'):
+        A[4, -1, -1] = 0  # Now A[4] is not positive definite
+        with self.assertRaisesRegex(RuntimeError, r'\(Batch element 4\): The factorization could not be completed'):
             torch.linalg.cholesky(A)
 
         # if out tensor with wrong shape is passed a warning is given
@@ -674,7 +674,7 @@ def test_cholesky_ex_non_pd(self, device, dtype):
         A[-1, -1] = 0  # Now A is singular
         _, info = torch.linalg.cholesky_ex(A)
         self.assertEqual(info, 3)
-        with self.assertRaisesRegex(RuntimeError, r'U\(3,3\) is zero, singular U\.'):
+        with self.assertRaisesRegex(RuntimeError, r'minor of order 3 is not positive-definite'):
             torch.linalg.cholesky_ex(A, check_errors=True)
 
         # if at least one matrix in the batch is not positive definite,
@@ -688,7 +688,7 @@ def test_cholesky_ex_non_pd(self, device, dtype):
         expected_info = torch.zeros(A.shape[:-2], dtype=torch.int32, device=device)
         expected_info[3] = 2
         self.assertEqual(info, expected_info)
-        with self.assertRaisesRegex(RuntimeError, r'For batch 3: U\(2,2\) is zero, singular U\.'):
+        with self.assertRaisesRegex(RuntimeError, r'\(Batch element 3\): The factorization could not be completed'):
             torch.linalg.cholesky_ex(A, check_errors=True)
 
     @skipCUDAIfNoMagmaAndNoCusolver
@@ -2892,6 +2892,16 @@ def test_svd_errors_and_warnings(self, device, dtype):
                     # error from out_v
                     svd(a, out=(out_u, out_s, out_v))
 
+            # if input contains NaN then an error is triggered for svd
+            a = torch.full((3, 3), float('nan'), dtype=dtype, device=device)
+            a[0] = float('nan')
+            with self.assertRaisesRegex(RuntimeError, "The algorithm failed to converge"):
+                svd(a)
+            a = torch.randn(3, 33, 33, dtype=dtype, device=device)
+            a[1, 0, 0] = float('nan')
+            with self.assertRaisesRegex(RuntimeError, r"\(Batch element 1\): The algorithm failed to converge"):
+                svd(a)
+
     @skipCUDAIfNoMagmaAndNoCusolver
     @skipCPUIfNoLapack
     @dtypes(*floating_and_complex_types())
@@ -3237,7 +3247,7 @@ def test_inv_ex_singular(self, device, dtype):
         A[-1, -1] = 0  # Now A is singular
         info = torch.linalg.inv_ex(A).info
         self.assertEqual(info, 3)
-        with self.assertRaisesRegex(RuntimeError, r'U\(3,3\) is zero, singular U\.'):
+        with self.assertRaisesRegex(RuntimeError, r'diagonal element 3 is zero, the inversion could not be completed'):
             torch.linalg.inv_ex(A, check_errors=True)
 
         # if at least one matrix in the batch is not positive definite,
@@ -3251,7 +3261,7 @@ def test_inv_ex_singular(self, device, dtype):
         expected_info = torch.zeros(A.shape[:-2], dtype=torch.int32, device=device)
         expected_info[3] = 2
         self.assertEqual(info, expected_info)
-        with self.assertRaisesRegex(RuntimeError, r'For batch 3: U\(2,2\) is zero, singular U\.'):
+        with self.assertRaisesRegex(RuntimeError, r'\(Batch element 3\): The diagonal element 2 is zero'):
             torch.linalg.inv_ex(A, check_errors=True)
 
     @slowTest
@@ -3289,7 +3299,7 @@ def test_inverse_errors(self, device, dtype):
         def run_test_singular_input(batch_dim, n):
             x = torch.eye(3, 3, dtype=dtype, device=device).reshape((1, 3, 3)).repeat(batch_dim, 1, 1)
             x[n, -1, -1] = 0
-            with self.assertRaisesRegex(RuntimeError, rf'For batch {n}: U\(3,3\) is zero'):
+            with self.assertRaisesRegex(RuntimeError, rf'\(Batch element {n}\): The diagonal element 3 is zero'):
                 torch.inverse(x)
 
         for params in [(1, 0), (2, 0), (2, 1), (4, 0), (4, 2), (10, 2)]:
@@ -3306,7 +3316,7 @@ def test_inverse_errors_large(self, device, dtype):
         x = torch.empty((8, 10, 616, 616), dtype=dtype, device=device)
         x[:] = torch.eye(616, dtype=dtype, device=device)
         x[..., 10, 10] = 0
-        with self.assertRaisesRegex(RuntimeError, r'For batch 0: U\(11,11\) is zero'):
+        with self.assertRaisesRegex(RuntimeError, r'\(Batch element 0\): The diagonal element 11 is zero'):
             torch.inverse(x)
 
     @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, torch.float64: 1e-7, torch.complex128: 1e-7})
@@ -3428,7 +3438,7 @@ def test_inv_errors_and_warnings(self, device, dtype):
         def run_test_singular_input(batch_dim, n):
             a = torch.eye(3, 3, dtype=dtype, device=device).reshape((1, 3, 3)).repeat(batch_dim, 1, 1)
             a[n, -1, -1] = 0
-            with self.assertRaisesRegex(RuntimeError, rf"For batch {n}: U\(3,3\) is zero"):
+            with self.assertRaisesRegex(RuntimeError, rf"\(Batch element {n}\): The diagonal element 3 is zero"):
                 torch.linalg.inv(a)
 
         for params in [(1, 0), (2, 0), (2, 1), (4, 0), (4, 2), (10, 2)]:
@@ -3559,7 +3569,7 @@ def run_test_singular_input(batch_dim, n):
             a = torch.eye(3, 3, dtype=dtype, device=device).reshape((1, 3, 3)).repeat(batch_dim, 1, 1)
             a[n, -1, -1] = 0
             b = torch.randn(batch_dim, 3, 1, dtype=dtype, device=device)
-            with self.assertRaisesRegex(RuntimeError, rf'For batch {n}: U\(3,3\) is zero'):
+            with self.assertRaisesRegex(RuntimeError, rf'\(Batch element {n}\): The diagonal element 3 is zero'):
                 torch.linalg.solve(a, b)
 
         for params in [(1, 0), (2, 0), (2, 1), (4, 0), (4, 2), (10, 2)]:
@@ -4912,7 +4922,7 @@ def test_triangular_solve_singular(self, device, dtype):
         b = torch.rand(3, 1, dtype=dtype, device=device)
         A = torch.eye(3, 3, dtype=dtype, device=device)
         A[-1, -1] = 0  # Now A is singular
-        err_str = r"triangular_solve: U\(3,3\) is zero, singular U\."
+        err_str = r"triangular_solve: The diagonal element 3 is zero"
         with self.assertRaisesRegex(RuntimeError, err_str):
             torch.triangular_solve(b, A)
 
@@ -7285,7 +7295,7 @@ def test_cholesky_inverse_errors_and_warnings(self, device, dtype):
         a = torch.randn(3, 3, device=device, dtype=dtype)
         a[1, 1] = 0
         if self.device_type == 'cpu':
-            with self.assertRaisesRegex(RuntimeError, r"cholesky_inverse: U\(2,2\) is zero, singular U\."):
+            with self.assertRaisesRegex(RuntimeError, r"cholesky_inverse: The diagonal element 2 is zero"):
                 torch.cholesky_inverse(a)
         # cholesky_inverse on GPU does not raise an error for this case
         elif self.device_type == 'cuda':

From adbcc819cd40deaa2755383815896d8c9dffb881 Mon Sep 17 00:00:00 2001
From: Kefei Lu <kefeilu@fb.com>
Date: Tue, 7 Sep 2021 04:00:49 -0700
Subject: [PATCH 519/530] Fix fx2trt SplitterBase non_tensor_input logic
 (#64286)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64286

During graph splitting, `_SplitterBase` supports taking into consideration whether the subnet boundary nodes
produces "supported" outputs that will cross the acc/non-acc boundary. Specifically, if the backend only
supports Tensor-based data passing cross boundary, then we cannot split the graph at a place where the node
output is a non-Tensor type (e.g., `Tuple[Tensor]`).

There's currently a bug in this logic that it does not correctly detect the output type of a Node. Instead of
using `Node.meta['tensor_meta']`, we should instead check `Node.meta['type']`.

`Node.meta['tensor_meta']` is not appropriate because this key will exist if the node output is an iterable
and one of the element is of type `Tensor`. So `Tuple[Tensor]` will be wrongly considered "supported".

Test Plan:
arc lint
run CI tests

Reviewed By: yinghai, 842974287

Differential Revision: D30617147

fbshipit-source-id: e8ba70dfaddc05cafb8037d58fca73b7ccbb1a49
---
 torch/fx/passes/splitter_base.py | 15 ++++++++++-----
 torch/fx/passes/tools_common.py  | 11 +++++++++++
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/torch/fx/passes/splitter_base.py b/torch/fx/passes/splitter_base.py
index 42087bde9ef89..65419055dad82 100644
--- a/torch/fx/passes/splitter_base.py
+++ b/torch/fx/passes/splitter_base.py
@@ -2,6 +2,7 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import List, Dict, Optional, Tuple
+import logging
 
 import torch
 from torch.fx.experimental.graph_manipulation import get_size_of_node
@@ -20,8 +21,12 @@
     Tensors,
     NodeList,
     NodeSet,
+    is_node_output_tensor,
 )
 
+_LOGGER = logging.getLogger(__name__)
+
+
 class _SplitterSettingBase:
     def __init__(self):
         parser = argparse.ArgumentParser()
@@ -98,7 +103,7 @@ def reduce_acc_nodes_non_tensor_input_helper(
             for user in node.users:
                 if user in self.acc_nodes:
                     self.acc_nodes.remove(user)
-                    if "tensor_meta" not in user.meta:
+                    if not is_node_output_tensor(user):
                         cpu_worklist.append(user)
 
     def reduce_acc_nodes_non_tensor_input(self):
@@ -113,7 +118,7 @@ def reduce_acc_nodes_non_tensor_input(self):
                 continue
             if node in self.acc_nodes:
                 continue
-            if "tensor_meta" in node.meta:
+            if is_node_output_tensor(node):
                 continue
             non_tensor_cpu_nodes.append(node)
 
@@ -128,7 +133,7 @@ def reduce_acc_nodes_non_tensor_output(self):
             new_cpu_nodes: NodeList = []
 
             for acc_node in self.acc_nodes:
-                if "tensor_meta" in acc_node.meta:
+                if is_node_output_tensor(acc_node):
                     continue
                 for user in acc_node.users:
                     if user not in self.acc_nodes:
@@ -461,7 +466,7 @@ def get_inputs(self, inputs):
                 reports += "Checking inputs...\n"
                 for n in submod.graph.nodes:
                     if n.op == "placeholder":
-                        if "tensor_meta" not in n.meta:
+                        if not is_node_output_tensor(n):
                             reports += f"Input {n.name} is not a tensor, this might cause problems during lowering!\n"
                         else:
                             total_input_bytes += get_size_of_node(submod, n)[0]
@@ -473,7 +478,7 @@ def get_inputs(self, inputs):
                 def get_bytes(node: torch.fx.Node):
                     nonlocal total_output_bytes
                     nonlocal reports
-                    if "tensor_meta" not in node.meta:
+                    if not is_node_output_tensor(node):
                         reports += f"Output {node.name} is not a tensor, this might cause problems during lowering!\n"
                     else:
                         total_output_bytes += get_size_of_node(submod, node)[0]
diff --git a/torch/fx/passes/tools_common.py b/torch/fx/passes/tools_common.py
index a996dc8b36521..8274f4bf3b625 100644
--- a/torch/fx/passes/tools_common.py
+++ b/torch/fx/passes/tools_common.py
@@ -48,6 +48,17 @@ def get_node_target(submodules: Dict[str, torch.nn.Module], node: torch.fx.Node)
         return node.target
 
 
+def is_node_output_tensor(node: torch.fx.Node) -> bool:
+    """Checks if the node output produces a Tensor or not.
+
+    NOTE: This requires to run `ShapeProp` on the containing fx graph before
+    calling this function. This is because it works by checking the `type`
+    metadata on the node. This metadata is produced by the `ShapeProp`.
+    """
+    type_ = node.meta.get("type", None)
+    return type_ is not None and issubclass(type_, torch.Tensor)
+
+
 class FxNetAccFusionsFinder:
     """
     Finds groups of connected ACC nodes that pass non-tensor data between each other.

From 75b9e4a128496e9b2563078f62a2903f65a6d145 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@fb.com>
Date: Tue, 7 Sep 2021 08:04:50 -0700
Subject: [PATCH 520/530] [JIT] Freeze unrolls constant loops (#63614)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63614

There are a number of optimizations (`RemoveListMutation` in particular) that are tied to loop unrolling in `runOptimizations`. However, these were not invoked from `freeze_module` since the freezing pass should be idempotent.

This diff makes `runOptimizations` run `UnrollConstantLoops` instead of `UnrollLoops`. `freeze_module` is then able to run these optimizations.

Test Plan: Observed that `freeze_module` applies `RemoveListMutation`

Reviewed By: eellison

Differential Revision: D30437356

fbshipit-source-id: cba04bd958a48ad51b151aa3264f3d5bbb1fc2a4
---
 test/onnx/test_pytorch_onnx_onnxruntime.py   |  2 +-
 test/onnx/test_utility_funs.py               | 11 ++++++-----
 torch/csrc/jit/passes/freeze_module.cpp      |  5 +++--
 torch/csrc/jit/runtime/graph_executor.cpp    | 14 +++++++++++---
 torch/csrc/jit/runtime/graph_executor_impl.h |  2 +-
 5 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index ffeef00cc9ac6..54a116b57cb1d 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -1506,7 +1506,7 @@ def list_append(boxes: List[torch.Tensor]):
 
         class Min(torch.nn.Module):
             def forward(self, x):
-                boxes = [x, x, x]
+                boxes = [x for _ in range(3)]
                 return list_append(boxes)
 
         x = torch.rand(5, 5)
diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py
index 02da90dd3066e..b87fa06d648a4 100644
--- a/test/onnx/test_utility_funs.py
+++ b/test/onnx/test_utility_funs.py
@@ -104,19 +104,20 @@ def forward(self, x, y, t):
     def test_output_list(self):
         class PaddingLayer(torch.jit.ScriptModule):
             @torch.jit.script_method
-            def forward(self, input_t):
-                # type: (Tensor) -> Tensor
-                for i in range(2):
+            def forward(self, input_t, n):
+                # type: (Tensor, int) -> Tensor
+                for i in range(n):
                     input_t = input_t * 2
                 return input_t
 
         input_t = torch.ones(size=[10], dtype=torch.long)
+        n = 2
         model = torch.jit.script(PaddingLayer())
-        example_output = model(input_t)
+        example_output = model(input_t, n)
 
         with self.assertRaises(RuntimeError):
             torch.onnx.export(model,
-                              (input_t, ),
+                              (input_t, n),
                               "test.onnx",
                               opset_version=self.opset_version,
                               example_outputs=[example_output])
diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp
index df1c64bcc4740..0debc97ac8241 100644
--- a/torch/csrc/jit/passes/freeze_module.cpp
+++ b/torch/csrc/jit/passes/freeze_module.cpp
@@ -90,8 +90,9 @@ class AttributePropagator {
     };
     auto applyOptimizations = [](std::shared_ptr<Graph>& subgraph) {
       runOptimization(
-          subgraph, /* unroll? */ false, /* const_prop_user_classes? */ false);
-      RemoveListMutation(subgraph);
+          subgraph,
+          /* unroll_non_constant_loops? */ false,
+          /* const_prop_user_classes? */ false);
       LowerSimpleTuples(subgraph);
     };
 
diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp
index 0187988680e80..39742c7815d3b 100644
--- a/torch/csrc/jit/runtime/graph_executor.cpp
+++ b/torch/csrc/jit/runtime/graph_executor.cpp
@@ -908,7 +908,7 @@ void runNondiffOptimization(
 
 void runOptimization(
     std::shared_ptr<Graph>& graph,
-    bool unroll,
+    bool unroll_non_constant_loops,
     bool const_prop_user_classes) {
   // Basic graph preprocessing to eliminate noise.
   GRAPH_DEBUG(
@@ -935,9 +935,17 @@ void runOptimization(
 
   // Unroll small loops, and eliminate expressions that are the same at every
   // iteration.
-  if (unroll) {
-    UnrollLoops(graph);
+  bool unroll_success = false;
+  if (unroll_non_constant_loops) {
+    unroll_success = UnrollLoops(graph);
     GRAPH_DEBUG("After UnrollLoops, before RemoveListMutation\n", *graph);
+  } else {
+    unroll_success = UnrollConstantLoops(graph);
+    GRAPH_DEBUG(
+        "After UnrollConstantLoops, before RemoveListMutation\n", *graph);
+  }
+
+  if (unroll_success) {
     // run again with unrolled loops
     RemoveListMutation(graph);
     GRAPH_DEBUG("After RemoveListMutation, before PeepholeOptimize\n", *graph);
diff --git a/torch/csrc/jit/runtime/graph_executor_impl.h b/torch/csrc/jit/runtime/graph_executor_impl.h
index 516ad1f55c812..3815d26c87f4d 100644
--- a/torch/csrc/jit/runtime/graph_executor_impl.h
+++ b/torch/csrc/jit/runtime/graph_executor_impl.h
@@ -33,7 +33,7 @@ void packGradient(const Gradient& gradient, Node* dnode);
 bool needsGradient(const std::shared_ptr<const Graph>& graph);
 void runOptimization(
     std::shared_ptr<Graph>& graph,
-    bool unroll = true,
+    bool unroll_non_constant_loops = true,
     bool const_prop_user_classes = true);
 void runNondiffOptimization(
     std::shared_ptr<Graph>& graph,

From f767cf668395baf29ca7c9f1fa80f0abed8c53c7 Mon Sep 17 00:00:00 2001
From: Ilqar Ramazanli <iramazanli@fb.com>
Date: Tue, 7 Sep 2021 08:41:09 -0700
Subject: [PATCH 521/530] To change WarmUp Scheduler with ConstantLR and
 LinearLR (#64395)

Summary:
Partially unblocks https://github.com/pytorch/vision/issues/4281

Previously we have added WarmUp Schedulers to PyTorch Core in the PR : https://github.com/pytorch/pytorch/pull/60836 which had two mode of execution - linear and constant depending on warming up function.

In this PR we are changing this interface to more direct form, as separating linear and constant modes to separate Schedulers. In particular

```Python
scheduler1 = WarmUpLR(optimizer, warmup_factor=0.1, warmup_iters=5, warmup_method="constant")
scheduler2 = WarmUpLR(optimizer, warmup_factor=0.1, warmup_iters=5, warmup_method="linear")
```

will look like

```Python
scheduler1 = ConstantLR(optimizer, warmup_factor=0.1, warmup_iters=5)
scheduler2 = LinearLR(optimizer, warmup_factor=0.1, warmup_iters=5)
```

correspondingly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64395

Reviewed By: datumbox

Differential Revision: D30753688

Pulled By: iramazanli

fbshipit-source-id: e47f86d12033f80982ddf1faf5b46873adb4f324
---
 docs/source/optim.rst        |   3 +-
 test/test_optim.py           | 117 ++++++++++++++++---------------
 torch/optim/lr_scheduler.py  | 129 ++++++++++++++++++++++++-----------
 torch/optim/lr_scheduler.pyi |   7 +-
 4 files changed, 156 insertions(+), 100 deletions(-)

diff --git a/docs/source/optim.rst b/docs/source/optim.rst
index 2ded57ff87a1b..695f0a2a03f6d 100644
--- a/docs/source/optim.rst
+++ b/docs/source/optim.rst
@@ -210,7 +210,8 @@ algorithms.
     lr_scheduler.MultiplicativeLR
     lr_scheduler.StepLR
     lr_scheduler.MultiStepLR
-    lr_scheduler.WarmUpLR
+    lr_scheduler.ConstantLR
+    lr_scheduler.LinearLR
     lr_scheduler.ExponentialLR
     lr_scheduler.CosineAnnealingLR
     lr_scheduler.ReduceLROnPlateau
diff --git a/test/test_optim.py b/test/test_optim.py
index fe282ef33b4de..d69e9351d33a0 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -12,7 +12,7 @@
 from torch.autograd import Variable
 from torch import sparse
 from torch.optim.lr_scheduler import LambdaLR, MultiplicativeLR, StepLR, \
-    MultiStepLR, WarmUpLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, \
+    MultiStepLR, ConstantLR, LinearLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, \
     _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR, ChainedScheduler
 from torch.optim.swa_utils import AveragedModel, SWALR, update_bn
 from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests, \
@@ -274,16 +274,16 @@ def test_sgd(self):
             )
             self._test_basic_cases(
                 lambda weight, bias: optimizer([weight, bias], lr=1e-3),
-                [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="linear")]
+                [lambda opt: LinearLR(opt, start_factor=0.4, end_factor=0.8, total_iters=4)]
             )
             self._test_basic_cases(
                 lambda weight, bias: optimizer([weight, bias], lr=1e-3),
-                [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant")]
+                [lambda opt: ConstantLR(opt, factor=0.4, total_iters=4)]
             )
             self._test_basic_cases(
                 lambda weight, bias: optimizer([weight, bias], lr=1e-3),
                 [lambda opt: StepLR(opt, gamma=0.9, step_size=10),
-                 lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4)]
+                 lambda opt: LinearLR(opt, start_factor=0.4, end_factor=0.6, total_iters=4)]
             )
             self._test_basic_cases(
                 lambda weight, bias: optimizer([weight, bias], lr=1e-3),
@@ -430,17 +430,17 @@ def test_adam(self):
                 lambda weight, bias: optimizer(
                     self._build_params_dict(weight, bias, lr=1e-2),
                     lr=1e-3),
-                [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="linear")]
+                [lambda opt: LinearLR(opt, start_factor=0.4, total_iters=4)]
             )
             self._test_basic_cases(
                 lambda weight, bias: optimizer(
                     self._build_params_dict(weight, bias, lr=1e-2),
                     lr=1e-3),
-                [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant")]
+                [lambda opt: ConstantLR(opt, factor=0.4, total_iters=4)]
             )
             self._test_basic_cases(
                 lambda weight, bias: optimizer([weight, bias], lr=1e-3, amsgrad=True),
-                [lambda opt: WarmUpLR(opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant"),
+                [lambda opt: ConstantLR(opt, factor=0.4, total_iters=4),
                  lambda opt: ExponentialLR(opt, gamma=0.9)]
             )
             self._test_basic_cases(
@@ -992,12 +992,12 @@ def test_exponential_lr_is_constant_for_constant_epoch(self):
         scheduler = ExponentialLR(self.opt, gamma=0.9)
         self._test_lr_is_constant_for_constant_epoch(scheduler)
 
-    def test_constant_warmup_lr_is_constant_for_constant_epoch(self):
-        scheduler = WarmUpLR(self.opt, warmup_method="constant")
+    def test_constantlr_is_constant_for_constant_epoch(self):
+        scheduler = ConstantLR(self.opt)
         self._test_lr_is_constant_for_constant_epoch(scheduler)
 
-    def test_linear_warmup_lr_is_constant_for_constant_epoch(self):
-        scheduler = WarmUpLR(self.opt, warmup_method="linear")
+    def test_linear_linearlr_is_constant_for_constant_epoch(self):
+        scheduler = LinearLR(self.opt)
         self._test_lr_is_constant_for_constant_epoch(scheduler)
 
     def test_step_lr(self):
@@ -1051,76 +1051,78 @@ def test_multi_step_lr_with_epoch(self):
         scheduler = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9])
         self._test_with_epoch(scheduler, targets, epochs)
 
-    def test__get_last_lr_constant_warmup_lr(self):
+    def test_get_last_lr_constantlr(self):
         # lr = 0.025     if epoch < 5
         # lr = 0.005    if 5 <= epoch
         epochs = 10
         single_targets = [0.025] * 5 + [0.05] * 5
         targets = [single_targets, [x * epochs for x in single_targets]]
-        scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 2, warmup_iters=5, warmup_method="constant")
+        scheduler = ConstantLR(self.opt, factor=1.0 / 2, total_iters=5)
         self._test_get_last_lr(scheduler, targets, epochs)
 
-    def test__get_last_lr_linear_warmup_lr(self):
+    def test_get_last_lr_linearlr(self):
         # lr = 0.025     if epoch == 0
         # lr = 0.03125   if epoch == 1
         # lr = 0.0375    if epoch == 2
         # lr = 0.04375   if epoch == 3
         # lr = 0.005     if 4 <= epoch
         epochs = 10
-        factor = 1.0 / 2
+        start_factor = 1.0 / 4
+        end_factor = 3. / 5
         iters = 4
-        interpolation = [factor + i * (1 - factor) / iters for i in range(iters)]
-        single_targets = [x * 0.05 for x in interpolation] + [0.05] * (epochs - iters)
+        interpolation = [start_factor + i * (end_factor - start_factor) / iters for i in range(iters)]
+        single_targets = [x * 0.05 for x in interpolation] + [0.05 * end_factor] * (epochs - iters)
         targets = [single_targets, [x * epochs for x in single_targets]]
-        scheduler = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        scheduler = LinearLR(self.opt, start_factor=start_factor, end_factor=end_factor, total_iters=iters)
         self._test_get_last_lr(scheduler, targets, epochs)
 
-    def test__constant_warmup_lr(self):
+    def test_constantlr(self):
         # lr = 0.025     if epoch < 5
         # lr = 0.005    if 5 <= epoch
         epochs = 10
         single_targets = [0.025] * 5 + [0.05] * 5
         targets = [single_targets, [x * epochs for x in single_targets]]
-        scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 2, warmup_iters=5, warmup_method="constant")
+        scheduler = ConstantLR(self.opt, factor=1.0 / 2, total_iters=5)
         self._test(scheduler, targets, epochs)
 
-    def test__linear_warmup_lr(self):
+    def test_linearlr(self):
         # lr = 0.025     if epoch == 0
         # lr = 0.03125   if epoch == 1
         # lr = 0.0375    if epoch == 2
         # lr = 0.04375   if epoch == 3
         # lr = 0.005     if 4 <= epoch
         epochs = 10
-        factor = 1.0 / 2
+        start_factor = 1.0 / 2
         iters = 4
-        interpolation = [factor + i * (1 - factor) / iters for i in range(iters)]
+        interpolation = [start_factor + i * (1 - start_factor) / iters for i in range(iters)]
         single_targets = [x * 0.05 for x in interpolation] + [0.05] * (epochs - iters)
         targets = [single_targets, [x * epochs for x in single_targets]]
-        scheduler = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        scheduler = LinearLR(self.opt, start_factor=start_factor, total_iters=iters)
         self._test(scheduler, targets, epochs)
 
-    def test_constant_warmup_with_epoch(self):
+    def test_constantlr_with_epoch(self):
         # lr = 0.025     if epoch < 5
         # lr = 0.005    if 5 <= epoch
         epochs = 10
         single_targets = [0.025] * 5 + [0.05] * 5
         targets = [single_targets, [x * epochs for x in single_targets]]
-        scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 2, warmup_iters=5, warmup_method="constant")
+        scheduler = ConstantLR(self.opt, factor=1.0 / 2, total_iters=5)
         self._test_with_epoch(scheduler, targets, epochs)
 
-    def test_linear_warmup_with_epoch(self):
+    def test_linearlr_with_epoch(self):
         # lr = 0.025     if epoch == 0
         # lr = 0.03125   if epoch == 1
         # lr = 0.0375    if epoch == 2
         # lr = 0.04375   if epoch == 3
         # lr = 0.005     if 4 <= epoch
         epochs = 10
-        factor = 1.0 / 2
+        start_factor = 1.0 / 2
+        end_factor = 1.
         iters = 4
-        interpolation = [factor + i * (1 - factor) / iters for i in range(iters)]
+        interpolation = [start_factor + i * (end_factor - start_factor) / iters for i in range(iters)]
         single_targets = [x * 0.05 for x in interpolation] + [0.05] * (epochs - iters)
         targets = [single_targets, [x * epochs for x in single_targets]]
-        scheduler = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        scheduler = LinearLR(self.opt, start_factor=start_factor, total_iters=iters)
         self._test_with_epoch(scheduler, targets, epochs)
 
     def test_exp_lr(self):
@@ -1145,14 +1147,14 @@ def test_closed_form_step_lr(self):
         closed_form_scheduler = StepLR(self.opt, gamma=0.1, step_size=3)
         self._test_against_closed_form(scheduler, closed_form_scheduler, 20)
 
-    def test_closed_form_linear_warmup_lr(self):
-        scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 3, warmup_iters=4, warmup_method="linear")
-        closed_form_scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 3, warmup_iters=4, warmup_method="linear")
+    def test_closed_form_linearlr(self):
+        scheduler = LinearLR(self.opt, start_factor=1.0 / 3, end_factor=0.7, total_iters=4)
+        closed_form_scheduler = LinearLR(self.opt, start_factor=1.0 / 3, end_factor=0.7, total_iters=4)
         self._test_against_closed_form(scheduler, closed_form_scheduler, 20)
 
-    def test_closed_form_constant_warmup_lr(self):
-        scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 3, warmup_iters=4, warmup_method="constant")
-        closed_form_scheduler = WarmUpLR(self.opt, warmup_factor=1.0 / 3, warmup_iters=4, warmup_method="constant")
+    def test_closed_form_constantlr(self):
+        scheduler = ConstantLR(self.opt, factor=1.0 / 3, total_iters=4)
+        closed_form_scheduler = ConstantLR(self.opt, factor=1.0 / 3, total_iters=4)
         self._test_against_closed_form(scheduler, closed_form_scheduler, 20)
 
     def test_closed_form_multi_step_lr(self):
@@ -1265,7 +1267,7 @@ def test_chained_lr2(self):
         epochs = 10
         schedulers = [None] * 1
         targets = [[0.02, 0.03, 0.04] + [0.05] * 9]
-        schedulers[0] = WarmUpLR(self.opt, warmup_factor=0.4, warmup_iters=3, warmup_method="linear")
+        schedulers[0] = LinearLR(self.opt, start_factor=0.4, total_iters=3)
         scheduler = ChainedScheduler(schedulers)
         self._test([scheduler], targets, epochs)
 
@@ -1273,7 +1275,7 @@ def test_chained_lr3(self):
         epochs = 10
         schedulers = [None] * 2
         targets = [[0.02, 0.03, 0.04, 0.05] + [0.005] * 4 + [0.0005] * 3 + [0.00005] * 3]
-        schedulers[0] = WarmUpLR(self.opt, warmup_factor=0.4, warmup_iters=3, warmup_method="linear")
+        schedulers[0] = LinearLR(self.opt, start_factor=0.4, total_iters=3)
         schedulers[1] = MultiStepLR(self.opt, milestones=[4, 8, 10], gamma=0.1)
         scheduler = ChainedScheduler(schedulers)
         self._test([scheduler], targets, epochs)
@@ -1286,7 +1288,7 @@ def test_chained_lr4(self):
                    + [0.05 * 0.9 ** x * 0.1 for x in range(4, 6)]
                    + [0.05 * 0.9 ** x * 0.01 for x in range(6, 9)]]
         schedulers[0] = ExponentialLR(self.opt, gamma=0.9)
-        schedulers[1] = WarmUpLR(self.opt, warmup_factor=0.2, warmup_iters=4, warmup_method="constant")
+        schedulers[1] = ConstantLR(self.opt, factor=0.2, total_iters=4)
         schedulers[2] = StepLR(self.opt, gamma=0.1, step_size=3)
         scheduler = ChainedScheduler(schedulers)
         self._test([scheduler], targets, epochs)
@@ -1323,20 +1325,23 @@ def test_compound_exp_and_multistep_lr(self):
         schedulers[1] = ExponentialLR(self.opt, gamma=0.9)
         self._test(schedulers, targets, epochs)
 
-    def test_compound_exp_and_linear_warmup_lr(self):
+    def test_compound_exp_and_linearlr(self):
         epochs = 10
         iters = 4
-        factor = 0.4
+        start_factor = 0.4
+        end_factor = 0.9
         schedulers = [None] * 2
         single_targets = [0.05 * (0.9 ** x) for x in range(11)]
         for i in range(iters):
-            single_targets[i] *= factor + i / iters * (1 - factor)
+            single_targets[i] *= start_factor + i / iters * (end_factor - start_factor)
+        for i in range(iters, 11):
+            single_targets[i] *= end_factor
         targets = [single_targets, [x * epochs for x in single_targets]]
-        schedulers[0] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        schedulers[0] = LinearLR(self.opt, start_factor=start_factor, end_factor=end_factor, total_iters=iters)
         schedulers[1] = ExponentialLR(self.opt, gamma=0.9)
         self._test(schedulers, targets, epochs)
 
-    def test_compound_step_and_constant_warmup(self):
+    def test_compound_step_and_constantlr(self):
         epochs = 10
         iters = 4
         factor = 0.4
@@ -1344,20 +1349,20 @@ def test_compound_step_and_constant_warmup(self):
         single_targets = [0.05 * 0.4] * 3 + [0.005 * 0.4] + [0.005] * 2 + [0.0005] * 3 + [0.00005] * 3
         targets = [single_targets, [x * epochs for x in single_targets]]
         schedulers[0] = StepLR(self.opt, gamma=0.1, step_size=3)
-        schedulers[1] = WarmUpLR(self.opt, warmup_factor=0.4, warmup_iters=4, warmup_method="constant")
+        schedulers[1] = ConstantLR(self.opt, factor=0.4, total_iters=4)
         self._test(schedulers, targets, epochs)
 
-    def test_compound_linear_warmup_and_multistep_lr(self):
+    def test_compound_linearlr_and_multistep_lr(self):
         epochs = 10
         iters = 4
-        factor = 0.4
+        start_factor = 0.4
         schedulers = [None] * 2
         single_targets = [0.05] * 2 + [0.005] * 3 + [0.0005] * 4 + [0.00005] * 2
         for i in range(iters):
-            single_targets[i] *= factor + i / iters * (1 - factor)
+            single_targets[i] *= start_factor + i / iters * (1 - start_factor)
         targets = [single_targets, [x * epochs for x in single_targets]]
         schedulers[0] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9])
-        schedulers[1] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        schedulers[1] = LinearLR(self.opt, start_factor=start_factor, total_iters=iters)
         self._test(schedulers, targets, epochs)
 
     def test_compound_cosanneal_and_step_lr(self):
@@ -1387,19 +1392,19 @@ def test_compound_cosanneal_and_multistep_lr(self):
         schedulers[1] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9])
         self._test(schedulers, targets, epochs)
 
-    def test_compound_cosanneal_and_linear_warmup_lr(self):
+    def test_compound_cosanneal_and_linearlr(self):
         epochs = 10
         iters = 4
-        factor = 0.4
+        start_factor = 0.4
         eta_min = 1e-10
         schedulers = [None] * 2
         single_targets = [eta_min + (0.05 - eta_min) *
                           (1 + math.cos(math.pi * x / epochs)) / 2
                           for x in range(epochs)]
         for i in range(iters):
-            single_targets[i] *= factor + i / iters * (1 - factor)
+            single_targets[i] *= start_factor + i / iters * (1 - start_factor)
         targets = [single_targets, [x * epochs for x in single_targets]]
-        schedulers[0] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        schedulers[0] = LinearLR(self.opt, start_factor=start_factor, total_iters=iters)
         schedulers[1] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min)
         self._test(schedulers, targets, epochs)
 
@@ -1485,14 +1490,14 @@ def test_compound_reduce_lr_on_plateau4(self):
 
     def test_compound_reduce_lr_on_plateau5(self):
         iters = 4
-        factor = 0.4
+        start_factor = 0.4
         epochs = 22
         for param_group in self.opt.param_groups:
             param_group['lr'] = 0.5
         single_targets = [0.5] * 6 + [0.05] * 7 + [0.005] * 7 + [0.0005] * 2
         multipliers = [1] * 22
         for i in range(iters):
-            multipliers[i] *= factor + i / iters * (1 - factor)
+            multipliers[i] *= start_factor + i / iters * (1 - start_factor)
         single_targets = [x * y for x, y in zip(single_targets, multipliers)]
         targets = [single_targets]
         targets = targets[1:]  # test runs step before checking lr
@@ -1500,7 +1505,7 @@ def test_compound_reduce_lr_on_plateau5(self):
         schedulers = [None] * 2
         schedulers[0] = ReduceLROnPlateau(self.opt, patience=5, cooldown=0, threshold_mode='abs',
                                           mode='min', threshold=0.1)
-        schedulers[1] = WarmUpLR(self.opt, warmup_factor=factor, warmup_iters=iters, warmup_method="linear")
+        schedulers[1] = LinearLR(self.opt, start_factor=start_factor, total_iters=iters)
         self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs)
 
     def test_cycle_lr_invalid_mode(self):
diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py
index 761a4041668d6..42f7b511c54a5 100644
--- a/torch/optim/lr_scheduler.py
+++ b/torch/optim/lr_scheduler.py
@@ -427,25 +427,78 @@ def _get_closed_form_lr(self):
                 for base_lr in self.base_lrs]
 
 
-class WarmUpLR(_LRScheduler):
-    """Decays the learning rate of each parameter group by either a small constant
-    or linearly increasing small warmup factor until the number of epoch reaches a
-    pre-defined milestone: warmup_iters. Notice that such decay can happen
-    simultaneously with other changes to the learning rate from outside this scheduler.
+class ConstantLR(_LRScheduler):
+    """Decays the learning rate of each parameter group by a small constant factor until the
+    number of epoch reaches a pre-defined milestone: total_iters. Notice that such decay can
+    happen simultaneously with other changes to the learning rate from outside this scheduler.
     When last_epoch=-1, sets initial lr as lr.
 
     Args:
         optimizer (Optimizer): Wrapped optimizer.
-        warmup_factor (float): The number we multiply learning rate in the first epoch.
-            If the warming up method is constant, the multiplication factor of the
-            learning rate stays the same in all epochs, but, in the linear case, it
-            starts increasing in the following epochs. Default: 1./3.
-        warmup_iters (int): The number of warming up steps. Default: 5.
-        warmup_method (str): One of `constant` and `linear`. In `constant` mode, the
-            learning rate will be multiplied with a small constant until a milestone
-            defined in warmup_iters. In the `linear` case, the multiplication factor
-            starts with warmup_factor in the first epoch then linearly increases to
-            reach 1. in the epoch number warmup_iters. Default: `linear`.
+        factor (float): The number we multiply learning rate until the milestone. Default: 1./3.
+        total_iters (int): The number of steps that the scheduler decays the learning rate.
+            Default: 5.
+        last_epoch (int): The index of the last epoch. Default: -1.
+        verbose (bool): If ``True``, prints a message to stdout for
+            each update. Default: ``False``.
+
+    Example:
+        >>> # Assuming optimizer uses lr = 0.05 for all groups
+        >>> # lr = 0.025   if epoch == 0
+        >>> # lr = 0.025   if epoch == 1
+        >>> # lr = 0.025   if epoch == 2
+        >>> # lr = 0.025   if epoch == 3
+        >>> # lr = 0.05    if epoch >= 4
+        >>> scheduler = ConstantLR(self.opt, factor=0.5, total_iters=4)
+        >>> for epoch in range(100):
+        >>>     train(...)
+        >>>     validate(...)
+        >>>     scheduler.step()
+    """
+
+    def __init__(self, optimizer, factor=1.0 / 3, total_iters=5, last_epoch=-1, verbose=False):
+        if factor > 1.0 or factor < 0:
+            raise ValueError('Constant multiplicative factor expected to be between 0 and 1.')
+
+        self.factor = factor
+        self.total_iters = total_iters
+        super(ConstantLR, self).__init__(optimizer, last_epoch, verbose)
+
+    def get_lr(self):
+        if not self._get_lr_called_within_step:
+            warnings.warn("To get the last learning rate computed by the scheduler, "
+                          "please use `get_last_lr()`.", UserWarning)
+
+        if self.last_epoch == 0:
+            return [group['lr'] * self.factor for group in self.optimizer.param_groups]
+
+        if (self.last_epoch > self.total_iters or
+                (self.last_epoch != self.total_iters)):
+            return [group['lr'] for group in self.optimizer.param_groups]
+
+        if (self.last_epoch == self.total_iters):
+            return [group['lr'] * (1.0 / self.factor) for group in self.optimizer.param_groups]
+
+    def _get_closed_form_lr(self):
+        return [base_lr * (self.factor + (self.last_epoch >= self.total_iters) * (1 - self.factor))
+                for base_lr in self.base_lrs]
+
+
+class LinearLR(_LRScheduler):
+    """Decays the learning rate of each parameter group by linearly changing small
+    multiplicative factor until the number of epoch reaches a pre-defined milestone: total_iters.
+    Notice that such decay can happen simultaneously with other changes to the learning rate
+    from outside this scheduler. When last_epoch=-1, sets initial lr as lr.
+
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        start_factor (float): The number we multiply learning rate in the first epoch.
+            The multiplication factor changes towards end_factor in the following epochs.
+            Default: 1./3.
+        end_factor (float): The number we multiply learning rate at the end of linear changing
+            process. Default: 1.0.
+        total_iters (int): The number of iterations that multiplicative factor reaches to 1.
+            Default: 5.
         last_epoch (int): The index of the last epoch. Default: -1.
         verbose (bool): If ``True``, prints a message to stdout for
             each update. Default: ``False``.
@@ -457,24 +510,25 @@ class WarmUpLR(_LRScheduler):
         >>> # lr = 0.0375   if epoch == 2
         >>> # lr = 0.04375  if epoch == 3
         >>> # lr = 0.005    if epoch >= 4
-        >>> scheduler = WarmUpLR(self.opt, warmup_factor=0.5, warmup_iters=4, warmup_method="linear")
+        >>> scheduler = LinearLR(self.opt, start_factor=0.5, total_iters=4)
         >>> for epoch in range(100):
         >>>     train(...)
         >>>     validate(...)
         >>>     scheduler.step()
     """
 
-    def __init__(self, optimizer, warmup_factor=1.0 / 3, warmup_iters=5, warmup_method="linear",
-                 last_epoch=-1, verbose=False):
-        if warmup_method not in ("constant", "linear"):
-            raise ValueError(
-                "Only 'constant' or 'linear' warmup_method accepted, but "
-                "got {}".format(warmup_method)
-            )
-        self.warmup_factor = warmup_factor
-        self.warmup_iters = warmup_iters
-        self.warmup_method = warmup_method
-        super(WarmUpLR, self).__init__(optimizer, last_epoch, verbose)
+    def __init__(self, optimizer, start_factor=1.0 / 3, end_factor=1.0, total_iters=5, last_epoch=-1,
+                 verbose=False):
+        if start_factor > 1.0 or start_factor < 0:
+            raise ValueError('Starting multiplicative factor expected to be between 0 and 1.')
+
+        if end_factor > 1.0 or end_factor < 0:
+            raise ValueError('Ending multiplicative factor expected to be between 0 and 1.')
+
+        self.start_factor = start_factor
+        self.end_factor = end_factor
+        self.total_iters = total_iters
+        super(LinearLR, self).__init__(optimizer, last_epoch, verbose)
 
     def get_lr(self):
         if not self._get_lr_called_within_step:
@@ -482,25 +536,18 @@ def get_lr(self):
                           "please use `get_last_lr()`.", UserWarning)
 
         if self.last_epoch == 0:
-            return [group['lr'] * self.warmup_factor for group in self.optimizer.param_groups]
+            return [group['lr'] * self.start_factor for group in self.optimizer.param_groups]
 
-        if (self.last_epoch > self.warmup_iters or
-                (self.warmup_method == "constant" and self.last_epoch != self.warmup_iters)):
+        if (self.last_epoch > self.total_iters):
             return [group['lr'] for group in self.optimizer.param_groups]
 
-        if (self.warmup_method == "constant" and self.last_epoch == self.warmup_iters):
-            return [group['lr'] * (1.0 / self.warmup_factor) for group in self.optimizer.param_groups]
-
-        return [group['lr'] * (1. + (1.0 - self.warmup_factor) /
-                (self.warmup_iters * self.warmup_factor + (self.last_epoch - 1) * (1 - self.warmup_factor)))
+        return [group['lr'] * (1. + (self.end_factor - self.start_factor) /
+                (self.total_iters * self.start_factor + (self.last_epoch - 1) * (self.end_factor - self.start_factor)))
                 for group in self.optimizer.param_groups]
 
     def _get_closed_form_lr(self):
-        return [base_lr * (self.warmup_factor +
-                (1 - self.warmup_factor) * min(self.warmup_iters, self.last_epoch) /
-                self.warmup_iters * (self.warmup_method == "linear") +
-                (self.last_epoch >= self.warmup_iters) * (1 - self.warmup_factor) *
-                (self.warmup_method == "constant"))
+        return [base_lr * (self.start_factor +
+                (self.end_factor - self.start_factor) * min(self.total_iters, self.last_epoch) / self.total_iters)
                 for base_lr in self.base_lrs]
 
 
@@ -618,7 +665,7 @@ class ChainedScheduler(_LRScheduler):
         >>> # lr = 0.729    if epoch == 2
         >>> # lr = 0.6561   if epoch == 3
         >>> # lr = 0.59049  if epoch >= 4
-        >>> scheduler1 = WarmUpLR(self.opt, warmup_factor=0.1, warmup_iters=2, warmup_method="constant")
+        >>> scheduler1 = ConstantLR(self.opt, factor=0.1, total_iters=2)
         >>> scheduler2 = ExponentialLR(self.opt, gamma=0.9)
         >>> scheduler = ChainedScheduler([scheduler1, scheduler2])
         >>> for epoch in range(100):
diff --git a/torch/optim/lr_scheduler.pyi b/torch/optim/lr_scheduler.pyi
index 821407e3ccca6..9b1b8ea63eed7 100644
--- a/torch/optim/lr_scheduler.pyi
+++ b/torch/optim/lr_scheduler.pyi
@@ -18,8 +18,11 @@ class StepLR(_LRScheduler):
 class MultiStepLR(_LRScheduler):
     def __init__(self, optimizer: Optimizer, milestones: Iterable[int], gamma: float=..., last_epoch: int=...) -> None: ...
 
-class WarmUpLR(_LRScheduler):
-    def __init__(self, optimizer: Optimizer, warmup_factor: float=..., warmup_iters: int=..., warmup_method: str=..., last_epoch: int=...) -> None: ...
+class ConstantLR(_LRScheduler):
+    def __init__(self, optimizer: Optimizer, factor: float=..., total_iters: int=..., last_epoch: int=...) -> None: ...
+
+class LinearLR(_LRScheduler):
+    def __init__(self, optimizer: Optimizer, start_factor: float=..., end_factor: float=..., total_iters: int=..., last_epoch: int=...) -> None: ...
 
 class ExponentialLR(_LRScheduler):
     def __init__(self, optimizer: Optimizer, gamma: float, last_epoch: int=...) -> None: ...

From 26b7ff5aeab49cb63faed2cd51fa6ba70f665610 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Tue, 7 Sep 2021 08:57:43 -0700
Subject: [PATCH 522/530] deprecate dtype getters from `torch.testing`
 namespace (#63554)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63554

Following https://github.com/pytorch/pytorch/pull/61840#issuecomment-884087809, this deprecates all the dtype getters publicly exposed in the `torch.testing` namespace. The reason for this twofold:

1. If someone is not familiar with the C++ dispatch macros PyTorch uses, the names are misleading. For example `torch.testing.floating_types()` will only give you `float32` and `float64` skipping `float16` and `bfloat16`.
2. The dtype getters provide very minimal functionality that can be easily emulated by downstream libraries.

We thought about [providing an replacement](https://gist.github.com/pmeier/3dfd2e105842ad0de4505068a1a0270a), but ultimately decided against it. The major problem is BC: by keeping it, either the namespace is getting messy again after a new dtype is added or we need to somehow version the return values of the getters.

Test Plan: Imported from OSS

Reviewed By: H-Huang

Differential Revision: D30662206

Pulled By: mruberry

fbshipit-source-id: a2bdb10ab02ae665df1b5b76e8afa9af043bbf56
---
 test/test_autograd.py                         |   3 +-
 test/test_binary_ufuncs.py                    |  90 ++++++------
 test/test_complex.py                          |   3 +-
 test/test_foreach.py                          |  45 +++---
 test/test_linalg.py                           |  54 +++----
 test/test_nn.py                               |  14 +-
 test/test_numpy_interop.py                    |   3 +-
 test/test_ops.py                              |   4 +-
 test/test_reductions.py                       |  41 +++---
 test/test_shape_ops.py                        |  15 +-
 test/test_sort_and_select.py                  |  33 +++--
 test/test_sparse.py                           |  23 +--
 test/test_sparse_csr.py                       |  23 +--
 test/test_tensor_creation_ops.py              |  59 ++++----
 test/test_testing.py                          |   5 +-
 test/test_torch.py                            | 134 ++++++++---------
 test/test_type_promotion.py                   |  49 ++++---
 test/test_unary_ufuncs.py                     |  55 +++----
 test/test_view_ops.py                         |  43 +++---
 torch/testing/_core.py                        | 128 +---------------
 torch/testing/_deprecated.py                  |  31 +++-
 torch/testing/_dtype_getters.py               | 138 ++++++++++++++++++
 torch/testing/_internal/common_device_type.py |   3 +-
 torch/testing/_internal/common_dtype.py       |   4 +
 torch/testing/_internal/common_jit.py         |   2 +-
 .../_internal/common_methods_invocations.py   |  12 +-
 torch/testing/_internal/common_modules.py     |   3 +-
 torch/testing/_internal/opinfo_helper.py      |  31 ++--
 28 files changed, 560 insertions(+), 488 deletions(-)
 create mode 100644 torch/testing/_dtype_getters.py
 create mode 100644 torch/testing/_internal/common_dtype.py

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 61a46b439f213..e672e4b49e25e 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -42,6 +42,7 @@
                                                         onlyCPU, onlyCUDA, onlyOnCPUAndCUDA, dtypes, dtypesIfCUDA,
                                                         deviceCountAtLeast, skipCUDAIfCudnnVersionLessThan,
                                                         skipCUDAIf, skipMeta)
+from torch.testing._internal.common_dtype import get_all_dtypes
 
 import pickle
 
@@ -8474,7 +8475,7 @@ def test_copy_(self, device):
         # At the time of writing this test, copy_ is not generated from native_functions.yaml
         # there was a bug that bfloat16 was not recognized as floating.
         x = torch.randn(10, device=device, requires_grad=True)
-        floating_dt = [dt for dt in torch.testing.get_all_dtypes() if dt.is_floating_point]
+        floating_dt = [dt for dt in get_all_dtypes() if dt.is_floating_point]
         for dt in floating_dt:
             y = torch.empty(10, device=device, dtype=dt)
             y.copy_(x)
diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py
index 2695ab6a86115..f8c36adf8b781 100644
--- a/test/test_binary_ufuncs.py
+++ b/test/test_binary_ufuncs.py
@@ -18,7 +18,11 @@
     instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA,
     dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA,
     skipCUDAIfRocm, skipIf, ops)
-from torch.testing import all_types_and_complex_and, integral_types_and, make_tensor
+from torch.testing import make_tensor
+from torch.testing._internal.common_dtype import (
+    all_types_and_complex_and, integral_types_and, get_all_dtypes, get_all_int_dtypes, get_all_math_dtypes,
+    get_all_complex_dtypes, get_all_fp_dtypes,
+)
 from torch.testing._internal.common_methods_invocations import binary_ufuncs
 
 if TEST_SCIPY:
@@ -348,7 +352,7 @@ def test_inplace_division(self, device):
         id_after = id(t)
         self.assertEqual(id_before, id_after)
 
-    @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_complex=False))
+    @dtypes(*get_all_dtypes(include_bool=False, include_complex=False))
     def test_div_rounding_modes(self, device, dtype):
         if dtype.is_floating_point:
             low, high = -10.0, 10.0
@@ -448,7 +452,7 @@ def test_divide_by_zero_rounding(self, device, dtype):
             actual = torch.divide(a, zero, rounding_mode=rounding_mode)
             self.assertEqual(actual, expect, exact_dtype=exact_dtype)
 
-    @dtypes(*torch.testing.get_all_dtypes(
+    @dtypes(*get_all_dtypes(
         include_bool=False, include_complex=False, include_bfloat16=False))
     def test_div_rounding_numpy(self, device, dtype):
         info = (torch.finfo(dtype) if dtype.is_floating_point
@@ -892,7 +896,7 @@ def test_pow_cuda_complex_extremal_failing(self, device, dtype):
             self.assertEqual(cpu_out, cuda_out)
 
     @onlyOnCPUAndCUDA
-    @dtypes(*(torch.testing.get_all_dtypes(include_bool=False, include_bfloat16=False)))
+    @dtypes(*(get_all_dtypes(include_bool=False, include_bfloat16=False)))
     def test_complex_scalar_pow_tensor(self, device, dtype):
         complexes = [0.5j, 1. + 1.j, -1.5j, 2.2 - 1.6j, 1 + 0j]
         first_exp = make_tensor((100,), device, dtype, low=-2, high=2)
@@ -1283,7 +1287,7 @@ def test_binary_ops_with_scalars(self, device):
                         self.assertEqual(expected, python_op(first, second))
                         self.assertEqual(expected, torch_op(first, second))
 
-    @dtypes(*product(torch.testing.get_all_dtypes(include_complex=False), torch.testing.get_all_dtypes(include_complex=False)))
+    @dtypes(*product(get_all_dtypes(include_complex=False), get_all_dtypes(include_complex=False)))
     def test_maximum_minimum_type_promotion(self, device, dtypes):
         a = torch.tensor((0, 1), device=device, dtype=dtypes[0])
         b = torch.tensor((1, 0), device=device, dtype=dtypes[1])
@@ -1291,7 +1295,7 @@ def test_maximum_minimum_type_promotion(self, device, dtypes):
             result = op(a, b)
             self.assertEqual(result.dtype, torch.result_type(a, b))
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + [torch.bool]))
+    @dtypes(*(get_all_int_dtypes() + [torch.bool]))
     def test_maximum_minimum_int_and_bool(self, device, dtype):
         ops = ((torch.maximum, torch.max, np.maximum), (torch.minimum, torch.min, np.minimum),
                (torch.fmax, None, np.fmax), (torch.fmin, None, np.fmin))
@@ -1317,7 +1321,7 @@ def test_maximum_minimum_int_and_bool(self, device, dtype):
             self.assertEqual(out, numpy_result)
 
     @precisionOverride({torch.bfloat16: 1e-2})
-    @dtypes(*(torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_fp_dtypes()))
     def test_maximum_minimum_float(self, device, dtype):
         ops = ((torch.maximum, torch.max, np.maximum), (torch.minimum, torch.min, np.minimum),
                (torch.fmax, None, np.fmax), (torch.fmin, None, np.fmin))
@@ -1345,7 +1349,7 @@ def test_maximum_minimum_float(self, device, dtype):
             self.assertEqual(tensor_result, numpy_result, exact_dtype=False)
             self.assertEqual(out, numpy_result, exact_dtype=False)
 
-    @dtypes(*(torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_fp_dtypes()))
     def test_maximum_minimum_float_nan_and_inf(self, device, dtype):
         # np.maximum and np.minimum functions compare input arrays element-wisely.
         # if one of the elements being compared is a NaN, then that element is returned.
@@ -1381,7 +1385,7 @@ def test_maximum_minimum_float_nan_and_inf(self, device, dtype):
                 self.assertEqual(tensor_result, numpy_result)
                 self.assertEqual(out, numpy_result)
 
-    @dtypes(*product(torch.testing.get_all_complex_dtypes(), torch.testing.get_all_dtypes()))
+    @dtypes(*product(get_all_complex_dtypes(), get_all_dtypes()))
     def test_maximum_minimum_complex(self, device, dtypes):
         for torch_op in (torch.maximum, torch.minimum, torch.max, torch.min, torch.fmax, torch.fmin):
             with self.assertRaisesRegex(RuntimeError, '.+not implemented for.+'):
@@ -1439,7 +1443,7 @@ def test_mul_intertype_scalar(self, device, dtype):
         self.assertEqual(x, 4.5)
 
     @onlyCPU
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_sub(self, device, dtype):
         m1 = torch.tensor([2.34, 4.44], dtype=dtype, device=device)
         m2 = torch.tensor([1.23, 2.33], dtype=dtype, device=device)
@@ -1501,8 +1505,8 @@ def test_min_max_binary_op_nan(self, device, dtype):
             self.assertFalse(torch.isnan(ma[i]), "max(a, b): {}, a: {}, b: {}".format(ma[i], a[i], b[i]))
             self.assertFalse(torch.isnan(mi[i]), "min(a, b): {}, a: {}, b: {}".format(mi[i], a[i], b[i]))
 
-    @dtypes(*product(torch.testing.get_all_dtypes(include_complex=False),
-                     torch.testing.get_all_dtypes(include_complex=False)))
+    @dtypes(*product(get_all_dtypes(include_complex=False),
+                     get_all_dtypes(include_complex=False)))
     def test_copysign(self, device, dtypes):
         def _test_copysign_numpy(a, b):
             torch_result = torch.copysign(a, b)
@@ -1519,7 +1523,7 @@ def _test_copysign_numpy(a, b):
             expected = torch.from_numpy(np.copysign(np_a, np_b))
             # To handle inconsistencies of type promotion between PyTorch and Numpy
             # Applied for both arguments having integral precision and bfloat16
-            types = [torch.bool, torch.bfloat16] + torch.testing.get_all_int_dtypes()
+            types = [torch.bool, torch.bfloat16] + get_all_int_dtypes()
             if a.dtype in types or b.dtype in types:
                 promoted_type = torch.promote_types(torch_result.dtype, expected.dtype)
                 torch_result = torch_result.to(promoted_type)
@@ -1564,7 +1568,7 @@ def _test_copysign_numpy(a, b):
             for case in cases:
                 _test_copysign_numpy(torch.tensor([case], device=device, dtype=dtypes[0]), b)
 
-        if dtypes[1] in torch.testing.get_all_fp_dtypes():
+        if dtypes[1] in get_all_fp_dtypes():
             a = make_tensor((10, 10), device=device, dtype=dtypes[0], low=-9, high=9)
             for case in cases:
                 _test_copysign_numpy(a, torch.tensor([case], device=device, dtype=dtypes[1]))
@@ -1616,8 +1620,8 @@ def test_divmul_scalar(self, device, dtype):
         res = scale * x
         self.assertEqual(res, expected.to(dtype), atol=0., rtol=0.)
 
-    @dtypesIfCUDA(*set(torch.testing.get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128})
-    @dtypes(*set(torch.testing.get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128})
+    @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128})
+    @dtypes(*set(get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128})
     def test_floor_divide_tensor(self, device, dtype):
         x = torch.randn(10, device=device).mul(30).to(dtype)
         y = torch.arange(1, 11, dtype=dtype, device=device)
@@ -1629,8 +1633,8 @@ def test_floor_divide_tensor(self, device, dtype):
         self.assertEqual(z.dtype, x.dtype)
         self.assertEqual(z, z_alt)
 
-    @dtypesIfCUDA(*set(torch.testing.get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128})
-    @dtypes(*set(torch.testing.get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128})
+    @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128})
+    @dtypes(*set(get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128})
     def test_floor_divide_scalar(self, device, dtype):
         x = torch.randn(100, device=device).mul(10).to(dtype)
 
@@ -1663,7 +1667,7 @@ def test_floor_divide_out(self, device, dtype):
                 self.assertEqual(o, torch.floor_divide(x.float(), y.float()))
 
     @onlyCPU
-    @dtypes(*torch.testing.get_all_math_dtypes('cpu'))
+    @dtypes(*get_all_math_dtypes('cpu'))
     def test_rdiv(self, device, dtype):
         if dtype is torch.float16:
             return
@@ -1675,7 +1679,7 @@ def test_rdiv(self, device, dtype):
         z = torch.tensor([30 / v.item() for v in x], device=device)
         self.assertEqual(y, z, exact_dtype=False)
 
-    @dtypes(*torch.testing.get_all_fp_dtypes(include_bfloat16=False))
+    @dtypes(*get_all_fp_dtypes(include_bfloat16=False))
     def test_fmod_remainder_by_zero_float(self, device, dtype):
         fn_list = (torch.fmod, torch.remainder)
         for fn in fn_list:
@@ -1687,7 +1691,7 @@ def test_fmod_remainder_by_zero_float(self, device, dtype):
 
     @onlyOnCPUAndCUDA  # Check Issue https://github.com/pytorch/pytorch/issues/48130
     @skipCUDAIfRocm  # Error happens on both ROCM and XLA
-    @dtypes(*torch.testing.get_all_int_dtypes())
+    @dtypes(*get_all_int_dtypes())
     def test_fmod_remainder_by_zero_integral(self, device, dtype):
         fn_list = (torch.fmod, torch.remainder)
         for fn in fn_list:
@@ -1712,7 +1716,7 @@ def test_fmod_remainder_by_zero_integral(self, device, dtype):
                     value = 255 if dtype == torch.uint8 else -1
                     self.assertTrue(torch.all(fn(x, zero) == value))
 
-    @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False))
+    @dtypes(*get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False))
     def test_fmod_remainder(self, device, dtype):
         # Use numpy as reference
         def _helper(x, mod, fns_list):
@@ -1749,7 +1753,7 @@ def _helper(x, mod, fns_list):
         # Mods: Integer, Float, Tensor, Non-contiguous Tensor
         mods = [3, 2.3, mod, mod.t()]
         # mod with floating-point dtype
-        if dtype in torch.testing.get_all_int_dtypes():
+        if dtype in get_all_int_dtypes():
             mod_float = make_tensor((10, 10), device=device, dtype=torch.float, low=-9, high=9)
             mod[mod == 0] = 1
             mods.append(mod_float)
@@ -1970,7 +1974,7 @@ def test_floor_divide_zero(self, device, dtype):
                 a // b
 
     @unittest.skipIf(TEST_WITH_ASAN, "Integer overflows are not allowed under ASAN")
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_muldiv_scalar(self, device, dtype):
         x = make_tensor((10, 3), device, dtype, low=None, high=None)
         s = make_tensor((1,), 'cpu', dtype, low=None, high=None).item()
@@ -1980,7 +1984,7 @@ def test_muldiv_scalar(self, device, dtype):
         self.assertEqual(x / s, x / y)
         self.assertEqual(s / x, y / x)
 
-    @dtypes(*tuple(itertools.combinations_with_replacement(torch.testing.get_all_dtypes(), 2)))
+    @dtypes(*tuple(itertools.combinations_with_replacement(get_all_dtypes(), 2)))
     def test_comparison_ops_type_promotion_and_broadcasting(self, device, dtypes):
         # issue #42660
         # testing all combinations of broadcasting and type promotion
@@ -2162,8 +2166,8 @@ def test_bitwise_shift_float(self, device):
             self.assertEqual(torch_op(a, 2.2), expected_op(a, 2.2))
 
     @onlyOnCPUAndCUDA
-    @dtypes(*list(product(torch.testing.get_all_dtypes(include_complex=False),
-                          torch.testing.get_all_dtypes(include_complex=False))))
+    @dtypes(*list(product(get_all_dtypes(include_complex=False),
+                          get_all_dtypes(include_complex=False))))
     def test_heaviside(self, device, dtypes):
         input_dtype = dtypes[0]
         values_dtype = dtypes[1]
@@ -2222,8 +2226,8 @@ def test_heaviside_cross_device(self, device):
         with self.assertRaisesRegex(RuntimeError, 'Expected all tensors to be on the same device'):
             torch.heaviside(y, x)
 
-    @dtypes(*list(product(torch.testing.get_all_complex_dtypes(),
-                          torch.testing.get_all_complex_dtypes())))
+    @dtypes(*list(product(get_all_complex_dtypes(),
+                          get_all_complex_dtypes())))
     def test_heaviside_complex(self, device, dtypes):
         input_dtype = dtypes[0]
         values_dtype = dtypes[1]
@@ -2265,15 +2269,15 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_):
         getattr(a, op + '_')(b)
         self.assertEqual(expected_res, a)
 
-    @dtypes(*product(torch.testing.get_all_dtypes(), torch.testing.get_all_dtypes()))
+    @dtypes(*product(get_all_dtypes(), get_all_dtypes()))
     def test_logical_xor(self, device, dtypes):
         self._test_logical(device, dtypes, 'logical_xor', [10, 0, 1, 0], [1, 0, 0, 10], [0, 0, 1, 1])
 
-    @dtypes(*product(torch.testing.get_all_dtypes(), torch.testing.get_all_dtypes()))
+    @dtypes(*product(get_all_dtypes(), get_all_dtypes()))
     def test_logical_and(self, device, dtypes):
         self._test_logical(device, dtypes, 'logical_and', [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 0, 0])
 
-    @dtypes(*product(torch.testing.get_all_dtypes(), torch.testing.get_all_dtypes()))
+    @dtypes(*product(get_all_dtypes(), get_all_dtypes()))
     def test_logical_or(self, device, dtypes):
         self._test_logical(device, dtypes, 'logical_or', [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 1, 1])
 
@@ -2377,7 +2381,7 @@ def test_logaddexp2(self, device, dtype):
         self._test_logaddexp(device, dtype, base2=True)
 
     def test_add(self, device):
-        dtypes = [torch.float, torch.double] + torch.testing.get_all_complex_dtypes()
+        dtypes = [torch.float, torch.double] + get_all_complex_dtypes()
         for dtype in dtypes:
             # [res] torch.add([res,] tensor1, tensor2)
             m1 = torch.randn(100, 100, dtype=dtype, device=device)
@@ -2578,7 +2582,7 @@ def test_bool_tensor_comparison_ops(self, device):
                          torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool, device=device))
         self.assertFalse(a.equal(b))
 
-    @dtypes(*torch.testing.get_all_dtypes(include_complex=False))
+    @dtypes(*get_all_dtypes(include_complex=False))
     def test_logical(self, device, dtype):
         if dtype != torch.bool:
             x = torch.tensor([1, 2, 3, 4], device=device, dtype=dtype)
@@ -2755,8 +2759,8 @@ def test_pow_scalar_overloads_mem_overlap(self, device, dtype):
         self.unary_check_input_output_mem_overlap(
             doubles, sz, lambda input, out: torch.pow(42, input, out=out))
 
-    @dtypes(*list(product(torch.testing.get_all_dtypes(include_bool=False),
-                          torch.testing.get_all_dtypes(include_bool=False))))
+    @dtypes(*list(product(get_all_dtypes(include_bool=False),
+                          get_all_dtypes(include_bool=False))))
     def test_float_power(self, device, dtypes):
         def to_np(value):
             if isinstance(value, torch.Tensor) and value.dtype == torch.bfloat16:
@@ -2852,8 +2856,8 @@ def _promo_helper(x, y):
                         torch.Tensor.float_power_(base.clone(), exp)
 
     @skipIf(not TEST_SCIPY, "Scipy required for the test.")
-    @dtypes(*product(torch.testing.get_all_dtypes(include_complex=False, include_bfloat16=False),
-                     torch.testing.get_all_dtypes(include_complex=False, include_bfloat16=False)))
+    @dtypes(*product(get_all_dtypes(include_complex=False, include_bfloat16=False),
+                     get_all_dtypes(include_complex=False, include_bfloat16=False)))
     def test_xlogy_xlog1py(self, device, dtypes):
         x_dtype, y_dtype = dtypes
 
@@ -2864,7 +2868,7 @@ def out_variant_helper(torch_fn, x, y):
             self.assertEqual(expected, out)
 
         def xlogy_inplace_variant_helper(x, y):
-            if x.dtype in torch.testing.get_all_int_dtypes() + [torch.bool]:
+            if x.dtype in get_all_int_dtypes() + [torch.bool]:
                 with self.assertRaisesRegex(RuntimeError,
                                             "can't be cast to the desired output type"):
                     x.clone().xlogy_(y)
@@ -2991,10 +2995,10 @@ def _compare_helper(x, y, torch_fn, reference_fn):
         _compare_helper(t, zeros, *xlog1py_fns)
         _compare_helper(t, 0., *xlog1py_fns)
 
-    @dtypes(*product(torch.testing.get_all_dtypes(include_complex=False,
-                                                  include_half=False, include_bfloat16=False),
-                     torch.testing.get_all_dtypes(include_complex=False,
-                                                  include_half=False, include_bfloat16=False)))
+    @dtypes(*product(get_all_dtypes(include_complex=False,
+                                    include_half=False, include_bfloat16=False),
+                     get_all_dtypes(include_complex=False,
+                                    include_half=False, include_bfloat16=False)))
     @skipIf(not TEST_SCIPY, "Scipy required for the test.")
     def test_zeta(self, device, dtypes):
         x_dtype, q_dtype = dtypes
diff --git a/test/test_complex.py b/test/test_complex.py
index 45482efbae56d..eee7a6a51534e 100644
--- a/test/test_complex.py
+++ b/test/test_complex.py
@@ -1,11 +1,12 @@
 import torch
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes
 from torch.testing._internal.common_utils import TestCase, run_tests
+from torch.testing._internal.common_dtype import get_all_complex_dtypes
 
 devices = (torch.device('cpu'), torch.device('cuda:0'))
 
 class TestComplexTensor(TestCase):
-    @dtypes(*torch.testing.get_all_complex_dtypes())
+    @dtypes(*get_all_complex_dtypes())
     def test_to_list(self, device, dtype):
         # test that the complex float tensor has expected values and
         # there's no garbage value in the resultant list
diff --git a/test/test_foreach.py b/test/test_foreach.py
index 123ef35bb7093..c6cf1302ffb5c 100644
--- a/test/test_foreach.py
+++ b/test/test_foreach.py
@@ -11,6 +11,9 @@
     (instantiate_device_type_tests, dtypes, onlyCUDA, skipCUDAIfRocm, skipMeta, ops)
 from torch.testing._internal.common_methods_invocations import \
     (foreach_unary_op_db, foreach_binary_op_db, foreach_pointwise_op_db, foreach_minmax_op_db)
+from torch.testing._internal.common_dtype import (
+    get_all_dtypes, get_all_int_dtypes, get_all_complex_dtypes, get_all_fp_dtypes,
+)
 
 # Includes some values such that N * N won't be a multiple of 4,
 # which should ensure we test the vectorized and non-vectorized
@@ -133,7 +136,7 @@ def _test_binary_op_tensorlists(self, device, dtype, opinfo, N, is_fastpath, dis
         self._binary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath, is_inplace=True)
         if opinfo.supports_alpha_param:
             alpha = None
-            if dtype in torch.testing.get_all_int_dtypes():
+            if dtype in get_all_int_dtypes():
                 alpha = 3
             elif dtype.is_complex:
                 alpha = complex(3, 3)
@@ -170,7 +173,7 @@ def _test_binary_op_tensorlists(self, device, dtype, opinfo, N, is_fastpath, dis
     @ops(foreach_binary_op_db)
     def test_binary_op_tensorlists_fastpath(self, device, dtype, op):
         for N in N_values:
-            disable_fastpath = op.ref == torch.div and dtype in torch.testing.get_all_int_dtypes() + [torch.bool]
+            disable_fastpath = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool]
             if op.ref == torch.add and dtype == torch.bool:
                 disable_fastpath = True
             self._test_binary_op_tensorlists(device, dtype, op, N, True, disable_fastpath)
@@ -192,17 +195,17 @@ def _test_binary_op_scalar(self, device, dtype, opinfo, N, scalar, is_fastpath,
     @ops(foreach_binary_op_db)
     def test_binary_op_scalar_fastpath(self, device, dtype, op):
         for N, scalar in itertools.product(N_values, Scalars):
-            disable_fastpath = op.ref == torch.div and dtype in torch.testing.get_all_int_dtypes() + [torch.bool]
+            disable_fastpath = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool]
             if isinstance(scalar, int):
                 disable_fastpath |= dtype == torch.bool
             if isinstance(scalar, float):
-                disable_fastpath |= dtype in torch.testing.get_all_int_dtypes() + [torch.bool]
+                disable_fastpath |= dtype in get_all_int_dtypes() + [torch.bool]
             if isinstance(scalar, bool):
                 disable_fastpath |= dtype == torch.bool
                 if op.ref in (torch.add, torch.mul):
                     disable_fastpath = False
             if isinstance(scalar, complex):
-                disable_fastpath |= dtype not in torch.testing.get_all_complex_dtypes()
+                disable_fastpath |= dtype not in get_all_complex_dtypes()
             self._test_binary_op_scalar(device, dtype, op, N, scalar, True, disable_fastpath)
 
     @ops(foreach_binary_op_db)
@@ -232,16 +235,16 @@ def _test_binary_op_scalarlist(self, device, dtype, opinfo, N, scalarlist, is_fa
     def test_binary_op_scalarlist_fastpath(self, device, dtype, op):
         for N in N_values:
             for type_str, scalarlist in getScalarLists(N):
-                bool_int_div = op.ref == torch.div and dtype in torch.testing.get_all_int_dtypes() + [torch.bool]
+                bool_int_div = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool]
                 disable_fastpath = bool_int_div
                 if type_str == "int":
                     disable_fastpath |= dtype == torch.bool
                 if type_str == "float":
-                    disable_fastpath |= dtype in torch.testing.get_all_int_dtypes() + [torch.bool]
+                    disable_fastpath |= dtype in get_all_int_dtypes() + [torch.bool]
                 if type_str == "complex":
-                    disable_fastpath |= dtype not in torch.testing.get_all_complex_dtypes()
+                    disable_fastpath |= dtype not in get_all_complex_dtypes()
                 if type_str == "mixed":
-                    disable_fastpath |= True and dtype not in torch.testing.get_all_complex_dtypes()
+                    disable_fastpath |= True and dtype not in get_all_complex_dtypes()
                 self._test_binary_op_scalarlist(device, dtype, op, N, scalarlist, True, disable_fastpath)
 
     @ops(foreach_binary_op_db)
@@ -298,7 +301,7 @@ def _test_pointwise_op(self, device, dtype, opinfo, N, is_fastpath, disable_fast
     @skipMeta
     @ops(foreach_pointwise_op_db)
     def test_pointwise_op_fastpath(self, device, dtype, op):
-        disable_fastpath = dtype in torch.testing.get_all_int_dtypes() + [torch.bool]
+        disable_fastpath = dtype in get_all_int_dtypes() + [torch.bool]
         # for N, scalar in itertools.product(N_values, Scalars):
         for N in N_values:
             self._test_pointwise_op(device, dtype, op, N, True, disable_fastpath)
@@ -356,7 +359,7 @@ def _test_unary(self, device, dtype, opinfo, N, is_fastpath):
         op, ref, inplace_op, inplace_ref = self._get_funcs(opinfo, 1)
         inputs = opinfo.sample_inputs(device, dtype, N, noncontiguous=not is_fastpath),
         # note(mkozuki): Complex inputs for `_foreach_abs` go through slowpath.
-        if opinfo.name == "_foreach_abs" and dtype in torch.testing.get_all_complex_dtypes():
+        if opinfo.name == "_foreach_abs" and dtype in get_all_complex_dtypes():
             is_fastpath = False
         self._regular_unary_test(dtype, op, ref, inputs, is_fastpath)
         self._inplace_unary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath)
@@ -367,7 +370,7 @@ def test_unary_fastpath(self, device, dtype, op):
         for N in N_values:
             self._test_unary(device, dtype, op, N, is_fastpath=True)
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     @ops(foreach_unary_op_db)
     def test_unary_slowpath(self, device, dtype, op):
         for N in N_values:
@@ -378,14 +381,14 @@ def _minmax_test(self, opinfo, inputs, is_fastpath, n_expected_cudaLaunchKernels
         self.assertEqual(ref(inputs), op(inputs, self.is_cuda, is_fastpath))
 
     # note(mkozuki): in-place of foreach_minimum and foreach_maximum aren't implemented.
-    # @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False))
+    # @dtypes(*get_all_dtypes(include_bfloat16=False, include_complex=False))
     @ops(foreach_minmax_op_db)
     def test_minmax_fastpath(self, device, dtype, op):
         for N in N_values:
             inputs = tuple(op.sample_inputs(device, dtype, N) for _ in range(2))
             self._minmax_test(op, inputs, True, N if dtype == torch.bool else 1)
 
-    @dtypes(*torch.testing.get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False))
+    @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False))
     @ops(foreach_minmax_op_db)
     def test_minmax_slowpath(self, device, dtype, op):
         for N in N_values:
@@ -394,7 +397,7 @@ def test_minmax_slowpath(self, device, dtype, op):
 
     # note(mkozuki): ForeachFuncInfo's of both `_foreach_maximum` and `_foreach_minimum` include integer types.
     # so, manually limit dtypes to fp types for inf&nan tests.
-    @dtypes(*torch.testing.get_all_fp_dtypes(include_bfloat16=True, include_half=True))
+    @dtypes(*get_all_fp_dtypes(include_bfloat16=True, include_half=True))
     @ops(foreach_minmax_op_db)
     def test_minmax_float_inf_nan(self, device, dtype, op):
         inputs = (
@@ -413,7 +416,7 @@ def test_minmax_float_inf_nan(self, device, dtype, op):
         )
         self._minmax_test(op, inputs, True, 1)
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype):
         # TODO: enable empty list case
         for tensors in [[torch.randn([0])]]:
@@ -423,7 +426,7 @@ def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype):
             torch._foreach_add_(tensors, 1)
             self.assertEqual(res, tensors)
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     @ops(foreach_binary_op_db)
     def test_binary_op_scalar_with_overlapping_tensors(self, device, dtype, op):
         foreach_op, ref = op.method_variant, op.ref
@@ -457,7 +460,7 @@ def test_binary_op_scalar_with_different_tensor_dtypes(self, device, dtype, op):
             runtime_error = e
         self.assertIsNone(runtime_error)
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     @ops(foreach_binary_op_db)
     def test_binary_op_list_error_cases(self, device, dtype, op):
         foreach_op, foreach_op_, ref, ref_ = op.method_variant, op.inplace_variant, op.ref, op.ref_inplace
@@ -513,7 +516,7 @@ def test_binary_op_list_error_cases(self, device, dtype, op):
                 return
             with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"):
                 foreach_op([tensor1], [tensor2])
-            if dtype in torch.testing.get_all_int_dtypes() + [torch.bool] and foreach_op == torch._foreach_div:
+            if dtype in get_all_int_dtypes() + [torch.bool] and foreach_op == torch._foreach_div:
                 with self.assertRaisesRegex(RuntimeError, "result type"):
                     foreach_op_([tensor1], [tensor2])
             else:
@@ -522,7 +525,7 @@ def test_binary_op_list_error_cases(self, device, dtype, op):
 
     @skipMeta
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not found")
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     @ops(foreach_binary_op_db)
     def test_binary_op_list_slow_path(self, device, dtype, op):
         # note(mkozuki): why `n_expected_cudaLaunchKernels=0`?
@@ -615,7 +618,7 @@ def test_binary_op_tensors_on_different_devices(self, device, dtype, op):
             self.assertEqual(actual, tensors1)
 
     @onlyCUDA
-    @dtypes(*torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False))
+    @dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False))
     @ops(foreach_pointwise_op_db)
     def test_pointwise_op_tensors_on_different_devices(self, device, dtype, op):
         # tensors1: ['cuda', 'cpu]
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 96da8d559ff31..2b543431174d2 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -21,7 +21,11 @@
      onlyCPU, skipCUDAIf, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride,
      skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfRocm, onlyOnCPUAndCUDA, dtypesIfCUDA,
      onlyCUDA, skipCUDAVersionIn, skipMeta, skipCUDAIfNoCusolver)
-from torch.testing import floating_and_complex_types, floating_types, all_types, make_tensor
+from torch.testing import make_tensor
+from torch.testing._internal.common_dtype import (
+    all_types, floating_types, floating_and_complex_types, get_all_dtypes, get_all_int_dtypes, get_all_complex_dtypes,
+    get_all_fp_dtypes,
+)
 from torch.testing._internal.common_cuda import SM53OrLater, tf32_on_and_off, CUDA11OrLater, CUDA9
 from torch.distributions.binomial import Binomial
 
@@ -89,7 +93,7 @@ def check(a_sizes_, b_sizes_):
 
     # Tests torch.outer, and its alias, torch.ger, vs. NumPy
     @precisionOverride({torch.bfloat16: 1e-1})
-    @dtypes(*(torch.testing.get_all_dtypes()))
+    @dtypes(*(get_all_dtypes()))
     def test_outer(self, device, dtype):
         def run_test_case(a, b):
             if dtype == torch.bfloat16:
@@ -772,7 +776,7 @@ def check(m, a, b, beta, alpha):
         check(m_scalar, a, b, beta, alpha)
 
         # test nans and infs are not propagated to the output when beta == 0
-        float_and_complex_dtypes = torch.testing.get_all_fp_dtypes() + torch.testing.get_all_complex_dtypes()
+        float_and_complex_dtypes = get_all_fp_dtypes() + get_all_complex_dtypes()
         if beta == 0 and dtype in float_and_complex_dtypes:
             m[0][10] = m[10][10] = m[20][20] = float('inf')
             m[1][10] = m[11][10] = m[21][20] = float('nan')
@@ -785,7 +789,7 @@ def test_addr_bool(self, device, dtype):
         self._test_addr_vs_numpy(device, dtype, beta=False, alpha=False)
         self._test_addr_vs_numpy(device, dtype, beta=True, alpha=True)
 
-    @dtypes(*(torch.testing.get_all_int_dtypes()))
+    @dtypes(*(get_all_int_dtypes()))
     def test_addr_integral(self, device, dtype):
         with self.assertRaisesRegex(RuntimeError,
                                     'argument beta must not be a floating point number.'):
@@ -806,7 +810,7 @@ def test_addr_integral(self, device, dtype):
         self._test_addr_vs_numpy(device, dtype, beta=2, alpha=2)
 
     @precisionOverride({torch.bfloat16: 1e-1})
-    @dtypes(*(torch.testing.get_all_fp_dtypes() + torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes()))
     def test_addr_float_and_complex(self, device, dtype):
         with self.assertRaisesRegex(RuntimeError,
                                     'Boolean beta only supported for Boolean results.'):
@@ -819,11 +823,11 @@ def test_addr_float_and_complex(self, device, dtype):
         self._test_addr_vs_numpy(device, dtype, beta=0., alpha=2)
         # when beta is not zero
         self._test_addr_vs_numpy(device, dtype, beta=0.5, alpha=2)
-        if dtype in torch.testing.get_all_complex_dtypes():
+        if dtype in get_all_complex_dtypes():
             self._test_addr_vs_numpy(device, dtype, beta=(0 + 0.1j), alpha=(0.2 - 0.2j))
 
-    @dtypes(*itertools.product(torch.testing.get_all_dtypes(),
-                               torch.testing.get_all_dtypes()))
+    @dtypes(*itertools.product(get_all_dtypes(),
+                               get_all_dtypes()))
     def test_outer_type_promotion(self, device, dtypes):
         a = torch.randn(5).to(device=device, dtype=dtypes[0])
         b = torch.randn(5).to(device=device, dtype=dtypes[1])
@@ -831,9 +835,9 @@ def test_outer_type_promotion(self, device, dtypes):
             result = op(a, b)
             self.assertEqual(result.dtype, torch.result_type(a, b))
 
-    @dtypes(*itertools.product(torch.testing.get_all_dtypes(),
-                               torch.testing.get_all_dtypes(),
-                               torch.testing.get_all_dtypes()))
+    @dtypes(*itertools.product(get_all_dtypes(),
+                               get_all_dtypes(),
+                               get_all_dtypes()))
     def test_addr_type_promotion(self, device, dtypes):
         a = make_tensor((5,), device=device, dtype=dtypes[0], low=-2, high=2)
         b = make_tensor((5,), device=device, dtype=dtypes[1], low=-2, high=2)
@@ -5287,8 +5291,8 @@ def call_torch_fn(*args, **kwargs):
             self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape])
 
     @dtypesIfCUDA(torch.cfloat, torch.cdouble,
-                  *torch.testing.get_all_fp_dtypes(include_half=not CUDA9, include_bfloat16=(CUDA11OrLater and SM53OrLater)))
-    @dtypes(*(set(torch.testing.get_all_dtypes()) - {torch.half, torch.bool}))
+                  *get_all_fp_dtypes(include_half=not CUDA9, include_bfloat16=(CUDA11OrLater and SM53OrLater)))
+    @dtypes(*(set(get_all_dtypes()) - {torch.half, torch.bool}))
     def test_blas_alpha_beta_empty(self, device, dtype):
         # This test is disabled on CUDA 9 due to:
         # See: https://github.com/pytorch/pytorch/issues/31006
@@ -5324,7 +5328,7 @@ def test_blas_alpha_beta_empty(self, device, dtype):
         self.assertEqual(torch.full((2, 3), beta * value, dtype=dtype, device=device),
                          torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta, out=out))
 
-    @dtypes(*(torch.testing.get_all_complex_dtypes() + torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_complex_dtypes() + get_all_fp_dtypes()))
     def test_blas_nan_out(self, device, dtype):
         # These functions should work correctly with NaN filled outputs,
         # but need special handling, see [NOTE: cpu_zero]
@@ -5950,9 +5954,9 @@ def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=
 
     @precisionOverride({torch.bfloat16: 1e-0, torch.half: 5e-4, torch.float: 1e-4, torch.double: 1e-8,
                         torch.cfloat: 1e-4, torch.cdouble: 1e-8})
-    @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(),
-                  *torch.testing.get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)),
-                                                   include_half=(not TEST_WITH_ROCM)))
+    @dtypesIfCUDA(*get_all_complex_dtypes(),
+                  *get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)),
+                                     include_half=(not TEST_WITH_ROCM)))
     @dtypes(torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble)
     def test_addmv(self, device, dtype):
         # have to use torch.randn(...).to(bfloat16) instead of
@@ -5986,7 +5990,7 @@ def test_addmv(self, device, dtype):
         for m, v in itertools.product(ms, vs):
             self._test_addmm_addmv(torch.addmv, t, m, v, beta=0)
 
-    @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater))))
+    @dtypesIfCUDA(*get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater))))
     @dtypes(torch.float, torch.double)
     def test_addmv_rowmajor_colmajor_incx_incy_lda(self, device, dtype):
         # tests (o, s)*(s).  o is output size, s is summed size.
@@ -6017,9 +6021,9 @@ def _test(row_major, incx, incy, lda_tail):
 
     @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6,
                         torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8})
-    @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(),
-                  *torch.testing.get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater))))
-    @dtypes(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes())
+    @dtypesIfCUDA(*get_all_complex_dtypes(),
+                  *get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater))))
+    @dtypes(*get_all_complex_dtypes(), *get_all_fp_dtypes())
     @tf32_on_and_off(0.05)
     def test_addmm(self, device, dtype):
         M = torch.randn(10, 25, device=device).to(dtype)
@@ -6052,7 +6056,7 @@ def maybe_transpose(cond, m):
             self._test_addmm_addmv(torch.addmm, M, m1, m2, transpose_out=t4)
 
     @dtypes(torch.float, torch.double)
-    @dtypesIfCUDA(*([torch.float, torch.double] + torch.testing.get_all_complex_dtypes()))
+    @dtypesIfCUDA(*([torch.float, torch.double] + get_all_complex_dtypes()))
     @tf32_on_and_off(0.005)
     def test_addmm_sizes(self, device, dtype):
         for m in [0, 1, 25]:
@@ -6226,7 +6230,7 @@ def test_strided_mm_bmm(self, device, dtype):
     @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05})
     @skipCUDAIf(torch.version.cuda == "10.1", "flaky on CUDA 10.1")
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_fp_dtypes(), *torch.testing.get_all_complex_dtypes())
+    @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes())
     @tf32_on_and_off(0.05)
     def test_bmm(self, device, dtype):
         if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater:
@@ -6338,7 +6342,7 @@ def _test_addbmm_baddbmm(self, func, b1, b2, ref, out_tensor):
 
     @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05})
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_fp_dtypes(), *torch.testing.get_all_complex_dtypes())
+    @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes())
     @tf32_on_and_off(0.05)
     def test_addbmm(self, device, dtype):
         if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater:
@@ -6411,7 +6415,7 @@ def generate_tensor():
 
     @precisionOverride({torch.half: 0.1, torch.bfloat16: 0.5})
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_fp_dtypes(), *torch.testing.get_all_complex_dtypes())
+    @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes())
     @tf32_on_and_off(0.05)
     def test_baddbmm(self, device, dtype):
         if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater:
diff --git a/test/test_nn.py b/test/test_nn.py
index 5008c7256acf7..2d66477ff826a 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -33,7 +33,7 @@
 from torch.nn import Parameter
 from torch.nn.parameter import UninitializedParameter, UninitializedBuffer
 from torch.nn.parallel._functions import Broadcast
-from torch.testing import get_all_fp_dtypes
+from torch.testing._internal.common_dtype import integral_types, get_all_fp_dtypes, get_all_math_dtypes
 from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \
     TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, download_file, \
     get_function_arglist, load_tests, repeat_test_for_types, ALL_TENSORTYPES, \
@@ -9406,9 +9406,9 @@ def test_cosine_embedding_loss_with_diff_type(self):
             input2 = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device)
             target = torch.tensor([1, -1], dtype=torch.int, device=device)
             expected = torch.nn.functional.cosine_embedding_loss(input1, input2, target)
-            for dt1 in torch.testing.get_all_math_dtypes(device):
-                for dt2 in torch.testing.get_all_math_dtypes(device):
-                    for dt3 in torch.testing.get_all_math_dtypes(device):
+            for dt1 in get_all_math_dtypes(device):
+                for dt2 in get_all_math_dtypes(device):
+                    for dt3 in get_all_math_dtypes(device):
                         # dt3 is used as dtype for target = [1, -1], so let's skip unsigned type
                         if dt3 == torch.uint8:
                             continue
@@ -9425,7 +9425,7 @@ def test_kl_div_with_diff_type(self):
             input = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device)
             target = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.double, device=device)
             expected = torch.nn.functional.kl_div(input, target)
-            for input_dtype in torch.testing.get_all_math_dtypes(device):
+            for input_dtype in get_all_math_dtypes(device):
                 if input_dtype.is_complex:
                     continue
                 for target_dtype in [torch.float32, torch.float64, torch.float16]:
@@ -9441,7 +9441,7 @@ def test_kl_div_with_diff_type_log_target(self):
             input = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device)
             target = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.double, device=device).log()
             expected = torch.nn.functional.kl_div(input, target, log_target=True)
-            for input_dtype in torch.testing.get_all_math_dtypes(device):
+            for input_dtype in get_all_math_dtypes(device):
                 if input_dtype.is_complex:
                     continue
                 for target_dtype in [torch.float32, torch.float64, torch.float16]:
@@ -9584,7 +9584,7 @@ def _input_grad(input, target, reduction):
             return input.grad
 
         for device, dtype, reduction in product(device_(),
-                                                torch.testing.integral_types(),
+                                                integral_types(),
                                                 ('none', 'sum', 'mean')):
             input = torch.randn(2, 2, device=device, requires_grad=True)
             target = torch.randint(0, 9, (2, 2), device=device, dtype=dtype)
diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py
index be46f93bdf3a8..a6f5be036c7a6 100644
--- a/test/test_numpy_interop.py
+++ b/test/test_numpy_interop.py
@@ -7,6 +7,7 @@
     (TestCase, run_tests)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, onlyCPU, dtypes)
+from torch.testing._internal.common_dtype import get_all_dtypes
 
 # For testing handling NumPy objects and sending tensors to / accepting
 #   arrays from NumPy.
@@ -393,7 +394,7 @@ def test_has_storage_numpy(self, device):
             self.assertIsNotNone(torch.tensor(arr, device=device, dtype=torch.long).storage())
             self.assertIsNotNone(torch.tensor(arr, device=device, dtype=torch.uint8).storage())
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_numpy_scalar_cmp(self, device, dtype):
         if dtype.is_complex:
             tensors = (torch.tensor(complex(1, 3), dtype=dtype, device=device),
diff --git a/test/test_ops.py b/test/test_ops.py
index b5b03c5b96ab9..a9d470fec5e44 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -5,8 +5,8 @@
 
 import torch
 
-from torch.testing import \
-    (FileCheck, floating_and_complex_types_and, get_all_dtypes, make_tensor)
+from torch.testing import FileCheck, make_tensor
+from torch.testing._internal.common_dtype import floating_and_complex_types_and, get_all_dtypes
 from torch.testing._internal.common_utils import \
     (TestCase, is_iterable_of_tensors, run_tests, IS_SANDCASTLE, clone_input_helper,
      gradcheck, gradgradcheck, IS_IN_CI, suppress_warnings)
diff --git a/test/test_reductions.py b/test/test_reductions.py
index a9c667564d118..9760eae52813d 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -9,8 +9,11 @@
 import warnings
 
 from torch._six import inf, nan
-from torch.testing import (
-    integral_types_and, floating_and_complex_types_and, make_tensor)
+from torch.testing import make_tensor
+from torch.testing._internal.common_dtype import (
+    get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_complex_dtypes, get_all_fp_dtypes,
+    integral_types_and, floating_and_complex_types_and
+)
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict,
     IS_WINDOWS)
@@ -625,7 +628,7 @@ def _test_out(dtype, other_dtype):
             # 'out' is favored over dtype, check error
             self.assertRaises(RuntimeError, lambda: fn(x, out=out, dtype=other_dtype))
 
-        for dtype in [dtype for dtype in torch.testing.get_all_math_dtypes('cpu') if dtype != torch.float16]:
+        for dtype in [dtype for dtype in get_all_math_dtypes('cpu') if dtype != torch.float16]:
             x = torch.ones(shape, dtype=dtype)
             expected_dtype = dtype if dtype.is_floating_point or dtype.is_complex else torch.int64
             self.assertIs(expected_dtype, fn(x).dtype)
@@ -1273,7 +1276,7 @@ def test_dtype_bfloat16(values_bf16=False, boundaries_bf16=False):
             test_dtype_bfloat16(False, True)
             test_dtype_bfloat16(True, True)
 
-    @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_complex=False))
+    @dtypes(*get_all_dtypes(include_bool=False, include_complex=False))
     def test_nansum(self, device, dtype):
         args = product(
             (True, False),  # noncontiguous
@@ -1326,15 +1329,15 @@ def _test_reduction_function_with_numpy(self, torch_func, np_func, device, dtype
                             self.compare_with_numpy(torch_func_partial, np_func_partial, x, device=None, dtype=None,
                                                     atol=atol, rtol=rtol, exact_dtype=exact_dtype)
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) +
-              torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) +
+              get_all_complex_dtypes()))
     def test_count_nonzero(self, device, dtype):
         self._test_reduction_function_with_numpy(torch.count_nonzero, np.count_nonzero, device, dtype)
         self._test_reduction_function_with_numpy(torch.count_nonzero, np.count_nonzero, device, dtype, True)
 
     def _test_sum_reduction_vs_numpy(self, torch_fn, np_fn, device, dtype, with_keepdim=False, with_extremal=False):
         def is_integral(dtype):
-            return dtype in torch.testing.get_all_int_dtypes()
+            return dtype in get_all_int_dtypes()
 
         # On Windows CI, the current version of `numpy` promotes all lower integers
         # dtypes to int32 while `torch` promotes them to int64. Hence we skip on checking
@@ -1363,27 +1366,27 @@ def is_integral(dtype):
                                                      with_keepdim=with_keepdim, with_extremal=with_extremal)
 
     @onlyOnCPUAndCUDA
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False)))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False)))
     def test_sum_vs_numpy(self, device, dtype):
         self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype)
         self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype, with_extremal=True)
         self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype, with_keepdim=True)
 
     @onlyOnCPUAndCUDA
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False)))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False)))
     def test_nansum_vs_numpy(self, device, dtype):
         self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype)
         self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype, with_extremal=True)
         self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype, with_keepdim=True)
 
-    @dtypes(*(torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_complex_dtypes()))
     def test_nansum_complex(self, device, dtype):
         x = torch.randn((3, 3, 3), device=device, dtype=dtype)
         with self.assertRaisesRegex(RuntimeError, "nansum does not support complex inputs"):
             torch.nansum(x)
 
     def test_nansum_out_dtype(self, device):
-        dtypes = list(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False))
+        dtypes = list(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))
         for inp_dtype, out_dtype in combinations(dtypes, 2):
             shape = _rand_shape(random.randint(2, 5), min_size=5, max_size=10)
             x = _generate_input(shape, inp_dtype, device, with_extremal=False)
@@ -1392,7 +1395,7 @@ def test_nansum_out_dtype(self, device):
             np_fn = partial(np.nansum, dtype=np_out_dtype)
             self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None)
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False)))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False)))
     def test_argminmax_multiple(self, device, dtype):
         # Case: All Ones
         t = torch.ones(3, 3, device=device, dtype=dtype)
@@ -1400,7 +1403,7 @@ def test_argminmax_multiple(self, device, dtype):
         self.compare_with_numpy(torch.argmin, np.argmin, t)
 
         # Case: With single `nan` present.
-        if dtype in torch.testing.get_all_fp_dtypes():
+        if dtype in get_all_fp_dtypes():
             t[2, 2] = float('nan')
             self.compare_with_numpy(torch.argmax, np.argmax, t)
             self.compare_with_numpy(torch.argmin, np.argmin, t)
@@ -1477,8 +1480,8 @@ def verify_against_numpy(t):
                           [0, 0]], device=device, dtype=dtype)
         verify_against_numpy(t)
 
-    @dtypes(*(torch.testing.get_all_dtypes(include_half=True, include_bfloat16=False,
-                                           include_bool=True, include_complex=True)))
+    @dtypes(*(get_all_dtypes(include_half=True, include_bfloat16=False,
+                             include_bool=True, include_complex=True)))
     def test_all_any_vs_numpy(self, device, dtype):
         # Note [all, any uint8 compatibility]: However for compatibility reason,
         # for `uint8`, they return Tensor of same dtype `uint8`.
@@ -1706,7 +1709,7 @@ def test_minmax_illegal_dtype(self, device):
         with self.assertRaisesRegex(RuntimeError, rmsg):
             torch.min(x, dim=0, out=(illegal_values, illegal_indices))
 
-    @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_complex=False))
+    @dtypes(*get_all_dtypes(include_bool=False, include_complex=False))
     def test_dim_arg_reduction_scalar(self, device, dtype):
         example = 4.0
 
@@ -1724,7 +1727,7 @@ def test_dim_arg_reduction_scalar(self, device, dtype):
 
 
     @precisionOverride({torch.float16: 1e-2, torch.bfloat16: 1e-2})
-    @dtypes(*(set(torch.testing.get_all_dtypes(include_bool=False, include_complex=False)) - {torch.uint8}))
+    @dtypes(*(set(get_all_dtypes(include_bool=False, include_complex=False)) - {torch.uint8}))
     def test_dim_reduction(self, device, dtype):
         example = [[-1, 2, 1], [5, 3, 6]]
 
@@ -2968,8 +2971,8 @@ def test_reduction_empty_any_all(self, device):
         shape = (2, 0, 4)
         x = torch.randn(shape, device=device)
 
-        for dtype in torch.testing.get_all_dtypes(include_half=True, include_bfloat16=False,
-                                                  include_bool=True, include_complex=True):
+        for dtype in get_all_dtypes(include_half=True, include_bfloat16=False,
+                                    include_bool=True, include_complex=True):
             # Refer: [all, any uint8 compatibility]
             if dtype == torch.uint8:
                 out_dtype = torch.uint8
diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py
index cb4ec3c18f82a..3f8c760264709 100644
--- a/test/test_shape_ops.py
+++ b/test/test_shape_ops.py
@@ -13,6 +13,7 @@
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCPU, onlyCUDA, dtypes, onlyOnCPUAndCUDA,
     dtypesIfCPU, dtypesIfCUDA, largeTensorTest)
+from torch.testing._internal.common_dtype import get_all_dtypes
 
 # TODO: replace with make_tensor
 def _generate_input(shape, dtype, device, with_extremal):
@@ -224,9 +225,9 @@ def test_diagonal_multidim(self, device, dtype):
         self.assertEqual(expected, result)
 
     @onlyOnCPUAndCUDA
-    @dtypesIfCPU(*torch.testing.get_all_dtypes(include_complex=False, include_bool=False, include_half=False,
-                                               include_bfloat16=False))
-    @dtypesIfCUDA(*torch.testing.get_all_dtypes(include_complex=False, include_bool=False, include_bfloat16=False))
+    @dtypesIfCPU(*get_all_dtypes(include_complex=False, include_bool=False, include_half=False,
+                                 include_bfloat16=False))
+    @dtypesIfCUDA(*get_all_dtypes(include_complex=False, include_bool=False, include_bfloat16=False))
     def test_trace(self, device, dtype):
         def test(shape):
             tensor = make_tensor(shape, device, dtype, low=-9, high=9)
@@ -338,7 +339,7 @@ def test_clamp_raises_arg_errors(self, device):
         with self.assertRaisesRegex(RuntimeError, error_msg):
             torch.clamp(X)
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_flip(self, device, dtype):
         make_from_data = partial(torch.tensor, device=device, dtype=dtype)
         make_from_size = partial(make_tensor, device=device, dtype=dtype)
@@ -437,7 +438,7 @@ def gen_data():
         for dims in test_dims:
             self.assertEqual(size, list(data.flip(dims).size()))
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_flip_errors(self, device, dtype):
         make_arg = partial(make_tensor, dtype=dtype, device=device)
         data = make_arg((2, 2, 2))
@@ -455,7 +456,7 @@ def test_flip_errors(self, device, dtype):
     def _rand_shape(self, dim, min_size, max_size):
         return tuple(torch.randint(min_size, max_size + 1, (dim,)))
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_flip_numpy(self, device, dtype):
         make_arg = partial(make_tensor, dtype=dtype, device=device)
 
@@ -564,7 +565,7 @@ def test_nonzero_no_warning(self, device):
             t.nonzero()
             self.assertEqual(len(w), 0)
 
-    @dtypes(*torch.testing.get_all_dtypes(include_complex=False))
+    @dtypes(*get_all_dtypes(include_complex=False))
     def test_nonzero(self, device, dtype):
 
         shapes = [
diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py
index e562e389a3fc8..52c32952a6965 100644
--- a/test/test_sort_and_select.py
+++ b/test/test_sort_and_select.py
@@ -5,7 +5,10 @@
 from torch._six import nan
 from itertools import permutations, product
 
-from torch.testing import all_types, all_types_and, make_tensor
+from torch.testing import make_tensor
+from torch.testing._internal.common_dtype import (
+    all_types, all_types_and, floating_types_and, get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes,
+)
 from torch.testing._internal.common_utils import \
     (TEST_WITH_ROCM, TestCase, run_tests, slowTest)
 from torch.testing._internal.common_device_type import \
@@ -128,7 +131,7 @@ def test_sort(self, device):
                                  'random with NaNs')
 
     # FIXME: remove torch.bool from unsupported types once support is added for cub sort
-    @dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128})
+    @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128})
     def test_stable_sort(self, device, dtype):
         if TEST_WITH_ROCM and dtype == torch.bfloat16:
             return
@@ -223,11 +226,11 @@ def test_topk_1d_output_discontiguous(self, device, dtype):
             self.assertEqual(values, values_cont)
 
     # FIXME: remove torch.bool from unsupported types once support is added for cub sort
-    @dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128})
+    @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128})
     def test_stable_sort_against_numpy(self, device, dtype):
         if TEST_WITH_ROCM and dtype == torch.bfloat16:
             return
-        if dtype in torch.testing.floating_types_and(torch.float16, torch.bfloat16):
+        if dtype in floating_types_and(torch.float16, torch.bfloat16):
             inf = float('inf')
             neg_inf = -float('inf')
             nan = float('nan')
@@ -288,7 +291,7 @@ def repeated_index_fill(t, dim, idxs, vals):
             idx_numpy = np.argsort(sample_numpy, axis=dim, kind='stable')
             self.assertEqual(idx_torch, idx_numpy)
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes()))
     def test_msort(self, device, dtype):
         if TEST_WITH_ROCM and dtype == torch.bfloat16:
             return
@@ -634,7 +637,7 @@ def test_topk_bfloat16(self, device, dtype):
         for curr_size in (small, large):
             self._test_topk_dtype(device, dtype, False, curr_size)
 
-    @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes())
+    @dtypesIfCUDA(*get_all_fp_dtypes())
     @dtypes(torch.float, torch.double, torch.bfloat16)
     def test_topk_nonfinite(self, device, dtype):
         if TEST_WITH_ROCM and dtype == torch.bfloat16:
@@ -665,11 +668,11 @@ def test_topk_4d(self, device):
         self.assertEqual(ind, expected_ind, atol=0, rtol=0)
 
     @onlyOnCPUAndCUDA
-    @dtypesIfCUDA(*(torch.testing.get_all_dtypes(include_complex=False,
-                                                 include_bool=False,
-                                                 include_half=False,
-                                                 include_bfloat16=True)))
-    @dtypes(*(torch.testing.get_all_dtypes(include_complex=False, include_bool=False, include_half=False, include_bfloat16=False)))
+    @dtypesIfCUDA(*(get_all_dtypes(include_complex=False,
+                                   include_bool=False,
+                                   include_half=False,
+                                   include_bfloat16=True)))
+    @dtypes(*(get_all_dtypes(include_complex=False, include_bool=False, include_half=False, include_bfloat16=False)))
     def test_topk_zero(self, device, dtype):
         if TEST_WITH_ROCM and dtype == torch.bfloat16:
             return
@@ -726,8 +729,8 @@ def ensure_tuple(x):
                 self.assertEqual(expected_inverse.view(additional_shape), y_inverse)
                 self.assertEqual(expected_counts, y_counts)
 
-    @dtypesIfCPU(*set(torch.testing.get_all_dtypes()) - {torch.complex64, torch.complex128})
-    @dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128})
+    @dtypesIfCPU(*set(get_all_dtypes()) - {torch.complex64, torch.complex128})
+    @dtypes(*set(get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128})
     def test_unique(self, device, dtype):
         if dtype is torch.half and self.device_type == 'cpu':
             return  # CPU does not have half support
@@ -786,8 +789,8 @@ def ensure_tuple(x):
                                 count += 1
                         self.assertEqual(j, count)
 
-    @dtypesIfCPU(*set(torch.testing.get_all_dtypes()) - {torch.complex64, torch.complex128})
-    @dtypes(*set(torch.testing.get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128})
+    @dtypesIfCPU(*set(get_all_dtypes()) - {torch.complex64, torch.complex128})
+    @dtypes(*set(get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128})
     def test_unique_consecutive(self, device, dtype):
         if dtype is torch.half and self.device_type == 'cpu':
             return  # CPU does not have half support
diff --git a/test/test_sparse.py b/test/test_sparse.py
index fb0a660333583..f9ed0dc11ffbd 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -16,6 +16,9 @@
     (instantiate_device_type_tests, ops, dtypes, dtypesIfCPU, onlyCPU, onlyCUDA, deviceCountAtLeast)
 from torch.testing._internal.common_methods_invocations import \
     (sparse_unary_ufuncs)
+from torch.testing._internal.common_dtype import (
+    floating_and_complex_types, floating_and_complex_types_and, get_all_dtypes, get_all_int_dtypes,
+)
 
 if TEST_SCIPY:
     import scipy.sparse
@@ -286,7 +289,7 @@ def test_ctor_size_checks(self, device, dtype):
             RuntimeError,
             lambda: self.sparse_tensor(indices, values, torch.Size([2, 4, 2, 1])))
 
-    @dtypes(*torch.testing.floating_and_complex_types_and(torch.float16))
+    @dtypes(*floating_and_complex_types_and(torch.float16))
     def test_to_dense(self, device, dtype):
         def test_tensor(x, res):
             x.to_dense()  # Tests triple to_dense for memory corruption
@@ -1943,7 +1946,7 @@ def test_narrow(self, device, dtype, coalesced):
 
     def _test_log1p_tensor(self, sparse_tensor, coalesced):
         def is_integral(dtype):
-            return dtype in torch.testing.get_all_int_dtypes()
+            return dtype in get_all_int_dtypes()
 
         dense_tensor = sparse_tensor.to_dense()
         expected_output = dense_tensor.log1p()
@@ -1977,8 +1980,8 @@ def is_integral(dtype):
                 sparse_tensor.requires_grad_()
 
     @coalescedonoff
-    @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False,
-                                          include_bfloat16=False, include_complex=False))
+    @dtypes(*get_all_dtypes(include_bool=False, include_half=False,
+                            include_bfloat16=False, include_complex=False))
     def test_log1p(self, device, dtype, coalesced):
         if coalesced:
             input_coalesced = torch.sparse_coo_tensor(
@@ -2086,7 +2089,7 @@ def test_neg_negative(self, device, dtype, coalesced):
 
     def _test_asin_arcsin(self, sparse_tensor, coalesced):
         def is_integral(dtype):
-            return dtype in torch.testing.get_all_int_dtypes()
+            return dtype in get_all_int_dtypes()
         is_integral_dtype = is_integral(sparse_tensor.dtype)
 
         dense_tensor = sparse_tensor.to_dense()
@@ -2125,8 +2128,8 @@ def is_integral(dtype):
                     op(sparse_tensor)
 
     @coalescedonoff
-    @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False,
-                                          include_bfloat16=False, include_complex=False))
+    @dtypes(*get_all_dtypes(include_bool=False, include_half=False,
+                            include_bfloat16=False, include_complex=False))
     def test_asin_arcsin(self, device, dtype, coalesced):
         if coalesced:
             input_coalesced = torch.sparse_coo_tensor(
@@ -2196,7 +2199,7 @@ def test_shape(di, dj, dk, nnz):
             y, _, _ = self._gen_sparse(2, 20, [10, 100], dtype, device, coalesced)
             res = x.mv(y)
 
-    @dtypes(*torch.testing.floating_and_complex_types())
+    @dtypes(*floating_and_complex_types())
     def test_sparse_add_coalesce(self, device, dtype):
         i = self.index_tensor([[1, 2, 1]], device=device)
         v = torch.tensor([3, 4, 5], dtype=dtype, device=device)
@@ -2613,14 +2616,14 @@ def test_legacy_new(self, device):
 
     @onlyCPU  # not really, but we only really want to run this once
     def test_dtypes(self, device):
-        all_sparse_dtypes = torch.testing.get_all_dtypes(include_complex=True)
+        all_sparse_dtypes = get_all_dtypes(include_complex=True)
         do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu'))
         if torch.cuda.is_available():
             do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cuda:0'))
 
     @onlyCPU  # not really, but we only really want to run this once
     def test_empty_full(self, device):
-        all_sparse_dtypes = torch.testing.get_all_dtypes(include_complex=True)
+        all_sparse_dtypes = get_all_dtypes(include_complex=True)
         do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu'))
         if torch.cuda.device_count() > 0:
             do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, None)
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index fbb2b30e46304..af99fa031fca3 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -9,6 +9,7 @@
     (IS_MACOS, IS_WINDOWS, TestCase, run_tests, load_tests, coalescedonoff)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, dtypes, onlyCPU, onlyCUDA)
+from torch.testing._internal.common_dtype import floating_types, get_all_dtypes
 
 # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for
 # sharding on sandcastle. This line silences flake warnings
@@ -43,7 +44,7 @@ def test_csr_layout(self):
         self.assertEqual(str(torch.sparse_csr), 'torch.sparse_csr')
         self.assertEqual(type(torch.sparse_csr), torch.layout)
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_sparse_csr_constructor_shape_inference(self, device, dtype):
         crow_indices = [0, 2, 4]
         col_indices = [0, 1, 0, 1]
@@ -56,7 +57,7 @@ def test_sparse_csr_constructor_shape_inference(self, device, dtype):
         self.assertEqual(dtype, sparse.dtype)
         self.assertEqual(torch.device(device), sparse.device)
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_sparse_csr_constructor(self, device, dtype):
         crow_indices = [0, 2, 4]
         col_indices = [0, 1, 0, 1]
@@ -73,7 +74,7 @@ def test_sparse_csr_constructor(self, device, dtype):
             self.assertEqual(torch.tensor(col_indices, dtype=index_dtype), sparse.col_indices())
             self.assertEqual(torch.tensor(values, dtype=dtype), sparse.values())
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_sparse_csr_constructor_from_lists(self, device, dtype):
         # without size
         sparse = torch.sparse_csr_tensor([0, 2, 4],
@@ -209,7 +210,7 @@ def test_factory_indices_invariants_check(self, device):
                                     device=device)
 
     @onlyCUDA
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_factory_device_type_inference(self, device, dtype):
         cpu_cuda = ('cpu', 'cuda')
         cpu_cuda_none = cpu_cuda + (None,)
@@ -262,7 +263,7 @@ def test_sparse_csr_print(self, device):
             printed.append("# col_indices shape: {}".format(col_indices_shape))
             printed.append("# values_shape: {}".format(values_shape))
             for index_dtype in [torch.int32, torch.int64]:
-                for dtype in torch.testing.floating_types():
+                for dtype in floating_types():
                     printed.append("########## {}/{} ##########".format(dtype, index_dtype))
                     x = torch.sparse_csr_tensor(torch.tensor([0, 2, 4], dtype=index_dtype),
                                                 torch.tensor([0, 1, 0, 1], dtype=index_dtype),
@@ -280,7 +281,7 @@ def test_sparse_csr_print(self, device):
         self.assertExpected('\n'.join(printed))
         self.maxDiff = orig_maxDiff
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_sparse_csr_from_dense(self, device, dtype):
         dense = torch.tensor([[4, 5, 0], [0, 0, 0], [1, 0, 0]], dtype=dtype, device=device)
         sparse = dense.to_sparse_csr()
@@ -300,7 +301,7 @@ def test_sparse_csr_from_dense(self, device, dtype):
         self.assertEqual(torch.tensor([0, 1, 2] * 3, dtype=torch.int64), sparse.col_indices())
         self.assertEqual(torch.tensor([2] * 9, dtype=dtype), sparse.values())
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_sparse_csr_to_dense(self, device, dtype):
         mn = [5, 2, 0]
         for (m, n) in itertools.product(mn, mn):
@@ -377,7 +378,7 @@ def test_mkl_matvec_warnings(self, device, dtype):
                     self.assertIn("Pytorch is compiled with MKL LP64 and will convert col_indices to int32",
                                   str(w[1].message))
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_sparse_csr_from_dense_convert_error(self, device, dtype):
         size = (4, 2, 4)
         dense = make_tensor(size, dtype=dtype, device=device)
@@ -445,7 +446,7 @@ def test_shape(di, dj, dk, nnz):
                     test_shape(i, j, k, i * j // 2)
         test_shape(4, 4, 4, 0)
 
-    @dtypes(*torch.testing.floating_types())
+    @dtypes(*floating_types())
     def test_sparse_mm(self, device, dtype):
         def test_shape(d1, d2, d3, nnz, transposed):
             if transposed:
@@ -459,7 +460,7 @@ def test_shape(d1, d2, d3, nnz, transposed):
         test_shape(7, 8, 9, 20, False)
         test_shape(7, 8, 9, 20, True)
 
-    @dtypes(*torch.testing.floating_types())
+    @dtypes(*floating_types())
     def test_sparse_addmm(self, device, dtype):
         def test_shape(m, n, p, nnz, broadcast, alpha_beta=None):
             if alpha_beta is None:
@@ -514,7 +515,7 @@ def _test_spadd_shape(nnz, shape):
         _test_spadd_shape(10, [100, 1])
         _test_spadd_shape(10, [1, 100])
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_coo_csr_conversion(self, device, dtype):
         for m, n in itertools.product([5, 2, 0], [5, 2, 0]):
             size = (m, n)
diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
index a7496919eaeb8..4a2216d230203 100644
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@@ -17,6 +17,9 @@
     instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA,
     onlyCPU, largeTensorTest, precisionOverride, dtypes,
     onlyCUDA, skipCPUIf, dtypesIfCUDA, dtypesIfCPU, skipMeta)
+from torch.testing._internal.common_dtype import (
+    get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes
+)
 
 # TODO: refactor tri_tests_args, _compare_trilu_indices, run_additional_tri_tests
 from torch.testing._internal.common_methods_invocations import (
@@ -140,7 +143,7 @@ def test_vander_types(self, device, dtype):
                 exact_dtype=False)
 
     def test_cat_all_dtypes_and_devices(self, device):
-        for dt in torch.testing.get_all_dtypes():
+        for dt in get_all_dtypes():
             x = torch.tensor([[1, 2], [3, 4]], dtype=dt, device=device)
 
             expected1 = torch.tensor([[1, 2], [3, 4], [1, 2], [3, 4]], dtype=dt, device=device)
@@ -150,7 +153,7 @@ def test_cat_all_dtypes_and_devices(self, device):
             self.assertEqual(torch.cat((x, x), 1), expected2)
 
     def test_fill_all_dtypes_and_devices(self, device):
-        for dt in torch.testing.get_all_dtypes():
+        for dt in get_all_dtypes():
             for x in [torch.tensor((10, 10), dtype=dt, device=device),
                       torch.empty(10000, dtype=dt, device=device)]:  # large tensor
                 numel = x.numel()
@@ -304,7 +307,7 @@ def run_test(shape, device, diagonal, dtype):
                   (3, 1), (5, 3, 1), (7, 5, 3, 1),  # very fat matrices
                   (1, 3), (5, 1, 3), (7, 5, 1, 3),  # very thin matrices
                   (1, 3, 3, 3), (3, 1, 3, 3, 3)]    # unsqueezed batch dimensions
-        dtypes = [dtype for dtype in torch.testing.get_all_dtypes() if dtype != torch.bfloat16]
+        dtypes = [dtype for dtype in get_all_dtypes() if dtype != torch.bfloat16]
         for s, d, dtype in product(shapes, diagonals, dtypes):
             run_test(s, device, d, dtype)
 
@@ -987,8 +990,8 @@ def _test_special_stacks(self, dim, at_least_dim, torch_fn, np_fn, device, dtype
                         np_fn(np_input)
 
     @onlyOnCPUAndCUDA
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) +
-              torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) +
+              get_all_complex_dtypes()))
     def test_hstack_column_stack(self, device, dtype):
         ops = ((torch.hstack, np.hstack), (torch.column_stack, np.column_stack))
         for torch_op, np_op in ops:
@@ -1007,8 +1010,8 @@ def test_hstack_column_stack(self, device, dtype):
                          torch_result)
 
     @onlyOnCPUAndCUDA
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) +
-              torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) +
+              get_all_complex_dtypes()))
     def test_vstack_row_stack(self, device, dtype):
         ops = ((torch.vstack, np.vstack), (torch.row_stack, np.row_stack))
         for torch_op, np_op in ops:
@@ -1025,8 +1028,8 @@ def test_vstack_row_stack(self, device, dtype):
                 self.assertEqual(actual, expected)
 
     @onlyOnCPUAndCUDA
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) +
-              torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) +
+              get_all_complex_dtypes()))
     def test_dstack(self, device, dtype):
         self._test_special_stacks(2, 3, torch.dstack, np.dstack, device, dtype)
         for i in range(5):
@@ -1572,7 +1575,7 @@ def test_random_from_to_bool(self, device):
                         lambda: t.random_(from_, to_)
                     )
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes()))
     def test_random_full_range(self, device, dtype):
         size = 2000
         alpha = 0.1
@@ -1606,7 +1609,7 @@ def test_random_full_range(self, device, dtype):
         self.assertTrue(from_ <= t.to(torch.double).min() < (from_ + delta))
         self.assertTrue((to_inc_ - delta) < t.to(torch.double).max() <= to_inc_)
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes()))
     def test_random_from_to(self, device, dtype):
         size = 2000
         alpha = 0.1
@@ -1695,7 +1698,7 @@ def test_random_from_to(self, device, dtype):
                         lambda: t.random_(from_, to_)
                     )
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes()))
     def test_random_to(self, device, dtype):
         size = 2000
         alpha = 0.1
@@ -1753,7 +1756,7 @@ def test_random_to(self, device, dtype):
                     lambda: t.random_(from_, to_)
                 )
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes()))
     def test_random_default(self, device, dtype):
         size = 2000
         alpha = 0.1
@@ -1781,10 +1784,10 @@ def test_empty_full(self, device):
         device_type = torch_device.type
 
         if device_type == 'cpu':
-            do_test_empty_full(self, torch.testing.get_all_math_dtypes('cpu'), torch.strided, torch_device)
+            do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, torch_device)
         if device_type == 'cuda':
-            do_test_empty_full(self, torch.testing.get_all_math_dtypes('cpu'), torch.strided, None)
-            do_test_empty_full(self, torch.testing.get_all_math_dtypes('cpu'), torch.strided, torch_device)
+            do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, None)
+            do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, torch_device)
 
     # TODO: this test should be updated
     @suppress_warnings
@@ -2472,7 +2475,7 @@ def test_empty_tensor_props(self, device):
             self.assertEqual(x.stride(), y.stride())
 
     def test_eye(self, device):
-        for dtype in torch.testing.get_all_dtypes():
+        for dtype in get_all_dtypes():
             if dtype == torch.bfloat16:
                 continue
             # Test the RuntimeError is raised when either m or n is a negative number
@@ -2505,8 +2508,8 @@ def test_eye(self, device):
                 self.assertEqual(res1, res2)
 
     @precisionOverride({torch.float: 1e-8, torch.double: 1e-10})
-    @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False) +
-              torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False) +
+              get_all_complex_dtypes()))
     def test_linspace_vs_numpy(self, device, dtype):
         start = -0.0316082797944545745849609375 + (0.8888888888j if dtype.is_complex else 0)
         end = .0315315723419189453125 + (0.444444444444j if dtype.is_complex else 0)
@@ -2543,7 +2546,7 @@ def test_logspace_vs_numpy_complex(self, device, dtype):
                                                     device, dtype)
 
     @precisionOverride({torch.float: 1e-6, torch.double: 1e-10})
-    @dtypes(*torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False))
+    @dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False))
     def test_logspace_vs_numpy(self, device, dtype):
         start = -0.0316082797944545745849609375
         end = .0315315723419189453125
@@ -2653,7 +2656,7 @@ def test_tensor_factories_empty(self, device):
         shapes = [(5, 0, 1), (0,), (0, 0, 1, 0, 2, 0, 0)]
 
         for shape in shapes:
-            for dt in torch.testing.get_all_dtypes():
+            for dt in get_all_dtypes():
 
                 self.assertEqual(shape, torch.zeros(shape, device=device, dtype=dt).shape)
                 self.assertEqual(shape, torch.zeros_like(torch.zeros(shape, device=device, dtype=dt)).shape)
@@ -2739,8 +2742,8 @@ def test_arange_bfloat16(self, device):
         bfloat16_tensor = torch.arange(0, 6, step=2, dtype=torch.bfloat16, device=device)
         self.assertEqual(ref_tensor, bfloat16_tensor)
 
-    @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False))
-    @dtypesIfCUDA(*torch.testing.get_all_dtypes(include_bool=False, include_half=True))
+    @dtypes(*get_all_dtypes(include_bool=False, include_half=False))
+    @dtypesIfCUDA(*get_all_dtypes(include_bool=False, include_half=True))
     def test_linspace(self, device, dtype):
         _from = random.random()
         to = _from + random.random()
@@ -2854,12 +2857,12 @@ def _test_linspace(self, device, dtype, steps):
     # See NOTE [Linspace+Logspace precision override]
     @skipCPUIf(True, "compares with CPU")
     @precisionOverride({torch.half: 0.0039 + LINSPACE_LOGSPACE_EXTRA_EPS})
-    @dtypes(*(torch.testing.get_all_fp_dtypes() + torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes()))
     def test_linspace_device_vs_cpu(self, device, dtype):
         self._test_linspace(device, dtype, steps=10)
 
     @skipCPUIf(True, "compares with CPU")
-    @dtypes(*(torch.testing.get_all_fp_dtypes() + torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes()))
     def test_linspace_special_steps(self, device, dtype):
         for steps in self.LINSPACE_LOGSPACE_SPECIAL_STEPS:
             self._test_linspace(device, dtype, steps=steps)
@@ -2900,10 +2903,10 @@ def test_logspace_special_steps(self, device, dtype):
             self._test_logspace(device, dtype, steps=steps)
             self._test_logspace_base2(device, dtype, steps=steps)
 
-    @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False, include_complex=False))
-    @dtypesIfCUDA(*((torch.testing.get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16])
+    @dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_complex=False))
+    @dtypesIfCUDA(*((get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16])
                     if TEST_WITH_ROCM
-                    else torch.testing.get_all_dtypes(include_bool=False, include_half=True, include_complex=False)))
+                    else get_all_dtypes(include_bool=False, include_half=True, include_complex=False)))
     def test_logspace(self, device, dtype):
         _from = random.random()
         to = _from + random.random()
diff --git a/test/test_testing.py b/test/test_testing.py
index a5ea232122e08..e45977f3a855e 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -19,12 +19,13 @@
      deviceCountAtLeast)
 from torch.testing._internal.common_methods_invocations import op_db
 import torch.testing._internal.opinfo_helper as opinfo_helper
+from torch.testing._internal.common_dtype import get_all_dtypes
 
 # For testing TestCase methods and torch.testing functions
 class TestTesting(TestCase):
     # Ensure that assertEqual handles numpy arrays properly
-    @dtypes(*(torch.testing.get_all_dtypes(include_half=True, include_bfloat16=False,
-                                           include_bool=True, include_complex=True)))
+    @dtypes(*(get_all_dtypes(include_half=True, include_bfloat16=False,
+                             include_bool=True, include_complex=True)))
     def test_assertEqual_numpy(self, device, dtype):
         S = 10
         test_sizes = [
diff --git a/test/test_torch.py b/test/test_torch.py
index ae75ee8d66044..6de409be60d1d 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -48,6 +48,9 @@
 import torch.backends.quantized
 import torch.testing._internal.data
 from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32
+from torch.testing._internal.common_dtype import (
+    get_all_fp_dtypes, get_all_int_dtypes, get_all_math_dtypes, get_all_dtypes, get_all_complex_dtypes
+)
 
 # Protects against includes accidentally setting the default dtype
 assert torch.get_default_dtype() is torch.float32
@@ -274,8 +277,8 @@ def get_tensor(size, dtype, device, contiguous):
             height = 5
             width = 5
             for device in torch.testing.get_all_device_types():
-                for dt1 in torch.testing.get_all_dtypes():
-                    for dt2 in torch.testing.get_all_dtypes():
+                for dt1 in get_all_dtypes():
+                    for dt2 in get_all_dtypes():
                         for contiguous in [True, False]:
                             x1 = get_tensor((height, width), dt1, device, contiguous)
                             x2 = get_tensor((height, width), dt2, device, contiguous)
@@ -293,14 +296,14 @@ def get_tensor(size, dtype, device, contiguous):
                                 self.assertEqual(expected, result)
 
         def test_dtypes(self):
-            all_dtypes = torch.testing.get_all_dtypes()
+            all_dtypes = get_all_dtypes()
             do_test_dtypes(self, all_dtypes, torch.strided, torch.device('cpu'))
             if torch.cuda.is_available():
                 all_dtypes.remove(torch.bfloat16)  # Remove once _th_zero_ is enabled on cuda for bfloat16
                 do_test_dtypes(self, all_dtypes, torch.strided, torch.device('cuda:0'))
 
         def test_copy_dtypes(self):
-            all_dtypes = torch.testing.get_all_dtypes()
+            all_dtypes = get_all_dtypes()
             for dtype in all_dtypes:
                 copied_dtype = copy.deepcopy(dtype)
                 self.assertIs(dtype, copied_dtype)
@@ -722,7 +725,7 @@ def reference(x, k, o3, o32):
             self._test_conv_corr_eq(lambda x, k: torch.conv3(x, k, 'F'), reference)
 
         def test_dtype_is_signed(self):
-            for dtype in torch.testing.get_all_dtypes():
+            for dtype in get_all_dtypes():
                 self.assertEqual(dtype.is_signed, torch.is_signed(torch.tensor(0, dtype=dtype)))
 
             self.assertRaisesRegex(RuntimeError, 'not supported for quantized', lambda: torch.quint8.is_signed)
@@ -959,7 +962,7 @@ def test_index_add(self):
         # https://github.com/pytorch/pytorch/issues/29153
         def test_index_add_all_dtypes(self):
             for device in torch.testing.get_all_device_types():
-                for dtype in torch.testing.get_all_math_dtypes(device):
+                for dtype in get_all_math_dtypes(device):
                     for idx_dtype in [torch.int, torch.long]:
                         size = [5, 5]
                         if dtype.is_floating_point or dtype.is_complex:
@@ -4297,13 +4300,13 @@ def _cond_fn(x):
             _sync_raises_helper(f, level)
 
 
-    @dtypes(*torch.testing.get_all_fp_dtypes())
+    @dtypes(*get_all_fp_dtypes())
     def test_log_normal(self, device, dtype):
         a = torch.tensor([10], dtype=dtype, device=device).log_normal_()
         self.assertEqual(a.dtype, dtype)
         self.assertEqual(a.size(), torch.Size([1]))
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes()))
     def test_geometric(self, device, dtype):
         a = torch.tensor([10], dtype=dtype, device=device).geometric_(0.5)
         self.assertEqual(a.dtype, dtype)
@@ -4335,9 +4338,9 @@ def test_repeat_interleave(self, device):
             self.assertEqual(a_with_output.dtype, y.dtype)
             self.assertEqual(a_with_output.size(), torch.Size([3, 2]))
 
-    @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False)))
-    @dtypesIfCPU(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=True)))
-    @dtypesIfCUDA(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False)))
+    @dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False))
+    @dtypesIfCPU(*(get_all_fp_dtypes(include_half=False, include_bfloat16=True)))
+    @dtypesIfCUDA(*(get_all_fp_dtypes(include_bfloat16=False)))
     def test_bernoulli_p(self, device, dtype):
         for trivial_p in ([0, 1], [1, 0, 1, 1, 0, 1]):
             x = torch.tensor(trivial_p, dtype=dtype, device=device)
@@ -4357,9 +4360,9 @@ def isBinary(t):
         self.assertTrue(isBinary(p))
 
     # RngUniform not implemented for Integral type in XLA test
-    @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False)))
-    @dtypesIfCPU(*(torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, include_complex=False)))
-    @dtypesIfCUDA(*(torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False)))
+    @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False)))
+    @dtypesIfCPU(*(get_all_dtypes(include_half=False, include_bfloat16=False, include_complex=False)))
+    @dtypesIfCUDA(*(get_all_dtypes(include_bfloat16=False, include_complex=False)))
     def test_bernoulli_self(self, device, dtype):
 
         def isBinary(t):
@@ -4371,8 +4374,7 @@ def isBinary(t):
         t.bernoulli_(0.5)
         self.assertTrue(isBinary(t))
 
-        for p_dtype in torch.testing.get_all_fp_dtypes(include_half=device.startswith('cuda'),
-                                                       include_bfloat16=False):
+        for p_dtype in get_all_fp_dtypes(include_half=device.startswith('cuda'), include_bfloat16=False):
             p = torch.rand(10, dtype=p_dtype, device=device).expand(10, 10)
             t.fill_(2)
             t.bernoulli_(p)
@@ -4387,8 +4389,8 @@ def isBinary(t):
             self.assertTrue(isBinary(t))
 
     @slowTest
-    @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False, include_bfloat16=False)))
-    @dtypesIfCUDA(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False)))
+    @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False)))
+    @dtypesIfCUDA(*(get_all_fp_dtypes(include_bfloat16=False)))
     def test_bernoulli_edge_cases(self, device, dtype):
         # Need to draw a lot of samples to cover every random floating point number.
         a = torch.zeros(10000, 10000, dtype=dtype, device=device)  # probability of drawing "1" is 0
@@ -4399,7 +4401,7 @@ def test_bernoulli_edge_cases(self, device, dtype):
         num_zeros = (torch.bernoulli(b) == 0).sum()
         self.assertEqual(num_zeros, 0)
 
-    @dtypes(*torch.testing.get_all_fp_dtypes())
+    @dtypes(*get_all_fp_dtypes())
     def test_exponential(self, device, dtype):
         a = torch.tensor([10], dtype=dtype, device=device).exponential_(0.5)
         self.assertEqual(a.dtype, dtype)
@@ -4482,7 +4484,7 @@ def check(msg, *args, **kwargs):
         check(r'aweights cannot be negative', a, aweights=torch.tensor([-1., -2.]))
 
     @skipIfNoSciPy
-    @dtypes(*torch.testing.get_all_fp_dtypes())
+    @dtypes(*get_all_fp_dtypes())
     def test_uniform_kstest(self, device, dtype):
         from scipy import stats
         size = 1000
@@ -4494,8 +4496,8 @@ def test_uniform_kstest(self, device, dtype):
                     self.assertTrue(res.statistic < 0.1)
 
     @skipIfNoSciPy
-    @dtypes(*torch.testing.get_all_fp_dtypes(include_bfloat16=False))
-    @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes())
+    @dtypes(*get_all_fp_dtypes(include_bfloat16=False))
+    @dtypesIfCUDA(*get_all_fp_dtypes())
     def test_normal_kstest(self, device, dtype):
         from scipy import stats
         size = 1000
@@ -4506,7 +4508,7 @@ def test_normal_kstest(self, device, dtype):
                 self.assertTrue(res.statistic < 0.1)
 
     @skipIfNoSciPy
-    @dtypes(*torch.testing.get_all_fp_dtypes())
+    @dtypes(*get_all_fp_dtypes())
     def test_lognormal_kstest(self, device, dtype):
         from scipy import stats
         size = 1000
@@ -4520,7 +4522,7 @@ def test_lognormal_kstest(self, device, dtype):
                     self.assertTrue(res.statistic < 0.1)
 
     @skipIfNoSciPy
-    @dtypes(*torch.testing.get_all_fp_dtypes())
+    @dtypes(*get_all_fp_dtypes())
     def test_exponential_kstest(self, device, dtype):
         from scipy import stats
         size = 1000
@@ -4530,7 +4532,7 @@ def test_exponential_kstest(self, device, dtype):
             self.assertTrue(res.statistic < 0.1)
 
     @skipIfNoSciPy
-    @dtypes(*torch.testing.get_all_fp_dtypes())
+    @dtypes(*get_all_fp_dtypes())
     def test_cauchy_kstest(self, device, dtype):
         from scipy import stats
         size = 1000
@@ -4551,7 +4553,7 @@ def test_cauchy_no_inf(self, device, dtype):
             self.assertFalse(x.isinf().sum())
 
     @skipIfNoSciPy
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes()))
     def test_geometric_kstest(self, device, dtype):
         from scipy import stats
         size = 1000
@@ -5005,7 +5007,7 @@ def to_np(t):
 
     # All tensors appear contiguous on XLA
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_diff_noncontig(self, device, dtype):
         shapes = (
             (1,),
@@ -5025,9 +5027,9 @@ def test_diff_noncontig(self, device, dtype):
             self._test_diff_numpy(non_contig)
 
     # RngNormal not implemented for type f16 for XLA
-    @dtypes(*torch.testing.get_all_dtypes(include_half=False))
-    @dtypesIfCPU(*torch.testing.get_all_dtypes())
-    @dtypesIfCUDA(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes(include_half=False))
+    @dtypesIfCPU(*get_all_dtypes())
+    @dtypesIfCUDA(*get_all_dtypes())
     def test_diff(self, device, dtype):
         shapes = (
             (1,),
@@ -5298,7 +5300,7 @@ def test_bool_tensor_value_change(self, device):
         self.assertEqual(x, torch.tensor([False, True], dtype=torch.bool, device=device))
 
     def test_unfold_all_devices_and_dtypes(self, device):
-        for dt in torch.testing.get_all_dtypes():
+        for dt in get_all_dtypes():
 
             if dt == torch.bool:
                 x = torch.empty((0, 1, 3, 0), dtype=dt, device=device)
@@ -5318,7 +5320,7 @@ def test_unfold_scalars(self, device):
 
     def test_copy_all_dtypes_and_devices(self, device):
         from copy import copy
-        for dt in torch.testing.get_all_dtypes():
+        for dt in get_all_dtypes():
             x = torch.tensor([1, 2, 3, 4], dtype=dt, device=device)
             x_clone = x.clone()
             y = copy(x)
@@ -5328,7 +5330,7 @@ def test_copy_all_dtypes_and_devices(self, device):
             self.assertEqual(x, y)
 
     def test_clone_all_dtypes_and_devices(self, device):
-        for dt in torch.testing.get_all_dtypes():
+        for dt in get_all_dtypes():
             x = torch.tensor((1, 1), dtype=dt, device=device)
             y = x.clone()
             self.assertEqual(x, y)
@@ -5346,8 +5348,8 @@ def test_clone_not_memory_dense(self):
         # should retain permutation after densification
         self.assertTrue(y.stride() == (1, 4))
 
-    @dtypesIfCUDA(*set(torch.testing.get_all_math_dtypes('cuda')))
-    @dtypes(*set(torch.testing.get_all_math_dtypes('cpu')))
+    @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')))
+    @dtypes(*set(get_all_math_dtypes('cpu')))
     def test_addcmul(self, device, dtype):
         # Returns floating or integral scalar corresponding to dtype
         def _number(floating, integer, dtype):
@@ -5396,7 +5398,7 @@ def test_narrow_empty(self, device):
             sz[d] = 0
             self.assertEqual(sz, y.size())
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_index_copy(self, device, dtype):
         # We just test for num_copy <= num_dest, as otherwise there are repeated indices
         # and the behavior is undefined
@@ -5430,7 +5432,7 @@ def ref_index_copy(tgt, dim, idx, src):
     # onlyOnCPUAndCUDA due to an XLA error:
     # https://github.com/pytorch/pytorch/issues/53256
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_index_copy_scalars(self, device, dtype):
         # Create the 8 possible combinations of scalar sizes for target / index / source
         scalars = ((make_tensor(size_t, dtype=dtype, device=device, low=None, high=None),
@@ -5535,7 +5537,7 @@ def test_index_put_non_accumulate_deterministic(self, device) -> None:
 
                 self.assertEqual(output, input_list)
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_index_fill(self, device, dtype):
         x = torch.tensor([[1, 2], [4, 5]], dtype=dtype, device=device)
         index = torch.tensor([0], device=device)
@@ -5552,7 +5554,7 @@ def test_index_fill(self, device, dtype):
 
     # The test fails for zero-dimensional tensors on XLA
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_index_select(self, device, dtype):
         num_src, num_out = 3, 5
 
@@ -5595,7 +5597,7 @@ def ref_index_select(src, dim, idx):
             out = source.index_select(0, idx)
             self.assertEqual(out.item(), source.item())
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_take(self, device, dtype):
         idx_size = (4,)
 
@@ -5629,7 +5631,7 @@ def ref_take(src, idx):
 
     # The bool instance does not work on GPU. See
     # https://github.com/pytorch/pytorch/issues/54317
-    @dtypes(*torch.testing.get_all_dtypes(include_bool=False))
+    @dtypes(*get_all_dtypes(include_bool=False))
     def test_put(self, device, dtype):
         src_size = (4,)
 
@@ -5699,7 +5701,7 @@ def ref_put(dst, idx, src, accumulate):
 
     # The bool instance does not work on GPU. See
     # https://github.com/pytorch/pytorch/issues/54317
-    @dtypes(*torch.testing.get_all_dtypes(include_bool=False))
+    @dtypes(*get_all_dtypes(include_bool=False))
     def test_put_accumulate(self, device, dtype):
         # Test for parallel adds with accumulate == True
         low_precision = dtype == torch.half or dtype == torch.bfloat16
@@ -5742,10 +5744,10 @@ def scatter_allow_reduce(self, device, dtype, reduceop):
 
     # torch.{zeros, ones} do not support ComplexHalf (torch.complex32)
     # So, we are skipping it here.
-    @dtypes(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False, include_half=False) +
-              torch.testing.get_all_complex_dtypes()))
-    @dtypesIfCPU(*torch.testing.get_all_dtypes())
-    @dtypesIfCUDA(*torch.testing.get_all_dtypes())
+    @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) +
+              get_all_complex_dtypes()))
+    @dtypesIfCPU(*get_all_dtypes())
+    @dtypesIfCUDA(*get_all_dtypes())
     def test_scatter_reduce_operations_to_large_input(self, device, dtype):
         index = torch.tensor([[1], [2]], device=device, dtype=torch.long)
         test_data = [
@@ -5772,10 +5774,10 @@ def test_scatter_reduce_operations_to_large_input(self, device, dtype):
 
     # torch.{zeros, ones} do not support ComplexHalf (torch.complex32)
     # So, we are skipping it here.
-    @dtypes(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False, include_half=False) +
-              torch.testing.get_all_complex_dtypes()))
-    @dtypesIfCPU(*torch.testing.get_all_dtypes())
-    @dtypesIfCUDA(*torch.testing.get_all_dtypes())
+    @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) +
+              get_all_complex_dtypes()))
+    @dtypesIfCPU(*get_all_dtypes())
+    @dtypesIfCUDA(*get_all_dtypes())
     def test_scatter_reduce_scalar(self, device, dtype):
         index = torch.tensor([[1], [2]], device=device, dtype=torch.long)
         test_data = [
@@ -5813,10 +5815,10 @@ def test_scatter_add_non_unique_index(self, device):
 
     # torch.{zeros, ones} do not support ComplexHalf (torch.complex32)
     # So, we are skipping it here.
-    @dtypes(*(torch.testing.get_all_fp_dtypes(include_bfloat16=False, include_half=False) +
-              torch.testing.get_all_complex_dtypes()))
-    @dtypesIfCPU(*torch.testing.get_all_dtypes())
-    @dtypesIfCUDA(*torch.testing.get_all_dtypes())
+    @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) +
+              get_all_complex_dtypes()))
+    @dtypesIfCPU(*get_all_dtypes())
+    @dtypesIfCUDA(*get_all_dtypes())
     def test_scatter_reduce_non_unique_index(self, device, dtype):
         height = 2
         width = 2
@@ -5840,8 +5842,8 @@ def test_scatter_reduce_non_unique_index(self, device, dtype):
     # torch.{zeros, ones} do not support ComplexHalf (torch.complex32)
     # So, we are skipping it here.
     @onlyCUDA
-    @dtypesIfCUDA(*(torch.testing.get_all_complex_dtypes() +
-                    torch.testing.get_all_int_dtypes()))
+    @dtypesIfCUDA(*(get_all_complex_dtypes() +
+                    get_all_int_dtypes()))
     def test_scatter_reduce_multiply_unsupported_dtypes(self, device, dtype):
         height = 2
         width = 2
@@ -5888,7 +5890,7 @@ def test_scatter_add_bool(self, device):
                                             [True, False, True, False, True]], device=device))
 
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_masked_scatter(self, device, dtype):
         dt = dtype
         with warnings.catch_warnings(record=True) as w:
@@ -5973,7 +5975,7 @@ def test_masked_scatter_large_tensor(self, device):
         result = t.masked_scatter(t, t)
         self.assertEqual(result, result_cpu)
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_masked_select(self, device, dtype):
         if device == 'cpu':
             warn = 'masked_select received a mask with dtype torch.uint8,'
@@ -6039,7 +6041,7 @@ def test_masked_select_discontiguous(self, device):
                 torch.masked_select(v, m, out=out_dc)
                 self.assertEqual(out_dc, expected, atol=0, rtol=0)
 
-    @dtypes(*product(torch.testing.get_all_dtypes(), (torch.uint8, torch.bool)))
+    @dtypes(*product(get_all_dtypes(), (torch.uint8, torch.bool)))
     def test_masked_fill(self, device, dtypes):
         dtype = dtypes[0]
         mask_dtype = dtypes[1]
@@ -6349,8 +6351,8 @@ def test_pdist_norm_large(self, device):
             self.assertEqual(expected_cpu, actual_gpu.cpu())
 
     @onlyOnCPUAndCUDA
-    @dtypesIfCUDA(*set(torch.testing.get_all_math_dtypes('cuda')))
-    @dtypes(*set(torch.testing.get_all_math_dtypes('cpu')))
+    @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')))
+    @dtypes(*set(get_all_math_dtypes('cpu')))
     def test_addcdiv(self, device, dtype):
         # Returns floating or integral scalar corresponding to dtype
         def _number(floating, integer, dtype):
@@ -7093,7 +7095,7 @@ def compare_strides(s1, s2, div):
                 _test_helper(x, op, unary=True)
 
     @skipMeta
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_dlpack_conversion(self, device, dtype):
         # DLpack does not explicitly support bool
         # It does it through uint8 type
@@ -7693,8 +7695,8 @@ def _where_valid_scalar_tensor_combination(self, scalar_type, dtype):
         return False
 
     @onlyOnCPUAndCUDA
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes() +
-              torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes() +
+              get_all_complex_dtypes()))
     def test_where_scalar_invalid_combination_raises(self, device, dtype):
 
         def checkRaises(scalar_type, dtype, condition, x, scalar_1):
@@ -7706,8 +7708,8 @@ def checkRaises(scalar_type, dtype, condition, x, scalar_1):
         self._test_where_scalar_template(device, dtype, checkRaises)
 
     @skipCUDAVersionIn([(11, 2)])  # test fails for 11.2, see https://github.com/pytorch/pytorch/issues/51980
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes() +
-              torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes() +
+              get_all_complex_dtypes()))
     def test_where_scalar_valid_combination(self, device, dtype):
 
         def checkResult(scalar_type, dtype, condition, x, scalar_1):
diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py
index bd48e38045a13..81411c058bca6 100644
--- a/test/test_type_promotion.py
+++ b/test/test_type_promotion.py
@@ -8,6 +8,9 @@
                                                   TEST_NUMPY, torch_to_numpy_dtype_dict)
 from torch.testing._internal.common_device_type import (instantiate_device_type_tests, onlyOnCPUAndCUDA,
                                                         dtypes, dtypesIfCUDA, onlyCPU, expectedFailureMeta)
+from torch.testing._internal.common_dtype import (
+    get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes
+)
 
 if TEST_NUMPY:
     import numpy as np
@@ -179,7 +182,7 @@ def test_bfloat16(self, device):
             self.assertEqual(bf + scalar, scalar + bf)
 
         # with tensor
-        for dtype in torch.testing.get_all_dtypes():
+        for dtype in get_all_dtypes():
             t = torch.tensor(1, dtype=dtype, device=device)
             self.assertEqual(bf + t, t + bf)
             if dtype in (torch.float16, torch.float32, torch.float64, torch.cfloat, torch.cdouble):
@@ -254,8 +257,8 @@ def _get_test_tensor(self, device, dtype, remove_zeros=False):
     def test_many_promotions(self, device):
         # Can also include half on CPU in cases where it will be promoted to a
         # supported dtype
-        dtypes1 = torch.testing.get_all_math_dtypes('cuda')
-        dtypes2 = torch.testing.get_all_math_dtypes(device)
+        dtypes1 = get_all_math_dtypes('cuda')
+        dtypes2 = get_all_math_dtypes(device)
         ops = [torch.add, torch.sub, torch.mul, torch.div, torch.rsub]
         for dt1, dt2 in itertools.product(dtypes1, dtypes2):
             for op, non_contiguous in itertools.product(ops, [True, False]):
@@ -331,7 +334,7 @@ def test_create_bool_tensors(self, device):
         # this seems like odd behavior but ints also create float tensors, numpy doesn't have this function.
         self.assertEqual(torch.scalar_tensor(False, device=device), torch.tensor(0., device=device))
 
-    @dtypes(*itertools.product(torch.testing.get_all_dtypes(), torch.testing.get_all_dtypes()))
+    @dtypes(*itertools.product(get_all_dtypes(), get_all_dtypes()))
     def test_result_type(self, device, dtypes):
         "Test result_type for tensor vs tensor and scalar vs scalar."
 
@@ -460,8 +463,8 @@ def test_comparison_ops_with_type_promotion(self, device):
             ),
         ]
         for op in comparison_ops:
-            for dt1 in torch.testing.get_all_math_dtypes(device):
-                for dt2 in torch.testing.get_all_math_dtypes(device):
+            for dt1 in get_all_math_dtypes(device):
+                for dt2 in get_all_math_dtypes(device):
                     if (dt1.is_complex or dt2.is_complex) and not (op["name"] == "eq" or op["name"] == "ne"):
                         continue
                     val1 = value_for_type[dt1]
@@ -511,8 +514,8 @@ def test_complex_assertraises(self, device):
             dict(name="ne", compare_op=lambda x, y: x != y, ),
         ]
         for op in comparison_ops:
-            for dt1 in torch.testing.get_all_math_dtypes(device):
-                for dt2 in torch.testing.get_all_math_dtypes(device):
+            for dt1 in get_all_math_dtypes(device):
+                for dt2 in get_all_math_dtypes(device):
                     if (dt1.is_complex or dt2.is_complex) and not (op["name"] == "eq" or op["name"] == "ne"):
                         u = torch.tensor([1], dtype=dt1, device=device)
                         v = torch.tensor([2], dtype=dt2, device=device)
@@ -520,7 +523,7 @@ def test_complex_assertraises(self, device):
 
     @float_double_default_dtype
     def test_lt_with_type_promotion(self, device):
-        for dt in torch.testing.get_all_math_dtypes(device):
+        for dt in get_all_math_dtypes(device):
             x = torch.tensor([0], dtype=dt, device=device)
             expected = torch.tensor([True], dtype=torch.bool, device=device)
 
@@ -553,7 +556,7 @@ def test_promote_types(self, device):
 
     @float_double_default_dtype
     def test_promote_self(self, device):
-        for dtype in torch.testing.get_all_dtypes():
+        for dtype in get_all_dtypes():
             self.assertEqual(torch.promote_types(dtype, dtype), dtype)
 
     @expectedFailureMeta
@@ -758,12 +761,12 @@ def _run_all_tests_for_sparse_op(self, op_name, device, dtypes):
     @onlyOnCPUAndCUDA
     def test_sparse_add(self, device):
         self._run_all_tests_for_sparse_op('add', device,
-                                          dtypes=torch.testing.get_all_math_dtypes(device))
+                                          dtypes=get_all_math_dtypes(device))
 
     @onlyOnCPUAndCUDA
     def test_sparse_mul(self, device):
         self._run_all_tests_for_sparse_op('mul', device,
-                                          dtypes=torch.testing.get_all_math_dtypes(device))
+                                          dtypes=get_all_math_dtypes(device))
 
     @onlyOnCPUAndCUDA
     def test_sparse_div(self, device):
@@ -774,7 +777,7 @@ def test_sparse_div(self, device):
     @onlyOnCPUAndCUDA
     def test_sparse_sub(self, device):
         self._run_all_tests_for_sparse_op('sub', device,
-                                          dtypes=torch.testing.get_all_math_dtypes(device))
+                                          dtypes=get_all_math_dtypes(device))
 
     @onlyOnCPUAndCUDA
     @dtypes(torch.bool, torch.short, torch.uint8, torch.int, torch.long)
@@ -871,7 +874,7 @@ def test_numpy_array_binary_ufunc_promotion(self, device, dtypes):
 
     @onlyOnCPUAndCUDA
     def test_cat_different_dtypes(self, device):
-        dtypes = torch.testing.get_all_dtypes(include_bfloat16=False)
+        dtypes = get_all_dtypes(include_bfloat16=False)
         for x_dtype, y_dtype in itertools.product(dtypes, dtypes):
             x_vals, y_vals = [1, 2, 3], [4, 5, 6]
 
@@ -890,7 +893,7 @@ def test_cat_different_dtypes(self, device):
 
     @onlyOnCPUAndCUDA
     def test_cat_out_different_dtypes(self, device):
-        dtypes = torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False)
+        dtypes = get_all_dtypes(include_bfloat16=False, include_bool=False)
         for x_dtype, y_dtype, out_dtype in itertools.product(dtypes, dtypes, dtypes):
             out = torch.zeros(6, device=device, dtype=out_dtype)
             x = torch.tensor([1, 2, 3], device=device, dtype=x_dtype)
@@ -957,21 +960,21 @@ def test_computation_ignores_out(self, device):
         self.assertEqual(result, a - b, exact_dtype=False)
         self.assertNotEqual(result, a.double() - b, exact_dtype=False)
 
-    @dtypesIfCUDA(*itertools.product(torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False),
-                                     torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False)))
-    @dtypes(*itertools.product(torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False,
-                                                            include_complex=False),
-                               torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False,
-                                                            include_complex=False)))
+    @dtypesIfCUDA(*itertools.product(get_all_dtypes(include_bfloat16=False, include_complex=False),
+                                     get_all_dtypes(include_bfloat16=False, include_complex=False)))
+    @dtypes(*itertools.product(get_all_dtypes(include_half=False, include_bfloat16=False,
+                                              include_complex=False),
+                               get_all_dtypes(include_half=False, include_bfloat16=False,
+                                              include_complex=False)))
     def test_atan2_type_promotion(self, device, dtypes):
         dtype1, dtype2 = dtypes
         default_float = torch.get_default_dtype()
 
         def is_int(dtype):
-            return dtype in torch.testing.get_all_int_dtypes() + [torch.bool]
+            return dtype in get_all_int_dtypes() + [torch.bool]
 
         def is_float(dtype):
-            return dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False)
+            return dtype in get_all_fp_dtypes(include_half=True, include_bfloat16=False)
 
         def get_binary_float_result_type(x, y):
             dtype1 = x.dtype
diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py
index 526b67a6b03da..c65ae980fd82a 100644
--- a/test/test_unary_ufuncs.py
+++ b/test/test_unary_ufuncs.py
@@ -18,8 +18,11 @@
     instantiate_device_type_tests, ops, dtypes, onlyCPU, onlyOnCPUAndCUDA,
     onlyCUDA, dtypesIfCUDA, precisionOverride, skipCUDAIfRocm, dtypesIfCPU,
     OpDTypes)
-from torch.testing import (
-    floating_types_and, all_types_and_complex_and, floating_and_complex_types_and, make_tensor)
+from torch.testing import make_tensor
+from torch.testing._internal.common_dtype import (
+    floating_types_and, all_types_and_complex_and, floating_and_complex_types_and, get_all_dtypes, get_all_math_dtypes,
+    get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes
+)
 
 if TEST_SCIPY:
     import scipy
@@ -502,8 +505,8 @@ def test_out_arg_all_dtypes(self, device, dtype, op):
             out = torch.empty_like(input, dtype=out_dtype)
             self._test_out_arg(op, input, out, expected, **torch_kwargs)
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + [torch.bool] +
-              torch.testing.get_all_fp_dtypes(include_bfloat16=False)))
+    @dtypes(*(get_all_int_dtypes() + [torch.bool] +
+              get_all_fp_dtypes(include_bfloat16=False)))
     def test_nan_to_num(self, device, dtype):
         for contiguous in [False, True]:
             x = make_tensor((64, 64), low=0., high=100., dtype=dtype, device=device)
@@ -581,7 +584,7 @@ def test_digamma(self, device, dtype):
         self.compare_with_numpy(torch.digamma, scipy.special.digamma, tensor)
 
     @skipCUDAIfRocm
-    @dtypes(*torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False))
+    @dtypes(*get_all_fp_dtypes(include_half=True, include_bfloat16=False))
     def test_frexp(self, device, dtype):
         input = make_tensor((50, 50), device, dtype)
         mantissa, exponent = torch.frexp(input)
@@ -595,7 +598,7 @@ def test_frexp(self, device, dtype):
         self.assertTrue(torch_to_numpy_dtype_dict[exponent.dtype] == np_exponent.dtype)
 
     @skipCUDAIfRocm
-    @dtypes(*torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False))
+    @dtypes(*get_all_fp_dtypes(include_half=True, include_bfloat16=False))
     def test_frexp_out(self, device, dtype):
         input = make_tensor((50, 50), device, dtype)
         outputs = (
@@ -622,20 +625,18 @@ def test_frexp_out(self, device, dtype):
 
     @skipCUDAIfRocm
     def test_frexp_assert_raises(self, device):
-        invalid_input_dtypes = torch.testing.get_all_int_dtypes() + \
-            torch.testing.get_all_complex_dtypes() + \
+        invalid_input_dtypes = get_all_int_dtypes() + \
+            get_all_complex_dtypes() + \
             [torch.bool]
         for dtype in invalid_input_dtypes:
             input = make_tensor((50, 50), device, dtype)
             with self.assertRaisesRegex(RuntimeError, r"torch\.frexp\(\) only supports floating-point dtypes"):
                 torch.frexp(input)
 
-        for dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False):
+        for dtype in get_all_fp_dtypes(include_half=True, include_bfloat16=False):
             input = make_tensor((50, 50), device, dtype)
 
-            dtypes = list(torch.testing.all_types_and_complex_and(torch.bool,
-                                                                  torch.half,
-                                                                  torch.bfloat16))
+            dtypes = list(all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16))
             dtypes.remove(dtype)
             for mantissa_dtype in dtypes:
                 mantissa = torch.empty_like(input, dtype=mantissa_dtype)
@@ -1046,7 +1047,7 @@ def test_mish(self, device, dtype):
 
     # do ops like threshold need a test_unary(_nonufunc) test suite?
     @onlyCPU
-    @dtypes(*torch.testing.get_all_math_dtypes('cpu'))
+    @dtypes(*get_all_math_dtypes('cpu'))
     def test_threshold(self, device, dtype):
         if dtype != torch.uint8 and dtype != torch.float16 and not dtype.is_complex:
             # 100 is wide enough to use AVX2 instructions for all types
@@ -1180,7 +1181,7 @@ def _i0_range_helper(self, range, device, dtype):
             t = torch.rand(1000, device=device).to(dtype) * r
             self._i0_helper(t)
 
-    @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes())
+    @dtypesIfCUDA(*get_all_fp_dtypes())
     @dtypes(torch.bfloat16, torch.float32, torch.float64)
     @unittest.skipIf(not TEST_SCIPY, "SciPy not found")
     def test_i0_range1(self, device, dtype):
@@ -1188,7 +1189,7 @@ def test_i0_range1(self, device, dtype):
         # The domain is (-13.25, 13.25)
         self._i0_range_helper(13.25, device, dtype)
 
-    @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes())
+    @dtypesIfCUDA(*get_all_fp_dtypes())
     @dtypes(torch.bfloat16, torch.float32, torch.float64)
     @unittest.skipIf(not TEST_SCIPY, "SciPy not found")
     def test_i0_range2(self, device, dtype):
@@ -1203,7 +1204,7 @@ def test_i0_range3(self, device, dtype):
         # The domain is (-709.75, 709.75)
         self._i0_range_helper(709.75, device, dtype)
 
-    @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes())
+    @dtypesIfCUDA(*get_all_fp_dtypes())
     @dtypes(torch.bfloat16, torch.float32, torch.float64)
     @unittest.skipIf(not TEST_SCIPY, "SciPy not found")
     def test_i0_special(self, device, dtype):
@@ -1213,7 +1214,7 @@ def test_i0_special(self, device, dtype):
         t = torch.tensor([inf, -inf, nan], device=device, dtype=dtype)
         self.assertTrue(torch.i0(t).isnan().all())
 
-    @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes())
+    @dtypesIfCUDA(*get_all_fp_dtypes())
     @dtypes(torch.bfloat16, torch.float32, torch.float64)
     @unittest.skipIf(not TEST_SCIPY, "SciPy not found")
     def test_special_i0_i1_vs_scipy(self, device, dtype):
@@ -1305,7 +1306,7 @@ def test_abs_zero(self, device, dtype):
         for num in abs_zeros:
             self.assertGreater(math.copysign(1.0, num), 0.0)
 
-    @dtypes(*torch.testing.get_all_fp_dtypes())
+    @dtypes(*get_all_fp_dtypes())
     def test_isfinite_isinf_isnan(self, device, dtype):
         vals = (-float('inf'), float('inf'), float('nan'), -1, 0, 1)
 
@@ -1321,7 +1322,7 @@ def test_isfinite_isinf_isnan_int(self, device, dtype):
         self.compare_with_numpy(torch.isinf, np.isinf, vals, device, dtype)
         self.compare_with_numpy(torch.isnan, np.isnan, vals, device, dtype)
 
-    @dtypes(*(torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_fp_dtypes()))
     def test_isposinf_isneginf_float(self, device, dtype):
         ops = ((torch.isposinf, np.isposinf), (torch.isneginf, np.isneginf))
         vals = (-float('inf'), float('inf'), float('nan'), -1, 0, 1)
@@ -1346,7 +1347,7 @@ def test_isposinf_isneginf_float(self, device, dtype):
             torch_op(t, out=out)
             self.assertEqual(out, t_target)
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + [torch.bool]))
+    @dtypes(*(get_all_int_dtypes() + [torch.bool]))
     def test_isposinf_isneginf_int_and_bool(self, device, dtype):
         ops = ((torch.isposinf, np.isposinf), (torch.isneginf, np.isneginf))
         vals = (-1, 0, 1)
@@ -1374,7 +1375,7 @@ def test_isposinf_isneginf_complex(self, device, dtype):
             with self.assertRaisesRegex(RuntimeError, 'does not support complex inputs'):
                 torch_op(t, out=out)
 
-    @dtypes(*(torch.testing.get_all_dtypes(include_bool=False)))
+    @dtypes(*(get_all_dtypes(include_bool=False)))
     def test_isposinf_isneginf_non_boolean_output(self, device, dtype):
         # test non-boolean tensors as the `out=` parameters
         # boolean outputs are tested in the above testcases
@@ -1406,7 +1407,7 @@ def test_isreal_complex(self, device, dtype):
         vals = (1, 1 + 1j, 2 + 0j, 3j, 2 - 1j, 2 - 0j)
         self.compare_with_numpy(torch.isreal, np.isreal, vals, device, dtype)
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_isreal_noncomplex(self, device, dtype):
         vals = (1, 2, 3)
         # Manual check here since numpy doesn't support bfloat16
@@ -1467,7 +1468,7 @@ def assert_tuple_empty(tup, dim):
         self.assertEqual(1, len(z))
         self.assertEqual(torch.empty(0, dtype=torch.long), z[0])
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_nonzero_noncontiguous(self, device, dtype):
         x = make_tensor((10, 10, 10), dtype=dtype, device=device,
                         low=1, noncontiguous=False)
@@ -1496,10 +1497,10 @@ def permute_storage(tensor, dims):
         self.assertEqual(nondense.nonzero(), expect)
 
     # TODO: rationalize with exp OpInfo
-    @dtypes(*(torch.testing.get_all_fp_dtypes(include_half=False) +
-              torch.testing.get_all_complex_dtypes()))
-    @dtypesIfCUDA(*(torch.testing.get_all_fp_dtypes(include_half=True) +
-                    torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_fp_dtypes(include_half=False) +
+              get_all_complex_dtypes()))
+    @dtypesIfCUDA(*(get_all_fp_dtypes(include_half=True) +
+                    get_all_complex_dtypes()))
     def test_exp(self, device, dtype):
         for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()):
             a = torch.tensor(v, dtype=dtype, device=device) * torch.arange(18, device=device) / 3 * math.pi
diff --git a/test/test_view_ops.py b/test/test_view_ops.py
index 7bb6906ef1cc7..06aaf31423f3f 100644
--- a/test/test_view_ops.py
+++ b/test/test_view_ops.py
@@ -11,6 +11,9 @@
     (TestCase, run_tests, suppress_warnings)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, onlyCPU, dtypes, onlyOnCPUAndCUDA)
+from torch.testing._internal.common_dtype import (
+    get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes
+)
 
 # TODO: replace this with make_tensor() in common_utils.py
 def _generate_input(shape, dtype, device, with_extremal):
@@ -114,14 +117,14 @@ def _do_transpose(self, x, contiguous=False, dim0=0, dim1=1):
         else:
             return x.transpose(dim0, dim1)
 
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes()))
     def test_conj_self(self, device, dtype):
         t = torch.ones(5, 5, device=device)
         s = t.conj()
         self.assertTrue(s is t)
 
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_fp_dtypes(include_bfloat16=False), torch.complex64)
+    @dtypes(*get_all_fp_dtypes(include_bfloat16=False), torch.complex64)
     def test_view_dtype(self, device, dtype):
         int_dtype = {
             torch.half: torch.int16,
@@ -227,7 +230,7 @@ def fn(contiguous_input=True, dim0=0, dim1=1):
         self.assertEqual(res.shape, torch.Size([0]))
 
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_complex_dtypes(include_complex32=True))
+    @dtypes(*get_all_complex_dtypes(include_complex32=True))
     def test_view_as_real(self, device, dtype):
         def fn(contiguous_input=True):
             t = torch.randn(3, 4, dtype=dtype, device=device)
@@ -265,7 +268,7 @@ def fn(contiguous_input=True):
         self.assertEqual(res.shape, torch.Size([2]))
 
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_view_tensor_split(self, device, dtype):
         a = make_tensor((40, 30), device, dtype, low=-9, high=9)
         a_split_dim0 = a.tensor_split(7, 0)
@@ -276,7 +279,7 @@ def test_view_tensor_split(self, device, dtype):
             self.assertTrue(self.is_view_of(a, a_split_dim1_tensor))
 
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_view_tensor_hsplit(self, device, dtype):
         t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9)
         t_hsplit = torch.hsplit(t, 2)
@@ -286,7 +289,7 @@ def test_view_tensor_hsplit(self, device, dtype):
         self.assertEqual(t_hsplit[1][2, 0, 2], t[2, 2, 2])
 
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_view_tensor_vsplit(self, device, dtype):
         t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9)
         t_vsplit = torch.vsplit(t, 2)
@@ -296,7 +299,7 @@ def test_view_tensor_vsplit(self, device, dtype):
         self.assertEqual(t_vsplit[1][0, 2, 2], t[2, 2, 2])
 
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_view_tensor_dsplit(self, device, dtype):
         t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9)
         t_dsplit = torch.dsplit(t, 2)
@@ -306,7 +309,7 @@ def test_view_tensor_dsplit(self, device, dtype):
         self.assertEqual(t_dsplit[1][2, 2, 0], t[2, 2, 2])
 
     @onlyOnCPUAndCUDA
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes()))
     def test_real_imag_noncomplex(self, device, dtype):
         t = torch.ones((5, 5), dtype=dtype, device=device)
 
@@ -317,7 +320,7 @@ def test_real_imag_noncomplex(self, device, dtype):
             torch.imag(t)
 
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_complex_dtypes())
+    @dtypes(*get_all_complex_dtypes())
     def test_real_imag_view(self, device, dtype):
         def compare_with_numpy(contiguous_input=True):
             t = torch.randn(3, 3, dtype=dtype, device=device)
@@ -348,7 +351,7 @@ def compare_with_numpy(contiguous_input=True):
         self.assertEqual(a[5:].imag, a.imag[5:])
 
     @onlyOnCPUAndCUDA
-    @dtypes(*torch.testing.get_all_complex_dtypes())
+    @dtypes(*get_all_complex_dtypes())
     def test_conj_imag_view(self, device, dtype) -> None:
         t = _make_tensor((4, 5,), dtype, device)
         t_numpy_conj = torch.from_numpy(t.cpu().numpy().conj()).to(device=device)
@@ -363,7 +366,7 @@ def test_conj_imag_view(self, device, dtype) -> None:
             self.assertTrue(v_imag.is_neg())
 
     @onlyOnCPUAndCUDA
-    @dtypes(*product(torch.testing.get_all_complex_dtypes(), torch.testing.get_all_dtypes()))
+    @dtypes(*product(get_all_complex_dtypes(), get_all_dtypes()))
     @suppress_warnings
     def test_set_real_imag(self, device, dtypes):
         x = torch.randn(10, dtype=dtypes[0], device=device)
@@ -1216,8 +1219,8 @@ def _test_atleast_dim(self, torch_fn, np_fn, device, dtype):
                         self.assertEqual(np_res, torch_res)
 
     # TODO: are these view ops?
-    @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) +
-              torch.testing.get_all_complex_dtypes()))
+    @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) +
+              get_all_complex_dtypes()))
     def test_atleast(self, device, dtype):
         self._test_atleast_dim(torch.atleast_1d, np.atleast_1d, device, dtype)
         self._test_atleast_dim(torch.atleast_2d, np.atleast_2d, device, dtype)
@@ -1253,7 +1256,7 @@ def test_broadcast_shapes(self, device):
                 self.assertEqual(expected, actual)
 
     # Skip BFloat16 since numpy does not support it
-    @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False))
+    @dtypes(*get_all_dtypes(include_bfloat16=False))
     def test_broadcast_to(self, device, dtype):
         def can_broadcast(s0, s1):
             # s0.dim() <= s1.dim(), reverse s0 and s1 to compare trailing dimension
@@ -1356,7 +1359,7 @@ def test_view(self, device):
         self.assertEqual(tensor.view(6, 2, 1), contig_tensor.view(6, 2, 1))
         self.assertEqual(tensor.view(1, 6, 2, 1), contig_tensor.view(1, 6, 2, 1))
 
-    @dtypes(*torch.testing.get_all_dtypes())
+    @dtypes(*get_all_dtypes())
     def test_reshape_view_semantics(self, device, dtype):
         tensor = make_tensor((15, 4), device, dtype)
         target = (20, 3)
@@ -1383,7 +1386,7 @@ def test_contiguous(self, device):
 
     @onlyOnCPUAndCUDA
     # Skip BFloat16 since numpy does not support it
-    @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False))
+    @dtypes(*get_all_dtypes(include_bfloat16=False))
     def test_tensor_split_sections(self, device, dtype):
         input_sizes = [
             (0,),
@@ -1414,7 +1417,7 @@ def test_tensor_split_sections(self, device, dtype):
 
     @onlyOnCPUAndCUDA
     # Skip BFloat16 since numpy does not support it
-    @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False))
+    @dtypes(*get_all_dtypes(include_bfloat16=False))
     def test_tensor_split_indices(self, device, dtype):
         input_sizes = [
             (0,),
@@ -1493,20 +1496,20 @@ def test_tensor_split_errors(self, device):
 
     def test_resize_all_dtypes_and_devices(self, device):
         shape = (2, 2)
-        for dt in torch.testing.get_all_dtypes():
+        for dt in get_all_dtypes():
             x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device)
             x.resize_(shape)
             self.assertEqual(shape, x.shape)
 
     def test_resize_as_all_dtypes_and_devices(self, device):
-        for dt in torch.testing.get_all_dtypes():
+        for dt in get_all_dtypes():
             x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device)
             y = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=dt, device=device)
             x.resize_as_(y)
             self.assertEqual(y.shape, x.shape)
 
     def test_view_all_dtypes_and_devices(self, device):
-        for dt in torch.testing.get_all_dtypes():
+        for dt in get_all_dtypes():
             x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device)
             self.assertEqual(x.view(6).shape, [6])
 
diff --git a/torch/testing/_core.py b/torch/testing/_core.py
index 66060f8cbcee0..b3cc6f163c49f 100644
--- a/torch/testing/_core.py
+++ b/torch/testing/_core.py
@@ -13,27 +13,7 @@
 
 __all__ = [
     "FileCheck",
-    "all_types",
-    "all_types_and",
-    "all_types_and_complex",
-    "all_types_and_complex_and",
-    "all_types_and_half",
-    "complex_types",
-    "empty_types",
-    "floating_and_complex_types",
-    "floating_and_complex_types_and",
-    "floating_types",
-    "floating_types_and",
-    "double_types",
-    "floating_types_and_half",
-    "get_all_complex_dtypes",
-    "get_all_dtypes",
     "get_all_device_types",
-    "get_all_fp_dtypes",
-    "get_all_int_dtypes",
-    "get_all_math_dtypes",
-    "integral_types",
-    "integral_types_and",
     "make_non_contiguous",
 ]
 
@@ -41,9 +21,7 @@
 # False otherwise.
 # TODO: implement numpy-like issubdtype
 def is_integral(dtype: torch.dtype) -> bool:
-    # Skip complex/quantized types
-    dtypes = [x for x in get_all_dtypes() if x not in get_all_complex_dtypes()]
-    return dtype in dtypes and not dtype.is_floating_point
+    return dtype in (torch.bool, torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64)
 
 def is_quantized(dtype: torch.dtype) -> bool:
     return dtype in (torch.quint8, torch.qint8, torch.qint32, torch.quint4x2)
@@ -216,109 +194,5 @@ def make_non_contiguous(tensor: torch.Tensor) -> torch.Tensor:
     return input.data
 
 
-# Functions and classes for describing the dtypes a function supports
-# NOTE: these helpers should correspond to PyTorch's C++ dispatch macros
-
-# Verifies each given dtype is a torch.dtype
-def _validate_dtypes(*dtypes):
-    for dtype in dtypes:
-        assert isinstance(dtype, torch.dtype)
-    return dtypes
-
-# class for tuples corresponding to a PyTorch dispatch macro
-class _dispatch_dtypes(tuple):
-    def __add__(self, other):
-        assert isinstance(other, tuple)
-        return _dispatch_dtypes(tuple.__add__(self, other))
-
-_empty_types = _dispatch_dtypes(())
-def empty_types():
-    return _empty_types
-
-_floating_types = _dispatch_dtypes((torch.float32, torch.float64))
-def floating_types():
-    return _floating_types
-
-_floating_types_and_half = _floating_types + (torch.half,)
-def floating_types_and_half():
-    return _floating_types_and_half
-
-def floating_types_and(*dtypes):
-    return _floating_types + _validate_dtypes(*dtypes)
-
-_floating_and_complex_types = _floating_types + (torch.cfloat, torch.cdouble)
-def floating_and_complex_types():
-    return _floating_and_complex_types
-
-def floating_and_complex_types_and(*dtypes):
-    return _floating_and_complex_types + _validate_dtypes(*dtypes)
-
-_double_types = _dispatch_dtypes((torch.float64, torch.complex128))
-def double_types():
-    return _double_types
-
-_integral_types = _dispatch_dtypes((torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64))
-def integral_types():
-    return _integral_types
-
-def integral_types_and(*dtypes):
-    return _integral_types + _validate_dtypes(*dtypes)
-
-_all_types = _floating_types + _integral_types
-def all_types():
-    return _all_types
-
-def all_types_and(*dtypes):
-    return _all_types + _validate_dtypes(*dtypes)
-
-_complex_types = _dispatch_dtypes((torch.cfloat, torch.cdouble))
-def complex_types():
-    return _complex_types
-
-_all_types_and_complex = _all_types + _complex_types
-def all_types_and_complex():
-    return _all_types_and_complex
-
-def all_types_and_complex_and(*dtypes):
-    return _all_types_and_complex + _validate_dtypes(*dtypes)
-
-_all_types_and_half = _all_types + (torch.half,)
-def all_types_and_half():
-    return _all_types_and_half
-
-def get_all_dtypes(include_half=True,
-                   include_bfloat16=True,
-                   include_bool=True,
-                   include_complex=True,
-                   include_complex32=False
-                   ) -> List[torch.dtype]:
-    dtypes = get_all_int_dtypes() + get_all_fp_dtypes(include_half=include_half, include_bfloat16=include_bfloat16)
-    if include_bool:
-        dtypes.append(torch.bool)
-    if include_complex:
-        dtypes += get_all_complex_dtypes(include_complex32)
-    return dtypes
-
-def get_all_math_dtypes(device) -> List[torch.dtype]:
-    return get_all_int_dtypes() + get_all_fp_dtypes(include_half=device.startswith('cuda'),
-                                                    include_bfloat16=False) + get_all_complex_dtypes()
-
-def get_all_complex_dtypes(include_complex32=False) -> List[torch.dtype]:
-    return [torch.complex32, torch.complex64, torch.complex128] if include_complex32 else [torch.complex64, torch.complex128]
-
-
-def get_all_int_dtypes() -> List[torch.dtype]:
-    return [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]
-
-
-def get_all_fp_dtypes(include_half=True, include_bfloat16=True) -> List[torch.dtype]:
-    dtypes = [torch.float32, torch.float64]
-    if include_half:
-        dtypes.append(torch.float16)
-    if include_bfloat16:
-        dtypes.append(torch.bfloat16)
-    return dtypes
-
-
 def get_all_device_types() -> List[str]:
     return ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py
index 3cf7338bff889..60c6384ad13cb 100644
--- a/torch/testing/_deprecated.py
+++ b/torch/testing/_deprecated.py
@@ -5,10 +5,12 @@
 
 import functools
 import warnings
-from typing import Any, Callable, Optional, Tuple
+from typing import Any, Callable, Dict, Optional, Tuple, Union
 
 import torch
 
+from . import _dtype_getters
+
 
 __all__ = [
     "rand",
@@ -17,17 +19,18 @@
 ]
 
 
-def warn_deprecated(instructions: str) -> Callable:
+def warn_deprecated(instructions: Union[str, Callable[[str, Tuple[Any, ...], Dict[str, Any], Any], str]]) -> Callable:
     def outer_wrapper(fn: Callable) -> Callable:
-        msg = (
-            f"torch.testing.{fn.__name__} is deprecated and will be removed in a future release. "
-            f"{instructions.strip()}"
-        )
+        name = fn.__name__
+        head = f"torch.testing.{name}() is deprecated and will be removed in a future release. "
 
         @functools.wraps(fn)
         def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
+            return_value = fn(*args, **kwargs)
+            tail = instructions(name, args, kwargs, return_value) if callable(instructions) else instructions
+            msg = (head + tail).strip()
             warnings.warn(msg, FutureWarning)
-            return fn(*args, **kwargs)
+            return return_value
 
         return inner_wrapper
 
@@ -84,3 +87,17 @@ def assert_allclose(
         check_is_coalesced=False,
         msg=msg or None,
     )
+
+
+def _dtype_getter_instructions(name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any], return_value: Any) -> str:
+    return f"This call to {name}(...) can be replaced with {return_value}."
+
+
+# We iterate over all public dtype getters and expose them here with an added deprecation warning
+for name in _dtype_getters.__all__:
+    if name.startswith("_"):
+        continue
+    fn = getattr(_dtype_getters, name)
+
+    globals()[name] = warn_deprecated(_dtype_getter_instructions)(fn)
+    __all__.append(name)
diff --git a/torch/testing/_dtype_getters.py b/torch/testing/_dtype_getters.py
new file mode 100644
index 0000000000000..d16ca04f25778
--- /dev/null
+++ b/torch/testing/_dtype_getters.py
@@ -0,0 +1,138 @@
+"""This module exist to be able to deprecate the dtype getters publicly without doing so internally. The deprecated
+public versions are defined in torch.testing._deprecated and exposed from torch.testing. The non-deprecated internal
+versions should be imported from torch.testing._internal.dtype_getters
+"""
+
+from typing import List
+
+import torch
+
+__all__ = [
+    "_validate_dtypes",
+    "_dispatch_dtypes",
+    "all_types",
+    "all_types_and",
+    "all_types_and_complex",
+    "all_types_and_complex_and",
+    "all_types_and_half",
+    "complex_types",
+    "empty_types",
+    "floating_and_complex_types",
+    "floating_and_complex_types_and",
+    "floating_types",
+    "floating_types_and",
+    "double_types",
+    "floating_types_and_half",
+    "get_all_complex_dtypes",
+    "get_all_dtypes",
+    "get_all_fp_dtypes",
+    "get_all_int_dtypes",
+    "get_all_math_dtypes",
+    "integral_types",
+    "integral_types_and",
+]
+
+# Functions and classes for describing the dtypes a function supports
+# NOTE: these helpers should correspond to PyTorch's C++ dispatch macros
+
+# Verifies each given dtype is a torch.dtype
+def _validate_dtypes(*dtypes):
+    for dtype in dtypes:
+        assert isinstance(dtype, torch.dtype)
+    return dtypes
+
+# class for tuples corresponding to a PyTorch dispatch macro
+class _dispatch_dtypes(tuple):
+    def __add__(self, other):
+        assert isinstance(other, tuple)
+        return _dispatch_dtypes(tuple.__add__(self, other))
+
+_empty_types = _dispatch_dtypes(())
+def empty_types():
+    return _empty_types
+
+_floating_types = _dispatch_dtypes((torch.float32, torch.float64))
+def floating_types():
+    return _floating_types
+
+_floating_types_and_half = _floating_types + (torch.half,)
+def floating_types_and_half():
+    return _floating_types_and_half
+
+def floating_types_and(*dtypes):
+    return _floating_types + _validate_dtypes(*dtypes)
+
+_floating_and_complex_types = _floating_types + (torch.cfloat, torch.cdouble)
+def floating_and_complex_types():
+    return _floating_and_complex_types
+
+def floating_and_complex_types_and(*dtypes):
+    return _floating_and_complex_types + _validate_dtypes(*dtypes)
+
+_double_types = _dispatch_dtypes((torch.float64, torch.complex128))
+def double_types():
+    return _double_types
+
+_integral_types = _dispatch_dtypes((torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64))
+def integral_types():
+    return _integral_types
+
+def integral_types_and(*dtypes):
+    return _integral_types + _validate_dtypes(*dtypes)
+
+_all_types = _floating_types + _integral_types
+def all_types():
+    return _all_types
+
+def all_types_and(*dtypes):
+    return _all_types + _validate_dtypes(*dtypes)
+
+_complex_types = _dispatch_dtypes((torch.cfloat, torch.cdouble))
+def complex_types():
+    return _complex_types
+
+_all_types_and_complex = _all_types + _complex_types
+def all_types_and_complex():
+    return _all_types_and_complex
+
+def all_types_and_complex_and(*dtypes):
+    return _all_types_and_complex + _validate_dtypes(*dtypes)
+
+_all_types_and_half = _all_types + (torch.half,)
+def all_types_and_half():
+    return _all_types_and_half
+
+# The functions below are used for convenience in our test suite and thus have no corresponding C++ dispatch macro
+
+def get_all_dtypes(include_half=True,
+                   include_bfloat16=True,
+                   include_bool=True,
+                   include_complex=True,
+                   include_complex32=False
+                   ) -> List[torch.dtype]:
+    dtypes = get_all_int_dtypes() + get_all_fp_dtypes(include_half=include_half, include_bfloat16=include_bfloat16)
+    if include_bool:
+        dtypes.append(torch.bool)
+    if include_complex:
+        dtypes += get_all_complex_dtypes(include_complex32)
+    return dtypes
+
+def get_all_math_dtypes(device) -> List[torch.dtype]:
+    return get_all_int_dtypes() + get_all_fp_dtypes(include_half=device.startswith('cuda'),
+                                                    include_bfloat16=False) + get_all_complex_dtypes()
+
+def get_all_complex_dtypes(include_complex32=False) -> List[torch.dtype]:
+    return [torch.complex32, torch.complex64, torch.complex128] if include_complex32 else [torch.complex64, torch.complex128]
+
+
+def get_all_int_dtypes() -> List[torch.dtype]:
+    return [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]
+
+
+def get_all_fp_dtypes(include_half=True, include_bfloat16=True) -> List[torch.dtype]:
+    dtypes = [torch.float32, torch.float64]
+    if include_half:
+        dtypes.append(torch.float16)
+    if include_bfloat16:
+        dtypes.append(torch.bfloat16)
+    return dtypes
diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index 8ec6e71d121ff..23e431d66bec2 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -14,8 +14,7 @@
     skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \
     IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, DeterministicGuard, TEST_SKIP_NOARCH
 from torch.testing._internal.common_cuda import _get_torch_cuda_version
-from torch.testing import \
-    (get_all_dtypes)
+from torch.testing._internal.common_dtype import get_all_dtypes
 
 try:
     import psutil  # type: ignore[import]
diff --git a/torch/testing/_internal/common_dtype.py b/torch/testing/_internal/common_dtype.py
new file mode 100644
index 0000000000000..0ce2d80a18b4a
--- /dev/null
+++ b/torch/testing/_internal/common_dtype.py
@@ -0,0 +1,4 @@
+"""The content of torch/testing/_dtype_getters.py should be moved here as soon as the deprecation period is over.
+"""
+
+from torch.testing._dtype_getters import *  # noqa: F401, F403
diff --git a/torch/testing/_internal/common_jit.py b/torch/testing/_internal/common_jit.py
index 3b62ced36f391..89533a6d7fb9d 100644
--- a/torch/testing/_internal/common_jit.py
+++ b/torch/testing/_internal/common_jit.py
@@ -7,7 +7,7 @@
 import torch.jit.quantized
 
 # Testing utils
-from torch.testing import floating_and_complex_types_and
+from torch.testing._internal.common_dtype import floating_and_complex_types_and
 from torch.testing._internal.common_utils import TestCase, \
     freeze_rng_state, TemporaryFileName, enable_profiling_mode_for_profiling_tests, is_iterable_of_tensors
 from torch.testing._internal.common_utils import enable_profiling_mode  # noqa: F401
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index ace4fa1c63c20..5113346a20e56 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -16,12 +16,12 @@
 
 from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, Dict
 
-from torch.testing import \
-    (make_non_contiguous, floating_types, floating_types_and, complex_types,
-     floating_and_complex_types, floating_and_complex_types_and,
-     all_types_and_complex_and, all_types_and, all_types_and_complex,
-     integral_types_and, all_types, double_types, make_tensor)
-from .._core import _dispatch_dtypes
+from torch.testing import make_non_contiguous, make_tensor
+from torch.testing._internal.common_dtype import (
+    _dispatch_dtypes, floating_types, floating_types_and, complex_types, floating_and_complex_types,
+    floating_and_complex_types_and, all_types_and_complex_and, all_types_and, all_types_and_complex, integral_types_and,
+    all_types, double_types,
+)
 from torch.testing._internal.common_device_type import \
     (onlyOnCPUAndCUDA, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver,
      skipCPUIfNoLapack, skipCPUIfNoFFT, skipCUDAIfRocm, precisionOverride, toleranceOverride, tol)
diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py
index 6ef4de398a39e..a1059f6b718f4 100644
--- a/torch/testing/_internal/common_modules.py
+++ b/torch/testing/_internal/common_modules.py
@@ -2,7 +2,8 @@
 from copy import deepcopy
 from functools import wraps, partial
 from itertools import chain
-from torch.testing import floating_types, make_tensor
+from torch.testing import make_tensor
+from torch.testing._internal.common_dtype import floating_types
 from torch.testing._internal.common_device_type import (
     _TestParametrizer, _dtype_test_suffix, _update_param_kwargs, skipIf)
 from torch.testing._internal.common_nn import nllloss_reference, get_reduction
diff --git a/torch/testing/_internal/opinfo_helper.py b/torch/testing/_internal/opinfo_helper.py
index 5129af4f99e34..e4a18b48ca7a1 100644
--- a/torch/testing/_internal/opinfo_helper.py
+++ b/torch/testing/_internal/opinfo_helper.py
@@ -4,21 +4,22 @@
 
 import torch
 from torch.testing._internal.common_cuda import (TEST_CUDA)
-from torch.testing._core import _dispatch_dtypes
-from torch.testing import (all_types_and_complex_and,
-                           all_types_and_complex,
-                           all_types_and_half,
-                           all_types,
-                           complex_types,
-                           floating_and_complex_types,
-                           floating_types_and_half,
-                           floating_types,
-                           integral_types,
-                           floating_types_and,
-                           floating_and_complex_types_and,
-                           integral_types_and,
-                           all_types_and,
-                           )
+from torch.testing._internal.common_dtype import (
+    all_types_and_complex_and,
+    all_types_and_complex,
+    all_types_and_half,
+    all_types,
+    complex_types,
+    floating_and_complex_types,
+    floating_types_and_half,
+    floating_types,
+    integral_types,
+    floating_types_and,
+    floating_and_complex_types_and,
+    integral_types_and,
+    all_types_and,
+    _dispatch_dtypes,
+)
 
 COMPLETE_DTYPES_DISPATCH = (
     all_types,

From adb85b32d3cb98ad7fa333424f172959b2ef3e35 Mon Sep 17 00:00:00 2001
From: Yanli Zhao <yanlizhao@fb.com>
Date: Tue, 7 Sep 2021 09:28:30 -0700
Subject: [PATCH 523/530] minor fix for elastic doc (#64531)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64531

fix #64530

Test Plan: unit test

Reviewed By: mrshenli

Differential Revision: D30760879

fbshipit-source-id: 94ed1476e886513427d928a36f5be6b9bfff0826
---
 torch/distributed/elastic/agent/server/api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/distributed/elastic/agent/server/api.py b/torch/distributed/elastic/agent/server/api.py
index 6d389a7873a4a..d767233a2ae52 100644
--- a/torch/distributed/elastic/agent/server/api.py
+++ b/torch/distributed/elastic/agent/server/api.py
@@ -160,7 +160,7 @@ def __init__(
 
         #  rank of the worker among all the workers with the same role
         #  across all ``agent`` instances.
-        #  Global rank is not stable between re-rendezvous.
+        #  Role rank is not stable between re-rendezvous.
         self.role_rank: int = role_rank
 
         # total number of workers (globally). Due to elasticity

From 43248d91122470e4f0a63997737451cfbc82780e Mon Sep 17 00:00:00 2001
From: Ilqar Ramazanli <iramazanli@fb.com>
Date: Tue, 7 Sep 2021 11:02:11 -0700
Subject: [PATCH 524/530] [doc][hackathon] To add Adam Optimizer to the
 documentation (#63251)

Summary:
It has been discussed before that adding description of Optimization algorithms to PyTorch Core documentation may result in a nice Optimization research tutorial. In the following tracking issue we mentioned about all the necessary algorithms and links to the originally published paper  https://github.com/pytorch/pytorch/issues/63236.

In this PR we are adding description of Adam Algorithm to the documentation.  For more details, we refer to the paper  https://arxiv.org/abs/1412.6980

<img width="442" alt="Screen Shot 2021-08-27 at 6 37 54 PM" src="https://user-images.githubusercontent.com/73658284/131195297-35fce613-3691-4fed-b42d-db234d4fcd7c.png">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63251

Reviewed By: albanD

Differential Revision: D30779163

Pulled By: iramazanli

fbshipit-source-id: 319a80fc3952793b0d064d0e641ddc1de3c05a86
---
 torch/optim/adam.py | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index d7313be75f8fb..ea2ceaff67057 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -6,9 +6,37 @@
 class Adam(Optimizer):
     r"""Implements Adam algorithm.
 
-    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
-    The implementation of the L2 penalty follows changes proposed in
-    `Decoupled Weight Decay Regularization`_.
+    .. math::
+       \begin{aligned}
+            &\rule{110mm}{0.4pt}                                                                 \\
+            &\textbf{input}      : \gamma \text{ (lr)}, \beta_1, \beta_2
+                \text{ (betas)},\theta_0 \text{ (params)},f(\theta) \text{ (objective)}          \\
+            &\hspace{13mm}      \lambda \text{ (weight decay)},  \: amsgrad                      \\
+            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
+                v_0\leftarrow 0 \text{ (second moment)},\: \widehat{v_0}^{max}\leftarrow 0\\[-1.ex]
+            &\rule{110mm}{0.4pt}                                                                 \\
+            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
+            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
+            &\hspace{5mm}\textbf{if} \: \lambda \neq 0                                           \\
+            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
+            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
+            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
+            &\hspace{5mm}\widehat{m_t} \leftarrow   m_t/\big(1-\beta_1^t \big)                   \\
+            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
+            &\hspace{5mm}\textbf{if} \: amsgrad                                                  \\
+            &\hspace{10mm}\widehat{v_t}^{max} \leftarrow \mathrm{max}(\widehat{v_t}^{max},
+                \widehat{v_t})                                                                   \\
+            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
+                \big(\sqrt{\widehat{v_t}^{max}} + \epsilon \big)                                 \\
+            &\hspace{5mm}\textbf{else}                                                           \\
+            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
+                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
+            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
+            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
+            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
+       \end{aligned}
+
+    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.
 
     Args:
         params (iterable): iterable of parameters to optimize or dicts defining
@@ -25,8 +53,6 @@ class Adam(Optimizer):
 
     .. _Adam\: A Method for Stochastic Optimization:
         https://arxiv.org/abs/1412.6980
-    .. _Decoupled Weight Decay Regularization:
-        https://arxiv.org/abs/1711.05101
     .. _On the Convergence of Adam and Beyond:
         https://openreview.net/forum?id=ryQu7f-RZ
     """

From 85eeb4d682f98b2a47dcbc2e02a8c5543ae2ed2c Mon Sep 17 00:00:00 2001
From: Lu Fang <lufang@fb.com>
Date: Tue, 7 Sep 2021 11:23:52 -0700
Subject: [PATCH 525/530] Clean up op BC check list (#64584)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64584

It has been a while since last clean up. The list is really long.

Test Plan: ci

Reviewed By: hl475

Differential Revision: D30779350

fbshipit-source-id: 908b47d0b9a16b784aad6a34c5c87f923500c247
---
 .../check_backward_compatibility.py           | 69 -------------------
 1 file changed, 69 deletions(-)

diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py
index e1dde921f102d..16b415a7368fa 100644
--- a/test/backward_compatibility/check_backward_compatibility.py
+++ b/test/backward_compatibility/check_backward_compatibility.py
@@ -36,86 +36,17 @@
     # Internal, profiler-specific ops
     ("profiler::_call_end_callbacks_on_jit_fut*", datetime.date(9999, 1, 1)),
     ("profiler::_record_function_enter", datetime.date(9999, 1, 1)),
-    ("aten::_qr_helper", datetime.date(2021, 1, 31)),
-    ("aten::fft", datetime.date(2021, 1, 31)),
-    ("aten::ifft", datetime.date(2021, 1, 31)),
-    ("aten::irfft", datetime.date(2021, 1, 31)),
-    ("aten::rfft", datetime.date(2021, 1, 31)),
-    ("aten::linalg_svd", datetime.date(2021, 5, 15)),
-    ("aten::linalg_cholesky.out", datetime.date(2021, 8, 30)),
-    ("aten::linalg_cholesky_ex", datetime.date(2021, 8, 30)),
-    ("aten::linalg_cholesky_ex.L", datetime.date(2021, 8, 30)),
     ("aten::_cholesky_helper", datetime.date(9999, 1, 1)),
     ("aten::_lstsq_helper", datetime.date(9999, 1, 1)),
-    ("aten::linalg_lstsq", datetime.date(2021, 5, 1)),
-    ("aten::_svd_helper", datetime.date(2021, 1, 31)),
     ("aten::_syevd_helper", datetime.date(9999, 1, 1)),
     ("aten::_lu_solve_helper", datetime.date(9999, 1, 1)),
     ("aten::_lu_with_info", datetime.date(9999, 1, 1)),
     ("aten::_linalg_solve_out_helper_", datetime.date(9999, 1, 1)),
-    ("aten::_cudnn_rnn_flatten_weight", datetime.date(2020, 12, 31)),
-    ("aten::_cudnn_rnn", datetime.date(2020, 12, 31)),
-    ("aten::_cudnn_rnn_backward", datetime.date(2020, 12, 31)),
-    ("aten::quantile", datetime.date(2021, 1, 31)),
-    ("aten::nanquantile", datetime.date(2021, 1, 31)),
-    ("aten::make_dual", datetime.date(2021, 2, 20)),
-    ("aten::unpack_dual", datetime.date(2021, 2, 20)),
-    ("aten::_fft_with_size", datetime.date(2021, 1, 31)),
-    ("aten::thnn_conv_depthwise2d_backward", datetime.date(2021, 1, 31)),
-    ("aten::slow_conv3d_backward", datetime.date(2021, 1, 31)),
-    ("aten::thnn_conv2d_backward", datetime.date(2021, 1, 31)),
-    ("aten::slow_conv_transpose3d_backward", datetime.date(2021, 1, 31)),
-    ("aten::slow_conv_transpose2d_backward", datetime.date(2021, 1, 31)),
-    ("aten::set_", datetime.date(2021, 1, 31)),
-    ("aten::native_layer_norm", datetime.date(2021, 1, 31)),
-    ("aten::native_layer_norm_backward", datetime.date(2021, 1, 31)),
-    ("aten::elu_backward", datetime.date(2021, 1, 31)),
-    ("aten::_multinomial_alias_setup", datetime.date(2021, 1, 31)),
-    ("aten::_multinomial_alias_draw", datetime.date(2021, 1, 31)),
-    ("prim::profile_optional", datetime.date(2021, 1, 31)),
-    ("aten::fake_quantize_per_tensor_affine_backward", datetime.date(2021, 2, 20)),
-    ("aten::fake_quantize_per_channel_affine_backward", datetime.date(2021, 2, 20)),
     ("aten::rowwise_prune", datetime.date(9999, 1, 1)),
-    ("aten::_mode*", datetime.date(2021, 5, 2)),
-    ("aten::linalg_multi_dot", datetime.date(2021, 3, 25)),
-    ("aten::coalesce", datetime.date(2021, 4, 15)),
-    ("aten::empty_meta", datetime.date(2021, 4, 1)),
-    ("aten::div", datetime.date(2021, 4, 28)),
-    ("aten::divide", datetime.date(2021, 4, 28)),
-    ("aten::_var", datetime.date(2021, 5, 28)),
-    ("aten::_std", datetime.date(2021, 5, 28)),
-    ("aten::batch_norm_backward_elemt", datetime.date(2021, 5, 1)),
-    ("aten::assert_async", datetime.date(2021, 5, 1)),
-    ("aten::cumprod_backward", datetime.date(2021, 5, 1)),
     ("aten::_triangular_solve_helper", datetime.date(9999, 1, 1)),
-    ("aten::_addmv_impl_", datetime.date(2021, 5, 15)),
     ("aten::adaptive_avg_pool3d_backward", datetime.date(9999, 1, 1)),
     ("aten::_embedding_bag_dense_backward", datetime.date(9999, 1, 1)),
-    ("aten::_amp_update_scale", datetime.date(2021, 6, 1)),
     ("aten::randperm", datetime.date(9999, 1, 1)),
-    ("aten::linalg_vector_norm", datetime.date(2021, 5, 15)),
-    ("aten::repeat_interleave", datetime.date(2021, 6, 26)),
-    ("aten::one_hot", datetime.date(2021, 6, 15)),
-    ("aten::slice", datetime.date(2021, 6, 30)),
-    ("aten::conj", datetime.date(2021, 8, 1)),
-    ("aten::_conj", datetime.date(2021, 8, 1)),
-    ("aten::conj.out", datetime.date(2021, 8, 1)),
-    ("aten::segment_reduce_backward", datetime.date(2021, 6, 15)),
-    ("aten::segment_reduce", datetime.date(2021, 8, 26)),
-    ("aten::_segment_reduce_backward", datetime.date(2021, 8, 26)),
-    ("aten::thnn_conv_depthwise2d", datetime.date(2021, 8, 27)),
-    ("aten::thnn_conv_depthwise2d.out", datetime.date(2021, 8, 27)),
-    ("aten::thnn_conv_depthwise2d_forward", datetime.date(2021, 8, 27)),
-    ("aten::thnn_conv_depthwise2d_forward.out", datetime.date(2021, 8, 27)),
-    ("aten::thnn_conv_depthwise2d_backward", datetime.date(2021, 8, 27)),
-    ("aten::thnn_conv_depthwise2d_backward.out", datetime.date(2021, 8, 27)),
-    ("aten::_view_as_real_physical", datetime.date(2021, 8, 27)),
-    ("aten::_view_as_real_physical", datetime.date(2021, 8, 1)),
-    ("aten::_bmm", datetime.date(2021, 8, 14)),
-    ("aten::_bmm.out", datetime.date(2021, 8, 14)),
-    ("aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams", datetime.date(2021, 8, 15)),
-    ("aten::_cumsum", datetime.date(2021, 8, 31)),
-    ("aten::_cumprod", datetime.date(2021, 8, 31)),
 ]
 
 ALLOW_LIST_COMPILED = [

From c9d6ca4c54c7aba7ccb3df728c41161817b9a936 Mon Sep 17 00:00:00 2001
From: Danielle Pintz <38207072+daniellepintz@users.noreply.github.com>
Date: Tue, 7 Sep 2021 11:34:08 -0700
Subject: [PATCH 526/530] Add space in Feature Request issue template (#64563)

Summary:
Add space between emoji and text in Feature Request issue template

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64563

Reviewed By: janeyx99

Differential Revision: D30779429

Pulled By: seemethere

fbshipit-source-id: 3625299923a7022fa66473633524a6620d58188b
---
 .github/ISSUE_TEMPLATE/feature-request.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md
index e1d2bc306eae8..6e1432dbd4474 100644
--- a/.github/ISSUE_TEMPLATE/feature-request.md
+++ b/.github/ISSUE_TEMPLATE/feature-request.md
@@ -1,5 +1,5 @@
 ---
-name: "\U0001F680Feature Request"
+name: "\U0001F680 Feature Request"
 about: Submit a proposal/request for a new PyTorch feature
 
 ---

From 7e4ebe06cab4aedc66c501239ca568a0d28fd755 Mon Sep 17 00:00:00 2001
From: Kevin Tse <ktse@fb.com>
Date: Tue, 7 Sep 2021 11:34:27 -0700
Subject: [PATCH 527/530] Fixes issue related torch.trapezoid broadcasting
 behavior and documentation (#64054)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64054

Fixes #63608

cc mruberry rgommers heitorschueroff

Test Plan: Imported from OSS

Reviewed By: saketh-are

Differential Revision: D30617078

Pulled By: NivekT

fbshipit-source-id: 815896ec56d447562790df4d662e94fd13457e2a
---
 aten/src/ATen/native/Integration.cpp | 37 ++++++++++++++++++++++------
 test/test_binary_ufuncs.py           |  9 +++++++
 torch/_torch_docs.py                 | 11 +++++----
 3 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/aten/src/ATen/native/Integration.cpp b/aten/src/ATen/native/Integration.cpp
index 262519f69a61c..e57dc4505df4e 100644
--- a/aten/src/ATen/native/Integration.cpp
+++ b/aten/src/ATen/native/Integration.cpp
@@ -52,7 +52,21 @@ Tensor do_cumulative_trapezoid(const Tensor& y, double dx, int64_t dim) {
 
     return (dx /2. * (left + right)).cumsum(dim);
 }
-
+// Given the current shape of a Tensor and a target number of dimensions,
+// returns a new shape with the same values as the original shape,
+// but with '1's padded in the beginning to match the target number of dimensions.
+// For example, curr_shape = (5,5,5) and target_n_dim = 6 ==> (1,1,1,5,5,5)
+// Note that no padding will be added if the current shape has the greater than or equal
+// number of dimensions than the target numbers of dimensions.
+DimVector add_padding_to_shape(IntArrayRef curr_shape, int64_t target_n_dim) {
+    if (curr_shape.size() >= target_n_dim)
+        target_n_dim = curr_shape.size();
+    DimVector new_shape(target_n_dim, 1);
+    for (decltype(curr_shape.size()) i = 0; i < curr_shape.size(); i++) {
+        new_shape[target_n_dim-i-1] = curr_shape[curr_shape.size()-i-1];
+    }
+    return new_shape;
+}
 }
 
 Tensor trapezoid(const Tensor& y, const Tensor& x, int64_t dim) {
@@ -71,9 +85,15 @@ Tensor trapezoid(const Tensor& y, const Tensor& x, int64_t dim) {
         // Note: This behavior differs from numpy in that numpy tries to
         // broadcast 'dx', but this tries to broadcast 'x' to match 'y' instead.
         TORCH_CHECK(x.size(0) == y.size(dim), "trapezoid: There must be one `x` value for each sample point");
-        DimVector sizes(y.dim(), 1);
-        sizes[dim] = x.size(0);
-        x_viewed = x.view(sizes);
+        DimVector new_sizes(y.dim(), 1); // shape = [1] * y.
+        new_sizes[dim] = x.size(0); // shape[axis] = d.shape[0]
+        x_viewed = x.view(new_sizes);
+    } else if (x.dim() < y.dim()) {
+        // When 'y' has more dimension than 'x', this step takes 'x' with dimension (n_1, n_2, ...),
+        // and add '1's as dimensions in front to become (1, 1, ..., n_1, n_2), matching the dimension of 'y'.
+        // This allows the subsequent slicing operations to proceed with any 'dim' without going out of bound.
+        DimVector new_sizes = add_padding_to_shape(x.sizes(), y.dim());
+        x_viewed = x.view(new_sizes);
     } else {
         x_viewed = x;
     }
@@ -110,9 +130,12 @@ Tensor cumulative_trapezoid(const Tensor& y, const Tensor& x, int64_t dim) {
     Tensor x_viewed;
     if (x.dim() == 1) {
         TORCH_CHECK(x.size(0) == y.size(dim), "cumulative_trapezoid: There must be one `x` value for each sample point");
-        DimVector sizes(y.dim(), 1); // shape = [1] * y.
-        sizes[dim] = x.size(0); // shape[axis] = d.shape[0]
-        x_viewed = x.view(sizes);
+        DimVector new_sizes(y.dim(), 1); // shape = [1] * y.
+        new_sizes[dim] = x.size(0); // shape[axis] = d.shape[0]
+        x_viewed = x.view(new_sizes);
+    } else if (x.dim() < y.dim()) {
+        DimVector new_sizes = add_padding_to_shape(x.sizes(), y.dim());
+        x_viewed = x.view(new_sizes);
     } else {
         x_viewed = x;
     }
diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py
index f8c36adf8b781..7153902841aa5 100644
--- a/test/test_binary_ufuncs.py
+++ b/test/test_binary_ufuncs.py
@@ -2668,6 +2668,11 @@ def test_x(sizes, dim, x, device):
         test_x((1, 10), 0, [1.0], device)
         test_x((0, 2), 0, [], device)
         test_x((0, 2), 1, [1.0, 2.0], device)
+        test_x((2, 3, 4), -1, [1.0, 2.0, 3.0, 4.0], device)
+        test_x((2, 3, 4), 0, [1.0, 2.0], device)
+        test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device)
+        test_x((2, 3, 4), 2, [1.0, 2.0, 3.0, 4.0], device)
+        test_x((2, 2, 4), -1, [[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]], device)
         with self.assertRaisesRegex(
                 IndexError,
                 'Dimension out of range'):
@@ -2726,6 +2731,10 @@ def test_empty_x(sizes, dim, x, device):
         test_x((10, 2), 0, [2.0, 3.0, 4.0, 7.0, 11.0, 14.0, 22.0, 26.0, 26.1, 30.3], device)
         test_x((1, 10), 0, [1.0], device)
         test_x((0, 2), 1, [1, 2], device)
+        test_x((2, 3, 4), -1, [1.0, 2.0, 3.0, 4.0], device)
+        test_x((2, 3, 4), 0, [1.0, 2.0], device)
+        test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device)
+        test_x((2, 3, 4), 2, [1.0, 2.0, 3.0, 4.0], device)
 
         test_empty_x((0, 2), 0, [], device)  # SciPy failing when x == [], but our version returns empty
 
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 7dca8a7bdedbd..13112a9101296 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -10876,11 +10876,12 @@ def merge_dicts(*dicts):
         \sum_{i = 1}^{n-1} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1})
     \end{aligned}
 
-When :attr:`y` is two or more dimensions, this computation is performed independently
-along dimension :attr:`dim`. If :attr:`x` is also specified and is one-dimensional,
-then that dimension defines the spacing for each computation.
-If :attr:`x` is also specified and is not one-dimensional, then it is broadcast to
-the shape of :attr:`y` and the corresponding sizes are used for each computation.
+When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed.
+The broadcasting behavior of this function is as follows when their sizes are different. For both :attr:`x`
+and :attr:`y`, the function computes the difference between consecutive elements along
+dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have
+the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1.
+After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule.
 See the examples below for details.
 
 .. note::

From 8407ce7e382b009547f3c0d282540006afcfa1b9 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@fb.com>
Date: Tue, 7 Sep 2021 12:30:16 -0700
Subject: [PATCH 528/530] [small BE] .github: refactor concurrency into a
 common macro (#64587)

Summary:
By using a macro for these concurrency groups, we can edit just one place for the linux and windows workflows (vs 2).

I wanted to loop all the other workflow files in as well, but since those aren't generated, the macros won't work the same way.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/64587

Reviewed By: mrshenli

Differential Revision: D30783224

Pulled By: janeyx99

fbshipit-source-id: ae16ebb12d2d63a563d28f0ce88e280f68ed4b9b
---
 .github/templates/common.yml.j2              | 6 ++++++
 .github/templates/linux_ci_workflow.yml.j2   | 4 +---
 .github/templates/windows_ci_workflow.yml.j2 | 4 +---
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2
index 07ad771346399..aff01377ff665 100644
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@@ -5,6 +5,12 @@
 {# squid_no_proxy is a list of common set of fixed domains or IPs that we don't need to proxy. See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/http_proxy_config.html#windows-proxy #}
 {%- set squid_no_proxy = "localhost,127.0.0.1,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%}
 
+{%- macro concurrency(build_environment) -%}
+concurrency:
+  group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+{%- endmacro -%}
+
 {%- macro display_ec2_information() -%}
       - name: Display EC2 information
         shell: bash
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 2d856704c3137..520a6a00a19f6 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -45,9 +45,7 @@ env:
   ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
   PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
 
-concurrency:
-  group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
+!{{ common.concurrency(build_environment) }}
 
 jobs:
 {%- if ciflow_config.enabled %}
diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2
index 4f486dd75f3b7..20fe72238ffeb 100644
--- a/.github/templates/windows_ci_workflow.yml.j2
+++ b/.github/templates/windows_ci_workflow.yml.j2
@@ -60,9 +60,7 @@ env:
   USE_CUDA: 1
 {%- endif %}
 
-concurrency:
-  group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
+!{{ common.concurrency(build_environment) }}
 
 jobs:
 {%- if ciflow_config.enabled %}

From 337c71be05f959799a305164e6edf86c686bb673 Mon Sep 17 00:00:00 2001
From: Anirudh Dagar <anirudhdagar6@gmail.com>
Date: Tue, 7 Sep 2021 12:34:15 -0700
Subject: [PATCH 529/530] Array API: Add `torch.linalg.matmul` alias to
 `torch.matmul` (#63227)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/62811

Add `torch.linalg.matmul` alias to `torch.matmul`. Note that the `linalg.matmul` doesn't have a `method` variant.

Also cleaning up `torch/_torch_docs.py` when formatting is not needed.

cc IvanYashchuk Lezcano mruberry rgommers

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63227

Reviewed By: mrshenli

Differential Revision: D30770235

Pulled By: mruberry

fbshipit-source-id: bfba77dfcbb61fcd44f22ba41bd8d84c21132403
---
 aten/src/ATen/core/aten_interned_strings.h    |  1 -
 aten/src/ATen/core/interned_strings.h         |  2 ++
 aten/src/ATen/native/LinearAlgebra.cpp        |  9 ++++++++
 aten/src/ATen/native/native_functions.yaml    |  8 +++++++
 docs/source/linalg.rst                        |  1 +
 torch/_torch_docs.py                          | 22 +++++++++----------
 torch/csrc/jit/passes/normalize_ops.cpp       |  1 +
 torch/linalg/__init__.py                      |  6 +++++
 torch/overrides.py                            |  1 +
 .../_internal/common_methods_invocations.py   |  1 +
 10 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h
index 6da99dfc6a4d9..df6b860a8a363 100644
--- a/aten/src/ATen/core/aten_interned_strings.h
+++ b/aten/src/ATen/core/aten_interned_strings.h
@@ -453,7 +453,6 @@ _(aten, margin_ranking_loss) \
 _(aten, masked_fill) \
 _(aten, masked_scatter) \
 _(aten, masked_select) \
-_(aten, matmul) \
 _(aten, matrix_rank) \
 _(aten, matrix_exp) \
 _(aten, max) \
diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h
index 8d49d82c5c8f4..e7aef155a5656 100644
--- a/aten/src/ATen/core/interned_strings.h
+++ b/aten/src/ATen/core/interned_strings.h
@@ -211,6 +211,8 @@ namespace c10 {
   _(aten, linalg_norm)               \
   _(aten, linalg_vector_norm)        \
   _(aten, linalg_matrix_norm)        \
+  _(aten, matmul)                    \
+  _(aten, linalg_matmul)             \
   _(aten, append)                    \
   _(aten, item)                      \
   _(aten, format)                    \
diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp
index 2ae6202ce87e2..59950b987900f 100644
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@@ -1567,6 +1567,15 @@ Tensor& matmul_out(const Tensor & tensor1, const Tensor & tensor2, Tensor &resul
   return result;
 }
 
+// torch.linalg.matmul, alias for torch.matmul
+Tensor linalg_matmul(const Tensor & tensor1, const Tensor & tensor2) {
+  return at::native::matmul(tensor1, tensor2);
+}
+
+Tensor& linalg_matmul_out(const Tensor & tensor1, const Tensor & tensor2, Tensor &result) {
+  return at::native::matmul_out(tensor1, tensor2, result);
+}
+
 // helper methods for matrix_exp
 namespace {
 
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 3a1f75c588a83..dbacca2750850 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -10272,6 +10272,14 @@
   dispatch:
     CPU, CUDA: linalg_lstsq_out
 
+# torch.linalg.matmul, alias for torch.matmul
+- func: linalg_matmul(Tensor self, Tensor other) -> Tensor
+  python_module: linalg
+  variants: function
+
+- func: linalg_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: linalg
+
 - func: linalg_slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
   python_module: linalg
   variants: function
diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst
index 7a286d3d4051e..ffca583b706e9 100644
--- a/docs/source/linalg.rst
+++ b/docs/source/linalg.rst
@@ -67,6 +67,7 @@ Matrix Products
     :toctree: generated
     :nosignatures:
 
+    matmul
     matrix_power
     multi_dot
     householder_product
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 13112a9101296..6a5f61cfa1a7d 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -149,7 +149,7 @@ def merge_dicts(*dicts):
 absolute(input, *, out=None) -> Tensor
 
 Alias for :func:`torch.abs`
-""".format(**common_args))
+""")
 
 add_docstr(torch.acos, r"""
 acos(input, *, out=None) -> Tensor
@@ -211,7 +211,7 @@ def merge_dicts(*dicts):
 arccosh(input, *, out=None) -> Tensor
 
 Alias for :func:`torch.acosh`.
-""".format(**common_args))
+""")
 
 add_docstr(torch.add, r"""
 add(input, other, *, alpha=1, out=None) -> Tensor
@@ -2269,7 +2269,7 @@ def merge_dicts(*dicts):
 clip(input, min=None, max=None, *, out=None) -> Tensor
 
 Alias for :func:`torch.clamp`.
-""".format(**common_args))
+""")
 
 add_docstr(torch.column_stack,
            r"""
@@ -4468,7 +4468,7 @@ def merge_dicts(*dicts):
 inverse(input, *, out=None) -> Tensor
 
 Alias for :func:`torch.linalg.inv`
-""".format(**common_args))
+""")
 
 add_docstr(torch.isin, r"""
 isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor
@@ -5701,7 +5701,7 @@ def merge_dicts(*dicts):
 matrix_power(input, n, *, out=None) -> Tensor
 
 Alias for :func:`torch.linalg.matrix_power`
-""".format(**common_args))
+""")
 
 add_docstr(torch.matrix_exp, r"""
 matrix_exp(input) -> Tensor
@@ -6667,7 +6667,7 @@ def merge_dicts(*dicts):
 multiply(input, other, *, out=None)
 
 Alias for :func:`torch.mul`.
-""".format(**common_args))
+""")
 
 add_docstr(torch.multinomial,
            r"""
@@ -7017,7 +7017,7 @@ def merge_dicts(*dicts):
 negative(input, *, out=None) -> Tensor
 
 Alias for :func:`torch.neg`
-""".format(**common_args))
+""")
 
 add_docstr(torch.nextafter,
            r"""
@@ -7390,7 +7390,7 @@ def merge_dicts(*dicts):
 polygamma(n, input, *, out=None) -> Tensor
 
 Alias for :func:`torch.special.polygamma`.
-""".format(**common_args))
+""")
 
 add_docstr(torch.positive,
            r"""
@@ -8249,7 +8249,7 @@ def merge_dicts(*dicts):
 row_stack(tensors, *, out=None) -> Tensor
 
 Alias of :func:`torch.vstack`.
-""".format(**common_args))
+""")
 
 add_docstr(torch.round,
            r"""
@@ -9974,7 +9974,7 @@ def merge_dicts(*dicts):
 true_divide(dividend, divisor, *, out) -> Tensor
 
 Alias for :func:`torch.div` with ``rounding_mode=None``.
-""".format(**common_args))
+""")
 
 add_docstr(torch.trunc,
            r"""
@@ -10090,7 +10090,7 @@ def merge_dicts(*dicts):
 fix(input, *, out=None) -> Tensor
 
 Alias for :func:`torch.trunc`
-""".format(**common_args))
+""")
 
 add_docstr(torch.unsqueeze,
            r"""
diff --git a/torch/csrc/jit/passes/normalize_ops.cpp b/torch/csrc/jit/passes/normalize_ops.cpp
index 5ac36e1f1b76f..67637031868c1 100644
--- a/torch/csrc/jit/passes/normalize_ops.cpp
+++ b/torch/csrc/jit/passes/normalize_ops.cpp
@@ -102,6 +102,7 @@ const std::unordered_map<Symbol, Symbol>& getOperatorAliasMap() {
       {aten::divide_, aten::div_},
       {aten::multiply, aten::mul},
       {aten::multiply_, aten::mul_},
+      {aten::linalg_matmul, aten::matmul},
       {aten::true_divide, aten::div},
       {aten::true_divide_, aten::div_},
       {aten::concat, aten::cat},
diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py
index df3507f1b3561..f98930e471630 100644
--- a/torch/linalg/__init__.py
+++ b/torch/linalg/__init__.py
@@ -1284,6 +1284,12 @@
     tensor([ 3.1623, 10.0000, 17.2627])
 """)
 
+matmul = _add_docstr(_linalg.linalg_matmul, r"""
+linalg.matmul(input, other, *, out=None) -> Tensor
+
+Alias for :func:`torch.matmul`
+""")
+
 multi_dot = _add_docstr(_linalg.linalg_multi_dot, r"""
 linalg.multi_dot(tensors, *, out=None)
 
diff --git a/torch/overrides.py b/torch/overrides.py
index aca14a6d4552b..1bb98507f18b1 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -599,6 +599,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.masked_scatter: lambda input, mask, source: -1,
         torch.masked_select: lambda input, mask, out=None: -1,
         torch.matmul: lambda input, other, out=None: -1,
+        torch.linalg.matmul: lambda input, other, out=None: -1,  # alias for torch.matmul
         torch.matrix_power: lambda input, n: -1,
         torch.linalg.matrix_power: lambda input, n, out=None: -1,
         torch.matrix_rank: lambda input, tol=None, symmetric=False: -1,
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 5113346a20e56..b38d4afc4af3a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -7116,6 +7116,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            supports_out=False,
            ),
     OpInfo('matmul',
+           aliases=('linalg.matmul',),
            dtypes=floating_types(),
            dtypesIfCPU=all_types_and_complex(),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []),

From acc9f9afc8f2be70d7f5d3248ca1760e0336b3b8 Mon Sep 17 00:00:00 2001
From: "haozhe.zhu" <haozhe.zhu@intel.com>
Date: Tue, 7 Sep 2021 12:59:00 -0700
Subject: [PATCH 530/530] enable bf16 mkldnn path for gemm (#61891)

Summary:
# Goal: Integrate mkldnn bf16 Gemm to pytorch

## BF16 Suport for mm, addmm, bmm, addbmm, baddbmm, mv, addmv, dot (with mkldnn matmul primitive):
https://oneapi-src.github.io/oneDNN/group__dnnl__api__matmul.html
For gemm related ops, we keep all inputs under plain format. So we will not introduce opaque tensor for these ops to save mem copy here.

![mkldnn bf16 gemm integration](https://user-images.githubusercontent.com/54701539/126263077-4b5134e1-52a7-4fad-94fb-19e13a0377f6.png)

The minimized integration is only dispatch to mkldnn in addmm, but for gemm with 3-D input (with additional dim for"batch") this will call mkldnn gemm for "batch" times. Since mkldnn matmul support input with multiple dims, we directly dispatch to mkldnn gemm in {bmm, addbmm, baddbmm} to reduce the time to create mkldnn memory desc, primitive, etc.

For the different definition for "bias" between mkldnn(which must be shape of (1, N)) and pytorch (which can be same shape with gemm result (M, N)), we use a fused sum to handle it.

## User Case:
User case is exactly same with before because no opaque tensor's is introduced. Since the pytorch has already support bf16 data type with CPU tensor before, we can leverage the existed bf16 gemm UT.

## Gemm performance gain on CPX 28Cores/Socket:
Note: data is collected using PyTorch operator benchmarks: https://github.com/pytorch/pytorch/tree/master/benchmarks/operator_benchmark (with adding bfloat16 dtype)

### use 1 thread on 1 core
### torch.addmm (M, N) * (N, K) + (M, K)
| impl |16x16x16|32x32x32| 64x64x64 | 128x128x128| 256x256x256| 512x512x512|1024x1024x1024|
|:---:|:---:| :---: | :---: | :---: | :---: | :---: | :---: |
| aten-fp32| 4.115us|4.583us|8.230us|26.972us|211.857us|1.458ms|11.258ms|
| aten-bf16 | 15.812us| 105.087us|801.787us|3.767ms|20.274ms|122.440ms|836.453ms|
| mkldnn-bf16 |20.561us |22.510us|24.551us|37.709us|143.571us|0.835ms|5.76ms|

We can see mkldnn-bf16 are better than aten bf16, but for smaller shapes, mkldnn bf16 are not better than aten fp32. This is because onednn overhead, this overhead more like a "constant" overhead and while problems get larger, we can ignore it. Also we are continue optimize the kernel efficiency and decrease the overhead as well.

More shapes
| impl |1x2048x2048|2048x1x2048| 2048x2048x1 |
|:---:|:---:| :---: | :---: |
| aten-fp32| 0.640ms|3.794ms|0.641ms|
| aten-bf16 | 2.924ms| 3.868ms|23.413ms|
| mkldnn-bf16 |0.335ms |4.490ms|0.368ms|

### use 1 socket (28 thread, 28 core)
| impl | 256x256x256| 512x512x512|1024x1024x1024| 2048x2048x2048|4096x4096x4096|
|:---:| :---: | :---: | :---: | :---: | :---: |
| aten-fp32| 35.943us |140.315us|643.510us|5.827ms|41.761ms|
| mkldnn-bf16 |53.432us|114.716us|421.858us|2.863ms|23.029ms|

More shapes
| impl |128x2048x2048|2048x128x2048| 2048x2048x128 |
|:---:|:---:| :---: | :---: |
| aten-fp32| 0.561ms|0.458ms|0.406ms|
| mkldnn-bf16 |0.369ms |0.331ms|0.239ms|

We dose not show aten-bf16 for this case since aten-bf16 always compute as single thread and the performance is extreme poor. The trend for this case is similar for 1 thread on 1 core.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/61891

Reviewed By: iramazanli

Differential Revision: D29998114

Pulled By: VitalyFedyunin

fbshipit-source-id: 459dc5874c638d62f290c96684ca0a694ded4b5a
---
 aten/src/ATen/native/Blas.cpp                 | 34 ++++++-
 aten/src/ATen/native/LinearAlgebra.cpp        | 33 +++++++
 aten/src/ATen/native/mkldnn/Matmul.cpp        | 99 +++++++++++++++++++
 aten/src/ATen/native/mkldnn/Matmul.h          | 17 ++++
 aten/src/ATen/native/mkldnn/Utils.h           | 26 +++++
 test/test_linalg.py                           | 51 +++++-----
 tools/build_variables.bzl                     |  1 +
 .../_internal/common_methods_invocations.py   |  8 +-
 8 files changed, 242 insertions(+), 27 deletions(-)
 create mode 100644 aten/src/ATen/native/mkldnn/Matmul.cpp
 create mode 100644 aten/src/ATen/native/mkldnn/Matmul.h

diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp
index 114de632a384a..eb025f47e9d76 100644
--- a/aten/src/ATen/native/Blas.cpp
+++ b/aten/src/ATen/native/Blas.cpp
@@ -3,6 +3,12 @@
 #include <ATen/Dispatch.h>
 #include <ATen/NamedTensorUtils.h>
 #include <ATen/ScalarOps.h>
+#include <ATen/Config.h>
+
+#if AT_MKLDNN_ENABLED()
+#include <ATen/native/mkldnn/Utils.h>
+#include <ATen/native/mkldnn/Matmul.h>
+#endif // AT_MKLDNN_ENABLED
 
 namespace at {
 namespace meta {
@@ -62,6 +68,19 @@ TORCH_IMPL_FUNC(addmv_out_cpu)(const Tensor &self, const Tensor &mat, const Tens
       at::native::copy_(const_cast<Tensor&>(result), *self_);
     }
     if (result.numel() != 0) {
+
+#if AT_MKLDNN_ENABLED()
+      NoNamesGuard guard;
+      // mkldnn matmul expect dim >= 2
+      auto vec_ = vec.unsqueeze(1);
+      if (use_mkldnn_bf16_gemm(mat, vec_, /*result=*/Tensor())){
+        mkldnn_matmul(mat, vec_, result.unsqueeze_(1), beta_.to<float>(), alpha_.to<float>());
+        // recover tensor's dim = 1
+        result.squeeze_(1);
+        return;
+      }
+#endif // AT_MKLDNN_ENABLED
+
       auto r_stride = result.stride(0);
       AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, mat.scalar_type(), "addmv_impl_cpu", [&] {
         auto beta = beta_.to<scalar_t>();
@@ -148,7 +167,20 @@ Tensor dot(const Tensor &self, const Tensor &other){
   at::NoNamesGuard guard;
   dot_check(self, other);
 
-  return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, self.scalar_type(), "dot", [&] {
+#if AT_MKLDNN_ENABLED()
+  // mkldnn matmul expect dim >= 2
+  auto self_ = self.unsqueeze(0);
+  auto other_= other.unsqueeze(1);
+  if (use_mkldnn_bf16_gemm(self_, other_, /*result=*/Tensor())){
+    // mkldnn matmul expect result have sizes info to create ideep tensor
+    auto r =  at::empty({1, 1}, self.options());
+    mkldnn_matmul(self_, other_, r, /*beta=*/0);
+    // recovery tensor's dim = 1
+    return r.squeeze_();
+  }
+#endif // AT_MKLDNN_ENABLED
+
+  return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "dot", [&] {
     Tensor result = at::empty({}, self.options());
     result.fill_(dot_impl<scalar_t>(self.numel(), self.data_ptr<scalar_t>(), self.stride(0), other.data_ptr<scalar_t>(), other.stride(0)));
     return result;
diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp
index 59950b987900f..0576bd667c3f6 100644
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@@ -23,6 +23,10 @@
 #include <limits>
 #include <numeric>
 
+#if AT_MKLDNN_ENABLED()
+#include <ATen/native/mkldnn/Utils.h>
+#include <ATen/native/mkldnn/Matmul.h>
+#endif // AT_MKLDNN_ENABLED
 
 namespace at {
 namespace meta {
@@ -1050,6 +1054,21 @@ static void addmm_impl_cpu_(
   // Always ensure the conjugation for c is resolved since there's no way to specify c's conjugation in the gemm call
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c.is_conj());
 
+#if AT_MKLDNN_ENABLED()
+  if (use_mkldnn_bf16_gemm(a, b, c)){
+    if (transpose_c){
+      // m1, m2 are swapped
+      mkldnn_matmul(b, a, c, beta.to<float>(), alpha.to<float>());
+    } else {
+      mkldnn_matmul(a, b, c, beta.to<float>(), alpha.to<float>());
+    }
+    if (!c.is_same(result)) {
+      result.copy_(c);
+    }
+    return;
+  }
+#endif // AT_MKLDNN_ENABLED
+
   // Apply BLAS routine
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16,
       result.scalar_type(), "addmm_impl_cpu_",
@@ -1104,6 +1123,13 @@ static void addbmm_impl_(
     return;
   }
 
+#if AT_MKLDNN_ENABLED()
+  if (use_mkldnn_bf16_gemm(batch1, batch2, result)){
+    mkldnn_matmul(batch1, batch2, result, beta.to<float>(), alpha.to<float>());
+    return;
+  }
+#endif // AT_MKLDNN_ENABLED
+
   auto adjusted_beta(beta);
   for (int64_t batch = 0; batch < num_batches; ++batch) {
     result.addmm_(batch1[batch], batch2[batch], adjusted_beta, alpha);
@@ -1254,6 +1280,13 @@ static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor&
             || (strides[1] == 1 && strides[2] >= sizes[1]);
   };
 
+#if AT_MKLDNN_ENABLED()
+  if (use_mkldnn_bf16_gemm(batch1, batch2, self_or_result)){
+    mkldnn_matmul(batch1, batch2, self_or_result, beta.to<float>(), alpha.to<float>());
+    return self_or_result;
+  }
+#endif // AT_MKLDNN_ENABLED
+
   if (contraction_size * res_rows * res_cols < 400) {
     if (is_bmm_out) {
       AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, batch1.scalar_type(), "bmm", [&] {
diff --git a/aten/src/ATen/native/mkldnn/Matmul.cpp b/aten/src/ATen/native/mkldnn/Matmul.cpp
new file mode 100644
index 0000000000000..5327ce821ff1e
--- /dev/null
+++ b/aten/src/ATen/native/mkldnn/Matmul.cpp
@@ -0,0 +1,99 @@
+#include <ATen/ATen.h>
+#include <ATen/Config.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/native/mkldnn/Matmul.h>
+#if !AT_MKLDNN_ENABLED()
+
+namespace at {
+namespace native {
+
+void mkldnn_matmul(
+    const Tensor &mat1,
+    const Tensor &mat2,
+    Tensor &result,
+    float beta,
+    float alpha) {
+  TORCH_CHECK(false, "mkldnn_matmul: ATen not compiled with MKLDNN support");
+}
+} // namespace native
+} // namespace at
+
+#else // AT_MKLDNN_EBABLED
+
+#include <ATen/native/mkldnn/MKLDNNCommon.h>
+#include <ATen/native/mkldnn/Utils.h>
+
+namespace at {
+namespace native {
+
+void mkldnn_matmul(
+    const Tensor &mat1,
+    const Tensor &mat2,
+    const Tensor &result,
+    float beta,
+    float alpha) {
+  TORCH_CHECK((mat1.dim() == 2 && mat2.dim() == 2) || (mat1.dim() == 3 && mat2.dim() == 3),
+    "mkldnn_matmul:  expect mat1 to be 2-D or 3-D tensor");
+  TORCH_CHECK(mat1.scalar_type() == at::kBFloat16 &&
+   mat2.scalar_type() == at::kBFloat16 &&
+   result.scalar_type() == at::kBFloat16, "mkldnn_matmul:  only enabled for bf16 path");
+  TORCH_CHECK(mkldnn_bf16_device_check(),
+    "mkldnn_matmul: mkldnn_matmul bf16 path needs the cpu support avx512bw, avx512vl and avx512dq");
+  ideep::attr_t op_attr;
+  // "addmm", "addbmm" "baddbmm" in pytorch allow bias to be 2-D or 3-D tensor
+  // but mkldnn matmul primitive only support bias be 1-D tensors
+  // to address their differences, we use mkldnn post ops to perform a fused "add" after matrix multiplication is over
+  if (beta != 0.0f) op_attr = ideep::attr_t::fuse_sum();
+  // If alpha = 0, dose not need actually do gemm computation
+  if (alpha == 0)
+    return;
+
+  auto is_mkldnn_optimized_format = [&](const Tensor& t) {
+    if (t.is_contiguous()) return true;
+    const auto sizes = t.sizes();
+    const auto strides = t.strides();
+    if (t.dim() == 2){
+      return strides[0] == 1 && strides[1] == sizes[0];
+    } else {
+      // dim = 3
+      return strides[0] == sizes[1] * sizes[2] && strides[1] == 1 && strides[2] == sizes[1];
+    }
+  };
+
+  // Mkldnn only optimized for contiguous or transposed (transpose last 2 dim if 3-D tensor) format now
+  // Will remove this "contiguous" after mkldnn have fully supported
+  Tensor mat1_ = is_mkldnn_optimized_format(mat1) ? mat1 : mat1.contiguous();
+  Tensor mat2_ = is_mkldnn_optimized_format(mat2) ? mat2 : mat2.contiguous();
+  Tensor mat1_reshaped = mat1_;
+  Tensor mat2_reshaped = mat2_;
+  if (result.dim() == 2 && mat1.dim() == 3 && mat2.dim() == 3){
+    // addbmm(batch1*batch2) [b,n,m] * [b,m,p] = [n,p] can be treated as:
+    // [n, b*m] * [b*m, p] = [n, p]
+    // For batch1: reorder from [b, n, m] to [n, b, m], reshape to [n, b*m]
+    // For batch2: reshape from [b, m, p] to [b*m, p]
+    auto mat1_size = mat1.sizes();
+    auto mat2_size = mat2.sizes();
+    mat1_ = mat1_size[0] > 1 ? mat1_.transpose(0, 1) : mat1_;
+    mat1_reshaped = mat1_.reshape({mat1_size[1], mat1_size[0] * mat1_size[2]});
+    mat2_reshaped = mat2_.reshape({mat2_size[0] * mat2_size[1], mat2_size[2]});
+ }
+
+  // mkldnn_matmul only proceed CPU tensor
+  const ideep::tensor x = itensor_view_from_dense(mat1_reshaped);
+  const ideep::tensor w = itensor_view_from_dense(mat2_reshaped);
+  ideep::tensor y = itensor_view_from_dense(result);
+  ideep::matmul_forward::compute(x, w, y, alpha, beta,
+      ideep::scale_t(), ideep::scale_t(), ideep::scale_t(), op_attr);
+  if (y.get_data_handle() != result.data_ptr()){
+    // ideep will query onednn expect format of output
+    // if given output format is not expected, ideep will re-init an output buffer
+    // under this case, we need copy the re-inited buffer back to given buffer
+    ideep::tensor public_y = itensor_view_from_dense(result);
+    y.reorder_to(public_y);
+  }
+}
+
+} // namespace native
+} // namespace at
+
+#endif // AT_MKLDNN_EBABLED
diff --git a/aten/src/ATen/native/mkldnn/Matmul.h b/aten/src/ATen/native/mkldnn/Matmul.h
new file mode 100644
index 0000000000000..8cd5b5a9b3aeb
--- /dev/null
+++ b/aten/src/ATen/native/mkldnn/Matmul.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include <ATen/Config.h>
+
+namespace at { namespace native {
+
+// result = beta * result + alpha * gemm(mat1, mat2)
+// need mat, mat2 to be 2-D or 3-D Tensors
+TORCH_API void mkldnn_matmul(
+        const Tensor &mat1,
+        const Tensor &mat2,
+        const Tensor &result,
+        float beta=1,
+        float alpha=1);
+
+}}
diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h
index abfafd5230e98..49d51b286c097 100644
--- a/aten/src/ATen/native/mkldnn/Utils.h
+++ b/aten/src/ATen/native/mkldnn/Utils.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <ATen/ATen.h>
 #include <c10/util/ArrayRef.h>
 #include <ATen/ATen.h>
 #include <vector>
@@ -28,4 +29,29 @@ inline bool mkldnn_bf16_device_check() {
       && cpuinfo_has_x86_avx512vl() && cpuinfo_has_x86_avx512dq();
 }
 
+inline bool use_mkldnn_bf16_gemm(
+    const Tensor& mat1,
+    const Tensor& mat2,
+    const c10::optional<Tensor>& result_opt) {
+  c10::MaybeOwned<Tensor> result_maybe_owned = at::borrow_from_optional_tensor(result_opt);
+  const Tensor& result = *result_maybe_owned;
+
+  static const int64_t mkldnn_gemm_min_size = 16 * 16 * 16;
+  // if dim = 2, mat1's size = (m * n), mat2's size = (n * k)
+  // else dim = 3, mat1's size = (b * m * n), mat2's size = (b * n * k)
+  // only m * n * k are large enough we can get benefit from mkldnn optimized gemm kernel
+  // if some cases pytorch dose not have default impl for bf16 (such as "dot"), will use mkldnn impl anyway
+  int64_t m = mat1.dim() == 2? mat1.size(0) : mat1.size(1);
+  int64_t n = mat1.dim() == 2? mat1.size(1) : mat1.size(2);
+  int64_t k = mat2.dim() == 2? mat2.size(1) : mat2.size(2);
+  return (
+    mat1.scalar_type() == kBFloat16 &&
+    mat2.scalar_type() == kBFloat16 &&
+    (!result.defined() || result.scalar_type() == kBFloat16) &&
+    mat1.numel() != 0 &&
+    mat2.numel() != 0 &&
+    mkldnn_bf16_device_check() &&
+    m * n * k >= mkldnn_gemm_min_size);
+}
+
 }
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 2b543431174d2..5912111da4c0a 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -3991,8 +3991,14 @@ def _test_dot_vdot_vs_numpy(self, device, dtype, torch_fn, np_fn):
         def check(x, y):
             # Compare with numpy
             res = torch_fn(x, y)
-            ref = torch.from_numpy(np.array(np_fn(x.cpu().numpy(), y.cpu().numpy())))
-            self.assertEqual(res.cpu(), ref)
+            if x.dtype == torch.bfloat16:
+                ref = torch.from_numpy(np.array(np_fn(x.cpu().float().numpy(), y.cpu().float().numpy())))
+            else:
+                ref = torch.from_numpy(np.array(np_fn(x.cpu().numpy(), y.cpu().numpy())))
+            if res.dtype == torch.bfloat16:
+                self.assertEqual(res.cpu(), ref.bfloat16())
+            else:
+                self.assertEqual(res.cpu(), ref)
 
             # Test out variant
             out = torch.empty_like(res)
@@ -4005,19 +4011,20 @@ def check(x, y):
         check(x, y)
 
         # Contiguous
-        x = torch.randn(10, dtype=dtype, device=device)
-        y = torch.randn(10, dtype=dtype, device=device)
+        x = torch.randn(200, dtype=dtype, device=device)
+        y = torch.randn(200, dtype=dtype, device=device)
         check(x, y)
 
         # 0 strided
-        y = torch.randn(1, dtype=dtype, device=device).expand(10)
+        y = torch.randn(1, dtype=dtype, device=device).expand(200)
         check(x, y)
 
         # 2 strided
         check(x[::2], y[::2])
 
-    @dtypes(torch.float, torch.cfloat)
-    @precisionOverride({torch.cfloat: 1e-4, torch.float32: 5e-5})
+    @dtypes(torch.float, torch.cfloat, torch.bfloat16)
+    @dtypesIfCUDA(torch.float, torch.cfloat)
+    @precisionOverride({torch.cfloat: 1e-4, torch.float32: 5e-5, torch.bfloat16: 1e-0})
     def test_dot_vs_numpy(self, device, dtype):
         self._test_dot_vdot_vs_numpy(device, dtype, torch.dot, np.dot)
 
@@ -6164,12 +6171,12 @@ def genf_int(x, y):
             return torch.randint(0, 100, (x, y), dtype=dtype, device=device)
 
         def genf_bfloat(x, y):
-            return torch.randn(x, y, dtype=torch.float32, device=device).to(dtype)
+            return torch.randn(x, y, dtype=torch.float32, device=device).to(dtype) * 0.1
 
         def genf_float(x, y):
             return torch.randn(x, y, dtype=dtype, device=device)
 
-        for (n, m, p) in [(20, 10, 5), (15, 5, 10), (5, 18, 10)]:
+        for (n, m, p) in [(20, 10, 15), (15, 20, 10), (25, 18, 10)]:
             if (dtype == torch.int32) or (dtype == torch.int64):
                 genf = genf_int
             elif (dtype == torch.bfloat16):
@@ -6240,7 +6247,7 @@ def test_bmm(self, device, dtype):
             return
 
         batch_sizes = [1, 10]
-        M, N, O = 23, 8, 12
+        M, N, O = 23, 15, 12
         numpy_dtype = dtype if dtype != torch.bfloat16 else torch.float32
 
         is_supported = True
@@ -6262,8 +6269,8 @@ def invert_perm(p):
         def generate_inputs(num_batches):
             # transposed tensors
             for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2):
-                b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1)
-                b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1)
+                b1 = make_tensor((num_batches, M, N), device, dtype, low=-0.1, high=0.1)
+                b2 = make_tensor((num_batches, N, O), device, dtype, low=-0.1, high=0.1)
                 b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1))
                 b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2))
                 yield b1, b2
@@ -6271,8 +6278,8 @@ def generate_inputs(num_batches):
             for b1, b2, b3, b4, b5, b6 in itertools.product((True, False), repeat=6):
                 shape1 = (num_batches if b1 else 1, M if b2 else 1, N if b3 else 1)
                 shape2 = (num_batches if b4 else 1, N if b5 else 1, O if b6 else 1)
-                b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N)
-                b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O)
+                b1 = make_tensor(shape1, device, dtype, low=-0.1, high=0.1).expand(num_batches, M, N)
+                b2 = make_tensor(shape2, device, dtype, low=-0.1, high=0.1).expand(num_batches, N, O)
                 yield b1, b2
             # zero-sized tensors
             for z1, z2, z3, z4 in itertools.product((True, False), repeat=4):
@@ -6352,7 +6359,7 @@ def test_addbmm(self, device, dtype):
             return
 
         num_batches = 2
-        M, N, O = 2, 3, 4
+        M, N, O = 16, 17, 18
 
         is_supported = True
         if dtype == torch.bfloat16:
@@ -6378,8 +6385,8 @@ def generate_tensor():
             # transposed tensors
             for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2):
                 for perm3 in itertools.permutations((0, 1)):
-                    b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1)
-                    b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1)
+                    b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) * 0.1
+                    b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) * 0.1
                     b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1))
                     b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2))
                     ref = torch.from_numpy(
@@ -6391,8 +6398,8 @@ def generate_tensor():
             for s1, s2, s3, s4, s5, s6 in itertools.product((True, False), repeat=6):
                 shape1 = (num_batches if s1 else 1, M if s2 else 1, N if s3 else 1)
                 shape2 = (num_batches if s4 else 1, N if s5 else 1, O if s6 else 1)
-                b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N)
-                b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O)
+                b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) * 0.1
+                b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) * 0.1
                 ref = torch.from_numpy(
                     b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()
                 ).to(device=device, dtype=dtype).sum(0)
@@ -6402,8 +6409,8 @@ def generate_tensor():
             for z1, z2, z3, z4 in itertools.product((True, False), repeat=4):
                 shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0)
                 shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0)
-                b1 = make_tensor(shape1, device, dtype, low=-1, high=1)
-                b2 = make_tensor(shape2, device, dtype, low=-1, high=1)
+                b1 = make_tensor(shape1, device, dtype, low=-1, high=1) * 0.1
+                b2 = make_tensor(shape2, device, dtype, low=-1, high=1) * 0.1
                 ref = torch.from_numpy(
                     b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()
                 ).to(device=device, dtype=dtype).sum(0)
@@ -6425,7 +6432,7 @@ def test_baddbmm(self, device, dtype):
             return
 
         num_batches = 10
-        M, N, O = 12, 8, 5
+        M, N, O = 12, 8, 50
 
         is_supported = True
         if dtype == torch.bfloat16 and self.device_type == 'cuda':
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index c4731570e6d77..363503d89f9f5 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -865,6 +865,7 @@ aten_cpu_source_non_codegen_list = [
     "aten/src/ATen/native/mkldnn/TensorShape.cpp",
     "aten/src/ATen/native/mkldnn/UnaryOps.cpp",
     "aten/src/ATen/native/mkldnn/Utils.cpp",
+    "aten/src/ATen/native/mkldnn/Matmul.cpp",
     "aten/src/ATen/native/quantized/cpu/init_qnnpack.cpp",
     "aten/src/ATen/record_function.cpp",
     "aten/src/ATen/SavedTensorHooks.cpp",
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index b38d4afc4af3a..0db9bb508ee40 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -5939,14 +5939,14 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
            ),
            sample_inputs_func=sample_inputs_baddbmm),
     OpInfo('dot',
-           dtypes=all_types_and_complex_and(torch.float16),
+           dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []),
            assert_autodiffed=True,
            sample_inputs_func=sample_inputs_dot_vdot,
            supports_forward_ad=True,
            ),
     OpInfo('vdot',
-           dtypes=all_types_and_complex_and(torch.float16),
+           dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []),
            sample_inputs_func=sample_inputs_dot_vdot,
            supports_forward_ad=True,
@@ -7118,7 +7118,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
     OpInfo('matmul',
            aliases=('linalg.matmul',),
            dtypes=floating_types(),
-           dtypesIfCPU=all_types_and_complex(),
+           dtypesIfCPU=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []),
            dtypesIfROCM=floating_types_and(torch.half, torch.bfloat16),
            backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
@@ -7910,7 +7910,7 @@ def gradcheck_wrapper_triangular_input(op, input, *args, upper=False, **kwargs):
     OpInfo('__rmatmul__',
            op=torch.Tensor.__rmatmul__,
            dtypes=floating_types(),
-           dtypesIfCPU=all_types_and_complex(),
+           dtypesIfCPU=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else [],
                                            torch.complex64, torch.complex128),
            backward_dtypesIfCUDA=floating_types_and(torch.float16,