From 045ebc771d5070696f839e586285ace9c06f1339 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 7 Sep 2022 05:52:27 +0000 Subject: [PATCH 01/45] [BE] Use `teardown-linux`/`chown` actions for binary builds (#84449) Also embed `wait_for_ssh_to_drain.sh` into the action (to make it more reusable across repos) and delete unused teardown_linux template from `common.yml` Also, in `_binary-test-linux.yml` move artifact download step after repo checkout, to make errors during that step more parseable Pull Request resolved: https://github.com/pytorch/pytorch/pull/84449 Approved by: https://github.com/kit1980 --- .github/actions/teardown-linux/action.yml | 13 ++++++++- .github/scripts/wait_for_ssh_to_drain.sh | 13 --------- .github/templates/common.yml.j2 | 23 ---------------- .github/workflows/_binary-build-linux.yml | 32 ++++++----------------- .github/workflows/_binary-test-linux.yml | 32 +++++++---------------- 5 files changed, 30 insertions(+), 83 deletions(-) delete mode 100755 .github/scripts/wait_for_ssh_to_drain.sh diff --git a/.github/actions/teardown-linux/action.yml b/.github/actions/teardown-linux/action.yml index 9238a073a6b6..024bb3c5f5c5 100644 --- a/.github/actions/teardown-linux/action.yml +++ b/.github/actions/teardown-linux/action.yml @@ -16,7 +16,18 @@ runs: # Always hold for active ssh sessions shell: bash if: inputs.skip-wait-ssh == '' - run: .github/scripts/wait_for_ssh_to_drain.sh + run: | + set -eou pipefail + + echo "Holding runner for 2 hours until all ssh sessions have logged out" + for _ in $(seq 1440); do + # Break if no ssh session exists anymore + if [ "$(who)" = "" ]; then + break + fi + echo "." + sleep 5 + done - name: Kill containers, clean up images shell: bash diff --git a/.github/scripts/wait_for_ssh_to_drain.sh b/.github/scripts/wait_for_ssh_to_drain.sh deleted file mode 100755 index f33d80764033..000000000000 --- a/.github/scripts/wait_for_ssh_to_drain.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash - -set -eou pipefail - -echo "Holding runner for 2 hours until all ssh sessions have logged out" -for _ in $(seq 1440); do - # Break if no ssh session exists anymore - if [ "$(who)" = "" ]; then - break - fi - echo "." - sleep 5 -done diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index b80b82f5d610..37f89ef9c40d 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -199,29 +199,6 @@ on: env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" {%- endmacro -%} -{%- macro teardown_ec2_linux(pytorch_directory="") -%} - - name: Hold runner for 2 hours or until ssh sessions have drained -{%- if pytorch_directory %} - working-directory: !{{ pytorch_directory }} -{%- endif %} - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af -{%- endmacro -%} - {%- macro teardown_rocm_linux() -%} - name: Kill containers, clean up images if: always() diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index dc69e3a82258..8ddd4af29d71 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -167,11 +167,9 @@ jobs: run: | echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" + uses: ./pytorch/.github/actions/pull-docker-image + with: + docker-image: ${{ inputs.DOCKER_IMAGE }} - name: Build PyTorch binary run: | set -x @@ -204,10 +202,7 @@ jobs: docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/${{ inputs.PACKAGE_TYPE }}/build.sh" - name: Chown artifacts if: always() - shell: bash - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + uses: ./pytorch/.github/actions/chown-workspace - uses: actions/upload-artifact@v3 with: @@ -216,21 +211,10 @@ jobs: path: ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions + - name: Teardown Linux if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh + uses: ./pytorch/.github/actions/teardown-linux + - name: Chown workspace if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af + uses: ./pytorch/.github/actions/chown-workspace diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index e8749c59d58c..9efd2a17b266 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -139,13 +139,6 @@ jobs: rm -rf "${GITHUB_WORKSPACE}" mkdir "${GITHUB_WORKSPACE}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: ${{ inputs.build_name }} - path: "${{ runner.temp }}/artifacts/" - - - name: Checkout PyTorch to pytorch dir uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -171,6 +164,12 @@ jobs: git clean -fxd working-directory: builder + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: ${{ inputs.build_name }} + path: "${{ runner.temp }}/artifacts/" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG uses: nick-fields/retry@7d4a37704547a311dbb66ebdf5b23ec19374a767 if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' }} @@ -192,21 +191,10 @@ jobs: - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions + - name: Teardown Linux if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh + uses: ./pytorch/.github/actions/teardown-linux + - name: Chown workspace if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af + uses: ./pytorch/.github/actions/chown-workspace From 07d398fb269eebe314ae898287494a2bfdc7f278 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 7 Sep 2022 09:33:37 +0000 Subject: [PATCH 02/45] [composite compliance] linalg_householder_product (#84180) Ref: #69991 Pull Request resolved: https://github.com/pytorch/pytorch/pull/84180 Approved by: https://github.com/zou3519 --- functorch/test/test_ops.py | 14 ++- torch/csrc/autograd/FunctionsManual.cpp | 97 ++++++++++++++----- .../_internal/opinfo/definitions/linalg.py | 6 -- 3 files changed, 84 insertions(+), 33 deletions(-) diff --git a/functorch/test/test_ops.py b/functorch/test/test_ops.py index 445df4b9ec03..8d69fe7e22b5 100644 --- a/functorch/test/test_ops.py +++ b/functorch/test/test_ops.py @@ -675,7 +675,6 @@ def test_vmapvjp(self, device, dtype, op): xfail('_masked.prod'), # .item or data-dependent control flow xfail('nn.functional.soft_margin_loss', ''), # soft_margin_loss_backward does not support forward-ad - xfail('linalg.householder_product'), # output with shape [5, 5] doesn't match the broadcast shape [2, 5, 5] xfail('tensor_split'), # data_ptr composite compliance xfail('quantile'), # at::equal batching rule (cpu), also, in-place vmap (cuda) skip('as_strided'), # Test runner cannot handle this @@ -705,6 +704,10 @@ def test_vmapvjp(self, device, dtype, op): @opsToleranceOverride('TestOperators', 'test_vmapjvpall', ( tol1('nn.functional.conv_transpose3d', {torch.float32: tol(atol=2e-04, rtol=9e-3)}, device_type='cuda'), + tol1('linalg.householder_product', + {torch.float32: tol(atol=2e-04, rtol=9e-3)}, device_type='cuda'), + tol1('linalg.householder_product', + {torch.float32: tol(atol=2e-04, rtol=1e-4)}, device_type='cpu'), )) @skipOps('TestOperators', 'test_vmapjvpall', vmapjvpall_fail) @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)}) @@ -1323,9 +1326,6 @@ def fn(input, weight, bias): @ops(op_db + additional_op_db, allowed_dtypes=(torch.float32, torch.double)) @skipOps('TestOperators', 'test_vmap_autograd_grad', { - # call inplace functions - xfail('linalg.householder_product'), # inplace - xfail('linalg.eig'), # all close? 
# The size of tensor a (4) must match the size of tensor b (10) at non-singleton dimension 0 xfail('masked_select'), @@ -1349,6 +1349,12 @@ def fn(input, weight, bias): skip('native_layer_norm', '', device_type='cpu'), xfail('as_strided_scatter', ''), }) + @opsToleranceOverride('TestOperators', 'test_vmap_autograd_grad', ( + tol1('linalg.householder_product', + {torch.float32: tol(atol=5e-04, rtol=9e-03)}, device_type='cuda'), + tol1('linalg.householder_product', + {torch.float32: tol(atol=1e-04, rtol=1e-04)}, device_type='cpu'), + )) def test_vmap_autograd_grad(self, device, dtype, op): def is_differentiable(inp): return isinstance(inp, Tensor) and (inp.grad_fn is not None or inp.requires_grad) diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 39e068c23bbd..091dc1417a71 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -5115,10 +5115,6 @@ std::tuple householder_product_backward( if (!grad.defined() || !input_.numel() || !tau.numel()) { return std::tuple(Tensor(), Tensor()); } - - auto input_grad = at::zeros_like(input_); - auto tau_grad = at::zeros_like(tau); - auto m = input_.size(-2); auto k = tau.size(-1); @@ -5191,23 +5187,70 @@ std::tuple householder_product_backward( // K <- H_0^{-1} @ K K = apply_householder_reflector( 0, input.narrow(-1, 0, 1), sigma.narrow(-1, 0, 1), K, /*left=*/true); - for (const auto i : c10::irange(k)) { - // NOTE: narrow will unsqueeze(-1) - auto v_i = input.narrow(-1, i, 1); - auto t_i = tau.narrow(-1, i, 1); - - Tensor v_i_grad, tau_i_grad; - std::tie(v_i_grad, tau_i_grad) = update_grad(i, v_i, t_i, K); - input_grad.select(-1, i).copy_(v_i_grad.squeeze(-1)); - tau_grad.select(-1, i).copy_(tau_i_grad.squeeze(-1)); - - // K <- H_{i + 1}^{-1} @ K @ H_i - if (i < k - 1) { - auto v_i_next = input.narrow(-1, i + 1, 1); - auto s_i_next = sigma.narrow(-1, i + 1, 1); - K = apply_householder_reflector( - i + 1, v_i_next, s_i_next, K, /*left=*/true); - K = apply_householder_reflector(i, v_i, t_i, K, /*left=*/false); + + Tensor input_grad, tau_grad; + // For Composite Compliance, we can't copy a Subclass into a Regular Tensor, + // so we use out-of-place ops with equivalent output. + // NOTE: We can't use `new_zeros` directly as `input`, 'tau' or `grad` can + // be Tensor Subclass and we don't want to make assumption about which + // one to choose for creating output buffer. + // eg. if both are BatchedTensor at different level. + if (areAnyTensorSubclassLike({input, tau, K})) { + std::vector input_grads = {}; + std::vector tau_grads = {}; + for (const auto i : c10::irange(k)) { + // NOTE: narrow will unsqueeze(-1) + auto v_i = input.narrow(-1, i, 1); + auto t_i = tau.narrow(-1, i, 1); + + Tensor v_i_grad, tau_i_grad; + std::tie(v_i_grad, tau_i_grad) = update_grad(i, v_i, t_i, K); + input_grads.push_back(v_i_grad.squeeze(-1)); + tau_grads.push_back(tau_i_grad.squeeze(-1)); + + // K <- H_{i + 1}^{-1} @ K @ H_i + if (i < k - 1) { + auto v_i_next = input.narrow(-1, i + 1, 1); + auto s_i_next = sigma.narrow(-1, i + 1, 1); + K = apply_householder_reflector( + i + 1, v_i_next, s_i_next, K, /*left=*/true); + K = apply_householder_reflector(i, v_i, t_i, K, /*left=*/false); + } + } + + input_grad = at::stack(input_grads, -1); + tau_grad = at::stack(tau_grads, -1); + + // Only first k columns are active in forward. + // zero gradients for the inactive input. 
+ if (k < input.size(-1)) { + auto input_sizes = input_.sizes(); + at::DimVector new_sizes(input_sizes); + new_sizes[input_.dim() - 1] = input.size(-1) - k; + auto zeros = at::zeros(new_sizes, input_.options()); + input_grad = at::cat({input_grad, zeros}, -1); + } + } else { + input_grad = at::zeros_like(input_); + tau_grad = at::zeros_like(tau); + for (const auto i : c10::irange(k)) { + // NOTE: narrow will unsqueeze(-1) + auto v_i = input.narrow(-1, i, 1); + auto t_i = tau.narrow(-1, i, 1); + + Tensor v_i_grad, tau_i_grad; + std::tie(v_i_grad, tau_i_grad) = update_grad(i, v_i, t_i, K); + input_grad.select(-1, i).copy_(v_i_grad.squeeze(-1)); + tau_grad.select(-1, i).copy_(tau_i_grad.squeeze(-1)); + + // K <- H_{i + 1}^{-1} @ K @ H_i + if (i < k - 1) { + auto v_i_next = input.narrow(-1, i + 1, 1); + auto s_i_next = sigma.narrow(-1, i + 1, 1); + K = apply_householder_reflector( + i + 1, v_i_next, s_i_next, K, /*left=*/true); + K = apply_householder_reflector(i, v_i, t_i, K, /*left=*/false); + } } } @@ -5299,10 +5342,18 @@ Tensor householder_product_jvp( H_plus = apply_householder_reflector(v_i, sigma_i, H_plus, /*left=*/true); - dprod.add_(H_minus.matmul( + // `H_minus_dH_i_H_plus` = H_1 * ... * H_{i-1} dH_i * H_{i+1} * ... + auto H_minus_dH_i_H_plus = H_minus.matmul( apply_simple_product(v_i, v_i, dtau_i, H_plus) + apply_simple_product(dv_i, v_i, tau_i, H_plus) + - apply_simple_product(v_i, dv_i, tau_i, H_plus))); + apply_simple_product(v_i, dv_i, tau_i, H_plus)); + // For Composite Compliance, if `intermediate` is a Tensor-Subclass, + // we use out-of-place variant of add. + if (at::isTensorSubclassLike(H_minus_dH_i_H_plus)) { + dprod = dprod.add(H_minus_dH_i_H_plus); + } else { + dprod.add_(H_minus_dH_i_H_plus); + } H_minus = apply_householder_reflector(v_i, tau_i, H_minus, /*left=*/false); } diff --git a/torch/testing/_internal/opinfo/definitions/linalg.py b/torch/testing/_internal/opinfo/definitions/linalg.py index 0b5fb0e2ae1b..443693bdb43a 100644 --- a/torch/testing/_internal/opinfo/definitions/linalg.py +++ b/torch/testing/_internal/opinfo/definitions/linalg.py @@ -1564,12 +1564,6 @@ def make_input(): DecorateInfo( toleranceOverride({torch.complex64: tol(atol=1e-3, rtol=1e-3)}) ), - DecorateInfo( - unittest.expectedFailure, "TestCompositeCompliance", "test_backward" - ), - DecorateInfo( - unittest.expectedFailure, "TestCompositeCompliance", "test_forward_ad" - ), ], ), OpInfo( From ec3939a62f7e09807e0e7e9701c354c94aef7a66 Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Wed, 7 Sep 2022 12:53:08 +0000 Subject: [PATCH 03/45] Detect `__code__` a bit more reliably. (#84610) Based on Ed's patch. 
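The previous code checked `hasattr(f, '__code__')` but then read `co_flags` from `inspect.unwrap(f).__code__`; a wrapped callable can make those two objects disagree. A minimal illustration of the mismatch (a hedged sketch, not taken from this patch; the `Wrapper` class is made up):

```python
import functools
import inspect

class Wrapper:
    """A callable object standing in for some wrapper that forwards calls."""
    def __call__(self, *args, **kwargs):
        return self.__wrapped__(*args, **kwargs)

def f(x):
    return x + 1

g = functools.wraps(f)(Wrapper())              # sets g.__wrapped__ = f, but g itself has no __code__
print(hasattr(g, "__code__"))                  # False: the outer object exposes no code object
print(hasattr(inspect.unwrap(g), "__code__"))  # True: unwrapping follows __wrapped__ back to f
```

Checking and reading `__code__` on the same `inspect.unwrap(f)` object avoids that inconsistency.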
Fixes https://github.com/pytorch/pytorch/issues/84570 Pull Request resolved: https://github.com/pytorch/pytorch/pull/84610 Approved by: https://github.com/Chillee --- functorch/test/test_pythonkey.py | 21 +++++++++++++++++++++ torch/fx/experimental/proxy_tensor.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/functorch/test/test_pythonkey.py b/functorch/test/test_pythonkey.py index 8fbfa94839bc..b2bd74348f7b 100644 --- a/functorch/test/test_pythonkey.py +++ b/functorch/test/test_pythonkey.py @@ -120,6 +120,27 @@ def f(x): new_cotangent = torch.randn(()) self.assertEqual(fx_f(new_cotangent, True, True), vjp_fn(new_cotangent)) + def test_make_fx_functionalize(self, device): + from functorch.experimental import functionalize + + def fn(a): + a = a * 2 + a.relu_() + return a + + a = torch.randn(3, device=device) + symbolic_gm = torch.fx.symbolic_trace(fn) + includes_method_relu_ = any( + str(n.target) == "relu_" for n in symbolic_gm.graph.nodes + ) + self.assertTrue(includes_method_relu_) + # Also verifies fix for https://github.com/pytorch/pytorch/issues/84570 + gm = make_fx(functionalize(symbolic_gm))(a) + includes_aten_relu = any( + n.target == torch.ops.aten.relu.default for n in gm.graph.nodes + ) + self.assertTrue(includes_aten_relu) + def test_make_fx_no_decompose(self, device): # FIXME return self.skipTest("error: maximum recursion reached") diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py index 0678d308a832..bbecccc38456 100644 --- a/torch/fx/experimental/proxy_tensor.py +++ b/torch/fx/experimental/proxy_tensor.py @@ -607,7 +607,7 @@ def wrap_fake_symbolic(x, sym_shape): else: args = pytree.tree_map(wrap_fn_map[tracing_mode], args) - if not hasattr(f, '__code__') or inspect.unwrap(f).__code__.co_flags & inspect.CO_VARARGS: + if not hasattr(inspect.unwrap(f), '__code__') or inspect.unwrap(f).__code__.co_flags & inspect.CO_VARARGS: # FX doesn't support varargs, so we gotta fake up a wrapper # TODO: Would be nice to fix this at the source... func = fake_signature(f, len(phs)) From 9e7af4e8d4540c6034806e84fec64d08643031bd Mon Sep 17 00:00:00 2001 From: Mateusz Sypniewski Date: Wed, 7 Sep 2022 13:01:51 +0000 Subject: [PATCH 04/45] Add alias info to torch._C (#84580) This adds the `AliasInfo` class to torch._C, as defined in https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/python/init.cpp#L1943. This will fix MYPY errors for missing `Argument` attributes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/84580 Approved by: https://github.com/lw --- torch/_C/__init__.pyi.in | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 716659427d61..6ec595fd299a 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -629,6 +629,13 @@ class Graph: ... +# Defined in torch/aten/src/ATen/core/alias_info.h +class AliasInfo: + is_write: _bool + before_set: Set[str] + after_set: Set[str] + + # Defined in torch/aten/src/ATen/core/function_schema.h class Argument: name: str @@ -637,6 +644,7 @@ class Argument: def has_default_value(self) -> _bool: ... kwarg_only : _bool is_out: _bool + alias_info: Optional[AliasInfo] ... 
class FunctionSchema: arguments: List[Argument] From 189768ed64561e61ff05c9e42adfa40139388204 Mon Sep 17 00:00:00 2001 From: CaoE Date: Wed, 7 Sep 2022 13:48:43 +0000 Subject: [PATCH 05/45] Add mkl implementation for exponential on CPU (#69967) ### Description Add mkl implementation for exponential on CPU to improve the performance of exponential. ### Testing data type: float32 single socket (28cores): ``` before: torch.Size([10, 128, 10, 124]) 0.065 ms torch.Size([10, 128, 20, 124]) 0.130 ms after: torch.Size([10, 128, 10, 124]) 5.9e-05 ms torch.Size([10, 128, 20, 124]) 0.000113 ms ``` single core: ``` before: torch.Size([10, 128, 10, 124]) 0.065 ms torch.Size([10, 128, 20, 124]) 0.130 ms after: torch.Size([10, 128, 10, 124]) 0.00117 ms torch.Size([10, 128, 20, 124]) 0.002347 ms ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/69967 Approved by: https://github.com/frank-wei --- .../ATen/native/cpu/DistributionKernels.cpp | 79 ++++++++++++++++++- test/distributions/test_distributions.py | 8 +- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index 4363cc9d62e3..617809e14292 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -103,11 +103,88 @@ void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { +static void exponential_kernel_default(TensorIteratorBase& iter, double lambda, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::exponential_kernel(iter, lambda, generator); } +#if !AT_MKL_ENABLED() +void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { + exponential_kernel_default(iter, lambda, gen); +} +#else +void exponential_kernel(TensorIteratorBase &iter, double lambda, c10::optional gen) { + Tensor self = iter.tensor(0); + if (lambda > 0 && !std::isinf(lambda) && !std::isnan(lambda) && cpuinfo_initialize() && + cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { + CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); + int64_t seed; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(generator->mutex_); + if (self.scalar_type() == at::kDouble) + seed = generator->random64(); + else + seed = generator->random(); + } + int64_t n = self.numel(); + bool contig = self.is_contiguous(); + + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "exponential_cpu", [&] { + at::Tensor tmp_tensor; + constexpr bool is_df = std::is_same::value || std::is_same::value; + if (is_df && contig) { + tmp_tensor = self; + } else if (std::is_same::value) { + tmp_tensor = at::empty(self.sizes(), self.options().dtype(at::kDouble)); + } else { + tmp_tensor = at::empty(self.sizes(), self.options().dtype(at::kFloat)); + } + + scalar_t *self_ptr = self.data_ptr(); + using tmp_scalar_t = typename std::conditional_t::value, double, float>; + tmp_scalar_t *sample_ptr = tmp_tensor.data_ptr(); + + auto sample = [&](int64_t begin, int64_t end) { + int64_t len = end - begin; + if (len > 0) { + VSLStreamStatePtr stream; + if (std::is_same::value) { + vslNewStream(&stream, VSL_BRNG_MCG31, seed); + vslSkipAheadStream(stream, begin); + vdRngExponential(VSL_RNG_METHOD_EXPONENTIAL_ICDF, stream, len, + (double *)(sample_ptr + begin), 0, 1./lambda); + 
vslDeleteStream(&stream); + } else { + vslNewStream(&stream, VSL_BRNG_MCG31, seed); + vslSkipAheadStream(stream, begin); + vsRngExponential(VSL_RNG_METHOD_EXPONENTIAL_ICDF, stream, len, + (float *) (sample_ptr + begin), 0, 1./lambda); + vslDeleteStream(&stream); + } + // vectorized copy if using buffer and contiguous + if (!is_df && contig) { + scalar_t *self_seg = self_ptr + begin; + tmp_scalar_t *tmp_seg = sample_ptr + begin; + at::vec::convert(tmp_seg, self_seg, len); + } + } + }; + + parallel_for(0, n, /* grain_size= */ 800, sample); + + // copy_ if using buffer and non contiguous + if (!contig) { + self.copy_(tmp_tensor); + } + }); + } else { + // The situation of AMD, move to using the default version + exponential_kernel_default(iter, lambda, gen); + } +} +#endif + static void geometric_kernel(TensorIteratorBase& iter, double p, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::geometric_kernel(iter, p, generator); diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index b6201d4d9e84..385420a67813 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -5062,7 +5062,10 @@ def f(*values): xfail = [ Cauchy, # aten::cauchy(Double(2,1), float, float, Generator) HalfCauchy, # aten::cauchy(Double(2, 1), float, float, Generator) - VonMises # Variance is not Euclidean + VonMises, # Variance is not Euclidean + Exponential, # mkl implementation path on intel cpu will produce diffrent results from jit.trace + Pareto, # base_distribution is Exponential + Weibull # base_distribution is Exponential ] if Dist in xfail: continue @@ -5093,6 +5096,9 @@ def f(*values): xfail = [ Cauchy, # aten::cauchy(Double(2,1), float, float, Generator) HalfCauchy, # aten::cauchy(Double(2, 1), float, float, Generator) + Exponential, # mkl implementation path on intel cpu will produce diffrent results from jit.trace + Pareto, # base_distribution is Exponential + Weibull # base_distribution is Exponential ] if Dist in xfail: continue From e4519548a5a5f4026645f4a240ac026094ef1be5 Mon Sep 17 00:00:00 2001 From: Shen Li Date: Tue, 6 Sep 2022 21:35:12 +0000 Subject: [PATCH 06/45] Supported nested lists in CommTensor and enable tracing allgather_ (#84585) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84585 Approved by: https://github.com/wanchaol --- test/distributed/test_c10d_common.py | 39 +++++++++++++++---- test/distributed/test_c10d_gloo.py | 19 +++++++-- test/distributed/test_c10d_nccl.py | 12 +++++- .../check_forward_backward_compatibility.py | 1 + torch/csrc/distributed/c10d/Ops.cpp | 21 +++++++--- 5 files changed, 73 insertions(+), 19 deletions(-) diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index d9b00d81ba4d..02ef113bcdb6 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -12,7 +12,7 @@ from functools import partial from itertools import product from sys import platform -from typing import Any, Tuple +from typing import Any, Callable, Tuple import torch import torch.distributed as dist @@ -53,7 +53,6 @@ tree_flatten, tree_map, tree_map_only, - tree_unflatten ) from torch.utils.checkpoint import checkpoint @@ -1431,6 +1430,9 @@ class CommTensor(torch.Tensor): It is specifically tailored for allreduce_ at the moment. 
""" + + _supported_comms = ["allreduce_", "allgather_"] + @staticmethod def __new__(cls, tensor: torch.Tensor): r = torch.Tensor._make_subclass( # type: ignore[attr-defined] @@ -1451,6 +1453,10 @@ def __repr__(self): # with ProxyTorchDispatchMode in make_fx __torch_function__ = _disabled_torch_function_impl + @classmethod + def _is_supported(cls, op_name): + return any([comm in op_name for comm in cls._supported_comms]) + @classmethod def __torch_dispatch__(cls, func, types, args=(), kwargs=None): # shared states when unwrapping args @@ -1516,7 +1522,7 @@ def mark_after_comm(work, e): unwrapped_args = tree_map(unwrap, args) unwrapped_kwargs = tree_map(unwrap, kwargs) - if "allreduce_" in func.__name__: + if cls._is_supported(func.__name__): if tracer is not None: # in tracing mode, get proxies for args proxy_args, proxy_kwargs = tree_map_only( @@ -1562,7 +1568,7 @@ def mark_after_comm(work, e): for a, o in zip(flat_args, flat_out): set_proxy_slot(a, tracer, get_proxy(o)) - return (tree_unflatten(flat_out, out_spec), out[1]) + return out else: # in eager mode, simply remember work handle as an attribute out = func(*unwrapped_args, **kwargs) @@ -1592,7 +1598,7 @@ def tearDown(self): def _get_process_group(self): raise NotImplementedError("To be implemented by subclass") - def _test_work_wait(self, x: torch.Tensor): + def _test_work_wait(self, x: torch.Tensor, comm_fn: Callable): pg = self._get_default_group() def fn(x: torch.Tensor) -> torch.Tensor: @@ -1600,12 +1606,12 @@ def fn(x: torch.Tensor) -> torch.Tensor: # all_reduce Python implementation, as the later will need more # discussion. y = CommTensor(x + x) - work = dist.all_reduce(y, group=pg, async_op=True) + work, z = comm_fn(y, group=pg) # this wait() will be ignored in tracing mode as # ProxyTorchDispatchMode only supports torch.Tensor, _ProxyTensor, # and torch.nn.Parameter objects work.wait() - return y * 2 + return z * 2 xx = x.clone() @@ -1629,7 +1635,7 @@ def fn(x: torch.Tensor) -> torch.Tensor: ]) commed |= all([ curr.op == "call_function", - "allreduce_" in curr.target.__name__ + CommTensor._is_supported(curr.target.__name__), ]) prev = curr.args[0] @@ -1652,6 +1658,23 @@ def fn(x: torch.Tensor) -> torch.Tensor: yy = traced_fn(xx) self.assertFalse(y.allclose(yy)) + def _test_allreduce_work_wait(self, tensor): + def comm_fn(tensor, group=None): + work = dist.all_reduce(tensor, group=group, async_op=True) + return work, tensor + + self._test_work_wait(tensor, comm_fn=comm_fn) + + def _test_allgather_work_wait(self, tensor): + def comm_fn(tensor, group=None): + out_tensors = [torch.zeros_like(tensor) for _ in range(group.size())] + work = dist.all_gather(out_tensors, tensor, group=group, async_op=True) + work.wait() + + return work, sum(out_tensors) + + self._test_work_wait(tensor, comm_fn=comm_fn) + if __name__ == "__main__": assert ( diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py index ec81c0f707e7..01f004f2b2de 100644 --- a/test/distributed/test_c10d_gloo.py +++ b/test/distributed/test_c10d_gloo.py @@ -2354,12 +2354,23 @@ def _get_default_group(self): store = c10d.FileStore(self.file_name, self.world_size) return c10d.ProcessGroupGloo(store, self.rank, self.world_size) - def test_work_wait_cpu(self): - self._test_work_wait(torch.ones(2, 2) * self.rank) + def test_allreduce_work_wait_cpu(self): + self._test_allreduce_work_wait(torch.ones(2, 2) * self.rank) @skip_if_lt_x_gpu(2) - def test_work_wait_gpu(self): - self._test_work_wait(torch.ones(2, 2, device=self.rank) * self.rank) + def 
test_allreduce_work_wait_gpu(self): + self._test_allreduce_work_wait( + torch.ones(2, 2, device=self.rank) * self.rank + ) + + def test_allgather_work_wait_cpu(self): + self._test_allgather_work_wait(torch.ones(2, 2) * self.rank) + + @skip_if_lt_x_gpu(2) + def test_allgather_work_wait_gpu(self): + self._test_allgather_work_wait( + torch.ones(2, 2, device=self.rank) * self.rank + ) if __name__ == "__main__": diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 7d01c95de520..276571128b6a 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -2818,8 +2818,16 @@ def _get_default_group(self): return c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @skip_if_lt_x_gpu(2) - def test_work_wait_gpu(self): - self._test_work_wait(torch.ones(2, 2, device=self.rank) * self.rank) + def test_allreduce_work_wait_gpu(self): + self._test_allgather_work_wait( + torch.ones(2, 2, device=self.rank) * self.rank, + ) + + @skip_if_lt_x_gpu(2) + def test_allgather_work_wait_gpu(self): + self._test_allgather_work_wait( + torch.ones(2, 2, device=self.rank) * self.rank + ) if __name__ == "__main__": diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index 3e2fcd00da39..cf3a25f82853 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -276,6 +276,7 @@ ("aten::sym_numel", datetime.date(2022, 10, 1)), # Distributed c10d ops are all going to be updated ("c10d::.*", datetime.date(2022, 10, 31)), + ("c10d::allgather_", datetime.date(2022, 10, 1)), ] ALLOW_LIST_COMPILED = [ diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index 13f57af18dc3..6f87069b94d7 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -38,15 +38,24 @@ allreduce_( std::move(tensor_vec), work); } -c10::intrusive_ptr allgather_( +std::tuple< + std::vector>, + c10::intrusive_ptr> +allgather_( const std::vector>& output_tensors, const std::vector& input_tensors, const c10::intrusive_ptr& process_group, int64_t timeout) { - return process_group->allgather( + auto work = process_group->allgather( const_cast>&>(output_tensors), const_cast&>(input_tensors), AllgatherOptions{std::chrono::milliseconds(timeout)}); + + // Copy output tensors (not storage) so that this can be used in a functional + // manner + return std::tuple< + std::vector>, + c10::intrusive_ptr>(output_tensors, work); } c10::intrusive_ptr reduce_scatter_( @@ -243,13 +252,15 @@ c10::intrusive_ptr allgather( const AllgatherOptions& opts) { static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("c10d::allgather_", "") - .typed( + .typed>, + c10::intrusive_ptr>( const std::vector>&, const std::vector&, const c10::intrusive_ptr<::c10d::ProcessGroup>&, int64_t)>(); - return op.call( - output_tensors, input_tensors, process_group, opts.timeout.count()); + return std::get<1>(op.call( + output_tensors, input_tensors, process_group, opts.timeout.count())); } c10::intrusive_ptr reduce_scatter( From a24d7a8565f5aac8448775552557112d0239fc8f Mon Sep 17 00:00:00 2001 From: Shen Li Date: Tue, 6 Sep 2022 21:35:12 +0000 Subject: [PATCH 07/45] Add reduce_scatter_ to CommTensor (#84592) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84592 Approved by: https://github.com/wanchaol --- 
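Like the allgather_ change before it, this makes the op return both the output tensors and the work handle so CommTensor can treat the collective functionally while tracing. The path these patches target is the one driven by `_test_work_wait` in `test_c10d_common.py`; roughly, with allreduce shown for brevity (a sketch assuming that test file's `CommTensor` helper, not an API shipped by this commit):

```python
import torch
import torch.distributed as dist
from torch.fx.experimental.proxy_tensor import make_fx

def fn(x):
    # CommTensor is the __torch_dispatch__ helper defined in test_c10d_common.py
    y = CommTensor(x + x)
    work = dist.all_reduce(y, async_op=True)  # recorded as a c10d::allreduce_ node when traced
    work.wait()                               # ignored while tracing; CommTensor handles the wait bookkeeping
    return y * 2

# eager:  fn(torch.ones(2, 2)) runs the collective and blocks on wait as usual
# traced: make_fx(fn)(torch.ones(2, 2)) captures the collective and its wait in the graph
```

reduce_scatter_ follows the same pattern: the op now returns `(output_tensors, work)` instead of only the work handle.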
test/distributed/test_c10d_common.py | 11 ++++++++++- test/distributed/test_c10d_nccl.py | 6 ++++++ torch/csrc/distributed/c10d/Ops.cpp | 17 ++++++++++++----- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 02ef113bcdb6..687de53950ef 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -1431,7 +1431,7 @@ class CommTensor(torch.Tensor): It is specifically tailored for allreduce_ at the moment. """ - _supported_comms = ["allreduce_", "allgather_"] + _supported_comms = ["allreduce_", "allgather_", "reduce_scatter_"] @staticmethod def __new__(cls, tensor: torch.Tensor): @@ -1675,6 +1675,15 @@ def comm_fn(tensor, group=None): self._test_work_wait(tensor, comm_fn=comm_fn) + def _test_reduce_scatter_work_wait(self, tensor): + def comm_fn(tensor, group=None): + in_tensors = [tensor.clone() + i for i in range(group.size())] + out_tensor = torch.zeros_like(tensor) + work = dist.reduce_scatter(out_tensor, in_tensors, group=group, async_op=True) + return work, out_tensor + + self._test_work_wait(tensor, comm_fn=comm_fn) + if __name__ == "__main__": assert ( diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 276571128b6a..937b2f593113 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -2829,6 +2829,12 @@ def test_allgather_work_wait_gpu(self): torch.ones(2, 2, device=self.rank) * self.rank ) + @skip_if_lt_x_gpu(2) + def test_reduce_scatter_work_wait_gpu(self): + self._test_reduce_scatter_work_wait( + torch.ones(2, 2, device=self.rank) * self.rank + ) + if __name__ == "__main__": assert ( diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index 6f87069b94d7..eb5286b0d2cc 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -58,17 +58,22 @@ allgather_( c10::intrusive_ptr>(output_tensors, work); } -c10::intrusive_ptr reduce_scatter_( +std::tuple, c10::intrusive_ptr> +reduce_scatter_( const std::vector& output_tensors, const std::vector>& input_tensors, const c10::intrusive_ptr& process_group, const c10::intrusive_ptr& reduce_op, int64_t timeout) { - return process_group->reduce_scatter( + auto work = process_group->reduce_scatter( const_cast&>(output_tensors), const_cast>&>(input_tensors), ReduceScatterOptions{ *reduce_op.get(), std::chrono::milliseconds(timeout)}); + + return std:: + tuple, c10::intrusive_ptr>( + output_tensors, work); } c10::intrusive_ptr reduce_( @@ -270,18 +275,20 @@ c10::intrusive_ptr reduce_scatter( const ReduceScatterOptions& opts) { static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("c10d::reduce_scatter_", "") - .typed( + .typed, + c10::intrusive_ptr>( const std::vector&, const std::vector>&, const c10::intrusive_ptr<::c10d::ProcessGroup>&, const c10::intrusive_ptr<::c10d::ReduceOp>&, int64_t)>(); - return op.call( + return std::get<1>(op.call( output_tensors, input_tensors, process_group, c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp), - opts.timeout.count()); + opts.timeout.count())); } c10::intrusive_ptr reduce( From f43c38bdc820650ad974bb1c48360b0c6931961a Mon Sep 17 00:00:00 2001 From: Shen Li Date: Tue, 6 Sep 2022 21:35:20 +0000 Subject: [PATCH 08/45] Add broadcast_ to CommTensor (#84604) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84604 Approved by: https://github.com/wanchaol --- test/distributed/test_c10d_common.py | 9 ++++++++- 
test/distributed/test_c10d_gloo.py | 17 ++++++++++++++++- test/distributed/test_c10d_nccl.py | 14 +++++++++++++- torch/csrc/distributed/c10d/Ops.cpp | 17 ++++++++++++----- 4 files changed, 49 insertions(+), 8 deletions(-) diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 687de53950ef..901fbea14017 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -1431,7 +1431,7 @@ class CommTensor(torch.Tensor): It is specifically tailored for allreduce_ at the moment. """ - _supported_comms = ["allreduce_", "allgather_", "reduce_scatter_"] + _supported_comms = ["allreduce_", "allgather_", "broadcast_", "reduce_scatter_"] @staticmethod def __new__(cls, tensor: torch.Tensor): @@ -1684,6 +1684,13 @@ def comm_fn(tensor, group=None): self._test_work_wait(tensor, comm_fn=comm_fn) + def _test_broadcast_work_wait(self, tensor): + def comm_fn(tensor, group=None): + work = dist.broadcast(tensor, src=0, group=group, async_op=True) + return work, tensor + + self._test_work_wait(tensor, comm_fn=comm_fn) + if __name__ == "__main__": assert ( diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py index 01f004f2b2de..cd5c61c997fa 100644 --- a/test/distributed/test_c10d_gloo.py +++ b/test/distributed/test_c10d_gloo.py @@ -2352,7 +2352,13 @@ def world_size(self): def _get_default_group(self): store = c10d.FileStore(self.file_name, self.world_size) - return c10d.ProcessGroupGloo(store, self.rank, self.world_size) + dist.init_process_group( + backend="gloo", + rank=self.rank, + world_size=self.world_size, + store=store, + ) + return dist.distributed_c10d._get_default_group() def test_allreduce_work_wait_cpu(self): self._test_allreduce_work_wait(torch.ones(2, 2) * self.rank) @@ -2372,6 +2378,15 @@ def test_allgather_work_wait_gpu(self): torch.ones(2, 2, device=self.rank) * self.rank ) + def test_broadcast_work_wait_cpu(self): + self._test_broadcast_work_wait(torch.ones(2, 2) * self.rank) + + @skip_if_lt_x_gpu(2) + def test_broadcast_work_wait_gpu(self): + self._test_broadcast_work_wait( + torch.ones(2, 2, device=self.rank) * self.rank + ) + if __name__ == "__main__": assert ( diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 937b2f593113..61b03d52a6a7 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -2815,7 +2815,13 @@ def world_size(self): def _get_default_group(self): store = c10d.FileStore(self.file_name, self.world_size) - return c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + dist.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store, + ) + return dist.distributed_c10d._get_default_group() @skip_if_lt_x_gpu(2) def test_allreduce_work_wait_gpu(self): @@ -2835,6 +2841,12 @@ def test_reduce_scatter_work_wait_gpu(self): torch.ones(2, 2, device=self.rank) * self.rank ) + @skip_if_lt_x_gpu(2) + def test_broadcast_work_wait_gpu(self): + self._test_broadcast_work_wait( + torch.ones(2, 2, device=self.rank) * self.rank + ) + if __name__ == "__main__": assert ( diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index eb5286b0d2cc..27e7ed7843d8 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -6,17 +6,22 @@ namespace c10d { namespace { -c10::intrusive_ptr broadcast_( +std::tuple, c10::intrusive_ptr> +broadcast_( at::TensorList tensors, const c10::intrusive_ptr& process_group, int64_t root_rank, 
int64_t root_tensor, int64_t timeout) { auto tensor_vec = tensors.vec(); - return process_group->broadcast( + auto work = process_group->broadcast( tensor_vec, BroadcastOptions{ root_rank, root_tensor, std::chrono::milliseconds(timeout)}); + + return std:: + tuple, c10::intrusive_ptr>( + std::move(tensor_vec), work); } std::tuple, c10::intrusive_ptr> @@ -212,7 +217,9 @@ c10::intrusive_ptr broadcast( const BroadcastOptions& opts) { static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("c10d::broadcast_", "") - .typed( + .typed, + c10::intrusive_ptr>( at::TensorList, const c10::intrusive_ptr<::c10d::ProcessGroup>&, int64_t, @@ -221,12 +228,12 @@ c10::intrusive_ptr broadcast( // It's awakward to unbox the opts here and box them again in the custom C++ // op. But it's also complicated to make opts as a CustomClassHolder. Leave it // as it is now. - return op.call( + return std::get<1>(op.call( tensors, process_group, opts.rootRank, opts.rootTensor, - opts.timeout.count()); + opts.timeout.count())); } c10::intrusive_ptr allreduce( From 89c4654ba9e3c552d3a6e0a56da8adf656cce469 Mon Sep 17 00:00:00 2001 From: Shen Li Date: Tue, 6 Sep 2022 21:51:34 +0000 Subject: [PATCH 09/45] Add scatter_ to CommTensor (#84606) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84606 Approved by: https://github.com/wanchaol --- test/distributed/test_c10d_common.py | 17 ++++++++++++++++- test/distributed/test_c10d_gloo.py | 9 +++++++++ test/distributed/test_c10d_nccl.py | 6 ++++++ torch/csrc/distributed/c10d/Ops.cpp | 17 ++++++++++++----- 4 files changed, 43 insertions(+), 6 deletions(-) diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 901fbea14017..951b601fd90b 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -1431,7 +1431,13 @@ class CommTensor(torch.Tensor): It is specifically tailored for allreduce_ at the moment. 
""" - _supported_comms = ["allreduce_", "allgather_", "broadcast_", "reduce_scatter_"] + _supported_comms = [ + "allreduce_", + "allgather_", + "broadcast_", + "reduce_scatter_", + "scatter_", + ] @staticmethod def __new__(cls, tensor: torch.Tensor): @@ -1691,6 +1697,15 @@ def comm_fn(tensor, group=None): self._test_work_wait(tensor, comm_fn=comm_fn) + def _test_scatter_work_wait(self, tensor): + def comm_fn(tensor, group=None): + in_tensors = [tensor + i for i in range(group.size())] if self.rank == 0 else None + out_tensor = torch.zeros_like(tensor) + work = dist.scatter(out_tensor, in_tensors, src=0, group=group, async_op=True) + return work, out_tensor + + self._test_work_wait(tensor, comm_fn=comm_fn) + if __name__ == "__main__": assert ( diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py index cd5c61c997fa..7163d366d4d8 100644 --- a/test/distributed/test_c10d_gloo.py +++ b/test/distributed/test_c10d_gloo.py @@ -2387,6 +2387,15 @@ def test_broadcast_work_wait_gpu(self): torch.ones(2, 2, device=self.rank) * self.rank ) + def test_scatter_work_wait_cpu(self): + self._test_scatter_work_wait(torch.ones(2, 2) * self.rank) + + @skip_if_lt_x_gpu(2) + def test_scatter_work_wait_gpu(self): + self._test_scatter_work_wait( + torch.ones(2, 2, device=self.rank) * self.rank + ) + if __name__ == "__main__": assert ( diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 61b03d52a6a7..9ec7ff6e1e6f 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -2847,6 +2847,12 @@ def test_broadcast_work_wait_gpu(self): torch.ones(2, 2, device=self.rank) * self.rank ) + @skip_if_lt_x_gpu(2) + def test_scatter_work_wait_gpu(self): + self._test_scatter_work_wait( + torch.ones(2, 2, device=self.rank) * self.rank + ) + if __name__ == "__main__": assert ( diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index 27e7ed7843d8..52d9d0ba2efb 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -110,16 +110,21 @@ c10::intrusive_ptr gather_( GatherOptions{root_rank, std::chrono::milliseconds(timeout)}); } -c10::intrusive_ptr scatter_( +std::tuple, c10::intrusive_ptr> +scatter_( const std::vector& output_tensors, const std::vector>& input_tensors, const c10::intrusive_ptr& process_group, int64_t root_rank, int64_t timeout) { - return process_group->scatter( + auto work = process_group->scatter( const_cast&>(output_tensors), const_cast>&>(input_tensors), ScatterOptions{root_rank, std::chrono::milliseconds(timeout)}); + + return std:: + tuple, c10::intrusive_ptr>( + output_tensors, work); } c10::intrusive_ptr alltoall_( @@ -348,18 +353,20 @@ c10::intrusive_ptr scatter( const ScatterOptions& opts) { static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("c10d::scatter_", "") - .typed( + .typed, + c10::intrusive_ptr>( const std::vector&, const std::vector>&, const c10::intrusive_ptr<::c10d::ProcessGroup>&, int64_t, int64_t)>(); - return op.call( + return std::get<1>(op.call( output_tensors, input_tensors, process_group, opts.rootRank, - opts.timeout.count()); + opts.timeout.count())); } c10::intrusive_ptr alltoall( From a47bc96fb7176d43752d3e376697971d4ba47317 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 6 Sep 2022 10:20:07 -0700 Subject: [PATCH 10/45] [composite compliance] fix linalg.eigvals (#84137) linalg.eigvals fails in some cases with functorch and the root of the problem is that it is not composite compliant. 
In particular, checks that branch on whether or not a Tensor requires grad do not work with functorch. In order to support functorch with them, we have to include an additional "if the tensor is a Tensor Subclass, then assume that it MAY require grad, so we must always go through the differentiable path". This PR also changes the batching rule for linalg.eigvals to be a decomposition instead of what it was previously. What it was previously was masking the error in functorch's test suite. Unfortunately we don't comprehensive tests for this on the functorch side which is why this was not caught before. I'll look into why that is in the future; it's a bit complicated. Test Plan: - wait for tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/84137 Approved by: https://github.com/Lezcano, https://github.com/IvanYashchuk, https://github.com/samdow --- aten/src/ATen/native/BatchLinearAlgebra.cpp | 24 +++++++++++-------- .../csrc/BatchRulesDecompositions.cpp | 1 + .../csrc/BatchRulesLinearAlgebra.cpp | 1 - test/test_proxy_tensor.py | 2 +- .../_internal/opinfo/definitions/linalg.py | 4 ---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 56c66171a961..7464e12fd7d3 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -1495,9 +1495,18 @@ void _linalg_check_errors( TORCH_INTERNAL_ASSERT(false); } -bool _requires_fw_or_bw_grad(const Tensor& input) { +// If an input requires fw or bw grad then we need to go down a different +// (slower) path to ensure that the gradients are computable. +// That is what `_may_require_fw_or_bw_grad` is helpful for. +// +// Why is there a isTensorSubclassLike check here? +// Without it, this function can lead to composite compliance problems, which +// may lead to bugs in functorch, where a Tensor Subclass that doesn't +// require grad may wrap a Tensor subclass that requires grad. +bool _may_require_fw_or_bw_grad(const Tensor& input) { return ((at::GradMode::is_enabled() && input.requires_grad()) - || input._fw_grad(/*level */ 0).defined()); + || input._fw_grad(/*level */ 0).defined() + || isTensorSubclassLike(input)); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg.inv ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2688,9 +2697,8 @@ std::tuple linalg_eigh_out(const Tensor& A, c10::string_view u Tensor linalg_eigvalsh(const Tensor& A, c10::string_view uplo) { - // See [Note: svdvals_compute_uv] for the condition in compute_v return std::get<0>(at::_linalg_eigh(A, uplo, - /*comptue_v=*/_requires_fw_or_bw_grad(A) || isTensorSubclassLike(A))); + /*compute_v=*/_may_require_fw_or_bw_grad(A))); } Tensor& linalg_eigvalsh_out(const Tensor& A, c10::string_view uplo, Tensor& L) { @@ -3148,7 +3156,7 @@ Tensor& linalg_eigvals_out(const Tensor& input, Tensor& values) { Tensor linalg_eigvals(const Tensor& input) { // if input requires grad we must compute the eigenvectors to make this function differentiable // the eigenvectors are not exposed to the user - if (_requires_fw_or_bw_grad(input)) { + if (_may_require_fw_or_bw_grad(input)) { return std::get<0>(at::linalg_eig(input)); } @@ -3318,12 +3326,8 @@ Tensor& linalg_svdvals_out(const Tensor& A, c10::optional driv } Tensor linalg_svdvals(const Tensor& A, c10::optional driver) { - // [Note: svdvals_compute_uv] - // NB: Why do we need isTensorSubclassLike check for linalg_svdvals but not linalg_eigvals? 
- // svdvals is decomposed at the vmap level in functorch so A can be a BatchedTensor wrapping - // a TensorWrapper requiring fw or bw grad. return std::get<1>(at::_linalg_svd(A, /*full_matrices=*/false, - /*comptue_uv=*/_requires_fw_or_bw_grad(A) || isTensorSubclassLike(A), + /*compute_uv=*/_may_require_fw_or_bw_grad(A), /*driver=*/driver)); } diff --git a/functorch/functorch/csrc/BatchRulesDecompositions.cpp b/functorch/functorch/csrc/BatchRulesDecompositions.cpp index 06e559986a9d..41bb842cd412 100644 --- a/functorch/functorch/csrc/BatchRulesDecompositions.cpp +++ b/functorch/functorch/csrc/BatchRulesDecompositions.cpp @@ -140,6 +140,7 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) { OP_DECOMPOSE(linalg_cholesky); OP_DECOMPOSE(linalg_det); OP_DECOMPOSE(linalg_eigvalsh); + OP_DECOMPOSE(linalg_eigvals); OP_DECOMPOSE(linalg_inv); OP_DECOMPOSE(linalg_matmul); OP_DECOMPOSE(linalg_matrix_norm); diff --git a/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp b/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp index 80e9e30d82ec..08695e88da61 100644 --- a/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp +++ b/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp @@ -459,7 +459,6 @@ LINALG_CHECK_MATRIX_UNARY_ONE_OUT(cholesky, cholesky); LINALG_CHECK_MATRIX_UNARY_ONE_OUT(cholesky_inverse, cholesky_inverse); LINALG_CHECK_MATRIX_UNARY_TWO_OUT(linalg_cholesky_ex, linalg.cholesky); LINALG_CHECK_MATRIX_UNARY_TWO_OUT(linalg_eig, linalg.eig); -LINALG_CHECK_MATRIX_UNARY_ONE_OUT(linalg_eigvals, linalg.eigvals); LINALG_CHECK_MATRIX_UNARY_TWO_OUT(linalg_inv_ex, linalg.inv_ex); LINALG_CHECK_MATRIX_UNARY_THREE_OUT(linalg_ldl_factor_ex, torch.linalg.ldl_factor_ex); LINALG_CHECK_MATRIX_UNARY_ONE_OUT(linalg_matrix_power, linalg.matrix_power); diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py index 095ff1ab6a0e..5f889cff5367 100644 --- a/test/test_proxy_tensor.py +++ b/test/test_proxy_tensor.py @@ -865,7 +865,6 @@ def f(a, b): # unknown xfail('allclose'), xfail('equal'), - xfail('linalg.eigvals'), # empty skip('new_empty'), skip('empty_like'), @@ -914,6 +913,7 @@ def f(a, b): # Needs complex-value support xfail('polar'), xfail('linalg.eig'), + xfail('linalg.eigvals'), xfail('__getitem__', ''), # aten.size.default - couldn't find symbolic meta function/decomposition xfail('__rmatmul__', ''), # aten.new_empty.default - couldn't find symbolic meta function/decomposition xfail('_masked.amax', ''), # aten._to_copy.default - couldn't find symbolic meta function/decomposition diff --git a/torch/testing/_internal/opinfo/definitions/linalg.py b/torch/testing/_internal/opinfo/definitions/linalg.py index 443693bdb43a..8f25a4c6ea33 100644 --- a/torch/testing/_internal/opinfo/definitions/linalg.py +++ b/torch/testing/_internal/opinfo/definitions/linalg.py @@ -1437,10 +1437,6 @@ def make_input(): supports_fwgrad_bwgrad=True, decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack], skips=( - # Pre-existing condition; Needs to be fixed - DecorateInfo( - unittest.expectedFailure, "TestCompositeCompliance", "test_operator" - ), # exits early on eager extremal value test DecorateInfo( unittest.skip("Skipped!"), From e96fb5d58c2accd717f0859b510ae7facb6d6aac Mon Sep 17 00:00:00 2001 From: Rodrigo Kumpera Date: Wed, 7 Sep 2022 14:49:45 +0000 Subject: [PATCH 11/45] [c10d] Fix docstring of scatter_object_list (#84596) The docstring for scatter_object_list mentions is doesn't work with NCCL, but this was fixed in #79034 Pull Request resolved: https://github.com/pytorch/pytorch/pull/84596 Approved by: 
https://github.com/H-Huang --- torch/distributed/distributed_c10d.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 3b6c7421646a..cd9c8ee9bc3e 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -2000,9 +2000,6 @@ def scatter_object_list( since it does not provide an ``async_op`` handle and thus will be a blocking call. - .. note:: Note that this API does not support the NCCL backend, as the - tensor-based scatter collective is not supported by ProcessGroupNCCL. - .. warning:: :func:`scatter_object_list` uses ``pickle`` module implicitly, which is known to be insecure. It is possible to construct malicious pickle From 3eb16509c761c41f50163d404428246ea117c7fd Mon Sep 17 00:00:00 2001 From: nikitaved Date: Wed, 7 Sep 2022 15:29:44 +0000 Subject: [PATCH 12/45] optimize householder product backward to be more memory-efficient (#84627) A follow-up on discussions in https://github.com/pytorch/pytorch/pull/84180. Makes backward more memory efficient with the lesser number of kernel calls. Pull Request resolved: https://github.com/pytorch/pytorch/pull/84627 Approved by: https://github.com/kshitij12345, https://github.com/zou3519 --- torch/csrc/autograd/FunctionsManual.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 091dc1417a71..7ad92e83f08e 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -5163,7 +5163,7 @@ std::tuple householder_product_backward( auto v_grad = (-t_unsqueezed * vHK).conj().squeeze(-2) - (t_unsqueezed * Kv).squeeze(-1); auto tau_grad = -(vHK.narrow(-1, k, m - k).matmul(v)).conj(); - return std::make_tuple(v_grad, tau_grad.squeeze(-1)); + return std::make_tuple(v_grad.unsqueeze(-1), tau_grad.squeeze(-1)); }; auto apply_householder_reflector = [m, modify_K_in_place]( @@ -5205,8 +5205,8 @@ std::tuple householder_product_backward( Tensor v_i_grad, tau_i_grad; std::tie(v_i_grad, tau_i_grad) = update_grad(i, v_i, t_i, K); - input_grads.push_back(v_i_grad.squeeze(-1)); - tau_grads.push_back(tau_i_grad.squeeze(-1)); + input_grads.push_back(v_i_grad); + tau_grads.push_back(tau_i_grad); // K <- H_{i + 1}^{-1} @ K @ H_i if (i < k - 1) { @@ -5218,18 +5218,18 @@ std::tuple householder_product_backward( } } - input_grad = at::stack(input_grads, -1); - tau_grad = at::stack(tau_grads, -1); - // Only first k columns are active in forward. // zero gradients for the inactive input. 
if (k < input.size(-1)) { - auto input_sizes = input_.sizes(); - at::DimVector new_sizes(input_sizes); - new_sizes[input_.dim() - 1] = input.size(-1) - k; - auto zeros = at::zeros(new_sizes, input_.options()); - input_grad = at::cat({input_grad, zeros}, -1); + auto zero_grad_shape = + at::DimVector(input_.sizes().slice(0, input_.dim() - 1)); + zero_grad_shape.push_back(input.size(-1) - k); + auto zero_grad = at::zeros(zero_grad_shape, input_.options()); + input_grads.push_back(zero_grad); } + + input_grad = at::cat(input_grads, -1); + tau_grad = at::cat(tau_grads, -1); } else { input_grad = at::zeros_like(input_); tau_grad = at::zeros_like(tau); From e31ad1c2d3a08a6421cd7a8adcd7b3f66727305a Mon Sep 17 00:00:00 2001 From: soulitzer Date: Tue, 6 Sep 2022 13:23:03 -0400 Subject: [PATCH 13/45] [reland] Move decompositions and helpers for jvp from functorch into core (#84581) Reland of https://github.com/pytorch/pytorch/pull/84358 Pull Request resolved: https://github.com/pytorch/pytorch/pull/84581 Approved by: https://github.com/samdow --- functorch/functorch/_src/compilers.py | 2 +- functorch/functorch/_src/eager_transforms.py | 45 +-------- functorch/functorch/compile/__init__.py | 1 - functorch/test/test_pythonkey.py | 4 +- .../_decomp/decompositions_for_jvp.py | 97 ++++++++++++++++--- 5 files changed, 88 insertions(+), 61 deletions(-) rename functorch/functorch/_src/decompositions.py => torch/_decomp/decompositions_for_jvp.py (64%) diff --git a/functorch/functorch/_src/compilers.py b/functorch/functorch/_src/compilers.py index da0947b51895..3dd84554da10 100644 --- a/functorch/functorch/_src/compilers.py +++ b/functorch/functorch/_src/compilers.py @@ -10,10 +10,10 @@ import torch import torch.fx as fx import torch.nn as nn +from torch._decomp import get_decompositions from .aot_autograd import aot_function, aot_module, make_boxed_compiler from .compile_utils import strip_overloads -from .decompositions import get_decompositions from .partitioners import ( default_partition, draw_graph, diff --git a/functorch/functorch/_src/eager_transforms.py b/functorch/functorch/_src/eager_transforms.py index 6cfa66d1c9b7..bc6d2e2ffd81 100644 --- a/functorch/functorch/_src/eager_transforms.py +++ b/functorch/functorch/_src/eager_transforms.py @@ -6,7 +6,6 @@ from typing import Callable, Union, Tuple, List, Any import torch -import inspect from functools import partial, wraps import contextlib from torch.utils._pytree import tree_flatten, tree_unflatten, tree_map @@ -14,8 +13,8 @@ import torch.autograd.forward_ad as fwAD from .vmap import vmap -from .decompositions import decomposition_table, decomposition_table_for_jvp - +from torch._decomp import decomposition_table +import torch._decomp.decompositions_for_jvp from functorch._C import ( _wrap_for_grad, @@ -1439,37 +1438,6 @@ def wrapped(*args, **kwargs): _func_decrement_nesting() return wrapped - -def _register_jit_decomposition(decomp, use_python=False): - if decomp in decomposition_table_for_jvp: - decomposition_table_used = decomposition_table_for_jvp - elif decomp in decomposition_table: - decomposition_table_used = decomposition_table - else: - raise RuntimeError(f"could not find decomposition for {decomp}") - decomp_fn = decomposition_table_used[decomp] - if use_python: - decomp_fn = torch.jit.ignore(decomp_fn) - sig = inspect.signature(decomp_fn) - - # Create a string wrapping the function from the signature - # example output: - # def wrapped_decomp(x: torch.Tensor, y: int, z: int): - # return decomp_fn(x, y, z) - # Thanks copilot! 
- def get_function_def(sig): - param_def = [f"{param_str}" for param_str in sig.parameters.values()] - param_use = [f"{param_str}" for param_str in sig.parameters.keys()] - - return f"def wrapped_decomp({', '.join(param_def)}):\n return decomp_fn({', '.join(param_use)})\n" - - f_str = get_function_def(sig) - graph = torch.jit.CompilationUnit(f_str).wrapped_decomp.graph - else: - graph = torch.jit.script(decomp_fn).graph - torch.jit._register_decomposition(decomp, graph) - - # use an alternate way to register an operator into the decomposition table # _register_jit_decomposition doesn't work for some operators, e.g. addr, # because the Tensor types generated cannot be unioned by torchscript @@ -1484,14 +1452,5 @@ def _register_python_decomposition_vmap(decomp): raise RuntimeError(f"could not find decomposition for {decomp}") -_register_jit_decomposition(torch.ops.aten.trace.default, use_python=True) -_register_jit_decomposition(torch.ops.aten.nll_loss_backward.default) -_register_jit_decomposition(torch.ops.aten.nll_loss2d_backward.default) -_register_jit_decomposition(torch.ops.aten._log_softmax_backward_data.default) -_register_jit_decomposition(torch.ops.aten._softmax_backward_data.default) -_register_jit_decomposition(torch.ops.aten.log_sigmoid_forward.default) -_register_jit_decomposition(torch.ops.aten.native_layer_norm_backward.default) -_register_jit_decomposition(torch.ops.aten.native_batch_norm_backward.default) -_register_jit_decomposition(torch.ops.aten.cudnn_batch_norm_backward.default) _register_python_decomposition_vmap(torch.ops.aten.mse_loss_backward.default) _register_python_decomposition_vmap(torch.ops.aten.addr.default) diff --git a/functorch/functorch/compile/__init__.py b/functorch/functorch/compile/__init__.py index 99e0456a4e4e..b489eb40d91e 100644 --- a/functorch/functorch/compile/__init__.py +++ b/functorch/functorch/compile/__init__.py @@ -1,5 +1,4 @@ from .._src.python_key import pythonkey_decompose -from .._src.decompositions import register_decomposition, decomposition_table, get_decompositions from .._src.fx_minifier import minifier from .._src.aot_autograd import ( aot_function, diff --git a/functorch/test/test_pythonkey.py b/functorch/test/test_pythonkey.py index b2bd74348f7b..9f0f2fdc556c 100644 --- a/functorch/test/test_pythonkey.py +++ b/functorch/test/test_pythonkey.py @@ -24,10 +24,10 @@ from functorch.compile import ( nnc_jit, compiled_function, compiled_module, min_cut_rematerialization_partition, aot_function, aot_module, - decomposition_table, nop, - num_of_recompilations, default_partition, default_decompositions, + nop, num_of_recompilations, default_partition, default_decompositions, memory_efficient_fusion, clear_compile_cache, get_aot_compilation_context ) +from torch._decomp import decomposition_table from torch.testing._internal.common_device_type import ops from functorch_additional_op_db import additional_op_db diff --git a/functorch/functorch/_src/decompositions.py b/torch/_decomp/decompositions_for_jvp.py similarity index 64% rename from functorch/functorch/_src/decompositions.py rename to torch/_decomp/decompositions_for_jvp.py index 3780d09db20d..b8c541966bc2 100644 --- a/functorch/functorch/_src/decompositions.py +++ b/torch/_decomp/decompositions_for_jvp.py @@ -1,17 +1,31 @@ +import inspect +from typing import Callable, Dict, List, Optional, Tuple + import torch -from torch import Tensor import torch._decomp -from typing import Tuple, List, Optional - -aten = torch.ops.aten +from torch import Tensor decomposition_table = 
torch._decomp.decomposition_table +decomposition_table_for_jvp: Dict[torch._ops.OpOverload, Callable] = {} register_decomposition = torch._decomp.register_decomposition -get_decompositions = torch._decomp.get_decompositions +aten = torch.ops.aten -# Decompositions have been ported to torch._decomp inside of PyTorch core. -# The only decompositions here are temporary or hacks. -# Please submit your contributions to PyTorch core! +# NOTE: [forward-mode AD decompositions hack] +# +# The mechanism is in VariableType, +# IF any inputs have forward grad +# AND there is no forward AD formula implemented +# AND the functions is actually differentiable +# run the decomposition +# See run_jit_decomposition_with_args_for_jvp +# We currently use python decompositions that we torchscript. +# +# Note that we would be building the backward graph at the decomposed level +# too, but that is OK, because we would've errored out otherwise anyway. +# +# TODO: what if jit decompositions exists, should we just use it? +# or do we want to have an explicit white list like functorch had +# using special JVP_DECOMP DynamicLayerFront kernel def maybe_register_decomposition(op): @@ -20,6 +34,7 @@ def decorator(f): return register_decomposition(op)(f) except Exception: return f + return decorator @@ -33,6 +48,39 @@ def register_decomposition_for_jvp(fn): return register_decomposition(fn, registry=decomposition_table_for_jvp) +def _register_jit_decomposition_for_jvp(decomp, use_python=False): + if decomp in decomposition_table_for_jvp: + decomposition_table_used = decomposition_table_for_jvp + elif decomp in decomposition_table: + decomposition_table_used = decomposition_table + else: + raise RuntimeError(f"could not find decomposition for {decomp}") + decomp_fn = decomposition_table_used[decomp] + if use_python: + decomp_fn = torch.jit.ignore(decomp_fn) + sig = inspect.signature(decomp_fn) + + # Create a string wrapping the function from the signature + # example output: + # def wrapped_decomp(x: torch.Tensor, y: int, z: int): + # return decomp_fn(x, y, z) + # Thanks copilot! + def get_function_def(sig): + param_def = [f"{param_str}" for param_str in sig.parameters.values()] + param_use = [f"{param_str}" for param_str in sig.parameters.keys()] + + return f"def wrapped_decomp({', '.join(param_def)}):\n return decomp_fn({', '.join(param_use)})\n" + + f_str = get_function_def(sig) + graph = torch.jit.CompilationUnit(f_str).wrapped_decomp.graph + else: + graph = torch.jit.script(decomp_fn).graph + torch.jit._register_decomposition(decomp, graph) + + +# The only decompositions here are temporary or hacks for the purposes of jvp + +# TODO: do these also belong here? @maybe_register_decomposition(aten.trace.default) def trace(self: Tensor) -> Tensor: return torch.sum(torch.diag(self)) @@ -49,7 +97,9 @@ def log_sigmoid_forward(self: Tensor) -> Tuple[Tensor, Tensor]: return min - torch.log1p(z), buffer -def recompute_mean_var(input: Tensor, rstd: Tensor, inner_dim_indices: List[int], keepdim: bool): +def recompute_mean_var( + input: Tensor, rstd: Tensor, inner_dim_indices: List[int], keepdim: bool +): # for most norm decompositions, it will be the same as the core version except for here. 
# We recompute the mean and variance so that they track gradients through input @@ -145,7 +195,7 @@ def prod(x: List[int]): return r -@register_decomposition_for_jvp(aten.native_batch_norm_backward) # @register_decomposition_for_jvp after in core +@register_decomposition_for_jvp(aten.native_batch_norm_backward) def native_batch_norm_backward( grad_out: Tensor, input: Tensor, @@ -163,11 +213,13 @@ def native_batch_norm_backward( assert input_rank >= 2, "rank of the input must be at least 2" axis = 1 - num_features = prod(input_shape) / input_shape[axis] + num_features = prod(input_shape) / input_shape[axis] # type: ignore[arg-type] mean = save_mean invstd = save_invstd if train: - assert save_mean is not None and save_invstd is not None, "when train=True, save_mean and save_invstd are required" + assert ( + save_mean is not None and save_invstd is not None + ), "when train=True, save_mean and save_invstd are required" reduciton_dims = [0] + list(range(2, input.dim())) assert invstd is not None # for typing @@ -177,6 +229,8 @@ def native_batch_norm_backward( mean = running_mean invstd = torch.rsqrt(running_var + eps) + assert invstd is not None and mean is not None + broadcast_mask = [1] * input_rank broadcast_mask[axis] = input_shape[axis] @@ -207,13 +261,28 @@ def native_batch_norm_backward( if output_mask[1]: grad_weight = dot_p * invstd elif weight is not None: - grad_weight = torch.zeros_like(weight) # should be None but doesn't work with vjp + grad_weight = torch.zeros_like( + weight + ) # should be None but doesn't work with vjp else: grad_weight = torch.zeros(()) # should be None but doesn't work with vjp if output_mask[2]: grad_bias = grad_output_sum else: - grad_bias = torch.zeros_like(grad_output_sum) # should be None but doesn't work with vjp + grad_bias = torch.zeros_like( + grad_output_sum + ) # should be None but doesn't work with vjp return (grad_input, grad_weight, grad_bias) + + +_register_jit_decomposition_for_jvp(torch.ops.aten.trace.default, use_python=True) +_register_jit_decomposition_for_jvp(torch.ops.aten.nll_loss_backward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.nll_loss2d_backward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten._log_softmax_backward_data.default) +_register_jit_decomposition_for_jvp(torch.ops.aten._softmax_backward_data.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.log_sigmoid_forward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.native_layer_norm_backward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.native_batch_norm_backward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.cudnn_batch_norm_backward.default) From 42d99e6f196233627a28b8e9efb26a0a166fa370 Mon Sep 17 00:00:00 2001 From: soulitzer Date: Tue, 6 Sep 2022 21:37:03 -0400 Subject: [PATCH 14/45] Call jit decomposition in VariableType to increase forward AD coverage (#84151) This PR: - updates forward AD codegen in core to generate code that tries calling into decompositions registered to jit when - (1) the function is not in-place or out variant - AND (2) the function is differentiable (requires_derivative=True) - AND (3) there are no forward AD formulas registered - To simplify things we always generate the if/else (as long as (1) is true), but generate 'false' when either (2) or (3) is false.
- removes the mechanism from functorch - (follow up) some functorch tests should be updated here so they no longer have to compute the Jacobian with vjp - factors out some logic to generate the any_has_forward_grad condition - (bc-breaking) when TensorList inputs unexpectedly have forward grad, the error will no longer contain the name See https://github.com/pytorch/pytorch/pull/84151#issuecomment-1238519247 for codegen output and more discussion. Pull Request resolved: https://github.com/pytorch/pytorch/pull/84151 Approved by: https://github.com/samdow, https://github.com/albanD, https://github.com/zou3519 --- functorch/functorch/csrc/BatchRulesHelper.cpp | 14 -- functorch/functorch/csrc/BatchRulesHelper.h | 6 - functorch/functorch/csrc/BatchRulesViews.cpp | 3 +- functorch/functorch/csrc/DynamicLayer.cpp | 72 +----- functorch/test/test_ops.py | 40 +-- tools/autograd/derivatives.yaml | 15 +- tools/autograd/gen_variable_type.py | 232 ++++++++++++------ tools/autograd/templates/VariableType.cpp | 1 + .../autograd/VariableTypeUtilsDependOnOps.h | 40 +++ torch/csrc/autograd/functions/utils.h | 18 ++ .../jit/runtime/decomposition_registry.cpp | 20 ++ .../csrc/jit/runtime/decomposition_registry.h | 6 + .../_internal/common_methods_invocations.py | 17 ++ .../_internal/opinfo/definitions/_masked.py | 3 + 14 files changed, 277 insertions(+), 210 deletions(-) create mode 100644 torch/csrc/autograd/VariableTypeUtilsDependOnOps.h diff --git a/functorch/functorch/csrc/BatchRulesHelper.cpp b/functorch/functorch/csrc/BatchRulesHelper.cpp index dfd690ac2168..d49ecd5e8737 100644 --- a/functorch/functorch/csrc/BatchRulesHelper.cpp +++ b/functorch/functorch/csrc/BatchRulesHelper.cpp @@ -133,20 +133,6 @@ void vmapIncompatibleInplaceError(const char* schema_name) { "please file a bug report instead."); } -void run_jit_decomposition(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - const auto& schema = op.schema(); - // TODO: templatize based on op and keep static trace_exec - auto * trace_exec = torch::jit::GetDecompositionExecutor(schema); - trace_exec->run((*stack)); - if (stack->back().isTuple()) { - IValue tup = stack->back(); - stack->pop_back(); - for (const auto& elem: tup.toTuple()->elements()) { - stack->push_back(elem); - } - } -} - static void handleScalarTypePromotion(Tensor& logical_scalar_tensor, Tensor& second) { auto result_type = at::native::result_type(logical_scalar_tensor[0], second); if (logical_scalar_tensor.scalar_type() != result_type) { diff --git a/functorch/functorch/csrc/BatchRulesHelper.h b/functorch/functorch/csrc/BatchRulesHelper.h index 552a38b20e20..329d0db42b50 100644 --- a/functorch/functorch/csrc/BatchRulesHelper.h +++ b/functorch/functorch/csrc/BatchRulesHelper.h @@ -195,12 +195,6 @@ inline void handle_variadic_bdims(std::vector>()); -void run_jit_decomposition(const c10::OperatorHandle& op, torch::jit::Stack* stack); - -#define RUN_JIT_DECOMPOSITION(op) \ - m.impl(#op, torch::CppFunction::makeFromBoxedFunction<&run_jit_decomposition>()); - - using UnpackedBatchedTensor = std::tuple>; inline void find_and_unpack_tensors( diff --git a/functorch/functorch/csrc/BatchRulesViews.cpp b/functorch/functorch/csrc/BatchRulesViews.cpp index 9c382cbaf207..df42086acef3 100644 --- a/functorch/functorch/csrc/BatchRulesViews.cpp +++ b/functorch/functorch/csrc/BatchRulesViews.cpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace at { namespace functorch { @@ -524,7 +525,7 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) { VMAP_SUPPORT(chunk, chunk_batching_rule); 
m.impl("flatten.using_ints", static_cast(native::flatten)); VMAP_SUPPORT(flip, flip_batch_rule); - RUN_JIT_DECOMPOSITION(trace) + m.impl("trace", torch::CppFunction::makeFromBoxedFunction<&torch::jit::run_jit_decomposition>()); VMAP_SUPPORT(tril, VARIADIC_BDIMS_BATCH_RULE(ATEN_FN(tril))); VMAP_SUPPORT(triu, VARIADIC_BDIMS_BATCH_RULE(ATEN_FN(triu))); VMAP_SUPPORT(repeat, repeat_batch_rule); diff --git a/functorch/functorch/csrc/DynamicLayer.cpp b/functorch/functorch/csrc/DynamicLayer.cpp index 08cd4d7a7d6b..c83edf327b2c 100644 --- a/functorch/functorch/csrc/DynamicLayer.cpp +++ b/functorch/functorch/csrc/DynamicLayer.cpp @@ -389,43 +389,9 @@ WithoutTop::~WithoutTop() { pushDynamicLayer(std::move(layer_)); } -// NOTE: [forward-mode AD decompositions hack] -// -// The mechanism is: in DynamicLayerFrontMode, IF we are dispatching on the -// jvp transform, AND we have a decomposition for the operation, then run -// the decomposition. -// -// Let's break that down. There are a douple of moving pieces. -// -// 0. How do we know what transform we're dispatching on? -// Easy, check the top of the DynamicLayerStack and read the transform. -// -// 1. Next, we must identify when an operation (e.g. nll_loss_backward) -// gets dispatched to. -// - register a special kernel to the DynamicLayerFrontMode key -// (see JVP_DECOMP) -// - that special kernel invokes dynamicLayerFrontFallbackOperator with -// an arg indicating we're going to use a decomp -// -// 2. Next, we need to call the decomposition. See call_decomposition_for_jvp. -// We currently use python decompositions that we torchscript. - -// Ideally c10::OperatorHandle would have a field like this -// to identify the operator. -// The stuff here should map 1:1 with the operator name. -// aten::nll_loss_backward -> nll_loss_backward -// aten::add.Tensor -> add_Tensor - -static void call_decomposition_for_jvp( +static void dynamicLayerFrontFallback( const c10::OperatorHandle& op, torch::jit::Stack* stack) { - run_jit_decomposition(op, stack); -} - -static void dynamicLayerFrontFallbackOperator( - const c10::OperatorHandle& op, - torch::jit::Stack* stack, - bool decomp_jvp) { auto& dynamicLayerStack = dynamicLayerStackAccessor(); TORCH_INTERNAL_ASSERT(dynamicLayerStack.size() > 0); #ifdef HAS_TORCH_SHOW_DISPATCH_TRACE @@ -434,13 +400,6 @@ static void dynamicLayerFrontFallbackOperator( dump_local_tls(); } #endif - - // Hack: if jvp and we have a decomposition registered, then do the decomposition - if (dynamicLayerStack.back().interpreter().key() == TransformType::Jvp && - decomp_jvp) { - return call_decomposition_for_jvp(op, stack); - } - // Save the current LocalDispatchKeySet (to the current DynamicLayer). // Upon exiting the current scope, that LocalDispatchKeySet gets restored. 
// When the current DynamicLayer dispatches to the next (inner) DynamicLayer, @@ -460,16 +419,6 @@ restoreLocalDispatchKeySetRAII(const c10::impl::LocalDispatchKeySet& key_set) { return c10::impl::ForceDispatchKeyGuard(key_set); } -void dynamicLayerFrontFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - return dynamicLayerFrontFallbackOperator(op, stack, false); -} - -void dynamicLayerFrontFallBackWithDecomp( - const c10::OperatorHandle& op, - torch::jit::Stack* stack) { - return dynamicLayerFrontFallbackOperator(op, stack, true); -} - void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { auto& layer = dynamicLayerStackAccessor().back(); auto restore_guard = restoreLocalDispatchKeySetRAII(layer.interpreter().getSavedLocalDispatchKeySet()); @@ -486,24 +435,5 @@ TORCH_LIBRARY_IMPL(_, FT_DYNAMIC_LAYER_BACK_MODE_KEY, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&dynamicLayerBackFallback>()); } -#define JVP_DECOMP(op) \ - m.impl(#op, torch::CppFunction::makeFromBoxedFunction<&dynamicLayerFrontFallBackWithDecomp>()); - -#define JVP_DECOMP2(op, overload) \ - m.impl(#op "." #overload, torch::CppFunction::makeFromBoxedFunction<&dynamicLayerFrontFallBackWithDecomp>()); - -TORCH_LIBRARY_IMPL(aten, FT_DYNAMIC_LAYER_FRONT_MODE_KEY, m) { - JVP_DECOMP(nll_loss_backward); - JVP_DECOMP(nll_loss2d_backward); - JVP_DECOMP(_log_softmax_backward_data); - JVP_DECOMP(_softmax_backward_data); - OP_DECOMPOSE(log_sigmoid); - JVP_DECOMP(log_sigmoid_forward); - JVP_DECOMP(native_layer_norm_backward); - JVP_DECOMP(native_batch_norm_backward); - JVP_DECOMP(cudnn_batch_norm_backward); -} - - } } // namespace at diff --git a/functorch/test/test_ops.py b/functorch/test/test_ops.py index 8d69fe7e22b5..218ba47b46ed 100644 --- a/functorch/test/test_ops.py +++ b/functorch/test/test_ops.py @@ -1047,9 +1047,6 @@ def get_vjp(cotangents, *primals): # RuntimeError: Trying to set a forward gradient that has a different size than that of the original Tensor, # this is not supported. Tensor is of size [5, 2, 3] while the given forward gradient is of size [1, 2, 3]. xfail('normal', ''), - xfail('_masked.log_softmax', ''), # NYI: forward-AD for _log_softmax_backward_data - xfail('_masked.softmax', ''), # NYI: forward-AD for _softmax_backward_data - xfail('_masked.softmin', ''), # NYI: forward-AD for _softmax_backward_data xfail('cdist', ''), # NYI: forward-AD for _cdist_forward xfail('cholesky', ''), # NYI: forward-AD for cholesky xfail('eig', ''), # NYI: forward-AD for eig @@ -1058,10 +1055,7 @@ def get_vjp(cotangents, *primals): xfail('nn.functional.grid_sample', ''), # NYI: forward AD for grid_sampler_2d xfail('nn.functional.hardsigmoid', ''), # NYI: forward AD for hardsigmoid_backward xfail('nn.functional.huber_loss', ''), # NYI: forward AD for huber_loss_backward - xfail('nn.functional.instance_norm', ''), # NYI: forward AD for native_batch_norm_backward xfail('nn.functional.logsigmoid', ''), # not differentiable w.r.t. 
buffer - xfail('nn.functional.softmin', ''), # NYI: forward-AD for _softmax_backward_data - xfail('nn.functional.softmin', 'with_dtype'), # NYI: forward-AD for _softmax_backward_data xfail('renorm', ''), # NYI: forward AD for renorm xfail('symeig', ''), # NYI: forward AD for symeig xfail('nn.functional.multilabel_margin_loss', ''), # NYI: multilabel_margin_loss_forward @@ -1075,7 +1069,6 @@ def get_vjp(cotangents, *primals): xfail('scatter_reduce', 'mean'), # NYI: forward-AD for scatter_reduce xfail('scatter_reduce', 'prod'), # NYI: forward-AD for scatter_reduce skip('linalg.householder_product', '', device_type='cuda'), # flaky, I'm not sure why - xfail('native_layer_norm', ''), # NYI: forward-AD for native_layer_norm_backward xfail('sparse.sampled_addmm', ''), # Sparse tensors have no strides skip('as_strided_scatter', ''), # seems flaky xfail('segment_reduce', 'offsets'), # NYI: forward-AD for segment_reduce @@ -1136,37 +1129,8 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents): expected = (tree_unflatten(primals_out, spec), tree_unflatten(tangents_out, spec)) return expected - # HACK: obviously pytorch should also have the same coverage - # For things that do have the same coverage, we test that jvp x vjp - # are the same between PyTorch and functorch. For things that don't, - # we check that jacfwd(vjp) and jacrev(vjp) are the same. This results - # in slower tests. - FUNCTORCH_HAS_FORMULA_BUT_NOT_PYTORCH = { - 'nn.functional.nll_loss', - 'softmax', - 'log_softmax', - 'nn.functional.cross_entropy', - 'nn.functional.layer_norm', - 'nn.functional.batch_norm', - } - if op.name in FUNCTORCH_HAS_FORMULA_BUT_NOT_PYTORCH: - self.assertFalse(op.supports_fwgrad_bwgrad, - f"{op.name} now supports forward over reverse without a decomposition. " + - "Please remove the decomposition version") - - def is_differentiable(t): - return isinstance(t, torch.Tensor) and t.dtype == torch.float32 - args = (cotangents, *primals) - if op.name == 'nn.functional.binary_cross_entropy': - argnums = (0, 1) # targets is float32 but isn't differentiable - atol_rtol = 1.5e-4, 1.3e-06 - else: - argnums = tuple(i for i in range(len(args)) if is_differentiable(args[i])) - atol_rtol = None - self._compare_jacobians_of_vjp(fn, args, argnums, atol_rtol) - else: - expected = reference(primals, cotangents, primals_tangents, cotangents_tangents) - self.assertEqual(result, expected) + expected = reference(primals, cotangents, primals_tangents, cotangents_tangents) + self.assertEqual(result, expected) def _make_extremal_inputs(self, shape, device): if shape is None: diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 5a8bf46319f0..c5b1ec04fd87 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -1956,7 +1956,20 @@ - name: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) self: log_sigmoid_backward(grad, self, buffer) - output: auto_element_wise + # HACK: This is just auto_element_wise followed by a view_as. The reason we have + # this is bc forward AD was complaining here about the shapes not being the same: + # the primal/tangent are 0-D/1-D respectively. This started happening after moving the + # jvp decomposition mechanism from functorch to core, possibly due to a batching rule. + # In functorch we rely on OP_DECOMPOSE, but now we compute forward AD using an actual + # formula. 
+ # + # We'd like to avoid keeping the entire jvp decomposition mechanism in functorch, + # just for this single decomposition, but also want to avoid any cases from regressing: + # e.g. test_vmapjvpall_nn_functional_logsigmoid_cuda_float32 (passes on cpu, fails on CUDA). + # + # We should either figure out what is going on with vmap or perhaps fwd AD could + # be more tolerant about 0-dim vs 1-dim tensors + output: log_sigmoid_backward(self_t.conj(), self_p, buffer).conj().view_as(self_p) - name: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor self: _log_softmax_backward_data(grad, result, dim, self.scalar_type()) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index f9afe838203d..35987ca24266 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -31,6 +31,7 @@ from torchgen.api.autograd import ( DifferentiableInput, dispatch_strategy, + ForwardDerivative, gen_differentiable_outputs, is_differentiable, NativeFunctionWithDifferentiabilityInfo, @@ -597,8 +598,14 @@ DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES = CodeTemplate( """\ auto ${tmp_var} = ([&]() { - ${guard} - return ${base_type_call}; + if (${try_jit_decomposition_bool} && ${any_has_forward_grad}) { + static c10::OperatorName full_name("aten::${op_name}", "${op_overload}"); + static c10::optional opt_op = c10::Dispatcher::singleton().findSchema(full_name); + return impl::run_jit_decomposition_with_args_for_jvp<${returns_and_args}>("${op_name}", *opt_op, ks, ${arg_names}); + } else { + ${guard} + return ${base_type_call}; + } })(); """ ) @@ -642,6 +649,12 @@ """ ) +FW_DERIVATIVE_TENSORLIST_CHECK_TEMPLATE = CodeTemplate( + """\ +isFwGradDefinedTensorList(${req_inp})\ +""" +) + FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE = CodeTemplate( """\ auto ${inp}_t_raw = toNonOptFwGrad(${inp}); @@ -972,6 +985,23 @@ def find_args_with_derivatives( f"ERROR: derivative ignored for {name} -- specified an autograd function without derivative" ) + if requires_derivative and not len(fw_derivatives) == 0: + assert sum(len(derivative.var_names) for derivative in fw_derivatives) == len( + differentiable_outputs + ), ( + "Expected the number of forward derivatives implemented to match the " + "number of differentiable outputs. NB: This only applies when at least " + "one forward derivative is implemented. Not implementing any forward " + "derivatives is also okay, and we would require inputs to the op to " + "not have associated tangents in that case." + ) + try_jit_decomposition = ( + requires_derivative + and len(fw_derivatives) == 0 + and (not modifies_arguments(f)) + and (not returns_void) + ) + def emit_save_inputs() -> List[str]: setup: List[str] = [] if info is None or not info.has_derivatives: @@ -1338,7 +1368,9 @@ def check_tensorimpl_and_storage( ) return call - def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: + def emit_call( + f: NativeFunction, unpacked_bindings: List[Binding], try_jit_decomposition: bool + ) -> str: # We only care about adding `at::AutoDispatchBelowAutograd` guard for non-variable dispatch # (which corresponds to 'use_derived' strategy). 
The purpose of this guard is to make sure # the baseType operations still dispatch to non-Variable type, even if the arguments passed @@ -1352,13 +1384,51 @@ def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: else: guard = "at::AutoDispatchBelowADInplaceOrView guard;" + try_jit_decomposition_bool = "true" if try_jit_decomposition else "false" + any_has_forward_grad = ( + get_any_has_fw_grad_cond(derivative=None) + if requires_derivative + else "false" + ) + return_types = ", ".join( + [cpp.return_type(a, symint=True).cpp_type() for a in f.func.returns] + ) + if len(f.func.returns) > 1: + return_types = f"std::tuple<{return_types}>" + + arg_types = [ + cpp.argument_type(a, binds="", symint=True).cpp_type() + for a in f.func.arguments.flat_all + ] + arg_names = [ + a.name + for a in cpp.arguments( + f.func.arguments, + faithful=True, + symint=True, + method=False, + cpp_no_default_args=set(), + ) + ] + if not modifies_arguments(f) and not returns_void: + # Just to keep things simple here, we only care about this path + # and always emit the if/else for now call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute( - base_type_call=base_type_call, tmp_var=TMP_VAR, guard=guard + base_type_call=base_type_call, + tmp_var=TMP_VAR, + guard=guard, + try_jit_decomposition_bool=try_jit_decomposition_bool, + any_has_forward_grad=any_has_forward_grad, + op_name=cpp.name(f.func), + op_overload=f.func.name.overload_name, + returns_and_args=return_types + ", " + ", ".join(arg_types), + arg_names=arg_names, ) call += wrap_output(f, unpacked_bindings, TMP_VAR) else: + assert not try_jit_decomposition call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute( base_type_call=base_type_call, guard=guard ) @@ -1406,38 +1476,14 @@ def get_any_has_forward_grad_name(var_names: Tuple[str, ...]) -> str: def emit_any_has_forward_grad() -> List[str]: content: List[str] = [] for derivative in fw_derivatives: - assert derivative.required_inputs_fw_grad is not None - requires_fw_grad = " || ".join( - [ - FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) - for inp in differentiable_inputs - if inp.name in derivative.required_inputs_fw_grad - ] - ) - if not requires_fw_grad: - # Handle functions like stack - # For these, we don't unpack anything and always call the user function - if not ( - len(differentiable_inputs) == 1 - and is_tensor_list_type(differentiable_inputs[0].type) - ): - raise RuntimeError( - f'No differentiable input to "{name}" is a differentiable Tensor (as the provided ' - "forward AD formula does not use any input tangent) even though a forward gradient " - "formula has been defined for it. This case should only happen for function that " - "take a single TensorList as input. All other cases are not supported right now." 
- ) - requires_fw_grad = "true" - + requires_fw_grad = get_any_has_fw_grad_cond(derivative=derivative) if info and info.output_differentiability_conditions: assert len(info.output_differentiability_conditions) == 1 - requires_fw_grad = f"({info.output_differentiability_conditions[0]}) && ({requires_fw_grad})" - + requires_fw_grad = f"({info.output_differentiability_conditions[0]}) && {requires_fw_grad}" content.append( f"auto {get_any_has_forward_grad_name(derivative.var_names)} = {requires_fw_grad};\n" f"(void){get_any_has_forward_grad_name(derivative.var_names)};" ) - return content def emit_check_inplace() -> List[str]: @@ -1560,46 +1606,83 @@ def emit_fw_derivatives() -> List[str]: content.append("\n".join(fw_grad_setters)) return content - def emit_forbid_fw_derivatives(is_out_fn: bool = False) -> str: - def get_msg() -> str: - if is_out_fn: - msg = "because it is an out= function" - else: - msg = ( - "because it has not been implemented yet.\\nPlease file an issue " - "to PyTorch at https://github.com/pytorch/pytorch/issues/new?template=feature-request.yml " - "so that we can prioritize its implementation." - ) - return msg - - res = "" - to_check: List[str] = [] - for inp in list( - mapMaybe( - gen_differentiable_input, - f.func.arguments.non_out + list(f.func.arguments.out), # type: ignore[operator] - ) - ): - if is_tensor_type(inp.type): - to_check.append( - FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) - ) - elif is_tensor_list_type(inp.type): - cond = FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp="_t") - res += FW_DERIVATIVE_FORBID_LIST_TEMPLATE.substitute( - arg=inp.name, cond=cond, name=name, msg=get_msg() + def get_any_has_fw_grad_cond(derivative: Optional[ForwardDerivative]) -> str: + # + # Produces a condition string (e.g, "isFwGradDefined(grad_output) || isFwGradDefined(output)") + # + if derivative is None: + # (1) If a derivative is NOT provided, cond will check fw_grad of ALL differentiable inputs + # - Used in the out_fn case when we want to forbid fw derivatives + # - Used in the case where the fw_derivative is not defined, but we want + # To check if there is a decomposition registered for jvp + to_check: List[str] = [] + for inp in list( + mapMaybe( + gen_differentiable_input, + f.func.arguments.non_out + list(f.func.arguments.out), # type: ignore[operator] ) + ): + if is_tensor_type(inp.type): + to_check.append( + FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) + ) + elif is_tensor_list_type(inp.type): + to_check.append( + FW_DERIVATIVE_TENSORLIST_CHECK_TEMPLATE.substitute( + req_inp=inp.name + ) + ) + else: + raise RuntimeError( + f'Unsupported input type for "{name}" when forbidding forward AD usage.' + ) + return f'({" || ".join(to_check)})' + else: + # (2) If derivative is provided, use that information to determine which inputs + # to check fw_grad for + assert derivative.required_inputs_fw_grad is not None + + if len(derivative.required_inputs_fw_grad) == 0: + # Handle functions like stack + # For these, we don't unpack anything and always call the user function + if not ( + len(differentiable_inputs) == 1 + and is_tensor_list_type(differentiable_inputs[0].type) + ): + raise RuntimeError( + f'No differentiable input to "{name}" is a differentiable Tensor (as the provided ' + "forward AD formula does not use any input tangent) even though a forward gradient " + "formula has been defined for it. This case should only happen for function that " + "take a single TensorList as input. All other cases are not supported right now." 
+ ) + any_has_fw_grad = "true" else: - raise RuntimeError( - f'Unsupported input type for "{name}" when forbidding forward AD usage.' + any_has_fw_grad = " || ".join( + [ + FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) + for inp in differentiable_inputs + if inp.name in derivative.required_inputs_fw_grad + ] ) + any_has_fw_grad = f"({any_has_fw_grad})" + + return any_has_fw_grad - if len(to_check) > 0: - cond = " || ".join(to_check) - res += FW_DERIVATIVE_FORBID_TEMPLATE.substitute( - cond=cond, name=name, msg=get_msg() + def emit_forbid_fw_derivatives(is_out_fn: bool = False) -> str: + if is_out_fn: + msg = "because it is an out= function" + else: + msg = ( + "because it has not been implemented yet.\\nPlease file an issue " + "to PyTorch at https://github.com/pytorch/pytorch/issues/new?template=feature-request.yml " + "so that we can prioritize its implementation." ) - return res + cond = get_any_has_fw_grad_cond(derivative=None) + return ( + FW_DERIVATIVE_FORBID_TEMPLATE.substitute(cond=cond, name=name, msg=msg) + if cond != "" + else "" + ) body: List[str] = [] unpack_args_stats, unpacked_bindings = unpack_args(f) @@ -1613,7 +1696,7 @@ def get_msg() -> str: body.extend(setup_derivative(differentiable_inputs)) body.append(declare_returned_variables(f)) - body.append(emit_call(f, unpacked_bindings)) + body.append(emit_call(f, unpacked_bindings, try_jit_decomposition)) if requires_derivative: # set_flags has to appear after version_counter, because rebase_history # requires that the counter is incremented before it is called @@ -1623,20 +1706,11 @@ def get_msg() -> str: if is_out_fn: body.append(emit_forbid_fw_derivatives(is_out_fn=True)) else: - if requires_derivative: - body.extend(emit_fw_derivatives()) - if len(fw_derivatives) == 0: - body.append(emit_forbid_fw_derivatives()) + if requires_derivative and not try_jit_decomposition: + if len(fw_derivatives) > 0: + body.extend(emit_fw_derivatives()) else: - assert sum( - len(derivative.var_names) for derivative in fw_derivatives - ) == len(differentiable_outputs), ( - "Expected the number of forward derivatives implemented to match the " - "number of differentiable outputs. NB: This only applies when at least " - "one forward derivative is implemented. Not implementing any forward " - "derivatives is also okay, and we would require inputs to the op to " - "not have associated tangents in that case." - ) + body.append(emit_forbid_fw_derivatives()) if requires_derivative: # Save only after the forward AD has been set up diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 9cd2d5c40de7..3c467f83c318 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -1,4 +1,5 @@ #include "torch/csrc/autograd/VariableTypeUtils.h" +#include "torch/csrc/autograd/VariableTypeUtilsDependOnOps.h" #include "torch/csrc/autograd/generated/VariableType.h" #include "torch/csrc/autograd/FunctionsManual.h" diff --git a/torch/csrc/autograd/VariableTypeUtilsDependOnOps.h b/torch/csrc/autograd/VariableTypeUtilsDependOnOps.h new file mode 100644 index 000000000000..f2569c9d6463 --- /dev/null +++ b/torch/csrc/autograd/VariableTypeUtilsDependOnOps.h @@ -0,0 +1,40 @@ +#pragma once + +#include + +// This is the set of helpers in VariableTypeUtils have a dependency on +// native_functions.yaml meaning the file will need to be re-compiled every time +// an operator is changed or added. 
We cannot simply put these functions in +// VariableType.h and VariableTypeutils.h, since they are included in files like +// ADInplaceOrViewType_X.cpp which don't always want to be recompiled. + +namespace torch { +namespace autograd { +namespace impl { + +// Depends on torch/csrc/jit/ir/ir.h -> aten/src/ATen/core/interned_strings.h +template +Return run_jit_decomposition_with_args_for_jvp( + c10::string_view name, + const c10::OperatorHandle& opHandle, + c10::DispatchKeySet dispatchKeySet, + Args... args) { + bool has_decomp = jit::has_jit_decomposition(opHandle.schema()); + + TORCH_CHECK_NOT_IMPLEMENTED( + has_decomp, + "Trying to use forward AD with ", + name, + " that does not support it" + "because it has not been implemented yet and does not have a decomposition.\\nPlease file an issue " + "to PyTorch at https://github.com/pytorch/pytorch/issues/new?template=feature-request.yml " + "so that we can prioritize its implementation."); + + return c10::KernelFunction::makeFromBoxedKernel( + c10::BoxedKernel::makeFromFunction<&jit::run_jit_decomposition>()) + .call(opHandle, dispatchKeySet, args...); +} + +} // namespace impl +} // namespace autograd +} // namespace torch diff --git a/torch/csrc/autograd/functions/utils.h b/torch/csrc/autograd/functions/utils.h index a2169f18656f..75df1a0302c9 100644 --- a/torch/csrc/autograd/functions/utils.h +++ b/torch/csrc/autograd/functions/utils.h @@ -100,5 +100,23 @@ inline bool isFwGradDefined(const c10::optional& t) { return t.has_value() && t->defined() && t->_fw_grad(/*level */ 0).defined(); } +inline bool isFwGradDefinedTensorList(const at::TensorList& variables) { + bool ret = false; + for (auto& variable : variables) { + ret |= isFwGradDefined(variable); + } + return ret; +} + +inline bool isFwGradDefinedTensorList( + const c10::List> li) { + bool ret = false; + for (auto i : c10::irange(li.size())) { + auto t = li.get(i); + ret |= (t.has_value() && isFwGradDefined(t.value())); + } + return ret; +} + } // namespace autograd } // namespace torch diff --git a/torch/csrc/jit/runtime/decomposition_registry.cpp b/torch/csrc/jit/runtime/decomposition_registry.cpp index d55ac7eac9be..bfad602ef2f2 100644 --- a/torch/csrc/jit/runtime/decomposition_registry.cpp +++ b/torch/csrc/jit/runtime/decomposition_registry.cpp @@ -160,6 +160,26 @@ void RegisterDecomposition( schema_to_decomposition[&schema] = g; } +void run_jit_decomposition( + const c10::OperatorHandle& op, + torch::jit::Stack* stack) { + const auto& schema = op.schema(); + // TODO: templatize based on op and keep static trace_exec + auto* trace_exec = torch::jit::GetDecompositionExecutor(schema); + trace_exec->run((*stack)); + if (stack->back().isTuple()) { + at::IValue tup = stack->back(); + stack->pop_back(); + for (const auto& elem : tup.toTuple()->elements()) { + stack->push_back(elem); + } + } +} + +bool has_jit_decomposition(const FunctionSchema& schema) { + return GetDecompositionFunction(schema).has_value(); +} + Function* GetDecompositionExecutor(const FunctionSchema& schema) { auto maybe_func = GetDecompositionFunction(schema); TORCH_INTERNAL_ASSERT(maybe_func); diff --git a/torch/csrc/jit/runtime/decomposition_registry.h b/torch/csrc/jit/runtime/decomposition_registry.h index 4c6ef3029a0b..225204cf60de 100644 --- a/torch/csrc/jit/runtime/decomposition_registry.h +++ b/torch/csrc/jit/runtime/decomposition_registry.h @@ -25,5 +25,11 @@ TORCH_API Function* GetDecompositionExecutor(const char* schema_literal); TORCH_API Function* GetDecompositionExecutor(const FunctionSchema& schema); 
+TORCH_API void run_jit_decomposition( + const c10::OperatorHandle& op, + torch::jit::Stack* stack); + +TORCH_API bool has_jit_decomposition(const FunctionSchema& schema); + } // namespace jit } // namespace torch diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 3f152354e6d2..a9e98a44dcaa 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -39,6 +39,10 @@ import torch._refs.special import torch._refs.linalg +# Make sure that decompositions used for test_forward_mode_AD and +# test_fn_fwgrad_bwgrad are registered to the jit +import torch._decomp.decompositions_for_jvp + import torch._prims as prims # noqa: F401 from torch.utils._pytree import tree_flatten @@ -10164,6 +10168,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): assert_jit_shape_analysis=True, assert_autodiffed=True, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, supports_out=True), OpInfo('softmax', aliases=('special.softmax', 'nn.functional.softmax',), @@ -10173,6 +10178,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_func=partial(sample_inputs_softmax_variant, with_dtype=True), assert_autodiffed=True, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, supports_out=True), # `softmin` supports different dtypes based on whether `dtype` argument, # is passed or not. Hence two OpInfo entries, one with dtype and other without. @@ -10185,6 +10191,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): assert_jit_shape_analysis=False, assert_autodiffed=False, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, supports_out=False), OpInfo('nn.functional.softmin', variant_test_name="with_dtype", @@ -10193,6 +10200,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_func=partial(sample_inputs_softmax_variant, with_dtype=True), assert_autodiffed=False, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, supports_out=False), OpInfo( "nn.functional.cross_entropy", @@ -10201,6 +10209,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_func=sample_inputs_cross_entropy, supports_out=False, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, decorators=( DecorateInfo( toleranceOverride({torch.float32: tol(atol=1e-5, rtol=1e-3)}), @@ -10292,6 +10301,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, assert_jit_shape_analysis=True, + supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_native_layer_norm, error_inputs_func=error_inputs_native_layer_norm, skips=( @@ -10663,6 +10673,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, decorators=[ # RuntimeError: Cannot insert a Tensor that requires grad as a constant. 
# Consider making it a parameter or input, or detaching the gradient @@ -10681,6 +10692,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, assert_jit_shape_analysis=True, decorators=[ DecorateInfo( @@ -11720,6 +11732,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, assert_jit_shape_analysis=True, sample_inputs_func=sample_inputs_batch_norm, skips=( @@ -11742,6 +11755,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, decorators=[onlyCUDA, disablecuDNN], skips=( DecorateInfo(toleranceOverride({torch.float16: tol(atol=1e-02, rtol=1e-02)}), @@ -14704,6 +14718,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_softmax_variant, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, assert_autodiffed=True), OpInfo( 'log_softmax', @@ -14713,6 +14728,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), sample_inputs_func=partial(sample_inputs_softmax_variant, with_dtype=True), supports_forward_ad=True, + supports_fwgrad_bwgrad=True, assert_autodiffed=True), UnaryUfuncInfo('logit', aten_backward_name='logit_backward', @@ -15589,6 +15605,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_out=False, sample_inputs_func=sample_inputs_nll_loss, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, assert_jit_shape_analysis=True, skips=( # RuntimeError: diff --git a/torch/testing/_internal/opinfo/definitions/_masked.py b/torch/testing/_internal/opinfo/definitions/_masked.py index d8a3e8aa948d..cb88766e70c6 100644 --- a/torch/testing/_internal/opinfo/definitions/_masked.py +++ b/torch/testing/_internal/opinfo/definitions/_masked.py @@ -990,6 +990,7 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar ), gradcheck_wrapper=gradcheck_wrapper_masked_operation, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, supports_out=False, ), OpInfo( @@ -1017,6 +1018,7 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar ], gradcheck_wrapper=gradcheck_wrapper_masked_operation, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, supports_out=False, ), OpInfo( @@ -1037,6 +1039,7 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar ), gradcheck_wrapper=gradcheck_wrapper_masked_operation, supports_forward_ad=True, + supports_fwgrad_bwgrad=True, supports_out=False, ), OpInfo( From d3d163af8061e08097c3ae37079bf61535b81ff1 Mon Sep 17 00:00:00 2001 From: lezcano Date: Wed, 7 Sep 2022 13:12:49 +0000 Subject: [PATCH 15/45] Add xla/ folder to gitignore (#84632) As per title Pull Request resolved: https://github.com/pytorch/pytorch/pull/84632 Approved by: https://github.com/ezyang --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 88d472b456f4..5dbad08f4f4c 100644 --- a/.gitignore +++ b/.gitignore @@ -307,6 +307,9 @@ TAGS # bazel symlinks bazel-* +# xla repo +xla/ + # direnv, posh-direnv .envrc .psenvrc From 
99b7eb4dfbf8387d15b46913f1ff4e771782f499 Mon Sep 17 00:00:00 2001 From: mikey dagitses Date: Wed, 7 Sep 2022 15:44:20 +0000 Subject: [PATCH 16/45] move internal only PyTorch test defs into fb/ subdirectories (#84605) Test Plan: Rely on CI. Differential Revision: D39289373 Pull Request resolved: https://github.com/pytorch/pytorch/pull/84605 Approved by: https://github.com/DanilBaibak --- test/defs.bzl | 112 ------------------------ test/distributed/defs.bzl | 39 --------- test/distributed/fsdp/defs.bzl | 22 ----- test/distributed/pipeline/sync/defs.bzl | 22 ----- 4 files changed, 195 deletions(-) delete mode 100644 test/defs.bzl delete mode 100644 test/distributed/defs.bzl delete mode 100644 test/distributed/fsdp/defs.bzl delete mode 100644 test/distributed/pipeline/sync/defs.bzl diff --git a/test/defs.bzl b/test/defs.bzl deleted file mode 100644 index 0e92326402dd..000000000000 --- a/test/defs.bzl +++ /dev/null @@ -1,112 +0,0 @@ -load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") -load("@fbsource//tools/build_defs/sandcastle:sandcastle_defs.bzl", "is_sandcastle_machine") - -def define_python_unittest(pytest = False, **kwargs): - build_mode = native.read_config("fbcode", "build_mode_test_label") - enable_flatbuffer = bool(native.read_config("fbcode", "caffe2_enable_flatbuffer", None)) - - PYTORCH_TEST_WITH_ASAN = "1" if ("asan" in build_mode or build_mode == "dev") else "0" - - PYTORCH_TEST_WITH_DEV_DBG_ASAN = "1" if (build_mode == "dev" or "dev-asan" in build_mode or "dbg-asan" in build_mode or "dbgo-asan" in build_mode) else "0" - - PYTORCH_TEST_WITH_TSAN = "1" if ("tsan" in build_mode) else "0" - - PYTORCH_TEST_WITH_UBSAN = "1" if ("ubsan" in build_mode or build_mode == "dev") else "0" - - NO_MULTIPROCESSING_SPAWN = "1" if is_sandcastle_machine() else "0" - - ENABLE_FLATBUFFER = "1" if enable_flatbuffer else "0" - - # indicates we are running in test env. - # "deepcopy" the 'env: Dict[str, str]' - kwargs["env"] = dict(kwargs.get("env", {})) - kwargs["env"]["PYTORCH_TEST"] = "1" - kwargs["env"]["PYTORCH_TEST_FBCODE"] = "1" - kwargs["env"]["PYTORCH_TEST_WITH_ASAN"] = PYTORCH_TEST_WITH_ASAN - kwargs["env"]["PYTORCH_TEST_WITH_DEV_DBG_ASAN"] = PYTORCH_TEST_WITH_DEV_DBG_ASAN - kwargs["env"]["PYTORCH_TEST_WITH_TSAN"] = PYTORCH_TEST_WITH_TSAN - kwargs["env"]["PYTORCH_TEST_WITH_UBSAN"] = PYTORCH_TEST_WITH_UBSAN - kwargs["env"]["NO_MULTIPROCESSING_SPAWN"] = NO_MULTIPROCESSING_SPAWN - kwargs["env"]["ENABLE_FLATBUFFER"] = ENABLE_FLATBUFFER - - # To speed up TP tests. - kwargs["env"]["TENSORPIPE_TLS_DATACENTER"] = "test_dc" - - # Run CUDA tests on GPUs - if kwargs.get("name").endswith("cuda"): - # "deepcopy" the 'tags: List[str]' - kwargs["tags"] = list(kwargs.get("tags", [])) - kwargs["tags"].extend([ - "re_opts_capabilities={\"platform\": \"gpu-remote-execution\", \"subplatform\": \"P100\"}", - "supports_remote_execution", - "run_as_bundle", - "tpx:experimental-shard-size-for-bundle=100", - ]) - kwargs["env"]["PYTORCH_TEST_REMOTE_GPU"] = "1" - - if pytest: - python_pytest( - **kwargs - ) - else: - python_unittest( - **kwargs - ) - -def define_mp_tests(tests, additional_deps = None, pytest = False, **kwargs): - # LeakSanitizer doesn't work for python multiprocessing. 
- # See https://fb.workplace.com/groups/fbcode/posts/2625521060818050/ - # and https://fb.workplace.com/groups/101100140348621/posts/1278688645923092/ - extra_env = { - "ASAN_OPTIONS": "detect_leaks=0", - "CUDA_INJECTION64_PATH": "0", # resolve kineto TSAN flakiness - } - - # Serialize test cases since multiple tests running on same GPUs can - # deadlock or there can be port conflicts. - if "tags" not in kwargs: - kwargs["tags"] = [] - if "serialize_test_cases" not in kwargs["tags"]: - kwargs["tags"].append("serialize_test_cases") - define_tests(tests, additional_deps, pytest, extra_env, **kwargs) - -def define_q_distributed_test(tests, env = None, additional_deps = None, pytest = False, **kwargs): - define_tests(tests, additional_deps, pytest, env, **kwargs) - -def define_tests(tests, additional_deps = None, pytest = False, extra_env = {}, **kwargs): - if additional_deps == None: - additional_deps = {} - - provided_tags = kwargs.pop("tags", []) - - env = { - "DOCS_SRC_DIR": "$(location //caffe2/docs/source:doc_files)", - "MKL_NUM_THREADS": "1", - "OMP_NUM_THREADS": "1", - "SKIP_TEST_BOTTLENECK": "1", - } - env.update(extra_env) - for name, srcs in tests.items(): - tags = list(provided_tags) - - test_deps = ["//caffe2:test-lib"] + additional_deps.get(name, []) - define_python_unittest( - pytest, - name = name, - srcs = srcs, - base_module = "", - compile = "with-source", - env = env, - py_version = ">=3.5", - strip_libpar = True, - tags = tags, - deps = test_deps, - # Depend directly on :libtorch so that tests won't be pruned by the - # rdep distance heuristic. - cpp_deps = ["//caffe2:libtorch"], - runtime_deps = [ - "//caffe2/docs/source:doc_files", - ], - **kwargs - ) diff --git a/test/distributed/defs.bzl b/test/distributed/defs.bzl deleted file mode 100644 index d3b3040ea4c3..000000000000 --- a/test/distributed/defs.bzl +++ /dev/null @@ -1,39 +0,0 @@ -load("@fbsource//tools/build_defs:testpilot_defs.bzl", "special_tags") -load( - "//caffe2/test:defs.bzl", - "define_python_unittest", -) - -# These distributed tests need custom environment variables -def define_distributed_test(**kwargs): - # LeakSanitizer doesn't work for python multiprocessing. 
- # See https://fb.workplace.com/groups/fbcode/posts/2625521060818050/ - # and https://fb.workplace.com/groups/101100140348621/posts/1278688645923092/ - kwargs["env"]["ASAN_OPTIONS"] = "detect_leaks=0" - - # Resolve kineto TSAN flakiness - kwargs["env"]["CUDA_INJECTION64_PATH"] = "0" - define_python_unittest( - base_module = "", - main_module = "fb.test_distributed_trap", - py_version = ">=3.5", - tags = [special_tags.run_as_bundle], - deps = [ - "//caffe2:test-lib", - "//caffe2:torch", - "//caffe2/torch/fb/rendezvous:zeus", - "//pytorch/vision:torchvision", - ], - external_deps = [ - ("numpy", None), - ("scipy", None), - ], - **kwargs - ) - -def define_c10d_distributed_test(srcs, **kwargs): - srcs.extend(["fb/test_distributed_trap.py"]) - define_distributed_test( - srcs = srcs + native.glob(["data/*.py"]), - **kwargs - ) diff --git a/test/distributed/fsdp/defs.bzl b/test/distributed/fsdp/defs.bzl deleted file mode 100644 index 2e496838c807..000000000000 --- a/test/distributed/fsdp/defs.bzl +++ /dev/null @@ -1,22 +0,0 @@ -load("@bazel_skylib//lib:paths.bzl", "paths") -load( - "//caffe2/test:defs.bzl", - "define_mp_tests", -) - -def define_fsdp_tests(): - test_files = native.glob(["**/test_*.py"]) - - TESTS = {} - - additional_deps = {} - for test_file in test_files: - test_file_name = paths.basename(test_file) - test_name = test_file_name.replace("test_", "").replace(".py", "") - TESTS[test_name] = [test_file] - additional_deps[test_name] = ["//pytorch/vision:torchvision"] - - define_mp_tests( - tests = TESTS, - additional_deps = additional_deps, - ) diff --git a/test/distributed/pipeline/sync/defs.bzl b/test/distributed/pipeline/sync/defs.bzl deleted file mode 100644 index 0de277bddaef..000000000000 --- a/test/distributed/pipeline/sync/defs.bzl +++ /dev/null @@ -1,22 +0,0 @@ -load("@bazel_skylib//lib:paths.bzl", "paths") -load( - "//caffe2/test:defs.bzl", - "define_tests", -) - -def define_pipeline_tests(): - test_files = native.glob(["**/test_*.py"]) - - TESTS = {} - - for test_file in test_files: - test_file_name = paths.basename(test_file) - test_name = test_file_name.replace("test_", "").replace(".py", "") - TESTS[test_name] = [test_file] - - define_tests( - pytest = True, - tests = TESTS, - external_deps = [("pytest", None)], - resources = ["conftest.py"], - ) From 87738f2073d808f0f76d607d1593f7683a463f45 Mon Sep 17 00:00:00 2001 From: Shen Li Date: Wed, 7 Sep 2022 02:22:56 +0000 Subject: [PATCH 17/45] Remove expired c10d::broadcast backward compatibility check (#84107) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84107 Approved by: https://github.com/wanchaol --- .../check_forward_backward_compatibility.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index cf3a25f82853..71560c5c0550 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -120,7 +120,6 @@ ("aten::segment_reduce", datetime.date(2022, 6, 30)), ("aten::_segment_reduce_backward", datetime.date(2022, 6, 30)), ("aten::empty.SymInt", datetime.date(9999, 1, 1)), - ("c10d::broadcast", datetime.date(2022, 6, 25)), ("aten::.*functional", datetime.date(2022, 8, 1)), ("aten::_foreach.*", datetime.date(2022, 8, 1)), ("aten::unflatten", datetime.date(2022, 8, 10)), From 50ae5c9141fc752c80e7fe88a123ea77ee0265f9 Mon Sep 17 00:00:00 2001 From: 
Jianyu Huang Date: Wed, 7 Sep 2022 16:14:23 +0000 Subject: [PATCH 18/45] set workspace size to 4M (#74159) Summary: Follow D34480690 (https://github.com/pytorch/pytorch/commit/3ec1dd9989ac5441c767f975f5e0fc46847400a2) Test Plan: CI Differential Revision: D34636039 Pull Request resolved: https://github.com/pytorch/pytorch/pull/74159 Approved by: https://github.com/xuzhao9 --- aten/src/ATen/cuda/CUDABlas.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 866f53ee7f87..e1a01ceb6829 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -709,9 +709,11 @@ void gemm_and_bias( CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld); CuBlasLtMatmulPreference preference; - // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind - // setting this to 1M. - size_t workspaceSize = 1024 * 1024; + // See https://github.com/pytorch/pytorch/issues/73328. + // Check https://docs.nvidia.com/cuda/cublas/index.html#cublassetworkspace . + // Recommended size of user-provided workspace is at least 4MiB (to match + // cuBLAS' default workspace pool). + size_t workspaceSize = 4 * 1024 * 1024; TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute( preference.descriptor(), CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, From ed46b9670ebafa1c6bf7d078dcf5687109fee6ae Mon Sep 17 00:00:00 2001 From: samdow Date: Tue, 6 Sep 2022 16:41:00 -0400 Subject: [PATCH 19/45] add randomness kwarg to jacfwd (#84220) From https://github.com/pytorch/functorch/issues/1010, if a user runs jacfwd with a function that uses randomness, it will fail since the default behavior for vmap is error. This lets the user specify the randomness behavior to jacfwd too since it is doing vmap(jvp(forward)). This is less likely to show up in jacrev since that only vmaps over the backwards pass Pull Request resolved: https://github.com/pytorch/pytorch/pull/84220 Approved by: https://github.com/zou3519 --- functorch/functorch/_src/eager_transforms.py | 7 +++++-- functorch/test/test_vmap.py | 16 ++++++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/functorch/functorch/_src/eager_transforms.py b/functorch/functorch/_src/eager_transforms.py index bc6d2e2ffd81..8750172cb8ce 100644 --- a/functorch/functorch/_src/eager_transforms.py +++ b/functorch/functorch/_src/eager_transforms.py @@ -838,7 +838,7 @@ def safe_unflatten(tensor, dim, shape): return tensor.unflatten(dim, shape) -def jacfwd(func: Callable, argnums: argnums_t = 0, has_aux: bool = False): +def jacfwd(func: Callable, argnums: argnums_t = 0, has_aux: bool = False, *, randomness: str = "error"): """ Computes the Jacobian of :attr:`func` with respect to the arg(s) at index :attr:`argnum` using forward-mode autodiff @@ -854,6 +854,9 @@ def jacfwd(func: Callable, argnums: argnums_t = 0, has_aux: bool = False): the function to be differentiated and the second element is auxiliary objects that will not be differentiated. Default: False. + randomness(str): Flag indicating what type of randomness to use. + See :func:`vmap` for more detail. Allowed: "different", "same", "error". + Default: "error" Returns: Returns a function that takes in the same inputs as :attr:`func` and @@ -957,7 +960,7 @@ def push_jvp(basis): _, jvp_out = output return jvp_out - results = vmap(push_jvp)(basis) + results = vmap(push_jvp, randomness=randomness)(basis) if has_aux: results, aux = results # aux is in the standard basis format, e.g. 
NxN matrix diff --git a/functorch/test/test_vmap.py b/functorch/test/test_vmap.py index ceb3c0c43b3c..6b85f3786108 100644 --- a/functorch/test/test_vmap.py +++ b/functorch/test/test_vmap.py @@ -45,7 +45,7 @@ from collections import namedtuple import functorch -from functorch import vmap, grad, grad_and_value, jvp, vjp +from functorch import vmap, grad, grad_and_value, jvp, vjp, jacfwd from functorch.experimental import chunk_vmap from functorch._C import reshape_dim_into, reshape_dim_outof from functorch._src.make_functional import functional_init_with_buffers @@ -4479,6 +4479,19 @@ def f(x): self._assert_all_slices_unique(output) + def test_jacfwd_with_random(self): + # checks on behavior are above, this just checks that jacfwd respects + # the randomness param + + x = torch.rand(3, 4) + with self.assertRaisesRegex(RuntimeError, r"called random operation while in randomness error mode"): + jacfwd(torch.bernoulli)(x) + + # x isn't batched so use bernoulli since it doesn't do inplace randomness + jacfwd(torch.bernoulli, randomness="same")(x) + jacfwd(torch.bernoulli, randomness="different")(x) + + class TestTransformFailure(TestCase): @parametrize('transform', ['vmap', 'grad', 'grad_and_value', 'vjp', 'jvp', 'jacrev', 'jacfwd']) def test_fails_with_autograd_function(self, device, transform): @@ -4512,7 +4525,6 @@ def f(x): with self.assertRaisesRegex(RuntimeError, "autograd.Function"): transform(input) - only_for = ("cpu", "cuda") instantiate_device_type_tests(TestVmapOperatorsOpInfo, globals(), only_for=only_for) From 19e27b15562b261e87e3e629cb32cb6876b9caca Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 7 Sep 2022 05:58:32 -0700 Subject: [PATCH 20/45] Make dispatcher registrations of SymInt functions backwards compatible (#84557) Previously, when we SymInt-ify a schema, this is a BC-breaking change for all people who registered functions for that function; they must accept c10::SymInt where they previously accepted int64_t. This is not great. With this change, I accept old type registrations transparently. The idea is in several parts: - At the registration site, at compile time I have no idea whether or not if the function being registered has a SymInt schema or not. So I must defer the exact compatibility check. What I do instead is check if the function pointer registered to me has SymInt in the argument or not. If it does, I assume it is new-style and ensure it is also registered to a special sym_ slot on KernelFunction. If not, it only goes in the conventional slot. - At the dispatcher site, I know at compile time whether or not this is a SymInt function. If it is, I check for a sym_ slot on the KernelFunction, and preferentially use that. If no such slot exists, I then fall back to the regular slot... but I convert all SymInt arguments to int64_t arguments (doing assertions that no true symbolic integer was passed.) I can skip this test entirely if the function doesn't have any SymInts in it; in that case I know that only the original slot could have been registered. Fortunately, both branches of the short circuit typecheck, so I didn't have to use SFINAE or if-constexpr to make it work; just a plain if statement that I expect the compiler to optimize away. - Schema validation is now modestly more complicated. There are two parts. First, function schema validation proceeds by checking if the signature in question has any SymInt-like types in it or not. 
If it does, we do function schema validation against the real types; if it doesn't, we do validation against the fake types (but only for symint; MemoryFormat is always MemoryFormat). Second, cpp signature validation also keeps track of a "symint" cpp signature and a "non-symint" cpp signature. We only compare symint with symint, and non-symint with non-symint. I did not implement checking a conflict between a symint and non-symint cpp signature, though in principle you could try converting the SymInt types to non-SymInt types and doing the comparison that way. To show it is working, I remove a bunch of c10::asIntArrayRefSlow shims, as the dispatcher is able to insert them automatically now. I didn't update the Metal registrations (though they can get similar treatment) as OSS CI coverage is insufficient for this case. Signed-off-by: Edward Z. Yang Differential Revision: [D39280965](https://our.internmc.facebook.com/intern/diff/D39280965) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84557 Approved by: https://github.com/wconstab --- aten/src/ATen/BatchingRegistrations.cpp | 27 +------- aten/src/ATen/core/boxing/KernelFunction.h | 42 ++++++++++++- .../ATen/core/boxing/KernelFunction_impl.h | 61 ++++++++++++++++--- aten/src/ATen/core/dispatch/OperatorEntry.cpp | 47 ++++++++------ aten/src/ATen/core/dispatch/OperatorEntry.h | 11 ++-- aten/src/ATen/core/function_schema.cpp | 20 +++++- aten/src/ATen/core/function_schema.h | 2 +- aten/src/ATen/native/vulkan/ops/Factory.cpp | 10 +-- aten/src/ATen/native/vulkan/ops/Shape.cpp | 3 +- functorch/functorch/csrc/BatchRulesViews.cpp | 14 ----- .../open_registration_extension.cpp | 4 +- test/cpp_extensions/ort_extension.cpp | 4 +- 12 files changed, 156 insertions(+), 89 deletions(-) diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index fab2c9e60762..02bbbb7088d6 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -185,11 +185,6 @@ Tensor expand_batching_rule(const Tensor& self, IntArrayRef size, bool implicit) return self_physical.getPhysicalToLogicalMap().apply(result); } -Tensor expand_symint_batching_rule(const Tensor& self, SymIntArrayRef psize, bool implicit) { - // TODO: properly support this - return expand_batching_rule(self, asIntArrayRefSlow(psize), implicit); -} - std::vector chunk_batching_rule(const Tensor& self, int64_t chunks, int64_t dim) { auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); auto dim_physical = self_physical.getPhysicalDim(dim); @@ -469,11 +464,6 @@ Tensor view_batching_rule(const Tensor& self, IntArrayRef size) { return self_physical.getPhysicalToLogicalMap().apply(result); } -Tensor view_symint_batching_rule(const Tensor& self, c10::SymIntArrayRef size) { - // TODO: properly support this - return view_batching_rule(self, asIntArrayRefSlow(size)); -} - Tensor view_as_complex_batching_rule(const Tensor& self) { // guard against the user passing in a batch of scalar tensors with batch // size equal to 2. 
@@ -1004,17 +994,6 @@ Tensor new_empty_batching_rule( return physical_view.getPhysicalToLogicalMap().apply(result); } -Tensor new_empty_symint_batching_rule( - const Tensor& self, - c10::SymIntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { - // TODO: properly support this - return new_empty_batching_rule(self, asIntArrayRefSlow(size), dtype, layout, device, pin_memory); -} - Tensor new_empty_strided_batching_rule( const Tensor& self, IntArrayRef size, @@ -1112,7 +1091,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("tensor_split.sections", tensor_split_sections_batching_rule); m.impl("tensor_split.indices", tensor_split_indices_batching_rule); m.impl("diagonal", diagonal_batching_rule); - m.impl("expand", expand_symint_batching_rule); + m.impl("expand", expand_batching_rule); m.impl("expand_as", native::expand_as); // composite wrt autograd m.impl("movedim.intlist", movedim_batching_rule); m.impl("movedim.int", static_cast(native::movedim)); // composite wrt autograd @@ -1140,7 +1119,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("unbind.int", unbind_batching_rule); m.impl("unfold", unfold_batching_rule); m.impl("unsqueeze", unsqueeze_batching_rule); - m.impl("view", view_symint_batching_rule); + m.impl("view", view_batching_rule); m.impl("view_as", native::view_as); // composite wrt autograd // clamp operations @@ -1278,7 +1257,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("diagonal_backward", diagonal_backward_batching_rule); // Tensor.new_* operators - m.impl("new_empty", new_empty_symint_batching_rule); + m.impl("new_empty", new_empty_batching_rule); m.impl("new_empty_strided", new_empty_strided_batching_rule); m.impl("new_zeros", new_zeros_batching_rule); diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 8ab34e95046a..0f48c7560d6b 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -14,6 +14,40 @@ class OperatorHandle; struct OperatorKernel; class KernelFunction; +template +using has_symint = + guts::disjunction< + std::is_same>, + std::is_same>, + std::is_same, std::decay_t> + >; + +template +struct remove_symint { + using type = T; +}; + +template <> +struct remove_symint { + using type = int64_t; +}; + +template <> +struct remove_symint { + using type = c10::IntArrayRef; +}; + +template <> +struct remove_symint> { + using type = c10::optional; +}; + +template +using fn_has_symint = typename guts::typelist::true_for_any_type< + has_symint, + typename guts::infer_function_traits::type::parameter_types +>; + /** * KernelFunction is similar to std::function but stores a kernel function. * You can create a KernelFunction from a boxed or unboxed function/functor/lambda @@ -31,6 +65,7 @@ class TORCH_API KernelFunction final { // Fast path for dispatch to allow not touching the boxed kernel in // the common case where unboxed is available. 
bool isValidUnboxed() const; + bool isValidSymUnboxed() const; bool isValid() const; bool isFallthrough() const; @@ -182,13 +217,16 @@ class TORCH_API KernelFunction final { explicit KernelFunction( std::unique_ptr functor, InternalBoxedKernelFunction* boxed_kernel_func, - void* unboxed_kernel_func); + void* unboxed_kernel_func, + void* sym_unboxed_kernel_func); explicit KernelFunction( BoxedKernel boxed_fn, - void* unboxed_kernel_func); + void* unboxed_kernel_func, + void* sym_unboxed_kernel_func); BoxedKernel boxed_kernel_func_; void* unboxed_kernel_func_; + void* sym_unboxed_kernel_func_; }; } diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index c33175e4b99a..8c968e835fa6 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -8,22 +8,29 @@ namespace c10 { inline KernelFunction::KernelFunction() : boxed_kernel_func_() , unboxed_kernel_func_(nullptr) + , sym_unboxed_kernel_func_(nullptr) {} -inline KernelFunction::KernelFunction(std::unique_ptr functor, InternalBoxedKernelFunction* boxed_kernel_func, void* unboxed_kernel_func) +inline KernelFunction::KernelFunction(std::unique_ptr functor, InternalBoxedKernelFunction* boxed_kernel_func, void* unboxed_kernel_func, void* sym_unboxed_kernel_func = nullptr) : boxed_kernel_func_(std::move(functor), boxed_kernel_func) , unboxed_kernel_func_(unboxed_kernel_func) + , sym_unboxed_kernel_func_(sym_unboxed_kernel_func) {} -inline KernelFunction::KernelFunction(BoxedKernel boxed_fn, void* unboxed_kernel_func) +inline KernelFunction::KernelFunction(BoxedKernel boxed_fn, void* unboxed_kernel_func, void* sym_unboxed_kernel_func = nullptr) : boxed_kernel_func_(std::move(boxed_fn)) , unboxed_kernel_func_(unboxed_kernel_func) + , sym_unboxed_kernel_func_(sym_unboxed_kernel_func) {} inline bool KernelFunction::isValidUnboxed() const { return unboxed_kernel_func_ != nullptr; } +inline bool KernelFunction::isValidSymUnboxed() const { + return sym_unboxed_kernel_func_ != nullptr; +} + inline bool KernelFunction::isValid() const { return boxed_kernel_func_.isValid(); } @@ -43,16 +50,52 @@ inline Return callUnboxedKernelFunction(void* unboxed_kernel_func, OperatorKerne return (*func)(functor, dispatchKeySet, std::forward(args)...); } +// This template requires you to explicitly specify the argument you want to +// forward; it doesn't work if you try to deduce it + +template +inline typename remove_symint::type unpackSymInt(T x) { return x; } + +template <> +inline typename remove_symint::type unpackSymInt(c10::SymInt x) { + return x.expect_int(); +} + +template <> +inline typename remove_symint::type unpackSymInt(c10::SymIntArrayRef x) { + return c10::asIntArrayRefSlow(x); +} + +template <> +inline typename remove_symint>::type unpackSymInt(c10::optional x) { + return x.has_value() ? c10::make_optional(x->expect_int()) : c10::nullopt; +} + template C10_ALWAYS_INLINE Return KernelFunction::call(const OperatorHandle& opHandle, DispatchKeySet dispatchKeySet, Args... args) const { // note: Args above is intentionally not Args&&. We don't want perfect // forwarding, which would require Args to be deduced, but instead we // want callers to explicitly specify the Args. 
- if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) { - auto *functor = boxed_kernel_func_.getFunctor(); - return callUnboxedKernelFunction( - unboxed_kernel_func_, functor, dispatchKeySet, std::forward(args)...); + // This should get inlined by compiler + if (guts::disjunction...>::value) { + if (sym_unboxed_kernel_func_ != nullptr) { + auto *functor = boxed_kernel_func_.getFunctor(); + return callUnboxedKernelFunction( + sym_unboxed_kernel_func_, functor, dispatchKeySet, std::forward(args)...); + } + + if (unboxed_kernel_func_ != nullptr) { + auto *functor = boxed_kernel_func_.getFunctor(); + return callUnboxedKernelFunction::type...>( + unboxed_kernel_func_, functor, dispatchKeySet, unpackSymInt(args)...); + } + } else { + if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) { + auto *functor = boxed_kernel_func_.getFunctor(); + return callUnboxedKernelFunction( + unboxed_kernel_func_, functor, dispatchKeySet, std::forward(args)...); + } } return impl::BoxedKernelWrapper::call( @@ -102,10 +145,14 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunctor(std::unique_ptr::value, "Tried to call KernelFunction::makeFromUnboxedFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + auto* unboxed_fn = &impl::wrap_kernel_functor_unboxed::call; + void* void_unboxed_fn = reinterpret_cast(unboxed_fn); + bool is_symint = fn_has_symint::value; return KernelFunction( std::move(kernelFunctor), &impl::make_boxed_from_unboxed_functor::call, - reinterpret_cast(&impl::wrap_kernel_functor_unboxed::call) + is_symint ? nullptr : void_unboxed_fn, + is_symint ? void_unboxed_fn : nullptr ); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 139880c6d7fa..01d30c888db2 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -26,6 +26,7 @@ OperatorEntry::OperatorEntry(OperatorName&& operator_name) , dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()) , kernels_() , cpp_signature_() +, sym_cpp_signature_() , is_observed_(ObservedOperators::isObserved(name_)) { // Pick up any backend fallbacks that were registered prior to this @@ -34,12 +35,11 @@ OperatorEntry::OperatorEntry(OperatorName&& operator_name) } namespace { - void checkSchema(const OperatorName& name, const FunctionSchema& from_def, const std::string& from_def_debug, const FunctionSchema& inferred, const std::string& inferred_debug) { + void checkSchema(const OperatorName& name, const FunctionSchema& from_def_, const std::string& from_def_debug, const KernelFunction& kernel, const FunctionSchema& inferred_, const std::string& inferred_debug) { // TODO: figure out if we can just directly save real schema at def time - c10::optional schema_difference = findSchemaDifferences( - from_def.cloneWithRealTypes(), - inferred.cloneWithRealTypes() - ); + FunctionSchema from_def = from_def_.cloneWithRealTypes(kernel.isValidSymUnboxed()); + FunctionSchema inferred = inferred_.cloneWithRealTypes(); + c10::optional schema_difference = findSchemaDifferences(from_def, inferred); if (schema_difference.has_value()) { TORCH_CHECK(false, "Inferred operator schema for a C++ kernel function doesn't match the expected function schema.\n" @@ -64,12 +64,24 @@ const AnnotatedKernel& OperatorEntry::ambiguousAutogradOtherKernel() const { return kernel; } +void OperatorEntry::assertSignatureIsCorrect(const CppSignature call_signature, bool has_symint) const { + if (has_symint) { + if 
(C10_UNLIKELY(sym_cpp_signature_.has_value() && (call_signature != sym_cpp_signature_->signature))) { + reportSignatureError(call_signature, *sym_cpp_signature_); + } + } else { + if (C10_UNLIKELY(cpp_signature_.has_value() && (call_signature != cpp_signature_->signature))) { + reportSignatureError(call_signature, *cpp_signature_); + } + } +} + void OperatorEntry::registerSchema(FunctionSchema&& schema, std::string&& debug, std::vector tags) { TORCH_INTERNAL_ASSERT(!schema_.has_value()); for (const auto& kernel : kernels_) { for (const auto &j : kernel.second) { if (j.inferred_function_schema != nullptr) { - checkSchema(name_, schema, debug, *j.inferred_function_schema, j.debug); + checkSchema(name_, schema, debug, j.kernel, *j.inferred_function_schema, j.debug); } } } @@ -103,25 +115,26 @@ OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel( // which means if you could validly change the type of a cpp_signature, then // that would also invalidate the old TypedOperatorHandles. if (cpp_signature.has_value()) { - if (cpp_signature_.has_value()) { - TORCH_CHECK(*cpp_signature == cpp_signature_->signature, + auto& local_cpp_signature = kernel.isValidSymUnboxed() ? sym_cpp_signature_ : cpp_signature_; + if (local_cpp_signature.has_value()) { + TORCH_CHECK(*cpp_signature == local_cpp_signature->signature, "\nMismatch in kernel C++ signatures\n", " operator: ", (this->schema_.has_value() ? toString(this->schema_->schema) : toString(name_)), "\n", " ", (this->schema_.has_value() ? this->schema_->debug : "no debug info"), "\n", - " kernel 1: ", cpp_signature_->signature.name(), "\n", - " dispatch key: ", toString(cpp_signature_->dispatch_key), "\n", - " ", cpp_signature_->debug, "\n", + " kernel 1: ", local_cpp_signature->signature.name(), "\n", + " dispatch key: ", toString(local_cpp_signature->dispatch_key), "\n", + " ", local_cpp_signature->debug, "\n", " kernel 2: ", cpp_signature->name(), "\n", " dispatch key: ", toString(dispatch_key), "\n", " ", debug, "\n" ); } else { - cpp_signature_ = CppSignatureWithDebug { *cpp_signature, debug, dispatch_key }; + local_cpp_signature = CppSignatureWithDebug { *cpp_signature, debug, dispatch_key }; } } if (schema_ && inferred_function_schema) { - checkSchema(name_, schema_->schema, schema_->debug, *inferred_function_schema, debug); + checkSchema(name_, schema_->schema, schema_->debug, kernel, *inferred_function_schema, debug); } // Add the kernel to the kernels list, @@ -138,7 +151,7 @@ OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel( " operator: ", (schema_.has_value() ? toString(schema_->schema) : toString(name_)), "\n", " ", (this->schema_.has_value() ? this->schema_->debug : "no debug info"), "\n", " dispatch key: ", toString(dispatch_key), "\n", - " previous kernel: ", (cpp_signature_.has_value() ? cpp_signature_->debug : "no debug info"), "\n", + " previous kernel: ", (cpp_signature_.has_value() ? cpp_signature_->debug : (sym_cpp_signature_.has_value() ? sym_cpp_signature_->debug : "no debug info")), "\n", " new kernel: ", debug ); } @@ -471,13 +484,13 @@ std::string OperatorEntry::listAllDispatchKeys() const { return str.str(); } -void OperatorEntry::reportSignatureError(const CppSignature call_signature) const { +void OperatorEntry::reportSignatureError(const CppSignature& call_signature, const CppSignatureWithDebug& saved_signature) const { TORCH_CHECK(false, "\nTried to access or call an operator with a wrong signature.\n", " operator: ", (schema_.has_value() ? 
toString(schema_->schema) : toString(name_)), "\n", " ", (schema_.has_value() ? schema_->debug : "unknown debug info"), "\n", - " correct signature: ", cpp_signature_->signature.name(), "\n", - " ", cpp_signature_->debug, "\n", + " correct signature: ", saved_signature.signature.name(), "\n", + " ", saved_signature.debug, "\n", " accessed/called as: ", call_signature.name(), "\n", "This likely happened in a call to OperatorHandle::typed(). ", "Please make sure that the function signature matches the signature in the operator registration call." diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 1d9f1495f3c7..a964423d6aa8 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -163,14 +163,10 @@ class TORCH_API OperatorEntry final { // Asserts that the given FuncType is correct for calling this operator in an unboxed way. template inline void assertSignatureIsCorrect() { - assertSignatureIsCorrect(CppSignature::make()); + assertSignatureIsCorrect(CppSignature::make(), fn_has_symint::value); } - void assertSignatureIsCorrect(const CppSignature call_signature) { - if (C10_UNLIKELY(cpp_signature_.has_value() && (call_signature != cpp_signature_->signature))) { - reportSignatureError(call_signature); - } - } + void assertSignatureIsCorrect(const CppSignature call_signature, bool has_symint) const; [[noreturn]] void reportError(DispatchKey dispatchKey) const; @@ -280,11 +276,12 @@ class TORCH_API OperatorEntry final { c10::optional dispatch_key; }; c10::optional cpp_signature_; + c10::optional sym_cpp_signature_; // Whether this operator needs to be observed with RecordFunction const bool is_observed_; - [[noreturn]] void reportSignatureError(CppSignature call_signature) const; + [[noreturn]] void reportSignatureError(const CppSignature& call_signature, const CppSignatureWithDebug& saved_signature) const; const KernelFunction& computeDispatchTableEntry(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) const; std::pair computeDispatchTableEntryWithDebug( const c10::Dispatcher& dispatcher, DispatchKey dispatch_key diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index 00a31224a483..7a743c225fcb 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -17,9 +17,23 @@ const std::vector& FunctionSchema::getCorrectList(SchemaArgType type) } } -FunctionSchema FunctionSchema::cloneWithRealTypes() const { - auto cloneWithRealTypes = [](const Argument& a) { - return a.cloneWithType(a.real_type()); +FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const { + auto cloneWithRealTypes = [&](const Argument& a) { + if (with_symint) { + return a.cloneWithType(a.real_type()); + } + // Don't use real type if it looks like a SymInt + // NB: keep this in sync with unpackSymInt in KernelFunction_impl.h + if ( + *a.real_type() == *getTypePtr() || + *a.real_type() == *getTypePtr>() || + *a.real_type() == *getTypePtr() + ) { + // Keep the fake type + return a.cloneWithType(a.type()); + } else { + return a.cloneWithType(a.real_type()); + } }; std::vector new_arguments, new_returns; std::transform(arguments().begin(), arguments().end(), std::back_inserter(new_arguments), cloneWithRealTypes); diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index bafc0d810320..14f134939d76 100644 --- a/aten/src/ATen/core/function_schema.h +++ 
b/aten/src/ATen/core/function_schema.h @@ -474,7 +474,7 @@ struct TORCH_API FunctionSchema { FunctionSchema cloneWithRemappedTypes( const std::function type_map) const; - FunctionSchema cloneWithRealTypes() const; + FunctionSchema cloneWithRealTypes(bool with_symint=true) const; // Check that inputs have the correct types and appends any missing default // values. diff --git a/aten/src/ATen/native/vulkan/ops/Factory.cpp b/aten/src/ATen/native/vulkan/ops/Factory.cpp index ce09521668f4..06d44ec06193 100644 --- a/aten/src/ATen/native/vulkan/ops/Factory.cpp +++ b/aten/src/ATen/native/vulkan/ops/Factory.cpp @@ -29,13 +29,12 @@ Tensor _empty_affine_quantized( } Tensor empty_memory_format( - const SymIntArrayRef sym_sizes, + const IntArrayRef sizes, const c10::optional dtype, const c10::optional layout, const c10::optional device, const c10::optional pin_memory, const optional memory_format) { - auto sizes = c10::asIntArrayRefSlow(sym_sizes); return convert(vTensor{ api::context(), sizes, @@ -56,12 +55,7 @@ Tensor empty_strided( const optional device, const optional pin_memory) { return empty_memory_format( - c10::SymIntArrayRef::fromIntArrayRef(sizes), - dtype, - layout, - device, - pin_memory, - c10::MemoryFormat::Contiguous); + sizes, dtype, layout, device, pin_memory, c10::MemoryFormat::Contiguous); } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Shape.cpp b/aten/src/ATen/native/vulkan/ops/Shape.cpp index e1bda761749d..d8263e59668e 100644 --- a/aten/src/ATen/native/vulkan/ops/Shape.cpp +++ b/aten/src/ATen/native/vulkan/ops/Shape.cpp @@ -42,8 +42,7 @@ Tensor view_internal(const Tensor& self_arg, const IntArrayRef shape) { return convert(v_output); } -inline Tensor view(const Tensor& self_arg, const SymIntArrayRef sym_shape) { - auto shape = c10::asIntArrayRefSlow(sym_shape); +inline Tensor view(const Tensor& self_arg, IntArrayRef shape) { return view_internal(self_arg, shape); } diff --git a/functorch/functorch/csrc/BatchRulesViews.cpp b/functorch/functorch/csrc/BatchRulesViews.cpp index df42086acef3..44f1134486c5 100644 --- a/functorch/functorch/csrc/BatchRulesViews.cpp +++ b/functorch/functorch/csrc/BatchRulesViews.cpp @@ -439,12 +439,6 @@ std::tuple> view_batching_rule( return std::make_tuple(self_.view_symint(size_), 0); } -Tensor view_symint_decomposition(const Tensor& self, - c10::SymIntArrayRef size) { - return self.view( c10::asIntArrayRefSlow(size)); -} - - template std::tuple> expand_batch_rule( const Tensor &self, optional self_bdim, SymIntArrayRef size, bool implicit) @@ -512,14 +506,6 @@ std::tuple> diag_embed_batch_rule(const Tensor& self, return std::make_tuple(at::diag_embed(self_, offset, dim1, dim2), 0); } -// We need to write a real batching rule to fully support symint. -// This requires symint variants of other operations, like `view`, -// which don't exist yet. 
-Tensor expand_symint_decomp_hack(const Tensor& self, SymIntArrayRef packed_size, bool implicit) { - auto size = asIntArrayRefSlow(packed_size); - return self.expand(size, implicit); -} - TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) { VMAP_SUPPORT(diag, diag_batch_rule); VMAP_SUPPORT(chunk, chunk_batching_rule); diff --git a/test/cpp_extensions/open_registration_extension.cpp b/test/cpp_extensions/open_registration_extension.cpp index 7f43e60a6b39..ad036109903d 100644 --- a/test/cpp_extensions/open_registration_extension.cpp +++ b/test/cpp_extensions/open_registration_extension.cpp @@ -49,9 +49,9 @@ at::Tensor custom_empty_memory_format(at::IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format) { +at::Tensor custom_empty_symint(c10::IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format) { constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); - return at::detail::empty_generic(c10::asIntArrayRefSlow(size), &global_custom_alloc, private_use_ks, c10::dtype_or_default(dtype), memory_format); + return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, c10::dtype_or_default(dtype), memory_format); } at::Tensor & custom_fill__scalar(at::Tensor & self, const at::Scalar & value) { diff --git a/test/cpp_extensions/ort_extension.cpp b/test/cpp_extensions/ort_extension.cpp index 3422bccd6d38..b646f3b14939 100644 --- a/test/cpp_extensions/ort_extension.cpp +++ b/test/cpp_extensions/ort_extension.cpp @@ -20,10 +20,10 @@ Tensor get_tensor(caffe2::TypeMeta dtype, IntArrayRef size) { return Tensor(std::move(tensor_impl)); } -Tensor empty_override(SymIntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, +Tensor empty_override(IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional optional_memory_format) { test_int = 0; - return get_tensor(scalarTypeToTypeMeta(dtype_or_default(dtype)), c10::asIntArrayRefSlow(size)); + return get_tensor(scalarTypeToTypeMeta(dtype_or_default(dtype)), size); } Tensor& add_out_override(const Tensor & a, const Tensor & b , const Scalar& c, Tensor & out) { From 2b2e0fddf8001c0c662bd582e1d958a74bc84ac4 Mon Sep 17 00:00:00 2001 From: Mateusz Sypniewski Date: Wed, 7 Sep 2022 07:23:03 -0700 Subject: [PATCH 21/45] Add CUDA Sanitizer (#83984) Example of a simple synchronization error: ``` a = torch.rand(4, 2, device="cuda") with torch.cuda.stream(second_stream): torch.mul(a, 5, out=a) ``` Output produced by CSAN: ``` ============================ CSAN detected a possible data race on tensor with data pointer 139719969079296 Access by stream 94646435460352 during kernel: aten::mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) writing to argument: self, out, output With stack trace: File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 364, in _handle_kernel_launch stack_trace = traceback.StackSummary.extract( File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 544, in __torch_dispatch__ errors = self.event_handler._handle_kernel_launch( File "/private/home/sypniewski/pytorch/torch/utils/_python_dispatch.py", line 76, in wrapped return f(self, *args, **kwargs) File "/private/home/sypniewski/pytorch/tester.py", line 9, in torch.mul(a, 5, out=a) Previous access by stream 0 during kernel: aten::rand(int[] size, *, int? dtype=None, int? 
layout=None, Device? device=None, bool? pin_memory=None) -> Tensor writing to argument: output With stack trace: File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 364, in _handle_kernel_launch stack_trace = traceback.StackSummary.extract( File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 544, in __torch_dispatch__ errors = self.event_handler._handle_kernel_launch( File "/private/home/sypniewski/pytorch/torch/utils/_python_dispatch.py", line 76, in wrapped return f(self, *args, **kwargs) File "/private/home/sypniewski/pytorch/tester.py", line 6, in a = torch.rand(10000, device="cuda") Tensor was allocated with stack trace: File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 420, in _handle_memory_allocation traceback.StackSummary.extract( File "/private/home/sypniewski/pytorch/torch/utils/_cuda_trace.py", line 23, in fire_callbacks cb(*args, **kwargs) File "/private/home/sypniewski/pytorch/torch/_ops.py", line 60, in __call__ return self._op(*args, **kwargs or {}) File "/private/home/sypniewski/pytorch/torch/cuda/_sanitizer.py", line 541, in __torch_dispatch__ outputs = func(*args, **kwargs) File "/private/home/sypniewski/pytorch/torch/utils/_python_dispatch.py", line 76, in wrapped return f(self, *args, **kwargs) File "/private/home/sypniewski/pytorch/tester.py", line 6, in a = torch.rand(10000, device="cuda") ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/83984 Approved by: https://github.com/ezyang --- test/test_cuda_sanitizer.py | 446 ++++++++++++++++++++++++++++ torch/__init__.py | 6 + torch/cuda/_sanitizer.py | 559 ++++++++++++++++++++++++++++++++++++ 3 files changed, 1011 insertions(+) create mode 100644 test/test_cuda_sanitizer.py create mode 100644 torch/cuda/_sanitizer.py diff --git a/test/test_cuda_sanitizer.py b/test/test_cuda_sanitizer.py new file mode 100644 index 000000000000..e8629788be59 --- /dev/null +++ b/test/test_cuda_sanitizer.py @@ -0,0 +1,446 @@ +# Owner(s): ["module: cuda"] + +import sys +import textwrap +import traceback +from typing import List + +import torch +import torch.cuda._sanitizer as csan +from torch.cuda._sanitizer import StreamId, DataPtr, EventId +from torch.testing._internal.common_utils import TestCase, run_tests + + +# We cannot import TEST_CUDA from torch.testing._internal.common_cuda here, +# because if we do that, the TEST_CUDNN line from torch.testing._internal.common_cuda will be executed +# multiple times as well during the execution of this test suite, and it will +# cause CUDA OOM error on Windows. 
+TEST_CUDA = torch.cuda.is_available() + +if not TEST_CUDA: + print("CUDA not available, skipping tests", file=sys.stderr) + TestCase = object # noqa: F811 + + +class TestArgumentHandler(TestCase): + def test_add(self): + add_func = torch.ops.aten.add.Tensor + a = torch.ones(5, 3, device="cuda") + b = torch.randn(5, 3, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(add_func._schema, (a, b), {}) + c = torch.add(a, b) + argument_handler.parse_outputs(c) + + self.assertEqual({a.data_ptr(), b.data_ptr()}, argument_handler.dataptrs_read) + self.assertEqual({c.data_ptr()}, argument_handler.dataptrs_written) + + def test_cat(self): + cat_func = torch.ops.aten.cat.default + a = torch.ones(2, 4, 5, device="cuda") + b = torch.zeros(2, 1, 5, device="cuda") + c = torch.rand(2, 7, 5, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(cat_func._schema, ([a, b, c], 1), {}) + d = torch.cat((a, b, c), dim=1) + argument_handler.parse_outputs(d) + + self.assertEqual( + {a.data_ptr(), b.data_ptr(), c.data_ptr()}, argument_handler.dataptrs_read + ) + self.assertEqual({d.data_ptr()}, argument_handler.dataptrs_written) + + def test_split(self): + split_func = torch.ops.aten.split.Tensor + a = torch.arange(10, device="cuda").reshape(5, 2) + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(split_func._schema, (a, 2), {}) + out = torch.split(a, 2) + argument_handler.parse_outputs(out) + + outputs = {out[0].data_ptr(), out[1].data_ptr(), out[2].data_ptr()} + self.assertEqual({a.data_ptr()}, argument_handler.dataptrs_read) + self.assertEqual( + outputs, + argument_handler.dataptrs_written, + ) + + def test_inplace(self): + add_inplace_func = torch.ops.aten.add_.Tensor + a = torch.rand(4, 2, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(add_inplace_func._schema, (a, 5), {}) + a.add_(5) + argument_handler.parse_outputs(a) + + self.assertEqual(set(), argument_handler.dataptrs_read) + self.assertEqual({a.data_ptr()}, argument_handler.dataptrs_written) + + def test_out(self): + mul_out_func = torch.ops.aten.mul.out + a = torch.arange(8, device="cuda") + b = torch.empty(8, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(mul_out_func._schema, (a, 3), {"out": b}) + torch.mul(a, 3, out=b) + argument_handler.parse_outputs(b) + + self.assertEqual({a.data_ptr()}, argument_handler.dataptrs_read) + self.assertEqual({b.data_ptr()}, argument_handler.dataptrs_written) + + def test_nonzero(self): + nonzero_func = torch.ops.aten.nonzero.default + a = torch.ones(5, 3, 2, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(nonzero_func._schema, (a,), {"as_tuple": True}) + out = torch.nonzero(a, as_tuple=True) + argument_handler.parse_outputs(out) + + outputs = {out[0].data_ptr(), out[1].data_ptr(), out[2].data_ptr()} + self.assertEqual({a.data_ptr()}, argument_handler.dataptrs_read) + self.assertEqual(outputs, argument_handler.dataptrs_written) + + def test_tensor_names(self): + addr_func = torch.ops.aten.addr.default + vec = torch.arange(1, 4, device="cuda") + M = torch.zeros(3, 3, device="cuda") + + argument_handler = csan.ArgumentHandler() + argument_handler.parse_inputs(addr_func._schema, (M, vec, vec), {}) + out = torch.addr(M, vec, vec) + argument_handler.parse_outputs(out) + + self.assertEqual( + argument_handler.tensor_names, + { + M.data_ptr(): ["self"], + vec.data_ptr(): ["vec1", 
"vec2"], + out.data_ptr(): ["output"], + }, + ) + + +def tensor_id(i: int) -> DataPtr: + return i + + +def stream_id(i: int) -> StreamId: + return 1000 + i + + +def event_id(i: int) -> EventId: + return 2000 + i + + +class TestEventHandler(TestCase): + def setUp(self): + self.handler = csan.EventHandler() + + def kernel_launch( + self, + stream: StreamId, + read_only: List[DataPtr] = None, + read_write: List[DataPtr] = None, + ) -> List[csan.SynchronizationError]: + if read_only is None: + read_only = [] + if read_write is None: + read_write = [] + return self.handler._handle_kernel_launch( + stream, + read_only, + read_write, + "", + {k: [""] for k in read_only + read_write}, + ) + + def assert_good_kernel_launch( + self, + stream: StreamId, + read_only: List[DataPtr] = None, + read_write: List[DataPtr] = None, + ) -> None: + self.assertEqual(self.kernel_launch(stream, read_only, read_write), []) + + def assert_bad_kernel_launch( + self, + number_of_errors: int, + stream: StreamId, + read_only: List[DataPtr] = None, + read_write: List[DataPtr] = None, + ) -> None: + errors = self.kernel_launch(stream, read_only, read_write) + self.assertEqual(len(errors), number_of_errors) + + def test_empty_kernel_launch(self): + self.assert_good_kernel_launch(stream_id(0)) + + def test_simple_passing(self): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(2), read_only=[tensor_id(1)]) + + def test_simple_error(self): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.assert_bad_kernel_launch(1, stream_id(2), read_write=[tensor_id(1)]) + + def test_simple_sync(self): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(0), stream_id(1)) + self.handler._handle_event_wait(event_id(0), stream_id(2)) + self.assert_good_kernel_launch(stream_id(2), read_write=[tensor_id(1)]) + + def test_reads_check_last_write(self): + # Tests that not only the first read operation checks if it is in conflict + # with the last write operation, but all read operations do. + + self.assert_good_kernel_launch(stream_id(1), read_write=[tensor_id(1)]) + self.handler._handle_event_record(event_id(0), stream_id(1)) + self.handler._handle_event_wait(event_id(0), stream_id(2)) + self.assert_good_kernel_launch(stream_id(2), read_only=[tensor_id(1)]) + + self.assert_bad_kernel_launch(1, stream_id(3), read_only=[tensor_id(1)]) + + def test_branch_sync(self): + # Tests that two streams can read after both waiting for a third, but they + # cannot write without further synchronization. 
+ + self.assert_good_kernel_launch(stream_id(1), read_write=[tensor_id(1)]) + self.handler._handle_event_record(event_id(0), stream_id(1)) + self.handler._handle_event_wait(event_id(0), stream_id(2)) + self.handler._handle_event_wait(event_id(0), stream_id(3)) + self.assert_good_kernel_launch(stream_id(2), read_only=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(3), read_only=[tensor_id(1)]) + + self.assert_bad_kernel_launch(1, stream_id(2), read_write=[tensor_id(1)]) + + def test_chain_sync(self): + iterations = 10 + + self.assert_good_kernel_launch(stream_id(0), read_only=[tensor_id(1)]) + for i in range(iterations): + self.handler._handle_event_record(event_id(i), stream_id(i)) + self.handler._handle_event_wait(event_id(i), stream_id(i + 1)) + self.assert_good_kernel_launch(stream_id(iterations), read_write=[tensor_id(1)]) + + def test_expired_record(self): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(0), stream_id(1)) + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.handler._handle_event_wait(event_id(0), stream_id(2)) + + self.assert_bad_kernel_launch(1, stream_id(2), read_write=[tensor_id(1)]) + + def test_deleted_record(self): + for should_delete, should_create in [ + (True, True), + (True, False), + (False, True), + ]: + self.setUp() + with self.subTest(should_delete=should_delete, should_create=should_create): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(0), stream_id(1)) + + if should_delete: + self.handler._handle_event_deletion(event_id(0)) + if should_create: + self.handler._handle_event_creation(event_id(0)) + + self.handler._handle_event_wait(event_id(0), stream_id(2)) + self.assert_bad_kernel_launch( + 1, stream_id(2), read_write=[tensor_id(1)] + ) + + def test_all_reads_checked_failing(self): + iterations = 10 + for i in range(1, iterations): + self.assert_good_kernel_launch(stream_id(i), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(i), stream_id(i)) + + for i in range(1, iterations): + self.handler._handle_event_wait(event_id(i), stream_id(0)) + + self.assert_good_kernel_launch(stream_id(iterations), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(iterations), stream_id(i)) + + # Does not synchronize with the last read. + self.assert_bad_kernel_launch(1, stream_id(0), read_write=[tensor_id(1)]) + + def test_all_reads_checked_passing(self): + iterations = 10 + for i in range(1, iterations): + self.assert_good_kernel_launch(stream_id(i), read_only=[tensor_id(1)]) + self.handler._handle_event_record(event_id(i), stream_id(i)) + + for i in range(1, iterations): + self.handler._handle_event_wait(event_id(i), stream_id(0)) + + self.assert_good_kernel_launch(stream_id(0), read_write=[tensor_id(1)]) + + def test_multiple_errors(self): + iterations = 10 + self.assert_good_kernel_launch( + stream_id(0), read_write=[tensor_id(i) for i in range(iterations)] + ) + self.assert_bad_kernel_launch( + iterations, + stream_id(1), + read_write=[tensor_id(i) for i in range(iterations)], + ) + + def test_correct_state_merging(self): + # Tests that after waiting for an event, a stream's state is indeed set + # to the pointwise maximum of its old state and the recorded state. 
+ + self.assert_good_kernel_launch(stream_id(1), read_write=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(2), read_write=[tensor_id(2)]) + self.handler._handle_event_record(event_id(1), stream_id(1)) + self.handler._handle_event_record(event_id(2), stream_id(2)) + + self.assert_good_kernel_launch(stream_id(1), read_write=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(2), read_write=[tensor_id(2)]) + self.handler._handle_event_wait(event_id(1), stream_id(2)) + self.handler._handle_event_wait(event_id(2), stream_id(1)) + + self.handler._handle_event_record(event_id(3), stream_id(2)) + self.handler._handle_event_wait(event_id(3), stream_id(1)) + self.assert_good_kernel_launch( + stream_id(1), read_write=[tensor_id(1), tensor_id(2)] + ) + + def test_record_override(self): + self.assert_good_kernel_launch(stream_id(1), read_only=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(2), read_only=[tensor_id(2)]) + self.handler._handle_event_record(event_id(1), stream_id(1)) + self.handler._handle_event_record(event_id(1), stream_id(2)) + + self.handler._handle_event_wait(event_id(1), stream_id(3)) + self.assert_bad_kernel_launch(1, stream_id(3), read_write=[tensor_id(1)]) + + def test_multiple_wait(self): + # Tests that a wait operation can be performed multiple times on the same event + # by different streams. + + self.assert_good_kernel_launch(stream_id(1), read_write=[tensor_id(1)]) + self.handler._handle_event_record(event_id(1), stream_id(1)) + self.handler._handle_event_wait(event_id(1), stream_id(2)) + self.handler._handle_event_wait(event_id(1), stream_id(3)) + + self.assert_good_kernel_launch(stream_id(2), read_only=[tensor_id(1)]) + self.assert_good_kernel_launch(stream_id(3), read_only=[tensor_id(1)]) + + +class TestMessages(TestCase): + def setUp(self): + self.handler = csan.EventHandler() + + def test_ensure_exists(self): + ARG = 0 + for func, out in [ + ( + self.handler._handle_event_deletion, + f"Found Event with id: {ARG}, but no matching event " + "creation in the trace. Backfilling the trace now. " + "Perhaps the sanitizer was enabled after some torch operations?", + ), + ( + self.handler._handle_memory_deallocation, + f"Found tensor with pointer: {ARG}, but no matching tensor " + "allocation in the trace. Backfilling the trace now. " + "Perhaps the sanitizer was enabled after some torch operations?", + ), + ]: + with self.subTest(func=func, out=out): + with self.assertLogs() as captured: + func(ARG) + self.assertEqual(captured.records[0].getMessage(), out) + + def test_ensure_does_not_exist(self): + ARG = 0 + self.handler._handle_event_creation(ARG) + self.handler._handle_stream_creation(ARG) + for func, out in [ + ( + self.handler._handle_event_creation, + "Found duplicate event creation in the trace for event with " + f"id: {ARG}. Assuming the trace for event deletion wasn't caught " + "and backfilling it now. " + "Perhaps the sanitizer was enabled after some torch operations?", + ), + ( + self.handler._handle_stream_creation, + "Found duplicate Stream creation in the trace for Stream with " + f"id: {ARG}. 
PyTorch Streams are only created once, so this " + "trace entry is ignored.", + ), + ]: + with self.subTest(func=func, out=out): + with self.assertLogs() as captured: + func(ARG) + self.assertEqual(captured.records[0].getMessage(), out) + + def test_error_message(self): + current_access = csan.Access( + type=csan.AccessType.WRITE, + seq_num=1, + stream=stream_id(1), + operator="schema", + names=["b"], + stack_trace=traceback.StackSummary.from_list( + [("file", 0, "name", "trace a")] + ), + ) + previous_access = csan.Access( + type=csan.AccessType.READ, + seq_num=2, + stream=stream_id(0), + operator="schema", + names=["a"], + stack_trace=traceback.StackSummary.from_list( + [("file", 0, "name", "trace b")] + ), + ) + error = csan.UnsynchronizedAccessError( + data_ptr=tensor_id(1), + allocation_stack_trace=traceback.StackSummary.from_list( + [("file", 0, "name", "alloc")] + ), + current_access=current_access, + previous_access=previous_access, + ) + self.assertEqual( + str(error), + textwrap.dedent( + """\ + ============================ + CSAN detected a possible data race on tensor with data pointer 1 + Access by stream 1001 during kernel: + schema + writing to argument: b + With stack trace: + File "file", line 0, in name + trace a + + Previous access by stream 1000 during kernel: + schema + reading from argument: a + With stack trace: + File "file", line 0, in name + trace b + + Tensor was allocated with stack trace: + File "file", line 0, in name + alloc + """ + ), + ) + + +if __name__ == "__main__": + run_tests() diff --git a/torch/__init__.py b/torch/__init__.py index e186db209e1c..a6e8bc295d08 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -955,3 +955,9 @@ def _register_device_module(device_type, module): from . import library if not TYPE_CHECKING: from . import _meta_registrations + +# Enable CUDA Sanitizer +if 'TORCH_CUDA_SANITIZER' in os.environ: + import torch.cuda._sanitizer as csan + + csan.enable_cuda_sanitizer() diff --git a/torch/cuda/_sanitizer.py b/torch/cuda/_sanitizer.py new file mode 100644 index 000000000000..aa98b02eefc9 --- /dev/null +++ b/torch/cuda/_sanitizer.py @@ -0,0 +1,559 @@ +r""" +This module introduces CUDA Sanitizer, a tool for detecting synchronization errors +between kernels ran on different streams. It stores information on accesses to tensors +to determine if they are synchronized or not. When enabled in a python program and a +possible data race is detected, a detailed warning will be printed and the program +will exit. + +It can be enabled either by importing this module and using +:func:`enable_cuda_sanitizer()` or by exporting ``TORCH_CUDA_SANITIZER`` +environment variable. +""" + +import enum +import functools +import io +import logging +import sys +import textwrap +import traceback +from dataclasses import dataclass, field +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, TypeVar + +import torch +import torch.utils._cuda_trace as cuda_trace +from torch.utils._python_dispatch import TorchDispatchMode +from torch.utils._pytree import tree_map + + +TK = TypeVar("TK") +TVa = TypeVar("TVa") +TVb = TypeVar("TVb") + +DataPtr = int +StreamId = int +EventId = int +SeqNum = int + +logger = logging.getLogger(__name__) + + +class AccessType(enum.Enum): + READ = enum.auto() + WRITE = enum.auto() + + def __str__(self): + return "reading from" if self is AccessType.READ else "writing to" + + +@dataclass +class Access: + r"""Stores information about a single access to a tensor by a kernel. 
+ + Args: + type: either AccessType.READ or AccessType.Write. + seq_num: the sequential number of the kernel performing the access. + stream: the stream id of the stream executing the kernel. + operator: the schema of the launched kernel, which lists the + arguments and return type. + names: the arguments in the schema this access corresponds to. + stack_trace: the stack summary object captured during access. + """ + type: AccessType + seq_num: SeqNum + stream: StreamId + operator: str + names: List[str] + stack_trace: traceback.StackSummary + + +class SynchronizationError(Exception): + """Base class for errors detected by CUDA Sanitizer.""" + + pass + + +class UnsynchronizedAccessError(SynchronizationError): + """Stores information about two unsynchronized accesses to one data pointer.""" + + def __init__( + self, + data_ptr: DataPtr, + allocation_stack_trace: Optional[traceback.StackSummary], + current_access: Access, + previous_access: Access, + ): + self.data_ptr = data_ptr + self.allocation_stack_trace = allocation_stack_trace + self.current_access = current_access + self.previous_access = previous_access + + def __str__(self): + with io.StringIO() as message: + message.write( + textwrap.dedent( + f"""\ + ============================ + CSAN detected a possible data race on tensor with data pointer {self.data_ptr} + Access by stream {self.current_access.stream} during kernel: + {self.current_access.operator} + {self.current_access.type} argument: {', '.join(self.current_access.names)} + With stack trace: + """ + ) + ) + message.write(f"{''.join(self.current_access.stack_trace.format())}\n") + message.write( + textwrap.dedent( + f"""\ + Previous access by stream {self.previous_access.stream} during kernel: + {self.previous_access.operator} + {self.previous_access.type} argument: {', '.join(self.previous_access.names)} + With stack trace: + """ + ) + ) + message.write(f"{''.join(self.previous_access.stack_trace.format())}\n") + if self.allocation_stack_trace: + message.write( + "Tensor was allocated with stack trace:\n" + f"{''.join(self.allocation_stack_trace.format())}" + ) + else: + message.write("Trace for tensor allocation not found.") + return message.getvalue() + + +class CUDASanitizerErrors(Exception): + """Wrapper class for errors reported by CUDA Sanitizer.""" + + def __init__(self, errors: List[SynchronizationError]): + self.errors = errors + + def __str__(self): + return f"detected {len(self.errors)} errors" + + +def format_log_message(message: str) -> str: + return " ".join(line.strip() for line in message.strip().splitlines()) + + +@dataclass +class TensorInfo: + r"""Stores information about a single tensor and recent accesses to it. + + Args: + allocation_stack_trace: the stack summary object captured during tensor + allocation. Can be ``None`` if the allocation wasn't caught by CSAN. + reads: list of read accesses to the tensor that were performed since + the last write. + write: the last write access to the tensor. + """ + allocation_stack_trace: Optional[traceback.StackSummary] + reads: List[Access] = field(default_factory=list) + write: Optional[Access] = None + + +class _TensorsAccessed: + def __init__(self): + self.accesses: Dict[DataPtr, TensorInfo] = {} + + def ensure_tensor_exists(self, data_ptr: DataPtr) -> None: + if data_ptr not in self.accesses: + logger.info( + format_log_message( + f""" + Found tensor with pointer: {data_ptr}, but no matching tensor + allocation in the trace. Backfilling the trace now. 
+ Perhaps the sanitizer was enabled after some torch operations? + """ + ) + ) + self.create_tensor(data_ptr, None) + + def ensure_tensor_does_not_exist(self, data_ptr: DataPtr) -> None: + if data_ptr in self.accesses: + logger.info( + format_log_message( + f""" + Found duplicate tensor allocation in the trace for tensor with + pointer: {data_ptr}. Assuming the trace for tensor deallocation + wasn't caught and backfilling it now. + Perhaps the sanitizer was enabled after some torch operations? + """ + ) + ) + self.delete_tensor(data_ptr) + + def create_tensor( + self, data_ptr: DataPtr, stack_trace: Optional[traceback.StackSummary] + ) -> None: + self.accesses[data_ptr] = TensorInfo(stack_trace) + + def delete_tensor(self, data_ptr: DataPtr) -> None: + del self.accesses[data_ptr] + + def were_there_reads_since_last_write(self, data_ptr: DataPtr) -> bool: + return True if self.accesses[data_ptr].reads else False + + def get_allocation_stack_trace( + self, data_ptr: DataPtr + ) -> Optional[traceback.StackSummary]: + return self.accesses[data_ptr].allocation_stack_trace + + def get_write(self, data_ptr: DataPtr) -> Optional[Access]: + return self.accesses[data_ptr].write + + def get_reads(self, data_ptr: DataPtr) -> List[Access]: + return self.accesses[data_ptr].reads + + def add_read(self, data_ptr: DataPtr, access: Access) -> None: + self.accesses[data_ptr].reads.append(access) + + def set_write(self, data_ptr: DataPtr, access: Access) -> None: + self.accesses[data_ptr].write = access + self.accesses[data_ptr].reads = [] + + +class StreamSynchronizations: + def __init__(self): + self.current_sync_states: Dict[StreamId, Dict[StreamId, SeqNum]] = {} + self.recorded_sync_states: Dict[EventId, Dict[StreamId, SeqNum]] = {} + + def _ensure_stream_exists(self, stream: StreamId) -> None: + if stream not in self.current_sync_states: + logger.info( + format_log_message( + f""" + Found Stream with id: {stream}, but no matching stream + creation in the trace. Backfilling the trace now. + Perhaps the sanitizer was enabled after some torch operations? + """ + ) + ) + self.create_stream(stream) + + def _ensure_event_exists(self, event: EventId) -> None: + if event not in self.recorded_sync_states: + logger.info( + format_log_message( + f""" + Found Event with id: {event}, but no matching event + creation in the trace. Backfilling the trace now. + Perhaps the sanitizer was enabled after some torch operations? + """ + ) + ) + self.create_event(event) + + def _ensure_event_does_not_exist(self, event: EventId) -> None: + if event in self.recorded_sync_states: + logger.info( + format_log_message( + f""" + Found duplicate event creation in the trace for event with + id: {event}. Assuming the trace for event deletion wasn't caught + and backfilling it now. + Perhaps the sanitizer was enabled after some torch operations? + """ + ) + ) + self.delete_event(event) + + def create_stream(self, stream: StreamId) -> None: + if stream in self.current_sync_states: + logger.info( + format_log_message( + f""" + Found duplicate Stream creation in the trace for Stream with + id: {stream}. PyTorch Streams are only created once, so this + trace entry is ignored. 
+ """ + ) + ) + else: + self.current_sync_states[stream] = {} + + def create_event(self, event: EventId) -> None: + self._ensure_event_does_not_exist(event) + self.recorded_sync_states[event] = {} + + def delete_event(self, event: EventId) -> None: + self._ensure_event_exists(event) + del self.recorded_sync_states[event] + + def update_seq_num(self, stream: StreamId, seq_num: SeqNum) -> None: + self._ensure_stream_exists(stream) + self.current_sync_states[stream][stream] = seq_num + + def record_state(self, event: EventId, stream: StreamId) -> None: + self._ensure_event_exists(event) + self._ensure_stream_exists(stream) + self.recorded_sync_states[event] = self.current_sync_states[stream].copy() + + def state_wait_for_event(self, stream: StreamId, event: EventId) -> None: + self._ensure_event_exists(event) + self._ensure_stream_exists(stream) + for other_stream, seq_num in self.recorded_sync_states[event].items(): + self.current_sync_states[stream][other_stream] = max( + self.current_sync_states[stream].get(other_stream, -1), seq_num + ) + + def is_ordered_after( + self, current_stream: StreamId, seq_num: SeqNum, other_stream: StreamId + ) -> bool: + self._ensure_stream_exists(current_stream) + self._ensure_stream_exists(other_stream) + return seq_num <= self.current_sync_states[current_stream].get(other_stream, -1) + + +class EventHandler: + """Analyzes CSAN trace for synchronization errors. + + Stores information on each stream's synchronizations with other streams as well + as tensor accesses to determine whether a given kernel launch might cause a + data race. + """ + + def __init__(self): + self.tensors_accessed = _TensorsAccessed() + self.syncs = StreamSynchronizations() + self.seq_num: SeqNum = 0 + + def _handle_kernel_launch( + self, + stream: StreamId, + read_only: List[DataPtr], + read_write: List[DataPtr], + operator: str, + tensor_names: Dict[int, List[str]], + ) -> List[SynchronizationError]: + def check_conflict( + data_ptr: DataPtr, current_access: Access, previous_access: Optional[Access] + ) -> None: + if previous_access is None: + return + if not self.syncs.is_ordered_after( + current_access.stream, previous_access.seq_num, previous_access.stream + ): + error_list.append( + UnsynchronizedAccessError( + data_ptr, + self.tensors_accessed.get_allocation_stack_trace(data_ptr), + current_access, + previous_access, + ) + ) + + error_list: List[SynchronizationError] = [] + self.seq_num += 1 + self.syncs.update_seq_num(stream, self.seq_num) + stack_trace = traceback.StackSummary.extract( + traceback.walk_stack(None), lookup_lines=False + ) + + for data_ptr in read_only: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + current_access = Access( + AccessType.READ, + self.seq_num, + stream, + operator, + tensor_names[data_ptr], + stack_trace, + ) + check_conflict( + data_ptr, current_access, self.tensors_accessed.get_write(data_ptr) + ) + self.tensors_accessed.add_read(data_ptr, current_access) + + for data_ptr in read_write: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + current_access = Access( + AccessType.WRITE, + self.seq_num, + stream, + operator, + tensor_names[data_ptr], + stack_trace, + ) + if self.tensors_accessed.were_there_reads_since_last_write(data_ptr): + for previous_access in self.tensors_accessed.get_reads(data_ptr): + check_conflict(data_ptr, current_access, previous_access) + else: + check_conflict( + data_ptr, current_access, self.tensors_accessed.get_write(data_ptr) + ) + self.tensors_accessed.set_write(data_ptr, current_access) + + return 
error_list + + def _handle_event_creation(self, event: EventId) -> None: + self.syncs.create_event(event) + + def _handle_event_deletion(self, event: EventId) -> None: + self.syncs.delete_event(event) + + def _handle_event_record(self, event: EventId, stream: StreamId) -> None: + self.syncs.record_state(event, stream) + + def _handle_event_wait(self, event: EventId, stream: StreamId) -> None: + self.syncs.state_wait_for_event(stream, event) + + def _handle_memory_allocation(self, data_ptr: DataPtr) -> None: + self.tensors_accessed.ensure_tensor_does_not_exist(data_ptr) + self.tensors_accessed.create_tensor( + data_ptr, + traceback.StackSummary.extract( + traceback.walk_stack(None), lookup_lines=False + ), + ) + + def _handle_memory_deallocation(self, data_ptr: DataPtr) -> None: + self.tensors_accessed.ensure_tensor_exists(data_ptr) + self.tensors_accessed.delete_tensor(data_ptr) + + def _handle_stream_creation(self, stream: StreamId) -> None: + self.syncs.create_stream(stream) + + +def zip_by_key(a: Dict[TK, TVa], b: Dict[TK, TVb]) -> Iterator[Tuple[TK, TVa, TVb]]: + for arg, value in a.items(): + if arg in b: + yield arg, value, b[arg] + + +def zip_arguments( + schema: torch.FunctionSchema, args: Tuple[Any, ...], kwargs: Dict[str, Any] +) -> Iterator[Tuple[torch.Argument, Any]]: + schema_args = schema.arguments[: len(args)] + schema_kwargs = {arg.name: arg for arg in schema.arguments[len(args) :]} + + yield from zip(schema_args, args) + + for _, argument, value in zip_by_key(schema_kwargs, kwargs): + yield (argument, value) + + +class ArgumentHandler: + def __init__(self): + self.dataptrs_read: Set[int] = set() + self.dataptrs_written: Set[int] = set() + self.tensor_names: Dict[int, List[str]] = dict() + + def _handle_argument(self, value: Any, is_write: bool, name: str) -> None: + if isinstance(value, torch.Tensor) and value.is_cuda: + data_ptr = value.data_ptr() + if is_write: + self.dataptrs_written.add(data_ptr) + else: + self.dataptrs_read.add(data_ptr) + self.tensor_names.setdefault(data_ptr, []).append(name) + + def parse_inputs( + self, + schema: torch.FunctionSchema, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + ) -> None: + for argument, value in zip_arguments(schema, args, kwargs): + is_write = argument.alias_info is not None and argument.alias_info.is_write + tree_map( + functools.partial( + self._handle_argument, is_write=is_write, name=argument.name + ), + value, + ) + + def parse_outputs(self, outputs: Any) -> None: + tree_map( + functools.partial(self._handle_argument, is_write=True, name="output"), + outputs, + ) + + +class CUDASanitizerDispatchMode(TorchDispatchMode): + def __init__(self): + self.event_handler = EventHandler() + torch._C._activate_cuda_trace() + cuda_trace.register_callback_for_cuda_event_creation( + self.event_handler._handle_event_creation + ) + cuda_trace.register_callback_for_cuda_event_deletion( + self.event_handler._handle_event_deletion + ) + cuda_trace.register_callback_for_cuda_event_record( + self.event_handler._handle_event_record + ) + cuda_trace.register_callback_for_cuda_event_wait( + self.event_handler._handle_event_wait + ) + cuda_trace.register_callback_for_cuda_memory_allocation( + self.event_handler._handle_memory_allocation + ) + cuda_trace.register_callback_for_cuda_memory_deallocation( + self.event_handler._handle_memory_deallocation + ) + cuda_trace.register_callback_for_cuda_stream_creation( + self.event_handler._handle_stream_creation + ) + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if kwargs is 
None: + kwargs = {} + + argument_handler = ArgumentHandler() + argument_handler.parse_inputs(func._schema, args, kwargs) + + outputs = func(*args, **kwargs) + + argument_handler.parse_outputs(outputs) + errors = self.event_handler._handle_kernel_launch( + torch.cuda.current_stream().cuda_stream, + list(argument_handler.dataptrs_read - argument_handler.dataptrs_written), + list(argument_handler.dataptrs_written), + func._schema, + argument_handler.tensor_names, + ) + if errors: + for error in errors: + print(error, file=sys.stderr) + raise CUDASanitizerErrors(errors) + + return outputs + + +class CUDASanitizer: + """Manages the lifetime of a CUDASanitizer dispatch mode object. + + The CUDASanitizer class wraps the entering/exiting functions of the dispatch mode + context manager in the enable function/destructor, respectively. This is to + explicitly set the lifetime of the dispatch mode object to that of the application. + This approach was deemed more elegant than using the atexit module. + """ + + def __init__(self): + self.dispatch = CUDASanitizerDispatchMode() + self.enabled = False + + def enable(self): + self.dispatch.__enter__() + self.enabled = True + + def __del__(self): + if self.enabled: + self.dispatch.__exit__(None, None, None) + + +def enable_cuda_sanitizer(): + """Enables CUDA Sanitizer. + + The sanitizer will begin to analyze low-level CUDA calls invoked by torch functions + for synchronization errors. All data races found will be printed to the standard + error output along with stack traces of suspected causes. For best results, the + sanitizer should be enabled at the very beginning of the program. + """ + cuda_sanitizer.enable() + + +cuda_sanitizer = CUDASanitizer() From 2feb31cb269bd640ff2858ebe8adb3fb0aec8dc0 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Wed, 7 Sep 2022 15:00:54 +0100 Subject: [PATCH 22/45] Improve torch::jit::as_{module,object} performance (#84399) This caches the import of `torch.jit.ScriptModule`, `torch.ScriptObject` and `torch.jit.RecursiveScriptClass`. I measure a ~0.8 us performance uplift locally when calling a `torch.ops` function with a `ScriptObject` argument. 
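For a feel for where the saving comes from, here is a rough Python-level analogue (illustrative only, not code from this patch): re-resolving the `torch.jit.ScriptModule` attribute on every check, as the old code effectively did via `py::module::import(...).attr(...)`, versus caching the resolved handle once, as the new function-local `static` does. Absolute timings are machine-dependent.

```python
# Illustrative micro-benchmark, not part of this patch. Assumes a local
# PyTorch install; numbers are machine-dependent.
import importlib
import timeit

import torch


def isinstance_uncached(obj):
    # Re-resolve the attribute on every call (analogue of the old C++ path).
    ScriptModule = importlib.import_module("torch.jit").ScriptModule
    return isinstance(obj, ScriptModule)


_SCRIPT_MODULE = torch.jit.ScriptModule  # resolved once (analogue of the new path)


def isinstance_cached(obj):
    return isinstance(obj, _SCRIPT_MODULE)


obj = torch.jit.script(torch.nn.Linear(2, 2))
print("uncached:", timeit.timeit(lambda: isinstance_uncached(obj), number=100_000))
print("cached:  ", timeit.timeit(lambda: isinstance_cached(obj), number=100_000))
```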
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84399 Approved by: https://github.com/ezyang --- torch/csrc/jit/python/module_python.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/torch/csrc/jit/python/module_python.h b/torch/csrc/jit/python/module_python.h index 5c7f269529f4..ab8bf1b8404f 100644 --- a/torch/csrc/jit/python/module_python.h +++ b/torch/csrc/jit/python/module_python.h @@ -10,20 +10,24 @@ namespace torch { namespace jit { inline c10::optional as_module(py::handle obj) { - if (py::isinstance( - obj, py::module::import("torch.jit").attr("ScriptModule"))) { + static py::handle ScriptModule = + py::module::import("torch.jit").attr("ScriptModule"); + if (py::isinstance(obj, ScriptModule)) { return py::cast(obj.attr("_c")); } return c10::nullopt; } inline c10::optional as_object(py::handle obj) { - if (py::isinstance(obj, py::module::import("torch").attr("ScriptObject"))) { + static py::handle ScriptObject = + py::module::import("torch").attr("ScriptObject"); + if (py::isinstance(obj, ScriptObject)) { return py::cast(obj); } - if (py::isinstance( - obj, py::module::import("torch.jit").attr("RecursiveScriptClass"))) { + static py::handle RecursiveScriptClass = + py::module::import("torch.jit").attr("RecursiveScriptClass"); + if (py::isinstance(obj, RecursiveScriptClass)) { return py::cast(obj.attr("_c")); } return c10::nullopt; From 31ef8ddb8c4467f5b8698ef1eb9bb8bab7056855 Mon Sep 17 00:00:00 2001 From: Wei Wei Date: Wed, 7 Sep 2022 17:21:27 +0000 Subject: [PATCH 23/45] add option to remove passes (#84425) Summary: Add a remove_pass method in pass_manager to provide user option to remove any pass. Reviewed By: wushirong Differential Revision: D39080077 Pull Request resolved: https://github.com/pytorch/pytorch/pull/84425 Approved by: https://github.com/yinghai --- torch/fx/passes/pass_manager.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/torch/fx/passes/pass_manager.py b/torch/fx/passes/pass_manager.py index 52f1170290db..5a34c5bca362 100644 --- a/torch/fx/passes/pass_manager.py +++ b/torch/fx/passes/pass_manager.py @@ -212,6 +212,16 @@ def add_constraint(self, constraint): self.constraints.append(constraint) self._validated = False + def remove_pass(self, _passes: List[Callable]): + if _passes is None: + return + passes_left = [] + for ps in self.passes: + if ps.__name__ not in _passes: + passes_left.append(ps) + self.passes = passes_left + self._validated = False + def validate(self): """ Validates that current pass schedule defined by `self.passes` is valid From acb4a09628284201281e262aaee58e3dc6be9c2b Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Wed, 7 Sep 2022 18:02:27 +0000 Subject: [PATCH 24/45] Revert "Call jit decomposition in VariableType to increase forward AD coverage (#84151)" This reverts commit 42d99e6f196233627a28b8e9efb26a0a166fa370. 
Reverted https://github.com/pytorch/pytorch/pull/84151 on behalf of https://github.com/malfet due to Regressed test_jvpvjp_nn_functional_layer_norm_cuda_float32, see https://hud.pytorch.org/pytorch/pytorch/commit/42d99e6f196233627a28b8e9efb26a0a166fa370 --- functorch/functorch/csrc/BatchRulesHelper.cpp | 14 ++ functorch/functorch/csrc/BatchRulesHelper.h | 6 + functorch/functorch/csrc/BatchRulesViews.cpp | 3 +- functorch/functorch/csrc/DynamicLayer.cpp | 72 +++++- functorch/test/test_ops.py | 40 ++- tools/autograd/derivatives.yaml | 15 +- tools/autograd/gen_variable_type.py | 232 ++++++------------ tools/autograd/templates/VariableType.cpp | 1 - .../autograd/VariableTypeUtilsDependOnOps.h | 40 --- torch/csrc/autograd/functions/utils.h | 18 -- .../jit/runtime/decomposition_registry.cpp | 20 -- .../csrc/jit/runtime/decomposition_registry.h | 6 - .../_internal/common_methods_invocations.py | 17 -- .../_internal/opinfo/definitions/_masked.py | 3 - 14 files changed, 210 insertions(+), 277 deletions(-) delete mode 100644 torch/csrc/autograd/VariableTypeUtilsDependOnOps.h diff --git a/functorch/functorch/csrc/BatchRulesHelper.cpp b/functorch/functorch/csrc/BatchRulesHelper.cpp index d49ecd5e8737..dfd690ac2168 100644 --- a/functorch/functorch/csrc/BatchRulesHelper.cpp +++ b/functorch/functorch/csrc/BatchRulesHelper.cpp @@ -133,6 +133,20 @@ void vmapIncompatibleInplaceError(const char* schema_name) { "please file a bug report instead."); } +void run_jit_decomposition(const c10::OperatorHandle& op, torch::jit::Stack* stack) { + const auto& schema = op.schema(); + // TODO: templatize based on op and keep static trace_exec + auto * trace_exec = torch::jit::GetDecompositionExecutor(schema); + trace_exec->run((*stack)); + if (stack->back().isTuple()) { + IValue tup = stack->back(); + stack->pop_back(); + for (const auto& elem: tup.toTuple()->elements()) { + stack->push_back(elem); + } + } +} + static void handleScalarTypePromotion(Tensor& logical_scalar_tensor, Tensor& second) { auto result_type = at::native::result_type(logical_scalar_tensor[0], second); if (logical_scalar_tensor.scalar_type() != result_type) { diff --git a/functorch/functorch/csrc/BatchRulesHelper.h b/functorch/functorch/csrc/BatchRulesHelper.h index 329d0db42b50..552a38b20e20 100644 --- a/functorch/functorch/csrc/BatchRulesHelper.h +++ b/functorch/functorch/csrc/BatchRulesHelper.h @@ -195,6 +195,12 @@ inline void handle_variadic_bdims(std::vector>()); +void run_jit_decomposition(const c10::OperatorHandle& op, torch::jit::Stack* stack); + +#define RUN_JIT_DECOMPOSITION(op) \ + m.impl(#op, torch::CppFunction::makeFromBoxedFunction<&run_jit_decomposition>()); + + using UnpackedBatchedTensor = std::tuple>; inline void find_and_unpack_tensors( diff --git a/functorch/functorch/csrc/BatchRulesViews.cpp b/functorch/functorch/csrc/BatchRulesViews.cpp index 44f1134486c5..68a6c377f750 100644 --- a/functorch/functorch/csrc/BatchRulesViews.cpp +++ b/functorch/functorch/csrc/BatchRulesViews.cpp @@ -15,7 +15,6 @@ #include #include #include -#include namespace at { namespace functorch { @@ -511,7 +510,7 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) { VMAP_SUPPORT(chunk, chunk_batching_rule); m.impl("flatten.using_ints", static_cast(native::flatten)); VMAP_SUPPORT(flip, flip_batch_rule); - m.impl("trace", torch::CppFunction::makeFromBoxedFunction<&torch::jit::run_jit_decomposition>()); + RUN_JIT_DECOMPOSITION(trace) VMAP_SUPPORT(tril, VARIADIC_BDIMS_BATCH_RULE(ATEN_FN(tril))); VMAP_SUPPORT(triu, VARIADIC_BDIMS_BATCH_RULE(ATEN_FN(triu))); 
VMAP_SUPPORT(repeat, repeat_batch_rule); diff --git a/functorch/functorch/csrc/DynamicLayer.cpp b/functorch/functorch/csrc/DynamicLayer.cpp index c83edf327b2c..08cd4d7a7d6b 100644 --- a/functorch/functorch/csrc/DynamicLayer.cpp +++ b/functorch/functorch/csrc/DynamicLayer.cpp @@ -389,9 +389,43 @@ WithoutTop::~WithoutTop() { pushDynamicLayer(std::move(layer_)); } -static void dynamicLayerFrontFallback( +// NOTE: [forward-mode AD decompositions hack] +// +// The mechanism is: in DynamicLayerFrontMode, IF we are dispatching on the +// jvp transform, AND we have a decomposition for the operation, then run +// the decomposition. +// +// Let's break that down. There are a douple of moving pieces. +// +// 0. How do we know what transform we're dispatching on? +// Easy, check the top of the DynamicLayerStack and read the transform. +// +// 1. Next, we must identify when an operation (e.g. nll_loss_backward) +// gets dispatched to. +// - register a special kernel to the DynamicLayerFrontMode key +// (see JVP_DECOMP) +// - that special kernel invokes dynamicLayerFrontFallbackOperator with +// an arg indicating we're going to use a decomp +// +// 2. Next, we need to call the decomposition. See call_decomposition_for_jvp. +// We currently use python decompositions that we torchscript. + +// Ideally c10::OperatorHandle would have a field like this +// to identify the operator. +// The stuff here should map 1:1 with the operator name. +// aten::nll_loss_backward -> nll_loss_backward +// aten::add.Tensor -> add_Tensor + +static void call_decomposition_for_jvp( const c10::OperatorHandle& op, torch::jit::Stack* stack) { + run_jit_decomposition(op, stack); +} + +static void dynamicLayerFrontFallbackOperator( + const c10::OperatorHandle& op, + torch::jit::Stack* stack, + bool decomp_jvp) { auto& dynamicLayerStack = dynamicLayerStackAccessor(); TORCH_INTERNAL_ASSERT(dynamicLayerStack.size() > 0); #ifdef HAS_TORCH_SHOW_DISPATCH_TRACE @@ -400,6 +434,13 @@ static void dynamicLayerFrontFallback( dump_local_tls(); } #endif + + // Hack: if jvp and we have a decomposition registered, then do the decomposition + if (dynamicLayerStack.back().interpreter().key() == TransformType::Jvp && + decomp_jvp) { + return call_decomposition_for_jvp(op, stack); + } + // Save the current LocalDispatchKeySet (to the current DynamicLayer). // Upon exiting the current scope, that LocalDispatchKeySet gets restored. 
// When the current DynamicLayer dispatches to the next (inner) DynamicLayer, @@ -419,6 +460,16 @@ restoreLocalDispatchKeySetRAII(const c10::impl::LocalDispatchKeySet& key_set) { return c10::impl::ForceDispatchKeyGuard(key_set); } +void dynamicLayerFrontFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { + return dynamicLayerFrontFallbackOperator(op, stack, false); +} + +void dynamicLayerFrontFallBackWithDecomp( + const c10::OperatorHandle& op, + torch::jit::Stack* stack) { + return dynamicLayerFrontFallbackOperator(op, stack, true); +} + void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { auto& layer = dynamicLayerStackAccessor().back(); auto restore_guard = restoreLocalDispatchKeySetRAII(layer.interpreter().getSavedLocalDispatchKeySet()); @@ -435,5 +486,24 @@ TORCH_LIBRARY_IMPL(_, FT_DYNAMIC_LAYER_BACK_MODE_KEY, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&dynamicLayerBackFallback>()); } +#define JVP_DECOMP(op) \ + m.impl(#op, torch::CppFunction::makeFromBoxedFunction<&dynamicLayerFrontFallBackWithDecomp>()); + +#define JVP_DECOMP2(op, overload) \ + m.impl(#op "." #overload, torch::CppFunction::makeFromBoxedFunction<&dynamicLayerFrontFallBackWithDecomp>()); + +TORCH_LIBRARY_IMPL(aten, FT_DYNAMIC_LAYER_FRONT_MODE_KEY, m) { + JVP_DECOMP(nll_loss_backward); + JVP_DECOMP(nll_loss2d_backward); + JVP_DECOMP(_log_softmax_backward_data); + JVP_DECOMP(_softmax_backward_data); + OP_DECOMPOSE(log_sigmoid); + JVP_DECOMP(log_sigmoid_forward); + JVP_DECOMP(native_layer_norm_backward); + JVP_DECOMP(native_batch_norm_backward); + JVP_DECOMP(cudnn_batch_norm_backward); +} + + } } // namespace at diff --git a/functorch/test/test_ops.py b/functorch/test/test_ops.py index 218ba47b46ed..8d69fe7e22b5 100644 --- a/functorch/test/test_ops.py +++ b/functorch/test/test_ops.py @@ -1047,6 +1047,9 @@ def get_vjp(cotangents, *primals): # RuntimeError: Trying to set a forward gradient that has a different size than that of the original Tensor, # this is not supported. Tensor is of size [5, 2, 3] while the given forward gradient is of size [1, 2, 3]. xfail('normal', ''), + xfail('_masked.log_softmax', ''), # NYI: forward-AD for _log_softmax_backward_data + xfail('_masked.softmax', ''), # NYI: forward-AD for _softmax_backward_data + xfail('_masked.softmin', ''), # NYI: forward-AD for _softmax_backward_data xfail('cdist', ''), # NYI: forward-AD for _cdist_forward xfail('cholesky', ''), # NYI: forward-AD for cholesky xfail('eig', ''), # NYI: forward-AD for eig @@ -1055,7 +1058,10 @@ def get_vjp(cotangents, *primals): xfail('nn.functional.grid_sample', ''), # NYI: forward AD for grid_sampler_2d xfail('nn.functional.hardsigmoid', ''), # NYI: forward AD for hardsigmoid_backward xfail('nn.functional.huber_loss', ''), # NYI: forward AD for huber_loss_backward + xfail('nn.functional.instance_norm', ''), # NYI: forward AD for native_batch_norm_backward xfail('nn.functional.logsigmoid', ''), # not differentiable w.r.t. 
buffer + xfail('nn.functional.softmin', ''), # NYI: forward-AD for _softmax_backward_data + xfail('nn.functional.softmin', 'with_dtype'), # NYI: forward-AD for _softmax_backward_data xfail('renorm', ''), # NYI: forward AD for renorm xfail('symeig', ''), # NYI: forward AD for symeig xfail('nn.functional.multilabel_margin_loss', ''), # NYI: multilabel_margin_loss_forward @@ -1069,6 +1075,7 @@ def get_vjp(cotangents, *primals): xfail('scatter_reduce', 'mean'), # NYI: forward-AD for scatter_reduce xfail('scatter_reduce', 'prod'), # NYI: forward-AD for scatter_reduce skip('linalg.householder_product', '', device_type='cuda'), # flaky, I'm not sure why + xfail('native_layer_norm', ''), # NYI: forward-AD for native_layer_norm_backward xfail('sparse.sampled_addmm', ''), # Sparse tensors have no strides skip('as_strided_scatter', ''), # seems flaky xfail('segment_reduce', 'offsets'), # NYI: forward-AD for segment_reduce @@ -1129,8 +1136,37 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents): expected = (tree_unflatten(primals_out, spec), tree_unflatten(tangents_out, spec)) return expected - expected = reference(primals, cotangents, primals_tangents, cotangents_tangents) - self.assertEqual(result, expected) + # HACK: obviously pytorch should also have the same coverage + # For things that do have the same coverage, we test that jvp x vjp + # are the same between PyTorch and functorch. For things that don't, + # we check that jacfwd(vjp) and jacrev(vjp) are the same. This results + # in slower tests. + FUNCTORCH_HAS_FORMULA_BUT_NOT_PYTORCH = { + 'nn.functional.nll_loss', + 'softmax', + 'log_softmax', + 'nn.functional.cross_entropy', + 'nn.functional.layer_norm', + 'nn.functional.batch_norm', + } + if op.name in FUNCTORCH_HAS_FORMULA_BUT_NOT_PYTORCH: + self.assertFalse(op.supports_fwgrad_bwgrad, + f"{op.name} now supports forward over reverse without a decomposition. " + + "Please remove the decomposition version") + + def is_differentiable(t): + return isinstance(t, torch.Tensor) and t.dtype == torch.float32 + args = (cotangents, *primals) + if op.name == 'nn.functional.binary_cross_entropy': + argnums = (0, 1) # targets is float32 but isn't differentiable + atol_rtol = 1.5e-4, 1.3e-06 + else: + argnums = tuple(i for i in range(len(args)) if is_differentiable(args[i])) + atol_rtol = None + self._compare_jacobians_of_vjp(fn, args, argnums, atol_rtol) + else: + expected = reference(primals, cotangents, primals_tangents, cotangents_tangents) + self.assertEqual(result, expected) def _make_extremal_inputs(self, shape, device): if shape is None: diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index c5b1ec04fd87..5a8bf46319f0 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -1956,20 +1956,7 @@ - name: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) self: log_sigmoid_backward(grad, self, buffer) - # HACK: This is just auto_element_wise followed by a view_as. The reason we have - # this is bc forward AD was complaining here about the shapes not being the same: - # the primal/tangent are 0-D/1-D respectively. This started happening after moving the - # jvp decomposition mechanism from functorch to core, possibly due to a batching rule. - # In functorch we rely on OP_DECOMPOSE, but now we compute forward AD using an actual - # formula. 
- # - # We'd like to avoid keeping the entire jvp decomposition mechanism in functorch, - # just for this single decomposition, but also want to avoid any cases from regressing: - # e.g. test_vmapjvpall_nn_functional_logsigmoid_cuda_float32 (passes on cpu, fails on CUDA). - # - # We should either figure out what is going on with vmap or perhaps fwd AD could - # be more tolerant about 0-dim vs 1-dim tensors - output: log_sigmoid_backward(self_t.conj(), self_p, buffer).conj().view_as(self_p) + output: auto_element_wise - name: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor self: _log_softmax_backward_data(grad, result, dim, self.scalar_type()) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 35987ca24266..f9afe838203d 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -31,7 +31,6 @@ from torchgen.api.autograd import ( DifferentiableInput, dispatch_strategy, - ForwardDerivative, gen_differentiable_outputs, is_differentiable, NativeFunctionWithDifferentiabilityInfo, @@ -598,14 +597,8 @@ DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES = CodeTemplate( """\ auto ${tmp_var} = ([&]() { - if (${try_jit_decomposition_bool} && ${any_has_forward_grad}) { - static c10::OperatorName full_name("aten::${op_name}", "${op_overload}"); - static c10::optional opt_op = c10::Dispatcher::singleton().findSchema(full_name); - return impl::run_jit_decomposition_with_args_for_jvp<${returns_and_args}>("${op_name}", *opt_op, ks, ${arg_names}); - } else { - ${guard} - return ${base_type_call}; - } + ${guard} + return ${base_type_call}; })(); """ ) @@ -649,12 +642,6 @@ """ ) -FW_DERIVATIVE_TENSORLIST_CHECK_TEMPLATE = CodeTemplate( - """\ -isFwGradDefinedTensorList(${req_inp})\ -""" -) - FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE = CodeTemplate( """\ auto ${inp}_t_raw = toNonOptFwGrad(${inp}); @@ -985,23 +972,6 @@ def find_args_with_derivatives( f"ERROR: derivative ignored for {name} -- specified an autograd function without derivative" ) - if requires_derivative and not len(fw_derivatives) == 0: - assert sum(len(derivative.var_names) for derivative in fw_derivatives) == len( - differentiable_outputs - ), ( - "Expected the number of forward derivatives implemented to match the " - "number of differentiable outputs. NB: This only applies when at least " - "one forward derivative is implemented. Not implementing any forward " - "derivatives is also okay, and we would require inputs to the op to " - "not have associated tangents in that case." - ) - try_jit_decomposition = ( - requires_derivative - and len(fw_derivatives) == 0 - and (not modifies_arguments(f)) - and (not returns_void) - ) - def emit_save_inputs() -> List[str]: setup: List[str] = [] if info is None or not info.has_derivatives: @@ -1368,9 +1338,7 @@ def check_tensorimpl_and_storage( ) return call - def emit_call( - f: NativeFunction, unpacked_bindings: List[Binding], try_jit_decomposition: bool - ) -> str: + def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: # We only care about adding `at::AutoDispatchBelowAutograd` guard for non-variable dispatch # (which corresponds to 'use_derived' strategy). 
The purpose of this guard is to make sure # the baseType operations still dispatch to non-Variable type, even if the arguments passed @@ -1384,51 +1352,13 @@ def emit_call( else: guard = "at::AutoDispatchBelowADInplaceOrView guard;" - try_jit_decomposition_bool = "true" if try_jit_decomposition else "false" - any_has_forward_grad = ( - get_any_has_fw_grad_cond(derivative=None) - if requires_derivative - else "false" - ) - return_types = ", ".join( - [cpp.return_type(a, symint=True).cpp_type() for a in f.func.returns] - ) - if len(f.func.returns) > 1: - return_types = f"std::tuple<{return_types}>" - - arg_types = [ - cpp.argument_type(a, binds="", symint=True).cpp_type() - for a in f.func.arguments.flat_all - ] - arg_names = [ - a.name - for a in cpp.arguments( - f.func.arguments, - faithful=True, - symint=True, - method=False, - cpp_no_default_args=set(), - ) - ] - if not modifies_arguments(f) and not returns_void: - # Just to keep things simple here, we only care about this path - # and always emit the if/else for now call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute( - base_type_call=base_type_call, - tmp_var=TMP_VAR, - guard=guard, - try_jit_decomposition_bool=try_jit_decomposition_bool, - any_has_forward_grad=any_has_forward_grad, - op_name=cpp.name(f.func), - op_overload=f.func.name.overload_name, - returns_and_args=return_types + ", " + ", ".join(arg_types), - arg_names=arg_names, + base_type_call=base_type_call, tmp_var=TMP_VAR, guard=guard ) call += wrap_output(f, unpacked_bindings, TMP_VAR) else: - assert not try_jit_decomposition call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute( base_type_call=base_type_call, guard=guard ) @@ -1476,14 +1406,38 @@ def get_any_has_forward_grad_name(var_names: Tuple[str, ...]) -> str: def emit_any_has_forward_grad() -> List[str]: content: List[str] = [] for derivative in fw_derivatives: - requires_fw_grad = get_any_has_fw_grad_cond(derivative=derivative) + assert derivative.required_inputs_fw_grad is not None + requires_fw_grad = " || ".join( + [ + FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) + for inp in differentiable_inputs + if inp.name in derivative.required_inputs_fw_grad + ] + ) + if not requires_fw_grad: + # Handle functions like stack + # For these, we don't unpack anything and always call the user function + if not ( + len(differentiable_inputs) == 1 + and is_tensor_list_type(differentiable_inputs[0].type) + ): + raise RuntimeError( + f'No differentiable input to "{name}" is a differentiable Tensor (as the provided ' + "forward AD formula does not use any input tangent) even though a forward gradient " + "formula has been defined for it. This case should only happen for function that " + "take a single TensorList as input. All other cases are not supported right now." 
+ ) + requires_fw_grad = "true" + if info and info.output_differentiability_conditions: assert len(info.output_differentiability_conditions) == 1 - requires_fw_grad = f"({info.output_differentiability_conditions[0]}) && {requires_fw_grad}" + requires_fw_grad = f"({info.output_differentiability_conditions[0]}) && ({requires_fw_grad})" + content.append( f"auto {get_any_has_forward_grad_name(derivative.var_names)} = {requires_fw_grad};\n" f"(void){get_any_has_forward_grad_name(derivative.var_names)};" ) + return content def emit_check_inplace() -> List[str]: @@ -1606,83 +1560,46 @@ def emit_fw_derivatives() -> List[str]: content.append("\n".join(fw_grad_setters)) return content - def get_any_has_fw_grad_cond(derivative: Optional[ForwardDerivative]) -> str: - # - # Produces a condition string (e.g, "isFwGradDefined(grad_output) || isFwGradDefined(output)") - # - if derivative is None: - # (1) If a derivative is NOT provided, cond will check fw_grad of ALL differentiable inputs - # - Used in the out_fn case when we want to forbid fw derivatives - # - Used in the case where the fw_derivative is not defined, but we want - # To check if there is a decomposition registered for jvp - to_check: List[str] = [] - for inp in list( - mapMaybe( - gen_differentiable_input, - f.func.arguments.non_out + list(f.func.arguments.out), # type: ignore[operator] + def emit_forbid_fw_derivatives(is_out_fn: bool = False) -> str: + def get_msg() -> str: + if is_out_fn: + msg = "because it is an out= function" + else: + msg = ( + "because it has not been implemented yet.\\nPlease file an issue " + "to PyTorch at https://github.com/pytorch/pytorch/issues/new?template=feature-request.yml " + "so that we can prioritize its implementation." + ) + return msg + + res = "" + to_check: List[str] = [] + for inp in list( + mapMaybe( + gen_differentiable_input, + f.func.arguments.non_out + list(f.func.arguments.out), # type: ignore[operator] + ) + ): + if is_tensor_type(inp.type): + to_check.append( + FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) + ) + elif is_tensor_list_type(inp.type): + cond = FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp="_t") + res += FW_DERIVATIVE_FORBID_LIST_TEMPLATE.substitute( + arg=inp.name, cond=cond, name=name, msg=get_msg() ) - ): - if is_tensor_type(inp.type): - to_check.append( - FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) - ) - elif is_tensor_list_type(inp.type): - to_check.append( - FW_DERIVATIVE_TENSORLIST_CHECK_TEMPLATE.substitute( - req_inp=inp.name - ) - ) - else: - raise RuntimeError( - f'Unsupported input type for "{name}" when forbidding forward AD usage.' - ) - return f'({" || ".join(to_check)})' - else: - # (2) If derivative is provided, use that information to determine which inputs - # to check fw_grad for - assert derivative.required_inputs_fw_grad is not None - - if len(derivative.required_inputs_fw_grad) == 0: - # Handle functions like stack - # For these, we don't unpack anything and always call the user function - if not ( - len(differentiable_inputs) == 1 - and is_tensor_list_type(differentiable_inputs[0].type) - ): - raise RuntimeError( - f'No differentiable input to "{name}" is a differentiable Tensor (as the provided ' - "forward AD formula does not use any input tangent) even though a forward gradient " - "formula has been defined for it. This case should only happen for function that " - "take a single TensorList as input. All other cases are not supported right now." 
- ) - any_has_fw_grad = "true" else: - any_has_fw_grad = " || ".join( - [ - FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) - for inp in differentiable_inputs - if inp.name in derivative.required_inputs_fw_grad - ] + raise RuntimeError( + f'Unsupported input type for "{name}" when forbidding forward AD usage.' ) - any_has_fw_grad = f"({any_has_fw_grad})" - - return any_has_fw_grad - def emit_forbid_fw_derivatives(is_out_fn: bool = False) -> str: - if is_out_fn: - msg = "because it is an out= function" - else: - msg = ( - "because it has not been implemented yet.\\nPlease file an issue " - "to PyTorch at https://github.com/pytorch/pytorch/issues/new?template=feature-request.yml " - "so that we can prioritize its implementation." + if len(to_check) > 0: + cond = " || ".join(to_check) + res += FW_DERIVATIVE_FORBID_TEMPLATE.substitute( + cond=cond, name=name, msg=get_msg() ) - cond = get_any_has_fw_grad_cond(derivative=None) - return ( - FW_DERIVATIVE_FORBID_TEMPLATE.substitute(cond=cond, name=name, msg=msg) - if cond != "" - else "" - ) + return res body: List[str] = [] unpack_args_stats, unpacked_bindings = unpack_args(f) @@ -1696,7 +1613,7 @@ def emit_forbid_fw_derivatives(is_out_fn: bool = False) -> str: body.extend(setup_derivative(differentiable_inputs)) body.append(declare_returned_variables(f)) - body.append(emit_call(f, unpacked_bindings, try_jit_decomposition)) + body.append(emit_call(f, unpacked_bindings)) if requires_derivative: # set_flags has to appear after version_counter, because rebase_history # requires that the counter is incremented before it is called @@ -1706,11 +1623,20 @@ def emit_forbid_fw_derivatives(is_out_fn: bool = False) -> str: if is_out_fn: body.append(emit_forbid_fw_derivatives(is_out_fn=True)) else: - if requires_derivative and not try_jit_decomposition: - if len(fw_derivatives) > 0: - body.extend(emit_fw_derivatives()) - else: + if requires_derivative: + body.extend(emit_fw_derivatives()) + if len(fw_derivatives) == 0: body.append(emit_forbid_fw_derivatives()) + else: + assert sum( + len(derivative.var_names) for derivative in fw_derivatives + ) == len(differentiable_outputs), ( + "Expected the number of forward derivatives implemented to match the " + "number of differentiable outputs. NB: This only applies when at least " + "one forward derivative is implemented. Not implementing any forward " + "derivatives is also okay, and we would require inputs to the op to " + "not have associated tangents in that case." + ) if requires_derivative: # Save only after the forward AD has been set up diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 3c467f83c318..9cd2d5c40de7 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -1,5 +1,4 @@ #include "torch/csrc/autograd/VariableTypeUtils.h" -#include "torch/csrc/autograd/VariableTypeUtilsDependOnOps.h" #include "torch/csrc/autograd/generated/VariableType.h" #include "torch/csrc/autograd/FunctionsManual.h" diff --git a/torch/csrc/autograd/VariableTypeUtilsDependOnOps.h b/torch/csrc/autograd/VariableTypeUtilsDependOnOps.h deleted file mode 100644 index f2569c9d6463..000000000000 --- a/torch/csrc/autograd/VariableTypeUtilsDependOnOps.h +++ /dev/null @@ -1,40 +0,0 @@ -#pragma once - -#include - -// This is the set of helpers in VariableTypeUtils have a dependency on -// native_functions.yaml meaning the file will need to be re-compiled every time -// an operator is changed or added. 
We cannot simply put these functions in -// VariableType.h and VariableTypeutils.h, since they are included in files like -// ADInplaceOrViewType_X.cpp which don't always want to be recompiled. - -namespace torch { -namespace autograd { -namespace impl { - -// Depends on torch/csrc/jit/ir/ir.h -> aten/src/ATen/core/interned_strings.h -template -Return run_jit_decomposition_with_args_for_jvp( - c10::string_view name, - const c10::OperatorHandle& opHandle, - c10::DispatchKeySet dispatchKeySet, - Args... args) { - bool has_decomp = jit::has_jit_decomposition(opHandle.schema()); - - TORCH_CHECK_NOT_IMPLEMENTED( - has_decomp, - "Trying to use forward AD with ", - name, - " that does not support it" - "because it has not been implemented yet and does not have a decomposition.\\nPlease file an issue " - "to PyTorch at https://github.com/pytorch/pytorch/issues/new?template=feature-request.yml " - "so that we can prioritize its implementation."); - - return c10::KernelFunction::makeFromBoxedKernel( - c10::BoxedKernel::makeFromFunction<&jit::run_jit_decomposition>()) - .call(opHandle, dispatchKeySet, args...); -} - -} // namespace impl -} // namespace autograd -} // namespace torch diff --git a/torch/csrc/autograd/functions/utils.h b/torch/csrc/autograd/functions/utils.h index 75df1a0302c9..a2169f18656f 100644 --- a/torch/csrc/autograd/functions/utils.h +++ b/torch/csrc/autograd/functions/utils.h @@ -100,23 +100,5 @@ inline bool isFwGradDefined(const c10::optional& t) { return t.has_value() && t->defined() && t->_fw_grad(/*level */ 0).defined(); } -inline bool isFwGradDefinedTensorList(const at::TensorList& variables) { - bool ret = false; - for (auto& variable : variables) { - ret |= isFwGradDefined(variable); - } - return ret; -} - -inline bool isFwGradDefinedTensorList( - const c10::List> li) { - bool ret = false; - for (auto i : c10::irange(li.size())) { - auto t = li.get(i); - ret |= (t.has_value() && isFwGradDefined(t.value())); - } - return ret; -} - } // namespace autograd } // namespace torch diff --git a/torch/csrc/jit/runtime/decomposition_registry.cpp b/torch/csrc/jit/runtime/decomposition_registry.cpp index bfad602ef2f2..d55ac7eac9be 100644 --- a/torch/csrc/jit/runtime/decomposition_registry.cpp +++ b/torch/csrc/jit/runtime/decomposition_registry.cpp @@ -160,26 +160,6 @@ void RegisterDecomposition( schema_to_decomposition[&schema] = g; } -void run_jit_decomposition( - const c10::OperatorHandle& op, - torch::jit::Stack* stack) { - const auto& schema = op.schema(); - // TODO: templatize based on op and keep static trace_exec - auto* trace_exec = torch::jit::GetDecompositionExecutor(schema); - trace_exec->run((*stack)); - if (stack->back().isTuple()) { - at::IValue tup = stack->back(); - stack->pop_back(); - for (const auto& elem : tup.toTuple()->elements()) { - stack->push_back(elem); - } - } -} - -bool has_jit_decomposition(const FunctionSchema& schema) { - return GetDecompositionFunction(schema).has_value(); -} - Function* GetDecompositionExecutor(const FunctionSchema& schema) { auto maybe_func = GetDecompositionFunction(schema); TORCH_INTERNAL_ASSERT(maybe_func); diff --git a/torch/csrc/jit/runtime/decomposition_registry.h b/torch/csrc/jit/runtime/decomposition_registry.h index 225204cf60de..4c6ef3029a0b 100644 --- a/torch/csrc/jit/runtime/decomposition_registry.h +++ b/torch/csrc/jit/runtime/decomposition_registry.h @@ -25,11 +25,5 @@ TORCH_API Function* GetDecompositionExecutor(const char* schema_literal); TORCH_API Function* GetDecompositionExecutor(const FunctionSchema& schema); 
-TORCH_API void run_jit_decomposition( - const c10::OperatorHandle& op, - torch::jit::Stack* stack); - -TORCH_API bool has_jit_decomposition(const FunctionSchema& schema); - } // namespace jit } // namespace torch diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index a9e98a44dcaa..3f152354e6d2 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -39,10 +39,6 @@ import torch._refs.special import torch._refs.linalg -# Make sure that decompositions used for test_forward_mode_AD and -# test_fn_fwgrad_bwgrad are registered to the jit -import torch._decomp.decompositions_for_jvp - import torch._prims as prims # noqa: F401 from torch.utils._pytree import tree_flatten @@ -10168,7 +10164,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): assert_jit_shape_analysis=True, assert_autodiffed=True, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, supports_out=True), OpInfo('softmax', aliases=('special.softmax', 'nn.functional.softmax',), @@ -10178,7 +10173,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_func=partial(sample_inputs_softmax_variant, with_dtype=True), assert_autodiffed=True, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, supports_out=True), # `softmin` supports different dtypes based on whether `dtype` argument, # is passed or not. Hence two OpInfo entries, one with dtype and other without. @@ -10191,7 +10185,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): assert_jit_shape_analysis=False, assert_autodiffed=False, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, supports_out=False), OpInfo('nn.functional.softmin', variant_test_name="with_dtype", @@ -10200,7 +10193,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_func=partial(sample_inputs_softmax_variant, with_dtype=True), assert_autodiffed=False, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, supports_out=False), OpInfo( "nn.functional.cross_entropy", @@ -10209,7 +10201,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_func=sample_inputs_cross_entropy, supports_out=False, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, decorators=( DecorateInfo( toleranceOverride({torch.float32: tol(atol=1e-5, rtol=1e-3)}), @@ -10301,7 +10292,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, assert_jit_shape_analysis=True, - supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_native_layer_norm, error_inputs_func=error_inputs_native_layer_norm, skips=( @@ -10673,7 +10663,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, decorators=[ # RuntimeError: Cannot insert a Tensor that requires grad as a constant. 
# Consider making it a parameter or input, or detaching the gradient @@ -10692,7 +10681,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, assert_jit_shape_analysis=True, decorators=[ DecorateInfo( @@ -11732,7 +11720,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, assert_jit_shape_analysis=True, sample_inputs_func=sample_inputs_batch_norm, skips=( @@ -11755,7 +11742,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, decorators=[onlyCUDA, disablecuDNN], skips=( DecorateInfo(toleranceOverride({torch.float16: tol(atol=1e-02, rtol=1e-02)}), @@ -14718,7 +14704,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_softmax_variant, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, assert_autodiffed=True), OpInfo( 'log_softmax', @@ -14728,7 +14713,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), sample_inputs_func=partial(sample_inputs_softmax_variant, with_dtype=True), supports_forward_ad=True, - supports_fwgrad_bwgrad=True, assert_autodiffed=True), UnaryUfuncInfo('logit', aten_backward_name='logit_backward', @@ -15605,7 +15589,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_out=False, sample_inputs_func=sample_inputs_nll_loss, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, assert_jit_shape_analysis=True, skips=( # RuntimeError: diff --git a/torch/testing/_internal/opinfo/definitions/_masked.py b/torch/testing/_internal/opinfo/definitions/_masked.py index cb88766e70c6..d8a3e8aa948d 100644 --- a/torch/testing/_internal/opinfo/definitions/_masked.py +++ b/torch/testing/_internal/opinfo/definitions/_masked.py @@ -990,7 +990,6 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar ), gradcheck_wrapper=gradcheck_wrapper_masked_operation, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, supports_out=False, ), OpInfo( @@ -1018,7 +1017,6 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar ], gradcheck_wrapper=gradcheck_wrapper_masked_operation, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, supports_out=False, ), OpInfo( @@ -1039,7 +1037,6 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar ), gradcheck_wrapper=gradcheck_wrapper_masked_operation, supports_forward_ad=True, - supports_fwgrad_bwgrad=True, supports_out=False, ), OpInfo( From d892d5d6829c315ba9b5038b8796e1c96a54f9b5 Mon Sep 17 00:00:00 2001 From: Eddie Yan Date: Wed, 7 Sep 2022 18:30:23 +0000 Subject: [PATCH 25/45] [CUBLAS][TF32][CUDNN] Update numerical_accuracy.rst (#79537) CC @mruberry @ptrblck Pull Request resolved: https://github.com/pytorch/pytorch/pull/79537 Approved by: https://github.com/ngimel, https://github.com/mruberry --- docs/source/notes/numerical_accuracy.rst | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/source/notes/numerical_accuracy.rst b/docs/source/notes/numerical_accuracy.rst index 
c952fb1f7c59..b1d05f946041 100644 --- a/docs/source/notes/numerical_accuracy.rst +++ b/docs/source/notes/numerical_accuracy.rst @@ -54,16 +54,14 @@ datatype. E.g.: TensorFloat-32(TF32) on Nvidia Ampere devices --------------------------------------------- -On Ampere Nvidia GPUs, PyTorch by default uses TensorFloat32 (TF32) to speed up mathematically -intensive operations, in particular matrix multiplications and convolutions. When operation is performed -using TF32 tensor cores, only the first 10 bits of the input mantissa are read. This leads to less accurate -results, and surprising results such as multiplying a matrix by identity matrix produces -results that are different from the input. -Most neural network workloads have the same convergence behavior when using tf32 as they have -with fp32, however, if better accuracy is desired, TF32 can be turned off with -``torch.backends.cuda.matmul.allow_tf32 = False`` - -For more information see :ref:`TensorFloat32` +On Ampere Nvidia GPUs, PyTorch can use TensorFloat32 (TF32) to speed up mathematically intensive operations, in particular matrix multiplications and convolutions. +When an operation is performed using TF32 tensor cores, only the first 10 bits of the input mantissa are read. +This may reduce accuracy and produce surprising results (e.g., multiplying a matrix by the identity matrix may produce results that are different from the input). +By default, TF32 tensor cores are disabled for matrix multiplications and enabled for convolutions, although most neural network workloads have the same convergence behavior when using TF32 as they have with fp32. +We recommend enabling TF32 tensor cores for matrix multiplications with ``torch.backends.cuda.matmul.allow_tf32 = True`` if your network does not need full float32 precision. +If your network needs full float32 precision for both matrix multiplications and convolutions, then TF32 tensor cores can also be disabled for convolutions with ``torch.backends.cudnn.allow_tf32 = False``. + +For more information see :ref:`TensorFloat32`. Reduced Precision Reduction for FP16 GEMMs ------------------------------------------ From fc4acd4425ca0896ca1c4f0a8bd7e22a51e94731 Mon Sep 17 00:00:00 2001 From: WEN Hao Date: Wed, 7 Sep 2022 19:12:33 +0000 Subject: [PATCH 26/45] Fix error in the index range math expression in the docstring of MultiMarginLoss (#84513) Fixes #84512 Pull Request resolved: https://github.com/pytorch/pytorch/pull/84513 Approved by: https://github.com/Lezcano, https://github.com/cpuhrsch --- torch/nn/modules/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 85de8c549edb..ed151e64f4f0 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1345,7 +1345,7 @@ class MultiMarginLoss(_WeightedLoss): .. math:: \text{loss}(x, y) = \frac{\sum_i \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)} - where :math:`x \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}` + where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}` and :math:`i \neq y`. Optionally, you can give non-equal weighting on the classes by passing From c7f6deb6678f4df578584439e4ab26d185da5ef3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 6 Sep 2022 14:42:09 -0700 Subject: [PATCH 27/45] [PyTorch] Guard against self assignment in SymInt (#84375) self assignment was broken, now it's not. 
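Concretely, the hazard is the usual one for assignment operators that release their own state before taking the source's: if source and destination are the same object, the data is destroyed before it can be copied. A minimal sketch of the failure mode (in Python, purely as an illustration — the actual fix is the `this != &s` guard in the C++ diff below):

```python
# Illustration only: a move-assign that clears the source. Without the
# identity check, x.move_assign(x) releases its own payload and then
# "steals" the now-empty payload back -- the SymInt bug fixed below.
class Handle:
    def __init__(self, data):
        self.data = data

    def release(self):
        self.data = None

    def move_assign(self, other):
        if self is other:       # the guard this patch adds (in spirit)
            return self
        self.release()          # drop current state
        self.data = other.data  # take over the source's state
        other.data = None
        return self


h = Handle(42)
h.move_assign(h)
assert h.data == 42  # would be None without the `self is other` check
```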
Differential Revision: [D39189342](https://our.internmc.facebook.com/intern/diff/D39189342/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84375 Approved by: https://github.com/suo --- c10/core/SymInt.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h index 28414f02b833..3fac75488169 100644 --- a/c10/core/SymInt.h +++ b/c10/core/SymInt.h @@ -67,18 +67,22 @@ class C10_API SymInt { } SymInt& operator=(const SymInt& s) { - if (s.is_symbolic()) { - *this = SymInt::toSymInt(s.toSymIntNodeImpl()); - } else { - data_ = s.data_; + if (this != &s) { + if (s.is_symbolic()) { + *this = SymInt::toSymInt(s.toSymIntNodeImpl()); + } else { + data_ = s.data_; + } } return *this; } SymInt& operator=(SymInt&& s) { - release_(); // release the current SymIntNode if any - data_ = s.data_; - if (s.is_symbolic()) - s.data_ = 0; + if (this != &s) { + release_(); // release the current SymIntNode if any + data_ = s.data_; + if (s.is_symbolic()) + s.data_ = 0; + }; return *this; } From e14f46f9ddf143dbe894ee40e3a698fb401523ae Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Wed, 7 Sep 2022 07:39:21 -0700 Subject: [PATCH 28/45] Add host and port to TCPStore pyi definition (#84636) `host` and `port` are already exposed in the `TCPStore` pybind definition, this is a small change adding it in the pyi stub Differential Revision: [D39311153](https://our.internmc.facebook.com/intern/diff/D39311153) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84636 Approved by: https://github.com/wz337 --- torch/_C/_distributed_c10d.pyi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index edcf3c43f9b0..d0158e9d1674 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -145,6 +145,10 @@ class TCPStore(Store): wait_for_workers: bool = ..., multi_tenant: bool = ... ): ... + @property + def host(self) -> str: ... + @property + def port(self) -> int: ... class PrefixStore(Store): def __init__(self, prefix: str, store: Store): ... 
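A small usage sketch for the properties typed above (constructor arguments mirror the existing `test/distributed/test_store.py` tests; with `port=0` the store binds an OS-assigned port, which is exactly when reading `port` back is handy):

```python
# Sketch: read back the host/port a TCPStore is bound to.
# Assumes a single-process store, as in test/distributed/test_store.py.
import torch.distributed as dist

store = dist.TCPStore(host_name="127.0.0.1", port=0, world_size=1, is_master=True)
print(store.host)  # "127.0.0.1"
print(store.port)  # the OS-assigned port chosen for port=0
```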
From 9532c7e267b3ccf2ca500fdae1ed5298c1f0f146 Mon Sep 17 00:00:00 2001 From: samdow Date: Wed, 7 Sep 2022 17:50:54 +0000 Subject: [PATCH 29/45] [functorch] add matrix_rank rule (#83760) Pull Request resolved: https://github.com/pytorch/pytorch/pull/83760 Approved by: https://github.com/zou3519 --- .../csrc/BatchRulesLinearAlgebra.cpp | 37 +++++++++++++++++++ functorch/test/test_vmap.py | 2 - 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp b/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp index 08695e88da61..1ba13e4f5d19 100644 --- a/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp +++ b/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp @@ -382,6 +382,41 @@ fourOutputs linalg_lstsq_batch_rule( return std::make_tuple(res, 0, res_1, res_1_bdim, res_2, res_2_bdim, res_3, res_3_bdim); } +std::tuple> +matrix_rank_atol_rtol_tensor_batch_rule( + const Tensor& input, optional input_bdim, const optional& atol, const optional atol_bdim, + const optional& rtol, const optional rtol_bdim, bool hermitian) { + const auto input_logical_rank = rankWithoutBatchDim(input, input_bdim); + TORCH_CHECK(input_logical_rank >= 2, + "torch.linalg.matrix_rank: The input tensor input must have at least 2 dimensions."); + + // atol and rtol's dims must be broadcastable to the number of batch dims of input + // which is input's dim - 2 (input represents a batch of matrices, so 2 is for the matrix dimensions) + const auto input_logical_num_bdims = input_logical_rank - 2; + const int64_t atol_logical_num_bdims = atol.has_value() ? rankWithoutBatchDim(*atol, atol_bdim) : 0; + const int64_t rtol_logical_num_bdims = rtol.has_value() ? rankWithoutBatchDim(*rtol, rtol_bdim) : 0; + const auto max_logical_bdims = std::max({input_logical_num_bdims, atol_logical_num_bdims, rtol_logical_num_bdims}); + + auto input_ = moveBatchDimToFront(input, input_bdim); + auto atol_ = atol.has_value() ? moveBatchDimToFront(*atol, atol_bdim) : atol; + auto rtol_ = rtol.has_value() ? moveBatchDimToFront(*rtol, rtol_bdim) : rtol; + + // pad all inputs to have the same number of (non-vmap) batch dimensions + input_ = maybePadToLogicalRank(input_, input_bdim, max_logical_bdims + 2); + atol_ = atol_.has_value() ? maybePadToLogicalRank(*atol_, atol_bdim, max_logical_bdims) : atol_; + rtol_ = rtol_.has_value() ? 
maybePadToLogicalRank(*rtol_, rtol_bdim, max_logical_bdims) : rtol_; + + return std::make_tuple(at::linalg_matrix_rank(input_, atol_, rtol_, hermitian), 0); +} + +std::tuple> +matrix_rank_atol_rtol_float_batch_rule( + const Tensor& input, optional input_bdim, optional atol, optional rtol, bool hermitian) { + TORCH_CHECK(rankWithoutBatchDim(input, input_bdim) >= 2, + "torch.linalg.matrix_rank: The input tensor input must have at least 2 dimensions."); + return std::make_tuple(linalg_matrix_rank(moveBatchDimToFront(input, input_bdim), atol, rtol, hermitian), 0); +} + #define LINALG_CHECK_MATRIX_UNARY_BATCH_RULE(fn, num_out) SINGLE_ARG(\ LinalgCheckMatrixUnaryRuleHelper<\ func_string_##fn,\ @@ -494,6 +529,8 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) { VMAP_SUPPORT(linalg_matrix_exp, matrix_exp_batch_rule); VMAP_SUPPORT(_linalg_solve_ex, solve_ex_batch_rule); VMAP_SUPPORT(linalg_cross, cross_batch_rule); + VMAP_SUPPORT2(linalg_matrix_rank, atol_rtol_tensor, matrix_rank_atol_rtol_tensor_batch_rule); + VMAP_SUPPORT2(linalg_matrix_rank, atol_rtol_float, matrix_rank_atol_rtol_float_batch_rule); VMAP_SUPPORT(_linalg_check_errors, _linalg_check_errors_batch_rule); } diff --git a/functorch/test/test_vmap.py b/functorch/test/test_vmap.py index 6b85f3786108..85e00dd06bcb 100644 --- a/functorch/test/test_vmap.py +++ b/functorch/test/test_vmap.py @@ -3302,8 +3302,6 @@ def test_vmap_exhaustive(self, device, dtype, op): # masked index as input which is not supported xfail('index_put', ''), xfail('isin'), - xfail('linalg.matrix_rank'), - xfail('linalg.matrix_rank', 'hermitian'), xfail('linalg.pinv'), xfail('linalg.pinv', 'hermitian'), xfail('lu_solve'), From 6b2111619e801064065c0eaba7ca03f00feef59b Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Wed, 7 Sep 2022 21:44:39 +0000 Subject: [PATCH 30/45] check rate limits of other tokens too (#83632) we keep running into api rate limit issues but apparently theyre connected to pytorchbot, so check rate limit of our other tokens too according to https://docs.github.com/en/rest/rate-limit this doesnt count against the rate limit Pull Request resolved: https://github.com/pytorch/pytorch/pull/83632 Approved by: https://github.com/huydhn --- .github/workflows/upload-test-stats.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml index b649aac2c7c5..688a55b6eabc 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -66,5 +66,9 @@ jobs: - name: Get our GITHUB_TOKEN API limit usage env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN}} + MERGEBOT_TOKEN: ${{ secrets.MERGEBOT_TOKEN}} run: | curl -H "Accept: application/vnd.github.v3+json" -H "Authorization: token $GITHUB_TOKEN" https://api.github.com/rate_limit + curl -H "Accept: application/vnd.github.v3+json" -H "Authorization: token $PYTORCHBOT_TOKEN" https://api.github.com/rate_limit + curl -H "Accept: application/vnd.github.v3+json" -H "Authorization: token $MERGEBOT_TOKEN" https://api.github.com/rate_limit From e68df8e4a14ce1fbedf6b20e132b11ec7b151f8a Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Wed, 7 Sep 2022 07:55:51 -0700 Subject: [PATCH 31/45] Turn on functionalization by default in functorch (#84435) I talked to @SherlockNoMad abt this PR and we agreed prior to brian coming back it was worth disabling this test for getting functionalization on (and that is already the state of torchdynamo) Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/84435 Approved by: https://github.com/Chillee --- functorch/functorch/_src/config.py | 2 +- functorch/test/test_pythonkey.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/functorch/functorch/_src/config.py b/functorch/functorch/_src/config.py index 583dbcec7455..f233ab5fc4de 100644 --- a/functorch/functorch/_src/config.py +++ b/functorch/functorch/_src/config.py @@ -9,7 +9,7 @@ """ import os -use_functionalize = False +use_functionalize = True # TODO: flip this to true by default # Waiting on diff --git a/functorch/test/test_pythonkey.py b/functorch/test/test_pythonkey.py index 9f0f2fdc556c..8b35056eed0e 100644 --- a/functorch/test/test_pythonkey.py +++ b/functorch/test/test_pythonkey.py @@ -412,7 +412,6 @@ class TestEagerFusionOpInfo(AOTTestCase): xfail('cholesky'), xfail('cumulative_trapezoid'), xfail('diag_embed'), - xfail('linalg.householder_product'), xfail('logit'), xfail('trapezoid'), xfail('trapz'), @@ -664,6 +663,7 @@ def forward(self, x, y): assert torch.allclose(inputs[0].grad, cloned_inputs[0].grad) assert torch.allclose(inputs[1].grad, cloned_inputs[1].grad) + @unittest.skip("Breaks with functionalization on by default") def test_aot_module_simplified_preserves_stack_trace(self): class MockModule(torch.nn.Module): def __init__(self): From 586832ce65607c6a1d1d8245b55d4ec24ddfc0e4 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Wed, 7 Sep 2022 08:26:16 -0700 Subject: [PATCH 32/45] Add underlying_store property for PrefixStore (#84640) Add a property to `PrefixStore` to retrieve the underlying store it is wrapping around. Open for suggestions on property name. This change is based on discussion in [D39225101](https://www.internalfb.com/diff/D39225101) where we need to read properties of the store that PrefixStore is wrapping around. 
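A minimal usage sketch (mirroring the new `PrefixStoreTest.test_get_underlying_store` added below):

```python
# Sketch: recover the wrapped store, and its properties, from a PrefixStore.
import torch.distributed as dist

tcp_store = dist.TCPStore(host_name="127.0.0.1", port=0, world_size=1, is_master=True)
prefix_store = dist.PrefixStore("prefix", tcp_store)

assert prefix_store.underlying_store == tcp_store
# e.g. read a property that PrefixStore itself does not re-expose:
print(prefix_store.underlying_store.port)
```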
Differential Revision: [D39311151](https://our.internmc.facebook.com/intern/diff/D39311151) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84640 Approved by: https://github.com/xush6528 --- test/distributed/test_store.py | 13 +++++++++++++ torch/_C/_distributed_c10d.pyi | 2 ++ torch/csrc/distributed/c10d/PrefixStore.cpp | 4 ++++ torch/csrc/distributed/c10d/PrefixStore.hpp | 2 ++ torch/csrc/distributed/c10d/init.cpp | 6 +++++- 5 files changed, 26 insertions(+), 1 deletion(-) diff --git a/test/distributed/test_store.py b/test/distributed/test_store.py index a32ce948c048..1267928c151d 100644 --- a/test/distributed/test_store.py +++ b/test/distributed/test_store.py @@ -141,6 +141,19 @@ def _create_store(self): store.set_timeout(timedelta(seconds=300)) return store +class PrefixStoreTest(TestCase): + def setUp(self): + # delete is false as FileStore will automatically clean up the file + self.file = tempfile.NamedTemporaryFile(delete=False) + + def test_get_underlying_store(self): + tcp_store = dist.TCPStore(host_name=DEFAULT_HOSTNAME, port=0, world_size=1, is_master=True) + hash_store = dist.HashStore() + file_store = dist.FileStore(self.file.name, world_size=1) + for store in [tcp_store, hash_store, file_store]: + with self.subTest(f"Testing getting underlying_store for {type(store)}"): + prefix_store = dist.PrefixStore("prefix", store) + self.assertEqual(prefix_store.underlying_store, store) class PrefixFileStoreTest(TestCase, StoreTestBase): def setUp(self): diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index d0158e9d1674..aad37d6a8c5a 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -152,6 +152,8 @@ class TCPStore(Store): class PrefixStore(Store): def __init__(self, prefix: str, store: Store): ... + @property + def underlying_store(self) -> Store: ... class Work: def is_completed(self) -> bool: ... diff --git a/torch/csrc/distributed/c10d/PrefixStore.cpp b/torch/csrc/distributed/c10d/PrefixStore.cpp index c7442df8d4a2..4a02b62380ac 100644 --- a/torch/csrc/distributed/c10d/PrefixStore.cpp +++ b/torch/csrc/distributed/c10d/PrefixStore.cpp @@ -79,4 +79,8 @@ void PrefixStore::setTimeout(const std::chrono::milliseconds& timeout) { store_->setTimeout(timeout); } +c10::intrusive_ptr PrefixStore::getUnderlyingStore() { + return store_; +} + } // namespace c10d diff --git a/torch/csrc/distributed/c10d/PrefixStore.hpp b/torch/csrc/distributed/c10d/PrefixStore.hpp index c9e57312fac6..69d2a1b4a0c1 100644 --- a/torch/csrc/distributed/c10d/PrefixStore.hpp +++ b/torch/csrc/distributed/c10d/PrefixStore.hpp @@ -42,6 +42,8 @@ class TORCH_API PrefixStore : public Store { void watchKey(const std::string& key, WatchKeyCallback callback) override; + c10::intrusive_ptr getUnderlyingStore(); + protected: std::string prefix_; c10::intrusive_ptr store_; diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index c222675858f7..f28c389c025a 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1027,7 +1027,11 @@ that adds a prefix to each key inserted to the store. prefix (str): The prefix string that is prepended to each key before being inserted into the store. store (torch.distributed.store): A store object that forms the underlying key-value store. 
)") - .def(py::init>()); + .def(py::init>()) + .def_property_readonly( + "underlying_store", + &::c10d::PrefixStore::getUnderlyingStore, + R"(Gets the underlying store object that PrefixStore wraps around.)"); auto processGroup = py::class_< From 29672b2136fc80537edf4632b2cf40f48efe0ab8 Mon Sep 17 00:00:00 2001 From: samdow Date: Wed, 7 Sep 2022 20:46:24 +0000 Subject: [PATCH 33/45] [functorch] add pinv batch rule (#83761) Pull Request resolved: https://github.com/pytorch/pytorch/pull/83761 Approved by: https://github.com/zou3519 --- .../csrc/BatchRulesLinearAlgebra.cpp | 32 +++++++++++++++---- functorch/test/test_ops.py | 9 ------ functorch/test/test_vmap.py | 2 -- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp b/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp index 1ba13e4f5d19..46d98e83c0b4 100644 --- a/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp +++ b/functorch/functorch/csrc/BatchRulesLinearAlgebra.cpp @@ -382,13 +382,16 @@ fourOutputs linalg_lstsq_batch_rule( return std::make_tuple(res, 0, res_1, res_1_bdim, res_2, res_2_bdim, res_3, res_3_bdim); } +template std::tuple> -matrix_rank_atol_rtol_tensor_batch_rule( - const Tensor& input, optional input_bdim, const optional& atol, const optional atol_bdim, - const optional& rtol, const optional rtol_bdim, bool hermitian) { - const auto input_logical_rank = rankWithoutBatchDim(input, input_bdim); +atol_rtol_tensor_batch_rule( + F Func, const Tensor& input, optional input_bdim, + const optional& atol, const optional atol_bdim, + const optional& rtol, const optional rtol_bdim, bool hermitian, char const *op_name) { + auto input_logical_rank = rankWithoutBatchDim(input, input_bdim); + TORCH_CHECK(input_logical_rank >= 2, - "torch.linalg.matrix_rank: The input tensor input must have at least 2 dimensions."); + op_name, ": The input tensor input must have at least 2 dimensions."); // atol and rtol's dims must be broadcastable to the number of batch dims of input // which is input's dim - 2 (input represents a batch of matrices, so 2 is for the matrix dimensions) @@ -406,7 +409,23 @@ matrix_rank_atol_rtol_tensor_batch_rule( atol_ = atol_.has_value() ? maybePadToLogicalRank(*atol_, atol_bdim, max_logical_bdims) : atol_; rtol_ = rtol_.has_value() ? 
maybePadToLogicalRank(*rtol_, rtol_bdim, max_logical_bdims) : rtol_; - return std::make_tuple(at::linalg_matrix_rank(input_, atol_, rtol_, hermitian), 0); + return std::make_tuple(Func(input_, atol_, rtol_, hermitian), 0); +} + +std::tuple> +matrix_rank_atol_rtol_tensor_batch_rule( + const Tensor& input, c10::optional input_bdim, const optional& atol, + const c10::optional atol_bdim, const optional& rtol, + const c10::optional rtol_bdim, bool hermitian) { + return atol_rtol_tensor_batch_rule(ATEN_FN2(linalg_matrix_rank, atol_rtol_tensor), input, input_bdim, atol, atol_bdim, rtol, rtol_bdim, hermitian, "torch.linalg.matrix_rank"); +} + +std::tuple> +pinv_batch_rule( + const Tensor& input, c10::optional input_bdim, const optional& atol, + const c10::optional atol_bdim, const optional& rtol, + const c10::optional rtol_bdim, bool hermitian) { + return atol_rtol_tensor_batch_rule(ATEN_FN2(linalg_pinv, atol_rtol_tensor), input, input_bdim, atol, atol_bdim, rtol, rtol_bdim, hermitian, "linalg.pinv"); } std::tuple> @@ -531,6 +550,7 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) { VMAP_SUPPORT(linalg_cross, cross_batch_rule); VMAP_SUPPORT2(linalg_matrix_rank, atol_rtol_tensor, matrix_rank_atol_rtol_tensor_batch_rule); VMAP_SUPPORT2(linalg_matrix_rank, atol_rtol_float, matrix_rank_atol_rtol_float_batch_rule); + VMAP_SUPPORT2(linalg_pinv, atol_rtol_tensor, pinv_batch_rule); VMAP_SUPPORT(_linalg_check_errors, _linalg_check_errors_batch_rule); } diff --git a/functorch/test/test_ops.py b/functorch/test/test_ops.py index 8d69fe7e22b5..d011567d5bae 100644 --- a/functorch/test/test_ops.py +++ b/functorch/test/test_ops.py @@ -746,14 +746,10 @@ def test_vmapjvpall(self, device, dtype, op): xfail('cumprod'), xfail('lu_solve'), xfail('linalg.det'), - xfail('linalg.lstsq', 'grad_oriented'), - xfail('linalg.pinv'), xfail('masked_fill'), xfail('copysign'), xfail('linalg.solve'), xfail('complex'), - xfail('linalg.pinv', 'hermitian'), - xfail('pinverse'), skip('_masked.mean'), # ??? xfail('masked_scatter'), xfail('index_fill'), @@ -839,10 +835,6 @@ def test(): xfail('linalg.det'), xfail('linalg.eig'), xfail('linalg.householder_product'), - xfail('linalg.lstsq', ''), - xfail('linalg.lstsq', 'grad_oriented'), - xfail('linalg.pinv'), - xfail('linalg.pinv', 'hermitian'), xfail('lu'), xfail('lu_solve'), xfail('lu_unpack'), @@ -850,7 +842,6 @@ def test(): xfail('masked_scatter'), xfail('masked_select'), xfail('nanquantile'), - xfail('pinverse'), xfail('prod'), xfail('put'), skip('linalg.det'), # https://github.com/pytorch/functorch/issues/961 diff --git a/functorch/test/test_vmap.py b/functorch/test/test_vmap.py index 85e00dd06bcb..afb2684979e7 100644 --- a/functorch/test/test_vmap.py +++ b/functorch/test/test_vmap.py @@ -3302,8 +3302,6 @@ def test_vmap_exhaustive(self, device, dtype, op): # masked index as input which is not supported xfail('index_put', ''), xfail('isin'), - xfail('linalg.pinv'), - xfail('linalg.pinv', 'hermitian'), xfail('lu_solve'), xfail('lu_unpack'), xfail('masked_fill'), From 9162bc025256d638369c77c845b8a5ed66eeff5a Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 7 Sep 2022 15:43:58 -0400 Subject: [PATCH 34/45] Convert NoopPyInterpreterVTable into a Meyer singleton (#84656) Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/84656 Approved by: https://github.com/wconstab --- c10/core/impl/PyInterpreter.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp index a00c872af558..d8c7784a084e 100644 --- a/c10/core/impl/PyInterpreter.cpp +++ b/c10/core/impl/PyInterpreter.cpp @@ -67,9 +67,8 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { }; void PyInterpreter::disarm() noexcept { - // Intentionally leaked - static PyInterpreterVTable* noop_vtable = new NoopPyInterpreterVTable(); - vtable_ = noop_vtable; + static NoopPyInterpreterVTable noop_vtable; + vtable_ = &noop_vtable; } } // namespace impl From 93359bf9b3503135332d40cb297515efe5290ec6 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 7 Sep 2022 17:19:08 -0400 Subject: [PATCH 35/45] Convert ConcretePyInterpreterVTable into Meyer singleton (#84657) Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/84657 Approved by: https://github.com/wconstab --- torch/csrc/autograd/python_variable.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 42151414266d..ce2850e59c9f 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -262,6 +262,11 @@ struct ConcretePyInterpreterVTable final void trace_gpu_stream_creation(uintptr_t stream) const override { concrete_trace_cuda(stream); } + + static ConcretePyInterpreterVTable* instance() { + static ConcretePyInterpreterVTable s; + return &s; + } }; // NOTE [PyInterpreter::decref takes an `is_tensor` arg] @@ -306,9 +311,11 @@ void ConcretePyInterpreterVTable::decref(PyObject* pyobj, bool is_tensor) class PyInterpreterHolder { public: PyInterpreterHolder() - : impl_(new c10::impl::PyInterpreter(new ConcretePyInterpreterVTable())) { - } - // NB: intentionally leaks the memory + : impl_(new c10::impl::PyInterpreter( + ConcretePyInterpreterVTable::instance())) {} + // NB: intentionally leaks the PyInterpreter, as there may still be + // references to it that are live, living in objects that aren't being + // destructed while Python is being cleaned up. 
~PyInterpreterHolder() { impl_->disarm(); } From 1a1bcc736197f1f7943d568512c3c1e44ba05fbc Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 8 Sep 2022 01:09:10 +0000 Subject: [PATCH 36/45] Actually chown artifacts (#84672) Rollback part of https://github.com/pytorch/pytorch/commit/045ebc771d5070696f839e586285ace9c06f1339 to actually chown artifacts folder rather than workspace Fixes https://github.com/pytorch/pytorch/issues/84644 Pull Request resolved: https://github.com/pytorch/pytorch/pull/84672 Approved by: https://github.com/kit1980, https://github.com/huydhn --- .github/workflows/_binary-build-linux.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index 8ddd4af29d71..0e27f3589d86 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -202,7 +202,10 @@ jobs: docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/${{ inputs.PACKAGE_TYPE }}/build.sh" - name: Chown artifacts if: always() - uses: ./pytorch/.github/actions/chown-workspace + shell: bash + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - uses: actions/upload-artifact@v3 with: From 9669e3c6ec6b6f232bed3b29bcd593434992f57d Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 7 Sep 2022 17:25:49 -0400 Subject: [PATCH 37/45] Ignore UB on multiply (#84665) Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/84665 Approved by: https://github.com/Chillee --- aten/src/ATen/native/cpu/BinaryOpsKernel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 2c9ac5ac15b6..a5dde16024ab 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -68,8 +68,8 @@ void mul_kernel(TensorIteratorBase& iter) { } else { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "mul_cpu", [&]() { cpu_kernel_vec(iter, - [=](scalar_t a, scalar_t b) -> scalar_t { return a * b; }, - [=](Vectorized a, Vectorized b) { + [=](scalar_t a, scalar_t b) __ubsan_ignore_undefined__ -> scalar_t { return a * b; }, + [=](Vectorized a, Vectorized b) __ubsan_ignore_undefined__ { return a * b; }); }); From e0229d6517385a98afeadbc6391d3592d5027c63 Mon Sep 17 00:00:00 2001 From: John Detloff Date: Thu, 8 Sep 2022 01:49:55 +0000 Subject: [PATCH 38/45] Remove caffe2 mobile (#84338) We're no longer building Caffe2 mobile as part of our CI, and it adds a lot of clutter to our make files. Any lingering internal dependencies will use the buck build and so wont be effected. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84338 Approved by: https://github.com/dreiss --- .jenkins/caffe2/bench.sh | 54 -- .jenkins/caffe2/build.sh | 231 ------- .jenkins/caffe2/dirty.sh | 7 - .jenkins/pytorch/dirty.sh | 9 - CMakeLists.txt | 16 +- CONTRIBUTING.md | 7 - binaries/CMakeLists.txt | 13 +- caffe2/CMakeLists.txt | 1070 ++++++++++++++--------------- caffe2/core/CMakeLists.txt | 2 +- caffe2/perfkernels/CMakeLists.txt | 2 +- caffe2/utils/CMakeLists.txt | 2 +- cmake/Dependencies.cmake | 2 +- cmake/Summary.cmake | 1 - cmake/public/utils.cmake | 118 ++-- scripts/build_android.sh | 36 +- scripts/build_ios.sh | 47 +- 16 files changed, 628 insertions(+), 989 deletions(-) delete mode 100755 .jenkins/caffe2/bench.sh delete mode 100755 .jenkins/caffe2/build.sh delete mode 100755 .jenkins/caffe2/dirty.sh delete mode 100755 .jenkins/pytorch/dirty.sh diff --git a/.jenkins/caffe2/bench.sh b/.jenkins/caffe2/bench.sh deleted file mode 100755 index 55ac4e94df21..000000000000 --- a/.jenkins/caffe2/bench.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# shellcheck source=./common.sh -source "$(dirname "${BASH_SOURCE[0]}")/common.sh" - -# Anywhere except $ROOT_DIR should work. This is so the python import doesn't -# get confused by any 'caffe2' directory in cwd -cd "$INSTALL_PREFIX" - -if [[ $BUILD_ENVIRONMENT == *-cuda* ]]; then - num_gpus=$(nvidia-smi -L | wc -l) -elif [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then - num_gpus=$(rocminfo | grep 'Device Type.*GPU' | wc -l) -else - num_gpus=0 -fi - -caffe2_pypath="$(cd /usr && $PYTHON -c 'import os; import caffe2; print(os.path.dirname(os.path.realpath(caffe2.__file__)))')" -# Resnet50 -if (( $num_gpus == 0 )); then - "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --train_data null --batch_size 128 --epoch_size 12800 --num_epochs 2 --use_cpu -fi -if (( $num_gpus >= 1 )); then - "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --train_data null --batch_size 128 --epoch_size 12800 --num_epochs 2 --num_gpus 1 - # Let's skip the fp16 bench runs for now, as it recompiles the miopen kernels and can take 10+min to run. 
- # We can resume when we (1) bindmount the miopen cache folder in jenkins; (2) install the pre-compiled miopen kernel library in the docker - # "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --train_data null --batch_size 256 --epoch_size 25600 --num_epochs 2 --num_gpus 1 --float16_compute --dtype float16 -fi -if (( $num_gpus >= 4 )); then - "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --train_data null --batch_size 512 --epoch_size 51200 --num_epochs 2 --num_gpus 4 -fi - -# ResNext -if (( $num_gpus == 0 )); then - "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --resnext_num_groups 32 --resnext_width_per_group 4 --num_layers 101 --train_data null --batch_size 32 --epoch_size 3200 --num_epochs 2 --use_cpu -fi -if (( $num_gpus >= 1 )); then - "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --resnext_num_groups 32 --resnext_width_per_group 4 --num_layers 101 --train_data null --batch_size 32 --epoch_size 3200 --num_epochs 2 --num_gpus 1 - # "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --resnext_num_groups 32 --resnext_width_per_group 4 --num_layers 101 --train_data null --batch_size 64 --epoch_size 3200 --num_epochs 2 --num_gpus 1 --float16_compute --dtype float16 -fi -if (( $num_gpus >= 4 )); then - "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --resnext_num_groups 32 --resnext_width_per_group 4 --num_layers 101 --train_data null --batch_size 128 --epoch_size 12800 --num_epochs 2 --num_gpus 4 -fi - -# Shufflenet -if (( $num_gpus == 0 )); then - "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --train_data null --batch_size 32 --epoch_size 3200 --num_epochs 2 --use_cpu --model shufflenet -fi -if (( $num_gpus >= 1 )); then - "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --train_data null --batch_size 32 --epoch_size 3200 --num_epochs 2 --num_gpus 1 --model shufflenet -fi -if (( $num_gpus >= 4 )); then - "$PYTHON" "$caffe2_pypath/python/examples/imagenet_trainer.py" --train_data null --batch_size 128 --epoch_size 12800 --num_epochs 2 --num_gpus 4 --model shufflenet -fi diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh deleted file mode 100755 index e6e06c1d7db5..000000000000 --- a/.jenkins/caffe2/build.sh +++ /dev/null @@ -1,231 +0,0 @@ -#!/bin/bash - -set -ex - -# shellcheck source=./common.sh -source "$(dirname "${BASH_SOURCE[0]}")/common.sh" - -# CMAKE_ARGS are only passed to 'cmake' and the -Dfoo=bar does not work with -# setup.py, so we build a list of foo=bars and then either convert it to -# -Dfoo=bars or export them before running setup.py -build_args=() -build_to_cmake () { - cmake_args=() - for build_arg in $*; do - cmake_args+=("-D$build_arg") - done - echo ${cmake_args[@]} -} - - -SCCACHE="$(which sccache)" - -# Setup ccache if configured to use it (and not sccache) -if [ -z "${SCCACHE}" ] && which ccache > /dev/null; then - mkdir -p ./ccache - ln -sf "$(which ccache)" ./ccache/cc - ln -sf "$(which ccache)" ./ccache/c++ - ln -sf "$(which ccache)" ./ccache/gcc - ln -sf "$(which ccache)" ./ccache/g++ - ln -sf "$(which ccache)" ./ccache/x86_64-linux-gnu-gcc - if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then - mkdir -p ./ccache/cuda - ln -sf "$(which ccache)" ./ccache/cuda/nvcc - fi - export CACHE_WRAPPER_DIR="$PWD/ccache" - export PATH="$CACHE_WRAPPER_DIR:$PATH" -fi - -# sccache will fail for CUDA builds if all cores are used for compiling -if [ -z "$MAX_JOBS" ]; then - if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]] && [ -n "${SCCACHE}" ]; then 
- MAX_JOBS=`expr $(nproc) - 1` - else - MAX_JOBS=$(nproc) - fi -fi - -report_compile_cache_stats() { - if [[ -n "${SCCACHE}" ]]; then - "$SCCACHE" --show-stats - elif which ccache > /dev/null; then - ccache -s - fi -} - - -############################################################################### -# Use special scripts for Android and setup builds -############################################################################### -if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then - export ANDROID_NDK=/opt/ndk - build_args+=("BUILD_BINARY=ON") - build_args+=("BUILD_TEST=ON") - build_args+=("USE_OBSERVERS=ON") - build_args+=("USE_ZSTD=ON") - BUILD_CAFFE2_MOBILE=1 "${ROOT_DIR}/scripts/build_android.sh" $(build_to_cmake ${build_args[@]}) "$@" - exit 0 -fi - -############################################################################### -# Set parameters -############################################################################### -if [[ "$BUILD_ENVIRONMENT" == *cmake* ]]; then - build_args+=("BUILD_PYTHON=OFF") -else - build_args+=("BUILD_PYTHON=ON") - build_args+=("PYTHON_EXECUTABLE=${PYTHON}") -fi -if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then - build_args+=("BLAS=MKL") - build_args+=("USE_MKLDNN=ON") -fi -build_args+=("BUILD_BINARY=ON") -build_args+=("BUILD_TEST=ON") -build_args+=("INSTALL_TEST=ON") -build_args+=("USE_ZSTD=ON") - -if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then - build_args+=("USE_CUDA=ON") - build_args+=("USE_NNPACK=OFF") - - # Target only our CI GPU machine's CUDA arch to speed up the build - build_args+=("TORCH_CUDA_ARCH_LIST=Maxwell") - - # Explicitly set path to NVCC such that the symlink to ccache or sccache is used - if [ -n "${CACHE_WRAPPER_DIR}" ]; then - build_args+=("CUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/cuda/nvcc") - build_args+=("CMAKE_CUDA_COMPILER_LAUNCHER=${CACHE_WRAPPER_DIR}/ccache") - fi - - # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit. - # Setting PATH to resolve to the right nvcc alone isn't enough. - # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589. - export CUDA_PATH="/usr/local/cuda" - - # Ensure the ccache symlink can still find the real nvcc binary. - export PATH="/usr/local/cuda/bin:$PATH" -fi -if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then - if [[ -n "$CI" && -z "$PYTORCH_ROCM_ARCH" ]]; then - # Set ROCM_ARCH to gfx900 and gfx906 for CI builds, if user doesn't override. - echo "Limiting PYTORCH_ROCM_ARCH to gfx90[06] for CI builds" - export PYTORCH_ROCM_ARCH="gfx900;gfx906" - fi - # This is needed to enable ImageInput operator in resnet50_trainer - build_args+=("USE_OPENCV=ON") - # This is needed to read datasets from https://download.caffe2.ai/databases/resnet_trainer.zip - build_args+=("USE_LMDB=ON") - # hcc used to run out of memory, silently exiting without stopping - # the build process, leaving undefined symbols in the shared lib, - # causing undefined symbol errors when later running tests. - # We used to set MAX_JOBS to 4 to avoid, but this is no longer an issue. 
- if [ -z "$MAX_JOBS" ]; then - export MAX_JOBS=$(($(nproc) - 1)) - fi - - ########## HIPIFY Caffe2 operators - ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_amd.py" -fi - -# Try to include Redis support for Linux builds -if [ "$(uname)" == "Linux" ]; then - build_args+=("USE_REDIS=ON") -fi - -# Use a specialized onnx namespace in CI to catch hardcoded onnx namespace -build_args+=("ONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI") - -############################################################################### -# Configure and make -############################################################################### - -if [[ "$BUILD_ENVIRONMENT" == *cmake* ]]; then - # cmake-only non-setup.py build, to test cpp only bits. This installs into - # /usr/local/caffe2 and installs no Python tests - build_args+=("CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") - - # Run cmake from ./build_caffe2 directory so it doesn't conflict with - # standard PyTorch build directory. Eventually these won't need to - # be separate. - rm -rf build_caffe2 - mkdir build_caffe2 - cd ./build_caffe2 - - # We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04) - # and use that if so. - if [[ -x "$(command -v cmake3)" ]]; then - CMAKE_BINARY=cmake3 - else - CMAKE_BINARY=cmake - fi - - # Configure - ${CMAKE_BINARY} "${ROOT_DIR}" $(build_to_cmake ${build_args[@]}) "$@" - - # Build - if [ "$(uname)" == "Linux" ]; then - make "-j${MAX_JOBS}" install - else - echo "Don't know how to build on $(uname)" - exit 1 - fi - - # This is to save test binaries for testing - mv "$INSTALL_PREFIX/test/" "$INSTALL_PREFIX/cpp_test/" - - ls -lah $INSTALL_PREFIX - -else - # Python build. Uses setup.py to install into site-packages - build_args+=("USE_LEVELDB=ON") - build_args+=("USE_LMDB=ON") - build_args+=("USE_OPENCV=ON") - build_args+=("BUILD_TEST=ON") - # These flags preserve the flags that were used before this refactor (blame - # me) - build_args+=("USE_GLOG=ON") - build_args+=("USE_GFLAGS=ON") - build_args+=("USE_FBGEMM=OFF") - build_args+=("USE_MKLDNN=OFF") - build_args+=("USE_DISTRIBUTED=ON") - for build_arg in "${build_args[@]}"; do - export $build_arg - done - - # sccache will be stuck if all cores are used for compiling - # see https://github.com/pytorch/pytorch/pull/7361 - if [[ -n "${SCCACHE}" && $BUILD_ENVIRONMENT != *rocm* ]]; then - export MAX_JOBS=`expr $(nproc) - 1` - fi - - pip install --user dataclasses typing_extensions - - $PYTHON setup.py install --user - - report_compile_cache_stats -fi - -############################################################################### -# Install ONNX -############################################################################### - -# Install ONNX into a local directory -pip install --user "file://${ROOT_DIR}/third_party/onnx#egg=onnx" - -report_compile_cache_stats - -if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then - # remove sccache wrappers post-build; runtime compilation of MIOpen kernels does not yet fully support them - sudo rm -f /opt/cache/bin/cc - sudo rm -f /opt/cache/bin/c++ - sudo rm -f /opt/cache/bin/gcc - sudo rm -f /opt/cache/bin/g++ - pushd /opt/rocm/llvm/bin - if [[ -d original ]]; then - sudo mv original/clang . - sudo mv original/clang++ . 
- fi - sudo rm -rf original - popd -fi diff --git a/.jenkins/caffe2/dirty.sh b/.jenkins/caffe2/dirty.sh deleted file mode 100755 index 6b9ba544dab9..000000000000 --- a/.jenkins/caffe2/dirty.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -set -ex -upstream="$1" -pr="$2" -git diff --name-only "$upstream" "$pr" -# For safety, unconditionally trigger for any changes. -#git diff --name-only "$upstream" "$pr" | grep -Eq '^(CMakeLists.txt|Makefile|.gitmodules|.jenkins/caffe2|binaries|caffe|caffe2|cmake|conda|docker|docs/caffe2|modules|scripts|third_party)' diff --git a/.jenkins/pytorch/dirty.sh b/.jenkins/pytorch/dirty.sh deleted file mode 100755 index 230d69606664..000000000000 --- a/.jenkins/pytorch/dirty.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -set -ex -upstream="$1" -pr="$2" -git diff --name-only "$upstream" "$pr" -# Now that PyTorch build depends on Caffe2, unconditionally trigger -# for any changes. -# TODO: Replace this with a NEGATIVE regex that allows us to skip builds when they are unnecessary -#git diff --name-only "$upstream" "$pr" | grep -Eq '^(aten/|caffe2/|.jenkins/pytorch|docs/(make.bat|Makefile|requirements.txt|source)|mypy|requirements.txt|setup.py|test/|third_party/|tools/|\.gitmodules|torch/)' diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b6fedca3f71..379fa2fd7c7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,9 +165,6 @@ option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF) cmake_dependent_option( BUILD_CAFFE2_OPS "Build Caffe2 operators" ON "BUILD_CAFFE2" OFF) -cmake_dependent_option( - BUILD_CAFFE2_MOBILE "Build libcaffe2 for mobile (deprecating)" OFF - "BUILD_CAFFE2" OFF) option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON) cmake_dependent_option( CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON @@ -591,18 +588,11 @@ if(ANDROID OR IOS OR DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN}) endif() # INTERN_BUILD_ATEN_OPS is used to control whether to build ATen/TH operators. -# It's disabled for caffe2 mobile library. -if(INTERN_BUILD_MOBILE AND BUILD_CAFFE2_MOBILE) - set(INTERN_BUILD_ATEN_OPS OFF) -else() - set(INTERN_BUILD_ATEN_OPS ON) -endif() +set(INTERN_BUILD_ATEN_OPS ON) -# BUILD_CAFFE2_MOBILE is the master switch to choose between libcaffe2 v.s. libtorch mobile build. -# When it's enabled it builds original libcaffe2 mobile library without ATen/TH ops nor TorchScript support; -# When it's disabled it builds libtorch mobile library, which contains ATen/TH ops and native support for +# Build libtorch mobile library, which contains ATen/TH ops and native support for # TorchScript model, but doesn't contain not-yet-unified caffe2 ops; -if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE) +if(INTERN_BUILD_MOBILE) if(NOT BUILD_SHARED_LIBS AND NOT "${SELECTED_OP_LIST}" STREQUAL "") string(APPEND CMAKE_CXX_FLAGS " -DNO_EXPORT") endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a007cedbdcac..e2101017d99c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1246,13 +1246,6 @@ In 2018, we merged Caffe2 into the PyTorch source repository. While the steady state aspiration is that Caffe2 and PyTorch share code freely, in the meantime there will be some separation. -If you submit a PR to only PyTorch or only Caffe2 code, CI will only -run for the project you edited. The logic for this is implemented -in `.jenkins/pytorch/dirty.sh` and `.jenkins/caffe2/dirty.sh`; you -can look at this to see what path prefixes constitute changes. 
-This also means if you ADD a new top-level path, or you start -sharing code between projects, you need to modify these files. - There are a few "unusual" directories which, for historical reasons, are Caffe2/PyTorch specific. Here they are: diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt index b683ee002280..15f47bf52aee 100644 --- a/binaries/CMakeLists.txt +++ b/binaries/CMakeLists.txt @@ -1,13 +1,8 @@ if(INTERN_BUILD_MOBILE) - if(BUILD_CAFFE2_MOBILE) - #caffe2_binary_target("predictor_verifier.cc") - caffe2_binary_target("speed_benchmark.cc") - else() - caffe2_binary_target("speed_benchmark_torch.cc") - caffe2_binary_target("load_benchmark_torch.cc") - if(NOT BUILD_LITE_INTERPRETER) - caffe2_binary_target("compare_models_torch.cc") - endif() + caffe2_binary_target("speed_benchmark_torch.cc") + caffe2_binary_target("load_benchmark_torch.cc") + if(NOT BUILD_LITE_INTERPRETER) + caffe2_binary_target("compare_models_torch.cc") endif() return() endif() diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 584d550b2e87..ba24386487f5 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -22,7 +22,7 @@ endif() # OMP - OpenMP for intra-op, native thread pool for inter-op parallelism # NATIVE - using native thread pool for intra- and inter-op parallelism # TBB - using TBB for intra- and native thread pool for inter-op parallelism -if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE) +if(INTERN_BUILD_MOBILE) set(ATEN_THREADING "NATIVE" CACHE STRING "ATen parallel backend") else() if(USE_OPENMP) @@ -129,7 +129,7 @@ if(BUILD_CAFFE2 OR (NOT USE_FBGEMM)) endif() # Skip modules that are not used by libtorch mobile yet. -if(BUILD_CAFFE2 AND (NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE)) +if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE) add_subdirectory(contrib) add_subdirectory(predictor) add_subdirectory(predictor/emulator) @@ -166,7 +166,7 @@ if(BUILD_CAFFE2 AND (NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE)) # add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit add_subdirectory(transforms) endif() -if(NOT BUILD_CAFFE2 AND (NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE)) +if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE) add_subdirectory(proto) endif() @@ -269,7 +269,7 @@ if(PRINT_CMAKE_DEBUG_INFO) endif() -if(NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE) +if(NOT INTERN_BUILD_MOBILE) # ---[ List of libraries to link with add_library(caffe2_protos STATIC $) add_dependencies(caffe2_protos Caffe2_PROTO) @@ -326,441 +326,437 @@ if(NOT TORCH_INSTALL_LIB_DIR) set(TORCH_INSTALL_LIB_DIR lib) endif() +set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) +# Generate files +set(TOOLS_PATH "${TORCH_ROOT}/tools") -if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) - set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) - - # Generate files - set(TOOLS_PATH "${TORCH_ROOT}/tools") - - configure_file("${TORCH_SRC_DIR}/_utils_internal.py" - "${TOOLS_PATH}/shared/_utils_internal.py" - COPYONLY) +configure_file("${TORCH_SRC_DIR}/_utils_internal.py" + "${TOOLS_PATH}/shared/_utils_internal.py" + COPYONLY) - # Generate header with version info - configure_file("${TORCH_SRC_DIR}/csrc/api/include/torch/version.h.in" - "${TORCH_SRC_DIR}/csrc/api/include/torch/version.h" - @ONLY) +# Generate header with version info +configure_file("${TORCH_SRC_DIR}/csrc/api/include/torch/version.h.in" + "${TORCH_SRC_DIR}/csrc/api/include/torch/version.h" + @ONLY) - set(GENERATED_CXX_TORCH - "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp" - ) 
+set(GENERATED_CXX_TORCH + "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp" + ) - if(NOT INTERN_DISABLE_AUTOGRAD AND NOT BUILD_LITE_INTERPRETER) +if(NOT INTERN_DISABLE_AUTOGRAD AND NOT BUILD_LITE_INTERPRETER) + list(APPEND GENERATED_CXX_TORCH + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_0.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_1.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_2.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_3.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_4.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_0.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_1.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_2.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_3.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_4.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_0.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_1.cpp" + ) + if(BUILD_LAZY_TS_BACKEND) list(APPEND GENERATED_CXX_TORCH - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_0.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_1.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_2.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_3.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_4.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_0.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_1.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_2.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_3.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_4.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_0.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_1.cpp" + "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.cpp" + "${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterAutogradLazy.cpp" + "${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterLazy.cpp" ) - if(BUILD_LAZY_TS_BACKEND) - list(APPEND GENERATED_CXX_TORCH - "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.cpp" - "${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterAutogradLazy.cpp" - "${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterLazy.cpp" - ) - endif() endif() +endif() - set(GENERATED_H_TORCH - "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/variable_factories.h" - ) +set(GENERATED_H_TORCH + "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/variable_factories.h" + ) - if(NOT INTERN_DISABLE_AUTOGRAD) - list(APPEND GENERATED_H_TORCH - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.h" - "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyIr.h" - "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNonNativeIr.h" - "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.h" - ) - endif() +if(NOT INTERN_DISABLE_AUTOGRAD) + list(APPEND GENERATED_H_TORCH + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.h" + "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyIr.h" + "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNonNativeIr.h" + "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.h" + ) +endif() - set(GENERATED_CXX_PYTHON - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_0.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_1.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_2.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_3.cpp" - 
"${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_4.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_0.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_1.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_2.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_fft_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_linalg_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_sparse_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_special_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_return_types.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_enum_tag.cpp" - ) +set(GENERATED_CXX_PYTHON + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_0.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_1.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_2.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_3.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions_4.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_0.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_1.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_2.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_fft_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_linalg_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_sparse_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_special_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_return_types.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_enum_tag.cpp" + ) - set(GENERATED_H_PYTHON - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.h" - ) +set(GENERATED_H_PYTHON + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.h" + ) - set(GENERATED_TESTING_PYTHON - "${TORCH_SRC_DIR}/testing/_internal/generated/annotated_fn_args.py" - ) +set(GENERATED_TESTING_PYTHON + "${TORCH_SRC_DIR}/testing/_internal/generated/annotated_fn_args.py" + ) - set(TORCH_GENERATED_CODE - ${GENERATED_CXX_TORCH} - ${GENERATED_H_TORCH} - ${GENERATED_CXX_PYTHON} - ${GENERATED_H_PYTHON} - ${GENERATED_TESTING_PYTHON} - ) +set(TORCH_GENERATED_CODE + ${GENERATED_CXX_TORCH} + ${GENERATED_H_TORCH} + ${GENERATED_CXX_PYTHON} + ${GENERATED_H_PYTHON} + ${GENERATED_TESTING_PYTHON} + ) - set(GEN_PER_OPERATOR_FLAG) - if(USE_PER_OPERATOR_HEADERS) - list(APPEND GEN_PER_OPERATOR_FLAG "--per_operator_headers") - endif() - - file(GLOB_RECURSE autograd_python "${TOOLS_PATH}/autograd/*.py") - file(GLOB_RECURSE autograd_yaml "${TOOLS_PATH}/autograd/*.yaml") - file(GLOB_RECURSE autograd_templates "${TOOLS_PATH}/autograd/templates/*") - add_custom_command( - OUTPUT - ${TORCH_GENERATED_CODE} - COMMAND - "${PYTHON_EXECUTABLE}" tools/setup_helpers/generate_code.py - --native-functions-path "aten/src/ATen/native/native_functions.yaml" - --tags-path "aten/src/ATen/native/tags.yaml" - $<$:--disable-autograd> - $<$:--selected-op-list-path="${SELECTED_OP_LIST}"> - --force_schema_registration - --gen_lazy_ts_backend - ${GEN_PER_OPERATOR_FLAG} - DEPENDS - "${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml" - 
"${TORCH_ROOT}/aten/src/ATen/native/tags.yaml" - "${TORCH_ROOT}/aten/src/ATen/native/ts_native_functions.yaml" - "${TORCH_ROOT}/torch/csrc/lazy/core/shape_inference.h" - "${TORCH_ROOT}/torch/csrc/lazy/ts_backend/ts_native_functions.cpp" - "${TORCH_ROOT}/aten/src/ATen/templates/DispatchKeyNativeFunctions.h" - "${TORCH_ROOT}/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp" - "${TORCH_ROOT}/aten/src/ATen/templates/LazyIr.h" - "${TORCH_ROOT}/aten/src/ATen/templates/LazyNonNativeIr.h" - "${TORCH_ROOT}/aten/src/ATen/templates/RegisterDispatchKey.cpp" - ${autograd_python} - ${autograd_yaml} - ${autograd_templates} - ${torchgen_python} - WORKING_DIRECTORY "${TORCH_ROOT}") - - - # Required workaround for libtorch_python.so build - # see https://samthursfield.wordpress.com/2015/11/21/cmake-dependencies-between-targets-and-files-and-custom-commands/#custom-commands-in-different-directories - add_custom_target( - generate-torch-sources - DEPENDS ${TORCH_GENERATED_CODE} - ) +set(GEN_PER_OPERATOR_FLAG) +if(USE_PER_OPERATOR_HEADERS) + list(APPEND GEN_PER_OPERATOR_FLAG "--per_operator_headers") +endif() - set(TORCH_SRCS ${GENERATED_CXX_TORCH}) - list(APPEND TORCH_SRCS ${GENERATED_H_TORCH}) - list(APPEND LIBTORCH_CMAKE_SRCS "") - - list(APPEND LITE_EAGER_SYMOBLICATION_SRCS "") - if(USE_SOURCE_DEBUG_ON_MOBILE) - append_filelist("libtorch_lite_eager_symbolication" LITE_EAGER_SYMOBLICATION_SRCS) - # For source debug on lite interpreter, we have to add dependency on pickling - # but references to read/writeArchiveAndTensor is not built for mobile - # so this condition specifically says we are building for source debug - # on mobile. - if(BUILD_LITE_INTERPRETER) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/serialization/pickle.cpp PROPERTIES COMPILE_FLAGS "-DC10_MOBILE -DFEATURE_TORCH_MOBILE") - endif() - endif() +file(GLOB_RECURSE autograd_python "${TOOLS_PATH}/autograd/*.py") +file(GLOB_RECURSE autograd_yaml "${TOOLS_PATH}/autograd/*.yaml") +file(GLOB_RECURSE autograd_templates "${TOOLS_PATH}/autograd/templates/*") +add_custom_command( + OUTPUT + ${TORCH_GENERATED_CODE} + COMMAND + "${PYTHON_EXECUTABLE}" tools/setup_helpers/generate_code.py + --native-functions-path "aten/src/ATen/native/native_functions.yaml" + --tags-path "aten/src/ATen/native/tags.yaml" + $<$:--disable-autograd> + $<$:--selected-op-list-path="${SELECTED_OP_LIST}"> + --force_schema_registration + --gen_lazy_ts_backend + ${GEN_PER_OPERATOR_FLAG} + DEPENDS + "${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml" + "${TORCH_ROOT}/aten/src/ATen/native/tags.yaml" + "${TORCH_ROOT}/aten/src/ATen/native/ts_native_functions.yaml" + "${TORCH_ROOT}/torch/csrc/lazy/core/shape_inference.h" + "${TORCH_ROOT}/torch/csrc/lazy/ts_backend/ts_native_functions.cpp" + "${TORCH_ROOT}/aten/src/ATen/templates/DispatchKeyNativeFunctions.h" + "${TORCH_ROOT}/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp" + "${TORCH_ROOT}/aten/src/ATen/templates/LazyIr.h" + "${TORCH_ROOT}/aten/src/ATen/templates/LazyNonNativeIr.h" + "${TORCH_ROOT}/aten/src/ATen/templates/RegisterDispatchKey.cpp" + ${autograd_python} + ${autograd_yaml} + ${autograd_templates} + ${torchgen_python} + WORKING_DIRECTORY "${TORCH_ROOT}") + + +# Required workaround for libtorch_python.so build +# see https://samthursfield.wordpress.com/2015/11/21/cmake-dependencies-between-targets-and-files-and-custom-commands/#custom-commands-in-different-directories +add_custom_target( + generate-torch-sources + DEPENDS ${TORCH_GENERATED_CODE} + ) - list(APPEND LITE_PROFILER_SRCS "") - 
if(USE_LITE_INTERPRETER_PROFILER) - append_filelist("libtorch_edge_profiler_sources " LITE_PROFILER_SRCS) +set(TORCH_SRCS ${GENERATED_CXX_TORCH}) +list(APPEND TORCH_SRCS ${GENERATED_H_TORCH}) +list(APPEND LIBTORCH_CMAKE_SRCS "") + +list(APPEND LITE_EAGER_SYMOBLICATION_SRCS "") +if(USE_SOURCE_DEBUG_ON_MOBILE) + append_filelist("libtorch_lite_eager_symbolication" LITE_EAGER_SYMOBLICATION_SRCS) + # For source debug on lite interpreter, we have to add dependency on pickling + # but references to read/writeArchiveAndTensor is not built for mobile + # so this condition specifically says we are building for source debug + # on mobile. + if(BUILD_LITE_INTERPRETER) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/serialization/pickle.cpp PROPERTIES COMPILE_FLAGS "-DC10_MOBILE -DFEATURE_TORCH_MOBILE") endif() +endif() - # Switch between the full jit interpreter and lite interpreter - if(BUILD_LITE_INTERPRETER) - append_filelist("libtorch_lite_cmake_sources" LIBTORCH_CMAKE_SRCS) - list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS}) - list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_PROFILER_SRCS}) - set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) - else() - append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS) - if(BUILD_LAZY_TS_BACKEND) - append_filelist("lazy_tensor_ts_sources" LIBTORCH_CMAKE_SRCS) - endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - # TODO: Delete this line once https://github.com/pytorch/pytorch/pull/55889 lands - set_source_files_properties(../torch/csrc/jit/serialization/export.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) +list(APPEND LITE_PROFILER_SRCS "") +if(USE_LITE_INTERPRETER_PROFILER) + append_filelist("libtorch_edge_profiler_sources " LITE_PROFILER_SRCS) +endif() - # TODO: Delete this when https://github.com/pytorch/pytorch/issues/35026 is fixed - set_source_files_properties(../torch/csrc/autograd/record_function_ops.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) - endif() +# Switch between the full jit interpreter and lite interpreter +if(BUILD_LITE_INTERPRETER) + append_filelist("libtorch_lite_cmake_sources" LIBTORCH_CMAKE_SRCS) + list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS}) + list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_PROFILER_SRCS}) + set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) +else() + append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS) + if(BUILD_LAZY_TS_BACKEND) + append_filelist("lazy_tensor_ts_sources" LIBTORCH_CMAKE_SRCS) endif() - list(APPEND TORCH_SRCS ${LIBTORCH_CMAKE_SRCS}) + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # TODO: Delete this line once https://github.com/pytorch/pytorch/pull/55889 lands + set_source_files_properties(../torch/csrc/jit/serialization/export.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) - if(PRINT_CMAKE_DEBUG_INFO) - message(STATUS "Interpreter sources: ") - foreach(tmp ${LIBTORCH_CMAKE_SRCS}) - message(STATUS " " ${tmp}) - endforeach() + # TODO: Delete this when https://github.com/pytorch/pytorch/issues/35026 is fixed + set_source_files_properties(../torch/csrc/autograd/record_function_ops.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) endif() +endif() +list(APPEND TORCH_SRCS ${LIBTORCH_CMAKE_SRCS}) - # Mobile backend delegate srcs - if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE) - set(DELEGATE_SRCS - ${TORCH_SRC_DIR}/csrc/jit/backends/backend_debug_info.cpp - ${TORCH_SRC_DIR}/csrc/jit/backends/backend_interface.cpp +if(PRINT_CMAKE_DEBUG_INFO) + 
message(STATUS "Interpreter sources: ") + foreach(tmp ${LIBTORCH_CMAKE_SRCS}) + message(STATUS " " ${tmp}) + endforeach() +endif() + +# Mobile backend delegate srcs +if(INTERN_BUILD_MOBILE) + set(DELEGATE_SRCS + ${TORCH_SRC_DIR}/csrc/jit/backends/backend_debug_info.cpp + ${TORCH_SRC_DIR}/csrc/jit/backends/backend_interface.cpp + ) + list(APPEND TORCH_SRCS ${DELEGATE_SRCS}) + if(IOS AND USE_COREML_DELEGATE) + set(COREML_DELEGATE_SRCS + ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/cpp/context.cpp + ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm + ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm + ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLCompiler.mm + ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLFeatureProvider.mm ) - list(APPEND TORCH_SRCS ${DELEGATE_SRCS}) - if(IOS AND USE_COREML_DELEGATE) - set(COREML_DELEGATE_SRCS - ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/cpp/context.cpp - ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm - ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm - ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLCompiler.mm - ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLFeatureProvider.mm - ) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm PROPERTIES COMPILE_FLAGS "-fno-objc-arc") - include_directories(${TORCH_ROOT}/third_party/nlohmann/single_include) - list(APPEND TORCH_SRCS ${COREML_DELEGATE_SRCS}) - endif() - endif() + set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm PROPERTIES COMPILE_FLAGS "-fno-objc-arc") + include_directories(${TORCH_ROOT}/third_party/nlohmann/single_include) + list(APPEND TORCH_SRCS ${COREML_DELEGATE_SRCS}) + endif() +endif() + +# Required workaround for LLVM 9 includes. 
+if(NOT MSVC) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/tensorexpr/llvm_jit.cpp PROPERTIES COMPILE_FLAGS -Wno-noexcept-type) + # Force -Werror on several files + set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/mkldnn/Pooling.cpp PROPERTIES COMPILE_FLAGS "-Werror") +endif() +# Disable certain warnings for GCC-9.X +if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + # See https://github.com/pytorch/pytorch/issues/38856 + set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/tensorexpr/llvm_jit.cpp PROPERTIES COMPILE_FLAGS "-Wno-redundant-move -Wno-noexcept-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/tensorexpr/llvm_codegen.cpp PROPERTIES COMPILE_FLAGS "-Wno-init-list-lifetime") +endif() + +if(NOT INTERN_DISABLE_MOBILE_INTERP) + set(MOBILE_SRCS + ${TORCH_SRC_DIR}/csrc/jit/mobile/function.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/import.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/import_data.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/interpreter.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/compatibility/model_compatibility.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/module.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/flatbuffer_loader.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/observer.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/parse_bytecode.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/parse_operators.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/quantization.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/train/export_data.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/train/optim/sgd.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/train/random.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/train/sequential.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/upgrader_mobile.cpp + ) + list(APPEND TORCH_SRCS ${MOBILE_SRCS}) + list(APPEND TORCH_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS}) +endif() + +# This one needs to be unconditionally added as Functions.cpp is also unconditionally added +list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/autograd/FunctionsManual.cpp + ${TORCH_SRC_DIR}/csrc/utils/out_types.cpp +) + +if(NOT INTERN_DISABLE_AUTOGRAD AND NOT BUILD_LITE_INTERPRETER) + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/autograd/TraceTypeManual.cpp + ${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp + ) +endif() - # Required workaround for LLVM 9 includes. 
- if(NOT MSVC) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/tensorexpr/llvm_jit.cpp PROPERTIES COMPILE_FLAGS -Wno-noexcept-type) - # Force -Werror on several files - set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/mkldnn/Pooling.cpp PROPERTIES COMPILE_FLAGS "-Werror") - endif() - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - # See https://github.com/pytorch/pytorch/issues/38856 - set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/tensorexpr/llvm_jit.cpp PROPERTIES COMPILE_FLAGS "-Wno-redundant-move -Wno-noexcept-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/tensorexpr/llvm_codegen.cpp PROPERTIES COMPILE_FLAGS "-Wno-init-list-lifetime") - endif() - - if(NOT INTERN_DISABLE_MOBILE_INTERP) - set(MOBILE_SRCS - ${TORCH_SRC_DIR}/csrc/jit/mobile/function.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/import.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/import_data.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/interpreter.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/compatibility/model_compatibility.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/module.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/flatbuffer_loader.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/observer.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/parse_bytecode.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/parse_operators.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/quantization.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/train/export_data.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/train/optim/sgd.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/train/random.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/train/sequential.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/upgrader_mobile.cpp - ) - list(APPEND TORCH_SRCS ${MOBILE_SRCS}) - list(APPEND TORCH_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS}) - endif() - - # This one needs to be unconditionally added as Functions.cpp is also unconditionally added +if(${USE_ITT}) list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/autograd/FunctionsManual.cpp - ${TORCH_SRC_DIR}/csrc/utils/out_types.cpp + ${TORCH_SRC_DIR}/csrc/itt_wrapper.cpp + ${TORCH_SRC_DIR}/csrc/profiler/itt.cpp ) +endif() - if(NOT INTERN_DISABLE_AUTOGRAD AND NOT BUILD_LITE_INTERPRETER) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/autograd/TraceTypeManual.cpp - ${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp - ) - endif() +if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER) + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/compatibility/backport.cpp + ${TORCH_SRC_DIR}/csrc/jit/mobile/compatibility/backport_manager.cpp + ${TORCH_SRC_DIR}/csrc/jit/serialization/onnx.cpp + ${TORCH_SRC_DIR}/csrc/jit/serialization/export.cpp + ${TORCH_SRC_DIR}/csrc/jit/serialization/export_bytecode.cpp + ${TORCH_SRC_DIR}/csrc/jit/serialization/export_module.cpp + ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer.cpp + ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer_jit.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp + ${TORCH_SRC_DIR}/csrc/jit/api/module_save.cpp + ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp + ) - if(${USE_ITT}) + # Disable legacy import of building without Caffe2 support + if(BUILD_CAFFE2) list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/itt_wrapper.cpp - ${TORCH_SRC_DIR}/csrc/profiler/itt.cpp + ${TORCH_SRC_DIR}/csrc/jit/serialization/import_legacy.cpp ) - endif() - - if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp - 
${TORCH_SRC_DIR}/csrc/jit/mobile/compatibility/backport.cpp - ${TORCH_SRC_DIR}/csrc/jit/mobile/compatibility/backport_manager.cpp - ${TORCH_SRC_DIR}/csrc/jit/serialization/onnx.cpp - ${TORCH_SRC_DIR}/csrc/jit/serialization/export.cpp - ${TORCH_SRC_DIR}/csrc/jit/serialization/export_bytecode.cpp - ${TORCH_SRC_DIR}/csrc/jit/serialization/export_module.cpp - ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer.cpp - ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer_jit.cpp - ${TORCH_SRC_DIR}/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp - ${TORCH_SRC_DIR}/csrc/jit/api/module_save.cpp - ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp + else() + set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp + PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" ) - - # Disable legacy import of building without Caffe2 support - if(BUILD_CAFFE2) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/jit/serialization/import_legacy.cpp - ) - else() - set_source_files_properties( - ${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp - PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" - ) - endif() - if(USE_DISTRIBUTED) - append_filelist("libtorch_distributed_base_sources" TORCH_SRCS) - if(NOT WIN32) - append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS) - endif() + endif() + if(USE_DISTRIBUTED) + append_filelist("libtorch_distributed_base_sources" TORCH_SRCS) + if(NOT WIN32) + append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS) endif() endif() +endif() - if(USE_CUDA OR USE_ROCM) - append_filelist("libtorch_cuda_core_sources" Caffe2_GPU_HIP_JIT_FUSERS_SRCS) - endif() +if(USE_CUDA OR USE_ROCM) + append_filelist("libtorch_cuda_core_sources" Caffe2_GPU_HIP_JIT_FUSERS_SRCS) +endif() - if(USE_CUDA) - list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS}) - add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS}) - if(MSVC) - # Delay load nvcuda.dll so we can import torch compiled with cuda on a CPU-only machine - set(DELAY_LOAD_FLAGS "-DELAYLOAD:nvcuda.dll;delayimp.lib") - else() - set(DELAY_LOAD_FLAGS "") - endif() - target_link_libraries(caffe2_nvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB} ${DELAY_LOAD_FLAGS}) - target_include_directories(caffe2_nvrtc PRIVATE ${CUDA_INCLUDE_DIRS}) - install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") - if(USE_NCCL) - list(APPEND Caffe2_GPU_SRCS - ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) - endif() - if(USE_DISTRIBUTED) - append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS) - if(NOT WIN32) - append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS) - endif() - endif() - set_source_files_properties( - ${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp - PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}" - ) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/passes/frozen_conv_add_relu_fusion.cpp PROPERTIES COMPILE_FLAGS "-DUSE_CUDA=1") - endif() - - if(BUILD_ONEDNN_GRAPH) - list(APPEND Caffe2_CPU_SRCS - ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp - ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_fuser.cpp - ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_rewriter.cpp - ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_helper.cpp - ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/register_interface.cpp - ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/interface.cpp - ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/kernel.cpp - ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/defer_size_check.cpp - 
${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/layout_propagation.cpp - ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/prepare_binary.cpp - ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/guard_shape.cpp - ) +if(USE_CUDA) + list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS}) + add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS}) + if(MSVC) + # Delay load nvcuda.dll so we can import torch compiled with cuda on a CPU-only machine + set(DELAY_LOAD_FLAGS "-DELAYLOAD:nvcuda.dll;delayimp.lib") + else() + set(DELAY_LOAD_FLAGS "") endif() - - if(USE_ROCM) - list(APPEND Caffe2_HIP_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS}) - if(USE_NCCL) - list(APPEND Caffe2_HIP_SRCS - ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) - endif() - if(USE_DISTRIBUTED) - append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS) - if(NOT WIN32) - append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS) - endif() + target_link_libraries(caffe2_nvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB} ${DELAY_LOAD_FLAGS}) + target_include_directories(caffe2_nvrtc PRIVATE ${CUDA_INCLUDE_DIRS}) + install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") + if(USE_NCCL) + list(APPEND Caffe2_GPU_SRCS + ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) + endif() + if(USE_DISTRIBUTED) + append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS) + if(NOT WIN32) + append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS) endif() - # caffe2_nvrtc's stubs to driver APIs are useful for HIP. - # See NOTE [ ATen NVRTC Stub and HIP ] - add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS}) - target_link_libraries(caffe2_nvrtc ${PYTORCH_HIP_HCC_LIBRARIES} ${ROCM_HIPRTC_LIB}) - target_compile_definitions(caffe2_nvrtc PRIVATE USE_ROCM __HIP_PLATFORM_HCC__) - install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") endif() + set_source_files_properties( + ${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp + PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}" + ) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/passes/frozen_conv_add_relu_fusion.cpp PROPERTIES COMPILE_FLAGS "-DUSE_CUDA=1") +endif() + +if(BUILD_ONEDNN_GRAPH) + list(APPEND Caffe2_CPU_SRCS + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_fuser.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_rewriter.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_helper.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/register_interface.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/interface.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/kernel.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/defer_size_check.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/layout_propagation.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/prepare_binary.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/guard_shape.cpp + ) +endif() - if(NOT NO_API AND NOT BUILD_LITE_INTERPRETER) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/datasets/mnist.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/distributed.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/random.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/sequential.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/stream.cpp - ${TORCH_SRC_DIR}/csrc/api/src/enum.cpp - ${TORCH_SRC_DIR}/csrc/api/src/imethod.cpp - ${TORCH_SRC_DIR}/csrc/api/src/serialize.cpp - ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/init.cpp - 
${TORCH_SRC_DIR}/csrc/api/src/nn/module.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/_functions.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/activation.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/adaptive.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/batchnorm.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/normalization.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/instancenorm.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/conv.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/dropout.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/distance.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/embedding.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/fold.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/linear.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/loss.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/padding.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/pixelshuffle.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/pooling.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/rnn.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/upsampling.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/transformer.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/container/functional.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/activation.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/adaptive.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/batchnorm.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/embedding.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/instancenorm.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/normalization.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/conv.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/dropout.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/linear.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/padding.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/pooling.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/rnn.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/vision.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/options/transformer.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/adagrad.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/adam.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/adamw.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/lbfgs.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/optimizer.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/rmsprop.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/serialize.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/sgd.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/schedulers/lr_scheduler.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/schedulers/step_lr.cpp - ${TORCH_SRC_DIR}/csrc/api/src/serialize/input-archive.cpp - ${TORCH_SRC_DIR}/csrc/api/src/serialize/output-archive.cpp - ) +if(USE_ROCM) + list(APPEND Caffe2_HIP_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS}) + if(USE_NCCL) + list(APPEND Caffe2_HIP_SRCS + ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) + endif() + if(USE_DISTRIBUTED) + append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS) + if(NOT WIN32) + append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS) + endif() endif() + # caffe2_nvrtc's stubs to driver APIs are useful for HIP. 
+ # See NOTE [ ATen NVRTC Stub and HIP ] + add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS}) + target_link_libraries(caffe2_nvrtc ${PYTORCH_HIP_HCC_LIBRARIES} ${ROCM_HIPRTC_LIB}) + target_compile_definitions(caffe2_nvrtc PRIVATE USE_ROCM __HIP_PLATFORM_HCC__) + install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") +endif() - list(APPEND Caffe2_CPU_SRCS ${TORCH_SRCS}) +if(NOT NO_API AND NOT BUILD_LITE_INTERPRETER) + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/datasets/mnist.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/distributed.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/random.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/sequential.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/stream.cpp + ${TORCH_SRC_DIR}/csrc/api/src/enum.cpp + ${TORCH_SRC_DIR}/csrc/api/src/imethod.cpp + ${TORCH_SRC_DIR}/csrc/api/src/serialize.cpp + ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/init.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/module.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/_functions.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/activation.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/adaptive.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/batchnorm.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/normalization.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/instancenorm.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/conv.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/dropout.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/distance.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/embedding.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/fold.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/linear.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/loss.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/padding.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/pixelshuffle.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/pooling.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/rnn.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/upsampling.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/transformer.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/container/functional.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/activation.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/adaptive.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/batchnorm.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/embedding.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/instancenorm.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/normalization.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/conv.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/dropout.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/linear.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/padding.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/pooling.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/rnn.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/vision.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/options/transformer.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/adagrad.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/adam.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/adamw.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/lbfgs.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/optimizer.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/rmsprop.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/serialize.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/sgd.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/schedulers/lr_scheduler.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/schedulers/step_lr.cpp + 
${TORCH_SRC_DIR}/csrc/api/src/serialize/input-archive.cpp + ${TORCH_SRC_DIR}/csrc/api/src/serialize/output-archive.cpp + ) endif() +list(APPEND Caffe2_CPU_SRCS ${TORCH_SRCS}) + if(USE_MPS) list(APPEND Caffe2_CPU_SRCS ${Caffe2_MPS_SRCS}) endif() @@ -1079,47 +1075,46 @@ if(BUILD_LITE_INTERPRETER AND SELECTED_OP_LIST) add_dependencies(torch_cpu __selected_mobile_ops_header_gen) endif() -if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) - if(NOT NO_API) - target_include_directories(torch_cpu PRIVATE - ${TORCH_SRC_DIR}/csrc/api - ${TORCH_SRC_DIR}/csrc/api/include) - endif() - - if(BUILD_SPLIT_CUDA AND MSVC) - # -INCLUDE is used to ensure torch_cuda_cpp/cu are linked against in a project that relies on them. - target_link_libraries(torch_cuda_cpp INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ") - # See [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] in native_functions.yaml - target_link_libraries(torch_cuda_cu INTERFACE "-INCLUDE:?_torch_cuda_cu_linker_symbol_op_cuda@native@at@@YA?AVTensor@2@AEBV32@@Z") - elseif(USE_CUDA AND MSVC) - # -INCLUDE is used to ensure torch_cuda is linked against in a project that relies on them. - # Related issue: https://github.com/pytorch/pytorch/issues/31611 - target_link_libraries(torch_cuda INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ") - endif() - - if(NOT BUILD_LITE_INTERPRETER) - set(TH_CPU_INCLUDE - # dense - aten/src/TH - ${CMAKE_CURRENT_BINARY_DIR}/aten/src/TH - ${TORCH_ROOT}/aten/src - ${CMAKE_CURRENT_BINARY_DIR}/aten/src - - ${CMAKE_BINARY_DIR}/aten/src) - target_include_directories(torch_cpu PRIVATE ${TH_CPU_INCLUDE}) - endif() - - set(ATen_CPU_INCLUDE +if(NOT NO_API) + target_include_directories(torch_cpu PRIVATE + ${TORCH_SRC_DIR}/csrc/api + ${TORCH_SRC_DIR}/csrc/api/include) +endif() + +if(BUILD_SPLIT_CUDA AND MSVC) + # -INCLUDE is used to ensure torch_cuda_cpp/cu are linked against in a project that relies on them. + target_link_libraries(torch_cuda_cpp INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ") + # See [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] in native_functions.yaml + target_link_libraries(torch_cuda_cu INTERFACE "-INCLUDE:?_torch_cuda_cu_linker_symbol_op_cuda@native@at@@YA?AVTensor@2@AEBV32@@Z") +elseif(USE_CUDA AND MSVC) + # -INCLUDE is used to ensure torch_cuda is linked against in a project that relies on them. 
+ # Related issue: https://github.com/pytorch/pytorch/issues/31611 + target_link_libraries(torch_cuda INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ") +endif() + +if(NOT BUILD_LITE_INTERPRETER) + set(TH_CPU_INCLUDE + # dense + aten/src/TH + ${CMAKE_CURRENT_BINARY_DIR}/aten/src/TH ${TORCH_ROOT}/aten/src - ${CMAKE_CURRENT_BINARY_DIR}/../aten/src + ${CMAKE_CURRENT_BINARY_DIR}/aten/src + ${CMAKE_BINARY_DIR}/aten/src) + target_include_directories(torch_cpu PRIVATE ${TH_CPU_INCLUDE}) +endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/QuantizedLinear.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/RNN.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/quantized/qlinear_unpack.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) - endif() +set(ATen_CPU_INCLUDE + ${TORCH_ROOT}/aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src + ${CMAKE_BINARY_DIR}/aten/src) + +if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/QuantizedLinear.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/RNN.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/quantized/qlinear_unpack.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) +endif() if(USE_TBB) list(APPEND ATen_CPU_INCLUDE ${TBB_INCLUDE_DIR}) @@ -1131,135 +1126,128 @@ if(BUILD_CAFFE2 AND BUILD_CAFFE2_OPS AND USE_FBGEMM) target_include_directories(torch_cpu PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../third_party) endif() - target_include_directories(torch_cpu PRIVATE ${ATen_CPU_INCLUDE}) +target_include_directories(torch_cpu PRIVATE ${ATen_CPU_INCLUDE}) - target_include_directories(torch_cpu PRIVATE - ${TORCH_SRC_DIR}/csrc) +target_include_directories(torch_cpu PRIVATE + ${TORCH_SRC_DIR}/csrc) - target_include_directories(torch_cpu PRIVATE - ${TORCH_ROOT}/third_party/miniz-2.1.0) - - target_include_directories(torch_cpu PRIVATE - ${TORCH_ROOT}/third_party/kineto/libkineto/include) +target_include_directories(torch_cpu PRIVATE + ${TORCH_ROOT}/third_party/miniz-2.1.0) - if(USE_KINETO) - target_include_directories(torch_cpu PRIVATE - ${TORCH_ROOT}/third_party/kineto/libkineto/src) - endif() +target_include_directories(torch_cpu PRIVATE + ${TORCH_ROOT}/third_party/kineto/libkineto/include) - install(DIRECTORY "${TORCH_SRC_DIR}/csrc" - DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch - FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp") - install(DIRECTORY "${TORCH_SRC_DIR}/csrc/distributed/c10d" - DESTINATION ${TORCH_INSTALL_INCLUDE_DIR} - FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp") +if(USE_KINETO) + target_include_directories(torch_cpu PRIVATE + ${TORCH_ROOT}/third_party/kineto/libkineto/src) +endif() + +install(DIRECTORY "${TORCH_SRC_DIR}/csrc" + 
DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp") +install(DIRECTORY "${TORCH_SRC_DIR}/csrc/distributed/c10d" + DESTINATION ${TORCH_INSTALL_INCLUDE_DIR} + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp") +install(FILES + "${TORCH_SRC_DIR}/script.h" + "${TORCH_SRC_DIR}/extension.h" + "${TORCH_SRC_DIR}/custom_class.h" + "${TORCH_SRC_DIR}/library.h" + "${TORCH_SRC_DIR}/custom_class_detail.h" + DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) +if(USE_DEPLOY) install(FILES - "${TORCH_SRC_DIR}/script.h" - "${TORCH_SRC_DIR}/extension.h" - "${TORCH_SRC_DIR}/custom_class.h" - "${TORCH_SRC_DIR}/library.h" - "${TORCH_SRC_DIR}/custom_class_detail.h" + "${TORCH_SRC_DIR}/deploy.h" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) - if(USE_DEPLOY) - install(FILES - "${TORCH_SRC_DIR}/deploy.h" - DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) - endif() +endif() - if(BUILD_TEST) - if(BUILD_LITE_INTERPRETER) - add_subdirectory( - ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime - ${CMAKE_BINARY_DIR}/test_lite_interpreter_runtime - ) - add_subdirectory( - ${TORCH_ROOT}/test/mobile/lightweight_dispatch - ${CMAKE_BINARY_DIR}/test_codegen_unboxing - ) - else() - add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) - add_subdirectory( - ${TORCH_ROOT}/test/cpp/tensorexpr - ${CMAKE_BINARY_DIR}/test_tensorexpr - ) - if(USE_DISTRIBUTED) - add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d) - if(NOT WIN32) - add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd) - add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc) - endif() - endif() - if(NOT NO_API) - add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api) +if(BUILD_TEST) + if(BUILD_LITE_INTERPRETER) + add_subdirectory( + ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime + ${CMAKE_BINARY_DIR}/test_lite_interpreter_runtime + ) + add_subdirectory( + ${TORCH_ROOT}/test/mobile/lightweight_dispatch + ${CMAKE_BINARY_DIR}/test_codegen_unboxing + ) + else() + add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) + add_subdirectory( + ${TORCH_ROOT}/test/cpp/tensorexpr + ${CMAKE_BINARY_DIR}/test_tensorexpr + ) + if(USE_DISTRIBUTED) + add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d) + if(NOT WIN32) + add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd) + add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc) endif() + endif() + if(NOT NO_API) + add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api) + endif() - if(USE_LLVM AND LLVM_FOUND) - add_subdirectory( - ${TORCH_ROOT}/test/mobile/nnc - ${CMAKE_BINARY_DIR}/test_mobile_nnc - ) - endif() - add_subdirectory(${TORCH_ROOT}/test/cpp/lazy - ${CMAKE_BINARY_DIR}/test_lazy) + if(USE_LLVM AND LLVM_FOUND) + add_subdirectory( + ${TORCH_ROOT}/test/mobile/nnc + ${CMAKE_BINARY_DIR}/test_mobile_nnc + ) endif() + add_subdirectory(${TORCH_ROOT}/test/cpp/lazy + ${CMAKE_BINARY_DIR}/test_lazy) endif() +endif() - # XXX This ABI check cannot be run with arm-linux-androideabi-g++ - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - if(DEFINED GLIBCXX_USE_CXX11_ABI) - message(STATUS "_GLIBCXX_USE_CXX11_ABI is already defined as a cmake variable") - else() - message(STATUS "${CMAKE_CXX_COMPILER} ${TORCH_SRC_DIR}/abi-check.cpp -o ${CMAKE_BINARY_DIR}/abi-check") - execute_process( - COMMAND - "${CMAKE_CXX_COMPILER}" - "${TORCH_SRC_DIR}/abi-check.cpp" - "-o" - 
"${CMAKE_BINARY_DIR}/abi-check" - RESULT_VARIABLE ABI_CHECK_COMPILE_RESULT) - if(ABI_CHECK_COMPILE_RESULT) - message(FATAL_ERROR "Could not compile ABI Check: ${ABI_CHECK_COMPILE_RESULT}") - endif() - execute_process( - COMMAND "${CMAKE_BINARY_DIR}/abi-check" - RESULT_VARIABLE ABI_CHECK_RESULT - OUTPUT_VARIABLE GLIBCXX_USE_CXX11_ABI) - if(ABI_CHECK_RESULT) - message(WARNING "Could not run ABI Check: ${ABI_CHECK_RESULT}") - endif() +# XXX This ABI check cannot be run with arm-linux-androideabi-g++ +if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + if(DEFINED GLIBCXX_USE_CXX11_ABI) + message(STATUS "_GLIBCXX_USE_CXX11_ABI is already defined as a cmake variable") + else() + message(STATUS "${CMAKE_CXX_COMPILER} ${TORCH_SRC_DIR}/abi-check.cpp -o ${CMAKE_BINARY_DIR}/abi-check") + execute_process( + COMMAND + "${CMAKE_CXX_COMPILER}" + "${TORCH_SRC_DIR}/abi-check.cpp" + "-o" + "${CMAKE_BINARY_DIR}/abi-check" + RESULT_VARIABLE ABI_CHECK_COMPILE_RESULT) + if(ABI_CHECK_COMPILE_RESULT) + message(FATAL_ERROR "Could not compile ABI Check: ${ABI_CHECK_COMPILE_RESULT}") endif() - message(STATUS "Determined _GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") - endif() - - # CMake config for external projects. - configure_file( - ${PROJECT_SOURCE_DIR}/cmake/TorchConfigVersion.cmake.in - ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake - @ONLY) - configure_file( - ${TORCH_ROOT}/cmake/TorchConfig.cmake.in - ${PROJECT_BINARY_DIR}/TorchConfig.cmake - @ONLY) - install(FILES - ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake - ${PROJECT_BINARY_DIR}/TorchConfig.cmake - DESTINATION share/cmake/Torch) + execute_process( + COMMAND "${CMAKE_BINARY_DIR}/abi-check" + RESULT_VARIABLE ABI_CHECK_RESULT + OUTPUT_VARIABLE GLIBCXX_USE_CXX11_ABI) + if(ABI_CHECK_RESULT) + message(WARNING "Could not run ABI Check: ${ABI_CHECK_RESULT}") + endif() + endif() + message(STATUS "Determined _GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") +endif() +# CMake config for external projects. 
+configure_file( + ${PROJECT_SOURCE_DIR}/cmake/TorchConfigVersion.cmake.in + ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake + @ONLY) +configure_file( + ${TORCH_ROOT}/cmake/TorchConfig.cmake.in + ${PROJECT_BINARY_DIR}/TorchConfig.cmake + @ONLY) +install(FILES + ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake + ${PROJECT_BINARY_DIR}/TorchConfig.cmake + DESTINATION share/cmake/Torch) - # ---[ Torch python bindings build - add_subdirectory(../torch torch) +# ---[ Torch python bindings build +add_subdirectory(../torch torch) -endif() # ========================================================== # END formerly-libtorch flags # ========================================================== - - - - - if(NOT NO_API) target_include_directories(torch_cpu PUBLIC $ @@ -1399,7 +1387,7 @@ if(USE_DISTRIBUTED) endif() endif() -if(NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE) +if(NOT INTERN_BUILD_MOBILE) caffe2_interface_library(caffe2_protos caffe2_protos_whole) target_link_libraries(torch_cpu PRIVATE caffe2_protos_whole) if(${CAFFE2_LINK_LOCAL_PROTOBUF}) diff --git a/caffe2/core/CMakeLists.txt b/caffe2/core/CMakeLists.txt index 91cd11551b34..f59c0e703edf 100644 --- a/caffe2/core/CMakeLists.txt +++ b/caffe2/core/CMakeLists.txt @@ -1,4 +1,4 @@ -if((NOT BUILD_CAFFE2) OR (INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE)) +if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE) list(APPEND Caffe2_CPU_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" ) diff --git a/caffe2/perfkernels/CMakeLists.txt b/caffe2/perfkernels/CMakeLists.txt index 4316900ba56a..9510ec60dfef 100644 --- a/caffe2/perfkernels/CMakeLists.txt +++ b/caffe2/perfkernels/CMakeLists.txt @@ -1,4 +1,4 @@ -if(INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE) +if(INTERN_BUILD_MOBILE) list(APPEND Caffe2_CPU_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/embedding_lookup_idx.cc" ) diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt index 3e059d3f5eb3..a7dfe1181e31 100644 --- a/caffe2/utils/CMakeLists.txt +++ b/caffe2/utils/CMakeLists.txt @@ -1,4 +1,4 @@ -if((NOT BUILD_CAFFE2) OR (INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE)) +if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE) list(APPEND Caffe2_CPU_SRCS utils/string_utils.cc utils/threadpool/ThreadPool.cc diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 0e96653967da..873ea3e13105 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -78,7 +78,7 @@ if(USE_CUDA) endif() # ---[ Custom Protobuf -if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND (NOT INTERN_BUILD_MOBILE OR BUILD_CAFFE2_MOBILE)) +if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_BUILD_MOBILE) disable_ubsan() include(${CMAKE_CURRENT_LIST_DIR}/ProtoBuf.cmake) enable_ubsan() diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index a9c6201fb6be..27f8381209e6 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -26,7 +26,6 @@ function(caffe2_print_configuration_summary) message(STATUS " CAFFE2_VERSION : ${CAFFE2_VERSION}") message(STATUS " BUILD_CAFFE2 : ${BUILD_CAFFE2}") message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}") - message(STATUS " BUILD_CAFFE2_MOBILE : ${BUILD_CAFFE2_MOBILE}") message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}") message(STATUS " BUILD_TENSOREXPR_BENCHMARK: ${BUILD_TENSOREXPR_BENCHMARK}") message(STATUS " BUILD_NVFUSER_BENCHMARK: ${BUILD_NVFUSER_BENCHMARK}") diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake index b0c4cc6f08b5..5944a5a1a626 100644 --- a/cmake/public/utils.cmake +++ b/cmake/public/utils.cmake @@ -415,72 +415,70 @@ 
function(torch_compile_options libname) list(APPEND private_compile_options -Werror) endif() - if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) - # until they can be unified, keep these lists synced with setup.py - if(MSVC) + # until they can be unified, keep these lists synced with setup.py + if(MSVC) - if(MSVC_Z7_OVERRIDE) - set(MSVC_DEBINFO_OPTION "/Z7") - else() - set(MSVC_DEBINFO_OPTION "/Zi") - endif() + if(MSVC_Z7_OVERRIDE) + set(MSVC_DEBINFO_OPTION "/Z7") + else() + set(MSVC_DEBINFO_OPTION "/Zi") + endif() - target_compile_options(${libname} PUBLIC - $<$: - ${MSVC_RUNTIME_LIBRARY_OPTION} - $<$,$>:${MSVC_DEBINFO_OPTION}> - /EHsc - /DNOMINMAX - /wd4267 - /wd4251 - /wd4522 - /wd4522 - /wd4838 - /wd4305 - /wd4244 - /wd4190 - /wd4101 - /wd4996 - /wd4275 - /bigobj> - ) + target_compile_options(${libname} PUBLIC + $<$: + ${MSVC_RUNTIME_LIBRARY_OPTION} + $<$,$>:${MSVC_DEBINFO_OPTION}> + /EHsc + /DNOMINMAX + /wd4267 + /wd4251 + /wd4522 + /wd4522 + /wd4838 + /wd4305 + /wd4244 + /wd4190 + /wd4101 + /wd4996 + /wd4275 + /bigobj> + ) + else() + list(APPEND private_compile_options + -Wall + -Wextra + -Wno-unused-parameter + -Wno-unused-function + -Wno-unused-result + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + -Wno-type-limits + -Wno-array-bounds + -Wno-unknown-pragmas + -Wno-sign-compare + -Wno-strict-overflow + -Wno-strict-aliasing + -Wno-error=deprecated-declarations + # Clang has an unfixed bug leading to spurious missing braces + # warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629 + -Wno-missing-braces + ) + if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + list(APPEND private_compile_options + -Wno-range-loop-analysis) else() list(APPEND private_compile_options - -Wall - -Wextra - -Wno-unused-parameter - -Wno-unused-function - -Wno-unused-result - -Wno-missing-field-initializers - -Wno-write-strings - -Wno-unknown-pragmas - -Wno-type-limits - -Wno-array-bounds - -Wno-unknown-pragmas - -Wno-sign-compare - -Wno-strict-overflow - -Wno-strict-aliasing - -Wno-error=deprecated-declarations - # Clang has an unfixed bug leading to spurious missing braces - # warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629 - -Wno-missing-braces - ) - if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - list(APPEND private_compile_options - -Wno-range-loop-analysis) - else() - list(APPEND private_compile_options - # Considered to be flaky. See the discussion at - # https://github.com/pytorch/pytorch/pull/9608 - -Wno-maybe-uninitialized) - endif() - + # Considered to be flaky. 
See the discussion at + # https://github.com/pytorch/pytorch/pull/9608 + -Wno-maybe-uninitialized) endif() - if(MSVC) - elseif(WERROR) - list(APPEND private_compile_options -Wno-strict-overflow) - endif() + endif() + + if(MSVC) + elseif(WERROR) + list(APPEND private_compile_options -Wno-strict-overflow) endif() target_compile_options(${libname} PRIVATE diff --git a/scripts/build_android.sh b/scripts/build_android.sh index 225caa68abfc..2d6f051ea19f 100755 --- a/scripts/build_android.sh +++ b/scripts/build_android.sh @@ -59,30 +59,20 @@ echo "Android NDK version: $ANDROID_NDK_VERSION" CMAKE_ARGS=() -if [ -z "${BUILD_CAFFE2_MOBILE:-}" ]; then - # Build PyTorch mobile - CMAKE_ARGS+=("-DCMAKE_PREFIX_PATH=$($PYTHON -c 'import sysconfig; print(sysconfig.get_path("purelib"))')") - CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=$($PYTHON -c 'import sys; print(sys.executable)')") - CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=OFF") - # custom build with selected ops - if [ -n "${SELECTED_OP_LIST}" ]; then - SELECTED_OP_LIST="$(cd $(dirname $SELECTED_OP_LIST); pwd -P)/$(basename $SELECTED_OP_LIST)" - echo "Choose SELECTED_OP_LIST file: $SELECTED_OP_LIST" - if [ ! -r ${SELECTED_OP_LIST} ]; then - echo "Error: SELECTED_OP_LIST file ${SELECTED_OP_LIST} not found." - exit 1 - fi - CMAKE_ARGS+=("-DSELECTED_OP_LIST=${SELECTED_OP_LIST}") +# Build PyTorch mobile +CMAKE_ARGS+=("-DCMAKE_PREFIX_PATH=$($PYTHON -c 'import sysconfig; print(sysconfig.get_path("purelib"))')") +CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=$($PYTHON -c 'import sys; print(sys.executable)')") +CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=OFF") + +# custom build with selected ops +if [ -n "${SELECTED_OP_LIST}" ]; then + SELECTED_OP_LIST="$(cd $(dirname $SELECTED_OP_LIST); pwd -P)/$(basename $SELECTED_OP_LIST)" + echo "Choose SELECTED_OP_LIST file: $SELECTED_OP_LIST" + if [ ! -r ${SELECTED_OP_LIST} ]; then + echo "Error: SELECTED_OP_LIST file ${SELECTED_OP_LIST} not found." + exit 1 fi -else - # Build Caffe2 mobile - CMAKE_ARGS+=("-DBUILD_CAFFE2_MOBILE=ON") - # Build protobuf from third_party so we have a host protoc binary. - echo "Building protoc" - $CAFFE2_ROOT/scripts/build_host_protoc.sh - # Use locally built protoc because we'll build libprotobuf for the - # target architecture and need an exact version match. - CMAKE_ARGS+=("-DCAFFE2_CUSTOM_PROTOC_EXECUTABLE=$CAFFE2_ROOT/build_host_protoc/bin/protoc") + CMAKE_ARGS+=("-DSELECTED_OP_LIST=${SELECTED_OP_LIST}") fi # If Ninja is installed, prefer it to Make diff --git a/scripts/build_ios.sh b/scripts/build_ios.sh index a0402db65a79..335d14b52171 100755 --- a/scripts/build_ios.sh +++ b/scripts/build_ios.sh @@ -11,37 +11,24 @@ CAFFE2_ROOT="$( cd "$(dirname "$0")"/.. ; pwd -P)" CMAKE_ARGS=() -if [ -z "${BUILD_CAFFE2_MOBILE:-}" ]; then - # Build PyTorch mobile - CMAKE_ARGS+=("-DCMAKE_PREFIX_PATH=$(python -c 'import sysconfig; print(sysconfig.get_path("purelib"))')") - CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=$(python -c 'import sys; print(sys.executable)')") - CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=OFF") - # custom build with selected ops - if [ -n "${SELECTED_OP_LIST}" ]; then - SELECTED_OP_LIST="$(cd $(dirname $SELECTED_OP_LIST); pwd -P)/$(basename $SELECTED_OP_LIST)" - echo "Choose SELECTED_OP_LIST file: $SELECTED_OP_LIST" - if [ ! -r ${SELECTED_OP_LIST} ]; then - echo "Error: SELECTED_OP_LIST file ${SELECTED_OP_LIST} not found." 
- exit 1 - fi - CMAKE_ARGS+=("-DSELECTED_OP_LIST=${SELECTED_OP_LIST}") +# Build PyTorch mobile +CMAKE_ARGS+=("-DCMAKE_PREFIX_PATH=$(python -c 'import sysconfig; print(sysconfig.get_path("purelib"))')") +CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=$(python -c 'import sys; print(sys.executable)')") +CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=OFF") + +# custom build with selected ops +if [ -n "${SELECTED_OP_LIST}" ]; then + SELECTED_OP_LIST="$(cd $(dirname $SELECTED_OP_LIST); pwd -P)/$(basename $SELECTED_OP_LIST)" + echo "Choose SELECTED_OP_LIST file: $SELECTED_OP_LIST" + if [ ! -r ${SELECTED_OP_LIST} ]; then + echo "Error: SELECTED_OP_LIST file ${SELECTED_OP_LIST} not found." + exit 1 fi - # bitcode - if [ "${ENABLE_BITCODE:-}" == '1' ]; then - CMAKE_ARGS+=("-DCMAKE_C_FLAGS=-fembed-bitcode") - CMAKE_ARGS+=("-DCMAKE_CXX_FLAGS=-fembed-bitcode") - fi -else - # Build Caffe2 mobile - CMAKE_ARGS+=("-DBUILD_CAFFE2_MOBILE=ON") - # Build protobuf from third_party so we have a host protoc binary. - echo "Building protoc" - BITCODE_FLAGS="-DCMAKE_C_FLAGS=-fembed-bitcode -DCMAKE_CXX_FLAGS=-fembed-bitcode " - $CAFFE2_ROOT/scripts/build_host_protoc.sh --other-flags $BITCODE_FLAGS - # Use locally built protoc because we'll build libprotobuf for the - # target architecture and need an exact version match. - CMAKE_ARGS+=("-DCAFFE2_CUSTOM_PROTOC_EXECUTABLE=$CAFFE2_ROOT/build_host_protoc/bin/protoc") - # Bitcode is enabled by default for caffe2 + CMAKE_ARGS+=("-DSELECTED_OP_LIST=${SELECTED_OP_LIST}") +fi + +# bitcode +if [ "${ENABLE_BITCODE:-}" == '1' ]; then CMAKE_ARGS+=("-DCMAKE_C_FLAGS=-fembed-bitcode") CMAKE_ARGS+=("-DCMAKE_CXX_FLAGS=-fembed-bitcode") fi From 889540d091086bb31367a602295730f64e2ff690 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Thu, 8 Sep 2022 02:34:16 +0000 Subject: [PATCH 39/45] [torchdynamo hash update] update the pinned torchdynamo hash (#84678) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml). Update the pinned torchdynamo hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/84678 Approved by: https://github.com/pytorchbot --- .github/ci_commit_pins/torchdynamo.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/torchdynamo.txt b/.github/ci_commit_pins/torchdynamo.txt index 74806c5906ea..992c70b96b45 100644 --- a/.github/ci_commit_pins/torchdynamo.txt +++ b/.github/ci_commit_pins/torchdynamo.txt @@ -1 +1 @@ -01bf13a3029c8f6a8e9989e94e620622d33bfe39 +fe3173f7e6c804e6330ac187ea8e4101f45ff9a2 From cb6ba27db3e1e55e9a429fb4a576a9e8389c2b93 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Thu, 8 Sep 2022 02:34:33 +0000 Subject: [PATCH 40/45] [vision hash update] update the pinned vision hash (#84679) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml). Update the pinned vision hash. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84679 Approved by: https://github.com/pytorchbot --- .github/ci_commit_pins/vision.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 4a4df24cae41..2a1cd7720c6b 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -4c073b09521604e410bac8a0b0fe0f2680724e0d +84dcf695d64c15f8a0be845ac65901bdde845429 From 0945074a8e4e7d0d07b7a929873d1f0dbdca7173 Mon Sep 17 00:00:00 2001 From: Sherlock Huang Date: Thu, 8 Sep 2022 00:31:58 +0000 Subject: [PATCH 41/45] Preserver stacktrace over functionalization (#84662) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84662 Approved by: https://github.com/Chillee --- functorch/functorch/_src/aot_autograd.py | 3 ++- functorch/test/test_pythonkey.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/functorch/functorch/_src/aot_autograd.py b/functorch/functorch/_src/aot_autograd.py index b56fb4ae1811..55b5027877d4 100644 --- a/functorch/functorch/_src/aot_autograd.py +++ b/functorch/functorch/_src/aot_autograd.py @@ -306,7 +306,8 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfi fx_g = make_fx(joint_forward_backward)(*joint_inputs) def fake_fn(primals, tangents): - return fx_g(primals, tangents) + with torch.fx.traceback.override_stack_trace(): + return torch.fx.Interpreter(fx_g).run(primals, tangents) # Trace a second time, running functionalization, and THEN running decompositions. # functionalization only acts on ATen today, and doesn't currently handle diff --git a/functorch/test/test_pythonkey.py b/functorch/test/test_pythonkey.py index 8b35056eed0e..5bb5e02a3f57 100644 --- a/functorch/test/test_pythonkey.py +++ b/functorch/test/test_pythonkey.py @@ -663,7 +663,6 @@ def forward(self, x, y): assert torch.allclose(inputs[0].grad, cloned_inputs[0].grad) assert torch.allclose(inputs[1].grad, cloned_inputs[1].grad) - @unittest.skip("Breaks with functionalization on by default") def test_aot_module_simplified_preserves_stack_trace(self): class MockModule(torch.nn.Module): def __init__(self): @@ -701,6 +700,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _): y = torch.randn(128, 30, requires_grad=True) inputs = [x, y] res = aot_mod(*inputs) + res[0].sum().backward() only_for = ("cpu") From 49ec8d32c706e3df1f777b2361b2ee673269f8b8 Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Thu, 8 Sep 2022 03:12:50 +0000 Subject: [PATCH 42/45] Suggest draft PRs in contribution_guide.rst (#84658) Pull Request resolved: https://github.com/pytorch/pytorch/pull/84658 Approved by: https://github.com/huydhn --- docs/source/community/contribution_guide.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/source/community/contribution_guide.rst b/docs/source/community/contribution_guide.rst index a8eaccf9f7bb..a2a89721b64e 100644 --- a/docs/source/community/contribution_guide.rst +++ b/docs/source/community/contribution_guide.rst @@ -73,11 +73,13 @@ here is the basic process. - **Open a pull request.** - - If you are not ready for the pull request to be reviewed, tag it - with [WIP]. We will ignore it when doing review passes. If you are - working on a complex change, it's good to start things off as WIP, - because you will need to spend time looking at CI results to see - if things worked out or not. 
+ - If you are not ready for the pull request to be reviewed, create a draft + pull request first - you can later convert it to a full PR by pressing + "Ready for review" button. You can also prepend the title of the PR with + "[WIP]" ("work in progress") while it's still in draft. We will ignore + draft PRs when doing review passes. If you are working on a complex change, + it's good to start things off as a draft, because you will need to spend + time looking at CI results to see if things worked out or not. - Find an appropriate reviewer for your change. We have some folks who regularly go through the PR queue and try to review everything, but if you happen to know who the maintainer for a From 942c0f31dfffbc5eb180cadd0fd1302d5e907f64 Mon Sep 17 00:00:00 2001 From: titaiwang Date: Thu, 8 Sep 2022 00:58:09 +0000 Subject: [PATCH 43/45] [ONNX] Align Optional Type in block (#83599) Why: Previously, we use `replaceAlluseswith` after adding Optional on the node which is right before output. However, this may break the graph by also changing the nodes that needs the node (original) as input. We only need the node to be optional in output. Pull Request resolved: https://github.com/pytorch/pytorch/pull/83599 Approved by: https://github.com/justinchuby, https://github.com/BowenBao, https://github.com/malfet --- test/onnx/test_pytorch_onnx_onnxruntime.py | 2 +- .../jit/passes/onnx/fixup_onnx_controlflow.cpp | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 4bebcb9c38f1..42278711817e 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -12375,7 +12375,7 @@ def forward(self, x) -> Optional[Tensor]: @common_utils.parametrize( "module_class", - (IfNoneOutput, IfNoneInput, LoopNoneOutput), + (IfNoneOutput, IfNoneInput, LoopNoneOutput, LoopNoneInput), name_fn=lambda module_class: module_class.__name__, ) @common_utils.parametrize("x_size", (0, 1), name_fn=lambda x_size: str(x_size)) diff --git a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp index 82ee29cb3e05..f25160260ea7 100644 --- a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp +++ b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp @@ -195,10 +195,12 @@ Node* ONNXOptionalNode(OptionalTypePtr opt_type, Graph* g) { } // Replaces block output i with an onnx::Optional -// with `type` taken from opt_type. -// Needed when control flow has multiple branches, one of which +// with `type` taken from opt_type. If and Loop Ops shares this function. +// 1. If Op: Needed when control flow has multiple branches, one of which // is defined by `block` and returns a None and another branch // returns not-None. The passed-in opt_type should be from the other branch. +// 2. Loop Op: insert Optional node before output, if input is Optional type +// or output type is None. 
void ReplaceBlockOutputWithOptional( OptionalTypePtr opt_type, Block* block, @@ -206,7 +208,9 @@ void ReplaceBlockOutputWithOptional( Node* opt_node = ONNXOptionalNode(opt_type, block->owningGraph()); opt_node->insertBefore(block->return_node()); Value* block_output = block->outputs().at(i); - block_output->replaceAllUsesWith(opt_node->output()); + // replace only the last value as Optional type only affects + // the value right before output + block_output->replaceAllUsesAfterNodeWith(opt_node, opt_node->output()); if (!block_output->type()->cast()) { opt_node->addInput(block_output); opt_node->copyMetadata(block_output->node()); @@ -265,7 +269,12 @@ void FixupONNXLoopBlockOutputs(Node* n) { for (Block* block : n->blocks()) { // output 0 is continue_condition, never None. for (const auto i : c10::irange(1, block->outputs().size())) { - if (block->outputs().at(i)->type()->cast()) { + // Two conditions that we need to replace block output with optional + // 1. output is NoneType + // 2. input is optional but output type is not + if ((block->outputs().at(i)->type()->cast()) || + (block->inputs().at(i + 1)->type()->cast() && + !block->outputs().at(i)->type()->cast())) { ReplaceBlockOutputWithOptional( // Output 0 is continue_condition. // Inputs (0, 1) are (loop_counter, cond). So input i + 1 From b288cfd328be3908ffc42b948bf0137940b01e85 Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Thu, 8 Sep 2022 03:37:39 +0000 Subject: [PATCH 44/45] [Profiler] Add quoted metadata API to remove empty trace cpu_op metadata (#84128) Summary: The profiler utility function, stacksToStr, is quoting all metadata values, and therefore even empty metadata fields are being pushed into the trace files. Remove this and add an argument to use quoted metadata api provided by libkineto::GenericTraceActivity. 
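For illustration only (this is not part of the diff itself, and the helper name below is made up), the guard the summary describes can be sketched roughly as follows: metadata values arrive already JSON-quoted, so an "empty" field shows up as the two-character string `""` and should be dropped along with genuinely empty strings. The real change lives in `MetadataBase::addMetadata` in `profiler_kineto.cpp`, shown further down.

```python
# Rough sketch of the skip-empty-metadata check described in the summary.
# `add_metadata` and `trace_args` are illustrative names only.
def add_metadata(trace_args: dict, key: str, value: str) -> None:
    # Values are already JSON-quoted, so an empty field is the literal
    # two-character string '""'; skip it along with truly empty strings.
    if value and value != '""':
        trace_args[key] = value

args = {}
add_metadata(args, "Call stack", '""')       # dropped: empty metadata
add_metadata(args, "Sequence number", "1")   # kept
assert args == {"Sequence number": "1"}
```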
Test Plan: Before, a trace file will dump extra empty fields for Module Hierarchy and Call Stack: ``` { "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 798015, "tid": 798264, "ts": 1661451887593736, "dur": 21, "args": { "Trace name": "PyTorch Profiler", "Trace iteration": 0, "External id": 513, "Profiler Event Index": 0, "Module Hierarchy": "", "Call stack": "", "Fwd thread id": 3, "Sequence number": 1, "ID": 139880536829952, "Parent ID": null } } ``` After, these fields will not be in the trace file anymore: ``` { "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 1482813, "tid": 1483069, "ts": 1661468912444365, "dur": 43, "args": { "Trace name": "PyTorch Profiler", "Trace iteration": 0, "External id": 513, "Profiler Event Index": 0, "Fwd thread id": 3, "Sequence number": 1, "ID": 139852271321088, "Parent ID": null } } ``` Also, with input tracking on, it looks correct compared to previous kineto observer: ``` { "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 1572428, "tid": 1572776, "ts": 1661469920242309, "dur": 19, "args": { "Trace name": "PyTorch Profiler", "Trace iteration": 0, "External id": 531, "Profiler Event Index": 18, "Input Dims": [[256, 256], [256, 256], []], "Input type": ["float", "float", "Scalar"], "ID": 140023871647232, "Parent ID": 140023871646720 } } ``` Differential Revision: D39041244 Pulled By: aaronenyeshi Pull Request resolved: https://github.com/pytorch/pytorch/pull/84128 Approved by: https://github.com/robieta --- torch/csrc/autograd/profiler_kineto.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 81377774492d..52304562e4ca 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -85,7 +85,7 @@ struct MetadataBase { } void addMetadata(const std::string& key, const std::string& value) { - if (kineto_activity_ && !value.empty()) { + if (kineto_activity_ && !value.empty() && value != "\"\"") { torch::profiler::impl::kineto::addMetadata(kineto_activity_, key, value); } } From 8bd9fe3f493073bf8f4a2e428c3048096fb36052 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Wed, 7 Sep 2022 23:39:28 +0000 Subject: [PATCH 45/45] Changes to prepare for fake tensors on in functorch by default (#84432) Fixes some errors you run into in dynamo when turning on fake tensors. I'm waiting on flipping the switch because I need to also get some fixes into dynamo + do benchmarking. I could manually turn off fake tensors in functorch in dynamo, and then turn it on here if requested, although the changes here are pretty minimal. 
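As a side illustration (not part of the patch), the `aot_autograd.py` hunk below moves from calling `FakeTensorMode.push()` up front to selecting a context-manager class (`FakeTensorMode` or `nullcontext`) and instantiating it inside the `with` statement. A minimal standalone sketch of that pattern, where `use_fake_tensor` is just a stand-in for functorch's config flag:

```python
# Minimal sketch of the context-manager selection pattern used in the
# aot_autograd.py hunk below; `use_fake_tensor` is a placeholder for
# functorch's config.use_fake_tensor and not a real module-level setting.
from contextlib import nullcontext

from torch._subclasses.fake_tensor import FakeTensorMode

use_fake_tensor = False  # assumption: stand-in for the config flag

fake_mode = FakeTensorMode if use_fake_tensor else nullcontext

with fake_mode() as mode:
    # `mode` is a FakeTensorMode instance when the flag is on, or None
    # (what nullcontext() yields) when it is off, so callers can branch
    # on `if mode:` exactly as the patched code does.
    if mode:
        pass  # e.g. convert example inputs to fake tensors here
```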
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84432 Approved by: https://github.com/Chillee --- functorch/functorch/_src/aot_autograd.py | 4 ++-- functorch/functorch/_src/config.py | 8 +------- torch/_subclasses/fake_tensor.py | 6 +++++- torch/fx/experimental/proxy_tensor.py | 1 + torch/fx/passes/backends/cudagraphs.py | 7 +++++-- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/functorch/functorch/_src/aot_autograd.py b/functorch/functorch/_src/aot_autograd.py index 55b5027877d4..94f27655a139 100644 --- a/functorch/functorch/_src/aot_autograd.py +++ b/functorch/functorch/_src/aot_autograd.py @@ -401,9 +401,9 @@ def create_aot_dispatcher_function( **aot_autograd_decompositions, **aot_config.decompositions, } - fake_mode = FakeTensorMode.push() if config.use_fake_tensor else nullcontext() + fake_mode = FakeTensorMode if config.use_fake_tensor else nullcontext - with preserve_rng_state(), fake_mode as mode: + with preserve_rng_state(), fake_mode() as mode: def process_inputs(flat_args): if mode: diff --git a/functorch/functorch/_src/config.py b/functorch/functorch/_src/config.py index f233ab5fc4de..76e5ad6eacb6 100644 --- a/functorch/functorch/_src/config.py +++ b/functorch/functorch/_src/config.py @@ -11,13 +11,7 @@ use_functionalize = True -# TODO: flip this to true by default -# Waiting on -# https://github.com/pytorch/pytorch/pull/81617 -# https://github.com/pytorch/pytorch/pull/81609 -# https://github.com/pytorch/pytorch/pull/81604 -# fix for test_aot_autograd_exhaustive_sgn_cpu_float32 _efficientzerotensor -# fix for complex numbers +# TODO Benchmark use_fake_tensor = False debug_partitioner = os.environ.get('AOT_PARTITIONER_DEBUG', False) diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index c84313e9403d..c325f054bd54 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -865,7 +865,11 @@ def wrap(e, device=None): return tree_map(partial(wrap), r) def may_turn_const(self, t): - return t.numel() <= CONSTANT_NUMEL_LIMIT and not t.is_sparse + return ( + t.numel() <= CONSTANT_NUMEL_LIMIT + and not t.is_sparse + and not isinstance(t, FakeTensor) + ) def invalidate_written_to_constants(self, func, flat_arg_tensors, args, kwargs): any_constant = any(e.constant is not None for e in flat_arg_tensors) diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py index bbecccc38456..528d8b3c7376 100644 --- a/torch/fx/experimental/proxy_tensor.py +++ b/torch/fx/experimental/proxy_tensor.py @@ -96,6 +96,7 @@ def has_proxy(obj): def set_meta(proxy, val): if isinstance(val, FakeTensor): proxy.node.meta['val'] = val + proxy.node.meta['tensor_meta'] = _extract_tensor_metadata(val) elif isinstance(val, PySymInt): proxy.node.meta['val'] = val elif isinstance(val, torch.Tensor): diff --git a/torch/fx/passes/backends/cudagraphs.py b/torch/fx/passes/backends/cudagraphs.py index 7aa4aed45ccf..2d4ccbcfb3dc 100644 --- a/torch/fx/passes/backends/cudagraphs.py +++ b/torch/fx/passes/backends/cudagraphs.py @@ -21,15 +21,18 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: found_not_cuda = False + def meta_fk(meta): + return meta["val"] if "val" in meta else meta["fake_result"] + def find_not_cuda(t): nonlocal found_not_cuda if isinstance(t, torch.Tensor) and t.device.type != 'cuda': found_not_cuda = True for n in node.all_input_nodes: - tree_map(find_not_cuda, n.meta['fake_result']) + tree_map(find_not_cuda, meta_fk(n.meta)) - tree_map(find_not_cuda, 
node.meta['fake_result']) + tree_map(find_not_cuda, meta_fk(node.meta)) # NB: factory function is accounted for because the result would be # cpu or cuda